35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
103 template<
typename VT
105 class TDVecDMatMultExpr :
public DenseVector< TDVecDMatMultExpr<VT,MT>, true >
106 ,
private TVecMatMultExpr
107 ,
private Computation
135 template<
typename T1,
typename T2,
typename T3 >
136 struct UseSMPAssignKernel {
137 enum { value = evaluateVector || evaluateMatrix };
148 template<
typename T1,
typename T2,
typename T3 >
149 struct UseSinglePrecisionKernel {
150 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
151 IsFloat<typename T1::ElementType>::value &&
152 IsFloat<typename T2::ElementType>::value &&
153 IsFloat<typename T3::ElementType>::value };
164 template<
typename T1,
typename T2,
typename T3 >
165 struct UseDoublePrecisionKernel {
166 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
167 IsDouble<typename T1::ElementType>::value &&
168 IsDouble<typename T2::ElementType>::value &&
169 IsDouble<typename T3::ElementType>::value };
180 template<
typename T1,
typename T2,
typename T3 >
181 struct UseSinglePrecisionComplexKernel {
182 typedef complex<float> Type;
183 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
184 IsSame<typename T1::ElementType,Type>::value &&
185 IsSame<typename T2::ElementType,Type>::value &&
186 IsSame<typename T3::ElementType,Type>::value };
197 template<
typename T1,
typename T2,
typename T3 >
198 struct UseDoublePrecisionComplexKernel {
199 typedef complex<double> Type;
200 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
201 IsSame<typename T1::ElementType,Type>::value &&
202 IsSame<typename T2::ElementType,Type>::value &&
203 IsSame<typename T3::ElementType,Type>::value };
213 template<
typename T1,
typename T2,
typename T3 >
214 struct UseDefaultKernel {
215 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
216 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
217 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
218 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
229 template<
typename T1,
typename T2,
typename T3 >
230 struct UseVectorizedDefaultKernel {
231 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
232 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
233 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
234 IntrinsicTrait<typename T1::ElementType>::addition &&
235 IntrinsicTrait<typename T1::ElementType>::multiplication };
265 enum { vectorizable = VT::vectorizable && MT::vectorizable &&
271 enum { smpAssignable = !evaluateVector && !evaluateMatrix };
300 if(
mat_.rows() != 0UL ) {
302 for(
size_t j=1UL; j<
end_; j+=2UL ) {
305 if( end_ < mat_.rows() ) {
323 return mat_.columns();
353 template<
typename T >
355 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
365 template<
typename T >
367 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
377 return vec_.isAligned() &&
mat_.isAligned();
414 template<
typename VT1 >
421 if( rhs.mat_.rows() == 0UL ) {
425 else if( rhs.mat_.columns() == 0UL ) {
437 TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
453 template<
typename VT1
457 selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
461 TDVecDMatMultExpr::selectDefaultAssignKernel( y, x, A );
463 TDVecDMatMultExpr::selectBlasAssignKernel( y, x, A );
479 template<
typename VT1
482 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1> >::Type
483 selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
504 template<
typename VT1
507 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
508 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
510 const size_t M( A.rows() );
511 const size_t N( A.columns() );
514 const size_t jend( N &
size_t(-2) );
516 for(
size_t j=0UL; j<N; ++j ) {
517 y[j] = x[0UL] * A(0UL,j);
519 for(
size_t i=1UL; i<M; ++i ) {
520 for(
size_t j=0UL; j<jend; j+=2UL ) {
521 y[j ] += x[i] * A(i,j );
522 y[j+1UL] += x[i] * A(i,j+1UL);
525 y[jend] += x[i] * A(i,jend);
546 template<
typename VT1
549 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
550 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
552 typedef IntrinsicTrait<ElementType> IT;
554 const size_t M( A.rows() );
555 const size_t N( A.columns() );
559 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
560 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
561 for(
size_t i=0UL; i<M; ++i ) {
563 xmm1 = xmm1 + x1 * A.load(i,j );
564 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
565 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
566 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
567 xmm5 = xmm5 + x1 * A.load(i,j+IT::size*4UL);
568 xmm6 = xmm6 + x1 * A.load(i,j+IT::size*5UL);
569 xmm7 = xmm7 + x1 * A.load(i,j+IT::size*6UL);
570 xmm8 = xmm8 + x1 * A.load(i,j+IT::size*7UL);
573 y.store( j+IT::size , xmm2 );
574 y.store( j+IT::size*2UL, xmm3 );
575 y.store( j+IT::size*3UL, xmm4 );
576 y.store( j+IT::size*4UL, xmm5 );
577 y.store( j+IT::size*5UL, xmm6 );
578 y.store( j+IT::size*6UL, xmm7 );
579 y.store( j+IT::size*7UL, xmm8 );
581 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
583 for(
size_t i=0UL; i<M; ++i ) {
585 xmm1 = xmm1 + x1 * A.load(i,j );
586 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
587 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
588 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
591 y.store( j+IT::size , xmm2 );
592 y.store( j+IT::size*2UL, xmm3 );
593 y.store( j+IT::size*3UL, xmm4 );
595 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
597 for(
size_t i=0UL; i<M; ++i ) {
599 xmm1 = xmm1 + x1 * A.load(i,j );
600 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
601 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
604 y.store( j+IT::size , xmm2 );
605 y.store( j+IT::size*2UL, xmm3 );
607 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
609 for(
size_t i=0UL; i<M; ++i ) {
611 xmm1 = xmm1 + x1 * A.load(i,j );
612 xmm2 = xmm2 + x1 * A.load(i,j+IT::size);
615 y.store( j+IT::size, xmm2 );
619 for(
size_t i=0UL; i<M; ++i ) {
620 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
642 template<
typename VT1
645 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
646 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
648 selectDefaultAssignKernel( y, x, A );
668 template<
typename VT1
671 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
672 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
674 using boost::numeric_cast;
680 const int M ( numeric_cast<int>( A.rows() ) );
681 const int N ( numeric_cast<int>( A.columns() ) );
682 const int lda( numeric_cast<int>( A.spacing() ) );
684 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, 1.0F,
685 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
706 template<
typename VT1
709 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
710 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
712 using boost::numeric_cast;
718 const int M ( numeric_cast<int>( A.rows() ) );
719 const int N ( numeric_cast<int>( A.columns() ) );
720 const int lda( numeric_cast<int>( A.spacing() ) );
722 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0,
723 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
744 template<
typename VT1
747 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
748 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
750 using boost::numeric_cast;
759 const int M ( numeric_cast<int>( A.rows() ) );
760 const int N ( numeric_cast<int>( A.columns() ) );
761 const int lda( numeric_cast<int>( A.spacing() ) );
762 const complex<float> alpha( 1.0F, 0.0F );
763 const complex<float> beta ( 0.0F, 0.0F );
765 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
766 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
787 template<
typename VT1
790 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
791 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
793 using boost::numeric_cast;
802 const int M ( numeric_cast<int>( A.rows() ) );
803 const int N ( numeric_cast<int>( A.columns() ) );
804 const int lda( numeric_cast<int>( A.spacing() ) );
805 const complex<double> alpha( 1.0, 0.0 );
806 const complex<double> beta ( 0.0, 0.0 );
808 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
809 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
828 template<
typename VT1 >
858 template<
typename VT1 >
865 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
877 TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
893 template<
typename VT1
896 static inline typename DisableIf< UseSMPAssignKernel<VT1,VT2,MT1> >::Type
897 selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
899 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
901 TDVecDMatMultExpr::selectDefaultAddAssignKernel( y, x, A );
903 TDVecDMatMultExpr::selectBlasAddAssignKernel( y, x, A );
919 template<
typename VT1
922 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1> >::Type
923 selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
944 template<
typename VT1
947 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
948 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
950 const size_t M( A.rows() );
951 const size_t N( A.columns() );
954 const size_t jend( N &
size_t(-2) );
956 for(
size_t i=0UL; i<M; ++i ) {
957 for(
size_t j=0UL; j<jend; j+=2UL ) {
958 y[j ] += x[i] * A(i,j );
959 y[j+1UL] += x[i] * A(i,j+1UL);
962 y[jend] += x[i] * A(i,jend);
983 template<
typename VT1
986 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
987 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
989 typedef IntrinsicTrait<ElementType> IT;
991 const size_t M( A.rows() );
992 const size_t N( A.columns() );
996 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1005 for(
size_t i=0UL; i<M; ++i ) {
1007 xmm1 = xmm1 + x1 * A.load(i,j );
1008 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
1009 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
1010 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
1011 xmm5 = xmm5 + x1 * A.load(i,j+IT::size*4UL);
1012 xmm6 = xmm6 + x1 * A.load(i,j+IT::size*5UL);
1013 xmm7 = xmm7 + x1 * A.load(i,j+IT::size*6UL);
1014 xmm8 = xmm8 + x1 * A.load(i,j+IT::size*7UL);
1016 y.store( j , xmm1 );
1017 y.store( j+IT::size , xmm2 );
1018 y.store( j+IT::size*2UL, xmm3 );
1019 y.store( j+IT::size*3UL, xmm4 );
1020 y.store( j+IT::size*4UL, xmm5 );
1021 y.store( j+IT::size*5UL, xmm6 );
1022 y.store( j+IT::size*6UL, xmm7 );
1023 y.store( j+IT::size*7UL, xmm8 );
1025 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1030 for(
size_t i=0UL; i<M; ++i ) {
1032 xmm1 = xmm1 + x1 * A.load(i,j );
1033 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
1034 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
1035 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
1037 y.store( j , xmm1 );
1038 y.store( j+IT::size , xmm2 );
1039 y.store( j+IT::size*2UL, xmm3 );
1040 y.store( j+IT::size*3UL, xmm4 );
1042 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
1046 for(
size_t i=0UL; i<M; ++i ) {
1048 xmm1 = xmm1 + x1 * A.load(i,j );
1049 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
1050 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
1052 y.store( j , xmm1 );
1053 y.store( j+IT::size , xmm2 );
1054 y.store( j+IT::size*2UL, xmm3 );
1056 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1059 for(
size_t i=0UL; i<M; ++i ) {
1061 xmm1 = xmm1 + x1 * A.load(i,j );
1062 xmm2 = xmm2 + x1 * A.load(i,j+IT::size);
1064 y.store( j , xmm1 );
1065 y.store( j+IT::size, xmm2 );
1069 for(
size_t i=0UL; i<M; ++i ) {
1070 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
1092 template<
typename VT1
1095 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1096 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1098 selectDefaultAddAssignKernel( y, x, A );
1118 template<
typename VT1
1121 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1122 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1124 using boost::numeric_cast;
1130 const int M ( numeric_cast<int>( A.rows() ) );
1131 const int N ( numeric_cast<int>( A.columns() ) );
1132 const int lda( numeric_cast<int>( A.spacing() ) );
1134 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, 1.0F,
1135 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
1156 template<
typename VT1
1159 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1160 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1162 using boost::numeric_cast;
1168 const int M ( numeric_cast<int>( A.rows() ) );
1169 const int N ( numeric_cast<int>( A.columns() ) );
1170 const int lda( numeric_cast<int>( A.spacing() ) );
1172 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0,
1173 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
1194 template<
typename VT1
1197 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1198 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1200 using boost::numeric_cast;
1209 const int M ( numeric_cast<int>( A.rows() ) );
1210 const int N ( numeric_cast<int>( A.columns() ) );
1211 const int lda( numeric_cast<int>( A.spacing() ) );
1212 const complex<float> alpha( 1.0F, 0.0F );
1213 const complex<float> beta ( 1.0F, 0.0F );
1215 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1216 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1237 template<
typename VT1
1240 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1241 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1243 using boost::numeric_cast;
1252 const int M ( numeric_cast<int>( A.rows() ) );
1253 const int N ( numeric_cast<int>( A.columns() ) );
1254 const int lda( numeric_cast<int>( A.spacing() ) );
1255 const complex<double> alpha( 1.0, 0.0 );
1256 const complex<double> beta ( 1.0, 0.0 );
1258 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1259 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1282 template<
typename VT1 >
1289 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1301 TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1317 template<
typename VT1
1320 static inline typename DisableIf< UseSMPAssignKernel<VT1,VT2,MT1> >::Type
1321 selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1323 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1325 TDVecDMatMultExpr::selectDefaultSubAssignKernel( y, x, A );
1327 TDVecDMatMultExpr::selectBlasSubAssignKernel( y, x, A );
1343 template<
typename VT1
1346 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1> >::Type
1347 selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1368 template<
typename VT1
1371 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1372 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1374 const size_t M( A.rows() );
1375 const size_t N( A.columns() );
1378 const size_t jend( N &
size_t(-2) );
1380 for(
size_t i=0UL; i<M; ++i ) {
1381 for(
size_t j=0UL; j<jend; j+=2UL ) {
1382 y[j ] -= x[i] * A(i,j );
1383 y[j+1UL] -= x[i] * A(i,j+1UL);
1386 y[jend] -= x[i] * A(i,jend);
1407 template<
typename VT1
1410 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1411 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1413 typedef IntrinsicTrait<ElementType> IT;
1415 const size_t M( A.rows() );
1416 const size_t N( A.columns() );
1420 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1429 for(
size_t i=0UL; i<M; ++i ) {
1431 xmm1 = xmm1 - x1 * A.load(i,j );
1432 xmm2 = xmm2 - x1 * A.load(i,j+IT::size );
1433 xmm3 = xmm3 - x1 * A.load(i,j+IT::size*2UL);
1434 xmm4 = xmm4 - x1 * A.load(i,j+IT::size*3UL);
1435 xmm5 = xmm5 - x1 * A.load(i,j+IT::size*4UL);
1436 xmm6 = xmm6 - x1 * A.load(i,j+IT::size*5UL);
1437 xmm7 = xmm7 - x1 * A.load(i,j+IT::size*6UL);
1438 xmm8 = xmm8 - x1 * A.load(i,j+IT::size*7UL);
1440 y.store( j , xmm1 );
1441 y.store( j+IT::size , xmm2 );
1442 y.store( j+IT::size*2UL, xmm3 );
1443 y.store( j+IT::size*3UL, xmm4 );
1444 y.store( j+IT::size*4UL, xmm5 );
1445 y.store( j+IT::size*5UL, xmm6 );
1446 y.store( j+IT::size*6UL, xmm7 );
1447 y.store( j+IT::size*7UL, xmm8 );
1449 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1454 for(
size_t i=0UL; i<M; ++i ) {
1456 xmm1 = xmm1 - x1 * A.load(i,j );
1457 xmm2 = xmm2 - x1 * A.load(i,j+IT::size );
1458 xmm3 = xmm3 - x1 * A.load(i,j+IT::size*2UL);
1459 xmm4 = xmm4 - x1 * A.load(i,j+IT::size*3UL);
1461 y.store( j , xmm1 );
1462 y.store( j+IT::size , xmm2 );
1463 y.store( j+IT::size*2UL, xmm3 );
1464 y.store( j+IT::size*3UL, xmm4 );
1466 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
1470 for(
size_t i=0UL; i<M; ++i ) {
1472 xmm1 = xmm1 - x1 * A.load(i,j );
1473 xmm2 = xmm2 - x1 * A.load(i,j+IT::size );
1474 xmm3 = xmm3 - x1 * A.load(i,j+IT::size*2UL);
1476 y.store( j , xmm1 );
1477 y.store( j+IT::size , xmm2 );
1478 y.store( j+IT::size*2UL, xmm3 );
1480 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1483 for(
size_t i=0UL; i<M; ++i ) {
1485 xmm1 = xmm1 - x1 * A.load(i,j );
1486 xmm2 = xmm2 - x1 * A.load(i,j+IT::size);
1488 y.store( j , xmm1 );
1489 y.store( j+IT::size, xmm2 );
1493 for(
size_t i=0UL; i<M; ++i ) {
1494 xmm1 = xmm1 -
set( x[i] ) * A.load(i,j);
1516 template<
typename VT1
1519 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1520 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1522 selectDefaultSubAssignKernel( y, x, A );
1542 template<
typename VT1
1545 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1546 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1548 using boost::numeric_cast;
1554 const int M ( numeric_cast<int>( A.rows() ) );
1555 const int N ( numeric_cast<int>( A.columns() ) );
1556 const int lda( numeric_cast<int>( A.spacing() ) );
1558 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, -1.0F,
1559 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
1580 template<
typename VT1
1583 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1584 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1586 using boost::numeric_cast;
1592 const int M ( numeric_cast<int>( A.rows() ) );
1593 const int N ( numeric_cast<int>( A.columns() ) );
1594 const int lda( numeric_cast<int>( A.spacing() ) );
1596 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, -1.0,
1597 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
1618 template<
typename VT1
1621 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1622 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1624 using boost::numeric_cast;
1633 const int M ( numeric_cast<int>( A.rows() ) );
1634 const int N ( numeric_cast<int>( A.columns() ) );
1635 const int lda( numeric_cast<int>( A.spacing() ) );
1636 const complex<float> alpha( -1.0F, 0.0F );
1637 const complex<float> beta ( 1.0F, 0.0F );
1639 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1640 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1661 template<
typename VT1
1664 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1665 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1667 using boost::numeric_cast;
1676 const int M ( numeric_cast<int>( A.rows() ) );
1677 const int N ( numeric_cast<int>( A.columns() ) );
1678 const int lda( numeric_cast<int>( A.spacing() ) );
1679 const complex<double> alpha( -1.0, 0.0 );
1680 const complex<double> beta ( 1.0, 0.0 );
1682 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1683 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1706 template<
typename VT1 >
1755 template<
typename VT
1759 :
public DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true >
1760 ,
private VecScalarMultExpr
1761 ,
private Computation
1765 typedef TDVecDMatMultExpr<VT,MT> VMM;
1777 enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
1782 enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
1783 IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
1790 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1791 struct UseSMPAssignKernel {
1792 enum { value = evaluateVector || evaluateMatrix };
1801 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1802 struct UseSinglePrecisionKernel {
1803 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1804 IsFloat<typename T1::ElementType>::value &&
1805 IsFloat<typename T2::ElementType>::value &&
1806 IsFloat<typename T3::ElementType>::value &&
1807 !IsComplex<T4>::value };
1816 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1817 struct UseDoublePrecisionKernel {
1818 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1819 IsDouble<typename T1::ElementType>::value &&
1820 IsDouble<typename T2::ElementType>::value &&
1821 IsDouble<typename T3::ElementType>::value &&
1822 !IsComplex<T4>::value };
1831 template<
typename T1,
typename T2,
typename T3 >
1832 struct UseSinglePrecisionComplexKernel {
1833 typedef complex<float> Type;
1834 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1835 IsSame<typename T1::ElementType,Type>::value &&
1836 IsSame<typename T2::ElementType,Type>::value &&
1837 IsSame<typename T3::ElementType,Type>::value };
1846 template<
typename T1,
typename T2,
typename T3 >
1847 struct UseDoublePrecisionComplexKernel {
1848 typedef complex<double> Type;
1849 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1850 IsSame<typename T1::ElementType,Type>::value &&
1851 IsSame<typename T2::ElementType,Type>::value &&
1852 IsSame<typename T3::ElementType,Type>::value };
1860 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1861 struct UseDefaultKernel {
1862 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1863 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1864 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1865 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
1874 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1875 struct UseVectorizedDefaultKernel {
1876 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1877 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1878 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1879 IsSame<typename T1::ElementType,T4>::value &&
1880 IntrinsicTrait<typename T1::ElementType>::addition &&
1881 IntrinsicTrait<typename T1::ElementType>::multiplication };
1887 typedef DVecScalarMultExpr<VMM,ST,true>
This;
1888 typedef typename MultTrait<RES,ST>::Type
ResultType;
1891 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
1896 typedef const TDVecDMatMultExpr<VT,MT>
LeftOperand;
1902 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
LT;
1905 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
RT;
1910 enum { vectorizable = VT::vectorizable && MT::vectorizable &&
1911 IsSame<VET,MET>::value &&
1912 IsSame<VET,ST>::value &&
1913 IntrinsicTrait<VET>::addition &&
1914 IntrinsicTrait<VET>::multiplication };
1917 enum { smpAssignable = !evaluateVector && !evaluateMatrix };
1926 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
1940 return vector_[index] * scalar_;
1949 inline size_t size()
const {
1950 return vector_.size();
1980 template<
typename T >
1981 inline bool canAlias(
const T* alias )
const {
1982 return vector_.canAlias( alias );
1992 template<
typename T >
1993 inline bool isAliased(
const T* alias )
const {
1994 return vector_.isAliased( alias );
2004 return vector_.isAligned();
2014 typename VMM::RightOperand A( vector_.rightOperand() );
2016 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2040 template<
typename VT1 >
2041 friend inline void assign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2047 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2048 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
2050 if( right.rows() == 0UL ) {
2054 else if( right.columns() == 0UL ) {
2066 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
2081 template<
typename VT1
2085 static inline typename DisableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2086 selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2088 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2090 DVecScalarMultExpr::selectDefaultAssignKernel( y, x, A, scalar );
2092 DVecScalarMultExpr::selectBlasAssignKernel( y, x, A, scalar );
2107 template<
typename VT1
2111 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2112 selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2132 template<
typename VT1
2136 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2137 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2139 const size_t M( A.rows() );
2140 const size_t N( A.columns() );
2143 const size_t jend( N &
size_t(-2) );
2145 for(
size_t j=0UL; j<N; ++j ) {
2146 y[j] = x[0UL] * A(0UL,j);
2148 for(
size_t i=1UL; i<M; ++i ) {
2149 for(
size_t j=0UL; j<jend; j+=2UL ) {
2150 y[j ] += x[i] * A(i,j );
2151 y[j+1UL] += x[i] * A(i,j+1UL);
2154 y[jend] += x[i] * A(i,jend);
2157 for(
size_t j=0UL; j<N; ++j ) {
2177 template<
typename VT1
2181 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2182 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2184 typedef IntrinsicTrait<ElementType> IT;
2186 const size_t M( A.rows() );
2187 const size_t N( A.columns() );
2193 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2194 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2195 for(
size_t i=0UL; i<M; ++i ) {
2197 xmm1 = xmm1 + x1 * A.load(i,j );
2198 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2199 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2200 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
2201 xmm5 = xmm5 + x1 * A.load(i,j+IT::size*4UL);
2202 xmm6 = xmm6 + x1 * A.load(i,j+IT::size*5UL);
2203 xmm7 = xmm7 + x1 * A.load(i,j+IT::size*6UL);
2204 xmm8 = xmm8 + x1 * A.load(i,j+IT::size*7UL);
2206 y.store( j , xmm1*factor );
2207 y.store( j+IT::size , xmm2*factor );
2208 y.store( j+IT::size*2UL, xmm3*factor );
2209 y.store( j+IT::size*3UL, xmm4*factor );
2210 y.store( j+IT::size*4UL, xmm5*factor );
2211 y.store( j+IT::size*5UL, xmm6*factor );
2212 y.store( j+IT::size*6UL, xmm7*factor );
2213 y.store( j+IT::size*7UL, xmm8*factor );
2215 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2217 for(
size_t i=0UL; i<M; ++i ) {
2219 xmm1 = xmm1 + x1 * A.load(i,j );
2220 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2221 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2222 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
2224 y.store( j , xmm1*factor );
2225 y.store( j+IT::size , xmm2*factor );
2226 y.store( j+IT::size*2UL, xmm3*factor );
2227 y.store( j+IT::size*3UL, xmm4*factor );
2229 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
2231 for(
size_t i=0UL; i<M; ++i ) {
2233 xmm1 = xmm1 + x1 * A.load(i,j );
2234 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2235 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2237 y.store( j , xmm1*factor );
2238 y.store( j+IT::size , xmm2*factor );
2239 y.store( j+IT::size*2UL, xmm3*factor );
2241 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2243 for(
size_t i=0UL; i<M; ++i ) {
2245 xmm1 = xmm1 + x1 * A.load(i,j );
2246 xmm2 = xmm2 + x1 * A.load(i,j+IT::size);
2248 y.store( j , xmm1*factor );
2249 y.store( j+IT::size, xmm2*factor );
2253 for(
size_t i=0UL; i<M; ++i ) {
2254 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
2256 y.store( j, xmm1*factor );
2274 template<
typename VT1
2278 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2279 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2281 selectDefaultAssignKernel( y, x, A, scalar );
2300 template<
typename VT1
2304 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2305 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2307 using boost::numeric_cast;
2313 const int M ( numeric_cast<int>( A.rows() ) );
2314 const int N ( numeric_cast<int>( A.columns() ) );
2315 const int lda( numeric_cast<int>( A.spacing() ) );
2317 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2318 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
2338 template<
typename VT1
2342 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2343 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2345 using boost::numeric_cast;
2351 const int M ( numeric_cast<int>( A.rows() ) );
2352 const int N ( numeric_cast<int>( A.columns() ) );
2353 const int lda( numeric_cast<int>( A.spacing() ) );
2355 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2356 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
2376 template<
typename VT1
2380 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2381 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2383 using boost::numeric_cast;
2392 const int M ( numeric_cast<int>( A.rows() ) );
2393 const int N ( numeric_cast<int>( A.columns() ) );
2394 const int lda( numeric_cast<int>( A.spacing() ) );
2395 const complex<float> alpha( scalar );
2396 const complex<float> beta ( 0.0F, 0.0F );
2398 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2399 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2419 template<
typename VT1
2423 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2424 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2426 using boost::numeric_cast;
2435 const int M ( numeric_cast<int>( A.rows() ) );
2436 const int N ( numeric_cast<int>( A.columns() ) );
2437 const int lda( numeric_cast<int>( A.spacing() ) );
2438 const complex<double> alpha( scalar );
2439 const complex<double> beta ( 0.0, 0.0 );
2441 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2442 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2459 template<
typename VT1 >
2460 friend inline void assign( SparseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2487 template<
typename VT1 >
2488 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2494 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2495 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
2497 if( right.rows() == 0UL || right.columns() == 0UL ) {
2509 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
2524 template<
typename VT1
2528 static inline typename DisableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2529 selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2531 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2533 DVecScalarMultExpr::selectDefaultAddAssignKernel( y, x, A, scalar );
2535 DVecScalarMultExpr::selectBlasAddAssignKernel( y, x, A, scalar );
2550 template<
typename VT1
2554 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2555 selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2575 template<
typename VT1
2579 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2580 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2582 y.addAssign( x * A * scalar );
// Vectorized default addition assignment kernel; selected when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> IS satisfied (EnableIf).
//
// y      : target vector that is added to
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product
//
// The columns of A are processed in blocks of 8, 4, 3, 2 and finally 1 intrinsic
// vector(s) of IT::size elements. For each block the products of the broadcast
// vector element and the loaded matrix row segment are accumulated over all M
// rows, and the scaled sums are added to the corresponding segment of y.
//
// NOTE(review): this listing omits several source lines. The declarations of
// `j`, `factor` and `x1`, the accumulator declarations of the smaller blocks and
// the header of the final single-vector loop are not visible here. Presumably
// x1 is set( x[i] ) (as in the last loop) and factor is the broadcast scalar.
// NOTE(review): the accumulators are declared without an explicit initializer;
// this relies on IntrinsicType default-initializing to zero - TODO confirm.
2600 template<
typename VT1
2604 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2605 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2607 typedef IntrinsicTrait<ElementType> IT;
2609 const size_t M( A.rows() );
2610 const size_t N( A.columns() );
// Blocks of eight intrinsic vectors per iteration.
2616 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2617 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2618 for(
size_t i=0UL; i<M; ++i ) {
2620 xmm1 = xmm1 + x1 * A.load(i,j );
2621 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2622 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2623 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
2624 xmm5 = xmm5 + x1 * A.load(i,j+IT::size*4UL);
2625 xmm6 = xmm6 + x1 * A.load(i,j+IT::size*5UL);
2626 xmm7 = xmm7 + x1 * A.load(i,j+IT::size*6UL);
2627 xmm8 = xmm8 + x1 * A.load(i,j+IT::size*7UL);
// Add the scaled column sums to the current content of y.
2629 y.store( j , y.load(j ) + xmm1*factor );
2630 y.store( j+IT::size , y.load(j+IT::size ) + xmm2*factor );
2631 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) + xmm3*factor );
2632 y.store( j+IT::size*3UL, y.load(j+IT::size*3UL) + xmm4*factor );
2633 y.store( j+IT::size*4UL, y.load(j+IT::size*4UL) + xmm5*factor );
2634 y.store( j+IT::size*5UL, y.load(j+IT::size*5UL) + xmm6*factor );
2635 y.store( j+IT::size*6UL, y.load(j+IT::size*6UL) + xmm7*factor );
2636 y.store( j+IT::size*7UL, y.load(j+IT::size*7UL) + xmm8*factor );
// Blocks of four intrinsic vectors per iteration.
2638 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2640 for(
size_t i=0UL; i<M; ++i ) {
2642 xmm1 = xmm1 + x1 * A.load(i,j );
2643 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2644 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2645 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
2647 y.store( j , y.load(j ) + xmm1*factor );
2648 y.store( j+IT::size , y.load(j+IT::size ) + xmm2*factor );
2649 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) + xmm3*factor );
2650 y.store( j+IT::size*3UL, y.load(j+IT::size*3UL) + xmm4*factor );
// Blocks of three intrinsic vectors per iteration.
2652 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
2654 for(
size_t i=0UL; i<M; ++i ) {
2656 xmm1 = xmm1 + x1 * A.load(i,j );
2657 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2658 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2660 y.store( j , y.load(j ) + xmm1*factor );
2661 y.store( j+IT::size , y.load(j+IT::size ) + xmm2*factor );
2662 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) + xmm3*factor );
// Blocks of two intrinsic vectors per iteration.
2664 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2666 for(
size_t i=0UL; i<M; ++i ) {
2668 xmm1 = xmm1 + x1 * A.load(i,j );
2669 xmm2 = xmm2 + x1 * A.load(i,j+IT::size);
2671 y.store( j , y.load(j ) + xmm1*factor );
2672 y.store( j+IT::size, y.load(j+IT::size) + xmm2*factor );
// Single intrinsic vector (the enclosing loop header is not part of this listing).
2676 for(
size_t i=0UL; i<M; ++i ) {
2677 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
2679 y.store( j, y.load(j) + xmm1*factor );
// BLAS addition assignment kernel, fallback overload: selected when
// UseDefaultKernel<VT1,VT2,MT1,ST2> is satisfied, i.e. no BLAS routine is used.
//
// y      : target vector that is added to
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product
2698 template<
typename VT1
2702 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2703 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Simply forward to the default kernel.
2705 selectDefaultAddAssignKernel( y, x, A, scalar );
// BLAS addition assignment kernel for single precision operands; selected when
// UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> is satisfied.
//
// y      : target vector that is added to
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product (passed as alpha)
//
// Calls cblas_sgemv on the row-major matrix with the transpose flag, alpha = scalar
// and beta = 1, so the scaled product is accumulated onto the existing content of y.
// NOTE(review): this listing omits some source lines between the `using`
// declaration and the size conversions.
2724 template<
typename VT1
2728 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2729 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2731 using boost::numeric_cast;
// Convert the matrix dimensions and row spacing to the int type expected by CBLAS.
2737 const int M ( numeric_cast<int>( A.rows() ) );
2738 const int N ( numeric_cast<int>( A.columns() ) );
2739 const int lda( numeric_cast<int>( A.spacing() ) );
2741 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2742 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS addition assignment kernel for double precision operands; selected when
// UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> is satisfied.
//
// y      : target vector that is added to
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product (passed as alpha)
//
// Calls cblas_dgemv on the row-major matrix with the transpose flag, alpha = scalar
// and beta = 1, so the scaled product is accumulated onto the existing content of y.
// NOTE(review): this listing omits some source lines between the `using`
// declaration and the size conversions.
2762 template<
typename VT1
2766 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2767 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2769 using boost::numeric_cast;
// Convert the matrix dimensions and row spacing to the int type expected by CBLAS.
2775 const int M ( numeric_cast<int>( A.rows() ) );
2776 const int N ( numeric_cast<int>( A.columns() ) );
2777 const int lda( numeric_cast<int>( A.spacing() ) );
2779 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2780 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS addition assignment kernel for single precision complex operands; selected
// when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> is satisfied.
//
// y      : target vector that is added to
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product (converted to complex<float> alpha)
//
// Calls cblas_cgemv on the row-major matrix with the transpose flag, alpha = scalar
// and beta = (1,0), so the scaled product is accumulated onto the existing content
// of y. alpha and beta are handed to the routine by address.
// NOTE(review): this listing omits some source lines between the `using`
// declaration and the size conversions.
2800 template<
typename VT1
2804 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2805 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2807 using boost::numeric_cast;
// Convert the matrix dimensions and row spacing to the int type expected by CBLAS.
2816 const int M ( numeric_cast<int>( A.rows() ) );
2817 const int N ( numeric_cast<int>( A.columns() ) );
2818 const int lda( numeric_cast<int>( A.spacing() ) );
2819 const complex<float> alpha( scalar );
2820 const complex<float> beta ( 1.0F, 0.0F );
2822 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2823 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS addition assignment kernel for double precision complex operands; selected
// when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> is satisfied.
//
// y      : target vector that is added to
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product (converted to complex<double> alpha)
//
// Calls cblas_zgemv on the row-major matrix with the transpose flag, alpha = scalar
// and beta = (1,0), so the scaled product is accumulated onto the existing content
// of y. alpha and beta are handed to the routine by address.
// NOTE(review): this listing omits some source lines between the `using`
// declaration and the size conversions.
2843 template<
typename VT1
2847 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2848 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2850 using boost::numeric_cast;
// Convert the matrix dimensions and row spacing to the int type expected by CBLAS.
2859 const int M ( numeric_cast<int>( A.rows() ) );
2860 const int N ( numeric_cast<int>( A.columns() ) );
2861 const int lda( numeric_cast<int>( A.spacing() ) );
2862 const complex<double> alpha( scalar );
2863 const complex<double> beta ( 1.0, 0.0 );
2865 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2866 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of a scaled vector/matrix multiplication (rhs) to a dense
// vector (lhs) whose transpose flag is `true`.
//
// The two multiplication operands are read from rhs.vector_, the scaling factor
// from rhs.scalar_, and the actual computation is forwarded to
// selectSubAssignKernel().
//
// NOTE(review): this listing omits several source lines of the function. The
// definitions of `x` and `A` and the body of the empty-matrix branch are not
// visible here; presumably `x`/`A` are the (possibly evaluated) left/right operands.
2887 template<
typename VT1 >
2888 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2894 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2895 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special case: the right-hand side matrix has no rows or no columns.
2897 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Dispatch to the kernel selection with the unwrapped (~lhs) target vector.
2909 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel selection for the subtraction assignment; this overload participates only
// when UseSMPAssignKernel<VT1,VT2,MT1,ST2> is NOT satisfied (DisableIf).
//
// y      : target vector that is subtracted from
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product
//
// Chooses between the default kernel and the BLAS kernel.
// NOTE(review): the second half of the condition and the `else` line are not
// part of this listing, so the complete selection criterion is not visible here.
2924 template<
typename VT1
2928 static inline typename DisableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2929 selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// A matrix that is a computation but is not evaluated takes the default kernel.
2931 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2933 DVecScalarMultExpr::selectDefaultSubAssignKernel( y, x, A, scalar );
2935 DVecScalarMultExpr::selectBlasSubAssignKernel( y, x, A, scalar );
// Kernel selection for the subtraction assignment; this overload participates only
// when UseSMPAssignKernel<VT1,VT2,MT1,ST2> IS satisfied (EnableIf).
// NOTE(review): only the signature is present in this listing; the body is not
// visible here.
2950 template<
typename VT1
2954 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2955 selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Default (non-vectorized) subtraction assignment kernel; selected when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> is NOT satisfied (DisableIf).
//
// y      : target vector that is subtracted from
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product
2975 template<
typename VT1
2979 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2980 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Re-express the operation as the expression x * A * scalar and subtract it from y.
2982 y.subAssign( x * A * scalar );
// Vectorized default subtraction assignment kernel; selected when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> IS satisfied (EnableIf).
//
// y      : target vector that is subtracted from
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product
//
// The columns of A are processed in blocks of 8, 4, 3, 2 and finally 1 intrinsic
// vector(s) of IT::size elements. For each block the products of the broadcast
// vector element and the loaded matrix row segment are accumulated over all M
// rows, and the scaled sums are subtracted from the corresponding segment of y.
//
// NOTE(review): this listing omits several source lines. The declarations of
// `j`, `factor` and `x1`, the accumulator declarations of the smaller blocks and
// the header of the final single-vector loop are not visible here. Presumably
// x1 is set( x[i] ) (as in the last loop) and factor is the broadcast scalar.
// NOTE(review): the accumulators are declared without an explicit initializer;
// this relies on IntrinsicType default-initializing to zero - TODO confirm.
3000 template<
typename VT1
3004 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3005 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3007 typedef IntrinsicTrait<ElementType> IT;
3009 const size_t M( A.rows() );
3010 const size_t N( A.columns() );
// Blocks of eight intrinsic vectors per iteration.
3016 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
3017 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3018 for(
size_t i=0UL; i<M; ++i ) {
3020 xmm1 = xmm1 + x1 * A.load(i,j );
3021 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
3022 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
3023 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
3024 xmm5 = xmm5 + x1 * A.load(i,j+IT::size*4UL);
3025 xmm6 = xmm6 + x1 * A.load(i,j+IT::size*5UL);
3026 xmm7 = xmm7 + x1 * A.load(i,j+IT::size*6UL);
3027 xmm8 = xmm8 + x1 * A.load(i,j+IT::size*7UL);
// Subtract the scaled column sums from the current content of y.
3029 y.store( j , y.load(j ) - xmm1*factor );
3030 y.store( j+IT::size , y.load(j+IT::size ) - xmm2*factor );
3031 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) - xmm3*factor );
3032 y.store( j+IT::size*3UL, y.load(j+IT::size*3UL) - xmm4*factor );
3033 y.store( j+IT::size*4UL, y.load(j+IT::size*4UL) - xmm5*factor );
3034 y.store( j+IT::size*5UL, y.load(j+IT::size*5UL) - xmm6*factor );
3035 y.store( j+IT::size*6UL, y.load(j+IT::size*6UL) - xmm7*factor );
3036 y.store( j+IT::size*7UL, y.load(j+IT::size*7UL) - xmm8*factor );
// Blocks of four intrinsic vectors per iteration.
3038 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
3040 for(
size_t i=0UL; i<M; ++i ) {
3042 xmm1 = xmm1 + x1 * A.load(i,j );
3043 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
3044 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
3045 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
3047 y.store( j , y.load(j ) - xmm1*factor );
3048 y.store( j+IT::size , y.load(j+IT::size ) - xmm2*factor );
3049 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) - xmm3*factor );
3050 y.store( j+IT::size*3UL, y.load(j+IT::size*3UL) - xmm4*factor );
// Blocks of three intrinsic vectors per iteration.
3052 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
3054 for(
size_t i=0UL; i<M; ++i ) {
3056 xmm1 = xmm1 + x1 * A.load(i,j );
3057 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
3058 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
3060 y.store( j , y.load(j ) - xmm1*factor );
3061 y.store( j+IT::size , y.load(j+IT::size ) - xmm2*factor );
3062 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) - xmm3*factor );
// Blocks of two intrinsic vectors per iteration.
3064 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
3066 for(
size_t i=0UL; i<M; ++i ) {
3068 xmm1 = xmm1 + x1 * A.load(i,j );
3069 xmm2 = xmm2 + x1 * A.load(i,j+IT::size);
3071 y.store( j , y.load(j ) - xmm1*factor );
3072 y.store( j+IT::size, y.load(j+IT::size) - xmm2*factor );
// Single intrinsic vector (the enclosing loop header is not part of this listing).
3076 for(
size_t i=0UL; i<M; ++i ) {
3077 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
3079 y.store( j, y.load(j) - xmm1*factor );
// BLAS subtraction assignment kernel, fallback overload: selected when
// UseDefaultKernel<VT1,VT2,MT1,ST2> is satisfied, i.e. no BLAS routine is used.
//
// y      : target vector that is subtracted from
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product
3098 template<
typename VT1
3102 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3103 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Simply forward to the default kernel.
3105 selectDefaultSubAssignKernel( y, x, A, scalar );
// BLAS subtraction assignment kernel for single precision operands; selected when
// UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> is satisfied.
//
// y      : target vector that is subtracted from
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product (its negation is passed as alpha)
//
// Calls cblas_sgemv on the row-major matrix with the transpose flag, alpha = -scalar
// and beta = 1, so the scaled product is subtracted from the existing content of y.
// NOTE(review): this listing omits some source lines between the `using`
// declaration and the size conversions.
3124 template<
typename VT1
3128 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
3129 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3131 using boost::numeric_cast;
// Convert the matrix dimensions and row spacing to the int type expected by CBLAS.
3137 const int M ( numeric_cast<int>( A.rows() ) );
3138 const int N ( numeric_cast<int>( A.columns() ) );
3139 const int lda( numeric_cast<int>( A.spacing() ) );
3141 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, -scalar,
3142 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS subtraction assignment kernel for double precision operands; selected when
// UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> is satisfied.
//
// y      : target vector that is subtracted from
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product (its negation is passed as alpha)
//
// Calls cblas_dgemv on the row-major matrix with the transpose flag, alpha = -scalar
// and beta = 1, so the scaled product is subtracted from the existing content of y.
// NOTE(review): this listing omits some source lines between the `using`
// declaration and the size conversions.
3162 template<
typename VT1
3166 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
3167 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3169 using boost::numeric_cast;
// Convert the matrix dimensions and row spacing to the int type expected by CBLAS.
3175 const int M ( numeric_cast<int>( A.rows() ) );
3176 const int N ( numeric_cast<int>( A.columns() ) );
3177 const int lda( numeric_cast<int>( A.spacing() ) );
3179 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, -scalar,
3180 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS subtraction assignment kernel for single precision complex operands;
// selected when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> is satisfied.
//
// y      : target vector that is subtracted from
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product (negated and converted to complex<float>)
//
// Calls cblas_cgemv on the row-major matrix with the transpose flag, alpha = -scalar
// and beta = (1,0), so the scaled product is subtracted from the existing content
// of y. alpha and beta are handed to the routine by address.
// NOTE(review): this listing omits some source lines between the `using`
// declaration and the size conversions.
3200 template<
typename VT1
3204 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
3205 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3207 using boost::numeric_cast;
// Convert the matrix dimensions and row spacing to the int type expected by CBLAS.
3216 const int M ( numeric_cast<int>( A.rows() ) );
3217 const int N ( numeric_cast<int>( A.columns() ) );
3218 const int lda( numeric_cast<int>( A.spacing() ) );
3219 const complex<float> alpha( -scalar );
3220 const complex<float> beta ( 1.0F, 0.0F );
3222 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
3223 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS subtraction assignment kernel for double precision complex operands;
// selected when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> is satisfied.
//
// y      : target vector that is subtracted from
// x      : left-hand side vector operand
// A      : right-hand side matrix operand
// scalar : scaling factor of the product (negated and converted to complex<double>)
//
// Calls cblas_zgemv on the row-major matrix with the transpose flag, alpha = -scalar
// and beta = (1,0), so the scaled product is subtracted from the existing content
// of y. alpha and beta are handed to the routine by address.
// NOTE(review): this listing omits some source lines between the `using`
// declaration and the size conversions.
3243 template<
typename VT1
3247 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
3248 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3250 using boost::numeric_cast;
// Convert the matrix dimensions and row spacing to the int type expected by CBLAS.
3259 const int M ( numeric_cast<int>( A.rows() ) );
3260 const int N ( numeric_cast<int>( A.columns() ) );
3261 const int lda( numeric_cast<int>( A.spacing() ) );
3262 const complex<double> alpha( -scalar );
3263 const complex<double> beta ( 1.0, 0.0 );
3265 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
3266 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Multiplication assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs)
// whose transpose flag is `true`.
// NOTE(review): only the signature is present in this listing; the function
// body is not visible here.
3287 template<
typename VT1 >
3288 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Free function returning a TDVecDMatMultExpr<T1,T2>; disabled (DisableIf) when
// T2 is a matrix/matrix multiplication expression.
// NOTE(review): the function name and parameter list are not part of this
// listing; presumably this is the vector/matrix multiplication operator taking
// `vec` and `mat` - confirm against the full source.
//
// Throws std::invalid_argument if the size of the vector does not match the
// number of rows of the matrix.
3361 template<
typename T1
3363 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecDMatMultExpr<T1,T2> >::Type
3368 if( (~vec).
size() != (~mat).
rows() )
3369 throw std::invalid_argument(
"Vector and matrix sizes do not match" );
// Free function overload that is enabled (EnableIf) only when T2 is a
// matrix/matrix multiplication expression; its return type is taken from
// MultExprTrait<T1,T2>.
// NOTE(review): the function name, parameter list and body are not part of this
// listing; presumably this is the vector/matrix multiplication operator for a
// matrix product on the right-hand side - confirm against the full source.
3397 template<
typename T1
3400 inline const typename EnableIf< IsMatMatMultExpr<T2>, MultExprTrait<T1,T2> >::Type::Type
// Expression trait fragment parameterized on a vector type VT, a matrix type MT
// and an alignment flag AF. Its nested `Type` is the result of multiplying VT
// with the submatrix type of `const MT`.
// NOTE(review): the name of the specialized trait is not part of this listing;
// presumably a subvector trait specialization for TDVecDMatMultExpr - confirm.
3420 template<
typename VT,
typename MT,
bool AF >
3425 typedef typename MultExprTrait< VT, typename SubmatrixExprTrait<const MT,AF>::Type >::Type Type;
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:376
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4579
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
MT::ResultType MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:112
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:4075
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:247
MT::CompositeType MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:116
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:151
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:386
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:197
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:342
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2384
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:249
Header file for the DenseVector base class.
Header file for the Computation base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Header file for the RequiresEvaluation type trait.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:295
Header file for the VecScalarMultExpr base class.
SelectType< evaluateVector, const VRT, VCT >::Type LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:257
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
TDVecDMatMultExpr< VT, MT > This
Type of this TDVecDMatMultExpr instance.
Definition: TDVecDMatMultExpr.h:242
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Constraint on the data type.
Header file for the MultExprTrait class template.
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:121
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:251
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:251
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:244
Header file for the multiplication trait.
const size_t SMP_TDVECDMATMULT_THRESHOLD
SMP dense vector/row-major dense matrix multiplication threshold. This threshold represents the system...
Definition: Thresholds.h:165
Header file for the dense vector SMP implementation.
Header file for the IsDouble type trait.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDVecDMatMultExpr.h:246
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:354
TDVecDMatMultExpr(const VT &vec, const MT &mat)
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:280
Header file for the IsMatMatMultExpr type trait class.
Header file for the IsBlasCompatible type trait.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Constraints on the storage order of matrix types.
Constraint on the data type.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2382
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:366
void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:269
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
size_t size() const
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:322
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:91
Header file for the IsNumeric type trait.
VT::ResultType VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:111
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
System settings for the BLAS mode.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
VRT::ElementType VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:113
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:254
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
LeftOperand leftOperand() const
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:332
MRT::ElementType MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:114
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:239
Header file for the TVecMatMultExpr base class.
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:397
Expression object for transpose dense vector-dense matrix multiplications. The TDVecDMatMultExpr class...
Definition: Forward.h:129
ResultType::ElementType ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:245
SelectType< evaluateMatrix, const MRT, MCT >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:260
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
Header file for all intrinsic functionality.
const size_t end_
End of the unrolled calculation loop.
Definition: TDVecDMatMultExpr.h:398
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:248
Header file for the sparse vector SMP implementation.
const size_t TDVECDMATMULT_THRESHOLD
Dense vector/row-major dense matrix multiplication threshold. This setting specifies the threshold bet...
Definition: Thresholds.h:91
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
VT::CompositeType VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:115
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2379
Header file for basic type definitions.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a row dense or sparse vector type (i...
Definition: TransposeFlag.h:81
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:248
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:396
Constraint on the data type.
Size type of the Blaze library.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
size_t rows(const Matrix< MT, SO > &m)
Returns the current number of rows of the matrix.
Definition: Matrix.h:138
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
MultTrait< VRT, MRT >::Type ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:243
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.