35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
103 template<
typename VT
105 class TDVecTDMatMultExpr :
public DenseVector< TDVecTDMatMultExpr<VT,MT>, true >
106 ,
private TVecMatMultExpr
107 ,
private Computation
// Compile-time switch used with EnableIf/DisableIf to pick a select*AssignKernel overload.
// `value` is non-zero whenever the enclosing class flags either operand for evaluation
// (evaluateVector or evaluateMatrix). T1, T2 and T3 are not referenced by the visible
// definition.
135 template<
typename T1,
typename T2,
typename T3 >
136 struct UseSMPAssignKernel {
137 enum { value = evaluateVector || evaluateMatrix };
// Compile-time test for the single precision BLAS kernel (the cblas_sgemv overloads).
// `value` is non-zero when all three types are vectorizable and each of their element
// types satisfies IsFloat.
148 template<
typename T1,
typename T2,
typename T3 >
149 struct UseSinglePrecisionKernel {
150 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
151 IsFloat<typename T1::ElementType>::value &&
152 IsFloat<typename T2::ElementType>::value &&
153 IsFloat<typename T3::ElementType>::value };
// Compile-time test for the double precision BLAS kernel (the cblas_dgemv overloads).
// `value` is non-zero when all three types are vectorizable and each of their element
// types satisfies IsDouble.
164 template<
typename T1,
typename T2,
typename T3 >
165 struct UseDoublePrecisionKernel {
166 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
167 IsDouble<typename T1::ElementType>::value &&
168 IsDouble<typename T2::ElementType>::value &&
169 IsDouble<typename T3::ElementType>::value };
// Compile-time test for the single precision complex BLAS kernel (the cblas_cgemv
// overloads). `value` is non-zero when all three types are vectorizable and each of
// their element types is exactly complex<float>.
180 template<
typename T1,
typename T2,
typename T3 >
181 struct UseSinglePrecisionComplexKernel {
182 typedef complex<float> Type;
183 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
184 IsSame<typename T1::ElementType,Type>::value &&
185 IsSame<typename T2::ElementType,Type>::value &&
186 IsSame<typename T3::ElementType,Type>::value };
// Compile-time test for the double precision complex BLAS kernel (the cblas_zgemv
// overloads). `value` is non-zero when all three types are vectorizable and each of
// their element types is exactly complex<double>.
197 template<
typename T1,
typename T2,
typename T3 >
198 struct UseDoublePrecisionComplexKernel {
199 typedef complex<double> Type;
200 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
201 IsSame<typename T1::ElementType,Type>::value &&
202 IsSame<typename T2::ElementType,Type>::value &&
203 IsSame<typename T3::ElementType,Type>::value };
// Compile-time test for falling back to the default (non-BLAS) kernel.
// `value` is non-zero when BLAZE_BLAS_MODE is off, or when none of the four BLAS kernel
// tests (single, double, single complex, double complex) matches T1, T2 and T3.
213 template<
typename T1,
typename T2,
typename T3 >
214 struct UseDefaultKernel {
215 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
216 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
217 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
218 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time test for the vectorized default kernel.
// `value` is non-zero when all three types are vectorizable, share one element type, and
// IntrinsicTrait reports both addition and multiplication for that element type.
229 template<
typename T1,
typename T2,
typename T3 >
230 struct UseVectorizedDefaultKernel {
231 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
232 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
233 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
234 IntrinsicTrait<typename T1::ElementType>::addition &&
235 IntrinsicTrait<typename T1::ElementType>::multiplication };
265 enum { vectorizable = VT::vectorizable && MT::vectorizable &&
271 enum { smpAssignable = !evaluateVector && !evaluateMatrix };
300 if(
mat_.rows() != 0UL ) {
302 for(
size_t j=1UL; j<
end_; j+=2UL ) {
305 if( end_ < mat_.rows() ) {
323 return mat_.columns();
353 template<
typename T >
355 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
365 template<
typename T >
367 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
377 return vec_.isAligned() &&
mat_.isAligned();
414 template<
typename VT1 >
421 if( rhs.mat_.rows() == 0UL ) {
425 else if( rhs.mat_.columns() == 0UL ) {
437 TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
// Kernel dispatcher for the assignment: forwards (y, x, A) either to
// selectDefaultAssignKernel or to selectBlasAssignKernel.
// NOTE(review): the return type and the condition choosing between the two calls are not
// part of this excerpt; the sibling add/sub dispatchers test
// IsComputation<MT>::value && !evaluateMatrix - confirm the same applies here.
453 template<
typename VT1
457 selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
461 TDVecTDMatMultExpr::selectDefaultAssignKernel( y, x, A );
463 TDVecTDMatMultExpr::selectBlasAssignKernel( y, x, A );
479 template<
typename VT1
482 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1> >::Type
483 selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
504 template<
typename VT1
507 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
508 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Vectorized default kernel for the assignment (enabled via EnableIf when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> holds).
// Columns of A are processed in blocks of 8, 4, 3 and 2. For each column the inner loop
// walks the M rows in steps of IT::size, accumulating x1 * A.load(i,j+k) into one
// intrinsic accumulator per column; sum() of that accumulator is stored to y[j+k].
// NOTE(review): several lines are missing from this excerpt (declaration of j and x1, the
// accumulators of the smaller blocks, the stores to y[j], closing braces). x1 is
// presumably x.load(i), as in the last row loop below - confirm.
// NOTE(review): the row loops run while i<M in steps of IT::size, so they appear to rely
// on the storage of A and x being padded to a multiple of IT::size - confirm.
529 template<
typename VT1
532 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
533 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
535 typedef IntrinsicTrait<ElementType> IT;
537 const size_t M( A.rows() );
538 const size_t N( A.columns() );
// Blocks of 8 columns.
542 for( ; (j+8UL) <= N; j+=8UL ) {
543 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
544 for(
size_t i=0UL; i<M; i+=IT::size ) {
546 xmm1 = xmm1 + x1 * A.load(i,j );
547 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
548 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
549 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
550 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
551 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
552 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
553 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
556 y[j+1UL] =
sum( xmm2 );
557 y[j+2UL] =
sum( xmm3 );
558 y[j+3UL] =
sum( xmm4 );
559 y[j+4UL] =
sum( xmm5 );
560 y[j+5UL] =
sum( xmm6 );
561 y[j+6UL] =
sum( xmm7 );
562 y[j+7UL] =
sum( xmm8 );
// Blocks of 4 columns.
564 for( ; (j+4UL) <= N; j+=4UL ) {
566 for(
size_t i=0UL; i<M; i+=IT::size ) {
568 xmm1 = xmm1 + x1 * A.load(i,j );
569 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
570 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
571 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
574 y[j+1UL] =
sum( xmm2 );
575 y[j+2UL] =
sum( xmm3 );
576 y[j+3UL] =
sum( xmm4 );
// Blocks of 3 columns.
578 for( ; (j+3UL) <= N; j+=3UL ) {
580 for(
size_t i=0UL; i<M; i+=IT::size ) {
582 xmm1 = xmm1 + x1 * A.load(i,j );
583 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
584 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
587 y[j+1UL] =
sum( xmm2 );
588 y[j+2UL] =
sum( xmm3 );
// Blocks of 2 columns.
590 for( ; (j+2UL) <= N; j+=2UL ) {
592 for(
size_t i=0UL; i<M; i+=IT::size ) {
594 xmm1 = xmm1 + x1 * A.load(i,j );
595 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
598 y[j+1UL] =
sum( xmm2 );
// Single column j (the statement guarding this loop is not part of this excerpt).
602 for(
size_t i=0UL; i<M; i+=IT::size ) {
603 xmm1 = xmm1 + A.load(i,j) * x.load(i);
// BLAS assignment kernel, fallback overload: enabled via EnableIf when
// UseDefaultKernel<VT1,VT2,MT1> holds, it simply forwards to selectDefaultAssignKernel.
625 template<
typename VT1
628 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
629 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
631 selectDefaultAssignKernel( y, x, A );
// BLAS assignment kernel for float elements (enabled via EnableIf when
// UseSinglePrecisionKernel<VT1,VT2,MT1> holds).
// The product is issued as a transposed GEMV on the column-major matrix: cblas_sgemv
// computes y := alpha * A^T * x + beta * y, here with alpha = 1 and beta = 0, so y is
// overwritten.
651 template<
typename VT1
654 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
655 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
657 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
663 const int M ( numeric_cast<int>( A.rows() ) );
664 const int N ( numeric_cast<int>( A.columns() ) );
// A.spacing() is passed as the leading dimension (lda) of the matrix.
665 const int lda( numeric_cast<int>( A.spacing() ) );
667 cblas_sgemv( CblasColMajor, CblasTrans, M, N, 1.0F,
668 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// BLAS assignment kernel for double elements (enabled via EnableIf when
// UseDoublePrecisionKernel<VT1,VT2,MT1> holds).
// The product is issued as a transposed GEMV on the column-major matrix: cblas_dgemv
// computes y := alpha * A^T * x + beta * y, here with alpha = 1 and beta = 0, so y is
// overwritten.
689 template<
typename VT1
692 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
693 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
695 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
701 const int M ( numeric_cast<int>( A.rows() ) );
702 const int N ( numeric_cast<int>( A.columns() ) );
// A.spacing() is passed as the leading dimension (lda) of the matrix.
703 const int lda( numeric_cast<int>( A.spacing() ) );
705 cblas_dgemv( CblasColMajor, CblasTrans, M, N, 1.0,
706 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// BLAS assignment kernel for complex<float> elements (enabled via EnableIf when
// UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds).
// cblas_cgemv computes y := alpha * A^T * x + beta * y on the column-major matrix, here
// with alpha = (1,0) and beta = (0,0), so y is overwritten.
727 template<
typename VT1
730 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
731 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
733 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
742 const int M ( numeric_cast<int>( A.rows() ) );
743 const int N ( numeric_cast<int>( A.columns() ) );
744 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex GEMV routines take alpha and beta by address.
745 const complex<float> alpha( 1.0F, 0.0F );
746 const complex<float> beta ( 0.0F, 0.0F );
748 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
749 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS assignment kernel for complex<double> elements (enabled via EnableIf when
// UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds).
// cblas_zgemv computes y := alpha * A^T * x + beta * y on the column-major matrix, here
// with alpha = (1,0) and beta = (0,0), so y is overwritten.
770 template<
typename VT1
773 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
774 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
776 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
785 const int M ( numeric_cast<int>( A.rows() ) );
786 const int N ( numeric_cast<int>( A.columns() ) );
787 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex GEMV routines take alpha and beta by address.
788 const complex<double> alpha( 1.0, 0.0 );
789 const complex<double> beta ( 0.0, 0.0 );
791 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
792 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
811 template<
typename VT1 >
841 template<
typename VT1 >
848 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
860 TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
// Kernel dispatcher for the addition assignment (overload active via DisableIf when
// UseSMPAssignKernel<VT1,VT2,MT1> does not hold).
// Forwards to selectDefaultAddAssignKernel when MT is a computation that is not evaluated
// up front, or when a second condition holds that is not part of this excerpt; the
// selectBlasAddAssignKernel call is presumably the else branch - confirm.
876 template<
typename VT1
879 static inline typename DisableIf< UseSMPAssignKernel<VT1,VT2,MT1> >::Type
880 selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
882 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
884 TDVecTDMatMultExpr::selectDefaultAddAssignKernel( y, x, A );
886 TDVecTDMatMultExpr::selectBlasAddAssignKernel( y, x, A );
902 template<
typename VT1
905 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1> >::Type
906 selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Non-vectorized default kernel for the addition assignment (overload active via
// DisableIf when UseVectorizedDefaultKernel<VT1,VT2,MT1> does not hold): hands the
// expression x * A to y.addAssign().
927 template<
typename VT1
930 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
931 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
933 y.addAssign( x * A );
// Vectorized default kernel for the addition assignment (enabled via EnableIf when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> holds).
// Columns of A are processed in blocks of 8, 4, 3 and 2, then singly. For each column the
// inner loop walks the M rows in steps of IT::size, accumulating x1 * A.load(i,j+k) into
// one intrinsic accumulator per column; sum() of that accumulator is added to y[j+k].
// NOTE(review): several lines are missing from this excerpt (declaration of j and x1, the
// accumulators of the smaller blocks, closing braces). x1 is presumably x.load(i), as in
// the last row loop below - confirm.
// NOTE(review): the row loops run while i<M in steps of IT::size, so they appear to rely
// on the storage of A and x being padded to a multiple of IT::size - confirm.
952 template<
typename VT1
955 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
956 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
958 typedef IntrinsicTrait<ElementType> IT;
960 const size_t M( A.rows() );
961 const size_t N( A.columns() );
// Blocks of 8 columns.
965 for( ; (j+8UL) <= N; j+=8UL ) {
966 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
967 for(
size_t i=0UL; i<M; i+=IT::size ) {
969 xmm1 = xmm1 + x1 * A.load(i,j );
970 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
971 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
972 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
973 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
974 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
975 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
976 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
978 y[j ] +=
sum( xmm1 );
979 y[j+1UL] +=
sum( xmm2 );
980 y[j+2UL] +=
sum( xmm3 );
981 y[j+3UL] +=
sum( xmm4 );
982 y[j+4UL] +=
sum( xmm5 );
983 y[j+5UL] +=
sum( xmm6 );
984 y[j+6UL] +=
sum( xmm7 );
985 y[j+7UL] +=
sum( xmm8 );
// Blocks of 4 columns.
987 for( ; (j+4UL) <= N; j+=4UL ) {
989 for(
size_t i=0UL; i<M; i+=IT::size ) {
991 xmm1 = xmm1 + x1 * A.load(i,j );
992 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
993 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
994 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
996 y[j ] +=
sum( xmm1 );
997 y[j+1UL] +=
sum( xmm2 );
998 y[j+2UL] +=
sum( xmm3 );
999 y[j+3UL] +=
sum( xmm4 );
// Blocks of 3 columns.
1001 for( ; (j+3UL) <= N; j+=3UL ) {
1003 for(
size_t i=0UL; i<M; i+=IT::size ) {
1005 xmm1 = xmm1 + x1 * A.load(i,j );
1006 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1007 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1009 y[j ] +=
sum( xmm1 );
1010 y[j+1UL] +=
sum( xmm2 );
1011 y[j+2UL] +=
sum( xmm3 );
// Blocks of 2 columns.
1013 for( ; (j+2UL) <= N; j+=2UL ) {
1015 for(
size_t i=0UL; i<M; i+=IT::size ) {
1017 xmm1 = xmm1 + x1 * A.load(i,j );
1018 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1020 y[j ] +=
sum( xmm1 );
1021 y[j+1UL] +=
sum( xmm2 );
// Single column j (the statement guarding this loop is not part of this excerpt).
1025 for(
size_t i=0UL; i<M; i+=IT::size ) {
1026 xmm1 = xmm1 + A.load(i,j) * x.load(i);
1028 y[j] +=
sum( xmm1 );
// BLAS addition assignment kernel, fallback overload: enabled via EnableIf when
// UseDefaultKernel<VT1,VT2,MT1> holds, it simply forwards to selectDefaultAddAssignKernel.
1048 template<
typename VT1
1051 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1052 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1054 selectDefaultAddAssignKernel( y, x, A );
// BLAS addition assignment kernel for float elements (enabled via EnableIf when
// UseSinglePrecisionKernel<VT1,VT2,MT1> holds).
// cblas_sgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; with
// alpha = 1 and beta = 1 the product is accumulated onto the existing y.
1074 template<
typename VT1
1077 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1078 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1080 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
1086 const int M ( numeric_cast<int>( A.rows() ) );
1087 const int N ( numeric_cast<int>( A.columns() ) );
1088 const int lda( numeric_cast<int>( A.spacing() ) );
1090 cblas_sgemv( CblasColMajor, CblasTrans, M, N, 1.0F,
1091 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS addition assignment kernel for double elements (enabled via EnableIf when
// UseDoublePrecisionKernel<VT1,VT2,MT1> holds).
// cblas_dgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; with
// alpha = 1 and beta = 1 the product is accumulated onto the existing y.
1112 template<
typename VT1
1115 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1116 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1118 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
1124 const int M ( numeric_cast<int>( A.rows() ) );
1125 const int N ( numeric_cast<int>( A.columns() ) );
1126 const int lda( numeric_cast<int>( A.spacing() ) );
1128 cblas_dgemv( CblasColMajor, CblasTrans, M, N, 1.0,
1129 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS addition assignment kernel for complex<float> elements (enabled via EnableIf when
// UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds).
// cblas_cgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; with
// alpha = (1,0) and beta = (1,0) the product is accumulated onto the existing y.
1150 template<
typename VT1
1153 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1154 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1156 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
1165 const int M ( numeric_cast<int>( A.rows() ) );
1166 const int N ( numeric_cast<int>( A.columns() ) );
1167 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex GEMV routines take alpha and beta by address.
1168 const complex<float> alpha( 1.0F, 0.0F );
1169 const complex<float> beta ( 1.0F, 0.0F );
1171 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1172 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS addition assignment kernel for complex<double> elements (enabled via EnableIf when
// UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds).
// cblas_zgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; with
// alpha = (1,0) and beta = (1,0) the product is accumulated onto the existing y.
1193 template<
typename VT1
1196 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1197 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1199 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
1208 const int M ( numeric_cast<int>( A.rows() ) );
1209 const int N ( numeric_cast<int>( A.columns() ) );
1210 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex GEMV routines take alpha and beta by address.
1211 const complex<double> alpha( 1.0, 0.0 );
1212 const complex<double> beta ( 1.0, 0.0 );
1214 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1215 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1238 template<
typename VT1 >
1245 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1257 TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
// Kernel dispatcher for the subtraction assignment (overload active via DisableIf when
// UseSMPAssignKernel<VT1,VT2,MT1> does not hold).
// Forwards to selectDefaultSubAssignKernel when MT is a computation that is not evaluated
// up front, or when a second condition holds that is not part of this excerpt; the
// selectBlasSubAssignKernel call is presumably the else branch - confirm.
1273 template<
typename VT1
1276 static inline typename DisableIf< UseSMPAssignKernel<VT1,VT2,MT1> >::Type
1277 selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1279 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1281 TDVecTDMatMultExpr::selectDefaultSubAssignKernel( y, x, A );
1283 TDVecTDMatMultExpr::selectBlasSubAssignKernel( y, x, A );
1299 template<
typename VT1
1302 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1> >::Type
1303 selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Non-vectorized default kernel for the subtraction assignment (overload active via
// DisableIf when UseVectorizedDefaultKernel<VT1,VT2,MT1> does not hold): hands the
// expression x * A to y.subAssign().
1324 template<
typename VT1
1327 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1328 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1330 y.subAssign( x * A );
// Vectorized default kernel for the subtraction assignment (enabled via EnableIf when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> holds).
// Columns of A are processed in blocks of 8, 4, 3 and 2, then singly. For each column the
// inner loop walks the M rows in steps of IT::size, accumulating x1 * A.load(i,j+k) into
// one intrinsic accumulator per column; sum() of that accumulator is subtracted from
// y[j+k].
// NOTE(review): several lines are missing from this excerpt (declaration of j and x1, the
// accumulators of the smaller blocks, closing braces). x1 is presumably x.load(i), as in
// the last row loop below - confirm.
// NOTE(review): the row loops run while i<M in steps of IT::size, so they appear to rely
// on the storage of A and x being padded to a multiple of IT::size - confirm.
1349 template<
typename VT1
1352 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1353 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1355 typedef IntrinsicTrait<ElementType> IT;
1357 const size_t M( A.rows() );
1358 const size_t N( A.columns() );
// Blocks of 8 columns.
1362 for( ; (j+8UL) <= N; j+=8UL ) {
1363 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1364 for(
size_t i=0UL; i<M; i+=IT::size ) {
1366 xmm1 = xmm1 + x1 * A.load(i,j );
1367 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1368 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1369 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1370 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1371 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1372 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1373 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
1375 y[j ] -=
sum( xmm1 );
1376 y[j+1UL] -=
sum( xmm2 );
1377 y[j+2UL] -=
sum( xmm3 );
1378 y[j+3UL] -=
sum( xmm4 );
1379 y[j+4UL] -=
sum( xmm5 );
1380 y[j+5UL] -=
sum( xmm6 );
1381 y[j+6UL] -=
sum( xmm7 );
1382 y[j+7UL] -=
sum( xmm8 );
// Blocks of 4 columns.
1384 for( ; (j+4UL) <= N; j+=4UL ) {
1386 for(
size_t i=0UL; i<M; i+=IT::size ) {
1388 xmm1 = xmm1 + x1 * A.load(i,j );
1389 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1390 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1391 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1393 y[j ] -=
sum( xmm1 );
1394 y[j+1UL] -=
sum( xmm2 );
1395 y[j+2UL] -=
sum( xmm3 );
1396 y[j+3UL] -=
sum( xmm4 );
// Blocks of 3 columns.
1398 for( ; (j+3UL) <= N; j+=3UL ) {
1400 for(
size_t i=0UL; i<M; i+=IT::size ) {
1402 xmm1 = xmm1 + x1 * A.load(i,j );
1403 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1404 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1406 y[j ] -=
sum( xmm1 );
1407 y[j+1UL] -=
sum( xmm2 );
1408 y[j+2UL] -=
sum( xmm3 );
// Blocks of 2 columns.
1410 for( ; (j+2UL) <= N; j+=2UL ) {
1412 for(
size_t i=0UL; i<M; i+=IT::size ) {
1414 xmm1 = xmm1 + x1 * A.load(i,j );
1415 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1417 y[j ] -=
sum( xmm1 );
1418 y[j+1UL] -=
sum( xmm2 );
// Single column j (the statement guarding this loop is not part of this excerpt).
1422 for(
size_t i=0UL; i<M; i+=IT::size ) {
1423 xmm1 = xmm1 + A.load(i,j) * x.load(i);
1425 y[j] -=
sum( xmm1 );
// BLAS subtraction assignment kernel, fallback overload: enabled via EnableIf when
// UseDefaultKernel<VT1,VT2,MT1> holds, it simply forwards to selectDefaultSubAssignKernel.
1445 template<
typename VT1
1448 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1449 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1451 selectDefaultSubAssignKernel( y, x, A );
// BLAS subtraction assignment kernel for float elements (enabled via EnableIf when
// UseSinglePrecisionKernel<VT1,VT2,MT1> holds).
// cblas_sgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; with
// alpha = -1 and beta = 1 the product is subtracted from the existing y.
1471 template<
typename VT1
1474 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1475 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1477 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
1483 const int M ( numeric_cast<int>( A.rows() ) );
1484 const int N ( numeric_cast<int>( A.columns() ) );
1485 const int lda( numeric_cast<int>( A.spacing() ) );
1487 cblas_sgemv( CblasColMajor, CblasTrans, M, N, -1.0F,
1488 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS subtraction assignment kernel for double elements (enabled via EnableIf when
// UseDoublePrecisionKernel<VT1,VT2,MT1> holds).
// cblas_dgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; with
// alpha = -1 and beta = 1 the product is subtracted from the existing y.
1509 template<
typename VT1
1512 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1513 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1515 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
1521 const int M ( numeric_cast<int>( A.rows() ) );
1522 const int N ( numeric_cast<int>( A.columns() ) );
1523 const int lda( numeric_cast<int>( A.spacing() ) );
1525 cblas_dgemv( CblasColMajor, CblasTrans, M, N, -1.0,
1526 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS subtraction assignment kernel for complex<float> elements (enabled via EnableIf
// when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds).
// cblas_cgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; with
// alpha = (-1,0) and beta = (1,0) the product is subtracted from the existing y.
1547 template<
typename VT1
1550 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1551 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1553 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
1562 const int M ( numeric_cast<int>( A.rows() ) );
1563 const int N ( numeric_cast<int>( A.columns() ) );
1564 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex GEMV routines take alpha and beta by address.
1565 const complex<float> alpha( -1.0F, 0.0F );
1566 const complex<float> beta ( 1.0F, 0.0F );
1568 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1569 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS subtraction assignment kernel for complex<double> elements (enabled via EnableIf
// when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds).
// cblas_zgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; with
// alpha = (-1,0) and beta = (1,0) the product is subtracted from the existing y.
1590 template<
typename VT1
1593 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1594 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1596 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
1605 const int M ( numeric_cast<int>( A.rows() ) );
1606 const int N ( numeric_cast<int>( A.columns() ) );
1607 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex GEMV routines take alpha and beta by address.
1608 const complex<double> alpha( -1.0, 0.0 );
1609 const complex<double> beta ( 1.0, 0.0 );
1611 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1612 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1635 template<
typename VT1 >
1684 template<
typename VT
1688 :
public DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true >
1689 ,
private VecScalarMultExpr
1690 ,
private Computation
1694 typedef TDVecTDMatMultExpr<VT,MT> VMM;
1706 enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
1711 enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
1712 IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// Compile-time switch used with EnableIf/DisableIf to pick a select*AssignKernel overload
// of the scaled expression. `value` is non-zero whenever either operand is flagged for
// evaluation (evaluateVector or evaluateMatrix). T1 to T4 are not referenced by the
// visible definition.
1719 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1720 struct UseSMPAssignKernel {
1721 enum { value = evaluateVector || evaluateMatrix };
// Compile-time test for the single precision BLAS kernel of the scaled expression.
// `value` is non-zero when T1, T2 and T3 are vectorizable, each of their element types
// satisfies IsFloat, and T4 (instantiated with the scalar type by the kernels) is not a
// complex type.
1730 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1731 struct UseSinglePrecisionKernel {
1732 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1733 IsFloat<typename T1::ElementType>::value &&
1734 IsFloat<typename T2::ElementType>::value &&
1735 IsFloat<typename T3::ElementType>::value &&
1736 !IsComplex<T4>::value };
// Compile-time test for the double precision BLAS kernel of the scaled expression.
// `value` is non-zero when T1, T2 and T3 are vectorizable, each of their element types
// satisfies IsDouble, and T4 (instantiated with the scalar type by the kernels) is not a
// complex type.
1745 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1746 struct UseDoublePrecisionKernel {
1747 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1748 IsDouble<typename T1::ElementType>::value &&
1749 IsDouble<typename T2::ElementType>::value &&
1750 IsDouble<typename T3::ElementType>::value &&
1751 !IsComplex<T4>::value };
// Compile-time test for the single precision complex BLAS kernel of the scaled
// expression. `value` is non-zero when all three types are vectorizable and each of
// their element types is exactly complex<float>. The scalar type is not part of the test.
1760 template<
typename T1,
typename T2,
typename T3 >
1761 struct UseSinglePrecisionComplexKernel {
1762 typedef complex<float> Type;
1763 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1764 IsSame<typename T1::ElementType,Type>::value &&
1765 IsSame<typename T2::ElementType,Type>::value &&
1766 IsSame<typename T3::ElementType,Type>::value };
// Compile-time test for the double precision complex BLAS kernel of the scaled
// expression. `value` is non-zero when all three types are vectorizable and each of
// their element types is exactly complex<double>. The scalar type is not part of the test.
1775 template<
typename T1,
typename T2,
typename T3 >
1776 struct UseDoublePrecisionComplexKernel {
1777 typedef complex<double> Type;
1778 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1779 IsSame<typename T1::ElementType,Type>::value &&
1780 IsSame<typename T2::ElementType,Type>::value &&
1781 IsSame<typename T3::ElementType,Type>::value };
// Compile-time test for falling back to the default (non-BLAS) kernel of the scaled
// expression. `value` is non-zero when BLAZE_BLAS_MODE is off, or when none of the four
// BLAS kernel tests matches; only the two real-valued tests take the fourth type T4.
1789 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1790 struct UseDefaultKernel {
1791 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1792 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1793 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1794 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time test for the vectorized default kernel of the scaled expression.
// `value` is non-zero when T1, T2 and T3 are vectorizable, share one element type, T4 is
// that same type, and IntrinsicTrait reports both addition and multiplication for it.
1803 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1804 struct UseVectorizedDefaultKernel {
1805 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1806 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1807 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1808 IsSame<typename T1::ElementType,T4>::value &&
1809 IntrinsicTrait<typename T1::ElementType>::addition &&
1810 IntrinsicTrait<typename T1::ElementType>::multiplication };
1816 typedef DVecScalarMultExpr<VMM,ST,true>
This;
1817 typedef typename MultTrait<RES,ST>::Type
ResultType;
1820 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
1825 typedef const TDVecTDMatMultExpr<VT,MT>
LeftOperand;
1831 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
LT;
1834 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
RT;
1839 enum { vectorizable = VT::vectorizable && MT::vectorizable &&
1840 IsSame<VET,MET>::value &&
1841 IsSame<VET,ST>::value &&
1842 IntrinsicTrait<VET>::addition &&
1843 IntrinsicTrait<VET>::multiplication };
1846 enum { smpAssignable = !evaluateVector && !evaluateMatrix };
1855 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
1869 return vector_[index] * scalar_;
1878 inline size_t size()
const {
1879 return vector_.size();
1909 template<
typename T >
1910 inline bool canAlias(
const T* alias )
const {
1911 return vector_.canAlias( alias );
1921 template<
typename T >
1922 inline bool isAliased(
const T* alias )
const {
1923 return vector_.isAliased( alias );
1933 return vector_.isAligned();
1943 typename VMM::RightOperand A( vector_.rightOperand() );
1945 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1969 template<
typename VT1
1971 friend inline void assign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
1977 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
1978 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
1980 if( right.rows() == 0UL ) {
1984 else if( right.columns() == 0UL ) {
1996 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel dispatcher for the scaled assignment (overload active via DisableIf when
// UseSMPAssignKernel<VT1,VT2,MT1,ST2> does not hold).
// Forwards to selectDefaultAssignKernel when MT is a computation that is not evaluated
// up front, or when a second condition holds that is not part of this excerpt; the
// selectBlasAssignKernel call is presumably the else branch - confirm.
2011 template<
typename VT1
2015 static inline typename DisableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2016 selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2018 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2020 DVecScalarMultExpr::selectDefaultAssignKernel( y, x, A, scalar );
2022 DVecScalarMultExpr::selectBlasAssignKernel( y, x, A, scalar );
2037 template<
typename VT1
2041 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2042 selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Non-vectorized default kernel for the scaled assignment (overload active via DisableIf
// when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does not hold): hands the expression
// x * A * scalar to y.assign().
2062 template<
typename VT1
2066 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2067 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2069 y.assign( x * A * scalar );
// Vectorized default kernel for the scaled assignment (enabled via EnableIf when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds).
// Columns of A are processed in blocks of 8, 4, 3 and 2, then singly. For each column the
// inner loop walks the M rows in steps of IT::size, accumulating x1 * A.load(i,j+k) into
// one intrinsic accumulator per column; sum() of that accumulator times scalar is stored
// to y[j+k].
// NOTE(review): several lines are missing from this excerpt (declaration of j and x1, the
// accumulators of the smaller blocks, closing braces). x1 is presumably x.load(i), as in
// the last row loop below - confirm.
// NOTE(review): the row loops run while i<M in steps of IT::size, so they appear to rely
// on the storage of A and x being padded to a multiple of IT::size - confirm.
2087 template<
typename VT1
2091 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2092 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2094 typedef IntrinsicTrait<ElementType> IT;
2096 const size_t M( A.rows() );
2097 const size_t N( A.columns() );
// Blocks of 8 columns.
2101 for( ; (j+8UL) <= N; j+=8UL ) {
2102 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2103 for(
size_t i=0UL; i<M; i+=IT::size ) {
2105 xmm1 = xmm1 + x1 * A.load(i,j );
2106 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2107 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2108 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2109 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
2110 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
2111 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
2112 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
2114 y[j ] =
sum( xmm1 ) * scalar;
2115 y[j+1UL] =
sum( xmm2 ) * scalar;
2116 y[j+2UL] =
sum( xmm3 ) * scalar;
2117 y[j+3UL] =
sum( xmm4 ) * scalar;
2118 y[j+4UL] =
sum( xmm5 ) * scalar;
2119 y[j+5UL] =
sum( xmm6 ) * scalar;
2120 y[j+6UL] =
sum( xmm7 ) * scalar;
2121 y[j+7UL] =
sum( xmm8 ) * scalar;
// Blocks of 4 columns.
2123 for( ; (j+4UL) <= N; j+=4UL ) {
2125 for(
size_t i=0UL; i<M; i+=IT::size ) {
2127 xmm1 = xmm1 + x1 * A.load(i,j );
2128 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2129 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2130 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2132 y[j ] =
sum( xmm1 ) * scalar;
2133 y[j+1UL] =
sum( xmm2 ) * scalar;
2134 y[j+2UL] =
sum( xmm3 ) * scalar;
2135 y[j+3UL] =
sum( xmm4 ) * scalar;
// Blocks of 3 columns.
2137 for( ; (j+3UL) <= N; j+=3UL ) {
2139 for(
size_t i=0UL; i<M; i+=IT::size ) {
2141 xmm1 = xmm1 + x1 * A.load(i,j );
2142 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2143 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2145 y[j ] =
sum( xmm1 ) * scalar;
2146 y[j+1UL] =
sum( xmm2 ) * scalar;
2147 y[j+2UL] =
sum( xmm3 ) * scalar;
// Blocks of 2 columns.
2149 for( ; (j+2UL) <= N; j+=2UL ) {
2151 for(
size_t i=0UL; i<M; i+=IT::size ) {
2153 xmm1 = xmm1 + x1 * A.load(i,j );
2154 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2156 y[j ] =
sum( xmm1 ) * scalar;
2157 y[j+1UL] =
sum( xmm2 ) * scalar;
// Single column j (the statement guarding this loop is not part of this excerpt).
2161 for(
size_t i=0UL; i<M; i+=IT::size ) {
2162 xmm1 = xmm1 + A.load(i,j) * x.load(i);
2164 y[j] =
sum( xmm1 ) * scalar;
// BLAS kernel for the scaled assignment, fallback overload: enabled via EnableIf when
// UseDefaultKernel<VT1,VT2,MT1,ST2> holds, it simply forwards to
// selectDefaultAssignKernel.
2182 template<
typename VT1
2186 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2187 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2189 selectDefaultAssignKernel( y, x, A, scalar );
// BLAS kernel for the scaled assignment with float elements (enabled via EnableIf when
// UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> holds).
// cblas_sgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; the
// scalar is passed as alpha and beta = 0, so y is overwritten with the scaled product.
2208 template<
typename VT1
2212 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2213 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2215 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
2221 const int M ( numeric_cast<int>( A.rows() ) );
2222 const int N ( numeric_cast<int>( A.columns() ) );
2223 const int lda( numeric_cast<int>( A.spacing() ) );
2225 cblas_sgemv( CblasColMajor, CblasTrans, M, N, scalar,
2226 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// BLAS kernel for the scaled assignment with double elements (enabled via EnableIf when
// UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> holds).
// cblas_dgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; the
// scalar is passed as alpha and beta = 0, so y is overwritten with the scaled product.
2246 template<
typename VT1
2250 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2251 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2253 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
2259 const int M ( numeric_cast<int>( A.rows() ) );
2260 const int N ( numeric_cast<int>( A.columns() ) );
2261 const int lda( numeric_cast<int>( A.spacing() ) );
2263 cblas_dgemv( CblasColMajor, CblasTrans, M, N, scalar,
2264 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// BLAS kernel for the scaled assignment with complex<float> elements (enabled via
// EnableIf when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds).
// cblas_cgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; alpha
// is built from the scalar and beta = (0,0), so y is overwritten with the scaled product.
2285 template<
typename VT1
2289 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2290 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2292 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
2301 const int M ( numeric_cast<int>( A.rows() ) );
2302 const int N ( numeric_cast<int>( A.columns() ) );
2303 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex GEMV routines take alpha and beta by address.
2304 const complex<float> alpha( scalar );
2305 const complex<float> beta ( 0.0F, 0.0F );
2307 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2308 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS kernel for the scaled assignment with complex<double> elements (enabled via
// EnableIf when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds).
// cblas_zgemv computes y := alpha * A^T * x + beta * y on the column-major matrix; alpha
// is built from the scalar and beta = (0,0), so y is overwritten with the scaled product.
2329 template<
typename VT1
2333 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2334 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2336 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast is a range-checked conversion.
2345 const int M ( numeric_cast<int>( A.rows() ) );
2346 const int N ( numeric_cast<int>( A.columns() ) );
2347 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex GEMV routines take alpha and beta by address.
2348 const complex<double> alpha( scalar );
2349 const complex<double> beta ( 0.0, 0.0 );
2351 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2352 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2369 template<
typename VT1
2371 friend inline void assign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
2398 template<
typename VT1
2400 friend inline void addAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
2406 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2407 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
2409 if( right.rows() == 0UL || right.columns() == 0UL ) {
2421 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel dispatcher for the scaled addition assignment (overload active via DisableIf
// when UseSMPAssignKernel<VT1,VT2,MT1,ST2> does not hold).
// Forwards to selectDefaultAddAssignKernel when MT is a computation that is not evaluated
// up front, or when a second condition holds that is not part of this excerpt; the
// selectBlasAddAssignKernel call is presumably the else branch - confirm.
2436 template<
typename VT1
2440 static inline typename DisableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2441 selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2443 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2445 DVecScalarMultExpr::selectDefaultAddAssignKernel( y, x, A, scalar );
2447 DVecScalarMultExpr::selectBlasAddAssignKernel( y, x, A, scalar );
2462 template<
typename VT1
2466 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2467 selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Non-vectorized default kernel for the scaled addition assignment (overload active via
// DisableIf when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does not hold): hands the
// expression x * A * scalar to y.addAssign().
2487 template<
typename VT1
2491 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2492 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2494 y.addAssign( x * A * scalar );
// Vectorized default kernel for the scaled addition assignment (enabled via EnableIf when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds).
// Columns of A are processed in blocks of 8, 4, 3 and 2, then singly. For each column the
// inner loop walks the M rows in steps of IT::size, accumulating x1 * A.load(i,j+k) into
// one intrinsic accumulator per column; sum() of that accumulator times scalar is added
// to y[j+k].
// NOTE(review): several lines are missing from this excerpt (declaration of j and x1, the
// accumulators of the smaller blocks, closing braces). x1 is presumably x.load(i), as in
// the last row loop below - confirm.
// NOTE(review): the row loops run while i<M in steps of IT::size, so they appear to rely
// on the storage of A and x being padded to a multiple of IT::size - confirm.
2512 template<
typename VT1
2516 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2517 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2519 typedef IntrinsicTrait<ElementType> IT;
2521 const size_t M( A.rows() );
2522 const size_t N( A.columns() );
// Blocks of 8 columns.
2526 for( ; (j+8UL) <= N; j+=8UL ) {
2527 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2528 for(
size_t i=0UL; i<M; i+=IT::size ) {
2530 xmm1 = xmm1 + x1 * A.load(i,j );
2531 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2532 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2533 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2534 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
2535 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
2536 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
2537 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
2539 y[j ] +=
sum( xmm1 ) * scalar;
2540 y[j+1UL] +=
sum( xmm2 ) * scalar;
2541 y[j+2UL] +=
sum( xmm3 ) * scalar;
2542 y[j+3UL] +=
sum( xmm4 ) * scalar;
2543 y[j+4UL] +=
sum( xmm5 ) * scalar;
2544 y[j+5UL] +=
sum( xmm6 ) * scalar;
2545 y[j+6UL] +=
sum( xmm7 ) * scalar;
2546 y[j+7UL] +=
sum( xmm8 ) * scalar;
// Blocks of 4 columns.
2548 for( ; (j+4UL) <= N; j+=4UL ) {
2550 for(
size_t i=0UL; i<M; i+=IT::size ) {
2552 xmm1 = xmm1 + x1 * A.load(i,j );
2553 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2554 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2555 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2557 y[j ] +=
sum( xmm1 ) * scalar;
2558 y[j+1UL] +=
sum( xmm2 ) * scalar;
2559 y[j+2UL] +=
sum( xmm3 ) * scalar;
2560 y[j+3UL] +=
sum( xmm4 ) * scalar;
// Blocks of 3 columns.
2562 for( ; (j+3UL) <= N; j+=3UL ) {
2564 for(
size_t i=0UL; i<M; i+=IT::size ) {
2566 xmm1 = xmm1 + x1 * A.load(i,j );
2567 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2568 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2570 y[j ] +=
sum( xmm1 ) * scalar;
2571 y[j+1UL] +=
sum( xmm2 ) * scalar;
2572 y[j+2UL] +=
sum( xmm3 ) * scalar;
// Blocks of 2 columns.
2574 for( ; (j+2UL) <= N; j+=2UL ) {
2576 for(
size_t i=0UL; i<M; i+=IT::size ) {
2578 xmm1 = xmm1 + x1 * A.load(i,j );
2579 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2581 y[j ] +=
sum( xmm1 ) * scalar;
2582 y[j+1UL] +=
sum( xmm2 ) * scalar;
// Single column j (the statement guarding this loop is not part of this excerpt).
2586 for(
size_t i=0UL; i<M; i+=IT::size ) {
2587 xmm1 = xmm1 + A.load(i,j) * x.load(i);
2589 y[j] +=
sum( xmm1 ) * scalar;
// BLAS kernel for the scaled addition assignment, fallback overload: enabled via EnableIf
// when UseDefaultKernel<VT1,VT2,MT1,ST2> holds, it simply forwards to
// selectDefaultAddAssignKernel.
2608 template<
typename VT1
2612 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2613 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2615 selectDefaultAddAssignKernel( y, x, A, scalar );
// Single precision BLAS add-assign kernel, enabled (via EnableIf) when
// UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// y is updated in place; x and A are read through their data() pointers.
2634 template<
typename VT1
2638 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2639 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2641 using boost::numeric_cast;
// The CBLAS interface takes int arguments: rows, columns and the spacing of A
// (used as the leading dimension lda) are converted with boost::numeric_cast.
2647 const int M ( numeric_cast<int>( A.rows() ) );
2648 const int N ( numeric_cast<int>( A.columns() ) );
2649 const int lda( numeric_cast<int>( A.spacing() ) );
// Column-major storage with CblasTrans, alpha = scalar and beta = 1.0F:
// the scaled product is added to the current contents of y.
2651 cblas_sgemv( CblasColMajor, CblasTrans, M, N, scalar,
2652 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double precision BLAS add-assign kernel, enabled (via EnableIf) when
// UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// y is updated in place; x and A are read through their data() pointers.
2672 template<
typename VT1
2676 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2677 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2679 using boost::numeric_cast;
// The CBLAS interface takes int arguments: rows, columns and the spacing of A
// (used as the leading dimension lda) are converted with boost::numeric_cast.
2685 const int M ( numeric_cast<int>( A.rows() ) );
2686 const int N ( numeric_cast<int>( A.columns() ) );
2687 const int lda( numeric_cast<int>( A.spacing() ) );
// Column-major storage with CblasTrans, alpha = scalar and beta = 1.0:
// the scaled product is added to the current contents of y.
2689 cblas_dgemv( CblasColMajor, CblasTrans, M, N, scalar,
2690 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single precision complex BLAS add-assign kernel, enabled (via EnableIf) when
// UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds. Note that the scalar type
// is not part of the selection condition for this overload.
2711 template<
typename VT1
2715 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2716 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2718 using boost::numeric_cast;
// The CBLAS interface takes int arguments: rows, columns and the spacing of A
// (used as the leading dimension lda) are converted with boost::numeric_cast.
2727 const int M ( numeric_cast<int>( A.rows() ) );
2728 const int N ( numeric_cast<int>( A.columns() ) );
2729 const int lda( numeric_cast<int>( A.spacing() ) );
// cblas_cgemv receives alpha and beta by address, so both are materialized as
// complex<float> objects: alpha from the given scalar, beta as (1,0) so that
// the product is added to the current contents of y.
2730 const complex<float> alpha( scalar );
2731 const complex<float> beta ( 1.0F, 0.0F );
2733 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2734 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double precision complex BLAS add-assign kernel, enabled (via EnableIf) when
// UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds. Note that the scalar type
// is not part of the selection condition for this overload.
2755 template<
typename VT1
2759 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2760 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2762 using boost::numeric_cast;
// The CBLAS interface takes int arguments: rows, columns and the spacing of A
// (used as the leading dimension lda) are converted with boost::numeric_cast.
2771 const int M ( numeric_cast<int>( A.rows() ) );
2772 const int N ( numeric_cast<int>( A.columns() ) );
2773 const int lda( numeric_cast<int>( A.spacing() ) );
// cblas_zgemv receives alpha and beta by address, so both are materialized as
// complex<double> objects: alpha from the given scalar, beta as (1,0) so that
// the product is added to the current contents of y.
2774 const complex<double> alpha( scalar );
2775 const complex<double> beta ( 1.0, 0.0 );
2777 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2778 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of a scaled vector/matrix multiplication expression
// (rhs) to the dense vector lhs. The operands of the wrapped multiplication are
// fetched from rhs.vector_ and the work is delegated to selectSubAssignKernel()
// together with rhs.scalar_.
// NOTE(review): several lines of this function (including the declarations of
// x and A and the body of the empty-matrix check) are not visible in this
// listing -- confirm against the full header.
2799 template<
typename VT1
2801 friend inline void subAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
2807 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2808 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special case: the matrix operand has no rows or no columns.
2810 if( right.rows() == 0UL || right.columns() == 0UL ) {
2822 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel selection for the subtraction assignment; this overload is used when
// UseSMPAssignKernel<VT1,VT2,MT1,ST2> does NOT hold (DisableIf). It dispatches
// either to the default kernel or to the BLAS kernel.
// NOTE(review): the second operand of the condition and the else keyword are
// not visible in this listing; only the first criterion (the matrix operand is
// a computation that is not evaluated) can be read here.
2837 template<
typename VT1
2841 static inline typename DisableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2842 selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2844 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2846 DVecScalarMultExpr::selectDefaultSubAssignKernel( y, x, A, scalar );
2848 DVecScalarMultExpr::selectBlasSubAssignKernel( y, x, A, scalar );
// Kernel selection for the subtraction assignment; this overload is used when
// UseSMPAssignKernel<VT1,VT2,MT1,ST2> holds (EnableIf), i.e. it is the
// counterpart of the DisableIf overload with the same signature.
// NOTE(review): the function body is not visible in this listing.
2863 template<
typename VT1
2867 static inline typename EnableIf< UseSMPAssignKernel<VT1,VT2,MT1,ST2> >::Type
2868 selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Default (non-vectorized) subtraction assignment kernel, used when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does NOT hold (DisableIf).
// It rebuilds the scaled product expression x * A * scalar and hands it to the
// subAssign() member of the target vector y.
2888 template<
typename VT1
2892 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2893 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2895 y.subAssign( x * A * scalar );
// Vectorized default subtraction assignment kernel, used when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds (EnableIf).
// For every column j of A it accumulates the products of the loaded vector and
// matrix elements in intrinsic registers, reduces each register with sum(),
// scales the result by scalar and subtracts it from y[j]. Columns are handled
// in unrolled groups of 8, 4, 3 and 2, followed by a single-column step.
// NOTE(review): the declarations of j and x1, the closing braces and the header
// of the final single-column step are not visible in this listing. x1 is
// presumably x.load(i), matching the explicit x.load(i) in the last step --
// confirm against the full header.
// NOTE(review): all row loops advance i by IT::size up to M without a scalar
// remainder; this looks like it relies on padded storage -- confirm.
2913 template<
typename VT1
2917 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2918 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2920 typedef IntrinsicTrait<ElementType> IT;
2922 const size_t M( A.rows() );
2923 const size_t N( A.columns() );
// Groups of eight columns: one accumulator register per column.
2927 for( ; (j+8UL) <= N; j+=8UL ) {
2928 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2929 for(
size_t i=0UL; i<M; i+=IT::size ) {
2931 xmm1 = xmm1 + x1 * A.load(i,j );
2932 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2933 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2934 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2935 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
2936 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
2937 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
2938 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator, scaled and subtracted from y.
2940 y[j ] -=
sum( xmm1 ) * scalar;
2941 y[j+1UL] -=
sum( xmm2 ) * scalar;
2942 y[j+2UL] -=
sum( xmm3 ) * scalar;
2943 y[j+3UL] -=
sum( xmm4 ) * scalar;
2944 y[j+4UL] -=
sum( xmm5 ) * scalar;
2945 y[j+5UL] -=
sum( xmm6 ) * scalar;
2946 y[j+6UL] -=
sum( xmm7 ) * scalar;
2947 y[j+7UL] -=
sum( xmm8 ) * scalar;
// Groups of four columns.
2949 for( ; (j+4UL) <= N; j+=4UL ) {
2951 for(
size_t i=0UL; i<M; i+=IT::size ) {
2953 xmm1 = xmm1 + x1 * A.load(i,j );
2954 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2955 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2956 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2958 y[j ] -=
sum( xmm1 ) * scalar;
2959 y[j+1UL] -=
sum( xmm2 ) * scalar;
2960 y[j+2UL] -=
sum( xmm3 ) * scalar;
2961 y[j+3UL] -=
sum( xmm4 ) * scalar;
// Groups of three columns.
2963 for( ; (j+3UL) <= N; j+=3UL ) {
2965 for(
size_t i=0UL; i<M; i+=IT::size ) {
2967 xmm1 = xmm1 + x1 * A.load(i,j );
2968 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2969 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2971 y[j ] -=
sum( xmm1 ) * scalar;
2972 y[j+1UL] -=
sum( xmm2 ) * scalar;
2973 y[j+2UL] -=
sum( xmm3 ) * scalar;
// Groups of two columns.
2975 for( ; (j+2UL) <= N; j+=2UL ) {
2977 for(
size_t i=0UL; i<M; i+=IT::size ) {
2979 xmm1 = xmm1 + x1 * A.load(i,j );
2980 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2982 y[j ] -=
sum( xmm1 ) * scalar;
2983 y[j+1UL] -=
sum( xmm2 ) * scalar;
// Single remaining column: the vector elements are loaded directly here.
2987 for(
size_t i=0UL; i<M; i+=IT::size ) {
2988 xmm1 = xmm1 + A.load(i,j) * x.load(i);
2990 y[j] -=
sum( xmm1 ) * scalar;
// BLAS subtraction assignment kernel overload that is enabled (via EnableIf)
// when UseDefaultKernel<VT1,VT2,MT1,ST2> holds. It issues no BLAS call itself
// and simply forwards y, x, A and scalar to selectDefaultSubAssignKernel().
3010 template<
typename VT1
3014 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3015 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3017 selectDefaultSubAssignKernel( y, x, A, scalar );
// Single precision BLAS subtraction assignment kernel, enabled (via EnableIf)
// when UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// y is updated in place; x and A are read through their data() pointers.
3036 template<
typename VT1
3040 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
3041 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3043 using boost::numeric_cast;
// The CBLAS interface takes int arguments: rows, columns and the spacing of A
// (used as the leading dimension lda) are converted with boost::numeric_cast.
3049 const int M ( numeric_cast<int>( A.rows() ) );
3050 const int N ( numeric_cast<int>( A.columns() ) );
3051 const int lda( numeric_cast<int>( A.spacing() ) );
// The subtraction is expressed through the sign of alpha: alpha = -scalar and
// beta = 1.0F, so the scaled product is subtracted from the contents of y.
3053 cblas_sgemv( CblasColMajor, CblasTrans, M, N, -scalar,
3054 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double precision BLAS subtraction assignment kernel, enabled (via EnableIf)
// when UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// y is updated in place; x and A are read through their data() pointers.
3074 template<
typename VT1
3078 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
3079 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3081 using boost::numeric_cast;
// The CBLAS interface takes int arguments: rows, columns and the spacing of A
// (used as the leading dimension lda) are converted with boost::numeric_cast.
3087 const int M ( numeric_cast<int>( A.rows() ) );
3088 const int N ( numeric_cast<int>( A.columns() ) );
3089 const int lda( numeric_cast<int>( A.spacing() ) );
// The subtraction is expressed through the sign of alpha: alpha = -scalar and
// beta = 1.0, so the scaled product is subtracted from the contents of y.
3091 cblas_dgemv( CblasColMajor, CblasTrans, M, N, -scalar,
3092 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single precision complex BLAS subtraction assignment kernel, enabled (via
// EnableIf) when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds. Note that
// the scalar type is not part of the selection condition for this overload.
3114 template<
typename VT1
3118 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
3119 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3121 using boost::numeric_cast;
// The CBLAS interface takes int arguments: rows, columns and the spacing of A
// (used as the leading dimension lda) are converted with boost::numeric_cast.
3130 const int M ( numeric_cast<int>( A.rows() ) );
3131 const int N ( numeric_cast<int>( A.columns() ) );
3132 const int lda( numeric_cast<int>( A.spacing() ) );
// cblas_cgemv receives alpha and beta by address. alpha carries the negated
// scalar and beta is (1,0), so the scaled product is subtracted from y.
3133 const complex<float> alpha( -scalar );
3134 const complex<float> beta ( 1.0F, 0.0F );
3136 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
3137 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double precision complex BLAS subtraction assignment kernel, enabled (via
// EnableIf) when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds. Note that
// the scalar type is not part of the selection condition for this overload.
3159 template<
typename VT1
3163 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
3164 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3166 using boost::numeric_cast;
// The CBLAS interface takes int arguments: rows, columns and the spacing of A
// (used as the leading dimension lda) are converted with boost::numeric_cast.
3175 const int M ( numeric_cast<int>( A.rows() ) );
3176 const int N ( numeric_cast<int>( A.columns() ) );
3177 const int lda( numeric_cast<int>( A.spacing() ) );
// cblas_zgemv receives alpha and beta by address. alpha carries the negated
// scalar and beta is (1,0), so the scaled product is subtracted from y.
3178 const complex<double> alpha( -scalar );
3179 const complex<double> beta ( 1.0, 0.0 );
3181 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
3182 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Multiplication assignment of a scaled vector/matrix multiplication
// expression (rhs) to the dense vector lhs.
// NOTE(review): only the signature is visible in this listing; the function
// body is not shown, so its evaluation strategy cannot be documented here.
3203 template<
typename VT1
3205 friend inline void multAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Free function returning a TDVecTDMatMultExpr<T1,T2>; the overload is removed
// from overload resolution (DisableIf) when T2 is a matrix/matrix
// multiplication expression. A std::invalid_argument exception is thrown when
// the size of the vector does not match the number of rows of the matrix.
// NOTE(review): the line carrying the function name and parameter list is not
// visible in this listing; presumably this is the vector * matrix
// multiplication operator taking vec and mat -- confirm against the header.
3278 template<
typename T1
3280 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecTDMatMultExpr<T1,T2> >::Type
3285 if( (~vec).
size() != (~mat).
rows() )
3286 throw std::invalid_argument(
"Vector and matrix sizes do not match" );
// Expression trait specialization parameterized on the vector type VT, the
// matrix type MT and the flag AF. Its nested Type is the result of multiplying
// VT with a submatrix of the (const) matrix operand, as computed by
// MultExprTrait and SubmatrixExprTrait.
// NOTE(review): the struct header is not visible in this listing; presumably
// this specializes SubvectorExprTrait for TDVecTDMatMultExpr<VT,MT> -- confirm.
3303 template<
typename VT,
typename MT,
bool AF >
3308 typedef typename MultExprTrait< VT, typename SubmatrixExprTrait<const MT,AF>::Type >::Type Type;
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:251
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:131
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:89
const size_t end_
End of the unrolled calculation loop.
Definition: TDVecTDMatMultExpr.h:398
LeftOperand leftOperand() const
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:332
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:342
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4579
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A = B \cdot C$ ).
Definition: DMatDMatMultExpr.h:4075
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:151
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:197
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
int16_t sum(const sse_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:62
Header file for the IsSame and IsStrictlySame type traits.
const size_t TDVECTDMATMULT_THRESHOLD
Dense Vector/column-major dense matrix multiplication threshold.This setting specifies the threshold ...
Definition: Thresholds.h:108
Constraint on the data type.
MT::CompositeType MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:116
MRT::ElementType MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:114
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2384
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:249
SelectType< evaluateMatrix, const MRT, MCT >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:260
Header file for the DenseVector base class.
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
const size_t SMP_TDVECTDMATMULT_THRESHOLD
SMP dense vector/column-major dense matrix multiplication threshold.This threshold represents the sys...
Definition: Thresholds.h:178
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:254
Header file for the RequiresEvaluation type trait.
Header file for the VecScalarMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Constraint on the data type.
Header file for the MultExprTrait class template.
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:121
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:251
Compile time type selection.The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the dense vector SMP implementation.
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:376
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:354
ResultType::ElementType ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:245
Header file for the IsMatMatMultExpr type trait class.
Header file for the IsBlasCompatible type trait.
TDVecTDMatMultExpr< VT, MT > This
Type of this TDVecTDMatMultExpr instance.
Definition: TDVecTDMatMultExpr.h:242
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
VT::CompositeType VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:115
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDVecTDMatMultExpr.h:246
SelectType< evaluateVector, const VRT, VCT >::Type LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:257
Constraints on the storage order of matrix types.
Constraint on the data type.
MT::ResultType MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:112
VT::ResultType VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:111
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2382
void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:269
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:295
Header file for the EnableIf class template.
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:91
Header file for the IsNumeric type trait.
Header file for the SubmatrixExprTrait class template.
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:386
System settings for the BLAS mode.
Intrinsic characteristics of data types.The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:396
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:248
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to matrix.
Definition: Matrix.h:239
Header file for the TVecMatMultExpr base class.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:247
VRT::ElementType VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:113
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:244
Substitution Failure Is Not An Error (SFINAE) class.The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:397
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:366
Header file for the IsComputation type trait class.
MultTrait< VRT, MRT >::Type ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:243
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:248
Header file for the sparse vector SMP implementation.
const size_t TDVECDMATMULT_THRESHOLD
Dense Vector/row-major dense matrix multiplication threshold.This setting specifies the threshold bet...
Definition: Thresholds.h:91
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2379
TDVecTDMatMultExpr(const VT &vec, const MT &mat)
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:280
Header file for basic type definitions.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: TransposeFlag.h:81
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
Constraint on the data type.
Size type of the Blaze library.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
size_t rows(const Matrix< MT, SO > &m)
Returns the current number of rows of the matrix.
Definition: Matrix.h:138
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
size_t size() const
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:322
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.