35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
101 template<
typename VT
103 class TDVecDMatMultExpr :
public DenseVector< TDVecDMatMultExpr<VT,MT>, true >
104 ,
private TVecMatMultExpr
105 ,
private Computation
// Compile-time selector used to enable the single-precision BLAS kernels.
// `value` is nonzero only if T1, T2 and T3 are all vectorizable and IsFloat
// holds for the ElementType of each of them.
134 template<
typename T1,
typename T2,
typename T3 >
135 struct UseSinglePrecisionKernel {
136 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
137 IsFloat<typename T1::ElementType>::value &&
138 IsFloat<typename T2::ElementType>::value &&
139 IsFloat<typename T3::ElementType>::value };
// Compile-time selector used to enable the double-precision BLAS kernels.
// `value` is nonzero only if T1, T2 and T3 are all vectorizable and IsDouble
// holds for the ElementType of each of them.
150 template<
typename T1,
typename T2,
typename T3 >
151 struct UseDoublePrecisionKernel {
152 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
153 IsDouble<typename T1::ElementType>::value &&
154 IsDouble<typename T2::ElementType>::value &&
155 IsDouble<typename T3::ElementType>::value };
// Compile-time selector used to enable the single-precision complex BLAS kernels.
// `value` is nonzero only if T1, T2 and T3 are all vectorizable and the
// ElementType of each is exactly complex<float> (aliased as `Type`).
166 template<
typename T1,
typename T2,
typename T3 >
167 struct UseSinglePrecisionComplexKernel {
168 typedef complex<float> Type;
169 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
170 IsSame<typename T1::ElementType,Type>::value &&
171 IsSame<typename T2::ElementType,Type>::value &&
172 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector used to enable the double-precision complex BLAS kernels.
// `value` is nonzero only if T1, T2 and T3 are all vectorizable and the
// ElementType of each is exactly complex<double> (aliased as `Type`).
183 template<
typename T1,
typename T2,
typename T3 >
184 struct UseDoublePrecisionComplexKernel {
185 typedef complex<double> Type;
186 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
187 IsSame<typename T1::ElementType,Type>::value &&
188 IsSame<typename T2::ElementType,Type>::value &&
189 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the non-BLAS fallback.
// `value` is nonzero when BLAZE_BLAS_MODE evaluates to false, or when none of
// the four BLAS selectors (single/double precision, real/complex) applies to
// the type triple T1, T2, T3.
199 template<
typename T1,
typename T2,
typename T3 >
200 struct UseDefaultKernel {
201 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
202 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
203 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
204 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time selector for the intrinsic-based default kernels.
// `value` is nonzero only if T1, T2 and T3 are all vectorizable, share one
// and the same ElementType, and IntrinsicTrait for that element type reports
// both `addition` and `multiplication`.
215 template<
typename T1,
typename T2,
typename T3 >
216 struct UseVectorizedDefaultKernel {
217 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
218 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
219 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
220 IntrinsicTrait<typename T1::ElementType>::addition &&
221 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compile-time flags of the enclosing expression type; both are hard-wired to 0.
251 enum { vectorizable = 0 };
254 enum { smpAssignable = 0 };
283 if(
mat_.rows() != 0UL ) {
285 for(
size_t j=1UL; j<
end_; j+=2UL ) {
288 if( end_ < mat_.rows() ) {
306 return mat_.columns();
336 template<
typename T >
338 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
348 template<
typename T >
350 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
374 template<
typename VT1 >
381 if( rhs.mat_.rows() == 0UL ) {
385 else if( rhs.mat_.columns() == 0UL ) {
399 TDVecDMatMultExpr::selectDefaultAssignKernel( ~lhs, x, A );
401 TDVecDMatMultExpr::selectBlasAssignKernel( ~lhs, x, A );
// Scalar default kernel: writes the product x * A into y using plain element access.
//   y  target vector, indexed with operator[] over the columns of A
//   x  left-hand vector, indexed over the rows of A
//   A  matrix, accessed as A(row,column)
// NOTE(review): the remaining template parameters and the return type line are
// not visible in this excerpt.
420 template<
typename VT1
424 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
426 const size_t M( A.rows() );
427 const size_t N( A.columns() );
// jend is N with its lowest bit cleared, i.e. the largest even value <= N.
430 const size_t jend( N &
size_t(-2) );
// Row 0 initialises every y[j]; this reads x[0UL] and A(0UL,j), so it relies on
// A having at least one row -- presumably guaranteed by the caller, confirm.
432 for(
size_t j=0UL; j<N; ++j ) {
433 y[j] = x[0UL] * A(0UL,j);
// Remaining rows are accumulated, two columns per inner iteration.
435 for(
size_t i=1UL; i<M; ++i ) {
436 for(
size_t j=0UL; j<jend; j+=2UL ) {
437 y[j ] += x[i] * A(i,j );
438 y[j+1UL] += x[i] * A(i,j+1UL);
// Handles column jend (the odd leftover column); the guard that restricts this
// to jend < N is presumably on a line not shown here -- TODO confirm.
441 y[jend] += x[i] * A(i,jend);
// Vectorized default kernel for y = x * A, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
// Columns are processed in strips of 8, 4, 3, 2 and finally 1 SIMD vector(s) of
// IT::size elements each. Within a strip, every row i adds x1 * A.load(i,...)
// to the running accumulators xmm1..xmmK, which are written to y via y.store().
// NOTE(review): several body lines are missing from this excerpt (declaration
// of j, definition of x1, accumulator declarations of the narrower strips, the
// store of xmm1 in each strip). x1 is presumably set( x[i] ), matching the
// final single-vector loop -- confirm against the full file.
462 template<
typename VT1
465 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
466 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
468 typedef IntrinsicTrait<ElementType> IT;
470 const size_t M( A.rows() );
471 const size_t N( A.columns() );
// Strip of 8 vectors: runs while column j+IT::size*7 is still below N.
475 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
476 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
477 for(
size_t i=0UL; i<M; ++i ) {
479 xmm1 = xmm1 + x1 * A.load(i,j );
480 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
481 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
482 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
483 xmm5 = xmm5 + x1 * A.load(i,j+IT::size*4UL);
484 xmm6 = xmm6 + x1 * A.load(i,j+IT::size*5UL);
485 xmm7 = xmm7 + x1 * A.load(i,j+IT::size*6UL);
486 xmm8 = xmm8 + x1 * A.load(i,j+IT::size*7UL);
489 y.store( j+IT::size , xmm2 );
490 y.store( j+IT::size*2UL, xmm3 );
491 y.store( j+IT::size*3UL, xmm4 );
492 y.store( j+IT::size*4UL, xmm5 );
493 y.store( j+IT::size*5UL, xmm6 );
494 y.store( j+IT::size*6UL, xmm7 );
495 y.store( j+IT::size*7UL, xmm8 );
// Strip of 4 vectors.
497 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
499 for(
size_t i=0UL; i<M; ++i ) {
501 xmm1 = xmm1 + x1 * A.load(i,j );
502 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
503 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
504 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
507 y.store( j+IT::size , xmm2 );
508 y.store( j+IT::size*2UL, xmm3 );
509 y.store( j+IT::size*3UL, xmm4 );
// Strip of 3 vectors.
511 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
513 for(
size_t i=0UL; i<M; ++i ) {
515 xmm1 = xmm1 + x1 * A.load(i,j );
516 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
517 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
520 y.store( j+IT::size , xmm2 );
521 y.store( j+IT::size*2UL, xmm3 );
// Strip of 2 vectors.
523 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
525 for(
size_t i=0UL; i<M; ++i ) {
527 xmm1 = xmm1 + x1 * A.load(i,j );
528 xmm2 = xmm2 + x1 * A.load(i,j+IT::size);
531 y.store( j+IT::size, xmm2 );
// Single remaining vector: x[i] is passed through set() before the multiply.
535 for(
size_t i=0UL; i<M; ++i ) {
536 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
// BLAS dispatcher overload chosen when UseDefaultKernel<VT1,VT2,MT1> holds:
// no BLAS routine is used, the call is forwarded unchanged to
// selectDefaultAssignKernel.
558 template<
typename VT1
561 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
562 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
564 selectDefaultAssignKernel( y, x, A );
// Single-precision BLAS kernel for y = x * A (enabled via UseSinglePrecisionKernel).
// Row count, column count and A.spacing() (passed as lda) are converted to int
// with boost::numeric_cast, then cblas_sgemv is called with CblasRowMajor,
// CblasTrans, alpha = 1.0F and beta = 0.0F, so the old contents of y do not
// contribute to the result.
584 template<
typename VT1
587 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
588 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
590 using boost::numeric_cast;
596 const int M ( numeric_cast<int>( A.rows() ) );
597 const int N ( numeric_cast<int>( A.columns() ) );
598 const int lda( numeric_cast<int>( A.spacing() ) );
600 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, 1.0F,
601 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision BLAS kernel for y = x * A (enabled via UseDoublePrecisionKernel).
// Dimensions and A.spacing() (passed as lda) are converted to int with
// boost::numeric_cast, then cblas_dgemv is called with CblasRowMajor,
// CblasTrans, alpha = 1.0 and beta = 0.0, so the old contents of y do not
// contribute to the result.
622 template<
typename VT1
625 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
626 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
628 using boost::numeric_cast;
634 const int M ( numeric_cast<int>( A.rows() ) );
635 const int N ( numeric_cast<int>( A.columns() ) );
636 const int lda( numeric_cast<int>( A.spacing() ) );
638 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0,
639 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// Single-precision complex BLAS kernel for y = x * A
// (enabled via UseSinglePrecisionComplexKernel).
// cblas_cgemv takes its scaling factors by address, hence the local
// alpha = (1,0) and beta = (0,0); with beta zero the old contents of y do not
// contribute to the result.
660 template<
typename VT1
663 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
664 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
666 using boost::numeric_cast;
675 const int M ( numeric_cast<int>( A.rows() ) );
676 const int N ( numeric_cast<int>( A.columns() ) );
677 const int lda( numeric_cast<int>( A.spacing() ) );
678 const complex<float> alpha( 1.0F, 0.0F );
679 const complex<float> beta ( 0.0F, 0.0F );
681 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
682 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS kernel for y = x * A
// (enabled via UseDoublePrecisionComplexKernel).
// cblas_zgemv takes its scaling factors by address, hence the local
// alpha = (1,0) and beta = (0,0); with beta zero the old contents of y do not
// contribute to the result.
703 template<
typename VT1
706 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
707 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
709 using boost::numeric_cast;
718 const int M ( numeric_cast<int>( A.rows() ) );
719 const int N ( numeric_cast<int>( A.columns() ) );
720 const int lda( numeric_cast<int>( A.spacing() ) );
721 const complex<double> alpha( 1.0, 0.0 );
722 const complex<double> beta ( 0.0, 0.0 );
724 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
725 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
744 template<
typename VT1 >
774 template<
typename VT1 >
781 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
793 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
795 TDVecDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A );
797 TDVecDMatMultExpr::selectBlasAddAssignKernel( ~lhs, x, A );
// Scalar default kernel for y += x * A, used when the vectorized variant is
// disabled (DisableIf on UseVectorizedDefaultKernel).
// Every row i adds x[i] * A(i,j) to y[j]; the inner loop handles two columns
// per iteration up to jend, the largest even value <= N.
816 template<
typename VT1
819 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
820 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
822 const size_t M( A.rows() );
823 const size_t N( A.columns() );
826 const size_t jend( N &
size_t(-2) );
828 for(
size_t i=0UL; i<M; ++i ) {
829 for(
size_t j=0UL; j<jend; j+=2UL ) {
830 y[j ] += x[i] * A(i,j );
831 y[j+1UL] += x[i] * A(i,j+1UL);
// Leftover odd column; the guard limiting this to jend < N is presumably on a
// line not shown here -- TODO confirm.
834 y[jend] += x[i] * A(i,jend);
// Vectorized default kernel for y += x * A, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
// Columns are processed in strips of 8, 4, 3, 2 and finally 1 SIMD vector(s) of
// IT::size elements; each row i adds x1 * A.load(i,...) to the accumulators,
// which are then written back with y.store().
// NOTE(review): the accumulator declarations/initialisation, the definition of
// x1 and the store of xmm1 are on lines missing from this excerpt. For an
// add-assign the accumulators are presumably preloaded from y -- confirm.
855 template<
typename VT1
858 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
859 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
861 typedef IntrinsicTrait<ElementType> IT;
863 const size_t M( A.rows() );
864 const size_t N( A.columns() );
// Strip of 8 vectors.
868 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
877 for(
size_t i=0UL; i<M; ++i ) {
879 xmm1 = xmm1 + x1 * A.load(i,j );
880 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
881 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
882 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
883 xmm5 = xmm5 + x1 * A.load(i,j+IT::size*4UL);
884 xmm6 = xmm6 + x1 * A.load(i,j+IT::size*5UL);
885 xmm7 = xmm7 + x1 * A.load(i,j+IT::size*6UL);
886 xmm8 = xmm8 + x1 * A.load(i,j+IT::size*7UL);
889 y.store( j+IT::size , xmm2 );
890 y.store( j+IT::size*2UL, xmm3 );
891 y.store( j+IT::size*3UL, xmm4 );
892 y.store( j+IT::size*4UL, xmm5 );
893 y.store( j+IT::size*5UL, xmm6 );
894 y.store( j+IT::size*6UL, xmm7 );
895 y.store( j+IT::size*7UL, xmm8 );
// Strip of 4 vectors.
897 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
902 for(
size_t i=0UL; i<M; ++i ) {
904 xmm1 = xmm1 + x1 * A.load(i,j );
905 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
906 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
907 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
910 y.store( j+IT::size , xmm2 );
911 y.store( j+IT::size*2UL, xmm3 );
912 y.store( j+IT::size*3UL, xmm4 );
// Strip of 3 vectors.
914 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
918 for(
size_t i=0UL; i<M; ++i ) {
920 xmm1 = xmm1 + x1 * A.load(i,j );
921 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
922 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
925 y.store( j+IT::size , xmm2 );
926 y.store( j+IT::size*2UL, xmm3 );
// Strip of 2 vectors.
928 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
931 for(
size_t i=0UL; i<M; ++i ) {
933 xmm1 = xmm1 + x1 * A.load(i,j );
934 xmm2 = xmm2 + x1 * A.load(i,j+IT::size);
937 y.store( j+IT::size, xmm2 );
// Single remaining vector: x[i] is passed through set() before the multiply.
941 for(
size_t i=0UL; i<M; ++i ) {
942 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
// BLAS dispatcher overload chosen when UseDefaultKernel<VT1,VT2,MT1> holds:
// no BLAS routine is used, the call is forwarded unchanged to
// selectDefaultAddAssignKernel.
964 template<
typename VT1
967 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
968 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
970 selectDefaultAddAssignKernel( y, x, A );
// Single-precision BLAS kernel for y += x * A (enabled via UseSinglePrecisionKernel).
// Same cblas_sgemv call shape as the plain assignment, but with beta = 1.0F so
// the existing contents of y are kept and the product is added to them.
990 template<
typename VT1
993 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
994 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
996 using boost::numeric_cast;
1002 const int M ( numeric_cast<int>( A.rows() ) );
1003 const int N ( numeric_cast<int>( A.columns() ) );
1004 const int lda( numeric_cast<int>( A.spacing() ) );
1006 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, 1.0F,
1007 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS kernel for y += x * A (enabled via UseDoublePrecisionKernel).
// cblas_dgemv is called with alpha = 1.0 and beta = 1.0, so the product is
// added to the existing contents of y.
1028 template<
typename VT1
1031 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1032 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1034 using boost::numeric_cast;
1040 const int M ( numeric_cast<int>( A.rows() ) );
1041 const int N ( numeric_cast<int>( A.columns() ) );
1042 const int lda( numeric_cast<int>( A.spacing() ) );
1044 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0,
1045 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single-precision complex BLAS kernel for y += x * A
// (enabled via UseSinglePrecisionComplexKernel).
// cblas_cgemv receives alpha = (1,0) and beta = (1,0) by address, so the
// product is added to the existing contents of y.
1066 template<
typename VT1
1069 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1070 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1072 using boost::numeric_cast;
1081 const int M ( numeric_cast<int>( A.rows() ) );
1082 const int N ( numeric_cast<int>( A.columns() ) );
1083 const int lda( numeric_cast<int>( A.spacing() ) );
1084 const complex<float> alpha( 1.0F, 0.0F );
1085 const complex<float> beta ( 1.0F, 0.0F );
1087 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1088 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS kernel for y += x * A
// (enabled via UseDoublePrecisionComplexKernel).
// cblas_zgemv receives alpha = (1,0) and beta = (1,0) by address, so the
// product is added to the existing contents of y.
1109 template<
typename VT1
1112 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1113 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1115 using boost::numeric_cast;
1124 const int M ( numeric_cast<int>( A.rows() ) );
1125 const int N ( numeric_cast<int>( A.columns() ) );
1126 const int lda( numeric_cast<int>( A.spacing() ) );
1127 const complex<double> alpha( 1.0, 0.0 );
1128 const complex<double> beta ( 1.0, 0.0 );
1130 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1131 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1154 template<
typename VT1 >
1161 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1173 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1175 TDVecDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A );
1177 TDVecDMatMultExpr::selectBlasSubAssignKernel( ~lhs, x, A );
// Scalar default kernel for y -= x * A, used when the vectorized variant is
// disabled (DisableIf on UseVectorizedDefaultKernel).
// Every row i subtracts x[i] * A(i,j) from y[j]; the inner loop handles two
// columns per iteration up to jend, the largest even value <= N.
1196 template<
typename VT1
1199 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1200 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1202 const size_t M( A.rows() );
1203 const size_t N( A.columns() );
1206 const size_t jend( N &
size_t(-2) );
1208 for(
size_t i=0UL; i<M; ++i ) {
1209 for(
size_t j=0UL; j<jend; j+=2UL ) {
1210 y[j ] -= x[i] * A(i,j );
1211 y[j+1UL] -= x[i] * A(i,j+1UL);
// Leftover odd column; the guard limiting this to jend < N is presumably on a
// line not shown here -- TODO confirm.
1214 y[jend] -= x[i] * A(i,jend);
// Vectorized default kernel for y -= x * A, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
// Columns are processed in strips of 8, 4, 3, 2 and finally 1 SIMD vector(s) of
// IT::size elements; each row i subtracts x1 * A.load(i,...) from the
// accumulators, which are then written back with y.store().
// NOTE(review): the accumulator declarations/initialisation and the definition
// of x1 are on lines missing from this excerpt. For a sub-assign the
// accumulators are presumably preloaded from y -- confirm.
1235 template<
typename VT1
1238 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1239 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1241 typedef IntrinsicTrait<ElementType> IT;
1243 const size_t M( A.rows() );
1244 const size_t N( A.columns() );
// Strip of 8 vectors.
1248 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1257 for(
size_t i=0UL; i<M; ++i ) {
1259 xmm1 = xmm1 - x1 * A.load(i,j );
1260 xmm2 = xmm2 - x1 * A.load(i,j+IT::size );
1261 xmm3 = xmm3 - x1 * A.load(i,j+IT::size*2UL);
1262 xmm4 = xmm4 - x1 * A.load(i,j+IT::size*3UL);
1263 xmm5 = xmm5 - x1 * A.load(i,j+IT::size*4UL);
1264 xmm6 = xmm6 - x1 * A.load(i,j+IT::size*5UL);
1265 xmm7 = xmm7 - x1 * A.load(i,j+IT::size*6UL);
1266 xmm8 = xmm8 - x1 * A.load(i,j+IT::size*7UL);
1268 y.store( j , xmm1 );
1269 y.store( j+IT::size , xmm2 );
1270 y.store( j+IT::size*2UL, xmm3 );
1271 y.store( j+IT::size*3UL, xmm4 );
1272 y.store( j+IT::size*4UL, xmm5 );
1273 y.store( j+IT::size*5UL, xmm6 );
1274 y.store( j+IT::size*6UL, xmm7 );
1275 y.store( j+IT::size*7UL, xmm8 );
// Strip of 4 vectors.
1277 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1282 for(
size_t i=0UL; i<M; ++i ) {
1284 xmm1 = xmm1 - x1 * A.load(i,j );
1285 xmm2 = xmm2 - x1 * A.load(i,j+IT::size );
1286 xmm3 = xmm3 - x1 * A.load(i,j+IT::size*2UL);
1287 xmm4 = xmm4 - x1 * A.load(i,j+IT::size*3UL);
1289 y.store( j , xmm1 );
1290 y.store( j+IT::size , xmm2 );
1291 y.store( j+IT::size*2UL, xmm3 );
1292 y.store( j+IT::size*3UL, xmm4 );
// Strip of 3 vectors.
1294 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
1298 for(
size_t i=0UL; i<M; ++i ) {
1300 xmm1 = xmm1 - x1 * A.load(i,j );
1301 xmm2 = xmm2 - x1 * A.load(i,j+IT::size );
1302 xmm3 = xmm3 - x1 * A.load(i,j+IT::size*2UL);
1304 y.store( j , xmm1 );
1305 y.store( j+IT::size , xmm2 );
1306 y.store( j+IT::size*2UL, xmm3 );
// Strip of 2 vectors.
1308 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1311 for(
size_t i=0UL; i<M; ++i ) {
1313 xmm1 = xmm1 - x1 * A.load(i,j );
1314 xmm2 = xmm2 - x1 * A.load(i,j+IT::size);
1316 y.store( j , xmm1 );
1317 y.store( j+IT::size, xmm2 );
// Single remaining vector: x[i] is passed through set() before the multiply.
1321 for(
size_t i=0UL; i<M; ++i ) {
1322 xmm1 = xmm1 -
set( x[i] ) * A.load(i,j);
// BLAS dispatcher overload chosen when UseDefaultKernel<VT1,VT2,MT1> holds:
// no BLAS routine is used, the call is forwarded unchanged to
// selectDefaultSubAssignKernel.
1344 template<
typename VT1
1347 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1348 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1350 selectDefaultSubAssignKernel( y, x, A );
// Single-precision BLAS kernel for y -= x * A (enabled via UseSinglePrecisionKernel).
// The subtraction is expressed through the scaling factors of cblas_sgemv:
// alpha = -1.0F negates the product, beta = 1.0F keeps the existing y.
1370 template<
typename VT1
1373 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1374 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1376 using boost::numeric_cast;
1382 const int M ( numeric_cast<int>( A.rows() ) );
1383 const int N ( numeric_cast<int>( A.columns() ) );
1384 const int lda( numeric_cast<int>( A.spacing() ) );
1386 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, -1.0F,
1387 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS kernel for y -= x * A (enabled via UseDoublePrecisionKernel).
// The subtraction is expressed through the scaling factors of cblas_dgemv:
// alpha = -1.0 negates the product, beta = 1.0 keeps the existing y.
1408 template<
typename VT1
1411 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1412 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1414 using boost::numeric_cast;
1420 const int M ( numeric_cast<int>( A.rows() ) );
1421 const int N ( numeric_cast<int>( A.columns() ) );
1422 const int lda( numeric_cast<int>( A.spacing() ) );
1424 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, -1.0,
1425 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single-precision complex BLAS kernel for y -= x * A
// (enabled via UseSinglePrecisionComplexKernel).
// cblas_cgemv receives alpha = (-1,0) and beta = (1,0) by address: the product
// is negated and added to the existing contents of y.
1446 template<
typename VT1
1449 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1450 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1452 using boost::numeric_cast;
1461 const int M ( numeric_cast<int>( A.rows() ) );
1462 const int N ( numeric_cast<int>( A.columns() ) );
1463 const int lda( numeric_cast<int>( A.spacing() ) );
1464 const complex<float> alpha( -1.0F, 0.0F );
1465 const complex<float> beta ( 1.0F, 0.0F );
1467 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1468 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS kernel for y -= x * A
// (enabled via UseDoublePrecisionComplexKernel).
// cblas_zgemv receives alpha = (-1,0) and beta = (1,0) by address: the product
// is negated and added to the existing contents of y.
1489 template<
typename VT1
1492 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1493 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1495 using boost::numeric_cast;
1504 const int M ( numeric_cast<int>( A.rows() ) );
1505 const int N ( numeric_cast<int>( A.columns() ) );
1506 const int lda( numeric_cast<int>( A.spacing() ) );
1507 const complex<double> alpha( -1.0, 0.0 );
1508 const complex<double> beta ( 1.0, 0.0 );
1510 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1511 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1534 template<
typename VT1 >
1583 template<
typename VT
1587 :
public DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true >
1588 ,
private VecScalarMultExpr
1589 ,
private Computation
1593 typedef TDVecDMatMultExpr<VT,MT> VMM;
1605 enum { evaluateVector = IsComputation<VT>::value };
1610 enum { evaluateMatrix = IsComputation<MT>::value && !MT::vectorizable &&
1611 IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
// Compile-time selector for the single-precision BLAS kernels of the scaled
// expression. `value` is nonzero only if T1, T2 and T3 are all vectorizable,
// IsFloat holds for each ElementType, and T4 is not a complex type.
// T4 is instantiated with the scalar type (ST2) at the visible call sites.
1619 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1620 struct UseSinglePrecisionKernel {
1621 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1622 IsFloat<typename T1::ElementType>::value &&
1623 IsFloat<typename T2::ElementType>::value &&
1624 IsFloat<typename T3::ElementType>::value &&
1625 !IsComplex<T4>::value };
// Compile-time selector for the double-precision BLAS kernels of the scaled
// expression. `value` is nonzero only if T1, T2 and T3 are all vectorizable,
// IsDouble holds for each ElementType, and T4 is not a complex type.
// T4 is instantiated with the scalar type (ST2) at the visible call sites.
1634 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1635 struct UseDoublePrecisionKernel {
1636 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1637 IsDouble<typename T1::ElementType>::value &&
1638 IsDouble<typename T2::ElementType>::value &&
1639 IsDouble<typename T3::ElementType>::value &&
1640 !IsComplex<T4>::value };
// Compile-time selector for the single-precision complex BLAS kernels of the
// scaled expression. `value` is nonzero only if T1, T2 and T3 are all
// vectorizable and the ElementType of each is exactly complex<float>.
// Unlike the real-valued selectors, the scalar type is not part of the test.
1649 template<
typename T1,
typename T2,
typename T3 >
1650 struct UseSinglePrecisionComplexKernel {
1651 typedef complex<float> Type;
1652 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1653 IsSame<typename T1::ElementType,Type>::value &&
1654 IsSame<typename T2::ElementType,Type>::value &&
1655 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the double-precision complex BLAS kernels of the
// scaled expression. `value` is nonzero only if T1, T2 and T3 are all
// vectorizable and the ElementType of each is exactly complex<double>.
// Unlike the real-valued selectors, the scalar type is not part of the test.
1664 template<
typename T1,
typename T2,
typename T3 >
1665 struct UseDoublePrecisionComplexKernel {
1666 typedef complex<double> Type;
1667 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1668 IsSame<typename T1::ElementType,Type>::value &&
1669 IsSame<typename T2::ElementType,Type>::value &&
1670 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the non-BLAS fallback of the scaled expression.
// `value` is nonzero when BLAZE_BLAS_MODE evaluates to false, or when none of
// the four BLAS selectors applies; T4 is forwarded only to the two real-valued
// selectors, the complex ones take three arguments.
1678 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1679 struct UseDefaultKernel {
1680 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1681 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1682 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1683 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time selector for the intrinsic-based default kernels of the scaled
// expression. `value` is nonzero only if T1, T2 and T3 are all vectorizable,
// share one ElementType, T4 is that very same type, and IntrinsicTrait for the
// element type reports both `addition` and `multiplication`.
1692 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1693 struct UseVectorizedDefaultKernel {
1694 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1695 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1696 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1697 IsSame<typename T1::ElementType,T4>::value &&
1698 IntrinsicTrait<typename T1::ElementType>::addition &&
1699 IntrinsicTrait<typename T1::ElementType>::multiplication };
1705 typedef DVecScalarMultExpr<VMM,ST,true>
This;
1706 typedef typename MultTrait<RES,ST>::Type
ResultType;
1709 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
1714 typedef const TDVecDMatMultExpr<VT,MT>
LeftOperand;
1720 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
LT;
1723 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
RT;
// Compile-time flags of the enclosing expression type; both are hard-wired to 0.
1728 enum { vectorizable = 0 };
1731 enum { smpAssignable = 0 };
1740 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
1754 return vector_[index] * scalar_;
// Returns the size reported by the wrapped vector expression vector_.
1763 inline size_t size()
const {
1764 return vector_.size();
// Alias query: forwards the address `alias` to vector_.canAlias() and returns
// its answer unchanged.
1794 template<
typename T >
1795 inline bool canAlias(
const T* alias )
const {
1796 return vector_.canAlias( alias );
// Alias query: forwards the address `alias` to vector_.isAliased() and returns
// its answer unchanged.
1806 template<
typename T >
1807 inline bool isAliased(
const T* alias )
const {
1808 return vector_.isAliased( alias );
// Assignment of a scaled vector/matrix product to a dense vector `lhs`.
// The two operands of the wrapped product are fetched from rhs.vector_, the
// cases of a matrix with zero rows or zero columns are branched off first, and
// the work is then handed to either the default or the BLAS assign kernel
// together with rhs.scalar_.
// NOTE(review): the bodies of the early-exit branches, the definitions of x and
// A, and the second half of the kernel-selection condition are on lines not
// visible in this excerpt.
1830 template<
typename VT1 >
1831 friend inline void assign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
1837 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
1838 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
1840 if( right.rows() == 0UL ) {
1844 else if( right.columns() == 0UL ) {
1856 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1858 DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, x, A, rhs.scalar_ );
1860 DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Scalar default kernel for the scaled assignment, used when the vectorized
// variant is disabled (DisableIf on UseVectorizedDefaultKernel).
// The unscaled product x * A is first built in y exactly as in the unscaled
// kernel (row 0 initialises, rows 1..M-1 accumulate two columns at a time).
// NOTE(review): the body of the final loop over j is not visible; presumably it
// applies `scalar` to every y[j] -- confirm against the full file.
1878 template<
typename VT1
1882 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1883 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1885 const size_t M( A.rows() );
1886 const size_t N( A.columns() );
// jend is N with its lowest bit cleared, i.e. the largest even value <= N.
1889 const size_t jend( N &
size_t(-2) );
// Row 0 initialises every y[j]; this reads x[0UL] and A(0UL,j).
1891 for(
size_t j=0UL; j<N; ++j ) {
1892 y[j] = x[0UL] * A(0UL,j);
1894 for(
size_t i=1UL; i<M; ++i ) {
1895 for(
size_t j=0UL; j<jend; j+=2UL ) {
1896 y[j ] += x[i] * A(i,j );
1897 y[j+1UL] += x[i] * A(i,j+1UL);
// Leftover odd column; its guard is presumably on a line not shown here.
1900 y[jend] += x[i] * A(i,jend);
1903 for(
size_t j=0UL; j<N; ++j ) {
// Vectorized default kernel for the scaled assignment, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds.
// Columns are processed in strips of 8, 4, 3, 2 and finally 1 SIMD vector(s) of
// IT::size elements. Each strip accumulates x1 * A.load(i,...) over all rows i
// and stores accumulator*factor into y, so the scaling is applied once per
// output vector rather than once per row.
// NOTE(review): the definitions of j, x1 and factor and the accumulator
// declarations of the narrower strips are on lines missing from this excerpt;
// factor is presumably the broadcast of `scalar` -- confirm.
1923 template<
typename VT1
1927 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1928 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1930 typedef IntrinsicTrait<ElementType> IT;
1932 const size_t M( A.rows() );
1933 const size_t N( A.columns() );
// Strip of 8 vectors.
1939 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1940 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1941 for(
size_t i=0UL; i<M; ++i ) {
1943 xmm1 = xmm1 + x1 * A.load(i,j );
1944 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
1945 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
1946 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
1947 xmm5 = xmm5 + x1 * A.load(i,j+IT::size*4UL);
1948 xmm6 = xmm6 + x1 * A.load(i,j+IT::size*5UL);
1949 xmm7 = xmm7 + x1 * A.load(i,j+IT::size*6UL);
1950 xmm8 = xmm8 + x1 * A.load(i,j+IT::size*7UL);
1952 y.store( j , xmm1*factor );
1953 y.store( j+IT::size , xmm2*factor );
1954 y.store( j+IT::size*2UL, xmm3*factor );
1955 y.store( j+IT::size*3UL, xmm4*factor );
1956 y.store( j+IT::size*4UL, xmm5*factor );
1957 y.store( j+IT::size*5UL, xmm6*factor );
1958 y.store( j+IT::size*6UL, xmm7*factor );
1959 y.store( j+IT::size*7UL, xmm8*factor );
// Strip of 4 vectors.
1961 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1963 for(
size_t i=0UL; i<M; ++i ) {
1965 xmm1 = xmm1 + x1 * A.load(i,j );
1966 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
1967 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
1968 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
1970 y.store( j , xmm1*factor );
1971 y.store( j+IT::size , xmm2*factor );
1972 y.store( j+IT::size*2UL, xmm3*factor );
1973 y.store( j+IT::size*3UL, xmm4*factor );
// Strip of 3 vectors.
1975 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
1977 for(
size_t i=0UL; i<M; ++i ) {
1979 xmm1 = xmm1 + x1 * A.load(i,j );
1980 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
1981 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
1983 y.store( j , xmm1*factor );
1984 y.store( j+IT::size , xmm2*factor );
1985 y.store( j+IT::size*2UL, xmm3*factor );
// Strip of 2 vectors.
1987 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1989 for(
size_t i=0UL; i<M; ++i ) {
1991 xmm1 = xmm1 + x1 * A.load(i,j );
1992 xmm2 = xmm2 + x1 * A.load(i,j+IT::size);
1994 y.store( j , xmm1*factor );
1995 y.store( j+IT::size, xmm2*factor );
// Single remaining vector: x[i] is passed through set() before the multiply.
1999 for(
size_t i=0UL; i<M; ++i ) {
2000 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
2002 y.store( j, xmm1*factor );
// BLAS dispatcher overload chosen when UseDefaultKernel<VT1,VT2,MT1,ST2> holds:
// no BLAS routine is used, all four arguments are forwarded unchanged to
// selectDefaultAssignKernel.
2020 template<
typename VT1
2024 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2025 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2027 selectDefaultAssignKernel( y, x, A, scalar );
// Single-precision BLAS kernel for the scaled assignment
// (enabled via UseSinglePrecisionKernel with the scalar type ST2).
// `scalar` is passed straight through as the alpha factor of cblas_sgemv and
// beta is 0.0F, so the old contents of y do not contribute to the result.
2046 template<
typename VT1
2050 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2051 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2053 using boost::numeric_cast;
2059 const int M ( numeric_cast<int>( A.rows() ) );
2060 const int N ( numeric_cast<int>( A.columns() ) );
2061 const int lda( numeric_cast<int>( A.spacing() ) );
2063 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2064 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision BLAS kernel for the scaled assignment
// (enabled via UseDoublePrecisionKernel with the scalar type ST2).
// `scalar` is passed straight through as the alpha factor of cblas_dgemv and
// beta is 0.0, so the old contents of y do not contribute to the result.
2084 template<
typename VT1
2088 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2089 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2091 using boost::numeric_cast;
2097 const int M ( numeric_cast<int>( A.rows() ) );
2098 const int N ( numeric_cast<int>( A.columns() ) );
2099 const int lda( numeric_cast<int>( A.spacing() ) );
2101 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2102 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// Single-precision complex BLAS kernel for the scaled assignment
// (enabled via UseSinglePrecisionComplexKernel).
// `scalar` is converted to complex<float> to form alpha, beta is (0,0); both
// are handed to cblas_cgemv by address.
2122 template<
typename VT1
2126 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2127 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2129 using boost::numeric_cast;
2138 const int M ( numeric_cast<int>( A.rows() ) );
2139 const int N ( numeric_cast<int>( A.columns() ) );
2140 const int lda( numeric_cast<int>( A.spacing() ) );
2141 const complex<float> alpha( scalar );
2142 const complex<float> beta ( 0.0F, 0.0F );
2144 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2145 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS kernel for the scaled assignment
// (enabled via UseDoublePrecisionComplexKernel).
// `scalar` is converted to complex<double> to form alpha, beta is (0,0); both
// are handed to cblas_zgemv by address.
2165 template<
typename VT1
2169 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2170 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2172 using boost::numeric_cast;
2181 const int M ( numeric_cast<int>( A.rows() ) );
2182 const int N ( numeric_cast<int>( A.columns() ) );
2183 const int lda( numeric_cast<int>( A.spacing() ) );
2184 const complex<double> alpha( scalar );
2185 const complex<double> beta ( 0.0, 0.0 );
2187 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2188 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2205 template<
typename VT1 >
2206 friend inline void assign( SparseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Addition assignment of a scaled vector/matrix product to a dense vector `lhs`.
// The two operands of the wrapped product are fetched from rhs.vector_; a matrix
// with zero rows or zero columns is branched off first, and the work is then
// handed to either the default or the BLAS add-assign kernel with rhs.scalar_.
// NOTE(review): the body of the early-exit branch, the definitions of x and A,
// and the second half of the kernel-selection condition are on lines not
// visible in this excerpt.
2233 template<
typename VT1 >
2234 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2240 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2241 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
2243 if( right.rows() == 0UL || right.columns() == 0UL ) {
2255 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2257 DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
2259 DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Scalar default kernel for the scaled add-assign, used when the vectorized
// variant is disabled (DisableIf on UseVectorizedDefaultKernel).
// No hand-written loop here: the expression x * A * scalar is rebuilt and
// passed to y.addAssign().
2277 template<
typename VT1
2281 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2282 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2284 y.addAssign( x * A * scalar );
// Vectorized default kernel for the scaled add-assign, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds.
// Columns are processed in strips of 8, 4, 3, 2 and finally 1 SIMD vector(s) of
// IT::size elements. Each strip accumulates x1 * A.load(i,...) over all rows i,
// then writes y.load(...) + accumulator*factor back to y, i.e. the scaled
// product is added to the current contents of y.
// NOTE(review): the definitions of j, x1 and factor and the accumulator
// declarations of the narrower strips are on lines missing from this excerpt;
// factor is presumably the broadcast of `scalar` -- confirm.
2302 template<
typename VT1
2306 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2307 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2309 typedef IntrinsicTrait<ElementType> IT;
2311 const size_t M( A.rows() );
2312 const size_t N( A.columns() );
// Strip of 8 vectors.
2318 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2319 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2320 for(
size_t i=0UL; i<M; ++i ) {
2322 xmm1 = xmm1 + x1 * A.load(i,j );
2323 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2324 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2325 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
2326 xmm5 = xmm5 + x1 * A.load(i,j+IT::size*4UL);
2327 xmm6 = xmm6 + x1 * A.load(i,j+IT::size*5UL);
2328 xmm7 = xmm7 + x1 * A.load(i,j+IT::size*6UL);
2329 xmm8 = xmm8 + x1 * A.load(i,j+IT::size*7UL);
2331 y.store( j , y.load(j ) + xmm1*factor );
2332 y.store( j+IT::size , y.load(j+IT::size ) + xmm2*factor );
2333 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) + xmm3*factor );
2334 y.store( j+IT::size*3UL, y.load(j+IT::size*3UL) + xmm4*factor );
2335 y.store( j+IT::size*4UL, y.load(j+IT::size*4UL) + xmm5*factor );
2336 y.store( j+IT::size*5UL, y.load(j+IT::size*5UL) + xmm6*factor );
2337 y.store( j+IT::size*6UL, y.load(j+IT::size*6UL) + xmm7*factor );
2338 y.store( j+IT::size*7UL, y.load(j+IT::size*7UL) + xmm8*factor );
// Strip of 4 vectors.
2340 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2342 for(
size_t i=0UL; i<M; ++i ) {
2344 xmm1 = xmm1 + x1 * A.load(i,j );
2345 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2346 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2347 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
2349 y.store( j , y.load(j ) + xmm1*factor );
2350 y.store( j+IT::size , y.load(j+IT::size ) + xmm2*factor );
2351 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) + xmm3*factor );
2352 y.store( j+IT::size*3UL, y.load(j+IT::size*3UL) + xmm4*factor );
// Strip of 3 vectors.
2354 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
2356 for(
size_t i=0UL; i<M; ++i ) {
2358 xmm1 = xmm1 + x1 * A.load(i,j );
2359 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2360 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2362 y.store( j , y.load(j ) + xmm1*factor );
2363 y.store( j+IT::size , y.load(j+IT::size ) + xmm2*factor );
2364 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) + xmm3*factor );
// Strip of 2 vectors.
2366 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2368 for(
size_t i=0UL; i<M; ++i ) {
2370 xmm1 = xmm1 + x1 * A.load(i,j );
2371 xmm2 = xmm2 + x1 * A.load(i,j+IT::size);
2373 y.store( j , y.load(j ) + xmm1*factor );
2374 y.store( j+IT::size, y.load(j+IT::size) + xmm2*factor );
// Single remaining vector: x[i] is passed through set() before the multiply.
2378 for(
size_t i=0UL; i<M; ++i ) {
2379 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
2381 y.store( j, y.load(j) + xmm1*factor );
// BLAS dispatcher overload chosen when UseDefaultKernel<VT1,VT2,MT1,ST2> holds:
// no BLAS routine is used, all four arguments are forwarded unchanged to
// selectDefaultAddAssignKernel.
2400 template<
typename VT1
2404 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2405 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2407 selectDefaultAddAssignKernel( y, x, A, scalar );
// Single-precision BLAS kernel for the scaled add-assign
// (enabled via UseSinglePrecisionKernel with the scalar type ST2).
// `scalar` is the alpha factor of cblas_sgemv and beta is 1.0F, so the scaled
// product is added to the existing contents of y.
2426 template<
typename VT1
2430 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2431 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2433 using boost::numeric_cast;
2439 const int M ( numeric_cast<int>( A.rows() ) );
2440 const int N ( numeric_cast<int>( A.columns() ) );
2441 const int lda( numeric_cast<int>( A.spacing() ) );
2443 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2444 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS overload of selectBlasAddAssignKernel(): selected (via
// EnableIf) when UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// The matrix dimensions and A.spacing() (used as leading dimension) are converted
// to int with boost::numeric_cast, which is a range-checked conversion.
// cblas_dgemv is called with CblasRowMajor/CblasTrans, alpha = scalar and
// beta = 1.0, so the scaled product is accumulated into the current contents of y.
2464 template<
typename VT1
2468 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2469 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2471 using boost::numeric_cast;
2477 const int M ( numeric_cast<int>( A.rows() ) );
2478 const int N ( numeric_cast<int>( A.columns() ) );
2479 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = scalar, beta = 1.0: y is added to, not overwritten.
2481 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2482 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single-precision complex BLAS overload of selectBlasAddAssignKernel(): selected
// (via EnableIf) when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds.
// The matrix dimensions and A.spacing() (used as leading dimension) are converted
// to int with boost::numeric_cast, which is a range-checked conversion.
// cblas_cgemv receives alpha and beta by address: alpha is built from scalar and
// beta is (1,0), so the scaled product is accumulated into the contents of y.
2502 template<
typename VT1
2506 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2507 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2509 using boost::numeric_cast;
2518 const int M ( numeric_cast<int>( A.rows() ) );
2519 const int N ( numeric_cast<int>( A.columns() ) );
2520 const int lda( numeric_cast<int>( A.spacing() ) );
// The scaling factor is converted to complex<float>; beta = 1 keeps the old y.
2521 const complex<float> alpha( scalar );
2522 const complex<float> beta ( 1.0F, 0.0F );
2524 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2525 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS overload of selectBlasAddAssignKernel(): selected
// (via EnableIf) when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds.
// The matrix dimensions and A.spacing() (used as leading dimension) are converted
// to int with boost::numeric_cast, which is a range-checked conversion.
// cblas_zgemv receives alpha and beta by address: alpha is built from scalar and
// beta is (1,0), so the scaled product is accumulated into the contents of y.
2545 template<
typename VT1
2549 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2550 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2552 using boost::numeric_cast;
2561 const int M ( numeric_cast<int>( A.rows() ) );
2562 const int N ( numeric_cast<int>( A.columns() ) );
2563 const int lda( numeric_cast<int>( A.spacing() ) );
// The scaling factor is converted to complex<double>; beta = 1 keeps the old y.
2564 const complex<double> alpha( scalar );
2565 const complex<double> beta ( 1.0, 0.0 );
2567 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2568 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of a scaled vector/matrix product (rhs) to the dense
// row vector lhs.
// The two operands are taken from the wrapped multiplication expression
// rhs.vector_, an empty right-hand side matrix is treated as a special case, and
// the work is then dispatched to either selectDefaultSubAssignKernel() or
// selectBlasSubAssignKernel(), both of which receive rhs.scalar_ as scaling factor.
// NOTE(review): this excerpt omits several original lines (the body of the empty
// check, the definitions of x and A, and the remainder of the dispatch
// condition) - consult the full header before changing this function.
2589 template<
typename VT1 >
2590 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2596 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2597 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special case: matrix without rows or without columns (handling not shown here).
2599 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Kernel selection; the second half of this condition is not part of the excerpt.
2611 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2613 DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
2615 DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Non-vectorized default kernel for the subtraction assignment: selected (via
// DisableIf) when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does NOT hold.
// It re-expresses the operation as the expression x * A * scalar and hands it
// to y.subAssign().
2633 template<
typename VT1
2637 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2638 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2640 y.subAssign( x * A * scalar );
// Vectorized default kernel for the subtraction assignment: selected (via
// EnableIf) when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds.
// The columns of A are processed in tiers of 8, 4, 3, 2 and 1 intrinsic vectors
// (IT::size elements each). For every tier the accumulators xmm* sum, over all
// rows i, the product of the current x element and the loaded part of row i of A;
// the accumulated values are then multiplied by 'factor' and subtracted from the
// corresponding part of y.
// NOTE(review): this excerpt omits several original lines, among them the
// definitions of j, factor and x1, the accumulator declarations of the smaller
// tiers, the closing braces and the header of the last loop. Presumably factor
// is set( scalar ) and x1 is set( x[i] ), analogous to the inline set( x[i] )
// in the last tier - confirm against the full header.
2658 template<
typename VT1
2662 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2663 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2665 typedef IntrinsicTrait<ElementType> IT;
2667 const size_t M( A.rows() );
2668 const size_t N( A.columns() );
// Tier 1: eight intrinsic vectors per step.
// NOTE(review): the accumulators are declared without initializer, so the sums
// rely on the default construction of IntrinsicType - presumably zero; confirm.
2674 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2675 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2676 for(
size_t i=0UL; i<M; ++i ) {
2678 xmm1 = xmm1 + x1 * A.load(i,j );
2679 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2680 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2681 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
2682 xmm5 = xmm5 + x1 * A.load(i,j+IT::size*4UL);
2683 xmm6 = xmm6 + x1 * A.load(i,j+IT::size*5UL);
2684 xmm7 = xmm7 + x1 * A.load(i,j+IT::size*6UL);
2685 xmm8 = xmm8 + x1 * A.load(i,j+IT::size*7UL);
// Subtract the scaled sums from the eight affected parts of y.
2687 y.store( j , y.load(j ) - xmm1*factor );
2688 y.store( j+IT::size , y.load(j+IT::size ) - xmm2*factor );
2689 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) - xmm3*factor );
2690 y.store( j+IT::size*3UL, y.load(j+IT::size*3UL) - xmm4*factor );
2691 y.store( j+IT::size*4UL, y.load(j+IT::size*4UL) - xmm5*factor );
2692 y.store( j+IT::size*5UL, y.load(j+IT::size*5UL) - xmm6*factor );
2693 y.store( j+IT::size*6UL, y.load(j+IT::size*6UL) - xmm7*factor );
2694 y.store( j+IT::size*7UL, y.load(j+IT::size*7UL) - xmm8*factor );
// Tier 2: four intrinsic vectors per step.
2696 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2698 for(
size_t i=0UL; i<M; ++i ) {
2700 xmm1 = xmm1 + x1 * A.load(i,j );
2701 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2702 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2703 xmm4 = xmm4 + x1 * A.load(i,j+IT::size*3UL);
2705 y.store( j , y.load(j ) - xmm1*factor );
2706 y.store( j+IT::size , y.load(j+IT::size ) - xmm2*factor );
2707 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) - xmm3*factor );
2708 y.store( j+IT::size*3UL, y.load(j+IT::size*3UL) - xmm4*factor );
// Tier 3: three intrinsic vectors per step.
2710 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
2712 for(
size_t i=0UL; i<M; ++i ) {
2714 xmm1 = xmm1 + x1 * A.load(i,j );
2715 xmm2 = xmm2 + x1 * A.load(i,j+IT::size );
2716 xmm3 = xmm3 + x1 * A.load(i,j+IT::size*2UL);
2718 y.store( j , y.load(j ) - xmm1*factor );
2719 y.store( j+IT::size , y.load(j+IT::size ) - xmm2*factor );
2720 y.store( j+IT::size*2UL, y.load(j+IT::size*2UL) - xmm3*factor );
// Tier 4: two intrinsic vectors per step.
2722 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2724 for(
size_t i=0UL; i<M; ++i ) {
2726 xmm1 = xmm1 + x1 * A.load(i,j );
2727 xmm2 = xmm2 + x1 * A.load(i,j+IT::size);
2729 y.store( j , y.load(j ) - xmm1*factor );
2730 y.store( j+IT::size, y.load(j+IT::size) - xmm2*factor );
// Last tier: a single intrinsic vector; x[i] is expanded inline via set().
// The enclosing loop/condition over j is not part of this excerpt.
2734 for(
size_t i=0UL; i<M; ++i ) {
2735 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
2737 y.store( j, y.load(j) - xmm1*factor );
// Fallback overload of selectBlasSubAssignKernel(): selected (via EnableIf) when
// UseDefaultKernel<VT1,VT2,MT1,ST2> holds. It calls no BLAS routine and simply
// forwards all four arguments to selectDefaultSubAssignKernel().
//   y      - target vector that is updated
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor of the product
2756 template<
typename VT1
2760 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2761 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2763 selectDefaultSubAssignKernel( y, x, A, scalar );
// Single-precision BLAS overload of selectBlasSubAssignKernel(): selected (via
// EnableIf) when UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// The matrix dimensions and A.spacing() (used as leading dimension) are converted
// to int with boost::numeric_cast, which is a range-checked conversion.
// The subtraction is expressed through the gemv update by passing the negated
// scaling factor as alpha together with beta = 1.0F.
2782 template<
typename VT1
2786 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2787 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2789 using boost::numeric_cast;
2795 const int M ( numeric_cast<int>( A.rows() ) );
2796 const int N ( numeric_cast<int>( A.columns() ) );
2797 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = -scalar, beta = 1.0F: the scaled product is subtracted from y.
2799 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, -scalar,
2800 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS overload of selectBlasSubAssignKernel(): selected (via
// EnableIf) when UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// The matrix dimensions and A.spacing() (used as leading dimension) are converted
// to int with boost::numeric_cast, which is a range-checked conversion.
// The subtraction is expressed through the gemv update by passing the negated
// scaling factor as alpha together with beta = 1.0.
2820 template<
typename VT1
2824 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2825 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2827 using boost::numeric_cast;
2833 const int M ( numeric_cast<int>( A.rows() ) );
2834 const int N ( numeric_cast<int>( A.columns() ) );
2835 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = -scalar, beta = 1.0: the scaled product is subtracted from y.
2837 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, -scalar,
2838 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single-precision complex BLAS overload of selectBlasSubAssignKernel(): selected
// (via EnableIf) when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds.
// The matrix dimensions and A.spacing() (used as leading dimension) are converted
// to int with boost::numeric_cast, which is a range-checked conversion.
// cblas_cgemv receives alpha and beta by address: alpha is built from the negated
// scaling factor and beta is (1,0), so the scaled product is subtracted from y.
2858 template<
typename VT1
2862 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2863 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2865 using boost::numeric_cast;
2874 const int M ( numeric_cast<int>( A.rows() ) );
2875 const int N ( numeric_cast<int>( A.columns() ) );
2876 const int lda( numeric_cast<int>( A.spacing() ) );
// Negating the factor turns the accumulating gemv update into a subtraction.
2877 const complex<float> alpha( -scalar );
2878 const complex<float> beta ( 1.0F, 0.0F );
2880 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2881 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS overload of selectBlasSubAssignKernel(): selected
// (via EnableIf) when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds.
// The matrix dimensions and A.spacing() (used as leading dimension) are converted
// to int with boost::numeric_cast, which is a range-checked conversion.
// cblas_zgemv receives alpha and beta by address: alpha is built from the negated
// scaling factor and beta is (1,0), so the scaled product is subtracted from y.
2901 template<
typename VT1
2905 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2906 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2908 using boost::numeric_cast;
2917 const int M ( numeric_cast<int>( A.rows() ) );
2918 const int N ( numeric_cast<int>( A.columns() ) );
2919 const int lda( numeric_cast<int>( A.spacing() ) );
// Negating the factor turns the accumulating gemv update into a subtraction.
2920 const complex<double> alpha( -scalar );
2921 const complex<double> beta ( 1.0, 0.0 );
2923 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2924 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Friend function multAssign() taking the dense row vector lhs and the scaled
// vector/matrix product rhs.
// NOTE(review): only the signature is part of this excerpt; the function body is
// not visible here - consult the full header for its behaviour.
2945 template<
typename VT1 >
2946 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Function template returning a TDVecDMatMultExpr<T1,T2>; it is removed from
// overload resolution (DisableIf) when T2 is a matrix/matrix multiplication
// expression. Before the expression is created the size of the vector is compared
// with the number of rows of the matrix, and std::invalid_argument is thrown on
// a mismatch.
// NOTE(review): the function name and parameter list are not part of this
// excerpt - presumably the vector * matrix multiplication operator; confirm.
3019 template<
typename T1
3021 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecDMatMultExpr<T1,T2> >::Type
3026 if( (~vec).
size() != (~mat).
rows() )
3027 throw std::invalid_argument(
"Vector and matrix sizes do not match" );
// Counterpart of the preceding function template for the case that T2 IS a
// matrix/matrix multiplication expression (EnableIf on IsMatMatMultExpr<T2>);
// its return type is taken from MultExprTrait<T1,T2>.
// NOTE(review): name, parameters and body are not part of this excerpt.
3055 template<
typename T1
3058 inline const typename EnableIf< IsMatMatMultExpr<T2>, MultExprTrait<T1,T2> >::Type::Type
// Trait whose nested Type is the multiplication expression of VT with the
// submatrix expression type of a const MT.
// NOTE(review): the struct header is not part of this excerpt - presumably a
// SubvectorExprTrait specialization for TDVecDMatMultExpr<VT,MT>; confirm.
3078 template<
typename VT,
typename MT >
3083 typedef typename MultExprTrait< VT, typename SubmatrixExprTrait<const MT>::Type >::Type Type;
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:89
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4512
MT::ResultType MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:110
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:3703
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:233
MT::CompositeType MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:114
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:196
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:325
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2375
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:248
Header file for the DenseVector base class.
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Header file for the RequiresEvaluation type trait.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:278
Header file for the VecScalarMultExpr base class.
SelectType< evaluateVector, const VRT, VCT >::Type LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:243
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
TDVecDMatMultExpr< VT, MT > This
Type of this TDVecDMatMultExpr instance.
Definition: TDVecDMatMultExpr.h:228
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Constraint on the data type.
Header file for the MultExprTrait class template.
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:237
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:250
Compile time type selection.The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:230
Header file for the multiplication trait.
Header file for the IsDouble type trait.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDVecDMatMultExpr.h:232
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:337
TDVecDMatMultExpr(const VT &vec, const MT &mat)
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:263
Header file for the IsMatMatMultExpr type trait class.
Header file for the IsBlasCompatible type trait.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Constraints on the storage order of matrix types.
Constraint on the data type.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2373
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:349
void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:269
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
size_t size() const
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:305
Header file for the IsNumeric type trait.
VT::ResultType VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:109
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
System settings for the BLAS mode.
Intrinsic characteristics of data types.The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:648
Header file for run time assertion macros.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
VRT::ElementType VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:111
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:240
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
LeftOperand leftOperand() const
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:315
MRT::ElementType MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:112
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to matrix.
Definition: Matrix.h:239
Header file for the TVecMatMultExpr base class.
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:357
Expression object for transpose dense vector-dense matrix multiplications.The TDVecDMatMultExpr class...
Definition: Forward.h:129
ResultType::ElementType ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:231
SelectType< evaluateMatrix, const MRT, MCT >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:246
Substitution Failure Is Not An Error (SFINAE) class.The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
Header file for all intrinsic functionality.
const size_t end_
End of the unrolled calculation loop.
Definition: TDVecDMatMultExpr.h:358
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:247
const size_t TDVECDMATMULT_THRESHOLD
Dense Vector/row-major dense matrix multiplication threshold.This setting specifies the threshold bet...
Definition: Thresholds.h:85
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
VT::CompositeType VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:113
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2370
Header file for basic type definitions.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: TransposeFlag.h:81
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:234
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:356
Constraint on the data type.
Size type of the Blaze library.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
size_t rows(const Matrix< MT, SO > &m)
Returns the current number of rows of the matrix.
Definition: Matrix.h:138
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
MultTrait< VRT, MRT >::Type ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:229
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.