22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
// Expression class for transposed-dense-vector / dense-matrix multiplications (x^T * A).
// NOTE(review): this extraction is missing interior lines (original line numbers jump);
// comments below annotate only what is visible.
86 class TDVecDMatMultExpr :
public DenseVector< TDVecDMatMultExpr<VT,MT>, true >
// Shorthand typedefs for the result/element/composite types of both operands:
// V* = left (row) vector operand, M* = right matrix operand.
92 typedef typename VT::ResultType
VRT;
93 typedef typename MT::ResultType
MRT;
94 typedef typename VRT::ElementType
VET;
95 typedef typename MRT::ElementType
MET;
96 typedef typename VT::CompositeType
VCT;
97 typedef typename MT::CompositeType
MCT;
// Compile-time switch: use the single-precision BLAS kernel (sgemv) when all three
// operand types are vectorizable (float check presumably on the missing lines 115+ —
// TODO confirm, the condition body is truncated here).
112 template<
typename T1,
typename T2,
typename T3 >
113 struct UseSinglePrecisionKernel {
114 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// Compile-time switch: use the double-precision BLAS kernel (dgemv) when all three
// operands are vectorizable and their element type is double.
128 template<
typename T1,
typename T2,
typename T3 >
129 struct UseDoublePrecisionKernel {
130 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
131 IsDouble<typename T1::ElementType>::value &&
132 IsDouble<typename T2::ElementType>::value &&
133 IsDouble<typename T3::ElementType>::value };
// Compile-time switch: complex<float> operands -> cgemv kernel.
144 template<
typename T1,
typename T2,
typename T3 >
145 struct UseSinglePrecisionComplexKernel {
146 typedef complex<float> Type;
147 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
148 IsSame<typename T1::ElementType,Type>::value &&
149 IsSame<typename T2::ElementType,Type>::value &&
150 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch: complex<double> operands -> zgemv kernel.
161 template<
typename T1,
typename T2,
typename T3 >
162 struct UseDoublePrecisionComplexKernel {
163 typedef complex<double> Type;
164 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
165 IsSame<typename T1::ElementType,Type>::value &&
166 IsSame<typename T2::ElementType,Type>::value &&
167 IsSame<typename T3::ElementType,Type>::value };
// Fallback selector: the hand-written default kernel is used when BLAS mode is off
// or none of the four BLAS-compatible element-type combinations above applies.
177 template<
typename T1,
typename T2,
typename T3 >
178 struct UseDefaultKernel {
179 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
180 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
181 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
182 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Selector for the SIMD-intrinsic default kernel: all three operands must be
// vectorizable, share one element type, and that type must support intrinsic
// addition and multiplication.
193 template<
typename T1,
typename T2,
typename T3 >
194 struct UseVectorizedDefaultKernel {
195 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
196 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
197 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
198 IntrinsicTrait<typename T1::ElementType>::addition &&
199 IntrinsicTrait<typename T1::ElementType>::multiplication };
// The expression itself is never accessed element-wise via intrinsics.
229 enum { vectorizable = 0 };
// Fragment of the element-access operator: only computes when the matrix is
// non-empty; iterates column pairs up to end_, with a scalar tail when the row
// count is odd (interior lines missing from this extraction).
258 if(
mat_.rows() != 0UL ) {
260 for(
size_t j=1UL; j<
end_; j+=2UL ) {
263 if( end_ < mat_.rows() ) {
// The result vector's size equals the number of matrix columns (x^T * A is a row vector).
281 return mat_.columns();
// canAlias/isAliased: the expression aliases whatever either operand aliases.
311 template<
typename T >
313 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
323 template<
typename T >
325 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// assign(DenseVector, TDVecDMatMultExpr): early-outs for an empty matrix
// (rows()==0 / columns()==0), then dispatches to the default or BLAS kernel.
349 template<
typename VT1 >
356 if( rhs.mat_.rows() == 0UL ) {
360 else if( rhs.mat_.columns() == 0UL ) {
374 TDVecDMatMultExpr::selectDefaultAssignKernel( ~lhs, x, A );
376 TDVecDMatMultExpr::selectBlasAssignKernel( ~lhs, x, A );
// Default (non-intrinsic) assign kernel: y = x * A for row-major A, traversed
// row-by-row with the column loop unrolled by two for better register reuse.
395 template<
typename VT1
399 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
401 const size_t M( A.rows() );
402 const size_t N( A.columns() );
// jend = N rounded down to an even number: bitwise AND with ~1 clears the low bit,
// leaving at most one scalar tail column (handled at line 416).
405 const size_t jend( N &
size_t(-2) );
// First matrix row initializes y (no zeroing pass needed).
407 for(
size_t j=0UL; j<N; ++j ) {
408 y[j] = x[0UL] * A(0UL,j);
// Remaining rows accumulate, two columns per iteration.
410 for(
size_t i=1UL; i<M; ++i ) {
411 for(
size_t j=0UL; j<jend; j+=2UL ) {
412 y[j ] += x[i] * A(i,j );
413 y[j+1UL] += x[i] * A(i,j+1UL);
// Tail column when N is odd. NOTE(review): guard for jend<N presumably sits on a
// missing line — confirm in the full file.
416 y[jend] += x[i] * A(i,jend);
// Vectorized (SIMD-intrinsic) assign kernel: y = x * A. The column dimension is
// processed in panels of 8, then 4, 3, 2, and finally 1 intrinsic vectors wide;
// within each panel, all M rows are accumulated into register accumulators
// (xmm1..xmm8) before a single store per column strip.
437 template<
typename VT1
440 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
441 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
443 typedef IntrinsicTrait<ElementType> IT;
445 const size_t M( A.rows() );
// N is taken from A.spacing() (padded row length), so full intrinsic-width
// stores past columns() are safe into the padding region.
446 const size_t N( A.spacing() );
// 8-wide panel: broadcast x[i] (x1, set on a missing line) and multiply-accumulate
// against eight consecutive intrinsic vectors of row i.
450 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
451 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
452 for(
size_t i=0UL; i<M; ++i ) {
454 xmm1 = xmm1 + x1 * A.get(i,j );
455 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
456 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
457 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
458 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
459 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
460 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
461 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
463 store( &y[j ], xmm1 );
464 store( &y[j+IT::size ], xmm2 );
465 store( &y[j+IT::size*2UL], xmm3 );
466 store( &y[j+IT::size*3UL], xmm4 );
467 store( &y[j+IT::size*4UL], xmm5 );
468 store( &y[j+IT::size*5UL], xmm6 );
469 store( &y[j+IT::size*6UL], xmm7 );
470 store( &y[j+IT::size*7UL], xmm8 );
// 4-wide panel.
472 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
474 for(
size_t i=0UL; i<M; ++i ) {
476 xmm1 = xmm1 + x1 * A.get(i,j );
477 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
478 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
479 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
481 store( &y[j ], xmm1 );
482 store( &y[j+IT::size ], xmm2 );
483 store( &y[j+IT::size*2UL], xmm3 );
484 store( &y[j+IT::size*3UL], xmm4 );
// 3-wide panel.
486 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
488 for(
size_t i=0UL; i<M; ++i ) {
490 xmm1 = xmm1 + x1 * A.get(i,j );
491 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
492 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
494 store( &y[j ], xmm1 );
495 store( &y[j+IT::size ], xmm2 );
496 store( &y[j+IT::size*2UL], xmm3 );
// 2-wide panel.
498 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
500 for(
size_t i=0UL; i<M; ++i ) {
502 xmm1 = xmm1 + x1 * A.get(i,j );
503 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
505 store( &y[j ], xmm1 );
506 store( &y[j+IT::size], xmm2 );
// Final single-vector strip: broadcast of x[i] is done inline via set().
510 for(
size_t i=0UL; i<M; ++i ) {
511 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
513 store( &y[j], xmm1 );
// BLAS kernel family for y = x * A (x^T treated as a row vector, hence
// CblasTrans on a row-major A). beta = 0 overwrites y.
// Fallback overload: no suitable BLAS type combination -> reuse the default kernel.
533 template<
typename VT1
536 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
537 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
539 selectDefaultAssignKernel( y, x, A );
// Single-precision kernel: cblas_sgemv with alpha=1, beta=0.
559 template<
typename VT1
562 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
563 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
565 using boost::numeric_cast;
// numeric_cast guards the size_t -> int narrowing required by the CBLAS API.
571 const int M ( numeric_cast<int>( A.rows() ) );
572 const int N ( numeric_cast<int>( A.columns() ) );
573 const int lda( numeric_cast<int>( A.spacing() ) );
575 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, 1.0F,
576 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision kernel: cblas_dgemv with alpha=1, beta=0.
597 template<
typename VT1
600 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
601 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
603 using boost::numeric_cast;
609 const int M ( numeric_cast<int>( A.rows() ) );
610 const int N ( numeric_cast<int>( A.columns() ) );
611 const int lda( numeric_cast<int>( A.spacing() ) );
613 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0,
614 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// complex<float> kernel: cblas_cgemv; complex alpha/beta are passed by pointer.
635 template<
typename VT1
638 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
639 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
641 using boost::numeric_cast;
650 const int M ( numeric_cast<int>( A.rows() ) );
651 const int N ( numeric_cast<int>( A.columns() ) );
652 const int lda( numeric_cast<int>( A.spacing() ) );
653 const complex<float> alpha( 1.0F, 0.0F );
654 const complex<float> beta ( 0.0F, 0.0F );
656 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
657 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> kernel: cblas_zgemv.
678 template<
typename VT1
681 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
682 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
684 using boost::numeric_cast;
693 const int M ( numeric_cast<int>( A.rows() ) );
694 const int N ( numeric_cast<int>( A.columns() ) );
695 const int lda( numeric_cast<int>( A.spacing() ) );
696 const complex<double> alpha( 1.0, 0.0 );
697 const complex<double> beta ( 0.0, 0.0 );
699 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
700 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Fragment of assign() to a sparse vector target (body on missing lines).
719 template<
typename VT1 >
// addAssign(DenseVector, TDVecDMatMultExpr): y += x * A. Empty matrix is a no-op;
// otherwise dispatch to default vs. BLAS kernel (condition partially truncated).
749 template<
typename VT1 >
756 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
768 if( ( IsComputation<MT>::value && !evaluate ) ||
770 TDVecDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A );
772 TDVecDMatMultExpr::selectBlasAddAssignKernel( ~lhs, x, A );
// Default (non-intrinsic) addAssign kernel: accumulate row-by-row, columns
// unrolled by two; jend = N with the low bit cleared (largest even <= N).
791 template<
typename VT1
794 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
795 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
797 const size_t M( A.rows() );
798 const size_t N( A.columns() );
801 const size_t jend( N &
size_t(-2) );
803 for(
size_t i=0UL; i<M; ++i ) {
804 for(
size_t j=0UL; j<jend; j+=2UL ) {
805 y[j ] += x[i] * A(i,j );
806 y[j+1UL] += x[i] * A(i,j+1UL);
// Tail column for odd N (guard presumably on a missing line — confirm in full file).
809 y[jend] += x[i] * A(i,jend);
// Vectorized addAssign kernel: y += x * A, same 8/4/3/2/1-vector column-panel
// scheme as the assign kernel. NOTE(review): the visible stores overwrite y
// rather than add to it — the load-accumulator initialization presumably sits on
// the missing lines (844-851 etc.); confirm against the full file.
830 template<
typename VT1
833 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
834 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
836 typedef IntrinsicTrait<ElementType> IT;
838 const size_t M( A.rows() );
// Padded row length: full-width stores beyond columns() land in the padding.
839 const size_t N( A.spacing() );
// 8-wide panel.
843 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
852 for(
size_t i=0UL; i<M; ++i ) {
854 xmm1 = xmm1 + x1 * A.get(i,j );
855 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
856 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
857 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
858 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
859 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
860 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
861 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
863 store( &y[j ], xmm1 );
864 store( &y[j+IT::size ], xmm2 );
865 store( &y[j+IT::size*2UL], xmm3 );
866 store( &y[j+IT::size*3UL], xmm4 );
867 store( &y[j+IT::size*4UL], xmm5 );
868 store( &y[j+IT::size*5UL], xmm6 );
869 store( &y[j+IT::size*6UL], xmm7 );
870 store( &y[j+IT::size*7UL], xmm8 );
// 4-wide panel.
872 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
877 for(
size_t i=0UL; i<M; ++i ) {
879 xmm1 = xmm1 + x1 * A.get(i,j );
880 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
881 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
882 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
884 store( &y[j ], xmm1 );
885 store( &y[j+IT::size ], xmm2 );
886 store( &y[j+IT::size*2UL], xmm3 );
887 store( &y[j+IT::size*3UL], xmm4 );
// 3-wide panel.
889 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
893 for(
size_t i=0UL; i<M; ++i ) {
895 xmm1 = xmm1 + x1 * A.get(i,j );
896 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
897 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
899 store( &y[j ], xmm1 );
900 store( &y[j+IT::size ], xmm2 );
901 store( &y[j+IT::size*2UL], xmm3 );
// 2-wide panel.
903 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
906 for(
size_t i=0UL; i<M; ++i ) {
908 xmm1 = xmm1 + x1 * A.get(i,j );
909 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
911 store( &y[j ], xmm1 );
912 store( &y[j+IT::size], xmm2 );
// Final single-vector strip.
916 for(
size_t i=0UL; i<M; ++i ) {
917 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
919 store( &y[j], xmm1 );
// BLAS kernel family for y += x * A: identical to the assign kernels except
// beta = 1, so gemv accumulates into y instead of overwriting it.
// Fallback overload: no BLAS-compatible types -> default kernel.
939 template<
typename VT1
942 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
943 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
945 selectDefaultAddAssignKernel( y, x, A );
// float: cblas_sgemv, alpha=1, beta=1.
965 template<
typename VT1
968 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
969 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
971 using boost::numeric_cast;
977 const int M ( numeric_cast<int>( A.rows() ) );
978 const int N ( numeric_cast<int>( A.columns() ) );
979 const int lda( numeric_cast<int>( A.spacing() ) );
981 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, 1.0F,
982 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// double: cblas_dgemv, alpha=1, beta=1.
1003 template<
typename VT1
1006 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1007 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1009 using boost::numeric_cast;
1015 const int M ( numeric_cast<int>( A.rows() ) );
1016 const int N ( numeric_cast<int>( A.columns() ) );
1017 const int lda( numeric_cast<int>( A.spacing() ) );
1019 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0,
1020 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// complex<float>: cblas_cgemv, alpha=(1,0), beta=(1,0).
1041 template<
typename VT1
1044 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1045 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1047 using boost::numeric_cast;
1056 const int M ( numeric_cast<int>( A.rows() ) );
1057 const int N ( numeric_cast<int>( A.columns() ) );
1058 const int lda( numeric_cast<int>( A.spacing() ) );
1059 const complex<float> alpha( 1.0F, 0.0F );
1060 const complex<float> beta ( 1.0F, 0.0F );
1062 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1063 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double>: cblas_zgemv, alpha=(1,0), beta=(1,0).
1084 template<
typename VT1
1087 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1088 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1090 using boost::numeric_cast;
1099 const int M ( numeric_cast<int>( A.rows() ) );
1100 const int N ( numeric_cast<int>( A.columns() ) );
1101 const int lda( numeric_cast<int>( A.spacing() ) );
1102 const complex<double> alpha( 1.0, 0.0 );
1103 const complex<double> beta ( 1.0, 0.0 );
1105 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1106 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// subAssign(DenseVector, TDVecDMatMultExpr): y -= x * A. Empty matrix is a
// no-op; otherwise dispatch to default vs. BLAS kernel.
1129 template<
typename VT1 >
1136 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1148 if( ( IsComputation<MT>::value && !evaluate ) ||
1150 TDVecDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A );
1152 TDVecDMatMultExpr::selectBlasSubAssignKernel( ~lhs, x, A );
// Default (non-intrinsic) subAssign kernel: mirror of the addAssign kernel with
// -= instead of +=; columns unrolled by two, jend = largest even <= N.
1171 template<
typename VT1
1174 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1175 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1177 const size_t M( A.rows() );
1178 const size_t N( A.columns() );
1181 const size_t jend( N &
size_t(-2) );
1183 for(
size_t i=0UL; i<M; ++i ) {
1184 for(
size_t j=0UL; j<jend; j+=2UL ) {
1185 y[j ] -= x[i] * A(i,j );
1186 y[j+1UL] -= x[i] * A(i,j+1UL);
// Tail column for odd N.
1189 y[jend] -= x[i] * A(i,jend);
// Vectorized subAssign kernel: y -= x * A; same panel scheme as addAssign but
// the accumulators subtract (xmm = xmm - x1 * A.get(...)). NOTE(review): the
// accumulator initialization (presumably loads from y) is on missing lines —
// confirm in the full file.
1210 template<
typename VT1
1213 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1214 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1216 typedef IntrinsicTrait<ElementType> IT;
1218 const size_t M( A.rows() );
1219 const size_t N( A.spacing() );
// 8-wide panel.
1223 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1232 for(
size_t i=0UL; i<M; ++i ) {
1234 xmm1 = xmm1 - x1 * A.get(i,j );
1235 xmm2 = xmm2 - x1 * A.get(i,j+IT::size );
1236 xmm3 = xmm3 - x1 * A.get(i,j+IT::size*2UL);
1237 xmm4 = xmm4 - x1 * A.get(i,j+IT::size*3UL);
1238 xmm5 = xmm5 - x1 * A.get(i,j+IT::size*4UL);
1239 xmm6 = xmm6 - x1 * A.get(i,j+IT::size*5UL);
1240 xmm7 = xmm7 - x1 * A.get(i,j+IT::size*6UL);
1241 xmm8 = xmm8 - x1 * A.get(i,j+IT::size*7UL);
1243 store( &y[j ], xmm1 );
1244 store( &y[j+IT::size ], xmm2 );
1245 store( &y[j+IT::size*2UL], xmm3 );
1246 store( &y[j+IT::size*3UL], xmm4 );
1247 store( &y[j+IT::size*4UL], xmm5 );
1248 store( &y[j+IT::size*5UL], xmm6 );
1249 store( &y[j+IT::size*6UL], xmm7 );
1250 store( &y[j+IT::size*7UL], xmm8 );
// 4-wide panel.
1252 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1257 for(
size_t i=0UL; i<M; ++i ) {
1259 xmm1 = xmm1 - x1 * A.get(i,j );
1260 xmm2 = xmm2 - x1 * A.get(i,j+IT::size );
1261 xmm3 = xmm3 - x1 * A.get(i,j+IT::size*2UL);
1262 xmm4 = xmm4 - x1 * A.get(i,j+IT::size*3UL);
1264 store( &y[j ], xmm1 );
1265 store( &y[j+IT::size ], xmm2 );
1266 store( &y[j+IT::size*2UL], xmm3 );
1267 store( &y[j+IT::size*3UL], xmm4 );
// 3-wide panel.
1269 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
1273 for(
size_t i=0UL; i<M; ++i ) {
1275 xmm1 = xmm1 - x1 * A.get(i,j );
1276 xmm2 = xmm2 - x1 * A.get(i,j+IT::size );
1277 xmm3 = xmm3 - x1 * A.get(i,j+IT::size*2UL);
1279 store( &y[j ], xmm1 );
1280 store( &y[j+IT::size ], xmm2 );
1281 store( &y[j+IT::size*2UL], xmm3 );
// 2-wide panel.
1283 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1286 for(
size_t i=0UL; i<M; ++i ) {
1288 xmm1 = xmm1 - x1 * A.get(i,j );
1289 xmm2 = xmm2 - x1 * A.get(i,j+IT::size);
1291 store( &y[j ], xmm1 );
1292 store( &y[j+IT::size], xmm2 );
// Final single-vector strip.
1296 for(
size_t i=0UL; i<M; ++i ) {
1297 xmm1 = xmm1 -
set( x[i] ) * A.get(i,j);
1299 store( &y[j], xmm1 );
// BLAS kernel family for y -= x * A: alpha = -1 and beta = 1, so gemv computes
// y = y - x*A in a single call.
// Fallback overload: no BLAS-compatible types -> default kernel.
1319 template<
typename VT1
1322 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1323 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1325 selectDefaultSubAssignKernel( y, x, A );
// float: cblas_sgemv, alpha=-1, beta=1.
1345 template<
typename VT1
1348 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1349 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1351 using boost::numeric_cast;
1357 const int M ( numeric_cast<int>( A.rows() ) );
1358 const int N ( numeric_cast<int>( A.columns() ) );
1359 const int lda( numeric_cast<int>( A.spacing() ) );
1361 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, -1.0F,
1362 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// double: cblas_dgemv, alpha=-1, beta=1.
1383 template<
typename VT1
1386 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1387 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1389 using boost::numeric_cast;
1395 const int M ( numeric_cast<int>( A.rows() ) );
1396 const int N ( numeric_cast<int>( A.columns() ) );
1397 const int lda( numeric_cast<int>( A.spacing() ) );
1399 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, -1.0,
1400 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// complex<float>: cblas_cgemv, alpha=(-1,0), beta=(1,0).
1421 template<
typename VT1
1424 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1425 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1427 using boost::numeric_cast;
1436 const int M ( numeric_cast<int>( A.rows() ) );
1437 const int N ( numeric_cast<int>( A.columns() ) );
1438 const int lda( numeric_cast<int>( A.spacing() ) );
1439 const complex<float> alpha( -1.0F, 0.0F );
1440 const complex<float> beta ( 1.0F, 0.0F );
1442 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1443 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double>: cblas_zgemv, alpha=(-1,0), beta=(1,0).
1464 template<
typename VT1
1467 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1468 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1470 using boost::numeric_cast;
1479 const int M ( numeric_cast<int>( A.rows() ) );
1480 const int N ( numeric_cast<int>( A.columns() ) );
1481 const int lda( numeric_cast<int>( A.spacing() ) );
1482 const complex<double> alpha( -1.0, 0.0 );
1483 const complex<double> beta ( 1.0, 0.0 );
1485 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1486 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Fragment of a multAssign friend (body on missing lines), followed by the
// specialization of DVecScalarMultExpr for scaled vector/matrix products
// (x^T * A) * scalar, which folds the scalar into the gemv alpha argument.
1509 template<
typename VT1 >
1558 template<
typename VT
1562 :
public DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true >
1563 ,
private Expression
1564 ,
private Computation
// VMM = the wrapped vector/matrix product; RES = its result type.
1568 typedef TDVecDMatMultExpr<VT,MT> VMM;
1569 typedef typename VMM::ResultType RES;
1570 typedef typename VT::ResultType
VRT;
1571 typedef typename MT::ResultType
MRT;
1572 typedef typename VRT::ElementType
VET;
1573 typedef typename MRT::ElementType
MET;
1574 typedef typename VT::CompositeType
VCT;
1575 typedef typename MT::CompositeType
MCT;
// evaluate: pre-evaluate a composite (non-vectorizable) matrix operand when the
// element types match and are BLAS-compatible.
1580 enum { evaluate = IsComputation<MT>::value && !MT::vectorizable &&
1581 IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
// Kernel selectors: same scheme as TDVecDMatMultExpr, extended with the scalar
// type T4 — real BLAS kernels are excluded for complex scalars.
1589 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1590 struct UseSinglePrecisionKernel {
1591 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1592 IsFloat<typename T1::ElementType>::value &&
1593 IsFloat<typename T2::ElementType>::value &&
1594 IsFloat<typename T3::ElementType>::value &&
1595 !IsComplex<T4>::value };
1604 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1605 struct UseDoublePrecisionKernel {
1606 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1607 IsDouble<typename T1::ElementType>::value &&
1608 IsDouble<typename T2::ElementType>::value &&
1609 IsDouble<typename T3::ElementType>::value &&
1610 !IsComplex<T4>::value };
1619 template<
typename T1,
typename T2,
typename T3 >
1620 struct UseSinglePrecisionComplexKernel {
1621 typedef complex<float> Type;
1622 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1623 IsSame<typename T1::ElementType,Type>::value &&
1624 IsSame<typename T2::ElementType,Type>::value &&
1625 IsSame<typename T3::ElementType,Type>::value };
1634 template<
typename T1,
typename T2,
typename T3 >
1635 struct UseDoublePrecisionComplexKernel {
1636 typedef complex<double> Type;
1637 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1638 IsSame<typename T1::ElementType,Type>::value &&
1639 IsSame<typename T2::ElementType,Type>::value &&
1640 IsSame<typename T3::ElementType,Type>::value };
// Fallback to the hand-written kernel when BLAS is off or no type combo matches.
1648 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1649 struct UseDefaultKernel {
1650 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1651 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1652 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1653 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Vectorized default kernel also requires the scalar type to equal the element type.
1662 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1663 struct UseVectorizedDefaultKernel {
1664 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1665 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1666 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1667 IsSame<typename T1::ElementType,T4>::value &&
1668 IntrinsicTrait<typename T1::ElementType>::addition &&
1669 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Public type aliases of the scaled expression.
1675 typedef DVecScalarMultExpr<VMM,ST,true>
This;
1676 typedef typename MultTrait<RES,ST>::Type
ResultType;
1678 typedef typename ResultType::ElementType
ElementType;
1679 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
1684 typedef const TDVecDMatMultExpr<VT,MT>
LeftOperand;
// LT/RT: composite types used when evaluating operands for kernel calls.
1690 typedef typename SelectType< IsComputation<VT>::value,
const VRT,
VCT >::Type
LT;
1693 typedef typename SelectType< evaluate, const MRT, MCT >::Type
RT;
// The expression itself is never accessed element-wise via intrinsics.
1698 enum { vectorizable = 0 };
// Constructor: wraps the vector/matrix product and the scalar factor.
1707 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
// Element access: scale the wrapped product's element on demand.
1721 return vector_[index] * scalar_;
1730 inline size_t size()
const {
1731 return vector_.size();
// Aliasing queries delegate to the wrapped expression.
1761 template<
typename T >
1762 inline bool canAlias(
const T* alias )
const {
1763 return vector_.canAlias( alias );
1773 template<
typename T >
1774 inline bool isAliased(
const T* alias )
const {
1775 return vector_.isAliased( alias );
// assign(DenseVector, DVecScalarMultExpr): unwrap the inner product's operands,
// early-out for an empty matrix, then dispatch to the default/BLAS kernel with
// the scalar passed through.
1797 template<
typename VT1 >
1798 friend inline void assign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
1804 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
1805 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
1807 if( right.rows() == 0UL ) {
1811 else if( right.columns() == 0UL ) {
1823 if( ( IsComputation<MT>::value && !evaluate ) ||
1825 DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, x, A, rhs.scalar_ );
1827 DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Default (non-intrinsic) scaled assign kernel: y = (x * A) * scalar. Computes
// the unscaled product first (columns unrolled by two), then a final pass
// multiplies y by scalar (scaling loop at line 1870, body on missing lines).
1845 template<
typename VT1
1849 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1850 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1852 const size_t M( A.rows() );
1853 const size_t N( A.columns() );
// jend = largest even number <= N (low bit cleared).
1856 const size_t jend( N &
size_t(-2) );
1858 for(
size_t j=0UL; j<N; ++j ) {
1859 y[j] = x[0UL] * A(0UL,j);
1861 for(
size_t i=1UL; i<M; ++i ) {
1862 for(
size_t j=0UL; j<jend; j+=2UL ) {
1863 y[j ] += x[i] * A(i,j );
1864 y[j+1UL] += x[i] * A(i,j+1UL);
1867 y[jend] += x[i] * A(i,jend);
// Final scaling pass over all N columns.
1870 for(
size_t j=0UL; j<N; ++j ) {
// Vectorized scaled assign kernel: same 8/4/3/2/1-panel accumulation as the
// unscaled kernel, but each store multiplies by 'factor' (an intrinsic broadcast
// of scalar, set on a missing line).
1890 template<
typename VT1
1894 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1895 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1897 typedef IntrinsicTrait<ElementType> IT;
1899 const size_t M( A.rows() );
1900 const size_t N( A.spacing() );
// 8-wide panel.
1906 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1907 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1908 for(
size_t i=0UL; i<M; ++i ) {
1910 xmm1 = xmm1 + x1 * A.get(i,j );
1911 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
1912 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
1913 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
1914 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
1915 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
1916 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
1917 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
1919 store( &y[j ], xmm1*factor );
1920 store( &y[j+IT::size ], xmm2*factor );
1921 store( &y[j+IT::size*2UL], xmm3*factor );
1922 store( &y[j+IT::size*3UL], xmm4*factor );
1923 store( &y[j+IT::size*4UL], xmm5*factor );
1924 store( &y[j+IT::size*5UL], xmm6*factor );
1925 store( &y[j+IT::size*6UL], xmm7*factor );
1926 store( &y[j+IT::size*7UL], xmm8*factor );
// 4-wide panel.
1928 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1930 for(
size_t i=0UL; i<M; ++i ) {
1932 xmm1 = xmm1 + x1 * A.get(i,j );
1933 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
1934 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
1935 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
1937 store( &y[j ], xmm1*factor );
1938 store( &y[j+IT::size ], xmm2*factor );
1939 store( &y[j+IT::size*2UL], xmm3*factor );
1940 store( &y[j+IT::size*3UL], xmm4*factor );
// 3-wide panel.
1942 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
1944 for(
size_t i=0UL; i<M; ++i ) {
1946 xmm1 = xmm1 + x1 * A.get(i,j );
1947 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
1948 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
1950 store( &y[j ], xmm1*factor );
1951 store( &y[j+IT::size ], xmm2*factor );
1952 store( &y[j+IT::size*2UL], xmm3*factor );
// 2-wide panel.
1954 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1956 for(
size_t i=0UL; i<M; ++i ) {
1958 xmm1 = xmm1 + x1 * A.get(i,j );
1959 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
1961 store( &y[j ], xmm1*factor );
1962 store( &y[j+IT::size], xmm2*factor );
// Final single-vector strip.
1966 for(
size_t i=0UL; i<M; ++i ) {
1967 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
1969 store( &y[j], xmm1*factor );
// BLAS kernels for y = (x * A) * scalar: the scalar is folded into gemv's alpha,
// beta = 0 overwrites y.
// Fallback overload: no BLAS-compatible types -> default kernel.
1987 template<
typename VT1
1991 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1992 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1994 selectDefaultAssignKernel( y, x, A, scalar );
// float: cblas_sgemv with alpha=scalar.
2013 template<
typename VT1
2017 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2018 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2020 using boost::numeric_cast;
// numeric_cast guards the size_t -> int narrowing required by CBLAS.
2026 const int M ( numeric_cast<int>( A.rows() ) );
2027 const int N ( numeric_cast<int>( A.columns() ) );
2028 const int lda( numeric_cast<int>( A.spacing() ) );
2030 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2031 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// double: cblas_dgemv with alpha=scalar.
2051 template<
typename VT1
2055 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2056 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2058 using boost::numeric_cast;
2064 const int M ( numeric_cast<int>( A.rows() ) );
2065 const int N ( numeric_cast<int>( A.columns() ) );
2066 const int lda( numeric_cast<int>( A.spacing() ) );
2068 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2069 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// complex<float>: cblas_cgemv; alpha constructed from scalar, beta=(0,0).
2089 template<
typename VT1
2093 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2094 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2096 using boost::numeric_cast;
2106 const int M ( numeric_cast<int>( A.rows() ) );
2107 const int N ( numeric_cast<int>( A.columns() ) );
2108 const int lda( numeric_cast<int>( A.spacing() ) );
2109 const complex<float> alpha( scalar );
2110 const complex<float> beta ( 0.0F, 0.0F );
2112 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2113 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double>: cblas_zgemv; alpha constructed from scalar, beta=(0,0).
2133 template<
typename VT1
2137 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2138 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2140 using boost::numeric_cast;
2150 const int M ( numeric_cast<int>( A.rows() ) );
2151 const int N ( numeric_cast<int>( A.columns() ) );
2152 const int lda( numeric_cast<int>( A.spacing() ) );
2153 const complex<double> alpha( scalar );
2154 const complex<double> beta ( 0.0, 0.0 );
2156 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2157 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Fragment: assign to a sparse vector target (body on missing lines).
2174 template<
typename VT1 >
2175 friend inline void assign( SparseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// addAssign(DenseVector, DVecScalarMultExpr): y += (x * A) * scalar. Empty
// matrix is a no-op; otherwise dispatch to default vs. BLAS kernel.
2202 template<
typename VT1 >
2203 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2209 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2210 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
2212 if( right.rows() == 0UL || right.columns() == 0UL ) {
2224 if( ( IsComputation<MT>::value && !evaluate ) ||
2226 DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
2228 DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Default scaled addAssign kernel: simply re-expresses the operation through the
// expression machinery (y.addAssign of the scaled product).
2246 template<
typename VT1
2250 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2251 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2253 y.addAssign( x * A * scalar );
// Vectorized scaled addAssign kernel: accumulate each column panel, then
// load-add-store so y receives y + (accumulator * factor), where 'factor' is an
// intrinsic broadcast of scalar (set on a missing line).
2271 template<
typename VT1
2275 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2276 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2278 typedef IntrinsicTrait<ElementType> IT;
2280 const size_t M( A.rows() );
2281 const size_t N( A.spacing() );
// 8-wide panel.
2287 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2288 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2289 for(
size_t i=0UL; i<M; ++i ) {
2291 xmm1 = xmm1 + x1 * A.get(i,j );
2292 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2293 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2294 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2295 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
2296 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
2297 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
2298 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
2300 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2301 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) + xmm2*factor );
2302 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) + xmm3*factor );
2303 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) + xmm4*factor );
2304 store( &y[j+IT::size*4UL],
load( &y[j+IT::size*4UL] ) + xmm5*factor );
2305 store( &y[j+IT::size*5UL],
load( &y[j+IT::size*5UL] ) + xmm6*factor );
2306 store( &y[j+IT::size*6UL],
load( &y[j+IT::size*6UL] ) + xmm7*factor );
2307 store( &y[j+IT::size*7UL],
load( &y[j+IT::size*7UL] ) + xmm8*factor );
// 4-wide panel.
2309 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2311 for(
size_t i=0UL; i<M; ++i ) {
2313 xmm1 = xmm1 + x1 * A.get(i,j );
2314 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2315 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2316 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2318 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2319 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) + xmm2*factor );
2320 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) + xmm3*factor );
2321 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) + xmm4*factor );
// 3-wide panel.
2323 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
2325 for(
size_t i=0UL; i<M; ++i ) {
2327 xmm1 = xmm1 + x1 * A.get(i,j );
2328 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2329 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2331 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2332 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) + xmm2*factor );
2333 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) + xmm3*factor );
// 2-wide panel.
2335 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2337 for(
size_t i=0UL; i<M; ++i ) {
2339 xmm1 = xmm1 + x1 * A.get(i,j );
2340 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
2342 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2343 store( &y[j+IT::size],
load( &y[j+IT::size] ) + xmm2*factor );
// Final single-vector strip.
2347 for(
size_t i=0UL; i<M; ++i ) {
2348 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
2350 store( &y[j],
load( &y[j] ) + xmm1*factor );
// BLAS kernels for y += (x * A) * scalar: alpha = scalar, beta = 1 accumulates.
// (This family is truncated at the end of the visible chunk.)
// Fallback overload: no BLAS-compatible types -> default kernel.
2369 template<
typename VT1
2373 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2374 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2376 selectDefaultAddAssignKernel( y, x, A, scalar );
// float: cblas_sgemv, alpha=scalar, beta=1.
2395 template<
typename VT1
2399 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2400 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2402 using boost::numeric_cast;
2408 const int M ( numeric_cast<int>( A.rows() ) );
2409 const int N ( numeric_cast<int>( A.columns() ) );
2410 const int lda( numeric_cast<int>( A.spacing() ) );
2412 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2413 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// double: cblas_dgemv, alpha=scalar, beta=1.
2433 template<
typename VT1
2437 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2438 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2440 using boost::numeric_cast;
2446 const int M ( numeric_cast<int>( A.rows() ) );
2447 const int N ( numeric_cast<int>( A.columns() ) );
2448 const int lda( numeric_cast<int>( A.spacing() ) );
2450 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2451 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// complex<float>: cblas_cgemv, alpha=complex(scalar), beta=(1,0).
2471 template<
typename VT1
2475 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2476 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2478 using boost::numeric_cast;
2488 const int M ( numeric_cast<int>( A.rows() ) );
2489 const int N ( numeric_cast<int>( A.columns() ) );
2490 const int lda( numeric_cast<int>( A.spacing() ) );
2491 const complex<float> alpha( scalar );
2492 const complex<float> beta ( 1.0F, 0.0F );
2494 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2495 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> overload begins here; its body continues past the end of this chunk.
2515 template<
typename VT1
2519 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2520 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2522 using boost::numeric_cast;
2532 const int M ( numeric_cast<int>( A.rows() ) );
2533 const int N ( numeric_cast<int>( A.columns() ) );
2534 const int lda( numeric_cast<int>( A.spacing() ) );
2535 const complex<double> alpha( scalar );
2536 const complex<double> beta ( 1.0, 0.0 );
2538 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2539 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// NOTE(review): the numeric prefixes (e.g. "2560") are source-listing line numbers left by
// the extraction, not code; several statements of the original function body (asserts, the
// evaluation of the operands into x and A, braces) are missing from this extract.
2560 template<
typename VT1 >
// Subtraction assignment of a scaled transpose vector/matrix product to a dense vector:
// lhs -= ( x^T * A ) * scalar.
2561 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Split the inner vector/matrix multiplication expression into its two operands.
2567 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2568 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Empty matrix operand: nothing to subtract (early return presumed — body truncated here).
2570 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Kernel dispatch: computation-based/non-evaluated or small operands go to the default
// kernel, otherwise the BLAS kernel is used (second condition truncated in this extract).
2582 if( ( IsComputation<MT>::value && !evaluate ) ||
2584 DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
2586 DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
2604 template<
typename VT1
2608 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2609 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2611 y.subAssign( x * A * scalar );
// NOTE(review): the numeric prefixes (e.g. "2629") are source-listing line numbers left by
// the extraction, not code; the declarations of 'j', 'factor' (intrinsic broadcast of
// 'scalar'), 'x1' (broadcast of x[i]) and several braces are missing from this extract.
2629 template<
typename VT1
// Vectorized kernel: y -= scalar * ( x^T * A ), processed column-block-wise with SIMD
// intrinsics. Each outer loop handles a progressively narrower block width (8/4/3/2/1
// intrinsic vectors per iteration) over the columns of A.
2633 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2634 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// IT::size is the number of elements per intrinsic vector for ElementType.
2636 typedef IntrinsicTrait<ElementType> IT;
// N uses A.spacing() (the padded row length), not A.columns() — the SIMD loads assume
// padded storage.
2638 const size_t M( A.rows() );
2639 const size_t N( A.spacing() );
// Block width 8: accumulate eight partial column sums over all rows i, then subtract
// the scaled accumulators from y in one pass.
2645 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2646 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2647 for(
size_t i=0UL; i<M; ++i ) {
// x1 is x[i] broadcast to an intrinsic vector (declaration dropped by the extraction).
2649 xmm1 = xmm1 + x1 * A.get(i,j );
2650 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2651 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2652 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2653 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
2654 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
2655 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
2656 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
// Subtract the scaled accumulators: y[block] = y[block] - xmmK*factor.
2658 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2659 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) - xmm2*factor );
2660 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) - xmm3*factor );
2661 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) - xmm4*factor );
2662 store( &y[j+IT::size*4UL],
load( &y[j+IT::size*4UL] ) - xmm5*factor );
2663 store( &y[j+IT::size*5UL],
load( &y[j+IT::size*5UL] ) - xmm6*factor );
2664 store( &y[j+IT::size*6UL],
load( &y[j+IT::size*6UL] ) - xmm7*factor );
2665 store( &y[j+IT::size*7UL],
load( &y[j+IT::size*7UL] ) - xmm8*factor );
// Block width 4 (remainder handling after the width-8 loop).
2667 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2669 for(
size_t i=0UL; i<M; ++i ) {
2671 xmm1 = xmm1 + x1 * A.get(i,j );
2672 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2673 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2674 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2676 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2677 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) - xmm2*factor );
2678 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) - xmm3*factor );
2679 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) - xmm4*factor );
// Block width 3.
2681 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
2683 for(
size_t i=0UL; i<M; ++i ) {
2685 xmm1 = xmm1 + x1 * A.get(i,j );
2686 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2687 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2689 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2690 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) - xmm2*factor );
2691 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) - xmm3*factor );
// Block width 2.
2693 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2695 for(
size_t i=0UL; i<M; ++i ) {
2697 xmm1 = xmm1 + x1 * A.get(i,j );
2698 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
2700 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2701 store( &y[j+IT::size],
load( &y[j+IT::size] ) - xmm2*factor );
// Final single-vector remainder: the broadcast of x[i] is done inline via set().
2705 for(
size_t i=0UL; i<M; ++i ) {
2706 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
2708 store( &y[j],
load( &y[j] ) - xmm1*factor );
2727 template<
typename VT1
2731 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2732 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2734 selectDefaultSubAssignKernel( y, x, A, scalar );
2753 template<
typename VT1
2757 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2758 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2760 using boost::numeric_cast;
2766 const int M ( numeric_cast<int>( A.rows() ) );
2767 const int N ( numeric_cast<int>( A.columns() ) );
2768 const int lda( numeric_cast<int>( A.spacing() ) );
2770 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, -scalar,
2771 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
2791 template<
typename VT1
2795 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2796 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2798 using boost::numeric_cast;
2804 const int M ( numeric_cast<int>( A.rows() ) );
2805 const int N ( numeric_cast<int>( A.columns() ) );
2806 const int lda( numeric_cast<int>( A.spacing() ) );
2808 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, -scalar,
2809 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
2829 template<
typename VT1
2833 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2834 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2836 using boost::numeric_cast;
2846 const int M ( numeric_cast<int>( A.rows() ) );
2847 const int N ( numeric_cast<int>( A.columns() ) );
2848 const int lda( numeric_cast<int>( A.spacing() ) );
2849 const complex<float> alpha( -scalar );
2850 const complex<float> beta ( 1.0F, 0.0F );
2852 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2853 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2873 template<
typename VT1
2877 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2878 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2880 using boost::numeric_cast;
2890 const int M ( numeric_cast<int>( A.rows() ) );
2891 const int N ( numeric_cast<int>( A.columns() ) );
2892 const int lda( numeric_cast<int>( A.spacing() ) );
2893 const complex<double> alpha( -scalar );
2894 const complex<double> beta ( 1.0, 0.0 );
2896 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2897 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// NOTE(review): the numeric prefixes are source-listing line numbers left by the extraction;
// the entire function body is missing from this extract — only the signature is visible.
// Multiplication assignment of the scaled transpose vector/matrix product to a dense vector.
2918 template<
typename VT1 >
2919 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// NOTE(review): the numeric prefixes are source-listing line numbers left by the extraction;
// the template parameter list tail, function signature line, and the return statement are
// missing from this extract.
// Free operator for the multiplication of a transpose dense vector and a row-major dense
// matrix; disabled when the matrix operand is itself a matrix/matrix multiplication
// expression (that case is handled by the EnableIf overload below).
2991 template<
typename T1
2993 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecDMatMultExpr<T1,T2> >::Type
// Runtime size check: the vector length must match the number of matrix rows.
2998 if( (~vec).
size() != (~mat).
rows() )
2999 throw std::invalid_argument(
"Vector and matrix sizes do not match" );
3027 template<
typename T1
3030 inline const typename EnableIf< IsMatMatMultExpr<T2>, MultExprTrait<T1,T2> >::Type::Type