35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
// Class head of the expression template TDVecTDMatMultExpr<VT,MT> (the MT template
// parameter is declared on a line not shown in this excerpt). The class passes its
// own type to the DenseVector base (second argument `true` -- presumably the
// transpose flag; confirm against DenseVector) and privately inherits from
// TVecMatMultExpr and Computation.
101 template<
typename VT
103 class TDVecTDMatMultExpr :
public DenseVector< TDVecTDMatMultExpr<VT,MT>, true >
104 ,
private TVecMatMultExpr
105 ,
private Computation
// Compile-time predicate: `value` is non-zero only if T1, T2 and T3 are all
// vectorizable and IsFloat<...>::value holds for each of their element types.
// It gates the selectBlas*Kernel overloads that call cblas_sgemv.
134 template<
typename T1,
typename T2,
typename T3 >
135 struct UseSinglePrecisionKernel {
136 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
137 IsFloat<typename T1::ElementType>::value &&
138 IsFloat<typename T2::ElementType>::value &&
139 IsFloat<typename T3::ElementType>::value };
// Compile-time predicate: `value` is non-zero only if T1, T2 and T3 are all
// vectorizable and IsDouble<...>::value holds for each of their element types.
// It gates the selectBlas*Kernel overloads that call cblas_dgemv.
150 template<
typename T1,
typename T2,
typename T3 >
151 struct UseDoublePrecisionKernel {
152 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
153 IsDouble<typename T1::ElementType>::value &&
154 IsDouble<typename T2::ElementType>::value &&
155 IsDouble<typename T3::ElementType>::value };
// Compile-time predicate: `value` is non-zero only if T1, T2 and T3 are all
// vectorizable and each has exactly complex<float> as element type.
// It gates the selectBlas*Kernel overloads that call cblas_cgemv.
166 template<
typename T1,
typename T2,
typename T3 >
167 struct UseSinglePrecisionComplexKernel {
// Element type all three operands must share.
168 typedef complex<float> Type;
169 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
170 IsSame<typename T1::ElementType,Type>::value &&
171 IsSame<typename T2::ElementType,Type>::value &&
172 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: `value` is non-zero only if T1, T2 and T3 are all
// vectorizable and each has exactly complex<double> as element type.
// It gates the selectBlas*Kernel overloads that call cblas_zgemv.
183 template<
typename T1,
typename T2,
typename T3 >
184 struct UseDoublePrecisionComplexKernel {
// Element type all three operands must share.
185 typedef complex<double> Type;
186 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
187 IsSame<typename T1::ElementType,Type>::value &&
188 IsSame<typename T2::ElementType,Type>::value &&
189 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the fallback path: `value` is non-zero if
// BLAZE_BLAS_MODE evaluates to false, or if none of the four BLAS predicates
// (single, double, complex<float>, complex<double>) holds for T1, T2, T3.
199 template<
typename T1,
typename T2,
typename T3 >
200 struct UseDefaultKernel {
201 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
202 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
203 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
204 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the hand-vectorized default kernels: all three types
// must be vectorizable, share one element type, and IntrinsicTrait of that element
// type must report both `addition` and `multiplication`.
215 template<
typename T1,
typename T2,
typename T3 >
216 struct UseVectorizedDefaultKernel {
217 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
218 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
219 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
220 IntrinsicTrait<typename T1::ElementType>::addition &&
221 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compilation flags of the expression: both `vectorizable` and `smpAssignable`
// are fixed to 0 for this expression type.
251 enum { vectorizable = 0 };
254 enum { smpAssignable = 0 };
// Fragment of an element computation; the enclosing function header and the
// arithmetic statements are on lines not shown in this excerpt (presumably the
// subscript operator -- confirm). Work is only done when the matrix operand has
// at least one row.
283 if(
mat_.rows() != 0UL ) {
// Walks the index j in steps of two, starting at 1 and stopping before end_
// (end_ is defined on a hidden line -- presumably the row count rounded down to
// an even number; TODO confirm).
285 for(
size_t j=1UL; j<
end_; j+=2UL ) {
// Handles the case where end_ is smaller than the number of matrix rows.
288 if( end_ < mat_.rows() ) {
// Returns the number of columns of the matrix operand (enclosing function header
// not shown -- presumably size(); confirm).
306 return mat_.columns();
// Aliasing query (function header not shown in this excerpt -- presumably
// canAlias(); confirm). Yields true if either the vector operand or the matrix
// operand reports being aliased with `alias`.
336 template<
typename T >
338 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Aliasing query (function header not shown in this excerpt -- presumably
// isAliased(); confirm). Yields true if either the vector operand or the matrix
// operand reports being aliased with `alias`.
348 template<
typename T >
350 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Assignment of the vector/matrix product to a target `lhs` (function header not
// shown in this excerpt). The visible lines show the special cases and the kernel
// dispatch; the operand evaluation into x and A happens on hidden lines.
374 template<
typename VT1 >
// Special case: matrix operand without rows (handling is on hidden lines).
381 if( rhs.mat_.rows() == 0UL ) {
// Special case: matrix operand without columns (handling is on hidden lines).
385 else if( rhs.mat_.columns() == 0UL ) {
// Two alternative kernels; the condition selecting between them is not shown.
399 TDVecTDMatMultExpr::selectDefaultAssignKernel( ~lhs, x, A );
401 TDVecTDMatMultExpr::selectBlasAssignKernel( ~lhs, x, A );
// Overload of the default assign kernel taking the target y, the vector operand x
// and the matrix operand A. Its return type / enabling condition and its body are
// on lines not shown (presumably the non-vectorized counterpart of the
// EnableIf< UseVectorizedDefaultKernel > overload -- confirm).
420 template<
typename VT1
424 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Vectorized default assign kernel, enabled when UseVectorizedDefaultKernel holds
// for (VT1,VT2,MT1). Each y[j] is set to the reduction over i of x and column j
// of A. Columns are processed in groups of 8, 4, 3 and 2, then a leftover column.
// NOTE(review): `j` and `x1` are declared on hidden lines (x1 presumably is
// x.load(i)); the accumulators are not explicitly initialized here, so the code
// presumably relies on IntrinsicType default-constructing to zero -- confirm.
445 template<
typename VT1
448 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
449 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
451 typedef IntrinsicTrait<ElementType> IT;
// M: rows of A (length of each reduction); N: columns of A (number of results).
453 const size_t M( A.rows() );
454 const size_t N( A.columns() );
// Eight columns per pass, one accumulator per column.
458 for( ; (j+8UL) <= N; j+=8UL ) {
459 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Rows advance by IT::size per step (presumably relies on padded storage when M
// is not a multiple of IT::size -- confirm).
460 for(
size_t i=0UL; i<M; i+=IT::size ) {
462 xmm1 = xmm1 + x1 * A.load(i,j );
463 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
464 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
465 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
466 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
467 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
468 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
469 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// sum() reduces each accumulator to one value, stored in the matching y element
// (the store to y[j] itself is on a hidden line).
472 y[j+1UL] =
sum( xmm2 );
473 y[j+2UL] =
sum( xmm3 );
474 y[j+3UL] =
sum( xmm4 );
475 y[j+4UL] =
sum( xmm5 );
476 y[j+5UL] =
sum( xmm6 );
477 y[j+6UL] =
sum( xmm7 );
478 y[j+7UL] =
sum( xmm8 );
// Four columns per pass.
480 for( ; (j+4UL) <= N; j+=4UL ) {
482 for(
size_t i=0UL; i<M; i+=IT::size ) {
484 xmm1 = xmm1 + x1 * A.load(i,j );
485 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
486 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
487 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
490 y[j+1UL] =
sum( xmm2 );
491 y[j+2UL] =
sum( xmm3 );
492 y[j+3UL] =
sum( xmm4 );
// Three columns per pass.
494 for( ; (j+3UL) <= N; j+=3UL ) {
496 for(
size_t i=0UL; i<M; i+=IT::size ) {
498 xmm1 = xmm1 + x1 * A.load(i,j );
499 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
500 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
503 y[j+1UL] =
sum( xmm2 );
504 y[j+2UL] =
sum( xmm3 );
// Two columns per pass.
506 for( ; (j+2UL) <= N; j+=2UL ) {
508 for(
size_t i=0UL; i<M; i+=IT::size ) {
510 xmm1 = xmm1 + x1 * A.load(i,j );
511 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
514 y[j+1UL] =
sum( xmm2 );
// Leftover single column j; the enclosing control statement and the store to
// y[j] are on hidden lines.
518 for(
size_t i=0UL; i<M; i+=IT::size ) {
519 xmm1 = xmm1 + A.load(i,j) * x.load(i);
// BLAS assign kernel overload selected when UseDefaultKernel holds for
// (VT1,VT2,MT1): no BLAS routine is used, the call is forwarded to the default
// assign kernel.
541 template<
typename VT1
544 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
545 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
547 selectDefaultAssignKernel( y, x, A );
// BLAS assign kernel for the single precision case (UseSinglePrecisionKernel).
// Calls cblas_sgemv with CblasColMajor/CblasTrans, alpha = 1 and beta = 0, which
// by the GEMV contract (y := alpha*op(A)*x + beta*y) overwrites y with A^T * x.
567 template<
typename VT1
570 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
571 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
573 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
579 const int M ( numeric_cast<int>( A.rows() ) );
580 const int N ( numeric_cast<int>( A.columns() ) );
581 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are passed with unit stride.
583 cblas_sgemv( CblasColMajor, CblasTrans, M, N, 1.0F,
584 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// BLAS assign kernel for the double precision case (UseDoublePrecisionKernel).
// Calls cblas_dgemv with CblasColMajor/CblasTrans, alpha = 1 and beta = 0, which
// by the GEMV contract (y := alpha*op(A)*x + beta*y) overwrites y with A^T * x.
605 template<
typename VT1
608 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
609 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
611 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
617 const int M ( numeric_cast<int>( A.rows() ) );
618 const int N ( numeric_cast<int>( A.columns() ) );
619 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are passed with unit stride.
621 cblas_dgemv( CblasColMajor, CblasTrans, M, N, 1.0,
622 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// BLAS assign kernel for complex<float> operands (UseSinglePrecisionComplexKernel).
// Calls cblas_cgemv with CblasColMajor/CblasTrans, alpha = (1,0) and beta = (0,0),
// which by the GEMV contract overwrites y with A^T * x.
643 template<
typename VT1
646 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
647 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
649 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
658 const int M ( numeric_cast<int>( A.rows() ) );
659 const int N ( numeric_cast<int>( A.columns() ) );
660 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
661 const complex<float> alpha( 1.0F, 0.0F );
662 const complex<float> beta ( 0.0F, 0.0F );
664 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
665 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS assign kernel for complex<double> operands (UseDoublePrecisionComplexKernel).
// Calls cblas_zgemv with CblasColMajor/CblasTrans, alpha = (1,0) and beta = (0,0),
// which by the GEMV contract overwrites y with A^T * x.
686 template<
typename VT1
689 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
690 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
692 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
701 const int M ( numeric_cast<int>( A.rows() ) );
702 const int N ( numeric_cast<int>( A.columns() ) );
703 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
704 const complex<double> alpha( 1.0, 0.0 );
705 const complex<double> beta ( 0.0, 0.0 );
707 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
708 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
727 template<
typename VT1 >
// Addition assignment of the vector/matrix product to a target `lhs` (function
// header not shown in this excerpt). The operand evaluation into x and A happens
// on hidden lines.
757 template<
typename VT1 >
// Special case: a matrix operand without rows or without columns (handling is on
// hidden lines).
764 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Kernel dispatch: the default kernel is chosen when the matrix operand is a
// computation that is not evaluated, or when a further condition on a hidden line
// holds; the BLAS kernel is the alternative.
776 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
778 TDVecTDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A );
780 TDVecTDMatMultExpr::selectBlasAddAssignKernel( ~lhs, x, A );
// Non-vectorized default add-assign kernel, selected when
// UseVectorizedDefaultKernel does not hold for (VT1,VT2,MT1): delegates to the
// target's own addAssign() with the product expression x * A.
799 template<
typename VT1
802 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
803 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
805 y.addAssign( x * A );
// Vectorized default add-assign kernel, enabled when UseVectorizedDefaultKernel
// holds for (VT1,VT2,MT1). Each y[j] is increased by the reduction over i of x
// and column j of A. Columns are processed in groups of 8, 4, 3 and 2, then a
// leftover column.
// NOTE(review): `j` and `x1` are declared on hidden lines (x1 presumably is
// x.load(i)); the accumulators are not explicitly initialized here, so the code
// presumably relies on IntrinsicType default-constructing to zero -- confirm.
824 template<
typename VT1
827 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
828 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
830 typedef IntrinsicTrait<ElementType> IT;
// M: rows of A (length of each reduction); N: columns of A (number of results).
832 const size_t M( A.rows() );
833 const size_t N( A.columns() );
// Eight columns per pass, one accumulator per column.
837 for( ; (j+8UL) <= N; j+=8UL ) {
838 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Rows advance by IT::size per step (presumably relies on padded storage when M
// is not a multiple of IT::size -- confirm).
839 for(
size_t i=0UL; i<M; i+=IT::size ) {
841 xmm1 = xmm1 + x1 * A.load(i,j );
842 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
843 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
844 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
845 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
846 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
847 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
848 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// sum() reduces each accumulator to one value that is added to the y element.
850 y[j ] +=
sum( xmm1 );
851 y[j+1UL] +=
sum( xmm2 );
852 y[j+2UL] +=
sum( xmm3 );
853 y[j+3UL] +=
sum( xmm4 );
854 y[j+4UL] +=
sum( xmm5 );
855 y[j+5UL] +=
sum( xmm6 );
856 y[j+6UL] +=
sum( xmm7 );
857 y[j+7UL] +=
sum( xmm8 );
// Four columns per pass.
859 for( ; (j+4UL) <= N; j+=4UL ) {
861 for(
size_t i=0UL; i<M; i+=IT::size ) {
863 xmm1 = xmm1 + x1 * A.load(i,j );
864 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
865 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
866 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
868 y[j ] +=
sum( xmm1 );
869 y[j+1UL] +=
sum( xmm2 );
870 y[j+2UL] +=
sum( xmm3 );
871 y[j+3UL] +=
sum( xmm4 );
// Three columns per pass.
873 for( ; (j+3UL) <= N; j+=3UL ) {
875 for(
size_t i=0UL; i<M; i+=IT::size ) {
877 xmm1 = xmm1 + x1 * A.load(i,j );
878 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
879 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
881 y[j ] +=
sum( xmm1 );
882 y[j+1UL] +=
sum( xmm2 );
883 y[j+2UL] +=
sum( xmm3 );
// Two columns per pass.
885 for( ; (j+2UL) <= N; j+=2UL ) {
887 for(
size_t i=0UL; i<M; i+=IT::size ) {
889 xmm1 = xmm1 + x1 * A.load(i,j );
890 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
892 y[j ] +=
sum( xmm1 );
893 y[j+1UL] +=
sum( xmm2 );
// Leftover single column j; the enclosing control statement and the update of
// y[j] are on hidden lines.
897 for(
size_t i=0UL; i<M; i+=IT::size ) {
898 xmm1 = xmm1 + A.load(i,j) * x.load(i);
// BLAS add-assign kernel overload selected when UseDefaultKernel holds for
// (VT1,VT2,MT1): no BLAS routine is used, the call is forwarded to the default
// add-assign kernel.
920 template<
typename VT1
923 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
924 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
926 selectDefaultAddAssignKernel( y, x, A );
// BLAS add-assign kernel for the single precision case (UseSinglePrecisionKernel).
// Calls cblas_sgemv with CblasColMajor/CblasTrans, alpha = 1 and beta = 1, which
// by the GEMV contract (y := alpha*op(A)*x + beta*y) adds A^T * x to y.
946 template<
typename VT1
949 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
950 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
952 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
958 const int M ( numeric_cast<int>( A.rows() ) );
959 const int N ( numeric_cast<int>( A.columns() ) );
960 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are passed with unit stride.
962 cblas_sgemv( CblasColMajor, CblasTrans, M, N, 1.0F,
963 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS add-assign kernel for the double precision case (UseDoublePrecisionKernel).
// Calls cblas_dgemv with CblasColMajor/CblasTrans, alpha = 1 and beta = 1, which
// by the GEMV contract (y := alpha*op(A)*x + beta*y) adds A^T * x to y.
984 template<
typename VT1
987 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
988 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
990 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
996 const int M ( numeric_cast<int>( A.rows() ) );
997 const int N ( numeric_cast<int>( A.columns() ) );
998 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are passed with unit stride.
1000 cblas_dgemv( CblasColMajor, CblasTrans, M, N, 1.0,
1001 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS add-assign kernel for complex<float> operands
// (UseSinglePrecisionComplexKernel). Calls cblas_cgemv with
// CblasColMajor/CblasTrans, alpha = (1,0) and beta = (1,0), which by the GEMV
// contract adds A^T * x to y.
1022 template<
typename VT1
1025 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1026 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1028 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
1037 const int M ( numeric_cast<int>( A.rows() ) );
1038 const int N ( numeric_cast<int>( A.columns() ) );
1039 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1040 const complex<float> alpha( 1.0F, 0.0F );
1041 const complex<float> beta ( 1.0F, 0.0F );
1043 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1044 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS add-assign kernel for complex<double> operands
// (UseDoublePrecisionComplexKernel). Calls cblas_zgemv with
// CblasColMajor/CblasTrans, alpha = (1,0) and beta = (1,0), which by the GEMV
// contract adds A^T * x to y.
1065 template<
typename VT1
1068 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1069 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1071 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
1080 const int M ( numeric_cast<int>( A.rows() ) );
1081 const int N ( numeric_cast<int>( A.columns() ) );
1082 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1083 const complex<double> alpha( 1.0, 0.0 );
1084 const complex<double> beta ( 1.0, 0.0 );
1086 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1087 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of the vector/matrix product to a target `lhs` (function
// header not shown in this excerpt). The operand evaluation into x and A happens
// on hidden lines.
1110 template<
typename VT1 >
// Special case: a matrix operand without rows or without columns (handling is on
// hidden lines).
1117 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Kernel dispatch: the default kernel is chosen when the matrix operand is a
// computation that is not evaluated, or when a further condition on a hidden line
// holds; the BLAS kernel is the alternative.
1129 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1131 TDVecTDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A );
1133 TDVecTDMatMultExpr::selectBlasSubAssignKernel( ~lhs, x, A );
// Non-vectorized default sub-assign kernel, selected when
// UseVectorizedDefaultKernel does not hold for (VT1,VT2,MT1): delegates to the
// target's own subAssign() with the product expression x * A.
1152 template<
typename VT1
1155 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1156 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1158 y.subAssign( x * A );
// Vectorized default sub-assign kernel, enabled when UseVectorizedDefaultKernel
// holds for (VT1,VT2,MT1). Each y[j] is decreased by the reduction over i of x
// and column j of A. Columns are processed in groups of 8, 4, 3 and 2, then a
// leftover column.
// NOTE(review): `j` and `x1` are declared on hidden lines (x1 presumably is
// x.load(i)); the accumulators are not explicitly initialized here, so the code
// presumably relies on IntrinsicType default-constructing to zero -- confirm.
1177 template<
typename VT1
1180 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1181 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1183 typedef IntrinsicTrait<ElementType> IT;
// M: rows of A (length of each reduction); N: columns of A (number of results).
1185 const size_t M( A.rows() );
1186 const size_t N( A.columns() );
// Eight columns per pass, one accumulator per column.
1190 for( ; (j+8UL) <= N; j+=8UL ) {
1191 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Rows advance by IT::size per step (presumably relies on padded storage when M
// is not a multiple of IT::size -- confirm).
1192 for(
size_t i=0UL; i<M; i+=IT::size ) {
1194 xmm1 = xmm1 + x1 * A.load(i,j );
1195 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1196 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1197 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1198 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1199 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1200 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1201 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// sum() reduces each accumulator to one value that is subtracted from the y
// element.
1203 y[j ] -=
sum( xmm1 );
1204 y[j+1UL] -=
sum( xmm2 );
1205 y[j+2UL] -=
sum( xmm3 );
1206 y[j+3UL] -=
sum( xmm4 );
1207 y[j+4UL] -=
sum( xmm5 );
1208 y[j+5UL] -=
sum( xmm6 );
1209 y[j+6UL] -=
sum( xmm7 );
1210 y[j+7UL] -=
sum( xmm8 );
// Four columns per pass.
1212 for( ; (j+4UL) <= N; j+=4UL ) {
1214 for(
size_t i=0UL; i<M; i+=IT::size ) {
1216 xmm1 = xmm1 + x1 * A.load(i,j );
1217 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1218 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1219 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1221 y[j ] -=
sum( xmm1 );
1222 y[j+1UL] -=
sum( xmm2 );
1223 y[j+2UL] -=
sum( xmm3 );
1224 y[j+3UL] -=
sum( xmm4 );
// Three columns per pass.
1226 for( ; (j+3UL) <= N; j+=3UL ) {
1228 for(
size_t i=0UL; i<M; i+=IT::size ) {
1230 xmm1 = xmm1 + x1 * A.load(i,j );
1231 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1232 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1234 y[j ] -=
sum( xmm1 );
1235 y[j+1UL] -=
sum( xmm2 );
1236 y[j+2UL] -=
sum( xmm3 );
// Two columns per pass.
1238 for( ; (j+2UL) <= N; j+=2UL ) {
1240 for(
size_t i=0UL; i<M; i+=IT::size ) {
1242 xmm1 = xmm1 + x1 * A.load(i,j );
1243 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1245 y[j ] -=
sum( xmm1 );
1246 y[j+1UL] -=
sum( xmm2 );
// Leftover single column j (the enclosing control statement is on a hidden line).
1250 for(
size_t i=0UL; i<M; i+=IT::size ) {
1251 xmm1 = xmm1 + A.load(i,j) * x.load(i);
1253 y[j] -=
sum( xmm1 );
// BLAS sub-assign kernel overload selected when UseDefaultKernel holds for
// (VT1,VT2,MT1): no BLAS routine is used, the call is forwarded to the default
// sub-assign kernel.
1273 template<
typename VT1
1276 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1277 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1279 selectDefaultSubAssignKernel( y, x, A );
// BLAS sub-assign kernel for the single precision case (UseSinglePrecisionKernel).
// Calls cblas_sgemv with CblasColMajor/CblasTrans, alpha = -1 and beta = 1, which
// by the GEMV contract (y := alpha*op(A)*x + beta*y) subtracts A^T * x from y.
1299 template<
typename VT1
1302 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1303 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1305 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
1311 const int M ( numeric_cast<int>( A.rows() ) );
1312 const int N ( numeric_cast<int>( A.columns() ) );
1313 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are passed with unit stride.
1315 cblas_sgemv( CblasColMajor, CblasTrans, M, N, -1.0F,
1316 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS sub-assign kernel for the double precision case (UseDoublePrecisionKernel).
// Calls cblas_dgemv with CblasColMajor/CblasTrans, alpha = -1 and beta = 1, which
// by the GEMV contract (y := alpha*op(A)*x + beta*y) subtracts A^T * x from y.
1337 template<
typename VT1
1340 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1341 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1343 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
1349 const int M ( numeric_cast<int>( A.rows() ) );
1350 const int N ( numeric_cast<int>( A.columns() ) );
1351 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are passed with unit stride.
1353 cblas_dgemv( CblasColMajor, CblasTrans, M, N, -1.0,
1354 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS sub-assign kernel for complex<float> operands
// (UseSinglePrecisionComplexKernel). Calls cblas_cgemv with
// CblasColMajor/CblasTrans, alpha = (-1,0) and beta = (1,0), which by the GEMV
// contract subtracts A^T * x from y.
1375 template<
typename VT1
1378 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1379 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1381 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
1390 const int M ( numeric_cast<int>( A.rows() ) );
1391 const int N ( numeric_cast<int>( A.columns() ) );
1392 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1393 const complex<float> alpha( -1.0F, 0.0F );
1394 const complex<float> beta ( 1.0F, 0.0F );
1396 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1397 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS sub-assign kernel for complex<double> operands
// (UseDoublePrecisionComplexKernel). Calls cblas_zgemv with
// CblasColMajor/CblasTrans, alpha = (-1,0) and beta = (1,0), which by the GEMV
// contract subtracts A^T * x from y.
1418 template<
typename VT1
1421 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1422 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1424 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
1433 const int M ( numeric_cast<int>( A.rows() ) );
1434 const int N ( numeric_cast<int>( A.columns() ) );
1435 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex scaling factors are handed to CBLAS by address.
1436 const complex<double> alpha( -1.0, 0.0 );
1437 const complex<double> beta ( 1.0, 0.0 );
1439 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1440 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1463 template<
typename VT1 >
// Class head of the DVecScalarMultExpr specialization for a scaled
// TDVecTDMatMultExpr<VT,MT> with scalar type ST (the line carrying the class
// name and the remaining template parameters are not shown in this excerpt).
// The class passes its own type to the DenseVector base and privately inherits
// from VecScalarMultExpr and Computation.
1512 template<
typename VT
1516 :
public DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true >
1517 ,
private VecScalarMultExpr
1518 ,
private Computation
// Shorthand for the wrapped vector/matrix multiplication expression type.
1522 typedef TDVecTDMatMultExpr<VT,MT> VMM;
// The vector operand is flagged for evaluation whenever it is a computation.
1534 enum { evaluateVector = IsComputation<VT>::value };
// The matrix operand is flagged for evaluation only if it is a non-vectorizable
// computation, VET and MET are the same type and that type is BLAS compatible
// (VET/MET are declared on hidden lines -- presumably the element types of the
// vector and matrix operands; confirm).
1539 enum { evaluateMatrix = IsComputation<MT>::value && !MT::vectorizable &&
1540 IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
// Compile-time predicate for the scaled expression: `value` is non-zero only if
// T1, T2 and T3 are all vectorizable, IsFloat<...>::value holds for each of their
// element types, and the scalar type T4 is not complex.
// It gates the selectBlas*Kernel overloads that call cblas_sgemv.
1548 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1549 struct UseSinglePrecisionKernel {
1550 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1551 IsFloat<typename T1::ElementType>::value &&
1552 IsFloat<typename T2::ElementType>::value &&
1553 IsFloat<typename T3::ElementType>::value &&
1554 !IsComplex<T4>::value };
// Compile-time predicate for the scaled expression: `value` is non-zero only if
// T1, T2 and T3 are all vectorizable, IsDouble<...>::value holds for each of
// their element types, and the scalar type T4 is not complex.
// It gates the selectBlas*Kernel overloads that call cblas_dgemv.
1563 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1564 struct UseDoublePrecisionKernel {
1565 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1566 IsDouble<typename T1::ElementType>::value &&
1567 IsDouble<typename T2::ElementType>::value &&
1568 IsDouble<typename T3::ElementType>::value &&
1569 !IsComplex<T4>::value };
// Compile-time predicate: `value` is non-zero only if T1, T2 and T3 are all
// vectorizable and each has exactly complex<float> as element type. The scalar
// type does not take part in this predicate.
1578 template<
typename T1,
typename T2,
typename T3 >
1579 struct UseSinglePrecisionComplexKernel {
// Element type all three operands must share.
1580 typedef complex<float> Type;
1581 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1582 IsSame<typename T1::ElementType,Type>::value &&
1583 IsSame<typename T2::ElementType,Type>::value &&
1584 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: `value` is non-zero only if T1, T2 and T3 are all
// vectorizable and each has exactly complex<double> as element type. The scalar
// type does not take part in this predicate.
1593 template<
typename T1,
typename T2,
typename T3 >
1594 struct UseDoublePrecisionComplexKernel {
// Element type all three operands must share.
1595 typedef complex<double> Type;
1596 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1597 IsSame<typename T1::ElementType,Type>::value &&
1598 IsSame<typename T2::ElementType,Type>::value &&
1599 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the fallback path of the scaled expression: `value`
// is non-zero if BLAZE_BLAS_MODE evaluates to false, or if none of the four BLAS
// predicates holds. Only the two real-valued predicates take the scalar type T4.
1607 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1608 struct UseDefaultKernel {
1609 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1610 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1611 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1612 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the hand-vectorized default kernels of the scaled
// expression: all three operand types must be vectorizable and share one element
// type, the scalar type T4 must be that same type, and IntrinsicTrait of the
// element type must report both `addition` and `multiplication`.
1621 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1622 struct UseVectorizedDefaultKernel {
1623 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1624 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1625 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1626 IsSame<typename T1::ElementType,T4>::value &&
1627 IntrinsicTrait<typename T1::ElementType>::addition &&
1628 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression specialization.
1634 typedef DVecScalarMultExpr<VMM,ST,true>
This;
// Result type: MultTrait applied to RES and the scalar type (RES is declared on
// a hidden line -- presumably the result type of the wrapped product; confirm).
1635 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (vector register) type associated with ElementType.
1638 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left-hand operand of the scaling is the wrapped product expression.
1643 typedef const TDVecTDMatMultExpr<VT,MT>
LeftOperand;
// Working type for the vector operand: `const VRT` when evaluateVector is set,
// otherwise VCT (both declared on hidden lines).
1649 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
LT;
// Working type for the matrix operand: `const MRT` when evaluateMatrix is set,
// otherwise MCT (both declared on hidden lines).
1652 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
RT;
// Compilation flags of the scaled expression: both `vectorizable` and
// `smpAssignable` are fixed to 0.
1657 enum { vectorizable = 0 };
1660 enum { smpAssignable = 0 };
// Constructor taking the wrapped vector/matrix product expression and the scalar
// factor. The initializer list and body are on hidden lines (presumably they
// store the arguments in vector_ and scalar_ -- confirm).
1669 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
// Element access (function header not shown -- presumably the subscript
// operator): the element of the wrapped product at `index`, times the scalar.
1683 return vector_[index] * scalar_;
// Returns the size of the wrapped product expression; scaling does not change it.
1692 inline size_t size()
const {
1693 return vector_.size();
// Forwards the canAlias() query for `alias` to the wrapped product expression.
1723 template<
typename T >
1724 inline bool canAlias(
const T* alias )
const {
1725 return vector_.canAlias( alias );
// Forwards the isAliased() query for `alias` to the wrapped product expression.
1735 template<
typename T >
1736 inline bool isAliased(
const T* alias )
const {
1737 return vector_.isAliased( alias );
// Assignment of the scaled vector/matrix product to a dense vector. The operand
// evaluation into x and A happens on hidden lines.
1759 template<
typename VT1
1761 friend inline void assign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the two operands of the wrapped product.
1767 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
1768 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special case: matrix operand without rows (handling is on hidden lines).
1770 if( right.rows() == 0UL ) {
// Special case: matrix operand without columns (handling is on hidden lines).
1774 else if( right.columns() == 0UL ) {
// Kernel dispatch: the default kernel is chosen when the matrix operand is a
// computation that is not evaluated, or when a further condition on a hidden line
// holds; the BLAS kernel is the alternative. The scalar is passed along.
1786 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1788 DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, x, A, rhs.scalar_ );
1790 DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Non-vectorized default assign kernel of the scaled expression, selected when
// UseVectorizedDefaultKernel does not hold for (VT1,VT2,MT1,ST2): delegates to
// the target's own assign() with the expression x * A * scalar.
1808 template<
typename VT1
1812 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1813 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1815 y.assign( x * A * scalar );
// Vectorized default assign kernel of the scaled expression, enabled when
// UseVectorizedDefaultKernel holds for (VT1,VT2,MT1,ST2). Each y[j] is set to
// `scalar` times the reduction over i of x and column j of A. Columns are
// processed in groups of 8, 4, 3 and 2, then a leftover column.
// NOTE(review): `j` and `x1` are declared on hidden lines (x1 presumably is
// x.load(i)); the accumulators are not explicitly initialized here, so the code
// presumably relies on IntrinsicType default-constructing to zero -- confirm.
1833 template<
typename VT1
1837 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1838 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1840 typedef IntrinsicTrait<ElementType> IT;
// M: rows of A (length of each reduction); N: columns of A (number of results).
1842 const size_t M( A.rows() );
1843 const size_t N( A.columns() );
// Eight columns per pass, one accumulator per column.
1847 for( ; (j+8UL) <= N; j+=8UL ) {
1848 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Rows advance by IT::size per step (presumably relies on padded storage when M
// is not a multiple of IT::size -- confirm).
1849 for(
size_t i=0UL; i<M; i+=IT::size ) {
1851 xmm1 = xmm1 + x1 * A.load(i,j );
1852 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1853 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1854 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1855 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1856 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1857 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1858 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// The scaling is applied once per result, after the horizontal sum.
1860 y[j ] =
sum( xmm1 ) * scalar;
1861 y[j+1UL] =
sum( xmm2 ) * scalar;
1862 y[j+2UL] =
sum( xmm3 ) * scalar;
1863 y[j+3UL] =
sum( xmm4 ) * scalar;
1864 y[j+4UL] =
sum( xmm5 ) * scalar;
1865 y[j+5UL] =
sum( xmm6 ) * scalar;
1866 y[j+6UL] =
sum( xmm7 ) * scalar;
1867 y[j+7UL] =
sum( xmm8 ) * scalar;
// Four columns per pass.
1869 for( ; (j+4UL) <= N; j+=4UL ) {
1871 for(
size_t i=0UL; i<M; i+=IT::size ) {
1873 xmm1 = xmm1 + x1 * A.load(i,j );
1874 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1875 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1876 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1878 y[j ] =
sum( xmm1 ) * scalar;
1879 y[j+1UL] =
sum( xmm2 ) * scalar;
1880 y[j+2UL] =
sum( xmm3 ) * scalar;
1881 y[j+3UL] =
sum( xmm4 ) * scalar;
// Three columns per pass.
1883 for( ; (j+3UL) <= N; j+=3UL ) {
1885 for(
size_t i=0UL; i<M; i+=IT::size ) {
1887 xmm1 = xmm1 + x1 * A.load(i,j );
1888 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1889 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1891 y[j ] =
sum( xmm1 ) * scalar;
1892 y[j+1UL] =
sum( xmm2 ) * scalar;
1893 y[j+2UL] =
sum( xmm3 ) * scalar;
// Two columns per pass.
1895 for( ; (j+2UL) <= N; j+=2UL ) {
1897 for(
size_t i=0UL; i<M; i+=IT::size ) {
1899 xmm1 = xmm1 + x1 * A.load(i,j );
1900 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1902 y[j ] =
sum( xmm1 ) * scalar;
1903 y[j+1UL] =
sum( xmm2 ) * scalar;
// Leftover single column j (the enclosing control statement is on a hidden line).
1907 for(
size_t i=0UL; i<M; i+=IT::size ) {
1908 xmm1 = xmm1 + A.load(i,j) * x.load(i);
1910 y[j] =
sum( xmm1 ) * scalar;
// BLAS assign kernel overload of the scaled expression selected when
// UseDefaultKernel holds for (VT1,VT2,MT1,ST2): no BLAS routine is used, the call
// is forwarded to the default assign kernel.
1928 template<
typename VT1
1932 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1933 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1935 selectDefaultAssignKernel( y, x, A, scalar );
// BLAS assign kernel of the scaled expression for the single precision case
// (UseSinglePrecisionKernel). Calls cblas_sgemv with CblasColMajor/CblasTrans,
// alpha = scalar and beta = 0, which by the GEMV contract
// (y := alpha*op(A)*x + beta*y) overwrites y with scalar * A^T * x.
1954 template<
typename VT1
1958 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
1959 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1961 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
1967 const int M ( numeric_cast<int>( A.rows() ) );
1968 const int N ( numeric_cast<int>( A.columns() ) );
1969 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are passed with unit stride.
1971 cblas_sgemv( CblasColMajor, CblasTrans, M, N, scalar,
1972 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// BLAS assign kernel of the scaled expression for the double precision case
// (UseDoublePrecisionKernel). Calls cblas_dgemv with CblasColMajor/CblasTrans,
// alpha = scalar and beta = 0, which by the GEMV contract
// (y := alpha*op(A)*x + beta*y) overwrites y with scalar * A^T * x.
1992 template<
typename VT1
1996 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
1997 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1999 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
2005 const int M ( numeric_cast<int>( A.rows() ) );
2006 const int N ( numeric_cast<int>( A.columns() ) );
2007 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are passed with unit stride.
2009 cblas_dgemv( CblasColMajor, CblasTrans, M, N, scalar,
2010 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// BLAS assign kernel of the scaled expression for complex<float> operands
// (UseSinglePrecisionComplexKernel). Calls cblas_cgemv with
// CblasColMajor/CblasTrans, alpha built from the scalar and beta = (0,0), which
// by the GEMV contract overwrites y with scalar * A^T * x.
2031 template<
typename VT1
2035 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2036 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2038 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
2047 const int M ( numeric_cast<int>( A.rows() ) );
2048 const int N ( numeric_cast<int>( A.columns() ) );
2049 const int lda( numeric_cast<int>( A.spacing() ) );
// The scalar is converted to complex<float>; both factors are handed to CBLAS by
// address.
2050 const complex<float> alpha( scalar );
2051 const complex<float> beta ( 0.0F, 0.0F );
2053 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2054 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS assign kernel of the scaled expression for complex<double> operands
// (UseDoublePrecisionComplexKernel). Calls cblas_zgemv with
// CblasColMajor/CblasTrans, alpha built from the scalar and beta = (0,0), which
// by the GEMV contract overwrites y with scalar * A^T * x.
2075 template<
typename VT1
2079 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2080 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2082 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
2091 const int M ( numeric_cast<int>( A.rows() ) );
2092 const int N ( numeric_cast<int>( A.columns() ) );
2093 const int lda( numeric_cast<int>( A.spacing() ) );
// The scalar is converted to complex<double>; both factors are handed to CBLAS
// by address.
2094 const complex<double> alpha( scalar );
2095 const complex<double> beta ( 0.0, 0.0 );
2097 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2098 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Assignment of the scaled vector/matrix product to a sparse vector. Only the
// signature is visible in this excerpt; the body is on hidden lines.
2115 template<
typename VT1
2117 friend inline void assign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Addition assignment of the scaled vector/matrix product to a dense vector. The
// operand evaluation into x and A happens on hidden lines.
2144 template<
typename VT1
2146 friend inline void addAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the two operands of the wrapped product.
2152 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2153 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special case: a matrix operand without rows or without columns (handling is on
// hidden lines).
2155 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Kernel dispatch: the default kernel is chosen when the matrix operand is a
// computation that is not evaluated, or when a further condition on a hidden line
// holds; the BLAS kernel is the alternative. The scalar is passed along.
2167 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2169 DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
2171 DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Non-vectorized default add-assign kernel of the scaled expression, selected
// when UseVectorizedDefaultKernel does not hold for (VT1,VT2,MT1,ST2): delegates
// to the target's own addAssign() with the expression x * A * scalar.
2189 template<
typename VT1
2193 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2194 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2196 y.addAssign( x * A * scalar );
// Vectorized default add-assign kernel of the scaled expression, enabled when
// UseVectorizedDefaultKernel holds for (VT1,VT2,MT1,ST2). Each y[j] is increased
// by `scalar` times the reduction over i of x and column j of A. Columns are
// processed in groups of 8, 4, 3 and 2, then a leftover column.
// NOTE(review): `j` and `x1` are declared on hidden lines (x1 presumably is
// x.load(i)); the accumulators are not explicitly initialized here, so the code
// presumably relies on IntrinsicType default-constructing to zero -- confirm.
2214 template<
typename VT1
2218 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2219 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2221 typedef IntrinsicTrait<ElementType> IT;
// M: rows of A (length of each reduction); N: columns of A (number of results).
2223 const size_t M( A.rows() );
2224 const size_t N( A.columns() );
// Eight columns per pass, one accumulator per column.
2228 for( ; (j+8UL) <= N; j+=8UL ) {
2229 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Rows advance by IT::size per step (presumably relies on padded storage when M
// is not a multiple of IT::size -- confirm).
2230 for(
size_t i=0UL; i<M; i+=IT::size ) {
2232 xmm1 = xmm1 + x1 * A.load(i,j );
2233 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2234 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2235 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2236 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
2237 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
2238 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
2239 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// The scaling is applied once per result, after the horizontal sum.
2241 y[j ] +=
sum( xmm1 ) * scalar;
2242 y[j+1UL] +=
sum( xmm2 ) * scalar;
2243 y[j+2UL] +=
sum( xmm3 ) * scalar;
2244 y[j+3UL] +=
sum( xmm4 ) * scalar;
2245 y[j+4UL] +=
sum( xmm5 ) * scalar;
2246 y[j+5UL] +=
sum( xmm6 ) * scalar;
2247 y[j+6UL] +=
sum( xmm7 ) * scalar;
2248 y[j+7UL] +=
sum( xmm8 ) * scalar;
// Four columns per pass.
2250 for( ; (j+4UL) <= N; j+=4UL ) {
2252 for(
size_t i=0UL; i<M; i+=IT::size ) {
2254 xmm1 = xmm1 + x1 * A.load(i,j );
2255 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2256 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2257 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2259 y[j ] +=
sum( xmm1 ) * scalar;
2260 y[j+1UL] +=
sum( xmm2 ) * scalar;
2261 y[j+2UL] +=
sum( xmm3 ) * scalar;
2262 y[j+3UL] +=
sum( xmm4 ) * scalar;
// Three columns per pass.
2264 for( ; (j+3UL) <= N; j+=3UL ) {
2266 for(
size_t i=0UL; i<M; i+=IT::size ) {
2268 xmm1 = xmm1 + x1 * A.load(i,j );
2269 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2270 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2272 y[j ] +=
sum( xmm1 ) * scalar;
2273 y[j+1UL] +=
sum( xmm2 ) * scalar;
2274 y[j+2UL] +=
sum( xmm3 ) * scalar;
// Two columns per pass.
2276 for( ; (j+2UL) <= N; j+=2UL ) {
2278 for(
size_t i=0UL; i<M; i+=IT::size ) {
2280 xmm1 = xmm1 + x1 * A.load(i,j );
2281 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2283 y[j ] +=
sum( xmm1 ) * scalar;
2284 y[j+1UL] +=
sum( xmm2 ) * scalar;
// Leftover single column j (the enclosing control statement is on a hidden line).
2288 for(
size_t i=0UL; i<M; i+=IT::size ) {
2289 xmm1 = xmm1 + A.load(i,j) * x.load(i);
2291 y[j] +=
sum( xmm1 ) * scalar;
// BLAS add-assign kernel overload of the scaled expression selected when
// UseDefaultKernel holds for (VT1,VT2,MT1,ST2): no BLAS routine is used, the call
// is forwarded to the default add-assign kernel.
2310 template<
typename VT1
2314 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2315 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2317 selectDefaultAddAssignKernel( y, x, A, scalar );
// BLAS add-assign kernel of the scaled expression for the single precision case
// (UseSinglePrecisionKernel). Calls cblas_sgemv with CblasColMajor/CblasTrans,
// alpha = scalar and beta = 1, which by the GEMV contract
// (y := alpha*op(A)*x + beta*y) adds scalar * A^T * x to y.
2336 template<
typename VT1
2340 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2341 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2343 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
2349 const int M ( numeric_cast<int>( A.rows() ) );
2350 const int N ( numeric_cast<int>( A.columns() ) );
2351 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are passed with unit stride.
2353 cblas_sgemv( CblasColMajor, CblasTrans, M, N, scalar,
2354 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS add-assign kernel of the scaled expression for the double precision case
// (UseDoublePrecisionKernel). Calls cblas_dgemv with CblasColMajor/CblasTrans,
// alpha = scalar and beta = 1, which by the GEMV contract
// (y := alpha*op(A)*x + beta*y) adds scalar * A^T * x to y.
2374 template<
typename VT1
2378 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2379 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2381 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
2387 const int M ( numeric_cast<int>( A.rows() ) );
2388 const int N ( numeric_cast<int>( A.columns() ) );
2389 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are passed with unit stride.
2391 cblas_dgemv( CblasColMajor, CblasTrans, M, N, scalar,
2392 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS add-assign kernel of the scaled expression for complex<float> operands
// (UseSinglePrecisionComplexKernel). Calls cblas_cgemv with
// CblasColMajor/CblasTrans, alpha built from the scalar and beta = (1,0), which
// by the GEMV contract adds scalar * A^T * x to y.
2413 template<
typename VT1
2417 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2418 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2420 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
2429 const int M ( numeric_cast<int>( A.rows() ) );
2430 const int N ( numeric_cast<int>( A.columns() ) );
2431 const int lda( numeric_cast<int>( A.spacing() ) );
// The scalar is converted to complex<float>; both factors are handed to CBLAS by
// address.
2432 const complex<float> alpha( scalar );
2433 const complex<float> beta ( 1.0F, 0.0F );
2435 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2436 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS add-assign kernel of the scaled expression for complex<double> operands
// (UseDoublePrecisionComplexKernel). Calls cblas_zgemv with
// CblasColMajor/CblasTrans, alpha built from the scalar and beta = (1,0), which
// by the GEMV contract adds scalar * A^T * x to y.
2457 template<
typename VT1
2461 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2462 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2464 using boost::numeric_cast;
// Range-checked conversion of the dimensions and of A.spacing() (passed as the
// leading dimension) to the int arguments expected by CBLAS.
2473 const int M ( numeric_cast<int>( A.rows() ) );
2474 const int N ( numeric_cast<int>( A.columns() ) );
2475 const int lda( numeric_cast<int>( A.spacing() ) );
// The scalar is converted to complex<double>; both factors are handed to CBLAS
// by address.
2476 const complex<double> alpha( scalar );
2477 const complex<double> beta ( 1.0, 0.0 );
2479 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2480 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of the scaled vector/matrix product to a dense vector.
// The operand evaluation into x and A happens on hidden lines.
2501 template<
typename VT1
2503 friend inline void subAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the two operands of the wrapped product.
2509 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2510 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special case: a matrix operand without rows or without columns (handling is on
// hidden lines).
2512 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Kernel dispatch: the default kernel is chosen when the matrix operand is a
// computation that is not evaluated, or when a further condition on a hidden line
// holds; the BLAS kernel is the alternative. The scalar is passed along.
2524 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2526 DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
2528 DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Non-vectorized default sub-assign kernel of the scaled expression, selected
// when UseVectorizedDefaultKernel does not hold for (VT1,VT2,MT1,ST2): delegates
// to the target's own subAssign() with the expression x * A * scalar.
2546 template<
typename VT1
2550 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2551 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2553 y.subAssign( x * A * scalar );
// Vectorized default subtraction assignment kernel, selected (via EnableIf)
// when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds.
// The columns of A are processed in groups of 8, 4, 3 and 2, followed by a final
// single column. For each column j an intrinsic accumulator is built up over
// the rows, reduced with sum(), multiplied by scalar and subtracted from y[j].
// NOTE(review): some lines are missing from this excerpt, among them the
// declaration of j, the definition of x1 (presumably x.load(i)) and the
// accumulator declarations of the later loops.
2571 template<
typename VT1
2575 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2576 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2578 typedef IntrinsicTrait<ElementType> IT;
2580 const size_t M( A.rows() );
2581 const size_t N( A.columns() );
// Pass 1: eight columns at a time.
2585 for( ; (j+8UL) <= N; j+=8UL ) {
2586 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// NOTE(review): i advances in steps of IT::size up to M with no scalar tail
// loop; presumably this relies on padded storage - confirm.
2587 for(
size_t i=0UL; i<M; i+=IT::size ) {
2589 xmm1 = xmm1 + x1 * A.load(i,j );
2590 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2591 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2592 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2593 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
2594 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
2595 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
2596 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator, scaling, and update of y.
2598 y[j ] -=
sum( xmm1 ) * scalar;
2599 y[j+1UL] -=
sum( xmm2 ) * scalar;
2600 y[j+2UL] -=
sum( xmm3 ) * scalar;
2601 y[j+3UL] -=
sum( xmm4 ) * scalar;
2602 y[j+4UL] -=
sum( xmm5 ) * scalar;
2603 y[j+5UL] -=
sum( xmm6 ) * scalar;
2604 y[j+6UL] -=
sum( xmm7 ) * scalar;
2605 y[j+7UL] -=
sum( xmm8 ) * scalar;
// Pass 2: four columns at a time.
2607 for( ; (j+4UL) <= N; j+=4UL ) {
2609 for(
size_t i=0UL; i<M; i+=IT::size ) {
2611 xmm1 = xmm1 + x1 * A.load(i,j );
2612 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2613 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2614 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2616 y[j ] -=
sum( xmm1 ) * scalar;
2617 y[j+1UL] -=
sum( xmm2 ) * scalar;
2618 y[j+2UL] -=
sum( xmm3 ) * scalar;
2619 y[j+3UL] -=
sum( xmm4 ) * scalar;
// Pass 3: three columns at a time.
2621 for( ; (j+3UL) <= N; j+=3UL ) {
2623 for(
size_t i=0UL; i<M; i+=IT::size ) {
2625 xmm1 = xmm1 + x1 * A.load(i,j );
2626 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2627 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2629 y[j ] -=
sum( xmm1 ) * scalar;
2630 y[j+1UL] -=
sum( xmm2 ) * scalar;
2631 y[j+2UL] -=
sum( xmm3 ) * scalar;
// Pass 4: two columns at a time.
2633 for( ; (j+2UL) <= N; j+=2UL ) {
2635 for(
size_t i=0UL; i<M; i+=IT::size ) {
2637 xmm1 = xmm1 + x1 * A.load(i,j );
2638 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2640 y[j ] -=
sum( xmm1 ) * scalar;
2641 y[j+1UL] -=
sum( xmm2 ) * scalar;
// Final column: after the passes above at most one column can remain. The
// guarding condition is not visible in this excerpt.
2645 for(
size_t i=0UL; i<M; i+=IT::size ) {
2646 xmm1 = xmm1 + A.load(i,j) * x.load(i);
2648 y[j] -=
sum( xmm1 ) * scalar;
// Overload of the BLAS kernel selection that is enabled when
// UseDefaultKernel<VT1,VT2,MT1,ST2> holds: no BLAS routine is called.
2668 template<
typename VT1
2672 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2673 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Forward all arguments unchanged to the default subtraction kernel.
2675 selectDefaultSubAssignKernel( y, x, A, scalar );
// BLAS-based subtraction assignment kernel for single precision operands
// (enabled through UseSinglePrecisionKernel<VT1,VT2,MT1,ST2>).
// Effect of the cblas_sgemv call below: y -= scalar * ( x * A ).
2694 template<
typename VT1
2698 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2699 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2701 using boost::numeric_cast;
// The matrix dimensions and spacing are converted to int via numeric_cast
// before being handed to cblas_sgemv.
2707 const int M ( numeric_cast<int>( A.rows() ) );
2708 const int N ( numeric_cast<int>( A.columns() ) );
2709 const int lda( numeric_cast<int>( A.spacing() ) );
// The subtraction is expressed through a negated alpha (-scalar) combined with
// beta = 1, which keeps the current contents of y.
2711 cblas_sgemv( CblasColMajor, CblasTrans, M, N, -scalar,
2712 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS-based subtraction assignment kernel for double precision operands
// (enabled through UseDoublePrecisionKernel<VT1,VT2,MT1,ST2>).
// Effect of the cblas_dgemv call below: y -= scalar * ( x * A ).
2732 template<
typename VT1
2736 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2737 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2739 using boost::numeric_cast;
// The matrix dimensions and spacing are converted to int via numeric_cast
// before being handed to cblas_dgemv.
2745 const int M ( numeric_cast<int>( A.rows() ) );
2746 const int N ( numeric_cast<int>( A.columns() ) );
2747 const int lda( numeric_cast<int>( A.spacing() ) );
// The subtraction is expressed through a negated alpha (-scalar) combined with
// beta = 1, which keeps the current contents of y.
2749 cblas_dgemv( CblasColMajor, CblasTrans, M, N, -scalar,
2750 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS-based subtraction assignment kernel for single precision complex
// operands (enabled through UseSinglePrecisionComplexKernel<VT1,VT2,MT1>).
// Effect of the cblas_cgemv call below: y -= scalar * ( x * A ).
2772 template<
typename VT1
2776 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2777 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2779 using boost::numeric_cast;
// The matrix dimensions and spacing are converted to int via numeric_cast
// before being handed to cblas_cgemv.
2788 const int M ( numeric_cast<int>( A.rows() ) );
2789 const int N ( numeric_cast<int>( A.columns() ) );
2790 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha carries the negated scaling factor and beta = 1 keeps the current
// contents of y; both are passed to cblas_cgemv by address.
2791 const complex<float> alpha( -scalar );
2792 const complex<float> beta ( 1.0F, 0.0F );
2794 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2795 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS-based subtraction assignment kernel for double precision complex
// operands (enabled through UseDoublePrecisionComplexKernel<VT1,VT2,MT1>).
// Effect of the cblas_zgemv call below: y -= scalar * ( x * A ).
2817 template<
typename VT1
2821 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2822 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2824 using boost::numeric_cast;
// The matrix dimensions and spacing are converted to int via numeric_cast
// before being handed to cblas_zgemv.
2833 const int M ( numeric_cast<int>( A.rows() ) );
2834 const int N ( numeric_cast<int>( A.columns() ) );
2835 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha carries the negated scaling factor and beta = 1 keeps the current
// contents of y; both are passed to cblas_zgemv by address.
2836 const complex<double> alpha( -scalar );
2837 const complex<double> beta ( 1.0, 0.0 );
2839 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2840 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Multiplication assignment of a scaled vector-matrix multiplication
// expression (rhs) to the dense vector lhs.
// NOTE(review): only the signature is visible in this excerpt; the function
// body is not shown.
2861 template<
typename VT1
2863 friend inline void multAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Function returning a TDVecTDMatMultExpr<T1,T2>; it is disabled (DisableIf)
// when T2 is a matrix/matrix multiplication expression.
// NOTE(review): the line with the function name and parameter list is missing
// from this excerpt - presumably the vector/matrix multiplication operator.
2936 template<
typename T1
2938 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecTDMatMultExpr<T1,T2> >::Type
// The vector size must equal the number of matrix rows; otherwise
// std::invalid_argument is thrown.
2943 if( (~vec).
size() != (~mat).
rows() )
2944 throw std::invalid_argument(
"Vector and matrix sizes do not match" );
// Fragment of a trait specialization (its struct header is not visible in this
// excerpt): Type is the multiplication expression type of VT and the submatrix
// expression type of const MT.
2961 template<
typename VT,
typename MT >
2966 typedef typename MultExprTrait< VT, typename SubmatrixExprTrait<const MT>::Type >::Type Type;
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:237
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:131
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
const size_t end_
End of the unrolled calculation loop.
Definition: TDVecTDMatMultExpr.h:358
LeftOperand leftOperand() const
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:315
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:325
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4512
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A=B*C$ ).
Definition: DMatDMatMultExpr.h:3703
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:196
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
int16_t sum(const sse_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:62
Header file for the IsSame and IsStrictlySame type traits.
const size_t TDVECTDMATMULT_THRESHOLD
Dense Vector/column-major dense matrix multiplication threshold. This setting specifies the threshold ...
Definition: Thresholds.h:102
Constraint on the data type.
MT::CompositeType MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:114
MRT::ElementType MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:112
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2375
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:248
SelectType< evaluateMatrix, const MRT, MCT >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:246
Header file for the DenseVector base class.
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:240
Header file for the RequiresEvaluation type trait.
Header file for the VecScalarMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Constraint on the data type.
Header file for the MultExprTrait class template.
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:250
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:337
ResultType::ElementType ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:231
Header file for the IsMatMatMultExpr type trait class.
Header file for the IsBlasCompatible type trait.
TDVecTDMatMultExpr< VT, MT > This
Type of this TDVecTDMatMultExpr instance.
Definition: TDVecTDMatMultExpr.h:228
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
VT::CompositeType VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:113
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDVecTDMatMultExpr.h:232
SelectType< evaluateVector, const VRT, VCT >::Type LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:243
Constraints on the storage order of matrix types.
Constraint on the data type.
MT::ResultType MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:110
VT::ResultType VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:109
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2373
void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:269
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:278
Header file for the EnableIf class template.
Header file for the IsNumeric type trait.
Header file for the SubmatrixExprTrait class template.
System settings for the BLAS mode.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:648
Header file for run time assertion macros.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:356
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:234
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to matrix.
Definition: Matrix.h:239
Header file for the TVecMatMultExpr base class.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:233
VRT::ElementType VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:111
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:230
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:357
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:349
Header file for the IsComputation type trait class.
MultTrait< VRT, MRT >::Type ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:229
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:247
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2370
TDVecTDMatMultExpr(const VT &vec, const MT &mat)
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:263
Header file for basic type definitions.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a row dense or sparse vector type (i...
Definition: TransposeFlag.h:81
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
Constraint on the data type.
Size type of the Blaze library.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
size_t rows(const Matrix< MT, SO > &m)
Returns the current number of rows of the matrix.
Definition: Matrix.h:138
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
size_t size() const
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:305
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.