35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
44 #include <boost/cast.hpp>
// Expression class template TDVecTDMatMultExpr. Going by the name and the base
// classes, it represents a (transpose) dense vector times dense matrix product.
// NOTE(review): MT is used in the base-class list below, but its template
// parameter declaration is not visible in this excerpt - presumably it sits on
// an omitted line.
102 template<
typename VT
104 class TDVecTDMatMultExpr :
// CRTP-style base: the expression type itself is passed to DenseVector.
public DenseVector< TDVecTDMatMultExpr<VT,MT>, true >
105 ,
private TVecMatMultExpr
106 ,
private Computation
// Compile-time switch that appears inside EnableIf<> on the friend function
// templates further down in this class. The nested value is nonzero when
// evaluateVector or evaluateMatrix is set.
// Note that the template parameter T1 does not take part in the condition.
135 template<
typename T1 >
136 struct UseSMPAssign {
137 enum { value = ( evaluateVector || evaluateMatrix ) };
// Selects the single precision kernel: value is nonzero only if all three
// types are vectorizable and each of their element types satisfies IsFloat.
148 template<
typename T1,
typename T2,
typename T3 >
149 struct UseSinglePrecisionKernel {
150 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
151 IsFloat<typename T1::ElementType>::value &&
152 IsFloat<typename T2::ElementType>::value &&
153 IsFloat<typename T3::ElementType>::value };
// Selects the double precision kernel: value is nonzero only if all three
// types are vectorizable and each of their element types satisfies IsDouble.
164 template<
typename T1,
typename T2,
typename T3 >
165 struct UseDoublePrecisionKernel {
166 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
167 IsDouble<typename T1::ElementType>::value &&
168 IsDouble<typename T2::ElementType>::value &&
169 IsDouble<typename T3::ElementType>::value };
// Selects the single precision complex kernel: value is nonzero only if all
// three types are vectorizable and each element type is exactly complex<float>.
180 template<
typename T1,
typename T2,
typename T3 >
181 struct UseSinglePrecisionComplexKernel {
// Element type every operand has to match.
182 typedef complex<float> Type;
183 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
184 IsSame<typename T1::ElementType,Type>::value &&
185 IsSame<typename T2::ElementType,Type>::value &&
186 IsSame<typename T3::ElementType,Type>::value };
// Selects the double precision complex kernel: value is nonzero only if all
// three types are vectorizable and each element type is exactly complex<double>.
197 template<
typename T1,
typename T2,
typename T3 >
198 struct UseDoublePrecisionComplexKernel {
// Element type every operand has to match.
199 typedef complex<double> Type;
200 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
201 IsSame<typename T1::ElementType,Type>::value &&
202 IsSame<typename T2::ElementType,Type>::value &&
203 IsSame<typename T3::ElementType,Type>::value };
// Selects the non-BLAS fallback: value is nonzero when BLAZE_BLAS_MODE is off,
// or when none of the four precision-specific kernel selectors applies to the
// given type triple.
213 template<
typename T1,
typename T2,
typename T3 >
214 struct UseDefaultKernel {
215 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
216 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
217 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
218 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Selects the vectorized variant of the default kernel: all three types must
// be vectorizable, share the same element type, and IntrinsicTrait of that
// element type must report both addition and multiplication.
229 template<
typename T1,
typename T2,
typename T3 >
230 struct UseVectorizedDefaultKernel {
231 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
232 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
233 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
234 IntrinsicTrait<typename T1::ElementType>::addition &&
235 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compile-time flags of the expression.
// vectorizable requires both operand types to be vectorizable; the remaining
// conditions of this enum are not visible in this excerpt.
265 enum { vectorizable = VT::vectorizable && MT::vectorizable &&
// smpAssignable: both operand types are smpAssignable and neither
// evaluateVector nor evaluateMatrix is set.
271 enum { smpAssignable = !evaluateVector && VT::smpAssignable &&
272 !evaluateMatrix && MT::smpAssignable };
// Fragments of what looks like the element access operator followed by the
// size accessor (signatures are not visible here - confirm against the full file).
// Visible logic: work is only done for a matrix with at least one row ...
301 if(
mat_.rows() != 0UL ) {
// ... a loop that starts at index 1 and advances two indices per iteration up to end_ ...
303 for(
size_t j=1UL; j<
end_; j+=2UL ) {
// ... and a tail branch taken when end_ is smaller than the number of rows.
306 if( end_ < mat_.rows() ) {
// The number of matrix columns is returned (presumably the size of the resulting vector).
324 return mat_.columns();
// Fragments of the alias and alignment queries; the function signatures are
// not visible in this excerpt.
// First query: true if either operand reports being aliased with the given address.
354 template<
typename T >
356 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Second query: same test on both operands.
366 template<
typename T >
368 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Alignment query: both operands have to report being aligned.
378 return vec_.isAligned() &&
mat_.isAligned();
// Fragment of an assignment function for this expression (signature not
// visible; presumably assign to a dense vector - confirm).
415 template<
typename VT1 >
// Degenerate shapes are handled separately: first a matrix without rows ...
422 if( rhs.mat_.rows() == 0UL ) {
// ... then a matrix without columns. The bodies of both branches are not visible here.
426 else if( rhs.mat_.columns() == 0UL ) {
// Regular case: hand the operands x and A (defined on lines not visible here)
// to the kernel selection.
438 TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
// Kernel selection for the assignment y = x * A.
// Dispatches either to the default kernel or to the BLAS kernel; the condition
// that chooses between the two calls is not visible in this excerpt.
// y: target vector, x: left-hand side vector operand, A: right-hand side matrix operand.
454 template<
typename VT1
457 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
461 TDVecTDMatMultExpr::selectDefaultAssignKernel( y, x, A );
463 TDVecTDMatMultExpr::selectBlasAssignKernel( y, x, A );
// Default assignment kernel, non-vectorized overload: enabled (via DisableIf)
// only when UseVectorizedDefaultKernel does NOT apply to the given types.
// The function body is not visible in this excerpt.
// y: target vector, x: left-hand side vector operand, A: right-hand side matrix operand.
482 template<
typename VT1
485 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
486 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Default assignment kernel, vectorized overload: enabled only when
// UseVectorizedDefaultKernel applies to the given types.
// Each y[j] is computed as the reduction (sum) of a SIMD accumulator that
// collects products of packed x values and packed values of column j of A.
// Columns are processed in groups of 8, 4, 3 and 2, followed by a remainder part.
// NOTE(review): the declarations of j and x1, the accumulators of the smaller
// groups and the stores to y[j] are on lines not visible in this excerpt;
// x1 is presumably x.load(i) - confirm.
// NOTE(review): the inner loops advance i by IT::size while i<M, so the last
// load may reach past row M-1 unless the storage is padded - confirm.
507 template<
typename VT1
510 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
511 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
513 typedef IntrinsicTrait<ElementType> IT;
515 const size_t M( A.rows() );
516 const size_t N( A.columns() );
// Eight columns per pass.
520 for( ; (j+8UL) <= N; j+=8UL ) {
521 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
522 for(
size_t i=0UL; i<M; i+=IT::size ) {
524 xmm1 = xmm1 + x1 * A.load(i,j );
525 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
526 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
527 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
528 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
529 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
530 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
531 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator into its target element.
534 y[j+1UL] =
sum( xmm2 );
535 y[j+2UL] =
sum( xmm3 );
536 y[j+3UL] =
sum( xmm4 );
537 y[j+4UL] =
sum( xmm5 );
538 y[j+5UL] =
sum( xmm6 );
539 y[j+6UL] =
sum( xmm7 );
540 y[j+7UL] =
sum( xmm8 );
// Four columns per pass.
542 for( ; (j+4UL) <= N; j+=4UL ) {
544 for(
size_t i=0UL; i<M; i+=IT::size ) {
546 xmm1 = xmm1 + x1 * A.load(i,j );
547 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
548 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
549 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
552 y[j+1UL] =
sum( xmm2 );
553 y[j+2UL] =
sum( xmm3 );
554 y[j+3UL] =
sum( xmm4 );
// Three columns per pass.
556 for( ; (j+3UL) <= N; j+=3UL ) {
558 for(
size_t i=0UL; i<M; i+=IT::size ) {
560 xmm1 = xmm1 + x1 * A.load(i,j );
561 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
562 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
565 y[j+1UL] =
sum( xmm2 );
566 y[j+2UL] =
sum( xmm3 );
// Two columns per pass.
568 for( ; (j+2UL) <= N; j+=2UL ) {
570 for(
size_t i=0UL; i<M; i+=IT::size ) {
572 xmm1 = xmm1 + x1 * A.load(i,j );
573 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
576 y[j+1UL] =
sum( xmm2 );
// Remainder: a single column, x is loaded directly inside the product.
580 for(
size_t i=0UL; i<M; i+=IT::size ) {
581 xmm1 = xmm1 + A.load(i,j) * x.load(i);
// BLAS assignment kernel, fallback overload: enabled when UseDefaultKernel
// applies (BLAS mode off or no matching BLAS routine). Simply forwards to the
// default assignment kernel.
603 template<
typename VT1
606 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
607 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
609 selectDefaultAssignKernel( y, x, A );
// BLAS assignment kernel for single precision operands (enabled via
// UseSinglePrecisionKernel). Computes y = x * A through cblas_sgemv.
629 template<
typename VT1
632 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
633 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
635 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
641 const int M ( numeric_cast<int>( A.rows() ) );
642 const int N ( numeric_cast<int>( A.columns() ) );
// Leading dimension of A is taken from its spacing().
643 const int lda( numeric_cast<int>( A.spacing() ) );
// Column-major A with CblasTrans, alpha = 1 and beta = 0: y is overwritten
// with the product of the transposed matrix and x.
645 cblas_sgemv( CblasColMajor, CblasTrans, M, N, 1.0F,
646 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// BLAS assignment kernel for double precision operands (enabled via
// UseDoublePrecisionKernel). Computes y = x * A through cblas_dgemv.
667 template<
typename VT1
670 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
671 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
673 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
679 const int M ( numeric_cast<int>( A.rows() ) );
680 const int N ( numeric_cast<int>( A.columns() ) );
// Leading dimension of A is taken from its spacing().
681 const int lda( numeric_cast<int>( A.spacing() ) );
// Column-major A with CblasTrans, alpha = 1 and beta = 0: y is overwritten
// with the product of the transposed matrix and x.
683 cblas_dgemv( CblasColMajor, CblasTrans, M, N, 1.0,
684 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// BLAS assignment kernel for complex<float> operands (enabled via
// UseSinglePrecisionComplexKernel). Computes y = x * A through cblas_cgemv.
705 template<
typename VT1
708 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
709 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
711 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
720 const int M ( numeric_cast<int>( A.rows() ) );
721 const int N ( numeric_cast<int>( A.columns() ) );
722 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex routine receives its scaling factors by address:
// alpha = 1 (scale of the product), beta = 0 (y is overwritten).
723 const complex<float> alpha( 1.0F, 0.0F );
724 const complex<float> beta ( 0.0F, 0.0F );
726 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
727 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS assignment kernel for complex<double> operands (enabled via
// UseDoublePrecisionComplexKernel). Computes y = x * A through cblas_zgemv.
748 template<
typename VT1
751 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
752 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
754 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
763 const int M ( numeric_cast<int>( A.rows() ) );
764 const int N ( numeric_cast<int>( A.columns() ) );
765 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex routine receives its scaling factors by address:
// alpha = 1 (scale of the product), beta = 0 (y is overwritten).
766 const complex<double> alpha( 1.0, 0.0 );
767 const complex<double> beta ( 0.0, 0.0 );
769 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
770 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// NOTE(review): the first template header belongs to a function whose
// signature and body are not visible in this excerpt.
789 template<
typename VT1 >
// Fragment of the addition assignment for this expression (signature not
// visible; identified by the selectAddAssignKernel call below).
819 template<
typename VT1 >
// Empty matrix in either dimension is special-cased; the branch body is not visible here.
826 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Regular case: hand the operands x and A (defined on lines not visible here)
// to the kernel selection.
838 TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
// Kernel selection for the addition assignment y += x * A.
// The default kernel is chosen when the matrix operand is a computation that
// is not evaluated up front, or when a second condition holds that is not
// visible in this excerpt; otherwise the BLAS kernel is used (the else line
// itself is not visible either).
854 template<
typename VT1
857 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
859 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
861 TDVecTDMatMultExpr::selectDefaultAddAssignKernel( y, x, A );
863 TDVecTDMatMultExpr::selectBlasAddAssignKernel( y, x, A );
// Default addition assignment kernel, non-vectorized overload: enabled (via
// DisableIf) only when UseVectorizedDefaultKernel does NOT apply.
// Delegates to the addAssign member of the target with the product expression x * A.
882 template<
typename VT1
885 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
886 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
888 y.addAssign( x * A );
// Default addition assignment kernel, vectorized overload: enabled only when
// UseVectorizedDefaultKernel applies to the given types.
// For each column j of A a SIMD accumulator collects products of packed x
// values and packed column values; its reduction (sum) is added to y[j].
// Columns are processed in groups of 8, 4, 3 and 2, followed by a remainder part.
// NOTE(review): the declarations of j and x1 and the accumulators of the
// smaller groups are on lines not visible in this excerpt; x1 is presumably
// x.load(i) - confirm.
// NOTE(review): the inner loops advance i by IT::size while i<M, so the last
// load may reach past row M-1 unless the storage is padded - confirm.
907 template<
typename VT1
910 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
911 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
913 typedef IntrinsicTrait<ElementType> IT;
915 const size_t M( A.rows() );
916 const size_t N( A.columns() );
// Eight columns per pass.
920 for( ; (j+8UL) <= N; j+=8UL ) {
921 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
922 for(
size_t i=0UL; i<M; i+=IT::size ) {
924 xmm1 = xmm1 + x1 * A.load(i,j );
925 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
926 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
927 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
928 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
929 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
930 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
931 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Reduce each accumulator and add it onto the existing target element.
933 y[j ] +=
sum( xmm1 );
934 y[j+1UL] +=
sum( xmm2 );
935 y[j+2UL] +=
sum( xmm3 );
936 y[j+3UL] +=
sum( xmm4 );
937 y[j+4UL] +=
sum( xmm5 );
938 y[j+5UL] +=
sum( xmm6 );
939 y[j+6UL] +=
sum( xmm7 );
940 y[j+7UL] +=
sum( xmm8 );
// Four columns per pass.
942 for( ; (j+4UL) <= N; j+=4UL ) {
944 for(
size_t i=0UL; i<M; i+=IT::size ) {
946 xmm1 = xmm1 + x1 * A.load(i,j );
947 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
948 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
949 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
951 y[j ] +=
sum( xmm1 );
952 y[j+1UL] +=
sum( xmm2 );
953 y[j+2UL] +=
sum( xmm3 );
954 y[j+3UL] +=
sum( xmm4 );
// Three columns per pass.
956 for( ; (j+3UL) <= N; j+=3UL ) {
958 for(
size_t i=0UL; i<M; i+=IT::size ) {
960 xmm1 = xmm1 + x1 * A.load(i,j );
961 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
962 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
964 y[j ] +=
sum( xmm1 );
965 y[j+1UL] +=
sum( xmm2 );
966 y[j+2UL] +=
sum( xmm3 );
// Two columns per pass.
968 for( ; (j+2UL) <= N; j+=2UL ) {
970 for(
size_t i=0UL; i<M; i+=IT::size ) {
972 xmm1 = xmm1 + x1 * A.load(i,j );
973 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
975 y[j ] +=
sum( xmm1 );
976 y[j+1UL] +=
sum( xmm2 );
// Remainder: a single column, x is loaded directly inside the product.
980 for(
size_t i=0UL; i<M; i+=IT::size ) {
981 xmm1 = xmm1 + A.load(i,j) * x.load(i);
// BLAS addition assignment kernel, fallback overload: enabled when
// UseDefaultKernel applies. Simply forwards to the default addition
// assignment kernel.
1003 template<
typename VT1
1006 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1007 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1009 selectDefaultAddAssignKernel( y, x, A );
// BLAS addition assignment kernel for single precision operands (enabled via
// UseSinglePrecisionKernel). Computes y += x * A through cblas_sgemv.
1029 template<
typename VT1
1032 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1033 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1035 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
1041 const int M ( numeric_cast<int>( A.rows() ) );
1042 const int N ( numeric_cast<int>( A.columns() ) );
1043 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = 1 and beta = 1: the product is accumulated onto the existing y.
1045 cblas_sgemv( CblasColMajor, CblasTrans, M, N, 1.0F,
1046 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS addition assignment kernel for double precision operands (enabled via
// UseDoublePrecisionKernel). Computes y += x * A through cblas_dgemv.
1067 template<
typename VT1
1070 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1071 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1073 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
1079 const int M ( numeric_cast<int>( A.rows() ) );
1080 const int N ( numeric_cast<int>( A.columns() ) );
1081 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = 1 and beta = 1: the product is accumulated onto the existing y.
1083 cblas_dgemv( CblasColMajor, CblasTrans, M, N, 1.0,
1084 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS addition assignment kernel for complex<float> operands (enabled via
// UseSinglePrecisionComplexKernel). Computes y += x * A through cblas_cgemv.
1105 template<
typename VT1
1108 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1109 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1111 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
1120 const int M ( numeric_cast<int>( A.rows() ) );
1121 const int N ( numeric_cast<int>( A.columns() ) );
1122 const int lda( numeric_cast<int>( A.spacing() ) );
// Scaling factors are passed by address: alpha = 1, beta = 1 (accumulate onto y).
1123 const complex<float> alpha( 1.0F, 0.0F );
1124 const complex<float> beta ( 1.0F, 0.0F );
1126 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1127 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS addition assignment kernel for complex<double> operands (enabled via
// UseDoublePrecisionComplexKernel). Computes y += x * A through cblas_zgemv.
1148 template<
typename VT1
1151 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1152 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1154 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
1163 const int M ( numeric_cast<int>( A.rows() ) );
1164 const int N ( numeric_cast<int>( A.columns() ) );
1165 const int lda( numeric_cast<int>( A.spacing() ) );
// Scaling factors are passed by address: alpha = 1, beta = 1 (accumulate onto y).
1166 const complex<double> alpha( 1.0, 0.0 );
1167 const complex<double> beta ( 1.0, 0.0 );
1169 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1170 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Fragment of the subtraction assignment for this expression (signature not
// visible; identified by the selectSubAssignKernel call below).
1193 template<
typename VT1 >
// Empty matrix in either dimension is special-cased; the branch body is not visible here.
1200 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Regular case: hand the operands x and A (defined on lines not visible here)
// to the kernel selection.
1212 TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
// Kernel selection for the subtraction assignment y -= x * A.
// The default kernel is chosen when the matrix operand is a computation that
// is not evaluated up front, or when a second condition holds that is not
// visible in this excerpt; otherwise the BLAS kernel is used (the else line
// itself is not visible either).
1228 template<
typename VT1
1231 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1233 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1235 TDVecTDMatMultExpr::selectDefaultSubAssignKernel( y, x, A );
1237 TDVecTDMatMultExpr::selectBlasSubAssignKernel( y, x, A );
// Default subtraction assignment kernel, non-vectorized overload: enabled (via
// DisableIf) only when UseVectorizedDefaultKernel does NOT apply.
// Delegates to the subAssign member of the target with the product expression x * A.
1256 template<
typename VT1
1259 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1260 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1262 y.subAssign( x * A );
// Default subtraction assignment kernel, vectorized overload: enabled only
// when UseVectorizedDefaultKernel applies to the given types.
// For each column j of A a SIMD accumulator collects products of packed x
// values and packed column values; its reduction (sum) is subtracted from y[j].
// Columns are processed in groups of 8, 4, 3 and 2, followed by a remainder part.
// NOTE(review): the declarations of j and x1 and the accumulators of the
// smaller groups are on lines not visible in this excerpt; x1 is presumably
// x.load(i) - confirm.
// NOTE(review): the inner loops advance i by IT::size while i<M, so the last
// load may reach past row M-1 unless the storage is padded - confirm.
1281 template<
typename VT1
1284 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1285 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1287 typedef IntrinsicTrait<ElementType> IT;
1289 const size_t M( A.rows() );
1290 const size_t N( A.columns() );
// Eight columns per pass.
1294 for( ; (j+8UL) <= N; j+=8UL ) {
1295 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1296 for(
size_t i=0UL; i<M; i+=IT::size ) {
1298 xmm1 = xmm1 + x1 * A.load(i,j );
1299 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1300 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1301 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1302 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1303 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1304 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1305 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Reduce each accumulator and subtract it from the existing target element.
1307 y[j ] -=
sum( xmm1 );
1308 y[j+1UL] -=
sum( xmm2 );
1309 y[j+2UL] -=
sum( xmm3 );
1310 y[j+3UL] -=
sum( xmm4 );
1311 y[j+4UL] -=
sum( xmm5 );
1312 y[j+5UL] -=
sum( xmm6 );
1313 y[j+6UL] -=
sum( xmm7 );
1314 y[j+7UL] -=
sum( xmm8 );
// Four columns per pass.
1316 for( ; (j+4UL) <= N; j+=4UL ) {
1318 for(
size_t i=0UL; i<M; i+=IT::size ) {
1320 xmm1 = xmm1 + x1 * A.load(i,j );
1321 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1322 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1323 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1325 y[j ] -=
sum( xmm1 );
1326 y[j+1UL] -=
sum( xmm2 );
1327 y[j+2UL] -=
sum( xmm3 );
1328 y[j+3UL] -=
sum( xmm4 );
// Three columns per pass.
1330 for( ; (j+3UL) <= N; j+=3UL ) {
1332 for(
size_t i=0UL; i<M; i+=IT::size ) {
1334 xmm1 = xmm1 + x1 * A.load(i,j );
1335 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1336 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1338 y[j ] -=
sum( xmm1 );
1339 y[j+1UL] -=
sum( xmm2 );
1340 y[j+2UL] -=
sum( xmm3 );
// Two columns per pass.
1342 for( ; (j+2UL) <= N; j+=2UL ) {
1344 for(
size_t i=0UL; i<M; i+=IT::size ) {
1346 xmm1 = xmm1 + x1 * A.load(i,j );
1347 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1349 y[j ] -=
sum( xmm1 );
1350 y[j+1UL] -=
sum( xmm2 );
// Remainder: a single column, x is loaded directly inside the product.
1354 for(
size_t i=0UL; i<M; i+=IT::size ) {
1355 xmm1 = xmm1 + A.load(i,j) * x.load(i);
1357 y[j] -=
sum( xmm1 );
// BLAS subtraction assignment kernel, fallback overload: enabled when
// UseDefaultKernel applies. Simply forwards to the default subtraction
// assignment kernel.
1377 template<
typename VT1
1380 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1381 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1383 selectDefaultSubAssignKernel( y, x, A );
// BLAS subtraction assignment kernel for single precision operands (enabled
// via UseSinglePrecisionKernel). Computes y -= x * A through cblas_sgemv.
1403 template<
typename VT1
1406 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1407 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1409 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
1415 const int M ( numeric_cast<int>( A.rows() ) );
1416 const int N ( numeric_cast<int>( A.columns() ) );
1417 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = -1 and beta = 1: the product is subtracted from the existing y.
1419 cblas_sgemv( CblasColMajor, CblasTrans, M, N, -1.0F,
1420 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS subtraction assignment kernel for double precision operands (enabled
// via UseDoublePrecisionKernel). Computes y -= x * A through cblas_dgemv.
1441 template<
typename VT1
1444 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1445 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1447 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
1453 const int M ( numeric_cast<int>( A.rows() ) );
1454 const int N ( numeric_cast<int>( A.columns() ) );
1455 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = -1 and beta = 1: the product is subtracted from the existing y.
1457 cblas_dgemv( CblasColMajor, CblasTrans, M, N, -1.0,
1458 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS subtraction assignment kernel for complex<float> operands (enabled via
// UseSinglePrecisionComplexKernel). Computes y -= x * A through cblas_cgemv.
1479 template<
typename VT1
1482 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1483 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1485 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
1494 const int M ( numeric_cast<int>( A.rows() ) );
1495 const int N ( numeric_cast<int>( A.columns() ) );
1496 const int lda( numeric_cast<int>( A.spacing() ) );
// Scaling factors are passed by address: alpha = -1, beta = 1 (subtract from y).
1497 const complex<float> alpha( -1.0F, 0.0F );
1498 const complex<float> beta ( 1.0F, 0.0F );
1500 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1501 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS subtraction assignment kernel for complex<double> operands (enabled via
// UseDoublePrecisionComplexKernel). Computes y -= x * A through cblas_zgemv.
1522 template<
typename VT1
1525 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1526 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1528 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
1537 const int M ( numeric_cast<int>( A.rows() ) );
1538 const int N ( numeric_cast<int>( A.columns() ) );
1539 const int lda( numeric_cast<int>( A.spacing() ) );
// Scaling factors are passed by address: alpha = -1, beta = 1 (subtract from y).
1540 const complex<double> alpha( -1.0, 0.0 );
1541 const complex<double> beta ( 1.0, 0.0 );
1543 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1544 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// NOTE(review): the first template header belongs to a function whose
// signature and body are not visible in this excerpt.
1567 template<
typename VT1 >
// The following friend function templates are only enabled when
// UseSMPAssign<VT1> applies. Their names and most of their bodies are not
// visible here; presumably these are the SMP variants of the assignment
// functions - confirm against the full file.
1603 template<
typename VT1 >
1604 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
// Degenerate shapes are handled separately: no rows, then no columns.
1611 if( rhs.mat_.rows() == 0UL ) {
1615 else if( rhs.mat_.columns() == 0UL ) {
1647 template<
typename VT1 >
1648 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1680 template<
typename VT1 >
1681 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
// Empty matrix in either dimension is special-cased.
1688 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1724 template<
typename VT1 >
1725 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
// Empty matrix in either dimension is special-cased.
1732 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1768 template<
typename VT1 >
1769 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
// Class template for the scaled product: going by the base-class argument this
// is DVecScalarMultExpr applied to a TDVecTDMatMultExpr<VT,MT> and a scalar
// type ST (presumably a partial specialization; the class-head line itself is
// not visible in this excerpt).
// NOTE(review): the declarations of the template parameters MT and ST are not
// visible here.
1818 template<
typename VT
1822 :
// CRTP-style base: the scaled expression type itself is passed to DenseVector.
public DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true >
1823 ,
private VecScalarMultExpr
1824 ,
private Computation
// Shorthand for the wrapped vector/matrix multiplication expression type.
1828 typedef TDVecTDMatMultExpr<VT,MT> VMM;
// The vector operand is evaluated up front if it is a computation or requires
// an evaluation.
1840 enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
// The matrix operand is evaluated up front if it requires an evaluation, or if
// it is a computation whose element type MET equals VET and is BLAS compatible.
// MET and VET are typedefs declared on lines not visible here (presumably the
// matrix and vector element types).
1845 enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
1846 IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// Compile-time switch for the scaled expression: value is nonzero when T1 is
// smpAssignable and evaluateVector or evaluateMatrix is set.
// NOTE(review): the struct of the same name in TDVecTDMatMultExpr does not
// test T1::smpAssignable - confirm whether the difference is intended.
1854 template<
typename T1 >
1855 struct UseSMPAssign {
1856 enum { value = T1::smpAssignable && ( evaluateVector || evaluateMatrix ) };
// Selects the single precision kernel for the scaled expression: all three
// types T1, T2, T3 are vectorizable, each of their element types satisfies
// IsFloat, and the scalar type T4 is not complex.
1865 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1866 struct UseSinglePrecisionKernel {
1867 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1868 IsFloat<typename T1::ElementType>::value &&
1869 IsFloat<typename T2::ElementType>::value &&
1870 IsFloat<typename T3::ElementType>::value &&
1871 !IsComplex<T4>::value };
// Selects the double precision kernel for the scaled expression: all three
// types T1, T2, T3 are vectorizable, each of their element types satisfies
// IsDouble, and the scalar type T4 is not complex.
1880 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1881 struct UseDoublePrecisionKernel {
1882 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1883 IsDouble<typename T1::ElementType>::value &&
1884 IsDouble<typename T2::ElementType>::value &&
1885 IsDouble<typename T3::ElementType>::value &&
1886 !IsComplex<T4>::value };
// Selects the single precision complex kernel: value is nonzero only if all
// three types are vectorizable and each element type is exactly complex<float>.
// Unlike the real-valued selectors of this class, no scalar type is involved.
1895 template<
typename T1,
typename T2,
typename T3 >
1896 struct UseSinglePrecisionComplexKernel {
// Element type every operand has to match.
1897 typedef complex<float> Type;
1898 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1899 IsSame<typename T1::ElementType,Type>::value &&
1900 IsSame<typename T2::ElementType,Type>::value &&
1901 IsSame<typename T3::ElementType,Type>::value };
// Selects the double precision complex kernel: value is nonzero only if all
// three types are vectorizable and each element type is exactly complex<double>.
// Unlike the real-valued selectors of this class, no scalar type is involved.
1910 template<
typename T1,
typename T2,
typename T3 >
1911 struct UseDoublePrecisionComplexKernel {
// Element type every operand has to match.
1912 typedef complex<double> Type;
1913 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1914 IsSame<typename T1::ElementType,Type>::value &&
1915 IsSame<typename T2::ElementType,Type>::value &&
1916 IsSame<typename T3::ElementType,Type>::value };
// Selects the non-BLAS fallback for the scaled expression: value is nonzero
// when BLAZE_BLAS_MODE is off, or when none of the four precision-specific
// kernel selectors applies. The scalar type T4 is only forwarded to the two
// real-valued selectors.
1924 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1925 struct UseDefaultKernel {
1926 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1927 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1928 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1929 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Selects the vectorized variant of the default kernel for the scaled
// expression: all three types are vectorizable, they share one element type,
// the scalar type T4 is that same type, and IntrinsicTrait of the element type
// reports both addition and multiplication.
1938 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1939 struct UseVectorizedDefaultKernel {
1940 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1941 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1942 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1943 IsSame<typename T1::ElementType,T4>::value &&
1944 IntrinsicTrait<typename T1::ElementType>::addition &&
1945 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Member typedefs and compile-time flags of the scaled expression. Several
// typedefs used below (RES, VRT, VCT, MRT, MCT, VET, MET, ElementType) are
// declared on lines not visible in this excerpt.
// Type of this expression instance.
1951 typedef DVecScalarMultExpr<VMM,ST,true>
This;
// Result type: MultTrait applied to RES and the scalar type.
1952 typedef typename MultTrait<RES,ST>::Type
ResultType;
// SIMD type associated with the element type.
1955 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left operand of the scaling is the vector/matrix product expression.
1960 typedef const TDVecTDMatMultExpr<VT,MT>
LeftOperand;
// Operand types used inside the kernels: const VRT / const MRT when the
// corresponding evaluate flag is set, otherwise VCT / MCT.
1966 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
LT;
1969 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
RT;
// Vectorizable only if both operands are vectorizable, vector, matrix and
// scalar share one element type, and that type supports SIMD addition and
// multiplication according to IntrinsicTrait.
1974 enum { vectorizable = VT::vectorizable && MT::vectorizable &&
1975 IsSame<VET,MET>::value &&
1976 IsSame<VET,ST>::value &&
1977 IntrinsicTrait<VET>::addition &&
1978 IntrinsicTrait<VET>::multiplication };
// SMP assignable only if both operands are and neither evaluate flag is set.
1981 enum { smpAssignable = !evaluateVector && VT::smpAssignable &&
1982 !evaluateMatrix && MT::smpAssignable };
// Constructor taking the vector/matrix product expression and the scalar; the
// initializer list and body are not visible in this excerpt.
1991 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
// Element access (signature not visible): element of the wrapped expression
// times the scalar.
2005 return vector_[index] * scalar_;
// Size of the scaled vector equals the size of the wrapped expression.
2014 inline size_t size()
const {
2015 return vector_.size();
// The alias queries are forwarded to the wrapped expression.
2045 template<
typename T >
2046 inline bool canAlias(
const T* alias )
const {
2047 return vector_.canAlias( alias );
2057 template<
typename T >
2058 inline bool isAliased(
const T* alias )
const {
2059 return vector_.isAliased( alias );
// Alignment query (signature not visible), forwarded to the wrapped expression.
2069 return vector_.isAligned();
// Fragment of a further query (signature not visible) that inspects the matrix
// operand of the wrapped expression; only part of its condition is visible.
2079 typename VMM::RightOperand A( vector_.rightOperand() );
2081 ( IsComputation<MT>::value && !evaluateMatrix ) ||
// Assignment of the scaled product to a dense vector.
// lhs: target dense vector, rhs: scaled vector/matrix product expression.
2105 template<
typename VT1
2107 friend inline void assign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch both operands of the wrapped product.
2113 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2114 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Degenerate shapes are handled separately: no rows, then no columns. The
// bodies of both branches are not visible in this excerpt.
2116 if( right.rows() == 0UL ) {
2120 else if( right.columns() == 0UL ) {
// Regular case: hand the operands x and A (defined on lines not visible here)
// together with the scalar to the kernel selection.
2132 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel selection for the scaled assignment y = x * A * scalar.
// The default kernel is chosen when the matrix operand is a computation that
// is not evaluated up front, or when a second condition holds that is not
// visible in this excerpt; otherwise the BLAS kernel is used (the else line
// itself is not visible either).
2147 template<
typename VT1
2151 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2153 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2155 DVecScalarMultExpr::selectDefaultAssignKernel( y, x, A, scalar );
2157 DVecScalarMultExpr::selectBlasAssignKernel( y, x, A, scalar );
// Default scaled assignment kernel, non-vectorized overload: enabled (via
// DisableIf) only when UseVectorizedDefaultKernel does NOT apply.
// Delegates to the assign member of the target with the expression x * A * scalar.
2175 template<
typename VT1
2179 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2180 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2182 y.assign( x * A * scalar );
// Default scaled assignment kernel, vectorized overload: enabled only when
// UseVectorizedDefaultKernel applies to the given types.
// For each column j of A a SIMD accumulator collects products of packed x
// values and packed column values; y[j] is set to its reduction (sum) times
// the scalar.
// Columns are processed in groups of 8, 4, 3 and 2, followed by a remainder part.
// NOTE(review): the declarations of j and x1 and the accumulators of the
// smaller groups are on lines not visible in this excerpt; x1 is presumably
// x.load(i) - confirm.
// NOTE(review): the inner loops advance i by IT::size while i<M, so the last
// load may reach past row M-1 unless the storage is padded - confirm.
2200 template<
typename VT1
2204 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2205 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2207 typedef IntrinsicTrait<ElementType> IT;
2209 const size_t M( A.rows() );
2210 const size_t N( A.columns() );
// Eight columns per pass.
2214 for( ; (j+8UL) <= N; j+=8UL ) {
2215 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2216 for(
size_t i=0UL; i<M; i+=IT::size ) {
2218 xmm1 = xmm1 + x1 * A.load(i,j );
2219 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2220 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2221 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2222 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
2223 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
2224 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
2225 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Reduce each accumulator, scale it and store it into the target element.
2227 y[j ] =
sum( xmm1 ) * scalar;
2228 y[j+1UL] =
sum( xmm2 ) * scalar;
2229 y[j+2UL] =
sum( xmm3 ) * scalar;
2230 y[j+3UL] =
sum( xmm4 ) * scalar;
2231 y[j+4UL] =
sum( xmm5 ) * scalar;
2232 y[j+5UL] =
sum( xmm6 ) * scalar;
2233 y[j+6UL] =
sum( xmm7 ) * scalar;
2234 y[j+7UL] =
sum( xmm8 ) * scalar;
// Four columns per pass.
2236 for( ; (j+4UL) <= N; j+=4UL ) {
2238 for(
size_t i=0UL; i<M; i+=IT::size ) {
2240 xmm1 = xmm1 + x1 * A.load(i,j );
2241 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2242 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2243 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2245 y[j ] =
sum( xmm1 ) * scalar;
2246 y[j+1UL] =
sum( xmm2 ) * scalar;
2247 y[j+2UL] =
sum( xmm3 ) * scalar;
2248 y[j+3UL] =
sum( xmm4 ) * scalar;
// Three columns per pass.
2250 for( ; (j+3UL) <= N; j+=3UL ) {
2252 for(
size_t i=0UL; i<M; i+=IT::size ) {
2254 xmm1 = xmm1 + x1 * A.load(i,j );
2255 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2256 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2258 y[j ] =
sum( xmm1 ) * scalar;
2259 y[j+1UL] =
sum( xmm2 ) * scalar;
2260 y[j+2UL] =
sum( xmm3 ) * scalar;
// Two columns per pass.
2262 for( ; (j+2UL) <= N; j+=2UL ) {
2264 for(
size_t i=0UL; i<M; i+=IT::size ) {
2266 xmm1 = xmm1 + x1 * A.load(i,j );
2267 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2269 y[j ] =
sum( xmm1 ) * scalar;
2270 y[j+1UL] =
sum( xmm2 ) * scalar;
// Remainder: a single column, x is loaded directly inside the product.
2274 for(
size_t i=0UL; i<M; i+=IT::size ) {
2275 xmm1 = xmm1 + A.load(i,j) * x.load(i);
2277 y[j] =
sum( xmm1 ) * scalar;
// BLAS scaled assignment kernel, fallback overload: enabled when
// UseDefaultKernel applies. Simply forwards to the default scaled assignment
// kernel.
2295 template<
typename VT1
2299 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2300 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2302 selectDefaultAssignKernel( y, x, A, scalar );
// BLAS scaled assignment kernel for single precision operands (enabled via
// UseSinglePrecisionKernel). Computes y = scalar * x * A through cblas_sgemv.
2321 template<
typename VT1
2325 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2326 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2328 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
2334 const int M ( numeric_cast<int>( A.rows() ) );
2335 const int N ( numeric_cast<int>( A.columns() ) );
2336 const int lda( numeric_cast<int>( A.spacing() ) );
// The scalar is passed as alpha; beta = 0 overwrites y.
2338 cblas_sgemv( CblasColMajor, CblasTrans, M, N, scalar,
2339 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// BLAS scaled assignment kernel for double precision operands (enabled via
// UseDoublePrecisionKernel). Computes y = scalar * x * A through cblas_dgemv.
2359 template<
typename VT1
2363 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2364 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2366 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
2372 const int M ( numeric_cast<int>( A.rows() ) );
2373 const int N ( numeric_cast<int>( A.columns() ) );
2374 const int lda( numeric_cast<int>( A.spacing() ) );
// The scalar is passed as alpha; beta = 0 overwrites y.
2376 cblas_dgemv( CblasColMajor, CblasTrans, M, N, scalar,
2377 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// BLAS scaled assignment kernel for complex<float> operands (enabled via
// UseSinglePrecisionComplexKernel). Computes y = scalar * x * A through cblas_cgemv.
2398 template<
typename VT1
2402 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2403 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2405 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
2414 const int M ( numeric_cast<int>( A.rows() ) );
2415 const int N ( numeric_cast<int>( A.columns() ) );
2416 const int lda( numeric_cast<int>( A.spacing() ) );
// The scalar is converted to a complex alpha; beta = 0 overwrites y.
// Both factors are passed by address.
2417 const complex<float> alpha( scalar );
2418 const complex<float> beta ( 0.0F, 0.0F );
2420 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2421 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS scaled assignment kernel for complex<double> operands (enabled via
// UseDoublePrecisionComplexKernel). Computes y = scalar * x * A through cblas_zgemv.
2442 template<
typename VT1
2446 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2447 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2449 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
2458 const int M ( numeric_cast<int>( A.rows() ) );
2459 const int N ( numeric_cast<int>( A.columns() ) );
2460 const int lda( numeric_cast<int>( A.spacing() ) );
// The scalar is converted to a complex alpha; beta = 0 overwrites y.
// Both factors are passed by address.
2461 const complex<double> alpha( scalar );
2462 const complex<double> beta ( 0.0, 0.0 );
2464 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2465 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Assignment of the scaled product to a sparse vector.
// lhs: target sparse vector, rhs: scaled vector/matrix product expression.
// The function body is not visible in this excerpt.
2482 template<
typename VT1
2484 friend inline void assign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Addition assignment of the scaled product to a dense vector.
// lhs: target dense vector, rhs: scaled vector/matrix product expression.
2511 template<
typename VT1
2513 friend inline void addAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch both operands of the wrapped product.
2519 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2520 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Empty matrix in either dimension is special-cased; the branch body is not visible here.
2522 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Regular case: hand the operands x and A (defined on lines not visible here)
// together with the scalar to the kernel selection.
2534 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel selection for the scaled addition assignment y += x * A * scalar.
// The default kernel is chosen when the matrix operand is a computation that
// is not evaluated up front, or when a second condition holds that is not
// visible in this excerpt; otherwise the BLAS kernel is used (the else line
// itself is not visible either).
2549 template<
typename VT1
2553 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2555 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2557 DVecScalarMultExpr::selectDefaultAddAssignKernel( y, x, A, scalar );
2559 DVecScalarMultExpr::selectBlasAddAssignKernel( y, x, A, scalar );
// Default scaled addition assignment kernel, non-vectorized overload: enabled
// (via DisableIf) only when UseVectorizedDefaultKernel does NOT apply.
// Delegates to the addAssign member of the target with the expression x * A * scalar.
2577 template<
typename VT1
2581 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2582 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2584 y.addAssign( x * A * scalar );
// Default scaled addition assignment kernel, vectorized overload: enabled only
// when UseVectorizedDefaultKernel applies to the given types.
// For each column j of A a SIMD accumulator collects products of packed x
// values and packed column values; its reduction (sum) times the scalar is
// added to y[j].
// Columns are processed in groups of 8, 4, 3 and 2, followed by a remainder part.
// NOTE(review): the declarations of j and x1 and the accumulators of the
// smaller groups are on lines not visible in this excerpt; x1 is presumably
// x.load(i) - confirm.
// NOTE(review): the inner loops advance i by IT::size while i<M, so the last
// load may reach past row M-1 unless the storage is padded - confirm.
2602 template<
typename VT1
2606 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2607 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2609 typedef IntrinsicTrait<ElementType> IT;
2611 const size_t M( A.rows() );
2612 const size_t N( A.columns() );
// Eight columns per pass.
2616 for( ; (j+8UL) <= N; j+=8UL ) {
2617 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2618 for(
size_t i=0UL; i<M; i+=IT::size ) {
2620 xmm1 = xmm1 + x1 * A.load(i,j );
2621 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2622 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2623 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2624 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
2625 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
2626 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
2627 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Reduce each accumulator, scale it and add it onto the existing target element.
2629 y[j ] +=
sum( xmm1 ) * scalar;
2630 y[j+1UL] +=
sum( xmm2 ) * scalar;
2631 y[j+2UL] +=
sum( xmm3 ) * scalar;
2632 y[j+3UL] +=
sum( xmm4 ) * scalar;
2633 y[j+4UL] +=
sum( xmm5 ) * scalar;
2634 y[j+5UL] +=
sum( xmm6 ) * scalar;
2635 y[j+6UL] +=
sum( xmm7 ) * scalar;
2636 y[j+7UL] +=
sum( xmm8 ) * scalar;
// Four columns per pass.
2638 for( ; (j+4UL) <= N; j+=4UL ) {
2640 for(
size_t i=0UL; i<M; i+=IT::size ) {
2642 xmm1 = xmm1 + x1 * A.load(i,j );
2643 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2644 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2645 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
2647 y[j ] +=
sum( xmm1 ) * scalar;
2648 y[j+1UL] +=
sum( xmm2 ) * scalar;
2649 y[j+2UL] +=
sum( xmm3 ) * scalar;
2650 y[j+3UL] +=
sum( xmm4 ) * scalar;
// Three columns per pass.
2652 for( ; (j+3UL) <= N; j+=3UL ) {
2654 for(
size_t i=0UL; i<M; i+=IT::size ) {
2656 xmm1 = xmm1 + x1 * A.load(i,j );
2657 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2658 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2660 y[j ] +=
sum( xmm1 ) * scalar;
2661 y[j+1UL] +=
sum( xmm2 ) * scalar;
2662 y[j+2UL] +=
sum( xmm3 ) * scalar;
// Two columns per pass.
2664 for( ; (j+2UL) <= N; j+=2UL ) {
2666 for(
size_t i=0UL; i<M; i+=IT::size ) {
2668 xmm1 = xmm1 + x1 * A.load(i,j );
2669 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2671 y[j ] +=
sum( xmm1 ) * scalar;
2672 y[j+1UL] +=
sum( xmm2 ) * scalar;
// Remainder: a single column, x is loaded directly inside the product.
2676 for(
size_t i=0UL; i<M; i+=IT::size ) {
2677 xmm1 = xmm1 + A.load(i,j) * x.load(i);
2679 y[j] +=
sum( xmm1 ) * scalar;
// BLAS scaled addition assignment kernel, fallback overload: enabled when
// UseDefaultKernel applies. Simply forwards to the default scaled addition
// assignment kernel.
2698 template<
typename VT1
2702 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2703 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2705 selectDefaultAddAssignKernel( y, x, A, scalar );
// BLAS scaled addition assignment kernel for single precision operands
// (enabled via UseSinglePrecisionKernel). Computes y += scalar * x * A through
// cblas_sgemv.
2724 template<
typename VT1
2728 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2729 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2731 using boost::numeric_cast;
// CBLAS takes int dimensions; numeric_cast performs the size_t to int conversion.
2737 const int M ( numeric_cast<int>( A.rows() ) );
2738 const int N ( numeric_cast<int>( A.columns() ) );
2739 const int lda( numeric_cast<int>( A.spacing() ) );
// The scalar is passed as alpha; beta = 1 accumulates onto the existing y.
2741 cblas_sgemv( CblasColMajor, CblasTrans, M, N, scalar,
2742 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Addition-assignment kernel overload selected (through EnableIf) when
// UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> holds. The whole computation is
// delegated to the CBLAS routine cblas_dgemv.
//   y      - target vector; its data() buffer is passed as the gemv output operand
//   x      - vector operand; its data() buffer is passed as the gemv input vector
//   A      - matrix operand; its data() buffer and spacing() describe the matrix
//   scalar - passed as the gemv alpha factor
// NOTE(review): this listing omits the remaining template parameters, the braces
// and the original lines 2770-2774.
2762 template<
typename VT1
2766 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2767 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2769 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast converts the values
// returned by rows()/columns()/spacing() to int.
2775 const int M ( numeric_cast<int>( A.rows() ) );
2776 const int N ( numeric_cast<int>( A.columns() ) );
2777 const int lda( numeric_cast<int>( A.spacing() ) );
// Column-major storage, transposed operation. beta is 1.0, so the scaled
// product is added to the current contents of y.
2779 cblas_dgemv( CblasColMajor, CblasTrans, M, N, scalar,
2780 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Addition-assignment kernel overload selected (through EnableIf) when
// UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds. The whole computation is
// delegated to the CBLAS routine cblas_cgemv.
//   y      - target vector; its data() buffer is passed as the gemv output operand
//   x      - vector operand; its data() buffer is passed as the gemv input vector
//   A      - matrix operand; its data() buffer and spacing() describe the matrix
//   scalar - converted to complex<float> and used as the gemv alpha factor
// NOTE(review): this listing omits the remaining template parameters, the braces
// and the original lines 2809-2816.
2801 template<
typename VT1
2805 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2806 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2808 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast converts the values
// returned by rows()/columns()/spacing() to int.
2817 const int M ( numeric_cast<int>( A.rows() ) );
2818 const int N ( numeric_cast<int>( A.columns() ) );
2819 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routine receives alpha and beta by address, hence the locals.
// beta is (1,0), so the scaled product is added to the current contents of y.
2820 const complex<float> alpha( scalar );
2821 const complex<float> beta ( 1.0F, 0.0F );
2823 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2824 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Addition-assignment kernel overload selected (through EnableIf) when
// UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds. The whole computation is
// delegated to the CBLAS routine cblas_zgemv.
//   y      - target vector; its data() buffer is passed as the gemv output operand
//   x      - vector operand; its data() buffer is passed as the gemv input vector
//   A      - matrix operand; its data() buffer and spacing() describe the matrix
//   scalar - converted to complex<double> and used as the gemv alpha factor
// NOTE(review): this listing omits the remaining template parameters, the braces
// and the original lines 2853-2860.
2845 template<
typename VT1
2849 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2850 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2852 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast converts the values
// returned by rows()/columns()/spacing() to int.
2861 const int M ( numeric_cast<int>( A.rows() ) );
2862 const int N ( numeric_cast<int>( A.columns() ) );
2863 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routine receives alpha and beta by address, hence the locals.
// beta is (1,0), so the scaled product is added to the current contents of y.
2864 const complex<double> alpha( scalar );
2865 const complex<double> beta ( 1.0, 0.0 );
2867 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2868 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs).
// The two operands of the wrapped vector/matrix product are fetched from
// rhs.vector_, an empty-matrix check is made, and the work is handed to
// selectSubAssignKernel together with rhs.scalar_.
// NOTE(review): this listing omits several lines, among them the body of the
// empty-matrix branch and the definitions of x and A used in the final call
// (presumably the evaluated left/right operands - confirm against the full header).
2889 template<
typename VT1
2891 friend inline void subAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
2897 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2898 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special case: the matrix operand has no rows or no columns.
2900 if( right.rows() == 0UL || right.columns() == 0UL ) {
2912 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Chooses the kernel used for the subtraction assignment y -= x * A * scalar.
// One branch calls selectDefaultSubAssignKernel, the other calls
// selectBlasSubAssignKernel, both with the same four arguments.
// NOTE(review): only the first half of the condition is visible here (the matrix
// type is a computation that is not evaluated); the second half, the `else`
// and the braces are missing from this listing.
2927 template<
typename VT1
2931 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2933 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2935 DVecScalarMultExpr::selectDefaultSubAssignKernel( y, x, A, scalar );
2937 DVecScalarMultExpr::selectBlasSubAssignKernel( y, x, A, scalar );
// Default subtraction-assignment kernel, used when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does NOT hold (DisableIf).
// The expression x * A * scalar is rebuilt and handed to y.subAssign().
// NOTE(review): this listing omits the remaining template parameters and the braces.
2955 template<
typename VT1
2959 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2960 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2962 y.subAssign( x * A * scalar );
// Vectorized default subtraction-assignment kernel, used when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds (EnableIf).
// Columns of A are handled in groups of 8, then 4, then 3, then 2, and finally
// a single column. For each column an intrinsic accumulator (xmm1..xmm8) collects
// the products of the loaded vector chunk and the loaded column chunk; the
// accumulator is then reduced with sum(), multiplied by scalar and subtracted
// from the matching element of y.
// NOTE(review): this listing omits many lines: the declaration of the column
// index j, the per-iteration definition of x1 (presumably x.load(i), as in the
// single-column loop below - confirm), the accumulator declarations of the
// smaller groups, the guard in front of the single-column tail and all closing braces.
2980 template<
typename VT1
2984 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2985 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2987 typedef IntrinsicTrait<ElementType> IT;
2989 const size_t M( A.rows() );
2990 const size_t N( A.columns() );
// Groups of eight columns.
2994 for( ; (j+8UL) <= N; j+=8UL ) {
2995 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// i advances by IT::size, i.e. one intrinsic vector per step.
// NOTE(review): the bound is i<M, so the last load covers elements past M
// whenever M is not a multiple of IT::size - presumably the operands are
// padded accordingly; confirm.
2996 for(
size_t i=0UL; i<M; i+=IT::size ) {
2998 xmm1 = xmm1 + x1 * A.load(i,j );
2999 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3000 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3001 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3002 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3003 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3004 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3005 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Reduce each accumulator, scale it, and subtract it from the target element.
3007 y[j ] -=
sum( xmm1 ) * scalar;
3008 y[j+1UL] -=
sum( xmm2 ) * scalar;
3009 y[j+2UL] -=
sum( xmm3 ) * scalar;
3010 y[j+3UL] -=
sum( xmm4 ) * scalar;
3011 y[j+4UL] -=
sum( xmm5 ) * scalar;
3012 y[j+5UL] -=
sum( xmm6 ) * scalar;
3013 y[j+6UL] -=
sum( xmm7 ) * scalar;
3014 y[j+7UL] -=
sum( xmm8 ) * scalar;
// Groups of four columns.
3016 for( ; (j+4UL) <= N; j+=4UL ) {
3018 for(
size_t i=0UL; i<M; i+=IT::size ) {
3020 xmm1 = xmm1 + x1 * A.load(i,j );
3021 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3022 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3023 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3025 y[j ] -=
sum( xmm1 ) * scalar;
3026 y[j+1UL] -=
sum( xmm2 ) * scalar;
3027 y[j+2UL] -=
sum( xmm3 ) * scalar;
3028 y[j+3UL] -=
sum( xmm4 ) * scalar;
// Groups of three columns.
3030 for( ; (j+3UL) <= N; j+=3UL ) {
3032 for(
size_t i=0UL; i<M; i+=IT::size ) {
3034 xmm1 = xmm1 + x1 * A.load(i,j );
3035 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3036 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3038 y[j ] -=
sum( xmm1 ) * scalar;
3039 y[j+1UL] -=
sum( xmm2 ) * scalar;
3040 y[j+2UL] -=
sum( xmm3 ) * scalar;
// Groups of two columns.
3042 for( ; (j+2UL) <= N; j+=2UL ) {
3044 for(
size_t i=0UL; i<M; i+=IT::size ) {
3046 xmm1 = xmm1 + x1 * A.load(i,j );
3047 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3049 y[j ] -=
sum( xmm1 ) * scalar;
3050 y[j+1UL] -=
sum( xmm2 ) * scalar;
// Single remaining column; here the vector chunk is loaded inline via x.load(i).
3054 for(
size_t i=0UL; i<M; i+=IT::size ) {
3055 xmm1 = xmm1 + A.load(i,j) * x.load(i);
3057 y[j] -=
sum( xmm1 ) * scalar;
// Subtraction-assignment kernel overload selected (through EnableIf) when
// UseDefaultKernel<VT1,VT2,MT1,ST2> holds. It performs no work of its own and
// forwards all four arguments unchanged to selectDefaultSubAssignKernel.
// NOTE(review): this listing omits the remaining template parameters and the braces.
3077 template<
typename VT1
3081 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3082 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3084 selectDefaultSubAssignKernel( y, x, A, scalar );
// Subtraction-assignment kernel overload selected (through EnableIf) when
// UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> holds. The whole computation is
// delegated to the CBLAS routine cblas_sgemv.
//   y      - target vector; its data() buffer is passed as the gemv output operand
//   x      - vector operand; its data() buffer is passed as the gemv input vector
//   A      - matrix operand; its data() buffer and spacing() describe the matrix
//   scalar - negated and passed as the gemv alpha factor
// NOTE(review): this listing omits the remaining template parameters, the braces
// and the original lines 3111-3115.
3103 template<
typename VT1
3107 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
3108 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3110 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast converts the values
// returned by rows()/columns()/spacing() to int.
3116 const int M ( numeric_cast<int>( A.rows() ) );
3117 const int N ( numeric_cast<int>( A.columns() ) );
3118 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = -scalar with beta = 1.0F turns the gemv accumulation into a subtraction
// of the scaled product from the current contents of y.
3120 cblas_sgemv( CblasColMajor, CblasTrans, M, N, -scalar,
3121 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Subtraction-assignment kernel overload selected (through EnableIf) when
// UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> holds. The whole computation is
// delegated to the CBLAS routine cblas_dgemv.
//   y      - target vector; its data() buffer is passed as the gemv output operand
//   x      - vector operand; its data() buffer is passed as the gemv input vector
//   A      - matrix operand; its data() buffer and spacing() describe the matrix
//   scalar - negated and passed as the gemv alpha factor
// NOTE(review): this listing omits the remaining template parameters, the braces
// and the original lines 3149-3153.
3141 template<
typename VT1
3145 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
3146 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3148 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast converts the values
// returned by rows()/columns()/spacing() to int.
3154 const int M ( numeric_cast<int>( A.rows() ) );
3155 const int N ( numeric_cast<int>( A.columns() ) );
3156 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = -scalar with beta = 1.0 turns the gemv accumulation into a subtraction
// of the scaled product from the current contents of y.
3158 cblas_dgemv( CblasColMajor, CblasTrans, M, N, -scalar,
3159 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Subtraction-assignment kernel overload selected (through EnableIf) when
// UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds. The whole computation is
// delegated to the CBLAS routine cblas_cgemv.
//   y      - target vector; its data() buffer is passed as the gemv output operand
//   x      - vector operand; its data() buffer is passed as the gemv input vector
//   A      - matrix operand; its data() buffer and spacing() describe the matrix
//   scalar - negated, converted to complex<float> and used as the gemv alpha factor
// NOTE(review): this listing omits the remaining template parameters, the braces
// and the original lines 3189-3196.
3181 template<
typename VT1
3185 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
3186 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3188 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast converts the values
// returned by rows()/columns()/spacing() to int.
3197 const int M ( numeric_cast<int>( A.rows() ) );
3198 const int N ( numeric_cast<int>( A.columns() ) );
3199 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routine receives alpha and beta by address, hence the locals.
// alpha = -scalar with beta = (1,0) subtracts the scaled product from y.
3200 const complex<float> alpha( -scalar );
3201 const complex<float> beta ( 1.0F, 0.0F );
3203 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
3204 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction-assignment kernel overload selected (through EnableIf) when
// UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds. The whole computation is
// delegated to the CBLAS routine cblas_zgemv.
//   y      - target vector; its data() buffer is passed as the gemv output operand
//   x      - vector operand; its data() buffer is passed as the gemv input vector
//   A      - matrix operand; its data() buffer and spacing() describe the matrix
//   scalar - negated, converted to complex<double> and used as the gemv alpha factor
// NOTE(review): this listing omits the remaining template parameters, the braces
// and the original lines 3234-3241.
3226 template<
typename VT1
3230 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
3231 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3233 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast converts the values
// returned by rows()/columns()/spacing() to int.
3242 const int M ( numeric_cast<int>( A.rows() ) );
3243 const int N ( numeric_cast<int>( A.columns() ) );
3244 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routine receives alpha and beta by address, hence the locals.
// alpha = -scalar with beta = (1,0) subtracts the scaled product from y.
3245 const complex<double> alpha( -scalar );
3246 const complex<double> beta ( 1.0, 0.0 );
3248 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
3249 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Multiplication assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs).
// NOTE(review): only the signature is present in this listing; the function body
// and the remaining template parameters are not shown.
3270 template<
typename VT1
3272 friend inline void multAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// SMP assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs).
// The overload takes part in overload resolution only when UseSMPAssign<VT1>
// holds (EnableIf).
// NOTE(review): this listing omits the branch bodies and everything after the
// two size checks.
3305 template<
typename VT1
3307 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3308 smpAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Operands of the wrapped vector/matrix product.
3314 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
3315 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// A matrix operand without rows and one without columns are handled by two
// separate branches.
3317 if( right.rows() == 0UL ) {
3321 else if( right.columns() == 0UL ) {
// SMP assignment of a DVecScalarMultExpr (rhs) to a sparse vector (lhs).
// The overload takes part in overload resolution only when UseSMPAssign<VT1>
// holds (EnableIf).
// NOTE(review): only the signature is present in this listing; the body is not shown.
3351 template<
typename VT1
3353 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3354 smpAssign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// SMP addition assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs).
// The overload takes part in overload resolution only when UseSMPAssign<VT1>
// holds (EnableIf).
// NOTE(review): this listing omits the body of the empty-matrix branch and
// everything that follows it.
3383 template<
typename VT1
3385 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3386 smpAddAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Operands of the wrapped vector/matrix product.
3392 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
3393 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special case: the matrix operand has no rows or no columns.
3395 if( right.rows() == 0UL || right.columns() == 0UL ) {
// SMP subtraction assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs).
// The overload takes part in overload resolution only when UseSMPAssign<VT1>
// holds (EnableIf).
// NOTE(review): this listing omits the body of the empty-matrix branch and
// everything that follows it.
3429 template<
typename VT1
3431 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3432 smpSubAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Operands of the wrapped vector/matrix product.
3438 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
3439 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special case: the matrix operand has no rows or no columns.
3441 if( right.rows() == 0UL || right.columns() == 0UL ) {
// SMP multiplication assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs).
// The overload takes part in overload resolution only when UseSMPAssign<VT1>
// holds (EnableIf).
// NOTE(review): only the signature is present in this listing; the body is not shown.
3475 template<
typename VT1
3477 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3478 smpMultAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Free function returning a TDVecTDMatMultExpr<T1,T2>; it is disabled (DisableIf)
// when T2 is itself a matrix/matrix multiplication expression.
// Throws std::invalid_argument when the vector size differs from the number of
// matrix rows.
// NOTE(review): the line carrying the function name and parameters, the second
// template parameter and the return statement are missing from this listing;
// presumably this is the vector * matrix operator - confirm against the full header.
3551 template<
typename T1
3553 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecTDMatMultExpr<T1,T2> >::Type
// The product is only defined when the vector length matches the matrix row count.
3558 if( (~vec).
size() != (~mat).
rows() )
3559 throw std::invalid_argument(
"Vector and matrix sizes do not match" );
// Expression-trait typedef: Type is the MultExprTrait result for VT combined with
// the SubmatrixExprTrait<const MT,AF> type of the matrix operand.
// NOTE(review): the declaration of the enclosing trait specialization is missing
// from this listing, so its name and the meaning of AF cannot be confirmed here.
3576 template<
typename VT,
typename MT,
bool AF >
3581 typedef typename MultExprTrait< VT, typename SubmatrixExprTrait<const MT,AF>::Type >::Type Type;
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:251
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:135
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:89
const size_t end_
End of the unrolled calculation loop.
Definition: TDVecTDMatMultExpr.h:399
LeftOperand leftOperand() const
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:333
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:343
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4599
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:4329
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:152
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:199
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
void smpMultAssign(DenseVector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:179
int16_t sum(const sse_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:62
Header file for the IsSame and IsStrictlySame type traits.
const size_t TDVECTDMATMULT_THRESHOLD
Dense Vector/column-major dense matrix multiplication threshold.This setting specifies the threshold ...
Definition: Thresholds.h:108
Constraint on the data type.
MT::CompositeType MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:115
MRT::ElementType MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:113
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2408
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:251
SelectType< evaluateMatrix, const MRT, MCT >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:260
Header file for the DenseVector base class.
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:690
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:254
Header file for the RequiresEvaluation type trait.
Header file for the VecScalarMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Constraint on the data type.
Header file for the MultExprTrait class template.
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:122
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:253
Compile time type selection.The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:377
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:355
ResultType::ElementType ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:245
Header file for the IsMatMatMultExpr type trait class.
Header file for the IsBlasCompatible type trait.
TDVecTDMatMultExpr< VT, MT > This
Type of this TDVecTDMatMultExpr instance.
Definition: TDVecTDMatMultExpr.h:242
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:271
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
VT::CompositeType VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:114
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDVecTDMatMultExpr.h:246
SelectType< evaluateVector, const VRT, VCT >::Type LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:257
Constraints on the storage order of matrix types.
Constraint on the data type.
MT::ResultType MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:111
VT::ResultType VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:110
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2406
void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:361
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:296
Header file for the EnableIf class template.
Header file for the serial shim.
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:92
Header file for the IsNumeric type trait.
Header file for the SubmatrixExprTrait class template.
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:387
System settings for the BLAS mode.
Intrinsic characteristics of data types.The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:301
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:397
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:248
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:331
Header file for the TVecMatMultExpr base class.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:247
VRT::ElementType VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:112
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:244
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:398
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:367
Header file for the IsComputation type trait class.
MultTrait< VRT, MRT >::Type ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:243
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:250
const size_t TDVECDMATMULT_THRESHOLD
Dense Vector/row-major dense matrix multiplication threshold.This setting specifies the threshold bet...
Definition: Thresholds.h:91
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2403
TDVecTDMatMultExpr(const VT &vec, const MT &mat)
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:281
Header file for basic type definitions.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: TransposeFlag.h:81
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
const size_t SMP_TDVECTDMATMULT_THRESHOLD
SMP dense vector/column-major dense matrix multiplication threshold.This threshold specifies when a d...
Definition: Thresholds.h:391
Constraint on the data type.
Size type of the Blaze library.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
size_t rows(const Matrix< MT, SO > &m)
Returns the current number of rows of the matrix.
Definition: Matrix.h:154
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
size_t size() const
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:323
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.