22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
92 typedef typename VT::ResultType
VRT;
93 typedef typename MT::ResultType
MRT;
94 typedef typename VRT::ElementType
VET;
95 typedef typename MRT::ElementType
MET;
96 typedef typename VT::CompositeType
VCT;
97 typedef typename MT::CompositeType
MCT;
112 template<
typename T1,
typename T2,
typename T3 >
113 struct UseSinglePrecisionKernel {
// Compile-time kernel selector: true when all three operand types (target
// vector, source vector, matrix) have 'double' elements, so the
// double-precision BLAS kernel (cblas_dgemv) can be used.
// NOTE(review): extraction fragment — the struct's closing '};' is not
// visible in this chunk.
127 template<
typename T1,
typename T2,
typename T3 >
128 struct UseDoublePrecisionKernel {
129 enum { value = IsDouble<typename T1::ElementType>::value &&
130 IsDouble<typename T2::ElementType>::value &&
131 IsDouble<typename T3::ElementType>::value };
// Compile-time kernel selector: true when all three operand element types are
// complex<float>, enabling the single-precision complex BLAS kernel
// (cblas_cgemv).
142 template<
typename T1,
typename T2,
typename T3 >
143 struct UseSinglePrecisionComplexKernel {
144 typedef complex<float> Type;
145 enum { value = IsSame<typename T1::ElementType,Type>::value &&
146 IsSame<typename T2::ElementType,Type>::value &&
147 IsSame<typename T3::ElementType,Type>::value };
// Compile-time kernel selector: true when all three operand element types are
// complex<double>, enabling the double-precision complex BLAS kernel
// (cblas_zgemv).
158 template<
typename T1,
typename T2,
typename T3 >
159 struct UseDoublePrecisionComplexKernel {
160 typedef complex<double> Type;
161 enum { value = IsSame<typename T1::ElementType,Type>::value &&
162 IsSame<typename T2::ElementType,Type>::value &&
163 IsSame<typename T3::ElementType,Type>::value };
// Compile-time kernel selector: true when BLAS mode is disabled altogether, or
// when none of the four specialized BLAS selectors above matches the operand
// element types — in that case the hand-written default kernel is used.
173 template<
typename T1,
typename T2,
typename T3 >
174 struct UseDefaultKernel {
175 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
176 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
177 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
178 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time kernel selector for the SIMD-optimized default kernel: requires
// all three operands to be vectorizable, to share one element type, and that
// type to support intrinsic addition and multiplication.
189 template<
typename T1,
typename T2,
typename T3 >
190 struct UseVectorizedDefaultKernel {
191 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
192 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
193 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
194 IntrinsicTrait<typename T1::ElementType>::addition &&
195 IntrinsicTrait<typename T1::ElementType>::multiplication };
225 enum { vectorizable = 0 };
259 if(
mat_.rows() != 0UL ) {
261 for(
size_t j=1UL; j<
end_; j+=2UL ) {
264 if( end_ < mat_.rows() ) {
282 return mat_.columns();
312 template<
typename T >
340 template<
typename VT1 >
345 if( rhs.
mat_.rows() == 0UL ) {
349 else if( rhs.
mat_.columns() == 0UL ) {
363 TDVecDMatMultExpr::selectDefaultAssignKernel( ~lhs, x, A );
365 TDVecDMatMultExpr::selectBlasAssignKernel( ~lhs, x, A );
// Default (scalar, non-SIMD) kernel for the assignment y = x^T * A.
// Initializes y from matrix row 0, then accumulates the remaining rows with
// the column loop unrolled by two; jend rounds N down to an even count.
// NOTE(review): extraction fragment — the template parameter list, braces and
// the guard around the tail column (line 405) are not visible here;
// presumably the tail update runs only when N is odd — confirm in full source.
384 template<
typename VT1
388 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
390 const size_t M( A.rows() );
391 const size_t N( A.columns() );
// jend = N & ~1 : largest even column count, for the 2-way unrolled loop.
394 const size_t jend( N &
size_t(-2) );
396 for(
size_t j=0UL; j<N; ++j ) {
397 y[j] = x[0UL] * A(0UL,j);
399 for(
size_t i=1UL; i<M; ++i ) {
400 for(
size_t j=0UL; j<jend; j+=2UL ) {
401 y[j ] += x[i] * A(i,j );
402 y[j+1UL] += x[i] * A(i,j+1UL);
// Tail column for odd N.
405 y[jend] += x[i] * A(i,jend);
426 template<
typename VT1
429 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
430 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
432 typedef IntrinsicTrait<ElementType> IT;
434 const size_t M( A.rows() );
435 const size_t N( A.spacing() );
439 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
440 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
441 for(
size_t i=0UL; i<M; ++i ) {
443 xmm1 = xmm1 + x1 * A.get(i,j );
444 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
445 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
446 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
447 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
448 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
449 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
450 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
452 store( &y[j ], xmm1 );
453 store( &y[j+IT::size ], xmm2 );
454 store( &y[j+IT::size*2UL], xmm3 );
455 store( &y[j+IT::size*3UL], xmm4 );
456 store( &y[j+IT::size*4UL], xmm5 );
457 store( &y[j+IT::size*5UL], xmm6 );
458 store( &y[j+IT::size*6UL], xmm7 );
459 store( &y[j+IT::size*7UL], xmm8 );
461 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
463 for(
size_t i=0UL; i<M; ++i ) {
465 xmm1 = xmm1 + x1 * A.get(i,j );
466 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
467 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
468 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
470 store( &y[j ], xmm1 );
471 store( &y[j+IT::size ], xmm2 );
472 store( &y[j+IT::size*2UL], xmm3 );
473 store( &y[j+IT::size*3UL], xmm4 );
475 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
477 for(
size_t i=0UL; i<M; ++i ) {
479 xmm1 = xmm1 + x1 * A.get(i,j );
480 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
481 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
483 store( &y[j ], xmm1 );
484 store( &y[j+IT::size ], xmm2 );
485 store( &y[j+IT::size*2UL], xmm3 );
487 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
489 for(
size_t i=0UL; i<M; ++i ) {
491 xmm1 = xmm1 + x1 * A.get(i,j );
492 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
494 store( &y[j ], xmm1 );
495 store( &y[j+IT::size], xmm2 );
499 for(
size_t i=0UL; i<M; ++i ) {
500 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
502 store( &y[j], xmm1 );
// Fallback "BLAS" kernel: selected when no specialized BLAS routine applies
// (see UseDefaultKernel); simply forwards to the default assignment kernel.
522 template<
typename VT1
525 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
526 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
528 selectDefaultAssignKernel( y, x, A );
// Single-precision BLAS kernel: y = A^T * x (alpha = 1, beta = 0) via
// cblas_sgemv. CblasRowMajor + CblasTrans on A realizes the x^T * A product;
// lda is A.spacing() (the padded row width), not A.columns().
548 template<
typename VT1
551 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
552 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
554 using boost::numeric_cast;
560 const int M ( numeric_cast<int>( A.rows() ) );
561 const int N ( numeric_cast<int>( A.columns() ) );
562 const int lda( numeric_cast<int>( A.spacing() ) );
564 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, 1.0F,
565 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision BLAS kernel: y = A^T * x (alpha = 1, beta = 0) via
// cblas_dgemv; lda uses the padded row width A.spacing().
586 template<
typename VT1
589 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
590 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
592 using boost::numeric_cast;
598 const int M ( numeric_cast<int>( A.rows() ) );
599 const int N ( numeric_cast<int>( A.columns() ) );
600 const int lda( numeric_cast<int>( A.spacing() ) );
602 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0,
603 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// complex<float> BLAS kernel: y = A^T * x via cblas_cgemv. Complex BLAS takes
// alpha/beta by pointer, hence the local alpha(1,0) and beta(0,0).
624 template<
typename VT1
627 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
628 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
630 using boost::numeric_cast;
639 const int M ( numeric_cast<int>( A.rows() ) );
640 const int N ( numeric_cast<int>( A.columns() ) );
641 const int lda( numeric_cast<int>( A.spacing() ) );
642 const complex<float> alpha( 1.0F, 0.0F );
643 const complex<float> beta ( 0.0F, 0.0F );
645 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
646 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> BLAS kernel: y = A^T * x via cblas_zgemv (alpha = 1+0i,
// beta = 0+0i, passed by pointer as the complex CBLAS interface requires).
667 template<
typename VT1
670 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
671 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
673 using boost::numeric_cast;
682 const int M ( numeric_cast<int>( A.rows() ) );
683 const int N ( numeric_cast<int>( A.columns() ) );
684 const int lda( numeric_cast<int>( A.spacing() ) );
685 const complex<double> alpha( 1.0, 0.0 );
686 const complex<double> beta ( 0.0, 0.0 );
688 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
689 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
708 template<
typename VT1 >
736 template<
typename VT1 >
741 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
753 if( ( IsComputation<MT>::value && !evaluate ) ||
755 TDVecDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A );
757 TDVecDMatMultExpr::selectBlasAddAssignKernel( ~lhs, x, A );
// Default (scalar) kernel for the addition assignment y += x^T * A: for each
// matrix row i, adds x[i] * A(i,j) into every y[j], with the column loop
// unrolled by two (jend = even part of N).
// NOTE(review): extraction fragment — braces and the odd-N guard around the
// tail update (line 794) are not visible here.
776 template<
typename VT1
779 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
780 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
782 const size_t M( A.rows() );
783 const size_t N( A.columns() );
// jend = N & ~1 : even column count for the 2-way unrolled loop.
786 const size_t jend( N &
size_t(-2) );
788 for(
size_t i=0UL; i<M; ++i ) {
789 for(
size_t j=0UL; j<jend; j+=2UL ) {
790 y[j ] += x[i] * A(i,j );
791 y[j+1UL] += x[i] * A(i,j+1UL);
// Tail column for odd N.
794 y[jend] += x[i] * A(i,jend);
815 template<
typename VT1
818 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
819 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
821 typedef IntrinsicTrait<ElementType> IT;
823 const size_t M( A.rows() );
824 const size_t N( A.spacing() );
828 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
837 for(
size_t i=0UL; i<M; ++i ) {
839 xmm1 = xmm1 + x1 * A.get(i,j );
840 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
841 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
842 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
843 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
844 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
845 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
846 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
848 store( &y[j ], xmm1 );
849 store( &y[j+IT::size ], xmm2 );
850 store( &y[j+IT::size*2UL], xmm3 );
851 store( &y[j+IT::size*3UL], xmm4 );
852 store( &y[j+IT::size*4UL], xmm5 );
853 store( &y[j+IT::size*5UL], xmm6 );
854 store( &y[j+IT::size*6UL], xmm7 );
855 store( &y[j+IT::size*7UL], xmm8 );
857 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
862 for(
size_t i=0UL; i<M; ++i ) {
864 xmm1 = xmm1 + x1 * A.get(i,j );
865 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
866 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
867 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
869 store( &y[j ], xmm1 );
870 store( &y[j+IT::size ], xmm2 );
871 store( &y[j+IT::size*2UL], xmm3 );
872 store( &y[j+IT::size*3UL], xmm4 );
874 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
878 for(
size_t i=0UL; i<M; ++i ) {
880 xmm1 = xmm1 + x1 * A.get(i,j );
881 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
882 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
884 store( &y[j ], xmm1 );
885 store( &y[j+IT::size ], xmm2 );
886 store( &y[j+IT::size*2UL], xmm3 );
888 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
891 for(
size_t i=0UL; i<M; ++i ) {
893 xmm1 = xmm1 + x1 * A.get(i,j );
894 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
896 store( &y[j ], xmm1 );
897 store( &y[j+IT::size], xmm2 );
901 for(
size_t i=0UL; i<M; ++i ) {
902 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
904 store( &y[j], xmm1 );
// Fallback "BLAS" kernel for the addition assignment: no specialized BLAS
// routine applies, so it forwards to the default add-assign kernel.
924 template<
typename VT1
927 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
928 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
930 selectDefaultAddAssignKernel( y, x, A );
// Single-precision BLAS kernel for y += A^T * x: cblas_sgemv with alpha = 1
// and beta = 1, so the previous contents of y are kept and accumulated into.
950 template<
typename VT1
953 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
954 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
956 using boost::numeric_cast;
962 const int M ( numeric_cast<int>( A.rows() ) );
963 const int N ( numeric_cast<int>( A.columns() ) );
964 const int lda( numeric_cast<int>( A.spacing() ) );
966 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, 1.0F,
967 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS kernel for y += A^T * x (alpha = 1, beta = 1).
988 template<
typename VT1
991 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
992 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
994 using boost::numeric_cast;
1000 const int M ( numeric_cast<int>( A.rows() ) );
1001 const int N ( numeric_cast<int>( A.columns() ) );
1002 const int lda( numeric_cast<int>( A.spacing() ) );
1004 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0,
1005 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// complex<float> BLAS kernel for y += A^T * x: alpha = 1+0i, beta = 1+0i so
// the result is accumulated onto the existing y.
1026 template<
typename VT1
1029 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1030 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1032 using boost::numeric_cast;
1041 const int M ( numeric_cast<int>( A.rows() ) );
1042 const int N ( numeric_cast<int>( A.columns() ) );
1043 const int lda( numeric_cast<int>( A.spacing() ) );
1044 const complex<float> alpha( 1.0F, 0.0F );
1045 const complex<float> beta ( 1.0F, 0.0F );
1047 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1048 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> BLAS kernel for y += A^T * x (alpha = 1+0i, beta = 1+0i).
1069 template<
typename VT1
1072 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1073 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1075 using boost::numeric_cast;
1084 const int M ( numeric_cast<int>( A.rows() ) );
1085 const int N ( numeric_cast<int>( A.columns() ) );
1086 const int lda( numeric_cast<int>( A.spacing() ) );
1087 const complex<double> alpha( 1.0, 0.0 );
1088 const complex<double> beta ( 1.0, 0.0 );
1090 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1091 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1114 template<
typename VT1 >
1119 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1131 if( ( IsComputation<MT>::value && !evaluate ) ||
1133 TDVecDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A );
1135 TDVecDMatMultExpr::selectBlasSubAssignKernel( ~lhs, x, A );
// Default (scalar) kernel for the subtraction assignment y -= x^T * A: mirror
// of the add-assign kernel with '-=' accumulation; column loop unrolled by
// two (jend = even part of N).
// NOTE(review): extraction fragment — braces and the odd-N guard around the
// tail update (line 1172) are not visible here.
1154 template<
typename VT1
1157 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1158 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1160 const size_t M( A.rows() );
1161 const size_t N( A.columns() );
// jend = N & ~1 : even column count for the 2-way unrolled loop.
1164 const size_t jend( N &
size_t(-2) );
1166 for(
size_t i=0UL; i<M; ++i ) {
1167 for(
size_t j=0UL; j<jend; j+=2UL ) {
1168 y[j ] -= x[i] * A(i,j );
1169 y[j+1UL] -= x[i] * A(i,j+1UL);
// Tail column for odd N.
1172 y[jend] -= x[i] * A(i,jend);
1193 template<
typename VT1
1196 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1197 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1199 typedef IntrinsicTrait<ElementType> IT;
1201 const size_t M( A.rows() );
1202 const size_t N( A.spacing() );
1206 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1215 for(
size_t i=0UL; i<M; ++i ) {
1217 xmm1 = xmm1 - x1 * A.get(i,j );
1218 xmm2 = xmm2 - x1 * A.get(i,j+IT::size );
1219 xmm3 = xmm3 - x1 * A.get(i,j+IT::size*2UL);
1220 xmm4 = xmm4 - x1 * A.get(i,j+IT::size*3UL);
1221 xmm5 = xmm5 - x1 * A.get(i,j+IT::size*4UL);
1222 xmm6 = xmm6 - x1 * A.get(i,j+IT::size*5UL);
1223 xmm7 = xmm7 - x1 * A.get(i,j+IT::size*6UL);
1224 xmm8 = xmm8 - x1 * A.get(i,j+IT::size*7UL);
1226 store( &y[j ], xmm1 );
1227 store( &y[j+IT::size ], xmm2 );
1228 store( &y[j+IT::size*2UL], xmm3 );
1229 store( &y[j+IT::size*3UL], xmm4 );
1230 store( &y[j+IT::size*4UL], xmm5 );
1231 store( &y[j+IT::size*5UL], xmm6 );
1232 store( &y[j+IT::size*6UL], xmm7 );
1233 store( &y[j+IT::size*7UL], xmm8 );
1235 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1240 for(
size_t i=0UL; i<M; ++i ) {
1242 xmm1 = xmm1 - x1 * A.get(i,j );
1243 xmm2 = xmm2 - x1 * A.get(i,j+IT::size );
1244 xmm3 = xmm3 - x1 * A.get(i,j+IT::size*2UL);
1245 xmm4 = xmm4 - x1 * A.get(i,j+IT::size*3UL);
1247 store( &y[j ], xmm1 );
1248 store( &y[j+IT::size ], xmm2 );
1249 store( &y[j+IT::size*2UL], xmm3 );
1250 store( &y[j+IT::size*3UL], xmm4 );
1252 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
1256 for(
size_t i=0UL; i<M; ++i ) {
1258 xmm1 = xmm1 - x1 * A.get(i,j );
1259 xmm2 = xmm2 - x1 * A.get(i,j+IT::size );
1260 xmm3 = xmm3 - x1 * A.get(i,j+IT::size*2UL);
1262 store( &y[j ], xmm1 );
1263 store( &y[j+IT::size ], xmm2 );
1264 store( &y[j+IT::size*2UL], xmm3 );
1266 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1269 for(
size_t i=0UL; i<M; ++i ) {
1271 xmm1 = xmm1 - x1 * A.get(i,j );
1272 xmm2 = xmm2 - x1 * A.get(i,j+IT::size);
1274 store( &y[j ], xmm1 );
1275 store( &y[j+IT::size], xmm2 );
1279 for(
size_t i=0UL; i<M; ++i ) {
1280 xmm1 = xmm1 -
set( x[i] ) * A.get(i,j);
1282 store( &y[j], xmm1 );
// Fallback "BLAS" kernel for the subtraction assignment: forwards to the
// default sub-assign kernel when no specialized BLAS routine applies.
1302 template<
typename VT1
1305 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1306 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1308 selectDefaultSubAssignKernel( y, x, A );
// Single-precision BLAS kernel for y -= A^T * x: implemented as cblas_sgemv
// with alpha = -1 and beta = 1 (negated product accumulated onto y).
1328 template<
typename VT1
1331 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1332 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1334 using boost::numeric_cast;
1340 const int M ( numeric_cast<int>( A.rows() ) );
1341 const int N ( numeric_cast<int>( A.columns() ) );
1342 const int lda( numeric_cast<int>( A.spacing() ) );
1344 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, -1.0F,
1345 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS kernel for y -= A^T * x (alpha = -1, beta = 1).
1366 template<
typename VT1
1369 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1370 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1372 using boost::numeric_cast;
1378 const int M ( numeric_cast<int>( A.rows() ) );
1379 const int N ( numeric_cast<int>( A.columns() ) );
1380 const int lda( numeric_cast<int>( A.spacing() ) );
1382 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, -1.0,
1383 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// complex<float> BLAS kernel for y -= A^T * x (alpha = -1+0i, beta = 1+0i).
1404 template<
typename VT1
1407 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1408 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1410 using boost::numeric_cast;
1419 const int M ( numeric_cast<int>( A.rows() ) );
1420 const int N ( numeric_cast<int>( A.columns() ) );
1421 const int lda( numeric_cast<int>( A.spacing() ) );
1422 const complex<float> alpha( -1.0F, 0.0F );
1423 const complex<float> beta ( 1.0F, 0.0F );
1425 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1426 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> BLAS kernel for y -= A^T * x (alpha = -1+0i, beta = 1+0i).
1447 template<
typename VT1
1450 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1451 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1453 using boost::numeric_cast;
1462 const int M ( numeric_cast<int>( A.rows() ) );
1463 const int N ( numeric_cast<int>( A.columns() ) );
1464 const int lda( numeric_cast<int>( A.spacing() ) );
1465 const complex<double> alpha( -1.0, 0.0 );
1466 const complex<double> beta ( 1.0, 0.0 );
1468 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1469 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1492 template<
typename VT1 >
1539 template<
typename VT
1543 :
public DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true >
1544 ,
private Expression
1545 ,
private Computation
1550 typedef typename VMM::ResultType RES;
1551 typedef typename VT::ResultType
VRT;
1552 typedef typename MT::ResultType
MRT;
1553 typedef typename VRT::ElementType
VET;
1554 typedef typename MRT::ElementType
MET;
1555 typedef typename VT::CompositeType
VCT;
1556 typedef typename MT::CompositeType
MCT;
1561 enum { evaluate = IsComputation<MT>::value && !MT::vectorizable &&
1562 IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
// Kernel selector for the scaled expression (DVecScalarMultExpr): true when
// all three operand element types are 'float' AND the scalar type T4 is not
// complex, so the scalar can be passed directly as sgemv's alpha.
1570 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1571 struct UseSinglePrecisionKernel {
1572 enum { value = IsFloat<typename T1::ElementType>::value &&
1573 IsFloat<typename T2::ElementType>::value &&
1574 IsFloat<typename T3::ElementType>::value &&
1575 !IsComplex<T4>::value };
// Kernel selector for the scaled expression: true when all three operand
// element types are 'double' AND the scalar type T4 is not complex
// (enables the dgemv kernel with 'scalar' as alpha).
1584 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1585 struct UseDoublePrecisionKernel {
1586 enum { value = IsDouble<typename T1::ElementType>::value &&
1587 IsDouble<typename T2::ElementType>::value &&
1588 IsDouble<typename T3::ElementType>::value &&
1589 !IsComplex<T4>::value };
// Kernel selector: all three operand element types are complex<float>
// (the scalar is converted to a complex alpha in the cgemv kernel below).
1598 template<
typename T1,
typename T2,
typename T3 >
1599 struct UseSinglePrecisionComplexKernel {
1600 typedef complex<float> Type;
1601 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1602 IsSame<typename T2::ElementType,Type>::value &&
1603 IsSame<typename T3::ElementType,Type>::value };
// Kernel selector: all three operand element types are complex<double>
// (enables the zgemv kernel with the scalar as complex alpha).
1612 template<
typename T1,
typename T2,
typename T3 >
1613 struct UseDoublePrecisionComplexKernel {
1614 typedef complex<double> Type;
1615 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1616 IsSame<typename T2::ElementType,Type>::value &&
1617 IsSame<typename T3::ElementType,Type>::value };
// Kernel selector for the scaled expression: true when BLAS mode is disabled
// or none of the four specialized BLAS selectors (which here also inspect the
// scalar type T4 for the real-valued cases) matches.
1625 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1626 struct UseDefaultKernel {
1627 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1628 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1629 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1630 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// SIMD-kernel selector for the scaled expression: like the unscaled variant,
// but additionally requires the scalar type T4 to equal the common element
// type so it can be broadcast into an intrinsic 'factor'.
1639 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1640 struct UseVectorizedDefaultKernel {
1641 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1642 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1643 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1644 IsSame<typename T1::ElementType,T4>::value &&
1645 IntrinsicTrait<typename T1::ElementType>::addition &&
1646 IntrinsicTrait<typename T1::ElementType>::multiplication };
1652 typedef DVecScalarMultExpr<VMM,ST,true>
This;
1653 typedef typename MultTrait<RES,ST>::Type
ResultType;
1655 typedef typename ResultType::ElementType
ElementType;
1656 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
1667 typedef typename SelectType< IsComputation<VT>::value,
const VRT,
VCT >::Type
LT;
1670 typedef typename SelectType< evaluate, const MRT, MCT >::Type
RT;
1675 enum { vectorizable = 0 };
1678 enum { canAlias = CanAlias<VMM>::value };
1687 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
1701 return vector_[index] * scalar_;
1710 inline size_t size()
const {
1711 return vector_.size();
1741 template<
typename T >
1742 inline bool isAliased(
const T* alias )
const {
1743 return CanAlias<VMM>::value && vector_.isAliased( alias );
1765 template<
typename VT1 >
1766 friend inline void assign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
1770 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
1771 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
1773 if( right.rows() == 0UL ) {
1777 else if( right.columns() == 0UL ) {
1789 if( ( IsComputation<MT>::value && !evaluate ) ||
1791 DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, x, A, rhs.scalar_ );
1793 DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Default (scalar) kernel for the scaled assignment y = (x^T * A) * scalar.
// First computes the unscaled product (row 0 initializes y, remaining rows
// accumulate with 2-way column unrolling), then a final pass over y applies
// 'scalar'.
// NOTE(review): extraction fragment — the body of the scaling loop at line
// 1836 is not visible; presumably y[j] *= scalar — confirm in full source.
1811 template<
typename VT1
1815 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1816 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1818 const size_t M( A.rows() );
1819 const size_t N( A.columns() );
// jend = N & ~1 : even column count for the 2-way unrolled loop.
1822 const size_t jend( N &
size_t(-2) );
1824 for(
size_t j=0UL; j<N; ++j ) {
1825 y[j] = x[0UL] * A(0UL,j);
1827 for(
size_t i=1UL; i<M; ++i ) {
1828 for(
size_t j=0UL; j<jend; j+=2UL ) {
1829 y[j ] += x[i] * A(i,j );
1830 y[j+1UL] += x[i] * A(i,j+1UL);
// Tail column for odd N.
1833 y[jend] += x[i] * A(i,jend);
// Final pass: scale the accumulated result (loop body not visible here).
1836 for(
size_t j=0UL; j<N; ++j ) {
1856 template<
typename VT1
1860 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1861 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1863 typedef IntrinsicTrait<ElementType> IT;
1865 const size_t M( A.rows() );
1866 const size_t N( A.spacing() );
1872 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
1873 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1874 for(
size_t i=0UL; i<M; ++i ) {
1876 xmm1 = xmm1 + x1 * A.get(i,j );
1877 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
1878 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
1879 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
1880 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
1881 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
1882 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
1883 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
1885 store( &y[j ], xmm1*factor );
1886 store( &y[j+IT::size ], xmm2*factor );
1887 store( &y[j+IT::size*2UL], xmm3*factor );
1888 store( &y[j+IT::size*3UL], xmm4*factor );
1889 store( &y[j+IT::size*4UL], xmm5*factor );
1890 store( &y[j+IT::size*5UL], xmm6*factor );
1891 store( &y[j+IT::size*6UL], xmm7*factor );
1892 store( &y[j+IT::size*7UL], xmm8*factor );
1894 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
1896 for(
size_t i=0UL; i<M; ++i ) {
1898 xmm1 = xmm1 + x1 * A.get(i,j );
1899 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
1900 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
1901 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
1903 store( &y[j ], xmm1*factor );
1904 store( &y[j+IT::size ], xmm2*factor );
1905 store( &y[j+IT::size*2UL], xmm3*factor );
1906 store( &y[j+IT::size*3UL], xmm4*factor );
1908 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
1910 for(
size_t i=0UL; i<M; ++i ) {
1912 xmm1 = xmm1 + x1 * A.get(i,j );
1913 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
1914 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
1916 store( &y[j ], xmm1*factor );
1917 store( &y[j+IT::size ], xmm2*factor );
1918 store( &y[j+IT::size*2UL], xmm3*factor );
1920 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
1922 for(
size_t i=0UL; i<M; ++i ) {
1924 xmm1 = xmm1 + x1 * A.get(i,j );
1925 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
1927 store( &y[j ], xmm1*factor );
1928 store( &y[j+IT::size], xmm2*factor );
1932 for(
size_t i=0UL; i<M; ++i ) {
1933 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
1935 store( &y[j], xmm1*factor );
// Fallback "BLAS" kernel for the scaled assignment: forwards to the default
// kernel, passing the scalar through.
1953 template<
typename VT1
1957 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1958 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1960 selectDefaultAssignKernel( y, x, A, scalar );
// Single-precision BLAS kernel for y = (A^T * x) * scalar: the scalar is
// passed directly as sgemv's alpha (beta = 0 overwrites y).
1979 template<
typename VT1
1983 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
1984 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1986 using boost::numeric_cast;
1992 const int M ( numeric_cast<int>( A.rows() ) );
1993 const int N ( numeric_cast<int>( A.columns() ) );
1994 const int lda( numeric_cast<int>( A.spacing() ) );
1996 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, scalar,
1997 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision BLAS kernel for y = (A^T * x) * scalar (alpha = scalar,
// beta = 0).
2017 template<
typename VT1
2021 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2022 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2024 using boost::numeric_cast;
2030 const int M ( numeric_cast<int>( A.rows() ) );
2031 const int N ( numeric_cast<int>( A.columns() ) );
2032 const int lda( numeric_cast<int>( A.spacing() ) );
2034 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2035 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// complex<float> BLAS kernel for y = (A^T * x) * scalar: the scalar is
// converted to a complex alpha; beta = 0 overwrites y.
2055 template<
typename VT1
2059 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2060 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2062 using boost::numeric_cast;
2072 const int M ( numeric_cast<int>( A.rows() ) );
2073 const int N ( numeric_cast<int>( A.columns() ) );
2074 const int lda( numeric_cast<int>( A.spacing() ) );
2075 const complex<float> alpha( scalar );
2076 const complex<float> beta ( 0.0F, 0.0F );
2078 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2079 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> BLAS kernel for y = (A^T * x) * scalar (alpha built from
// the scalar, beta = 0).
2099 template<
typename VT1
2103 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2104 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2106 using boost::numeric_cast;
2116 const int M ( numeric_cast<int>( A.rows() ) );
2117 const int N ( numeric_cast<int>( A.columns() ) );
2118 const int lda( numeric_cast<int>( A.spacing() ) );
2119 const complex<double> alpha( scalar );
2120 const complex<double> beta ( 0.0, 0.0 );
2122 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2123 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2140 template<
typename VT1 >
2166 template<
typename VT1 >
2167 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2171 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2172 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
2174 if( right.rows() == 0UL || right.columns() == 0UL ) {
2186 if( ( IsComputation<MT>::value && !evaluate ) ||
2188 DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
2190 DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Default (scalar) kernel for the scaled addition assignment: delegates to
// the expression framework, evaluating x * A * scalar and adding it onto y.
2208 template<
typename VT1
2212 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2213 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2215 y.addAssign( x * A * scalar );
2233 template<
typename VT1
2237 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2238 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2240 typedef IntrinsicTrait<ElementType> IT;
2242 const size_t M( A.rows() );
2243 const size_t N( A.spacing() );
2249 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2250 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2251 for(
size_t i=0UL; i<M; ++i ) {
2253 xmm1 = xmm1 + x1 * A.get(i,j );
2254 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2255 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2256 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2257 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
2258 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
2259 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
2260 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
2262 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2263 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) + xmm2*factor );
2264 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) + xmm3*factor );
2265 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) + xmm4*factor );
2266 store( &y[j+IT::size*4UL],
load( &y[j+IT::size*4UL] ) + xmm5*factor );
2267 store( &y[j+IT::size*5UL],
load( &y[j+IT::size*5UL] ) + xmm6*factor );
2268 store( &y[j+IT::size*6UL],
load( &y[j+IT::size*6UL] ) + xmm7*factor );
2269 store( &y[j+IT::size*7UL],
load( &y[j+IT::size*7UL] ) + xmm8*factor );
2271 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2273 for(
size_t i=0UL; i<M; ++i ) {
2275 xmm1 = xmm1 + x1 * A.get(i,j );
2276 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2277 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2278 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2280 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2281 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) + xmm2*factor );
2282 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) + xmm3*factor );
2283 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) + xmm4*factor );
2285 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
2287 for(
size_t i=0UL; i<M; ++i ) {
2289 xmm1 = xmm1 + x1 * A.get(i,j );
2290 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2291 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2293 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2294 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) + xmm2*factor );
2295 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) + xmm3*factor );
2297 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2299 for(
size_t i=0UL; i<M; ++i ) {
2301 xmm1 = xmm1 + x1 * A.get(i,j );
2302 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
2304 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2305 store( &y[j+IT::size],
load( &y[j+IT::size] ) + xmm2*factor );
2309 for(
size_t i=0UL; i<M; ++i ) {
2310 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
2312 store( &y[j],
load( &y[j] ) + xmm1*factor );
// Fallback "BLAS" kernel for the scaled addition assignment: forwards to the
// default add-assign kernel, passing the scalar through.
2331 template<
typename VT1
2335 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2336 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2338 selectDefaultAddAssignKernel( y, x, A, scalar );
// Single-precision BLAS kernel for y += (A^T * x) * scalar (alpha = scalar,
// beta = 1 keeps and accumulates onto y).
2357 template<
typename VT1
2361 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2362 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2364 using boost::numeric_cast;
2370 const int M ( numeric_cast<int>( A.rows() ) );
2371 const int N ( numeric_cast<int>( A.columns() ) );
2372 const int lda( numeric_cast<int>( A.spacing() ) );
2374 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2375 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS kernel for y += (A^T * x) * scalar (alpha = scalar,
// beta = 1).
2395 template<
typename VT1
2399 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2400 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2402 using boost::numeric_cast;
2408 const int M ( numeric_cast<int>( A.rows() ) );
2409 const int N ( numeric_cast<int>( A.columns() ) );
2410 const int lda( numeric_cast<int>( A.spacing() ) );
2412 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2413 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
2433 template<
typename VT1
2437 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2438 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2440 using boost::numeric_cast;
2450 const int M ( numeric_cast<int>( A.rows() ) );
2451 const int N ( numeric_cast<int>( A.columns() ) );
2452 const int lda( numeric_cast<int>( A.spacing() ) );
2453 const complex<float> alpha( scalar );
2454 const complex<float> beta ( 1.0F, 0.0F );
2456 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2457 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2477 template<
typename VT1
2481 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2482 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2484 using boost::numeric_cast;
2494 const int M ( numeric_cast<int>( A.rows() ) );
2495 const int N ( numeric_cast<int>( A.columns() ) );
2496 const int lda( numeric_cast<int>( A.spacing() ) );
2497 const complex<double> alpha( scalar );
2498 const complex<double> beta ( 1.0, 0.0 );
2500 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2501 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of a scaled transpose dense vector-dense matrix product
// to a dense row vector (y^T -= s * x^T * A).
// NOTE(review): the leading integers on these lines are line numbers from the
// original file left over by extraction, and several statements of the body
// (the evaluations producing the locals 'x' and 'A', the early return of the
// empty-matrix branch, the size threshold in the dispatch condition, and the
// internal asserts) were lost — restore them from the original file; this
// fragment does not compile as-is.
2522 template<
typename VT1 >
2523 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Split the inner vector/matrix multiplication into its two operands.
2527 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2528 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// An empty matrix contributes nothing; the (truncated) branch bails out early.
2530 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Dispatch: computations/small operands take the default kernel, otherwise BLAS.
2542 if( ( IsComputation<MT>::value && !evaluate ) ||
2544 DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
2546 DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
2564 template<
typename VT1
2568 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2569 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2571 y.subAssign( x * A * scalar );
// Vectorized default subtraction assignment of a scaled transpose dense
// vector-dense matrix multiplication (y^T -= s * x^T * A).
// NOTE(review): the leading integers are original-file line numbers left by
// extraction; several statements were lost (the 'factor' pack built from
// 'scalar', the column index 'size_t j( 0UL );', the per-row pack
// 'x1 = set( x[i] )', the IntrinsicType accumulator declarations of the
// narrower loops, and closing braces) — restore from the original file;
// this fragment does not compile as-is.
2589 template<
typename VT1
2593 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2594 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2596 typedef IntrinsicTrait<ElementType> IT;
2598 const size_t M( A.rows() );
// N is the padded row width (spacing), not columns(); the padding elements
// are zero, so processing them merely subtracts scaled zeros.
2599 const size_t N( A.spacing() );
// Main loop: 8 intrinsic packs of y columns per iteration; each pack
// accumulates the dot product over all M rows, then the scaled packs are
// subtracted from y via load/modify/store.
2605 for( ; (j+IT::size*8UL) <= N; j+=IT::size*8UL ) {
2606 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2607 for(
size_t i=0UL; i<M; ++i ) {
2609 xmm1 = xmm1 + x1 * A.get(i,j );
2610 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2611 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2612 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2613 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
2614 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
2615 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
2616 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
2618 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2619 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) - xmm2*factor );
2620 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) - xmm3*factor );
2621 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) - xmm4*factor );
2622 store( &y[j+IT::size*4UL],
load( &y[j+IT::size*4UL] ) - xmm5*factor );
2623 store( &y[j+IT::size*5UL],
load( &y[j+IT::size*5UL] ) - xmm6*factor );
2624 store( &y[j+IT::size*6UL],
load( &y[j+IT::size*6UL] ) - xmm7*factor );
2625 store( &y[j+IT::size*7UL],
load( &y[j+IT::size*7UL] ) - xmm8*factor );
// Remainder loop: 4 packs per iteration.
2627 for( ; (j+IT::size*4UL) <= N; j+=IT::size*4UL ) {
2629 for(
size_t i=0UL; i<M; ++i ) {
2631 xmm1 = xmm1 + x1 * A.get(i,j );
2632 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2633 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2634 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2636 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2637 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) - xmm2*factor );
2638 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) - xmm3*factor );
2639 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) - xmm4*factor );
// Remainder loop: 3 packs per iteration.
2641 for( ; (j+IT::size*3UL) <= N; j+=IT::size*3UL ) {
2643 for(
size_t i=0UL; i<M; ++i ) {
2645 xmm1 = xmm1 + x1 * A.get(i,j );
2646 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2647 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2649 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2650 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) - xmm2*factor );
2651 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) - xmm3*factor );
// Remainder loop: 2 packs per iteration.
2653 for( ; (j+IT::size*2UL) <= N; j+=IT::size*2UL ) {
2655 for(
size_t i=0UL; i<M; ++i ) {
2657 xmm1 = xmm1 + x1 * A.get(i,j );
2658 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
2660 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2661 store( &y[j+IT::size],
load( &y[j+IT::size] ) - xmm2*factor );
// Final remainder: a single pack at a time (here set( x[i] ) appears inline
// rather than through the hoisted x1 pack of the wider loops).
2665 for(
size_t i=0UL; i<M; ++i ) {
2666 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
2668 store( &y[j],
load( &y[j] ) - xmm1*factor );
2687 template<
typename VT1
2691 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2692 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2694 selectDefaultSubAssignKernel( y, x, A, scalar );
2713 template<
typename VT1
2717 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2718 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2720 using boost::numeric_cast;
2726 const int M ( numeric_cast<int>( A.rows() ) );
2727 const int N ( numeric_cast<int>( A.columns() ) );
2728 const int lda( numeric_cast<int>( A.spacing() ) );
2730 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, -scalar,
2731 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
2751 template<
typename VT1
2755 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2756 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2758 using boost::numeric_cast;
2764 const int M ( numeric_cast<int>( A.rows() ) );
2765 const int N ( numeric_cast<int>( A.columns() ) );
2766 const int lda( numeric_cast<int>( A.spacing() ) );
2768 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, -scalar,
2769 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
2789 template<
typename VT1
2793 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2794 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2796 using boost::numeric_cast;
2806 const int M ( numeric_cast<int>( A.rows() ) );
2807 const int N ( numeric_cast<int>( A.columns() ) );
2808 const int lda( numeric_cast<int>( A.spacing() ) );
2809 const complex<float> alpha( -scalar );
2810 const complex<float> beta ( 1.0F, 0.0F );
2812 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2813 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2833 template<
typename VT1
2837 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2838 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2840 using boost::numeric_cast;
2850 const int M ( numeric_cast<int>( A.rows() ) );
2851 const int N ( numeric_cast<int>( A.columns() ) );
2852 const int lda( numeric_cast<int>( A.spacing() ) );
2853 const complex<double> alpha( -scalar );
2854 const complex<double> beta ( 1.0, 0.0 );
2856 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2857 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Multiplication assignment of a scaled transpose dense vector-dense matrix
// product to a dense row vector (y^T *= s * x^T * A).
// NOTE(review): only the signature survived extraction (the leading integers
// are original-file line numbers); the entire function body was lost and must
// be restored from the original file.
2878 template<
typename VT1 >
2879 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Fragment of a free operator for the transpose dense vector-dense matrix
// multiplication.  NOTE(review): the leading integers are original-file line
// numbers; the remainder of the template parameter list, the signature, and
// the rest of the body were lost in extraction — only the argument check
// survives.  Restore from the original file.
2949 template<
typename T1
// Argument check: the row vector's length must match the matrix's row count;
// a mismatch is reported to the caller via std::invalid_argument.
2954 if( (~vec).
size() != (~mat).
rows() )
2955 throw std::invalid_argument(
"Vector and matrix sizes do not match" );
2983 template<
typename T1
2986 inline const typename EnableIf< IsMatMatMultExpr<T2>, MultExprTrait<T1,T2> >::Type::Type