35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_ 123 template<
typename MT
125 class TDMatDVecMultExpr :
public DenseVector< TDMatDVecMultExpr<MT,VT>, false >
126 ,
private MatVecMultExpr
127 ,
private Computation
156 template<
typename T1 >
157 struct UseSMPAssign {
158 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
168 template<
typename T1,
typename T2,
typename T3 >
169 struct UseBlasKernel {
175 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
191 template<
typename T1,
typename T2,
typename T3 >
192 struct UseVectorizedDefaultKernel {
195 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
231 MT::simdEnabled && VT::simdEnabled &&
236 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
237 !evaluateVector && VT::smpAssignable };
270 return mat_(index,index) *
vec_[index];
280 const size_t n (
mat_.columns() -
begin );
297 inline ReturnType
at(
size_t index )
const {
298 if( index >=
mat_.rows() ) {
301 return (*
this)[index];
310 inline size_t size() const noexcept {
341 template<
typename T >
342 inline bool canAlias(
const T* alias )
const noexcept {
343 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
353 template<
typename T >
354 inline bool isAliased(
const T* alias )
const noexcept {
355 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
365 return mat_.isAligned() &&
vec_.isAligned();
377 (
mat_.rows() *
mat_.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
378 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
401 template<
typename VT1 >
408 if( rhs.mat_.rows() == 0UL ) {
411 else if( rhs.mat_.columns() == 0UL ) {
416 LT A(
serial( rhs.mat_ ) );
417 RT x(
serial( rhs.vec_ ) );
424 TDMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
440 template<
typename VT1
443 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
447 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
448 selectSmallAssignKernel( y, A, x );
450 selectBlasAssignKernel( y, A, x );
469 template<
typename VT1
472 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
474 const size_t M( A.rows() );
475 const size_t N( A.columns() );
484 y[i] = A(i,0UL) * x[0UL];
492 y[j] = A(j,j) * x[j];
504 const size_t inum( iend - ibegin );
505 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
507 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
508 y[i ] += A(i ,j) * x[j];
509 y[i+1UL] += A(i+1UL,j) * x[j];
512 y[ipos] += A(ipos,j) * x[j];
515 y[iend] = A(iend,j) * x[j];
541 template<
typename VT1
545 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
547 selectDefaultAssignKernel( y, A, x );
566 template<
typename VT1
570 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
574 const size_t M( A.rows() );
575 const size_t N( A.columns() );
577 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
582 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
592 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
594 for(
size_t j=jbegin; j<jend; ++j ) {
595 const SIMDType x1(
set( x[j] ) );
596 xmm1 += A.load(i ,j) * x1;
597 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
598 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
599 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
600 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
601 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
602 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
603 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
607 y.store( i+SIMDSIZE , xmm2 );
608 y.store( i+SIMDSIZE*2UL, xmm3 );
609 y.store( i+SIMDSIZE*3UL, xmm4 );
610 y.store( i+SIMDSIZE*4UL, xmm5 );
611 y.store( i+SIMDSIZE*5UL, xmm6 );
612 y.store( i+SIMDSIZE*6UL, xmm7 );
613 y.store( i+SIMDSIZE*7UL, xmm8 );
616 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
626 SIMDType xmm1, xmm2, xmm3, xmm4;
628 for(
size_t j=jbegin; j<jend; ++j ) {
629 const SIMDType x1(
set( x[j] ) );
630 xmm1 += A.load(i ,j) * x1;
631 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
632 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
633 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
637 y.store( i+SIMDSIZE , xmm2 );
638 y.store( i+SIMDSIZE*2UL, xmm3 );
639 y.store( i+SIMDSIZE*3UL, xmm4 );
642 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
652 SIMDType xmm1, xmm2, xmm3;
654 for(
size_t j=jbegin; j<jend; ++j ) {
655 const SIMDType x1(
set( x[j] ) );
656 xmm1 += A.load(i ,j) * x1;
657 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
658 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
662 y.store( i+SIMDSIZE , xmm2 );
663 y.store( i+SIMDSIZE*2UL, xmm3 );
666 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
678 for(
size_t j=jbegin; j<jend; ++j ) {
679 const SIMDType x1(
set( x[j] ) );
680 xmm1 += A.load(i ,j) * x1;
681 xmm2 += A.load(i+SIMDSIZE,j) * x1;
685 y.store( i+SIMDSIZE, xmm2 );
688 for( ; i<ipos; i+=SIMDSIZE )
700 for(
size_t j=jbegin; j<jend; ++j ) {
701 xmm1 += A.load(i,j) *
set( x[j] );
707 for( ; remainder && i<M; ++i )
719 for(
size_t j=jbegin; j<jend; ++j ) {
720 value += A(i,j) * x[j];
743 template<
typename VT1
747 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
749 selectDefaultAssignKernel( y, A, x );
768 template<
typename VT1
772 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
776 const size_t M( A.rows() );
777 const size_t N( A.columns() );
779 const size_t iblock( 32768UL /
sizeof( ElementType ) );
780 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
786 for(
size_t ii=0U; ii<M; ii+=iblock ) {
787 for(
size_t jj=0UL; jj<N; jj+=jblock )
789 const size_t jend(
min( jj+jblock, N ) );
790 const size_t itmp(
min( ii+iblock, M ) );
795 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
796 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
802 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
804 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
806 for(
size_t j=jj; j<jend; ++j ) {
807 const SIMDType x1(
set( x[j] ) );
808 xmm1 += A.load(i ,j) * x1;
809 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
810 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
811 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
812 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
813 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
814 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
815 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
818 y.store( i , y.load(i ) + xmm1 );
819 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
820 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
821 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
822 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
823 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
824 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
825 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
828 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
830 SIMDType xmm1, xmm2, xmm3, xmm4;
832 for(
size_t j=jj; j<jend; ++j ) {
833 const SIMDType x1(
set( x[j] ) );
834 xmm1 += A.load(i ,j) * x1;
835 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
836 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
837 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
840 y.store( i , y.load(i ) + xmm1 );
841 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
842 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
843 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
846 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
848 SIMDType xmm1, xmm2, xmm3;
850 for(
size_t j=jj; j<jend; ++j ) {
851 const SIMDType x1(
set( x[j] ) );
852 xmm1 += A.load(i ,j) * x1;
853 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
854 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
857 y.store( i , y.load(i ) + xmm1 );
858 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
859 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
862 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
866 for(
size_t j=jj; j<jend; ++j ) {
867 const SIMDType x1(
set( x[j] ) );
868 xmm1 += A.load(i ,j) * x1;
869 xmm2 += A.load(i+SIMDSIZE,j) * x1;
872 y.store( i , y.load(i ) + xmm1 );
873 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
876 for( ; i<ipos; i+=SIMDSIZE )
880 for(
size_t j=jj; j<jend; ++j ) {
881 xmm1 += A.load(i,j) *
set( x[j] );
884 y.store( i, y.load(i) + xmm1 );
887 for( ; remainder && i<iend; ++i )
891 for(
size_t j=jj; j<jend; ++j ) {
892 value += A(i,j) * x[j];
917 template<
typename VT1
921 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
923 selectLargeAssignKernel( y, A, x );
929 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 943 template<
typename VT1
947 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
956 gemv( y, A, x, ET(1), ET(0) );
976 template<
typename VT1 >
987 const ResultType tmp(
serial( rhs ) );
1006 template<
typename VT1 >
1013 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1017 LT A(
serial( rhs.mat_ ) );
1018 RT x(
serial( rhs.vec_ ) );
1025 TDMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
1041 template<
typename VT1
1044 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1048 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1049 selectSmallAddAssignKernel( y, A, x );
1051 selectBlasAddAssignKernel( y, A, x );
1070 template<
typename VT1
1073 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1075 const size_t M( A.rows() );
1076 const size_t N( A.columns() );
1078 for(
size_t j=0UL; j<N; ++j )
1082 y[j] += A(j,j) * x[j];
1094 const size_t inum( iend - ibegin );
1095 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1097 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1098 y[i ] += A(i ,j) * x[j];
1099 y[i+1UL] += A(i+1UL,j) * x[j];
1102 y[ipos] += A(ipos,j) * x[j];
1124 template<
typename VT1
1128 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1130 selectDefaultAddAssignKernel( y, A, x );
1149 template<
typename VT1
1153 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1157 const size_t M( A.rows() );
1158 const size_t N( A.columns() );
1160 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
1165 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1175 SIMDType xmm1( y.load(i ) );
1176 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1177 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1178 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1179 SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
1180 SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
1181 SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
1182 SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );
1184 for(
size_t j=jbegin; j<jend; ++j ) {
1185 const SIMDType x1(
set( x[j] ) );
1186 xmm1 += A.load(i ,j) * x1;
1187 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1188 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1189 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1190 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
1191 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
1192 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
1193 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
1196 y.store( i , xmm1 );
1197 y.store( i+SIMDSIZE , xmm2 );
1198 y.store( i+SIMDSIZE*2UL, xmm3 );
1199 y.store( i+SIMDSIZE*3UL, xmm4 );
1200 y.store( i+SIMDSIZE*4UL, xmm5 );
1201 y.store( i+SIMDSIZE*5UL, xmm6 );
1202 y.store( i+SIMDSIZE*6UL, xmm7 );
1203 y.store( i+SIMDSIZE*7UL, xmm8 );
1206 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1216 SIMDType xmm1( y.load(i ) );
1217 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1218 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1219 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1221 for(
size_t j=jbegin; j<jend; ++j ) {
1222 const SIMDType x1(
set( x[j] ) );
1223 xmm1 += A.load(i ,j) * x1;
1224 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1225 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1226 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1229 y.store( i , xmm1 );
1230 y.store( i+SIMDSIZE , xmm2 );
1231 y.store( i+SIMDSIZE*2UL, xmm3 );
1232 y.store( i+SIMDSIZE*3UL, xmm4 );
1235 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1245 SIMDType xmm1( y.load(i ) );
1246 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1247 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1249 for(
size_t j=jbegin; j<jend; ++j ) {
1250 const SIMDType x1(
set( x[j] ) );
1251 xmm1 += A.load(i ,j) * x1;
1252 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1253 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1256 y.store( i , xmm1 );
1257 y.store( i+SIMDSIZE , xmm2 );
1258 y.store( i+SIMDSIZE*2UL, xmm3 );
1261 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1271 SIMDType xmm1( y.load(i ) );
1272 SIMDType xmm2( y.load(i+SIMDSIZE) );
1274 for(
size_t j=jbegin; j<jend; ++j ) {
1275 const SIMDType x1(
set( x[j] ) );
1276 xmm1 += A.load(i ,j) * x1;
1277 xmm2 += A.load(i+SIMDSIZE,j) * x1;
1280 y.store( i , xmm1 );
1281 y.store( i+SIMDSIZE, xmm2 );
1284 for( ; i<ipos; i+=SIMDSIZE )
1294 SIMDType xmm1( y.load(i) );
1296 for(
size_t j=jbegin; j<jend; ++j ) {
1297 xmm1 += A.load(i,j) *
set( x[j] );
1303 for( ; remainder && i<M; ++i )
1315 for(
size_t j=jbegin; j<jend; ++j ) {
1316 value += A(i,j) * x[j];
1339 template<
typename VT1
1343 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1345 selectDefaultAddAssignKernel( y, A, x );
1364 template<
typename VT1
1368 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1372 const size_t M( A.rows() );
1373 const size_t N( A.columns() );
1375 const size_t iblock( 32768UL /
sizeof( ElementType ) );
1376 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1380 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1381 for(
size_t jj=0UL; jj<N; jj+=jblock )
1383 const size_t jend(
min( jj+jblock, N ) );
1384 const size_t itmp(
min( ii+iblock, M ) );
1389 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1390 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
1396 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1398 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1400 for(
size_t j=jj; j<jend; ++j ) {
1401 const SIMDType x1(
set( x[j] ) );
1402 xmm1 += A.load(i ,j) * x1;
1403 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1404 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1405 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1406 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
1407 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
1408 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
1409 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
1412 y.store( i , y.load(i ) + xmm1 );
1413 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1414 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1415 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1416 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
1417 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
1418 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
1419 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
1422 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1424 SIMDType xmm1, xmm2, xmm3, xmm4;
1426 for(
size_t j=jj; j<jend; ++j ) {
1427 const SIMDType x1(
set( x[j] ) );
1428 xmm1 += A.load(i ,j) * x1;
1429 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1430 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1431 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1434 y.store( i , y.load(i ) + xmm1 );
1435 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1436 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1437 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1440 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1442 SIMDType xmm1, xmm2, xmm3;
1444 for(
size_t j=jj; j<jend; ++j ) {
1445 const SIMDType x1(
set( x[j] ) );
1446 xmm1 += A.load(i ,j) * x1;
1447 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1448 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1451 y.store( i , y.load(i ) + xmm1 );
1452 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1453 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1456 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1458 SIMDType xmm1, xmm2;
1460 for(
size_t j=jj; j<jend; ++j ) {
1461 const SIMDType x1(
set( x[j] ) );
1462 xmm1 += A.load(i ,j) * x1;
1463 xmm2 += A.load(i+SIMDSIZE,j) * x1;
1466 y.store( i , y.load(i ) + xmm1 );
1467 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
1470 for( ; i<ipos; i+=SIMDSIZE )
1474 for(
size_t j=jj; j<jend; ++j ) {
1475 xmm1 += A.load(i,j) *
set( x[j] );
1478 y.store( i, y.load(i) + xmm1 );
1481 for( ; remainder && i<iend; ++i )
1485 for(
size_t j=jj; j<jend; ++j ) {
1486 value += A(i,j) * x[j];
1511 template<
typename VT1
1515 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1517 selectLargeAddAssignKernel( y, A, x );
1523 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1537 template<
typename VT1
1541 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1548 addAssign( y, tmp );
1551 gemv( y, A, x, ET(1), ET(1) );
1575 template<
typename VT1 >
1582 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1586 LT A(
serial( rhs.mat_ ) );
1587 RT x(
serial( rhs.vec_ ) );
1594 TDMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1610 template<
typename VT1
1613 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1617 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1618 selectSmallSubAssignKernel( y, A, x );
1620 selectBlasSubAssignKernel( y, A, x );
1639 template<
typename VT1
1642 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1644 const size_t M( A.rows() );
1645 const size_t N( A.columns() );
1647 for(
size_t j=0UL; j<N; ++j )
1651 y[j] -= A(j,j) * x[j];
1663 const size_t inum( iend - ibegin );
1664 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1666 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1667 y[i ] -= A(i ,j) * x[j];
1668 y[i+1UL] -= A(i+1UL,j) * x[j];
1671 y[ipos] -= A(ipos,j) * x[j];
1693 template<
typename VT1
1697 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1699 selectDefaultSubAssignKernel( y, A, x );
1719 template<
typename VT1
1723 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1727 const size_t M( A.rows() );
1728 const size_t N( A.columns() );
1730 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
1735 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1745 SIMDType xmm1( y.load(i ) );
1746 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1747 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1748 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1749 SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
1750 SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
1751 SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
1752 SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );
1754 for(
size_t j=jbegin; j<jend; ++j ) {
1755 const SIMDType x1(
set( x[j] ) );
1756 xmm1 -= A.load(i ,j) * x1;
1757 xmm2 -= A.load(i+SIMDSIZE ,j) * x1;
1758 xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
1759 xmm4 -= A.load(i+SIMDSIZE*3UL,j) * x1;
1760 xmm5 -= A.load(i+SIMDSIZE*4UL,j) * x1;
1761 xmm6 -= A.load(i+SIMDSIZE*5UL,j) * x1;
1762 xmm7 -= A.load(i+SIMDSIZE*6UL,j) * x1;
1763 xmm8 -= A.load(i+SIMDSIZE*7UL,j) * x1;
1766 y.store( i , xmm1 );
1767 y.store( i+SIMDSIZE , xmm2 );
1768 y.store( i+SIMDSIZE*2UL, xmm3 );
1769 y.store( i+SIMDSIZE*3UL, xmm4 );
1770 y.store( i+SIMDSIZE*4UL, xmm5 );
1771 y.store( i+SIMDSIZE*5UL, xmm6 );
1772 y.store( i+SIMDSIZE*6UL, xmm7 );
1773 y.store( i+SIMDSIZE*7UL, xmm8 );
1776 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1786 SIMDType xmm1( y.load(i ) );
1787 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1788 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1789 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1791 for(
size_t j=jbegin; j<jend; ++j ) {
1792 const SIMDType x1(
set( x[j] ) );
1793 xmm1 -= A.load(i ,j) * x1;
1794 xmm2 -= A.load(i+SIMDSIZE ,j) * x1;
1795 xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
1796 xmm4 -= A.load(i+SIMDSIZE*3UL,j) * x1;
1799 y.store( i , xmm1 );
1800 y.store( i+SIMDSIZE , xmm2 );
1801 y.store( i+SIMDSIZE*2UL, xmm3 );
1802 y.store( i+SIMDSIZE*3UL, xmm4 );
1805 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1815 SIMDType xmm1( y.load(i ) );
1816 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1817 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1819 for(
size_t j=jbegin; j<jend; ++j ) {
1820 const SIMDType x1(
set( x[j] ) );
1821 xmm1 -= A.load(i ,j) * x1;
1822 xmm2 -= A.load(i+SIMDSIZE ,j) * x1;
1823 xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
1826 y.store( i , xmm1 );
1827 y.store( i+SIMDSIZE , xmm2 );
1828 y.store( i+SIMDSIZE*2UL, xmm3 );
1831 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1841 SIMDType xmm1( y.load(i ) );
1842 SIMDType xmm2( y.load(i+SIMDSIZE) );
1844 for(
size_t j=jbegin; j<jend; ++j ) {
1845 const SIMDType x1(
set( x[j] ) );
1846 xmm1 -= A.load(i ,j) * x1;
1847 xmm2 -= A.load(i+SIMDSIZE,j) * x1;
1850 y.store( i , xmm1 );
1851 y.store( i+SIMDSIZE, xmm2 );
1854 for( ; i<ipos; i+=SIMDSIZE )
1864 SIMDType xmm1( y.load(i) );
1866 for(
size_t j=jbegin; j<jend; ++j ) {
1867 xmm1 -= A.load(i,j) *
set( x[j] );
1873 for( ; remainder && i<M; ++i )
1885 for(
size_t j=jbegin; j<jend; ++j ) {
1886 value += A(i,j) * x[j];
1909 template<
typename VT1
1913 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1915 selectDefaultSubAssignKernel( y, A, x );
1935 template<
typename VT1
1939 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1943 const size_t M( A.rows() );
1944 const size_t N( A.columns() );
1946 const size_t iblock( 32768UL /
sizeof( ElementType ) );
1947 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1951 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1952 for(
size_t jj=0UL; jj<N; jj+=jblock )
1954 const size_t jend(
min( jj+jblock, N ) );
1955 const size_t itmp(
min( ii+iblock, M ) );
1960 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1961 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
1967 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1969 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1971 for(
size_t j=jj; j<jend; ++j ) {
1972 const SIMDType x1(
set( x[j] ) );
1973 xmm1 += A.load(i ,j) * x1;
1974 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1975 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1976 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1977 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
1978 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
1979 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
1980 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
1983 y.store( i , y.load(i ) - xmm1 );
1984 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
1985 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
1986 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
1987 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5 );
1988 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6 );
1989 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7 );
1990 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8 );
1993 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1995 SIMDType xmm1, xmm2, xmm3, xmm4;
1997 for(
size_t j=jj; j<jend; ++j ) {
1998 const SIMDType x1(
set( x[j] ) );
1999 xmm1 += A.load(i ,j) * x1;
2000 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2001 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2002 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2005 y.store( i , y.load(i ) - xmm1 );
2006 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2007 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2008 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
2011 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2013 SIMDType xmm1, xmm2, xmm3;
2015 for(
size_t j=jj; j<jend; ++j ) {
2016 const SIMDType x1(
set( x[j] ) );
2017 xmm1 += A.load(i ,j) * x1;
2018 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2019 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2022 y.store( i , y.load(i ) - xmm1 );
2023 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2024 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2027 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2029 SIMDType xmm1, xmm2;
2031 for(
size_t j=jj; j<jend; ++j ) {
2032 const SIMDType x1(
set( x[j] ) );
2033 xmm1 += A.load(i ,j) * x1;
2034 xmm2 += A.load(i+SIMDSIZE,j) * x1;
2037 y.store( i , y.load(i ) - xmm1 );
2038 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2 );
2041 for( ; i<ipos; i+=SIMDSIZE )
2045 for(
size_t j=jj; j<jend; ++j ) {
2046 xmm1 += A.load(i,j) *
set( x[j] );
2049 y.store( i, y.load(i) - xmm1 );
2052 for( ; remainder && i<iend; ++i )
2056 for(
size_t j=jj; j<jend; ++j ) {
2057 value += A(i,j) * x[j];
2082 template<
typename VT1
2086 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2088 selectLargeSubAssignKernel( y, A, x );
2094 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2108 template<
typename VT1
2112 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2119 subAssign( y, tmp );
2122 gemv( y, A, x, ET(-1), ET(1) );
2146 template<
typename VT1 >
2157 const ResultType tmp(
serial( rhs ) );
2158 multAssign( ~lhs, tmp );
2180 template<
typename VT1 >
2191 const ResultType tmp(
serial( rhs ) );
2192 divAssign( ~lhs, tmp );
2216 template<
typename VT1 >
2224 if( rhs.mat_.rows() == 0UL ) {
2227 else if( rhs.mat_.columns() == 0UL ) {
2260 template<
typename VT1 >
2272 const ResultType tmp( rhs );
2293 template<
typename VT1 >
2301 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2337 template<
typename VT1 >
2345 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2381 template<
typename VT1 >
2393 const ResultType tmp( rhs );
2418 template<
typename VT1 >
2430 const ResultType tmp( rhs );
2470 template<
typename MT
2474 :
public DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false >
2506 template<
typename T1 >
2507 struct UseSMPAssign {
2508 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
2516 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2517 struct UseBlasKernel {
2523 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2538 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2539 struct UseVectorizedDefaultKernel {
2542 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2578 MT::simdEnabled && VT::simdEnabled &&
2584 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2585 !evaluateVector && VT::smpAssignable };
2611 inline ReturnType
operator[](
size_t index )
const {
2613 return vector_[index] * scalar_;
2624 inline ReturnType
at(
size_t index )
const {
2625 if( index >= vector_.size() ) {
2628 return (*
this)[index];
2637 inline size_t size()
const {
2638 return vector_.size();
2668 template<
typename T >
2669 inline bool canAlias(
const T* alias )
const {
2670 return vector_.canAlias( alias );
2680 template<
typename T >
2681 inline bool isAliased(
const T* alias )
const {
2682 return vector_.isAliased( alias );
2692 return vector_.isAligned();
2705 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
2706 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
2712 LeftOperand vector_;
2713 RightOperand scalar_;
2728 template<
typename VT1 >
2738 if( left.rows() == 0UL ) {
2741 else if( left.columns() == 0UL ) {
2754 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.
scalar_ );
2769 template<
typename VT1
2773 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2777 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
2778 selectSmallAssignKernel( y, A, x, scalar );
2780 selectBlasAssignKernel( y, A, x, scalar );
2798 template<
typename VT1
2802 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2804 const size_t M( A.rows() );
2805 const size_t N( A.columns() );
2814 y[i] = A(i,0UL) * x[0UL];
2822 y[j] = A(j,j) * x[j] * scalar;
2834 const size_t inum( iend - ibegin );
2835 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2837 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2838 y[i ] += A(i ,j) * x[j];
2839 y[i+1UL] += A(i+1UL,j) * x[j];
2842 y[ipos] += A(ipos,j) * x[j];
2845 y[iend] = A(iend,j) * x[j];
2878 template<
typename VT1
2883 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2885 selectDefaultAssignKernel( y, A, x, scalar );
2903 template<
typename VT1
2908 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2912 const size_t M( A.rows() );
2913 const size_t N( A.columns() );
2915 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
2918 const SIMDType factor(
set( scalar ) );
2922 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
2932 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2934 for(
size_t j=jbegin; j<jend; ++j ) {
2935 const SIMDType x1(
set( x[j] ) );
2936 xmm1 += A.load(i ,j) * x1;
2937 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2938 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2939 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2940 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
2941 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
2942 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
2943 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
2946 y.store( i , xmm1*factor );
2947 y.store( i+SIMDSIZE , xmm2*factor );
2948 y.store( i+SIMDSIZE*2UL, xmm3*factor );
2949 y.store( i+SIMDSIZE*3UL, xmm4*factor );
2950 y.store( i+SIMDSIZE*4UL, xmm5*factor );
2951 y.store( i+SIMDSIZE*5UL, xmm6*factor );
2952 y.store( i+SIMDSIZE*6UL, xmm7*factor );
2953 y.store( i+SIMDSIZE*7UL, xmm8*factor );
2956 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2966 SIMDType xmm1, xmm2, xmm3, xmm4;
2968 for(
size_t j=jbegin; j<jend; ++j ) {
2969 const SIMDType x1(
set( x[j] ) );
2970 xmm1 += A.load(i ,j) * x1;
2971 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2972 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2973 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2976 y.store( i , xmm1*factor );
2977 y.store( i+SIMDSIZE , xmm2*factor );
2978 y.store( i+SIMDSIZE*2UL, xmm3*factor );
2979 y.store( i+SIMDSIZE*3UL, xmm4*factor );
2982 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2992 SIMDType xmm1, xmm2, xmm3;
2994 for(
size_t j=jbegin; j<jend; ++j ) {
2995 const SIMDType x1(
set( x[j] ) );
2996 xmm1 += A.load(i ,j) * x1;
2997 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2998 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3001 y.store( i , xmm1*factor );
3002 y.store( i+SIMDSIZE , xmm2*factor );
3003 y.store( i+SIMDSIZE*2UL, xmm3*factor );
3006 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3016 SIMDType xmm1, xmm2;
3018 for(
size_t j=jbegin; j<jend; ++j ) {
3019 const SIMDType x1(
set( x[j] ) );
3020 xmm1 += A.load(i ,j) * x1;
3021 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3024 y.store( i , xmm1*factor );
3025 y.store( i+SIMDSIZE, xmm2*factor );
3028 for( ; i<ipos; i+=SIMDSIZE )
3040 for(
size_t j=jbegin; j<jend; ++j ) {
3041 const SIMDType x1(
set( x[j] ) );
3042 xmm1 += A.load(i,j) * x1;
3045 y.store( i, xmm1*factor );
3048 for( ; remainder && i<M; ++i )
3060 for(
size_t j=jbegin; j<jend; ++j ) {
3061 value += A(i,j) * x[j];
3064 y[i] = value * scalar;
3083 template<
typename VT1
3088 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3090 selectDefaultAssignKernel( y, A, x, scalar );
3108 template<
typename VT1
3113 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3117 const size_t M( A.rows() );
3118 const size_t N( A.columns() );
3120 const size_t iblock( 32768UL /
sizeof( ElementType ) );
3121 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3125 const SIMDType factor(
set( scalar ) );
3129 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3130 for(
size_t jj=0UL; jj<N; jj+=jblock )
3132 const size_t jend(
min( jj+jblock, N ) );
3133 const size_t itmp(
min( ii+iblock, M ) );
3138 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3139 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
3145 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3147 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3149 for(
size_t j=jj; j<jend; ++j ) {
3150 const SIMDType x1(
set( x[j] ) );
3151 xmm1 += A.load(i ,j) * x1;
3152 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3153 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3154 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3155 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3156 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3157 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3158 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3161 y.store( i , y.load(i ) + xmm1*factor );
3162 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3163 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3164 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3165 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3166 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3167 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3168 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3171 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3173 SIMDType xmm1, xmm2, xmm3, xmm4;
3175 for(
size_t j=jj; j<jend; ++j ) {
3176 const SIMDType x1(
set( x[j] ) );
3177 xmm1 += A.load(i ,j) * x1;
3178 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3179 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3180 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3183 y.store( i , y.load(i ) + xmm1*factor );
3184 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3185 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3186 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3189 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3191 SIMDType xmm1, xmm2, xmm3;
3193 for(
size_t j=jj; j<jend; ++j ) {
3194 const SIMDType x1(
set( x[j] ) );
3195 xmm1 += A.load(i ,j) * x1;
3196 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3197 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3200 y.store( i , y.load(i ) + xmm1*factor );
3201 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3202 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3205 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3207 SIMDType xmm1, xmm2;
3209 for(
size_t j=jj; j<jend; ++j ) {
3210 const SIMDType x1(
set( x[j] ) );
3211 xmm1 += A.load(i ,j) * x1;
3212 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3215 y.store( i , y.load(i ) + xmm1*factor );
3216 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3219 for( ; i<ipos; i+=SIMDSIZE )
3223 for(
size_t j=jj; j<jend; ++j ) {
3224 xmm1 += A.load(i,j) *
set( x[j] );
3227 y.store( i, y.load(i) + xmm1*factor );
3230 for( ; remainder && i<iend; ++i )
3234 for(
size_t j=jj; j<jend; ++j ) {
3235 value += A(i,j) * x[j];
3238 y[i] += value * scalar;
3259 template<
typename VT1
3264 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3266 selectLargeAssignKernel( y, A, x, scalar );
3271 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3285 template<
typename VT1
3290 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3295 assign( y, scalar * x );
3299 gemv( y, A, x, ET(scalar), ET(0) );
3317 template<
typename VT1 >
3328 const ResultType tmp(
serial( rhs ) );
3329 assign( ~lhs, tmp );
3345 template<
typename VT1 >
3355 if( left.rows() == 0UL || left.columns() == 0UL ) {
3367 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.
scalar_ );
3382 template<
typename VT1
3386 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3390 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3391 selectSmallAddAssignKernel( y, A, x, scalar );
3393 selectBlasAddAssignKernel( y, A, x, scalar );
3411 template<
typename VT1
3415 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3417 y.addAssign( A * x * scalar );
3435 template<
typename VT1
3440 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3442 selectDefaultAddAssignKernel( y, A, x, scalar );
3461 template<
typename VT1
3466 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3470 const size_t M( A.rows() );
3471 const size_t N( A.columns() );
3473 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
3476 const SIMDType factor(
set( scalar ) );
3480 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3490 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3492 for(
size_t j=jbegin; j<jend; ++j ) {
3493 const SIMDType x1(
set( x[j] ) );
3494 xmm1 += A.load(i ,j) * x1;
3495 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3496 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3497 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3498 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3499 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3500 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3501 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3504 y.store( i , y.load(i ) + xmm1*factor );
3505 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3506 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3507 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3508 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3509 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3510 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3511 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3514 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3524 SIMDType xmm1, xmm2, xmm3, xmm4;
3526 for(
size_t j=jbegin; j<jend; ++j ) {
3527 const SIMDType x1(
set( x[j] ) );
3528 xmm1 += A.load(i ,j) * x1;
3529 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3530 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3531 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3534 y.store( i , y.load(i ) + xmm1*factor );
3535 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3536 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3537 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3540 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3550 SIMDType xmm1, xmm2, xmm3;
3552 for(
size_t j=jbegin; j<jend; ++j ) {
3553 const SIMDType x1(
set( x[j] ) );
3554 xmm1 += A.load(i ,j) * x1;
3555 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3556 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3559 y.store( i , y.load(i ) + xmm1*factor );
3560 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3561 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3564 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3574 SIMDType xmm1, xmm2;
3576 for(
size_t j=jbegin; j<jend; ++j ) {
3577 const SIMDType x1(
set( x[j] ) );
3578 xmm1 += A.load(i ,j) * x1;
3579 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3582 y.store( i , y.load(i ) + xmm1*factor );
3583 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3586 for( ; i<ipos; i+=SIMDSIZE )
3598 for(
size_t j=jbegin; j<jend; ++j ) {
3599 xmm1 += A.load(i,j) *
set( x[j] );
3602 y.store( i, y.load(i) + xmm1*factor );
3605 for( ; remainder && i<M; ++i )
3617 for(
size_t j=jbegin; j<jend; ++j ) {
3618 value += A(i,j) * x[j];
3621 y[i] += value * scalar;
3640 template<
typename VT1
3645 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3647 selectDefaultAddAssignKernel( y, A, x, scalar );
3666 template<
typename VT1
3671 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3675 const size_t M( A.rows() );
3676 const size_t N( A.columns() );
3678 const size_t iblock( 32768UL /
sizeof( ElementType ) );
3679 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3683 const SIMDType factor(
set( scalar ) );
3685 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3686 for(
size_t jj=0UL; jj<N; jj+=jblock )
3688 const size_t jend(
min( jj+jblock, N ) );
3689 const size_t itmp(
min( ii+iblock, M ) );
3694 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3695 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
3701 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3703 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3705 for(
size_t j=jj; j<jend; ++j ) {
3706 const SIMDType x1(
set( x[j] ) );
3707 xmm1 += A.load(i ,j) * x1;
3708 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3709 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3710 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3711 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3712 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3713 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3714 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3717 y.store( i , y.load(i ) + xmm1*factor );
3718 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3719 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3720 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3721 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3722 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3723 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3724 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3727 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3729 SIMDType xmm1, xmm2, xmm3, xmm4;
3731 for(
size_t j=jj; j<jend; ++j ) {
3732 const SIMDType x1(
set( x[j] ) );
3733 xmm1 += A.load(i ,j) * x1;
3734 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3735 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3736 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3739 y.store( i , y.load(i ) + xmm1*factor );
3740 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3741 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3742 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3745 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3747 SIMDType xmm1, xmm2, xmm3;
3749 for(
size_t j=jj; j<jend; ++j ) {
3750 const SIMDType x1(
set( x[j] ) );
3751 xmm1 += A.load(i ,j) * x1;
3752 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3753 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3756 y.store( i , y.load(i ) + xmm1*factor );
3757 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3758 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3761 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3763 SIMDType xmm1, xmm2;
3765 for(
size_t j=jj; j<jend; ++j ) {
3766 const SIMDType x1(
set( x[j] ) );
3767 xmm1 += A.load(i ,j) * x1;
3768 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3771 y.store( i , y.load(i ) + xmm1*factor );
3772 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3775 for( ; i<ipos; i+=SIMDSIZE )
3779 for(
size_t j=jj; j<jend; ++j ) {
3780 xmm1 += A.load(i,j) *
set( x[j] );
3783 y.store( i, y.load(i) + xmm1*factor );
3786 for( ; remainder && i<iend; ++i )
3790 for(
size_t j=jj; j<jend; ++j ) {
3791 value += A(i,j) * x[j];
3794 y[i] += value * scalar;
3815 template<
typename VT1
3820 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3822 selectLargeAddAssignKernel( y, A, x, scalar );
3827 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3841 template<
typename VT1
3846 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3853 addAssign( y, tmp );
3856 gemv( y, A, x, ET(scalar), ET(1) );
3878 template<
typename VT1 >
3888 if( left.rows() == 0UL || left.columns() == 0UL ) {
3900 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.
scalar_ );
3915 template<
typename VT1
3919 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3923 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3924 selectSmallSubAssignKernel( y, A, x, scalar );
3926 selectBlasSubAssignKernel( y, A, x, scalar );
3944 template<
typename VT1
3948 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3950 y.subAssign( A * x * scalar );
3968 template<
typename VT1
3973 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3975 selectDefaultSubAssignKernel( y, A, x, scalar );
3994 template<
typename VT1
3999 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4003 const size_t M( A.rows() );
4004 const size_t N( A.columns() );
4006 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
4009 const SIMDType factor(
set( scalar ) );
4013 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4023 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4025 for(
size_t j=jbegin; j<jend; ++j ) {
4026 const SIMDType x1(
set( x[j] ) );
4027 xmm1 += A.load(i ,j) * x1;
4028 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4029 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4030 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4031 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
4032 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
4033 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
4034 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
4037 y.store( i , y.load(i ) - xmm1*factor );
4038 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4039 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4040 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4041 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4042 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4043 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4044 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
4047 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4057 SIMDType xmm1, xmm2, xmm3, xmm4;
4059 for(
size_t j=jbegin; j<jend; ++j ) {
4060 const SIMDType x1(
set( x[j] ) );
4061 xmm1 += A.load(i ,j) * x1;
4062 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4063 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4064 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4067 y.store( i , y.load(i ) - xmm1*factor );
4068 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4069 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4070 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4073 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4083 SIMDType xmm1, xmm2, xmm3;
4085 for(
size_t j=jbegin; j<jend; ++j ) {
4086 const SIMDType x1(
set( x[j] ) );
4087 xmm1 += A.load(i ,j) * x1;
4088 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4089 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4092 y.store( i , y.load(i ) - xmm1*factor );
4093 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4094 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4097 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4107 SIMDType xmm1, xmm2;
4109 for(
size_t j=jbegin; j<jend; ++j ) {
4110 const SIMDType x1(
set( x[j] ) );
4111 xmm1 += A.load(i ,j) * x1;
4112 xmm2 += A.load(i+SIMDSIZE,j) * x1;
4115 y.store( i , y.load(i ) - xmm1*factor );
4116 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
4119 for( ; i<ipos; i+=SIMDSIZE )
4131 for(
size_t j=jbegin; j<jend; ++j ) {
4132 xmm1 += A.load(i,j) *
set( x[j] );
4135 y.store( i, y.load(i) - xmm1*factor );
4138 for( ; remainder && i<M; ++i )
4150 for(
size_t j=jbegin; j<jend; ++j ) {
4151 value += A(i,j) * x[j];
4154 y[i] -= value * scalar;
4173 template<
typename VT1
4178 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4180 selectDefaultSubAssignKernel( y, A, x, scalar );
4199 template<
typename VT1
4204 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4208 const size_t M( A.rows() );
4209 const size_t N( A.columns() );
4211 const size_t iblock( 32768UL /
sizeof( ElementType ) );
4212 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
4216 const SIMDType factor(
set( scalar ) );
4218 for(
size_t ii=0U; ii<M; ii+=iblock ) {
4219 for(
size_t jj=0UL; jj<N; jj+=jblock )
4221 const size_t jend(
min( jj+jblock, N ) );
4222 const size_t itmp(
min( ii+iblock, M ) );
4227 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4228 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
4234 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4236 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4238 for(
size_t j=jj; j<jend; ++j ) {
4239 const SIMDType x1(
set( x[j] ) );
4240 xmm1 += A.load(i ,j) * x1;
4241 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4242 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4243 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4244 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
4245 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
4246 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
4247 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
4250 y.store( i , y.load(i ) - xmm1*factor );
4251 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4252 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4253 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4254 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4255 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4256 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4257 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
4260 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4262 SIMDType xmm1, xmm2, xmm3, xmm4;
4264 for(
size_t j=jj; j<jend; ++j ) {
4265 const SIMDType x1(
set( x[j] ) );
4266 xmm1 += A.load(i ,j) * x1;
4267 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4268 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4269 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4272 y.store( i , y.load(i ) - xmm1*factor );
4273 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4274 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4275 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4278 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4280 SIMDType xmm1, xmm2, xmm3;
4282 for(
size_t j=jj; j<jend; ++j ) {
4283 const SIMDType x1(
set( x[j] ) );
4284 xmm1 += A.load(i ,j) * x1;
4285 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4286 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4289 y.store( i , y.load(i ) - xmm1*factor );
4290 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4291 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4294 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4296 SIMDType xmm1, xmm2;
4298 for(
size_t j=jj; j<jend; ++j ) {
4299 const SIMDType x1(
set( x[j] ) );
4300 xmm1 += A.load(i ,j) * x1;
4301 xmm2 += A.load(i+SIMDSIZE,j) * x1;
4304 y.store( i , y.load(i ) - xmm1*factor );
4305 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
4308 for( ; i<ipos; i+=SIMDSIZE )
4312 for(
size_t j=jj; j<jend; ++j ) {
4313 xmm1 += A.load(i,j) *
set( x[j] );
4316 y.store( i, y.load(i) - xmm1*factor );
4319 for( ; remainder && i<iend; ++i )
4323 for(
size_t j=jj; j<jend; ++j ) {
4324 value += A(i,j) * x[j];
4327 y[i] -= value * scalar;
4348 template<
typename VT1
4353 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4355 selectLargeSubAssignKernel( y, A, x, scalar );
4360 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4374 template<
typename VT1
4379 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4386 subAssign( y, tmp );
4389 gemv( y, A, x, ET(-scalar), ET(1) );
4411 template<
typename VT1 >
4422 const ResultType tmp(
serial( rhs ) );
4423 multAssign( ~lhs, tmp );
4443 template<
typename VT1 >
4454 const ResultType tmp(
serial( rhs ) );
4455 divAssign( ~lhs, tmp );
4477 template<
typename VT1 >
4488 if( left.rows() == 0UL ) {
4491 else if( left.columns() == 0UL ) {
4522 template<
typename VT1 >
4534 const ResultType tmp( rhs );
4553 template<
typename VT1 >
4564 if( left.rows() == 0UL || left.columns() == 0UL ) {
4598 template<
typename VT1 >
4609 if( left.rows() == 0UL || left.columns() == 0UL ) {
4644 template<
typename VT1 >
4656 const ResultType tmp( rhs );
4679 template<
typename VT1 >
4691 const ResultType tmp( rhs );
4754 template<
typename T1
4780 template<
typename MT,
typename VT >
4781 struct Size< TDMatDVecMultExpr<MT,VT> > :
public Rows<MT>
4797 template<
typename MT,
typename VT >
4798 struct IsAligned< TDMatDVecMultExpr<MT,VT> >
4799 :
public BoolConstant< And< IsAligned<MT>, IsAligned<VT> >::value >
4815 template<
typename MT,
typename VT,
bool AF >
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:131
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Evaluation of the expression type type of a subvector operation.Via this type trait it is possible to...
Definition: SubvectorExprTrait.h:79
Header file for the Rows type trait.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDMatDVecMultExpr.h:297
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:208
Header file for basic type definitions.
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:135
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
TDMatDVecMultExpr< MT, VT > This
Type of this TDMatDVecMultExpr instance.
Definition: TDMatDVecMultExpr.h:207
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:194
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:532
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1755
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:163
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
If_< IsExpression< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:216
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1802
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:71
Expression object for transpose dense matrix-dense vector multiplications.The TDMatDVecMultExpr class...
Definition: Forward.h:139
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:209
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
typename MultExprTrait< T1, T2 >::Type MultExprTrait_
Auxiliary alias declaration for the MultExprTrait class template.The MultExprTrait_ alias declaration...
Definition: MultExprTrait.h:344
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDVecMultExpr.h:210
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDVecMultExpr.h:374
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDVecMultExpr.h:364
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDMatDVecMultExpr.h:265
Header file for the HasSIMDAdd type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDVecMultExpr.h:342
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDVecMultExpr.h:320
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:132
typename SubvectorExprTrait< VT, AF >::Type SubvectorExprTrait_
Auxiliary alias declaration for the SubvectorExprTrait type trait.The SubvectorExprTrait_ alias decla...
Definition: SubvectorExprTrait.h:133
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:336
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:128
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type.In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:110
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:134
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDVecMultExpr.h:222
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:133
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:603
Header file for all forward declarations for expression class templates.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDVecMultExpr.h:211
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsNumeric type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:604
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:66
Header file for the IsSIMDCombinable type trait.
Header file for the SubmatrixExprTrait class template.
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDMatDVecMultExpr.h:310
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:93
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Constraints on the storage order of matrix types.
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:74
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:223
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:117
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Compile time evaluation of the size of a vector.The Size type trait evaluates the size of the given v...
Definition: Size.h:75
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:330
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:120
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDVecMultExpr.h:354
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:76
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Constraint on the data type.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:363
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDVecMultExpr.h:384
Header file for the MatVecMultExpr base class.
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:136
const DMatDMatMultExpr< T1, T2, false, false, false, false > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( C = A*B ).
Definition: DMatDMatMultExpr.h:7505
TDMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the TDMatDVecMultExpr class.
Definition: TDMatDVecMultExpr.h:251
Header file for the Size type trait.
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: TDMatDVecMultExpr.h:385
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDVecMultExpr.h:213
If_< IsExpression< VT >, const VT, const VT &> RightOperand
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:219
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:212
Header file for the IsExpression type trait class.
Header file for the function trace functionality.
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:225