35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_ 120 template<
typename MT
122 class TDMatDVecMultExpr
123 :
public MatVecMultExpr< DenseVector< TDMatDVecMultExpr<MT,VT>, false > >
124 ,
private Computation
// Compile-time helper: 'value' is true when at least one of the flags
// evaluateMatrix / evaluateVector (defined outside the visible lines) is set.
// The template parameter T1 does not take part in the visible computation.
153 template<
typename T1 >
154 struct UseSMPAssign {
155 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
165 template<
typename T1,
typename T2,
typename T3 >
166 struct UseBlasKernel {
172 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
188 template<
typename T1,
typename T2,
typename T3 >
189 struct UseVectorizedDefaultKernel {
190 enum :
bool { value = useOptimizedKernels &&
192 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
228 MT::simdEnabled && VT::simdEnabled &&
// Compile-time flag: smpAssignable is true only when neither evaluateMatrix nor
// evaluateVector is set and both MT::smpAssignable and VT::smpAssignable are true.
233 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
234 !evaluateVector && VT::smpAssignable };
267 return mat_(index,index) *
vec_[index];
277 const size_t n (
mat_.columns() -
begin );
295 if( index >=
mat_.rows() ) {
298 return (*
this)[index];
307 inline size_t size() const noexcept {
// Alias query for the address 'alias': yields true when either operand
// (mat_ or vec_) reports isAliased( alias ). Note that this forwards to the
// operands' isAliased(), not to a canAlias() member of the operands.
338 template<
typename T >
339 inline bool canAlias(
const T* alias )
const noexcept {
340 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Alias query for the address 'alias': yields true when either operand
// (mat_ or vec_) reports isAliased( alias ); false only if both deny it.
350 template<
typename T >
351 inline bool isAliased(
const T* alias )
const noexcept {
352 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
362 return mat_.isAligned() &&
vec_.isAligned();
376 (
mat_.rows() *
mat_.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
377 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
400 template<
typename VT1 >
407 if( rhs.mat_.rows() == 0UL ) {
410 else if( rhs.mat_.columns() == 0UL ) {
423 TDMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
// Kernel dispatch for the assignment path; takes the target y and the
// operands A and x. The visible half of the condition compares the element
// count A.rows() * A.columns() against TDMATDVECMULT_THRESHOLD; the call to
// selectSmallAssignKernel follows it, and selectBlasAssignKernel is the other
// visible target. The first half of the condition and the branch keywords are
// elided in this excerpt.
439 template<
typename VT1
442 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
446 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
447 selectSmallAssignKernel( y, A, x );
449 selectBlasAssignKernel( y, A, x );
468 template<
typename VT1
471 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
473 const size_t M( A.rows() );
474 const size_t N( A.columns() );
483 y[i] = A(i,0UL) * x[0UL];
491 y[j] = A(j,j) * x[j];
503 const size_t inum( iend - ibegin );
504 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
506 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
507 y[i ] += A(i ,j) * x[j];
508 y[i+1UL] += A(i+1UL,j) * x[j];
511 y[ipos] += A(ipos,j) * x[j];
514 y[iend] = A(iend,j) * x[j];
// Forwarding overload of selectSmallAssignKernel: passes y, A and x unchanged
// to selectDefaultAssignKernel. A second overload with the same name and
// parameter list appears later in the file; the return type / selection
// condition that distinguishes the two is not visible in this excerpt.
540 template<
typename VT1
544 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
546 selectDefaultAssignKernel( y, A, x );
565 template<
typename VT1
569 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
573 const size_t M( A.rows() );
574 const size_t N( A.columns() );
576 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
581 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
591 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
593 for(
size_t j=jbegin; j<jend; ++j ) {
595 xmm1 += A.load(i ,j) * x1;
596 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
597 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
598 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
599 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
600 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
601 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
602 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
606 y.store( i+SIMDSIZE , xmm2 );
607 y.store( i+SIMDSIZE*2UL, xmm3 );
608 y.store( i+SIMDSIZE*3UL, xmm4 );
609 y.store( i+SIMDSIZE*4UL, xmm5 );
610 y.store( i+SIMDSIZE*5UL, xmm6 );
611 y.store( i+SIMDSIZE*6UL, xmm7 );
612 y.store( i+SIMDSIZE*7UL, xmm8 );
615 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
627 for(
size_t j=jbegin; j<jend; ++j ) {
629 xmm1 += A.load(i ,j) * x1;
630 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
631 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
632 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
636 y.store( i+SIMDSIZE , xmm2 );
637 y.store( i+SIMDSIZE*2UL, xmm3 );
638 y.store( i+SIMDSIZE*3UL, xmm4 );
641 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
653 for(
size_t j=jbegin; j<jend; ++j ) {
655 xmm1 += A.load(i ,j) * x1;
656 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
657 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
661 y.store( i+SIMDSIZE , xmm2 );
662 y.store( i+SIMDSIZE*2UL, xmm3 );
665 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
677 for(
size_t j=jbegin; j<jend; ++j ) {
679 xmm1 += A.load(i ,j) * x1;
680 xmm2 += A.load(i+SIMDSIZE,j) * x1;
684 y.store( i+SIMDSIZE, xmm2 );
687 for( ; i<ipos; i+=SIMDSIZE )
699 for(
size_t j=jbegin; j<jend; ++j ) {
700 xmm1 += A.load(i,j) *
set( x[j] );
706 for( ; remainder && i<M; ++i )
718 for(
size_t j=jbegin; j<jend; ++j ) {
719 value += A(i,j) * x[j];
// Forwarding overload of selectLargeAssignKernel: passes y, A and x unchanged
// to selectDefaultAssignKernel. A second overload with the same name follows
// in the file; what selects between them is not visible in this excerpt.
742 template<
typename VT1
746 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
748 selectDefaultAssignKernel( y, A, x );
767 template<
typename VT1
771 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
775 const size_t M( A.rows() );
776 const size_t N( A.columns() );
778 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
779 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
785 for(
size_t ii=0U; ii<M; ii+=iblock ) {
786 for(
size_t jj=0UL; jj<N; jj+=jblock )
788 const size_t jend(
min( jj+jblock, N ) );
789 const size_t itmp(
min( ii+iblock, M ) );
794 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
795 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
801 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
803 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
805 for(
size_t j=jj; j<jend; ++j ) {
807 xmm1 += A.load(i ,j) * x1;
808 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
809 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
810 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
811 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
812 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
813 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
814 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
817 y.store( i , y.load(i ) + xmm1 );
818 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
819 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
820 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
821 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
822 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
823 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
824 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
827 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
831 for(
size_t j=jj; j<jend; ++j ) {
833 xmm1 += A.load(i ,j) * x1;
834 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
835 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
836 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
839 y.store( i , y.load(i ) + xmm1 );
840 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
841 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
842 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
845 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
849 for(
size_t j=jj; j<jend; ++j ) {
851 xmm1 += A.load(i ,j) * x1;
852 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
853 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
856 y.store( i , y.load(i ) + xmm1 );
857 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
858 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
861 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
865 for(
size_t j=jj; j<jend; ++j ) {
867 xmm1 += A.load(i ,j) * x1;
868 xmm2 += A.load(i+SIMDSIZE,j) * x1;
871 y.store( i , y.load(i ) + xmm1 );
872 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
875 for( ; i<ipos; i+=SIMDSIZE )
879 for(
size_t j=jj; j<jend; ++j ) {
880 xmm1 += A.load(i,j) *
set( x[j] );
883 y.store( i, y.load(i) + xmm1 );
886 for( ; remainder && i<iend; ++i )
890 for(
size_t j=jj; j<jend; ++j ) {
891 value += A(i,j) * x[j];
// Forwarding overload of selectBlasAssignKernel: passes y, A and x unchanged
// to selectLargeAssignKernel. The overload that follows in the file sits
// behind a BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// preprocessor condition and calls gemv() instead.
916 template<
typename VT1
920 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
922 selectLargeAssignKernel( y, A, x );
928 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 942 template<
typename VT1
946 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
955 gemv( y, A, x, ET(1), ET(0) );
975 template<
typename VT1 >
1005 template<
typename VT1 >
1012 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1024 TDMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
// Kernel dispatch for the addition-assignment path; takes the target y and
// the operands A and x. The visible half of the condition compares
// A.rows() * A.columns() against TDMATDVECMULT_THRESHOLD; the call to
// selectSmallAddAssignKernel follows it, and selectBlasAddAssignKernel is the
// other visible target. The remaining condition and branch keywords are
// elided in this excerpt.
1040 template<
typename VT1
1043 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1047 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1048 selectSmallAddAssignKernel( y, A, x );
1050 selectBlasAddAssignKernel( y, A, x );
1069 template<
typename VT1
1072 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1074 const size_t M( A.rows() );
1075 const size_t N( A.columns() );
1077 for(
size_t j=0UL; j<N; ++j )
1081 y[j] += A(j,j) * x[j];
1093 const size_t inum( iend - ibegin );
1094 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1096 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1097 y[i ] += A(i ,j) * x[j];
1098 y[i+1UL] += A(i+1UL,j) * x[j];
1101 y[ipos] += A(ipos,j) * x[j];
// Forwarding overload of selectSmallAddAssignKernel: passes y, A and x
// unchanged to selectDefaultAddAssignKernel. A second overload with the same
// name follows in the file; what selects between them is not visible here.
1123 template<
typename VT1
1127 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1129 selectDefaultAddAssignKernel( y, A, x );
1148 template<
typename VT1
1152 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1156 const size_t M( A.rows() );
1157 const size_t N( A.columns() );
1159 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
1164 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1175 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1176 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1177 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1178 SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
1179 SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
1180 SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
1181 SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );
1183 for(
size_t j=jbegin; j<jend; ++j ) {
1185 xmm1 += A.load(i ,j) * x1;
1186 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1187 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1188 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1189 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
1190 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
1191 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
1192 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
1195 y.store( i , xmm1 );
1196 y.store( i+SIMDSIZE , xmm2 );
1197 y.store( i+SIMDSIZE*2UL, xmm3 );
1198 y.store( i+SIMDSIZE*3UL, xmm4 );
1199 y.store( i+SIMDSIZE*4UL, xmm5 );
1200 y.store( i+SIMDSIZE*5UL, xmm6 );
1201 y.store( i+SIMDSIZE*6UL, xmm7 );
1202 y.store( i+SIMDSIZE*7UL, xmm8 );
1205 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1216 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1217 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1218 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1220 for(
size_t j=jbegin; j<jend; ++j ) {
1222 xmm1 += A.load(i ,j) * x1;
1223 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1224 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1225 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1228 y.store( i , xmm1 );
1229 y.store( i+SIMDSIZE , xmm2 );
1230 y.store( i+SIMDSIZE*2UL, xmm3 );
1231 y.store( i+SIMDSIZE*3UL, xmm4 );
1234 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1245 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1246 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1248 for(
size_t j=jbegin; j<jend; ++j ) {
1250 xmm1 += A.load(i ,j) * x1;
1251 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1252 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1255 y.store( i , xmm1 );
1256 y.store( i+SIMDSIZE , xmm2 );
1257 y.store( i+SIMDSIZE*2UL, xmm3 );
1260 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1271 SIMDType xmm2( y.load(i+SIMDSIZE) );
1273 for(
size_t j=jbegin; j<jend; ++j ) {
1275 xmm1 += A.load(i ,j) * x1;
1276 xmm2 += A.load(i+SIMDSIZE,j) * x1;
1279 y.store( i , xmm1 );
1280 y.store( i+SIMDSIZE, xmm2 );
1283 for( ; i<ipos; i+=SIMDSIZE )
1295 for(
size_t j=jbegin; j<jend; ++j ) {
1296 xmm1 += A.load(i,j) *
set( x[j] );
1302 for( ; remainder && i<M; ++i )
1314 for(
size_t j=jbegin; j<jend; ++j ) {
1315 value += A(i,j) * x[j];
1338 template<
typename VT1
1342 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1344 selectDefaultAddAssignKernel( y, A, x );
1363 template<
typename VT1
1367 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1371 const size_t M( A.rows() );
1372 const size_t N( A.columns() );
1374 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
1375 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1379 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1380 for(
size_t jj=0UL; jj<N; jj+=jblock )
1382 const size_t jend(
min( jj+jblock, N ) );
1383 const size_t itmp(
min( ii+iblock, M ) );
1388 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1389 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
1395 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1397 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1399 for(
size_t j=jj; j<jend; ++j ) {
1401 xmm1 += A.load(i ,j) * x1;
1402 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1403 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1404 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1405 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
1406 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
1407 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
1408 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
1411 y.store( i , y.load(i ) + xmm1 );
1412 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1413 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1414 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1415 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
1416 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
1417 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
1418 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
1421 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1425 for(
size_t j=jj; j<jend; ++j ) {
1427 xmm1 += A.load(i ,j) * x1;
1428 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1429 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1430 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1433 y.store( i , y.load(i ) + xmm1 );
1434 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1435 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1436 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1439 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1443 for(
size_t j=jj; j<jend; ++j ) {
1445 xmm1 += A.load(i ,j) * x1;
1446 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1447 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1450 y.store( i , y.load(i ) + xmm1 );
1451 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1452 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1455 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1459 for(
size_t j=jj; j<jend; ++j ) {
1461 xmm1 += A.load(i ,j) * x1;
1462 xmm2 += A.load(i+SIMDSIZE,j) * x1;
1465 y.store( i , y.load(i ) + xmm1 );
1466 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
1469 for( ; i<ipos; i+=SIMDSIZE )
1473 for(
size_t j=jj; j<jend; ++j ) {
1474 xmm1 += A.load(i,j) *
set( x[j] );
1477 y.store( i, y.load(i) + xmm1 );
1480 for( ; remainder && i<iend; ++i )
1484 for(
size_t j=jj; j<jend; ++j ) {
1485 value += A(i,j) * x[j];
1510 template<
typename VT1
1514 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1516 selectLargeAddAssignKernel( y, A, x );
1522 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1536 template<
typename VT1
1540 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1547 addAssign( y, tmp );
1550 gemv( y, A, x, ET(1), ET(1) );
1574 template<
typename VT1 >
1581 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1593 TDMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
// Kernel dispatch for the subtraction-assignment path; takes the target y and
// the operands A and x. The visible half of the condition compares
// A.rows() * A.columns() against TDMATDVECMULT_THRESHOLD; the call to
// selectSmallSubAssignKernel follows it, and selectBlasSubAssignKernel is the
// other visible target. The remaining condition and branch keywords are
// elided in this excerpt.
1609 template<
typename VT1
1612 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1616 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1617 selectSmallSubAssignKernel( y, A, x );
1619 selectBlasSubAssignKernel( y, A, x );
1638 template<
typename VT1
1641 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1643 const size_t M( A.rows() );
1644 const size_t N( A.columns() );
1646 for(
size_t j=0UL; j<N; ++j )
1650 y[j] -= A(j,j) * x[j];
1662 const size_t inum( iend - ibegin );
1663 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1665 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1666 y[i ] -= A(i ,j) * x[j];
1667 y[i+1UL] -= A(i+1UL,j) * x[j];
1670 y[ipos] -= A(ipos,j) * x[j];
// Forwarding overload of selectSmallSubAssignKernel: passes y, A and x
// unchanged to selectDefaultSubAssignKernel. A second overload with the same
// name follows in the file; what selects between them is not visible here.
1692 template<
typename VT1
1696 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1698 selectDefaultSubAssignKernel( y, A, x );
1718 template<
typename VT1
1722 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1726 const size_t M( A.rows() );
1727 const size_t N( A.columns() );
1729 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
1734 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1745 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1746 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1747 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1748 SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
1749 SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
1750 SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
1751 SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );
1753 for(
size_t j=jbegin; j<jend; ++j ) {
1755 xmm1 -= A.load(i ,j) * x1;
1756 xmm2 -= A.load(i+SIMDSIZE ,j) * x1;
1757 xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
1758 xmm4 -= A.load(i+SIMDSIZE*3UL,j) * x1;
1759 xmm5 -= A.load(i+SIMDSIZE*4UL,j) * x1;
1760 xmm6 -= A.load(i+SIMDSIZE*5UL,j) * x1;
1761 xmm7 -= A.load(i+SIMDSIZE*6UL,j) * x1;
1762 xmm8 -= A.load(i+SIMDSIZE*7UL,j) * x1;
1765 y.store( i , xmm1 );
1766 y.store( i+SIMDSIZE , xmm2 );
1767 y.store( i+SIMDSIZE*2UL, xmm3 );
1768 y.store( i+SIMDSIZE*3UL, xmm4 );
1769 y.store( i+SIMDSIZE*4UL, xmm5 );
1770 y.store( i+SIMDSIZE*5UL, xmm6 );
1771 y.store( i+SIMDSIZE*6UL, xmm7 );
1772 y.store( i+SIMDSIZE*7UL, xmm8 );
1775 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1786 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1787 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1788 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1790 for(
size_t j=jbegin; j<jend; ++j ) {
1792 xmm1 -= A.load(i ,j) * x1;
1793 xmm2 -= A.load(i+SIMDSIZE ,j) * x1;
1794 xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
1795 xmm4 -= A.load(i+SIMDSIZE*3UL,j) * x1;
1798 y.store( i , xmm1 );
1799 y.store( i+SIMDSIZE , xmm2 );
1800 y.store( i+SIMDSIZE*2UL, xmm3 );
1801 y.store( i+SIMDSIZE*3UL, xmm4 );
1804 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1815 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1816 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1818 for(
size_t j=jbegin; j<jend; ++j ) {
1820 xmm1 -= A.load(i ,j) * x1;
1821 xmm2 -= A.load(i+SIMDSIZE ,j) * x1;
1822 xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
1825 y.store( i , xmm1 );
1826 y.store( i+SIMDSIZE , xmm2 );
1827 y.store( i+SIMDSIZE*2UL, xmm3 );
1830 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1841 SIMDType xmm2( y.load(i+SIMDSIZE) );
1843 for(
size_t j=jbegin; j<jend; ++j ) {
1845 xmm1 -= A.load(i ,j) * x1;
1846 xmm2 -= A.load(i+SIMDSIZE,j) * x1;
1849 y.store( i , xmm1 );
1850 y.store( i+SIMDSIZE, xmm2 );
1853 for( ; i<ipos; i+=SIMDSIZE )
1865 for(
size_t j=jbegin; j<jend; ++j ) {
1866 xmm1 -= A.load(i,j) *
set( x[j] );
1872 for( ; remainder && i<M; ++i )
1884 for(
size_t j=jbegin; j<jend; ++j ) {
1885 value += A(i,j) * x[j];
1908 template<
typename VT1
1912 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1914 selectDefaultSubAssignKernel( y, A, x );
1934 template<
typename VT1
1938 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1942 const size_t M( A.rows() );
1943 const size_t N( A.columns() );
1945 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
1946 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1950 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1951 for(
size_t jj=0UL; jj<N; jj+=jblock )
1953 const size_t jend(
min( jj+jblock, N ) );
1954 const size_t itmp(
min( ii+iblock, M ) );
1959 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1960 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
1966 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1968 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1970 for(
size_t j=jj; j<jend; ++j ) {
1972 xmm1 += A.load(i ,j) * x1;
1973 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1974 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1975 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1976 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
1977 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
1978 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
1979 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
1982 y.store( i , y.load(i ) - xmm1 );
1983 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
1984 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
1985 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
1986 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5 );
1987 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6 );
1988 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7 );
1989 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8 );
1992 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1996 for(
size_t j=jj; j<jend; ++j ) {
1998 xmm1 += A.load(i ,j) * x1;
1999 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2000 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2001 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2004 y.store( i , y.load(i ) - xmm1 );
2005 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2006 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2007 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
2010 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2014 for(
size_t j=jj; j<jend; ++j ) {
2016 xmm1 += A.load(i ,j) * x1;
2017 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2018 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2021 y.store( i , y.load(i ) - xmm1 );
2022 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2023 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2026 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2030 for(
size_t j=jj; j<jend; ++j ) {
2032 xmm1 += A.load(i ,j) * x1;
2033 xmm2 += A.load(i+SIMDSIZE,j) * x1;
2036 y.store( i , y.load(i ) - xmm1 );
2037 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2 );
2040 for( ; i<ipos; i+=SIMDSIZE )
2044 for(
size_t j=jj; j<jend; ++j ) {
2045 xmm1 += A.load(i,j) *
set( x[j] );
2048 y.store( i, y.load(i) - xmm1 );
2051 for( ; remainder && i<iend; ++i )
2055 for(
size_t j=jj; j<jend; ++j ) {
2056 value += A(i,j) * x[j];
2081 template<
typename VT1
2085 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2087 selectLargeSubAssignKernel( y, A, x );
2093 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2107 template<
typename VT1
2111 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2118 subAssign( y, tmp );
2121 gemv( y, A, x, ET(-1), ET(1) );
2145 template<
typename VT1 >
2157 multAssign( ~lhs, tmp );
2179 template<
typename VT1 >
2191 divAssign( ~lhs, tmp );
2215 template<
typename VT1 >
2223 if( rhs.mat_.rows() == 0UL ) {
2226 else if( rhs.mat_.columns() == 0UL ) {
2259 template<
typename VT1 >
2292 template<
typename VT1 >
2300 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2336 template<
typename VT1 >
2344 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2380 template<
typename VT1 >
2417 template<
typename VT1 >
2469 template<
typename MT
2473 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false > >
// Compile-time helper: 'value' is true when at least one of the flags
// evaluateMatrix / evaluateVector (defined outside the visible lines) is set.
// The template parameter T1 does not take part in the visible computation.
2504 template<
typename T1 >
2505 struct UseSMPAssign {
2506 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
2514 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2515 struct UseBlasKernel {
2521 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2536 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2537 struct UseVectorizedDefaultKernel {
2538 enum :
bool { value = useOptimizedKernels &&
2540 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2576 MT::simdEnabled && VT::simdEnabled &&
// Compile-time flag: smpAssignable is true only when neither evaluateMatrix nor
// evaluateVector is set and both MT::smpAssignable and VT::smpAssignable are true.
2582 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2583 !evaluateVector && VT::smpAssignable };
2611 return vector_[index] * scalar_;
2623 if( index >= vector_.size() ) {
2626 return (*
this)[index];
// Returns the size reported by the vector_ member; no state is modified
// (const member function).
2635 inline size_t size()
const {
2636 return vector_.size();
// Alias query for the address 'alias': the answer is delegated entirely to
// vector_.canAlias( alias ).
2666 template<
typename T >
2667 inline bool canAlias(
const T* alias )
const {
2668 return vector_.canAlias( alias );
// Alias query for the address 'alias': the answer is delegated entirely to
// vector_.isAliased( alias ).
2678 template<
typename T >
2679 inline bool isAliased(
const T* alias )
const {
2680 return vector_.isAliased( alias );
2690 return vector_.isAligned();
2705 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
2706 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
2728 template<
typename VT1 >
2738 if( left.rows() == 0UL ) {
2741 else if( left.columns() == 0UL ) {
2754 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatch for the assignment path of the variant that carries an extra
// 'scalar' argument; y, A, x and scalar are passed through unchanged. The
// visible half of the condition compares A.rows() * A.columns() against
// TDMATDVECMULT_THRESHOLD; the call to selectSmallAssignKernel follows it, and
// selectBlasAssignKernel is the other visible target. The remaining condition
// and branch keywords are elided in this excerpt.
2769 template<
typename VT1
2773 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2777 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
2778 selectSmallAssignKernel( y, A, x, scalar );
2780 selectBlasAssignKernel( y, A, x, scalar );
2798 template<
typename VT1
2802 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2804 const size_t M( A.rows() );
2805 const size_t N( A.columns() );
2814 y[i] = A(i,0UL) * x[0UL];
2822 y[j] = A(j,j) * x[j] * scalar;
2834 const size_t inum( iend - ibegin );
2835 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2837 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2838 y[i ] += A(i ,j) * x[j];
2839 y[i+1UL] += A(i+1UL,j) * x[j];
2842 y[ipos] += A(ipos,j) * x[j];
2845 y[iend] = A(iend,j) * x[j];
2878 template<
typename VT1
2883 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2885 selectDefaultAssignKernel( y, A, x, scalar );
2903 template<
typename VT1
2908 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2912 const size_t M( A.rows() );
2913 const size_t N( A.columns() );
2915 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
2918 const SIMDType factor(
set( scalar ) );
2922 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
2932 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2934 for(
size_t j=jbegin; j<jend; ++j ) {
2936 xmm1 += A.load(i ,j) * x1;
2937 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2938 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2939 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2940 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
2941 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
2942 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
2943 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
2946 y.store( i , xmm1*factor );
2947 y.store( i+SIMDSIZE , xmm2*factor );
2948 y.store( i+SIMDSIZE*2UL, xmm3*factor );
2949 y.store( i+SIMDSIZE*3UL, xmm4*factor );
2950 y.store( i+SIMDSIZE*4UL, xmm5*factor );
2951 y.store( i+SIMDSIZE*5UL, xmm6*factor );
2952 y.store( i+SIMDSIZE*6UL, xmm7*factor );
2953 y.store( i+SIMDSIZE*7UL, xmm8*factor );
2956 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2968 for(
size_t j=jbegin; j<jend; ++j ) {
2970 xmm1 += A.load(i ,j) * x1;
2971 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2972 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2973 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2976 y.store( i , xmm1*factor );
2977 y.store( i+SIMDSIZE , xmm2*factor );
2978 y.store( i+SIMDSIZE*2UL, xmm3*factor );
2979 y.store( i+SIMDSIZE*3UL, xmm4*factor );
2982 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2994 for(
size_t j=jbegin; j<jend; ++j ) {
2996 xmm1 += A.load(i ,j) * x1;
2997 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2998 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3001 y.store( i , xmm1*factor );
3002 y.store( i+SIMDSIZE , xmm2*factor );
3003 y.store( i+SIMDSIZE*2UL, xmm3*factor );
3006 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3018 for(
size_t j=jbegin; j<jend; ++j ) {
3020 xmm1 += A.load(i ,j) * x1;
3021 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3024 y.store( i , xmm1*factor );
3025 y.store( i+SIMDSIZE, xmm2*factor );
3028 for( ; i<ipos; i+=SIMDSIZE )
3040 for(
size_t j=jbegin; j<jend; ++j ) {
3042 xmm1 += A.load(i,j) * x1;
3045 y.store( i, xmm1*factor );
3048 for( ; remainder && i<M; ++i )
3060 for(
size_t j=jbegin; j<jend; ++j ) {
3061 value += A(i,j) * x[j];
3064 y[i] = value * scalar;
3083 template<
typename VT1
3088 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3090 selectDefaultAssignKernel( y, A, x, scalar );
3108 template<
typename VT1
3113 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3117 const size_t M( A.rows() );
3118 const size_t N( A.columns() );
3120 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
3121 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3125 const SIMDType factor(
set( scalar ) );
3129 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3130 for(
size_t jj=0UL; jj<N; jj+=jblock )
3132 const size_t jend(
min( jj+jblock, N ) );
3133 const size_t itmp(
min( ii+iblock, M ) );
3138 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3139 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
3145 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3147 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3149 for(
size_t j=jj; j<jend; ++j ) {
3151 xmm1 += A.load(i ,j) * x1;
3152 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3153 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3154 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3155 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3156 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3157 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3158 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3161 y.store( i , y.load(i ) + xmm1*factor );
3162 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3163 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3164 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3165 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3166 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3167 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3168 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3171 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3175 for(
size_t j=jj; j<jend; ++j ) {
3177 xmm1 += A.load(i ,j) * x1;
3178 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3179 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3180 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3183 y.store( i , y.load(i ) + xmm1*factor );
3184 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3185 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3186 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3189 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3193 for(
size_t j=jj; j<jend; ++j ) {
3195 xmm1 += A.load(i ,j) * x1;
3196 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3197 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3200 y.store( i , y.load(i ) + xmm1*factor );
3201 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3202 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3205 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3209 for(
size_t j=jj; j<jend; ++j ) {
3211 xmm1 += A.load(i ,j) * x1;
3212 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3215 y.store( i , y.load(i ) + xmm1*factor );
3216 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3219 for( ; i<ipos; i+=SIMDSIZE )
3223 for(
size_t j=jj; j<jend; ++j ) {
3224 xmm1 += A.load(i,j) *
set( x[j] );
3227 y.store( i, y.load(i) + xmm1*factor );
3230 for( ; remainder && i<iend; ++i )
3234 for(
size_t j=jj; j<jend; ++j ) {
3235 value += A(i,j) * x[j];
3238 y[i] += value * scalar;
3259 template<
typename VT1
3264 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3266 selectLargeAssignKernel( y, A, x, scalar );
3271 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3285 template<
typename VT1
3290 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3295 assign( y, scalar * x );
3299 gemv( y, A, x, ET(scalar), ET(0) );
3317 template<
typename VT1 >
3329 assign( ~lhs, tmp );
3345 template<
typename VT1 >
3355 if( left.rows() == 0UL || left.columns() == 0UL ) {
3367 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.
scalar_ );
3382 template<
typename VT1
3386 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3390 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3391 selectSmallAddAssignKernel( y, A, x, scalar );
3393 selectBlasAddAssignKernel( y, A, x, scalar );
3411 template<
typename VT1
3415 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3417 y.addAssign( A * x * scalar );
3435 template<
typename VT1
3440 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3442 selectDefaultAddAssignKernel( y, A, x, scalar );
3461 template<
typename VT1
3466 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3470 const size_t M( A.rows() );
3471 const size_t N( A.columns() );
3473 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
3476 const SIMDType factor(
set( scalar ) );
3480 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3490 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3492 for(
size_t j=jbegin; j<jend; ++j ) {
3494 xmm1 += A.load(i ,j) * x1;
3495 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3496 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3497 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3498 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3499 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3500 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3501 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3504 y.store( i , y.load(i ) + xmm1*factor );
3505 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3506 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3507 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3508 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3509 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3510 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3511 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3514 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3526 for(
size_t j=jbegin; j<jend; ++j ) {
3528 xmm1 += A.load(i ,j) * x1;
3529 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3530 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3531 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3534 y.store( i , y.load(i ) + xmm1*factor );
3535 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3536 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3537 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3540 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3552 for(
size_t j=jbegin; j<jend; ++j ) {
3554 xmm1 += A.load(i ,j) * x1;
3555 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3556 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3559 y.store( i , y.load(i ) + xmm1*factor );
3560 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3561 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3564 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3576 for(
size_t j=jbegin; j<jend; ++j ) {
3578 xmm1 += A.load(i ,j) * x1;
3579 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3582 y.store( i , y.load(i ) + xmm1*factor );
3583 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3586 for( ; i<ipos; i+=SIMDSIZE )
3598 for(
size_t j=jbegin; j<jend; ++j ) {
3599 xmm1 += A.load(i,j) *
set( x[j] );
3602 y.store( i, y.load(i) + xmm1*factor );
3605 for( ; remainder && i<M; ++i )
3617 for(
size_t j=jbegin; j<jend; ++j ) {
3618 value += A(i,j) * x[j];
3621 y[i] += value * scalar;
3640 template<
typename VT1
3645 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3647 selectDefaultAddAssignKernel( y, A, x, scalar );
3666 template<
typename VT1
3671 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3675 const size_t M( A.rows() );
3676 const size_t N( A.columns() );
3678 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
3679 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3683 const SIMDType factor(
set( scalar ) );
3685 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3686 for(
size_t jj=0UL; jj<N; jj+=jblock )
3688 const size_t jend(
min( jj+jblock, N ) );
3689 const size_t itmp(
min( ii+iblock, M ) );
3694 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3695 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
3701 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3703 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3705 for(
size_t j=jj; j<jend; ++j ) {
3707 xmm1 += A.load(i ,j) * x1;
3708 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3709 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3710 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3711 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3712 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3713 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3714 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3717 y.store( i , y.load(i ) + xmm1*factor );
3718 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3719 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3720 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3721 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3722 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3723 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3724 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3727 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3731 for(
size_t j=jj; j<jend; ++j ) {
3733 xmm1 += A.load(i ,j) * x1;
3734 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3735 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3736 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3739 y.store( i , y.load(i ) + xmm1*factor );
3740 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3741 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3742 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3745 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3749 for(
size_t j=jj; j<jend; ++j ) {
3751 xmm1 += A.load(i ,j) * x1;
3752 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3753 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3756 y.store( i , y.load(i ) + xmm1*factor );
3757 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3758 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3761 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3765 for(
size_t j=jj; j<jend; ++j ) {
3767 xmm1 += A.load(i ,j) * x1;
3768 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3771 y.store( i , y.load(i ) + xmm1*factor );
3772 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3775 for( ; i<ipos; i+=SIMDSIZE )
3779 for(
size_t j=jj; j<jend; ++j ) {
3780 xmm1 += A.load(i,j) *
set( x[j] );
3783 y.store( i, y.load(i) + xmm1*factor );
3786 for( ; remainder && i<iend; ++i )
3790 for(
size_t j=jj; j<jend; ++j ) {
3791 value += A(i,j) * x[j];
3794 y[i] += value * scalar;
3815 template<
typename VT1
3820 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3822 selectLargeAddAssignKernel( y, A, x, scalar );
3827 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3841 template<
typename VT1
3846 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3853 addAssign( y, tmp );
3856 gemv( y, A, x, ET(scalar), ET(1) );
3878 template<
typename VT1 >
3888 if( left.rows() == 0UL || left.columns() == 0UL ) {
3900 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.
scalar_ );
3915 template<
typename VT1
3919 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3923 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3924 selectSmallSubAssignKernel( y, A, x, scalar );
3926 selectBlasSubAssignKernel( y, A, x, scalar );
3944 template<
typename VT1
3948 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3950 y.subAssign( A * x * scalar );
3968 template<
typename VT1
3973 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3975 selectDefaultSubAssignKernel( y, A, x, scalar );
3994 template<
typename VT1
3999 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4003 const size_t M( A.rows() );
4004 const size_t N( A.columns() );
4006 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
4009 const SIMDType factor(
set( scalar ) );
4013 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4023 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4025 for(
size_t j=jbegin; j<jend; ++j ) {
4027 xmm1 += A.load(i ,j) * x1;
4028 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4029 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4030 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4031 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
4032 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
4033 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
4034 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
4037 y.store( i , y.load(i ) - xmm1*factor );
4038 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4039 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4040 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4041 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4042 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4043 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4044 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
4047 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4059 for(
size_t j=jbegin; j<jend; ++j ) {
4061 xmm1 += A.load(i ,j) * x1;
4062 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4063 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4064 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4067 y.store( i , y.load(i ) - xmm1*factor );
4068 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4069 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4070 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4073 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4085 for(
size_t j=jbegin; j<jend; ++j ) {
4087 xmm1 += A.load(i ,j) * x1;
4088 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4089 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4092 y.store( i , y.load(i ) - xmm1*factor );
4093 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4094 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4097 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4109 for(
size_t j=jbegin; j<jend; ++j ) {
4111 xmm1 += A.load(i ,j) * x1;
4112 xmm2 += A.load(i+SIMDSIZE,j) * x1;
4115 y.store( i , y.load(i ) - xmm1*factor );
4116 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
4119 for( ; i<ipos; i+=SIMDSIZE )
4131 for(
size_t j=jbegin; j<jend; ++j ) {
4132 xmm1 += A.load(i,j) *
set( x[j] );
4135 y.store( i, y.load(i) - xmm1*factor );
4138 for( ; remainder && i<M; ++i )
4150 for(
size_t j=jbegin; j<jend; ++j ) {
4151 value += A(i,j) * x[j];
4154 y[i] -= value * scalar;
4173 template<
typename VT1
4178 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4180 selectDefaultSubAssignKernel( y, A, x, scalar );
4199 template<
typename VT1
4204 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4208 const size_t M( A.rows() );
4209 const size_t N( A.columns() );
4211 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
4212 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
4216 const SIMDType factor(
set( scalar ) );
4218 for(
size_t ii=0U; ii<M; ii+=iblock ) {
4219 for(
size_t jj=0UL; jj<N; jj+=jblock )
4221 const size_t jend(
min( jj+jblock, N ) );
4222 const size_t itmp(
min( ii+iblock, M ) );
4227 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4228 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
4234 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4236 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4238 for(
size_t j=jj; j<jend; ++j ) {
4240 xmm1 += A.load(i ,j) * x1;
4241 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4242 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4243 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4244 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
4245 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
4246 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
4247 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
4250 y.store( i , y.load(i ) - xmm1*factor );
4251 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4252 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4253 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4254 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4255 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4256 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4257 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
4260 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4264 for(
size_t j=jj; j<jend; ++j ) {
4266 xmm1 += A.load(i ,j) * x1;
4267 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4268 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4269 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4272 y.store( i , y.load(i ) - xmm1*factor );
4273 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4274 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4275 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4278 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4282 for(
size_t j=jj; j<jend; ++j ) {
4284 xmm1 += A.load(i ,j) * x1;
4285 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4286 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4289 y.store( i , y.load(i ) - xmm1*factor );
4290 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4291 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4294 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4298 for(
size_t j=jj; j<jend; ++j ) {
4300 xmm1 += A.load(i ,j) * x1;
4301 xmm2 += A.load(i+SIMDSIZE,j) * x1;
4304 y.store( i , y.load(i ) - xmm1*factor );
4305 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
4308 for( ; i<ipos; i+=SIMDSIZE )
4312 for(
size_t j=jj; j<jend; ++j ) {
4313 xmm1 += A.load(i,j) *
set( x[j] );
4316 y.store( i, y.load(i) - xmm1*factor );
4319 for( ; remainder && i<iend; ++i )
4323 for(
size_t j=jj; j<jend; ++j ) {
4324 value += A(i,j) * x[j];
4327 y[i] -= value * scalar;
4348 template<
typename VT1
4353 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4355 selectLargeSubAssignKernel( y, A, x, scalar );
4360 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4374 template<
typename VT1
4379 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4386 subAssign( y, tmp );
4389 gemv( y, A, x, ET(-scalar), ET(1) );
4411 template<
typename VT1 >
4423 multAssign( ~lhs, tmp );
4443 template<
typename VT1 >
4455 divAssign( ~lhs, tmp );
4477 template<
typename VT1 >
4488 if( left.rows() == 0UL ) {
4491 else if( left.columns() == 0UL ) {
4522 template<
typename VT1 >
4553 template<
typename VT1 >
4564 if( left.rows() == 0UL || left.columns() == 0UL ) {
4598 template<
typename VT1 >
4609 if( left.rows() == 0UL || left.columns() == 0UL ) {
4644 template<
typename VT1 >
4679 template<
typename VT1 >
4754 template<
typename MT
4756 inline decltype(
auto)
4783 template<
typename MT,
typename VT >
4784 struct Size< TDMatDVecMultExpr<MT,VT> >
4801 template<
typename MT,
typename VT >
4802 struct IsAligned< TDMatDVecMultExpr<MT,VT> >
4803 :
public BoolConstant< And< IsAligned<MT>, IsAligned<VT> >::value >
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDVecMultExpr.h:219
Header file for the generic min algorithm.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:209
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDMatDVecMultExpr.h:294
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:131
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Generic wrapper for a compile time constant integral value. The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:128
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:133
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template. The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:521
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1762
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types. This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1809
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
Expression object for transpose dense matrix-dense vector multiplications. The TDMatDVecMultExpr class...
Definition: Forward.h:149
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:205
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data. This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Row< MT > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:124
Compile time check for the alignment of data types. This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Header file for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDVecMultExpr.h:371
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDVecMultExpr.h:361
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDVecMultExpr.h:210
Compile time check for data types with padding. This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDMatDVecMultExpr.h:262
Header file for the HasSIMDAdd type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDVecMultExpr.h:339
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDVecMultExpr.h:317
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:340
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:132
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type. In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:88
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:109
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:129
Constraint on the data type.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDVecMultExpr.h:208
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions. The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:593
If_< IsExpression< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:213
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates. The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDVecMultExpr.h:207
Header file for the IsSIMDCombinable type trait.
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:222
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:154
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDMatDVecMultExpr.h:307
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:206
Compile time check for built-in data types. This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:130
Constraints on the storage order of matrix types.
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template. The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
Compile time check for complex types. This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications. The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:106
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of a vector. The Size type trait evaluates the size of the given v...
Definition: Size.h:74
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:327
Base class for sparse vectors. The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDVecMultExpr.h:351
Compile time evaluation of the number of rows of a matrix. The Rows type trait evaluates the number of...
Definition: Rows.h:75
Header file for the IsComplex type trait.
Constraint on the data type.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions. The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
If_< IsExpression< VT >, const VT, const VT &> RightOperand
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:216
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDVecMultExpr.h:383
Header file for the MatVecMultExpr base class.
Constraint on the data type.
TDMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the TDMatDVecMultExpr class.
Definition: TDMatDVecMultExpr.h:248
Header file for the Size type trait.
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: TDMatDVecMultExpr.h:384
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the function trace functionality.