// Include guard for this header, followed by the opening of the TDMatDVecMultExpr class
// template. Only part of the declaration is visible in this excerpt: the class derives
// publicly from MatVecMultExpr< DenseVector< TDMatDVecMultExpr<MT,VT>, false > > and
// privately from Computation.
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_ 121 template<
typename MT
123 class TDMatDVecMultExpr
124 :
public MatVecMultExpr< DenseVector< TDMatDVecMultExpr<MT,VT>, false > >
125 ,
private Computation
// Compile-time helper: `value` is true when evaluateMatrix or evaluateVector is set.
// The template parameter T1 does not appear in the visible condition.
154 template<
typename T1 >
155 struct UseSMPAssign {
156 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
// Compile-time helper named UseBlasKernel, parameterized on three types. Only one line of
// its condition is visible here: T1, T2 and T3 must all have simdEnabled set. The remaining
// terms of the condition are on lines outside this excerpt.
166 template<
typename T1,
typename T2,
typename T3 >
167 struct UseBlasKernel {
173 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time helper named UseVectorizedDefaultKernel. The visible part of `value` requires
// useOptimizedKernels and simdEnabled on T1, T2 and T3; further terms follow on lines that
// are not part of this excerpt.
189 template<
typename T1,
typename T2,
typename T3 >
190 struct UseVectorizedDefaultKernel {
191 enum :
bool { value = useOptimizedKernels &&
193 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
229 MT::simdEnabled && VT::simdEnabled &&
// smpAssignable is set only when neither operand needs intermediate evaluation
// (evaluateMatrix / evaluateVector are both false) and both MT and VT are smpAssignable.
234 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
235 !evaluateVector && VT::smpAssignable };
268 return mat_(index,index) *
vec_[index];
279 const size_t n (
mat_.columns() -
begin );
298 if( index >=
mat_.rows() ) {
301 return (*
this)[index];
310 inline size_t size() const noexcept {
// Reports whether the expression can alias with the given address.
// \param alias The address to check.
// \return true if either operand reports the address as aliased.
// Note that this check calls isAliased() (not canAlias()) on both mat_ and vec_.
341 template<
typename T >
342 inline bool canAlias(
const T* alias )
const noexcept {
343 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Reports whether the expression is aliased with the given address.
// \param alias The address to check.
// \return true if mat_ or vec_ reports the address as aliased.
353 template<
typename T >
354 inline bool isAliased(
const T* alias )
const noexcept {
355 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
365 return mat_.isAligned() &&
vec_.isAligned();
379 (
mat_.rows() *
mat_.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
380 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
// Fragment of an assignment routine for this expression (its signature line is not visible).
// The visible part checks rhs.mat_ for zero rows and, separately, for zero columns, and then
// dispatches to selectAssignKernel with the unwrapped target (~lhs) and the operands A and x.
// NOTE(review): A and x are created on lines not shown; the bodies of the two early branches
// are not visible either.
403 template<
typename VT1 >
410 if( rhs.mat_.rows() == 0UL ) {
413 else if( rhs.mat_.columns() == 0UL ) {
426 TDMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
// Kernel dispatcher for the plain assignment y = A * x.
// \param y Target vector.
// \param A Left-hand side matrix operand.
// \param x Right-hand side vector operand.
// The visible condition compares A.rows() * A.columns() against TDMATDVECMULT_THRESHOLD and
// selects the small kernel; the BLAS kernel call is presumably the else-branch (the `else`
// line and the first part of the condition are not visible in this excerpt).
442 template<
typename VT1
445 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
449 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
450 selectSmallAssignKernel( y, A, x );
452 selectBlasAssignKernel( y, A, x );
// Default assign kernel working with plain element access on y, A and x.
// \param y Target vector.
// \param A Left-hand side matrix operand.
// \param x Right-hand side vector operand.
// Only parts of the body are visible in this excerpt; the enclosing loops and the branch
// conditions (including the definitions of ibegin and iend) are on lines not shown.
471 template<
typename VT1
474 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
476 const size_t M( A.rows() );
477 const size_t N( A.columns() );
// Initialization of y from the first column of A.
486 y[i] = A(i,0UL) * x[0UL];
// Variant that touches only the diagonal element A(j,j).
494 y[j] = A(j,j) * x[j];
// inum is the number of rows in [ibegin,iend); masking with size_t(-2) clears the lowest bit,
// so ipos marks the end of the part that can be processed two rows at a time.
506 const size_t inum( iend - ibegin );
507 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
509 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
510 y[i ] += A(i ,j) * x[j];
511 y[i+1UL] += A(i+1UL,j) * x[j];
// Single leftover row at ipos (presumably guarded by an ipos < iend check that is not visible).
514 y[ipos] += A(ipos,j) * x[j];
// Assignment (not accumulation) for row iend; the guarding condition is not visible here.
517 y[iend] = A(iend,j) * x[j];
// Small-kernel overload that simply forwards to selectDefaultAssignKernel.
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
543 template<
typename VT1
547 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
549 selectDefaultAssignKernel( y, A, x );
// Vectorized small-kernel overload for y = A * x.
// Rows are processed in SIMD-wide chunks of 8, 4, 3, 2 and finally 1 vector(s) per step. For
// each chunk the j-loop accumulates A.load(...) * x1 into one accumulator per vector, and the
// accumulators are then stored to y.
// NOTE(review): jbegin, jend, x1, `remainder` and SIMDSIZE are defined on lines that are not
// part of this excerpt; x1 presumably holds set( x[j] ) as in the single-vector loop below.
568 template<
typename VT1
572 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
576 const size_t M( A.rows() );
577 const size_t N( A.columns() );
// ipos: M with the low bits masked off when `remainder` is set (rounds down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise M itself.
579 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Eight SIMD vectors of rows per iteration.
584 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
594 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
596 for(
size_t j=jbegin; j<jend; ++j ) {
598 xmm1 += A.load(i ,j) * x1;
599 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
600 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
601 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
602 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
603 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
604 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
605 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
609 y.store( i+SIMDSIZE , xmm2 );
610 y.store( i+SIMDSIZE*2UL, xmm3 );
611 y.store( i+SIMDSIZE*3UL, xmm4 );
612 y.store( i+SIMDSIZE*4UL, xmm5 );
613 y.store( i+SIMDSIZE*5UL, xmm6 );
614 y.store( i+SIMDSIZE*6UL, xmm7 );
615 y.store( i+SIMDSIZE*7UL, xmm8 );
// Four SIMD vectors of rows per iteration.
618 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
630 for(
size_t j=jbegin; j<jend; ++j ) {
632 xmm1 += A.load(i ,j) * x1;
633 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
634 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
635 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
639 y.store( i+SIMDSIZE , xmm2 );
640 y.store( i+SIMDSIZE*2UL, xmm3 );
641 y.store( i+SIMDSIZE*3UL, xmm4 );
// Three SIMD vectors of rows per iteration.
644 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
656 for(
size_t j=jbegin; j<jend; ++j ) {
658 xmm1 += A.load(i ,j) * x1;
659 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
660 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
664 y.store( i+SIMDSIZE , xmm2 );
665 y.store( i+SIMDSIZE*2UL, xmm3 );
// Two SIMD vectors of rows per iteration.
668 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
680 for(
size_t j=jbegin; j<jend; ++j ) {
682 xmm1 += A.load(i ,j) * x1;
683 xmm2 += A.load(i+SIMDSIZE,j) * x1;
687 y.store( i+SIMDSIZE, xmm2 );
// One SIMD vector of rows per iteration.
690 for( ; i<ipos; i+=SIMDSIZE )
702 for(
size_t j=jbegin; j<jend; ++j ) {
703 xmm1 += A.load(i,j) *
set( x[j] );
// Scalar tail for the rows in [ipos,M); only executed when `remainder` is set.
709 for( ; remainder && i<M; ++i )
721 for(
size_t j=jbegin; j<jend; ++j ) {
722 value += A(i,j) * x[j];
// Large-kernel overload that simply forwards to selectDefaultAssignKernel.
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
745 template<
typename VT1
749 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
751 selectDefaultAssignKernel( y, A, x );
// Vectorized large-kernel overload for y = A * x with blocking over rows and columns.
// iblock is 32768 / sizeof(ElementType) rows; jblock is 8 columns when N < iblock and 4
// otherwise. Within each (ii,jj) block the rows are processed in chunks of 8, 4, 3, 2 and 1
// SIMD vector(s), and each chunk's partial sums are added onto the current content of y.
// NOTE(review): because the results are accumulated via y.load(...) + xmm, y is presumably
// reset before these loops on a line not shown here - confirm. iend, x1, `remainder` and
// SIMDSIZE are likewise defined outside this excerpt.
770 template<
typename VT1
774 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
778 const size_t M( A.rows() );
779 const size_t N( A.columns() );
781 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
782 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
788 for(
size_t ii=0U; ii<M; ii+=iblock ) {
789 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Clamp the current block to the matrix dimensions.
791 const size_t jend(
min( jj+jblock, N ) );
792 const size_t itmp(
min( ii+iblock, M ) );
// ipos: end of the part of the row range that can be handled with full SIMD vectors.
797 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
798 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Eight SIMD vectors of rows per iteration.
804 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
806 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
808 for(
size_t j=jj; j<jend; ++j ) {
810 xmm1 += A.load(i ,j) * x1;
811 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
812 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
813 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
814 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
815 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
816 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
817 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
820 y.store( i , y.load(i ) + xmm1 );
821 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
822 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
823 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
824 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
825 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
826 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
827 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
// Four SIMD vectors of rows per iteration.
830 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
834 for(
size_t j=jj; j<jend; ++j ) {
836 xmm1 += A.load(i ,j) * x1;
837 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
838 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
839 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
842 y.store( i , y.load(i ) + xmm1 );
843 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
844 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
845 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
// Three SIMD vectors of rows per iteration.
848 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
852 for(
size_t j=jj; j<jend; ++j ) {
854 xmm1 += A.load(i ,j) * x1;
855 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
856 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
859 y.store( i , y.load(i ) + xmm1 );
860 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
861 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
// Two SIMD vectors of rows per iteration.
864 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
868 for(
size_t j=jj; j<jend; ++j ) {
870 xmm1 += A.load(i ,j) * x1;
871 xmm2 += A.load(i+SIMDSIZE,j) * x1;
874 y.store( i , y.load(i ) + xmm1 );
875 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
// One SIMD vector of rows per iteration.
878 for( ; i<ipos; i+=SIMDSIZE )
882 for(
size_t j=jj; j<jend; ++j ) {
883 xmm1 += A.load(i,j) *
set( x[j] );
886 y.store( i, y.load(i) + xmm1 );
// Scalar tail for the rows in [ipos,iend); only executed when `remainder` is set.
889 for( ; remainder && i<iend; ++i )
893 for(
size_t j=jj; j<jend; ++j ) {
894 value += A(i,j) * x[j];
// BLAS-kernel overload that forwards to selectLargeAssignKernel instead of calling BLAS.
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
919 template<
typename VT1
923 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
925 selectLargeAssignKernel( y, A, x );
// BLAS-backed overload, compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled. It calls gemv with the scalar
// arguments ET(1) and ET(0) - presumably y = 1*A*x + 0*y following the BLAS gemv convention;
// confirm against the gemv wrapper. ET is defined on a line not shown here.
931 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 945 template<
typename VT1
949 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
958 gemv( y, A, x, ET(1), ET(0) );
978 template<
typename VT1 >
// Fragment of the addition-assignment routine for this expression (its signature line is not
// visible). The visible part tests whether rhs.mat_ has zero rows or zero columns and then
// dispatches to selectAddAssignKernel with the unwrapped target (~lhs), A and x.
// NOTE(review): A and x are created on lines not shown; the body of the early branch is not
// visible either.
1008 template<
typename VT1 >
1015 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1027 TDMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
// Kernel dispatcher for the addition assignment y += A * x.
// \param y Target vector.
// \param A Left-hand side matrix operand.
// \param x Right-hand side vector operand.
// The visible condition compares A.rows() * A.columns() against TDMATDVECMULT_THRESHOLD and
// selects the small kernel; the BLAS kernel call is presumably the else-branch (the `else`
// line and the first part of the condition are not visible in this excerpt).
1043 template<
typename VT1
1046 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1050 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1051 selectSmallAddAssignKernel( y, A, x );
1053 selectBlasAddAssignKernel( y, A, x );
// Default addition-assign kernel working with plain element access: y += A * x.
// \param y Target vector.
// \param A Left-hand side matrix operand.
// \param x Right-hand side vector operand.
// The outer loop runs over the columns j of A. Branch conditions and the definitions of
// ibegin and iend are on lines not shown in this excerpt.
1072 template<
typename VT1
1075 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1077 const size_t M( A.rows() );
1078 const size_t N( A.columns() );
1080 for(
size_t j=0UL; j<N; ++j )
// Variant that touches only the diagonal element A(j,j).
1084 y[j] += A(j,j) * x[j];
// inum is the number of rows in [ibegin,iend); masking with size_t(-2) clears the lowest bit,
// so ipos marks the end of the part that can be processed two rows at a time.
1096 const size_t inum( iend - ibegin );
1097 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1099 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1100 y[i ] += A(i ,j) * x[j];
1101 y[i+1UL] += A(i+1UL,j) * x[j];
// Single leftover row at ipos (presumably guarded by an ipos < iend check that is not visible).
1104 y[ipos] += A(ipos,j) * x[j];
// Small-kernel overload that simply forwards to selectDefaultAddAssignKernel.
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
1126 template<
typename VT1
1130 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1132 selectDefaultAddAssignKernel( y, A, x );
// Vectorized small-kernel overload for y += A * x.
// Rows are processed in SIMD-wide chunks of 8, 4, 3, 2 and finally 1 vector(s) per step. The
// accumulators are initialized from the current content of y (y.load), updated with
// A.load(...) * x1 for every column j, and written back with y.store.
// NOTE(review): jbegin, jend, x1, `remainder` and SIMDSIZE are defined on lines that are not
// part of this excerpt; the declarations of xmm1 are not visible either.
1151 template<
typename VT1
1155 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1159 const size_t M( A.rows() );
1160 const size_t N( A.columns() );
// ipos: M with the low bits masked off when `remainder` is set (rounds down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise M itself.
1162 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Eight SIMD vectors of rows per iteration.
1167 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1178 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1179 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1180 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1181 SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
1182 SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
1183 SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
1184 SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );
1186 for(
size_t j=jbegin; j<jend; ++j ) {
1188 xmm1 += A.load(i ,j) * x1;
1189 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1190 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1191 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1192 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
1193 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
1194 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
1195 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
1198 y.store( i , xmm1 );
1199 y.store( i+SIMDSIZE , xmm2 );
1200 y.store( i+SIMDSIZE*2UL, xmm3 );
1201 y.store( i+SIMDSIZE*3UL, xmm4 );
1202 y.store( i+SIMDSIZE*4UL, xmm5 );
1203 y.store( i+SIMDSIZE*5UL, xmm6 );
1204 y.store( i+SIMDSIZE*6UL, xmm7 );
1205 y.store( i+SIMDSIZE*7UL, xmm8 );
// Four SIMD vectors of rows per iteration.
1208 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1219 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1220 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1221 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1223 for(
size_t j=jbegin; j<jend; ++j ) {
1225 xmm1 += A.load(i ,j) * x1;
1226 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1227 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1228 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1231 y.store( i , xmm1 );
1232 y.store( i+SIMDSIZE , xmm2 );
1233 y.store( i+SIMDSIZE*2UL, xmm3 );
1234 y.store( i+SIMDSIZE*3UL, xmm4 );
// Three SIMD vectors of rows per iteration.
1237 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1248 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1249 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1251 for(
size_t j=jbegin; j<jend; ++j ) {
1253 xmm1 += A.load(i ,j) * x1;
1254 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1255 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1258 y.store( i , xmm1 );
1259 y.store( i+SIMDSIZE , xmm2 );
1260 y.store( i+SIMDSIZE*2UL, xmm3 );
// Two SIMD vectors of rows per iteration.
1263 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1274 SIMDType xmm2( y.load(i+SIMDSIZE) );
1276 for(
size_t j=jbegin; j<jend; ++j ) {
1278 xmm1 += A.load(i ,j) * x1;
1279 xmm2 += A.load(i+SIMDSIZE,j) * x1;
1282 y.store( i , xmm1 );
1283 y.store( i+SIMDSIZE, xmm2 );
// One SIMD vector of rows per iteration.
1286 for( ; i<ipos; i+=SIMDSIZE )
1298 for(
size_t j=jbegin; j<jend; ++j ) {
1299 xmm1 += A.load(i,j) *
set( x[j] );
// Scalar tail for the rows in [ipos,M); only executed when `remainder` is set.
1305 for( ; remainder && i<M; ++i )
1317 for(
size_t j=jbegin; j<jend; ++j ) {
1318 value += A(i,j) * x[j];
// Large-kernel overload that simply forwards to selectDefaultAddAssignKernel.
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
1341 template<
typename VT1
1345 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1347 selectDefaultAddAssignKernel( y, A, x );
// Vectorized large-kernel overload for y += A * x with blocking over rows and columns.
// iblock is 32768 / sizeof(ElementType) rows; jblock is 8 columns when N < iblock and 4
// otherwise. Within each (ii,jj) block the rows are processed in chunks of 8, 4, 3, 2 and 1
// SIMD vector(s), and each chunk's partial sums are added onto the current content of y.
// NOTE(review): iend, x1, `remainder` and SIMDSIZE are defined on lines outside this excerpt.
1366 template<
typename VT1
1370 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1374 const size_t M( A.rows() );
1375 const size_t N( A.columns() );
1377 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
1378 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1382 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1383 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Clamp the current block to the matrix dimensions.
1385 const size_t jend(
min( jj+jblock, N ) );
1386 const size_t itmp(
min( ii+iblock, M ) );
// ipos: end of the part of the row range that can be handled with full SIMD vectors.
1391 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1392 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Eight SIMD vectors of rows per iteration.
1398 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1400 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1402 for(
size_t j=jj; j<jend; ++j ) {
1404 xmm1 += A.load(i ,j) * x1;
1405 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1406 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1407 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1408 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
1409 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
1410 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
1411 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
1414 y.store( i , y.load(i ) + xmm1 );
1415 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1416 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1417 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1418 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
1419 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
1420 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
1421 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
// Four SIMD vectors of rows per iteration.
1424 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1428 for(
size_t j=jj; j<jend; ++j ) {
1430 xmm1 += A.load(i ,j) * x1;
1431 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1432 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1433 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1436 y.store( i , y.load(i ) + xmm1 );
1437 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1438 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1439 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
// Three SIMD vectors of rows per iteration.
1442 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1446 for(
size_t j=jj; j<jend; ++j ) {
1448 xmm1 += A.load(i ,j) * x1;
1449 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1450 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1453 y.store( i , y.load(i ) + xmm1 );
1454 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1455 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
// Two SIMD vectors of rows per iteration.
1458 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1462 for(
size_t j=jj; j<jend; ++j ) {
1464 xmm1 += A.load(i ,j) * x1;
1465 xmm2 += A.load(i+SIMDSIZE,j) * x1;
1468 y.store( i , y.load(i ) + xmm1 );
1469 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
// One SIMD vector of rows per iteration.
1472 for( ; i<ipos; i+=SIMDSIZE )
1476 for(
size_t j=jj; j<jend; ++j ) {
1477 xmm1 += A.load(i,j) *
set( x[j] );
1480 y.store( i, y.load(i) + xmm1 );
// Scalar tail for the rows in [ipos,iend); only executed when `remainder` is set.
1483 for( ; remainder && i<iend; ++i )
1487 for(
size_t j=jj; j<jend; ++j ) {
1488 value += A(i,j) * x[j];
// BLAS-kernel overload that forwards to selectLargeAddAssignKernel instead of calling BLAS.
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
1513 template<
typename VT1
1517 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1519 selectLargeAddAssignKernel( y, A, x );
// BLAS-backed overload, compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled. Two paths are visible: one adds a
// temporary `tmp` to y via addAssign, the other calls gemv with the scalar arguments ET(1)
// and ET(1) - presumably y = 1*A*x + 1*y following the BLAS gemv convention.
// NOTE(review): the condition choosing between the two paths and the definitions of `tmp`
// and ET are on lines not shown here.
1525 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1539 template<
typename VT1
1543 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1550 addAssign( y, tmp );
1553 gemv( y, A, x, ET(1), ET(1) );
// Fragment of the subtraction-assignment routine for this expression (its signature line is
// not visible). The visible part tests whether rhs.mat_ has zero rows or zero columns and
// then dispatches to selectSubAssignKernel with the unwrapped target (~lhs), A and x.
// NOTE(review): A and x are created on lines not shown; the body of the early branch is not
// visible either.
1577 template<
typename VT1 >
1584 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1596 TDMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
// Kernel dispatcher for the subtraction assignment y -= A * x.
// \param y Target vector.
// \param A Left-hand side matrix operand.
// \param x Right-hand side vector operand.
// The visible condition compares A.rows() * A.columns() against TDMATDVECMULT_THRESHOLD and
// selects the small kernel; the BLAS kernel call is presumably the else-branch (the `else`
// line and the first part of the condition are not visible in this excerpt).
1612 template<
typename VT1
1615 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1619 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1620 selectSmallSubAssignKernel( y, A, x );
1622 selectBlasSubAssignKernel( y, A, x );
// Default subtraction-assign kernel working with plain element access: y -= A * x.
// \param y Target vector.
// \param A Left-hand side matrix operand.
// \param x Right-hand side vector operand.
// The outer loop runs over the columns j of A. Branch conditions and the definitions of
// ibegin and iend are on lines not shown in this excerpt.
1641 template<
typename VT1
1644 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1646 const size_t M( A.rows() );
1647 const size_t N( A.columns() );
1649 for(
size_t j=0UL; j<N; ++j )
// Variant that touches only the diagonal element A(j,j).
1653 y[j] -= A(j,j) * x[j];
// inum is the number of rows in [ibegin,iend); masking with size_t(-2) clears the lowest bit,
// so ipos marks the end of the part that can be processed two rows at a time.
1665 const size_t inum( iend - ibegin );
1666 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1668 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1669 y[i ] -= A(i ,j) * x[j];
1670 y[i+1UL] -= A(i+1UL,j) * x[j];
// Single leftover row at ipos (presumably guarded by an ipos < iend check that is not visible).
1673 y[ipos] -= A(ipos,j) * x[j];
// Small-kernel overload that simply forwards to selectDefaultSubAssignKernel.
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
1695 template<
typename VT1
1699 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1701 selectDefaultSubAssignKernel( y, A, x );
// Vectorized small-kernel overload for y -= A * x.
// Rows are processed in SIMD-wide chunks of 8, 4, 3, 2 and finally 1 vector(s) per step. The
// accumulators are initialized from the current content of y (y.load), reduced by
// A.load(...) * x1 for every column j, and written back with y.store.
// NOTE(review): jbegin, jend, x1, `remainder` and SIMDSIZE are defined on lines that are not
// part of this excerpt; the declarations of xmm1 are not visible either.
1721 template<
typename VT1
1725 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1729 const size_t M( A.rows() );
1730 const size_t N( A.columns() );
// ipos: M with the low bits masked off when `remainder` is set (rounds down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise M itself.
1732 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Eight SIMD vectors of rows per iteration.
1737 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1748 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1749 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1750 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1751 SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
1752 SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
1753 SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
1754 SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );
1756 for(
size_t j=jbegin; j<jend; ++j ) {
1758 xmm1 -= A.load(i ,j) * x1;
1759 xmm2 -= A.load(i+SIMDSIZE ,j) * x1;
1760 xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
1761 xmm4 -= A.load(i+SIMDSIZE*3UL,j) * x1;
1762 xmm5 -= A.load(i+SIMDSIZE*4UL,j) * x1;
1763 xmm6 -= A.load(i+SIMDSIZE*5UL,j) * x1;
1764 xmm7 -= A.load(i+SIMDSIZE*6UL,j) * x1;
1765 xmm8 -= A.load(i+SIMDSIZE*7UL,j) * x1;
1768 y.store( i , xmm1 );
1769 y.store( i+SIMDSIZE , xmm2 );
1770 y.store( i+SIMDSIZE*2UL, xmm3 );
1771 y.store( i+SIMDSIZE*3UL, xmm4 );
1772 y.store( i+SIMDSIZE*4UL, xmm5 );
1773 y.store( i+SIMDSIZE*5UL, xmm6 );
1774 y.store( i+SIMDSIZE*6UL, xmm7 );
1775 y.store( i+SIMDSIZE*7UL, xmm8 );
// Four SIMD vectors of rows per iteration.
1778 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1789 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1790 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1791 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1793 for(
size_t j=jbegin; j<jend; ++j ) {
1795 xmm1 -= A.load(i ,j) * x1;
1796 xmm2 -= A.load(i+SIMDSIZE ,j) * x1;
1797 xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
1798 xmm4 -= A.load(i+SIMDSIZE*3UL,j) * x1;
1801 y.store( i , xmm1 );
1802 y.store( i+SIMDSIZE , xmm2 );
1803 y.store( i+SIMDSIZE*2UL, xmm3 );
1804 y.store( i+SIMDSIZE*3UL, xmm4 );
// Three SIMD vectors of rows per iteration.
1807 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1818 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1819 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1821 for(
size_t j=jbegin; j<jend; ++j ) {
1823 xmm1 -= A.load(i ,j) * x1;
1824 xmm2 -= A.load(i+SIMDSIZE ,j) * x1;
1825 xmm3 -= A.load(i+SIMDSIZE*2UL,j) * x1;
1828 y.store( i , xmm1 );
1829 y.store( i+SIMDSIZE , xmm2 );
1830 y.store( i+SIMDSIZE*2UL, xmm3 );
// Two SIMD vectors of rows per iteration.
1833 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1844 SIMDType xmm2( y.load(i+SIMDSIZE) );
1846 for(
size_t j=jbegin; j<jend; ++j ) {
1848 xmm1 -= A.load(i ,j) * x1;
1849 xmm2 -= A.load(i+SIMDSIZE,j) * x1;
1852 y.store( i , xmm1 );
1853 y.store( i+SIMDSIZE, xmm2 );
// One SIMD vector of rows per iteration.
1856 for( ; i<ipos; i+=SIMDSIZE )
1868 for(
size_t j=jbegin; j<jend; ++j ) {
1869 xmm1 -= A.load(i,j) *
set( x[j] );
// Scalar tail for the rows in [ipos,M); only executed when `remainder` is set. The dot
// product is summed into `value` here; how it is applied to y[i] is on a line not shown.
1875 for( ; remainder && i<M; ++i )
1887 for(
size_t j=jbegin; j<jend; ++j ) {
1888 value += A(i,j) * x[j];
// Large-kernel overload that simply forwards to selectDefaultSubAssignKernel.
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
1911 template<
typename VT1
1915 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1917 selectDefaultSubAssignKernel( y, A, x );
// Vectorized large-kernel overload for y -= A * x with blocking over rows and columns.
// iblock is 32768 / sizeof(ElementType) rows; jblock is 8 columns when N < iblock and 4
// otherwise. Within each (ii,jj) block the rows are processed in chunks of 8, 4, 3, 2 and 1
// SIMD vector(s); the partial sums are accumulated with += and then subtracted from the
// current content of y when storing.
// NOTE(review): iend, x1, `remainder` and SIMDSIZE are defined on lines outside this excerpt.
1937 template<
typename VT1
1941 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1945 const size_t M( A.rows() );
1946 const size_t N( A.columns() );
1948 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
1949 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1953 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1954 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Clamp the current block to the matrix dimensions.
1956 const size_t jend(
min( jj+jblock, N ) );
1957 const size_t itmp(
min( ii+iblock, M ) );
// ipos: end of the part of the row range that can be handled with full SIMD vectors.
1962 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1963 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Eight SIMD vectors of rows per iteration.
1969 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1971 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1973 for(
size_t j=jj; j<jend; ++j ) {
1975 xmm1 += A.load(i ,j) * x1;
1976 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
1977 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
1978 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
1979 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
1980 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
1981 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
1982 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
1985 y.store( i , y.load(i ) - xmm1 );
1986 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
1987 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
1988 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
1989 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5 );
1990 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6 );
1991 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7 );
1992 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8 );
// Four SIMD vectors of rows per iteration.
1995 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1999 for(
size_t j=jj; j<jend; ++j ) {
2001 xmm1 += A.load(i ,j) * x1;
2002 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2003 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2004 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2007 y.store( i , y.load(i ) - xmm1 );
2008 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2009 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2010 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
// Three SIMD vectors of rows per iteration.
2013 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2017 for(
size_t j=jj; j<jend; ++j ) {
2019 xmm1 += A.load(i ,j) * x1;
2020 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2021 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2024 y.store( i , y.load(i ) - xmm1 );
2025 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2026 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
// Two SIMD vectors of rows per iteration.
2029 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2033 for(
size_t j=jj; j<jend; ++j ) {
2035 xmm1 += A.load(i ,j) * x1;
2036 xmm2 += A.load(i+SIMDSIZE,j) * x1;
2039 y.store( i , y.load(i ) - xmm1 );
2040 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2 );
// One SIMD vector of rows per iteration.
2043 for( ; i<ipos; i+=SIMDSIZE )
2047 for(
size_t j=jj; j<jend; ++j ) {
2048 xmm1 += A.load(i,j) *
set( x[j] );
2051 y.store( i, y.load(i) - xmm1 );
// Scalar tail for the rows in [ipos,iend); only executed when `remainder` is set. The dot
// product is summed into `value` here; how it is applied to y[i] is on a line not shown.
2054 for( ; remainder && i<iend; ++i )
2058 for(
size_t j=jj; j<jend; ++j ) {
2059 value += A(i,j) * x[j];
// BLAS-kernel overload that forwards to selectLargeSubAssignKernel instead of calling BLAS.
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
2084 template<
typename VT1
2088 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2090 selectLargeSubAssignKernel( y, A, x );
// BLAS-backed overload, compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled. Two paths are visible: one
// subtracts a temporary `tmp` from y via subAssign, the other calls gemv with the scalar
// arguments ET(-1) and ET(1) - presumably y = -1*A*x + 1*y following the BLAS gemv convention.
// NOTE(review): the condition choosing between the two paths and the definitions of `tmp`
// and ET are on lines not shown here.
2096 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2110 template<
typename VT1
2114 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2121 subAssign( y, tmp );
2124 gemv( y, A, x, ET(-1), ET(1) );
2148 template<
typename VT1 >
2160 multAssign( ~lhs, tmp );
2182 template<
typename VT1 >
2194 divAssign( ~lhs, tmp );
2218 template<
typename VT1 >
2226 if( rhs.mat_.rows() == 0UL ) {
2229 else if( rhs.mat_.columns() == 0UL ) {
2262 template<
typename VT1 >
2295 template<
typename VT1 >
2303 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2339 template<
typename VT1 >
2347 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2383 template<
typename VT1 >
2420 template<
typename VT1 >
// Opening of a second class template in this header (the line carrying its name is not
// visible in this excerpt). It derives publicly from
// VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false > >,
// i.e. it concerns a TDMatDVecMultExpr combined with a scalar of type ST.
2472 template<
typename MT
2476 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false > >
// Compile-time helper: `value` is true when evaluateMatrix or evaluateVector is set.
// The template parameter T1 does not appear in the visible condition.
2507 template<
typename T1 >
2508 struct UseSMPAssign {
2509 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
// Compile-time helper named UseBlasKernel, here parameterized on four types. Only one line
// of its condition is visible: T1, T2 and T3 must all have simdEnabled set. The remaining
// terms (including any use of T4) are on lines outside this excerpt.
2517 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2518 struct UseBlasKernel {
2524 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time helper named UseVectorizedDefaultKernel, here parameterized on four types.
// The visible part of `value` requires useOptimizedKernels and simdEnabled on T1, T2 and T3;
// further terms (including any use of T4) follow on lines that are not part of this excerpt.
2539 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2540 struct UseVectorizedDefaultKernel {
2541 enum :
bool { value = useOptimizedKernels &&
2543 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2579 MT::simdEnabled && VT::simdEnabled &&
// smpAssignable is set only when neither operand needs intermediate evaluation
// (evaluateMatrix / evaluateVector are both false) and both MT and VT are smpAssignable.
2585 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2586 !evaluateVector && VT::smpAssignable };
2614 return vector_[index] * scalar_;
2626 if( index >= vector_.size() ) {
2629 return (*
this)[index];
// Returns the size of the expression, which is the size of the wrapped vector_ operand.
2638 inline size_t size()
const {
2639 return vector_.size();
// Reports whether the expression can alias with the given address.
// \param alias The address to check.
// \return The result of vector_.canAlias( alias ).
2669 template<
typename T >
2670 inline bool canAlias(
const T* alias )
const {
2671 return vector_.canAlias( alias );
// Reports whether the expression is aliased with the given address.
// \param alias The address to check.
// \return The result of vector_.isAliased( alias ).
2681 template<
typename T >
2682 inline bool isAliased(
const T* alias )
const {
2683 return vector_.isAliased( alias );
2693 return vector_.isAligned();
2708 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
2709 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
// Fragment of an assignment routine for the scaled expression (its signature line is not
// visible). The visible part checks `left` for zero rows and, separately, for zero columns,
// and then dispatches to selectAssignKernel, passing rhs.scalar_ as the scaling factor.
// NOTE(review): `left`, A and x are created on lines not shown; the bodies of the two early
// branches are not visible either.
2731 template<
typename VT1 >
2741 if( left.rows() == 0UL ) {
2744 else if( left.columns() == 0UL ) {
2757 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatcher for the scaled assignment y = scalar * A * x.
// \param y Target vector.
// \param A Left-hand side matrix operand.
// \param x Right-hand side vector operand.
// \param scalar Scaling factor.
// The visible condition compares A.rows() * A.columns() against TDMATDVECMULT_THRESHOLD and
// selects the small kernel; the BLAS kernel call is presumably the else-branch (the `else`
// line and the first part of the condition are not visible in this excerpt).
2772 template<
typename VT1
2776 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2780 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
2781 selectSmallAssignKernel( y, A, x, scalar );
2783 selectBlasAssignKernel( y, A, x, scalar );
// Default assign kernel for the scaled product, working with plain element access.
// \param y Target vector.
// \param A Left-hand side matrix operand.
// \param x Right-hand side vector operand.
// \param scalar Scaling factor.
// Only parts of the body are visible in this excerpt; the enclosing loops and the branch
// conditions (including the definitions of ibegin and iend) are on lines not shown.
// NOTE(review): of the visible statements only the diagonal variant multiplies by `scalar`;
// the scaling of the other results presumably happens on lines not shown - confirm.
2801 template<
typename VT1
2805 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2807 const size_t M( A.rows() );
2808 const size_t N( A.columns() );
// Initialization of y from the first column of A.
2817 y[i] = A(i,0UL) * x[0UL];
// Variant that touches only the diagonal element A(j,j); the scalar is applied directly.
2825 y[j] = A(j,j) * x[j] * scalar;
// inum is the number of rows in [ibegin,iend); masking with size_t(-2) clears the lowest bit,
// so ipos marks the end of the part that can be processed two rows at a time.
2837 const size_t inum( iend - ibegin );
2838 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2840 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2841 y[i ] += A(i ,j) * x[j];
2842 y[i+1UL] += A(i+1UL,j) * x[j];
// Single leftover row at ipos (presumably guarded by an ipos < iend check that is not visible).
2845 y[ipos] += A(ipos,j) * x[j];
// Assignment (not accumulation) for row iend; the guarding condition is not visible here.
2848 y[iend] = A(iend,j) * x[j];
// Small-kernel overload that simply forwards to selectDefaultAssignKernel (with scalar).
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
2881 template<
typename VT1
2886 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2888 selectDefaultAssignKernel( y, A, x, scalar );
// Vectorized small-kernel overload for y = scalar * A * x.
// Rows are processed in SIMD-wide chunks of 8, 4, 3, 2 and finally 1 vector(s) per step. For
// each chunk the j-loop accumulates A.load(...) * x1 into one accumulator per vector; the
// accumulators are multiplied by `factor` (the scalar broadcast via set()) when stored to y.
// NOTE(review): jbegin, jend, x1, `remainder` and SIMDSIZE are defined on lines that are not
// part of this excerpt.
2906 template<
typename VT1
2911 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2915 const size_t M( A.rows() );
2916 const size_t N( A.columns() );
// ipos: M with the low bits masked off when `remainder` is set (rounds down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise M itself.
2918 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
2921 const SIMDType factor(
set( scalar ) );
// Eight SIMD vectors of rows per iteration.
2925 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
2935 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2937 for(
size_t j=jbegin; j<jend; ++j ) {
2939 xmm1 += A.load(i ,j) * x1;
2940 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2941 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2942 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2943 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
2944 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
2945 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
2946 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
2949 y.store( i , xmm1*factor );
2950 y.store( i+SIMDSIZE , xmm2*factor );
2951 y.store( i+SIMDSIZE*2UL, xmm3*factor );
2952 y.store( i+SIMDSIZE*3UL, xmm4*factor );
2953 y.store( i+SIMDSIZE*4UL, xmm5*factor );
2954 y.store( i+SIMDSIZE*5UL, xmm6*factor );
2955 y.store( i+SIMDSIZE*6UL, xmm7*factor );
2956 y.store( i+SIMDSIZE*7UL, xmm8*factor );
// Four SIMD vectors of rows per iteration.
2959 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2971 for(
size_t j=jbegin; j<jend; ++j ) {
2973 xmm1 += A.load(i ,j) * x1;
2974 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
2975 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
2976 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
2979 y.store( i , xmm1*factor );
2980 y.store( i+SIMDSIZE , xmm2*factor );
2981 y.store( i+SIMDSIZE*2UL, xmm3*factor );
2982 y.store( i+SIMDSIZE*3UL, xmm4*factor );
// Three SIMD vectors of rows per iteration.
2985 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2997 for(
size_t j=jbegin; j<jend; ++j ) {
2999 xmm1 += A.load(i ,j) * x1;
3000 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3001 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3004 y.store( i , xmm1*factor );
3005 y.store( i+SIMDSIZE , xmm2*factor );
3006 y.store( i+SIMDSIZE*2UL, xmm3*factor );
// Two SIMD vectors of rows per iteration.
3009 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3021 for(
size_t j=jbegin; j<jend; ++j ) {
3023 xmm1 += A.load(i ,j) * x1;
3024 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3027 y.store( i , xmm1*factor );
3028 y.store( i+SIMDSIZE, xmm2*factor );
// One SIMD vector of rows per iteration.
3031 for( ; i<ipos; i+=SIMDSIZE )
3043 for(
size_t j=jbegin; j<jend; ++j ) {
3045 xmm1 += A.load(i,j) * x1;
3048 y.store( i, xmm1*factor );
// Scalar tail for the rows in [ipos,M); only executed when `remainder` is set.
3051 for( ; remainder && i<M; ++i )
3063 for(
size_t j=jbegin; j<jend; ++j ) {
3064 value += A(i,j) * x[j];
3067 y[i] = value * scalar;
// Large-kernel overload that simply forwards to selectDefaultAssignKernel (with scalar).
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
3086 template<
typename VT1
3091 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3093 selectDefaultAssignKernel( y, A, x, scalar );
// Vectorized large-kernel overload for y = scalar * A * x with blocking over rows and columns.
// iblock is 32768 / sizeof(ElementType) rows; jblock is 8 columns when N < iblock and 4
// otherwise. Within each (ii,jj) block the rows are processed in chunks of 8, 4, 3, 2 and 1
// SIMD vector(s); each chunk's partial sums are multiplied by `factor` (the scalar broadcast
// via set()) and added onto the current content of y.
// NOTE(review): because the results are accumulated via y.load(...) + xmm*factor, y is
// presumably reset before these loops on a line not shown here - confirm. iend, x1,
// `remainder` and SIMDSIZE are likewise defined outside this excerpt.
3111 template<
typename VT1
3116 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3120 const size_t M( A.rows() );
3121 const size_t N( A.columns() );
3123 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
3124 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3128 const SIMDType factor(
set( scalar ) );
3132 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3133 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Clamp the current block to the matrix dimensions.
3135 const size_t jend(
min( jj+jblock, N ) );
3136 const size_t itmp(
min( ii+iblock, M ) );
// ipos: end of the part of the row range that can be handled with full SIMD vectors.
3141 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3142 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Eight SIMD vectors of rows per iteration.
3148 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3150 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3152 for(
size_t j=jj; j<jend; ++j ) {
3154 xmm1 += A.load(i ,j) * x1;
3155 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3156 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3157 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3158 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3159 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3160 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3161 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3164 y.store( i , y.load(i ) + xmm1*factor );
3165 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3166 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3167 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3168 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3169 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3170 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3171 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
// Four SIMD vectors of rows per iteration.
3174 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3178 for(
size_t j=jj; j<jend; ++j ) {
3180 xmm1 += A.load(i ,j) * x1;
3181 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3182 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3183 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3186 y.store( i , y.load(i ) + xmm1*factor );
3187 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3188 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3189 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
// Three SIMD vectors of rows per iteration.
3192 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3196 for(
size_t j=jj; j<jend; ++j ) {
3198 xmm1 += A.load(i ,j) * x1;
3199 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3200 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3203 y.store( i , y.load(i ) + xmm1*factor );
3204 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3205 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
// Two SIMD vectors of rows per iteration.
3208 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3212 for(
size_t j=jj; j<jend; ++j ) {
3214 xmm1 += A.load(i ,j) * x1;
3215 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3218 y.store( i , y.load(i ) + xmm1*factor );
3219 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
// One SIMD vector of rows per iteration.
3222 for( ; i<ipos; i+=SIMDSIZE )
3226 for(
size_t j=jj; j<jend; ++j ) {
3227 xmm1 += A.load(i,j) *
set( x[j] );
3230 y.store( i, y.load(i) + xmm1*factor );
// Scalar tail for the rows in [ipos,iend); only executed when `remainder` is set.
3233 for( ; remainder && i<iend; ++i )
3237 for(
size_t j=jj; j<jend; ++j ) {
3238 value += A(i,j) * x[j];
3241 y[i] += value * scalar;
// BLAS-kernel overload that forwards to selectLargeAssignKernel (with scalar) instead of
// calling BLAS.
// NOTE(review): the return-type line that selects this overload is not visible in this excerpt.
3262 template<
typename VT1
3267 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3269 selectLargeAssignKernel( y, A, x, scalar );
// BLAS-backed overload, compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled. Two paths are visible: one assigns
// scalar * x to y, the other calls gemv with the scalar arguments ET(scalar) and ET(0) -
// presumably y = scalar*A*x + 0*y following the BLAS gemv convention.
// NOTE(review): the condition choosing between the two paths and the definition of ET are
// on lines not shown here.
3274 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3288 template<
typename VT1
3293 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3298 assign( y, scalar * x );
3302 gemv( y, A, x, ET(scalar), ET(0) );
3320 template<
typename VT1 >
3332 assign( ~lhs, tmp );
// Fragment of the addition-assignment routine for the scaled expression (its signature line
// is not visible). The visible part tests whether `left` has zero rows or zero columns and
// then dispatches to selectAddAssignKernel, passing rhs.scalar_ as the scaling factor.
// NOTE(review): `left`, A and x are created on lines not shown; the body of the early branch
// is not visible either.
3348 template<
typename VT1 >
3358 if( left.rows() == 0UL || left.columns() == 0UL ) {
3370 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatcher for the scaled addition assignment y += scalar * A * x.
// \param y Target vector.
// \param A Left-hand side matrix operand.
// \param x Right-hand side vector operand.
// \param scalar Scaling factor.
// The visible condition compares A.rows() * A.columns() against TDMATDVECMULT_THRESHOLD and
// selects the small kernel; the BLAS kernel call is presumably the else-branch (the `else`
// line and the first part of the condition are not visible in this excerpt).
3385 template<
typename VT1
3389 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3393 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3394 selectSmallAddAssignKernel( y, A, x, scalar );
3396 selectBlasAddAssignKernel( y, A, x, scalar );
3414 template<
typename VT1
3418 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3420 y.addAssign( A * x * scalar );
3438 template<
typename VT1
3443 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3445 selectDefaultAddAssignKernel( y, A, x, scalar );
3464 template<
typename VT1
3469 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3473 const size_t M( A.rows() );
3474 const size_t N( A.columns() );
3476 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
3479 const SIMDType factor(
set( scalar ) );
3483 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3493 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3495 for(
size_t j=jbegin; j<jend; ++j ) {
3497 xmm1 += A.load(i ,j) * x1;
3498 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3499 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3500 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3501 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3502 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3503 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3504 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3507 y.store( i , y.load(i ) + xmm1*factor );
3508 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3509 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3510 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3511 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3512 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3513 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3514 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3517 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3529 for(
size_t j=jbegin; j<jend; ++j ) {
3531 xmm1 += A.load(i ,j) * x1;
3532 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3533 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3534 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3537 y.store( i , y.load(i ) + xmm1*factor );
3538 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3539 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3540 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3543 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3555 for(
size_t j=jbegin; j<jend; ++j ) {
3557 xmm1 += A.load(i ,j) * x1;
3558 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3559 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3562 y.store( i , y.load(i ) + xmm1*factor );
3563 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3564 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3567 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3579 for(
size_t j=jbegin; j<jend; ++j ) {
3581 xmm1 += A.load(i ,j) * x1;
3582 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3585 y.store( i , y.load(i ) + xmm1*factor );
3586 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3589 for( ; i<ipos; i+=SIMDSIZE )
3601 for(
size_t j=jbegin; j<jend; ++j ) {
3602 xmm1 += A.load(i,j) *
set( x[j] );
3605 y.store( i, y.load(i) + xmm1*factor );
3608 for( ; remainder && i<M; ++i )
3620 for(
size_t j=jbegin; j<jend; ++j ) {
3621 value += A(i,j) * x[j];
3624 y[i] += value * scalar;
3643 template<
typename VT1
3648 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3650 selectDefaultAddAssignKernel( y, A, x, scalar );
3669 template<
typename VT1
3674 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3678 const size_t M( A.rows() );
3679 const size_t N( A.columns() );
3681 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
3682 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3686 const SIMDType factor(
set( scalar ) );
3688 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3689 for(
size_t jj=0UL; jj<N; jj+=jblock )
3691 const size_t jend(
min( jj+jblock, N ) );
3692 const size_t itmp(
min( ii+iblock, M ) );
3697 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3698 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
3704 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3706 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3708 for(
size_t j=jj; j<jend; ++j ) {
3710 xmm1 += A.load(i ,j) * x1;
3711 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3712 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3713 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3714 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
3715 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
3716 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
3717 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
3720 y.store( i , y.load(i ) + xmm1*factor );
3721 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3722 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3723 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3724 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3725 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3726 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3727 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3730 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3734 for(
size_t j=jj; j<jend; ++j ) {
3736 xmm1 += A.load(i ,j) * x1;
3737 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3738 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3739 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
3742 y.store( i , y.load(i ) + xmm1*factor );
3743 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3744 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3745 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3748 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3752 for(
size_t j=jj; j<jend; ++j ) {
3754 xmm1 += A.load(i ,j) * x1;
3755 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
3756 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
3759 y.store( i , y.load(i ) + xmm1*factor );
3760 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3761 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3764 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3768 for(
size_t j=jj; j<jend; ++j ) {
3770 xmm1 += A.load(i ,j) * x1;
3771 xmm2 += A.load(i+SIMDSIZE,j) * x1;
3774 y.store( i , y.load(i ) + xmm1*factor );
3775 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3778 for( ; i<ipos; i+=SIMDSIZE )
3782 for(
size_t j=jj; j<jend; ++j ) {
3783 xmm1 += A.load(i,j) *
set( x[j] );
3786 y.store( i, y.load(i) + xmm1*factor );
3789 for( ; remainder && i<iend; ++i )
3793 for(
size_t j=jj; j<jend; ++j ) {
3794 value += A(i,j) * x[j];
3797 y[i] += value * scalar;
3818 template<
typename VT1
3823 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3825 selectLargeAddAssignKernel( y, A, x, scalar );
3830 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3844 template<
typename VT1
3849 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3856 addAssign( y, tmp );
3859 gemv( y, A, x, ET(scalar), ET(1) );
3881 template<
typename VT1 >
3891 if( left.rows() == 0UL || left.columns() == 0UL ) {
3903 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.
scalar_ );
3918 template<
typename VT1
3922 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3926 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3927 selectSmallSubAssignKernel( y, A, x, scalar );
3929 selectBlasSubAssignKernel( y, A, x, scalar );
3947 template<
typename VT1
3951 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3953 y.subAssign( A * x * scalar );
3971 template<
typename VT1
3976 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3978 selectDefaultSubAssignKernel( y, A, x, scalar );
3997 template<
typename VT1
4002 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4006 const size_t M( A.rows() );
4007 const size_t N( A.columns() );
4009 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
4012 const SIMDType factor(
set( scalar ) );
4016 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4026 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4028 for(
size_t j=jbegin; j<jend; ++j ) {
4030 xmm1 += A.load(i ,j) * x1;
4031 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4032 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4033 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4034 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
4035 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
4036 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
4037 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
4040 y.store( i , y.load(i ) - xmm1*factor );
4041 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4042 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4043 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4044 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4045 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4046 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4047 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
4050 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4062 for(
size_t j=jbegin; j<jend; ++j ) {
4064 xmm1 += A.load(i ,j) * x1;
4065 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4066 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4067 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4070 y.store( i , y.load(i ) - xmm1*factor );
4071 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4072 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4073 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4076 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4088 for(
size_t j=jbegin; j<jend; ++j ) {
4090 xmm1 += A.load(i ,j) * x1;
4091 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4092 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4095 y.store( i , y.load(i ) - xmm1*factor );
4096 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4097 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4100 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4112 for(
size_t j=jbegin; j<jend; ++j ) {
4114 xmm1 += A.load(i ,j) * x1;
4115 xmm2 += A.load(i+SIMDSIZE,j) * x1;
4118 y.store( i , y.load(i ) - xmm1*factor );
4119 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
4122 for( ; i<ipos; i+=SIMDSIZE )
4134 for(
size_t j=jbegin; j<jend; ++j ) {
4135 xmm1 += A.load(i,j) *
set( x[j] );
4138 y.store( i, y.load(i) - xmm1*factor );
4141 for( ; remainder && i<M; ++i )
4153 for(
size_t j=jbegin; j<jend; ++j ) {
4154 value += A(i,j) * x[j];
4157 y[i] -= value * scalar;
4176 template<
typename VT1
4181 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4183 selectDefaultSubAssignKernel( y, A, x, scalar );
4202 template<
typename VT1
4207 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4211 const size_t M( A.rows() );
4212 const size_t N( A.columns() );
4214 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
4215 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
4219 const SIMDType factor(
set( scalar ) );
4221 for(
size_t ii=0U; ii<M; ii+=iblock ) {
4222 for(
size_t jj=0UL; jj<N; jj+=jblock )
4224 const size_t jend(
min( jj+jblock, N ) );
4225 const size_t itmp(
min( ii+iblock, M ) );
4230 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4231 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
4237 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4239 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4241 for(
size_t j=jj; j<jend; ++j ) {
4243 xmm1 += A.load(i ,j) * x1;
4244 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4245 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4246 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4247 xmm5 += A.load(i+SIMDSIZE*4UL,j) * x1;
4248 xmm6 += A.load(i+SIMDSIZE*5UL,j) * x1;
4249 xmm7 += A.load(i+SIMDSIZE*6UL,j) * x1;
4250 xmm8 += A.load(i+SIMDSIZE*7UL,j) * x1;
4253 y.store( i , y.load(i ) - xmm1*factor );
4254 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4255 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4256 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4257 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4258 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4259 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4260 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
4263 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4267 for(
size_t j=jj; j<jend; ++j ) {
4269 xmm1 += A.load(i ,j) * x1;
4270 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4271 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4272 xmm4 += A.load(i+SIMDSIZE*3UL,j) * x1;
4275 y.store( i , y.load(i ) - xmm1*factor );
4276 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4277 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4278 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4281 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4285 for(
size_t j=jj; j<jend; ++j ) {
4287 xmm1 += A.load(i ,j) * x1;
4288 xmm2 += A.load(i+SIMDSIZE ,j) * x1;
4289 xmm3 += A.load(i+SIMDSIZE*2UL,j) * x1;
4292 y.store( i , y.load(i ) - xmm1*factor );
4293 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4294 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4297 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4301 for(
size_t j=jj; j<jend; ++j ) {
4303 xmm1 += A.load(i ,j) * x1;
4304 xmm2 += A.load(i+SIMDSIZE,j) * x1;
4307 y.store( i , y.load(i ) - xmm1*factor );
4308 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
4311 for( ; i<ipos; i+=SIMDSIZE )
4315 for(
size_t j=jj; j<jend; ++j ) {
4316 xmm1 += A.load(i,j) *
set( x[j] );
4319 y.store( i, y.load(i) - xmm1*factor );
4322 for( ; remainder && i<iend; ++i )
4326 for(
size_t j=jj; j<jend; ++j ) {
4327 value += A(i,j) * x[j];
4330 y[i] -= value * scalar;
4351 template<
typename VT1
4356 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4358 selectLargeSubAssignKernel( y, A, x, scalar );
4363 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4377 template<
typename VT1
4382 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4389 subAssign( y, tmp );
4392 gemv( y, A, x, ET(-scalar), ET(1) );
4414 template<
typename VT1 >
4426 multAssign( ~lhs, tmp );
4446 template<
typename VT1 >
4458 divAssign( ~lhs, tmp );
4480 template<
typename VT1 >
4491 if( left.rows() == 0UL ) {
4494 else if( left.columns() == 0UL ) {
4525 template<
typename VT1 >
4556 template<
typename VT1 >
4567 if( left.rows() == 0UL || left.columns() == 0UL ) {
4601 template<
typename VT1 >
4612 if( left.rows() == 0UL || left.columns() == 0UL ) {
4647 template<
typename VT1 >
4682 template<
typename VT1 >
4757 template<
typename MT
4759 inline decltype(
auto)
4786 template<
typename MT,
typename VT >
4787 struct Size< TDMatDVecMultExpr<MT,VT>, 0UL >
4788 :
public Size<MT,0UL>
4804 template<
typename MT,
typename VT >
4805 struct IsAligned< TDMatDVecMultExpr<MT,VT> >
4806 :
public And< IsAligned<MT>, IsAligned<VT> >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDVecMultExpr.h:220
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:71
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:210
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDMatDVecMultExpr.h:297
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:132
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:129
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:134
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template. The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:519
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1903
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:87
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types. This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1950
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
Expression object for transpose dense matrix-dense vector multiplications. The TDMatDVecMultExpr class...
Definition: Forward.h:149
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:206
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data. This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types. This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for the memory layout of data types. This type trait tests whether the given data t...
Definition: IsContiguous.h:86
Header file for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDVecMultExpr.h:374
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDVecMultExpr.h:364
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDVecMultExpr.h:211
Compile time check for data types with padding. This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDMatDVecMultExpr.h:263
Header file for the HasSIMDAdd type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDVecMultExpr.h:342
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDVecMultExpr.h:320
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:506
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:133
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type. In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:87
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:108
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:130
Constraint on the data type.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDVecMultExpr.h:209
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:590
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions. The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
If_< IsExpression< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:214
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates. The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDVecMultExpr.h:208
Header file for the IsSIMDCombinable type trait.
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:223
Header file for the IsContiguous type trait.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:154
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:131
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDMatDVecMultExpr.h:310
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:207
Compile time check for built-in data types. This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:131
Constraints on the storage order of matrix types.
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template. The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
Compile time check for complex types. This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications. The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:104
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of vectors and matrices. The Size type trait evaluates the size of...
Definition: Size.h:80
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:330
Base class for sparse vectors. The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
Header file for the IsComplexFloat type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDVecMultExpr.h:354
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation. The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Constraint on the data type.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions. The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
If_< IsExpression< VT >, const VT, const VT &> RightOperand
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:217
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDVecMultExpr.h:386
Header file for the MatVecMultExpr base class.
Constraint on the data type.
TDMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the TDMatDVecMultExpr class.
Definition: TDMatDVecMultExpr.h:249
Header file for the Size type trait.
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: TDMatDVecMultExpr.h:387
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the function trace functionality.