35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_ 120 template<
typename VT
122 class TDVecDMatMultExpr
123 :
public TVecMatMultExpr< DenseVector< TDVecDMatMultExpr<VT,MT>, true > >
124 ,
private Computation
153 template<
typename T1 >
154 struct UseSMPAssign {
155 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
165 template<
typename T1,
typename T2,
typename T3 >
166 struct UseBlasKernel {
172 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
188 template<
typename T1,
typename T2,
typename T3 >
189 struct UseVectorizedDefaultKernel {
190 enum :
bool { value = useOptimizedKernels &&
192 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
228 VT::simdEnabled && MT::simdEnabled &&
233 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
234 !evaluateMatrix && MT::smpAssignable };
267 return vec_[index] *
mat_(index,index);
295 if( index >=
mat_.columns() ) {
298 return (*
this)[index];
307 inline size_t size() const noexcept {
308 return mat_.columns();
338 template<
typename T >
339 inline bool canAlias(
const T* alias )
const noexcept {
340 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
350 template<
typename T >
351 inline bool isAliased(
const T* alias )
const noexcept {
352 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
362 return vec_.isAligned() &&
mat_.isAligned();
376 (
mat_.rows() *
mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
377 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
400 template<
typename VT1 >
407 if( rhs.mat_.rows() == 0UL ) {
411 else if( rhs.mat_.columns() == 0UL ) {
423 TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
439 template<
typename VT1
442 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
446 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
447 selectSmallAssignKernel( y, x, A );
449 selectBlasAssignKernel( y, x, A );
468 template<
typename VT1
471 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
473 const size_t M( A.rows() );
474 const size_t N( A.columns() );
483 for(
size_t j=jbegin; j<N; ++j ) {
484 y[j] = x[0UL] * A(0UL,j);
492 y[i] = x[i] * A(i,i);
504 const size_t jnum( jend - jbegin );
505 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
507 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
508 y[j ] += x[i] * A(i,j );
509 y[j+1UL] += x[i] * A(i,j+1UL);
512 y[jpos] += x[i] * A(i,jpos);
515 y[jend] = x[i] * A(i,jend);
541 template<
typename VT1
545 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
547 selectDefaultAssignKernel( y, x, A );
566 template<
typename VT1
570 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
574 const size_t M( A.rows() );
575 const size_t N( A.columns() );
577 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
582 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
592 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
594 for(
size_t i=ibegin; i<iend; ++i ) {
596 xmm1 += x1 * A.load(i,j );
597 xmm2 += x1 * A.load(i,j+SIMDSIZE );
598 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
599 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
600 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
601 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
602 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
603 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
607 y.store( j+SIMDSIZE , xmm2 );
608 y.store( j+SIMDSIZE*2UL, xmm3 );
609 y.store( j+SIMDSIZE*3UL, xmm4 );
610 y.store( j+SIMDSIZE*4UL, xmm5 );
611 y.store( j+SIMDSIZE*5UL, xmm6 );
612 y.store( j+SIMDSIZE*6UL, xmm7 );
613 y.store( j+SIMDSIZE*7UL, xmm8 );
616 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
628 for(
size_t i=ibegin; i<iend; ++i ) {
630 xmm1 += x1 * A.load(i,j );
631 xmm2 += x1 * A.load(i,j+SIMDSIZE );
632 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
633 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
637 y.store( j+SIMDSIZE , xmm2 );
638 y.store( j+SIMDSIZE*2UL, xmm3 );
639 y.store( j+SIMDSIZE*3UL, xmm4 );
642 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
654 for(
size_t i=ibegin; i<iend; ++i ) {
656 xmm1 += x1 * A.load(i,j );
657 xmm2 += x1 * A.load(i,j+SIMDSIZE );
658 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
662 y.store( j+SIMDSIZE , xmm2 );
663 y.store( j+SIMDSIZE*2UL, xmm3 );
666 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
678 for(
size_t i=ibegin; i<iend; ++i ) {
680 xmm1 += x1 * A.load(i,j );
681 xmm2 += x1 * A.load(i,j+SIMDSIZE);
685 y.store( j+SIMDSIZE, xmm2 );
688 for( ; j<jpos; j+=SIMDSIZE )
700 for(
size_t i=ibegin; i<iend; ++i ) {
701 xmm1 +=
set( x[i] ) * A.load(i,j);
707 for( ; remainder && j<N; ++j )
719 for(
size_t i=ibegin; i<iend; ++i ) {
720 value += x[i] * A(i,j);
743 template<
typename VT1
747 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
749 selectDefaultAssignKernel( y, x, A );
768 template<
typename VT1
772 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
776 const size_t M( A.rows() );
777 const size_t N( A.columns() );
779 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
780 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
786 for(
size_t jj=0U; jj<N; jj+=jblock ) {
787 for(
size_t ii=0UL; ii<M; ii+=iblock )
789 const size_t iend(
min( ii+iblock, M ) );
790 const size_t jtmp(
min( jj+jblock, N ) );
795 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
796 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
802 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
804 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
806 for(
size_t i=ii; i<iend; ++i ) {
808 xmm1 += x1 * A.load(i,j );
809 xmm2 += x1 * A.load(i,j+SIMDSIZE );
810 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
811 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
812 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
813 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
814 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
815 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
818 y.store( j , y.load(j ) + xmm1 );
819 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
820 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
821 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
822 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
823 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
824 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
825 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
828 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
832 for(
size_t i=ii; i<iend; ++i ) {
834 xmm1 += x1 * A.load(i,j );
835 xmm2 += x1 * A.load(i,j+SIMDSIZE );
836 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
837 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
840 y.store( j , y.load(j ) + xmm1 );
841 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
842 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
843 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
846 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
850 for(
size_t i=ii; i<iend; ++i ) {
852 xmm1 += x1 * A.load(i,j );
853 xmm2 += x1 * A.load(i,j+SIMDSIZE );
854 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
857 y.store( j , y.load(j ) + xmm1 );
858 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
859 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
862 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
866 for(
size_t i=ii; i<iend; ++i ) {
868 xmm1 += x1 * A.load(i,j );
869 xmm2 += x1 * A.load(i,j+SIMDSIZE);
872 y.store( j , y.load(j ) + xmm1 );
873 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
876 for( ; j<jpos; j+=SIMDSIZE )
880 for(
size_t i=ii; i<iend; ++i ) {
881 xmm1 +=
set( x[i] ) * A.load(i,j);
884 y.store( j, y.load(j) + xmm1 );
887 for( ; remainder && j<jend; ++j )
891 for(
size_t i=ii; i<iend; ++i ) {
892 value += x[i] * A(i,j);
917 template<
typename VT1
921 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
923 selectLargeAssignKernel( y, x, A );
929 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 943 template<
typename VT1
947 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
956 gemv( y, x, A, ET(1), ET(0) );
976 template<
typename VT1 >
1006 template<
typename VT1 >
1013 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1025 TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1041 template<
typename VT1
1044 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1048 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1049 selectSmallAddAssignKernel( y, x, A );
1051 selectBlasAddAssignKernel( y, x, A );
1070 template<
typename VT1
1073 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1075 const size_t M( A.rows() );
1076 const size_t N( A.columns() );
1078 for(
size_t i=0UL; i<M; ++i )
1082 y[i] += x[i] * A(i,i);
1094 const size_t jnum( jend - jbegin );
1095 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1097 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1098 y[j ] += x[i] * A(i,j );
1099 y[j+1UL] += x[i] * A(i,j+1UL);
1102 y[jpos] += x[i] * A(i,jpos);
1124 template<
typename VT1
1128 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1130 selectDefaultAddAssignKernel( y, x, A );
1149 template<
typename VT1
1153 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1157 const size_t M( A.rows() );
1158 const size_t N( A.columns() );
1160 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
1165 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1176 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1177 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1178 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1179 SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1180 SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1181 SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1182 SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1184 for(
size_t i=ibegin; i<iend; ++i ) {
1186 xmm1 += x1 * A.load(i,j );
1187 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1188 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1189 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1190 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1191 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1192 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1193 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1196 y.store( j , xmm1 );
1197 y.store( j+SIMDSIZE , xmm2 );
1198 y.store( j+SIMDSIZE*2UL, xmm3 );
1199 y.store( j+SIMDSIZE*3UL, xmm4 );
1200 y.store( j+SIMDSIZE*4UL, xmm5 );
1201 y.store( j+SIMDSIZE*5UL, xmm6 );
1202 y.store( j+SIMDSIZE*6UL, xmm7 );
1203 y.store( j+SIMDSIZE*7UL, xmm8 );
1206 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1217 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1218 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1219 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1221 for(
size_t i=ibegin; i<iend; ++i ) {
1223 xmm1 += x1 * A.load(i,j );
1224 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1225 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1226 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1229 y.store( j , xmm1 );
1230 y.store( j+SIMDSIZE , xmm2 );
1231 y.store( j+SIMDSIZE*2UL, xmm3 );
1232 y.store( j+SIMDSIZE*3UL, xmm4 );
1235 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1246 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1247 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1249 for(
size_t i=ibegin; i<iend; ++i ) {
1251 xmm1 += x1 * A.load(i,j );
1252 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1253 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1256 y.store( j , xmm1 );
1257 y.store( j+SIMDSIZE , xmm2 );
1258 y.store( j+SIMDSIZE*2UL, xmm3 );
1261 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1272 SIMDType xmm2( y.load(j+SIMDSIZE) );
1274 for(
size_t i=ibegin; i<iend; ++i ) {
1276 xmm1 += x1 * A.load(i,j );
1277 xmm2 += x1 * A.load(i,j+SIMDSIZE);
1280 y.store( j , xmm1 );
1281 y.store( j+SIMDSIZE, xmm2 );
1284 for( ; j<jpos; j+=SIMDSIZE )
1296 for(
size_t i=ibegin; i<iend; ++i ) {
1297 xmm1 +=
set( x[i] ) * A.load(i,j);
1303 for( ; remainder && j<N; ++j )
1315 for(
size_t i=ibegin; i<iend; ++i ) {
1316 value += x[i] * A(i,j);
1339 template<
typename VT1
1343 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1345 selectDefaultAddAssignKernel( y, x, A );
1364 template<
typename VT1
1368 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1372 const size_t M( A.rows() );
1373 const size_t N( A.columns() );
1375 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
1376 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1380 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1381 for(
size_t ii=0UL; ii<M; ii+=iblock )
1383 const size_t iend(
min( ii+iblock, M ) );
1384 const size_t jtmp(
min( jj+jblock, N ) );
1389 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1390 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
1396 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1398 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1400 for(
size_t i=ii; i<iend; ++i ) {
1402 xmm1 += x1 * A.load(i,j );
1403 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1404 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1405 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1406 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1407 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1408 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1409 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1412 y.store( j , y.load(j ) + xmm1 );
1413 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1414 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1415 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1416 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
1417 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
1418 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
1419 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
1422 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1426 for(
size_t i=ii; i<iend; ++i ) {
1428 xmm1 += x1 * A.load(i,j );
1429 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1430 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1431 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1434 y.store( j , y.load(j ) + xmm1 );
1435 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1436 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1437 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1440 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1444 for(
size_t i=ii; i<iend; ++i ) {
1446 xmm1 += x1 * A.load(i,j );
1447 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1448 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1451 y.store( j , y.load(j ) + xmm1 );
1452 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1453 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1456 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1460 for(
size_t i=ii; i<iend; ++i ) {
1462 xmm1 += x1 * A.load(i,j );
1463 xmm2 += x1 * A.load(i,j+SIMDSIZE);
1466 y.store( j , y.load(j ) + xmm1 );
1467 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
1470 for( ; j<jpos; j+=SIMDSIZE )
1474 for(
size_t i=ii; i<iend; ++i ) {
1475 xmm1 +=
set( x[i] ) * A.load(i,j);
1478 y.store( j, y.load(j) + xmm1 );
1481 for( ; remainder && j<jend; ++j )
1485 for(
size_t i=ii; i<iend; ++i ) {
1486 value += x[i] * A(i,j);
1511 template<
typename VT1
1515 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1517 selectLargeAddAssignKernel( y, x, A );
1523 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1537 template<
typename VT1
1541 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1548 addAssign( y, tmp );
1551 gemv( y, x, A, ET(1), ET(1) );
1575 template<
typename VT1 >
1582 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1594 TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1610 template<
typename VT1
1613 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1617 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1618 selectSmallSubAssignKernel( y, x, A );
1620 selectBlasSubAssignKernel( y, x, A );
1639 template<
typename VT1
1642 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1644 const size_t M( A.rows() );
1645 const size_t N( A.columns() );
1647 for(
size_t i=0UL; i<M; ++i )
1651 y[i] -= x[i] * A(i,i);
1663 const size_t jnum( jend - jbegin );
1664 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1666 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1667 y[j ] -= x[i] * A(i,j );
1668 y[j+1UL] -= x[i] * A(i,j+1UL);
1671 y[jpos] -= x[i] * A(i,jpos);
1693 template<
typename VT1
1697 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1699 selectDefaultSubAssignKernel( y, x, A );
1719 template<
typename VT1
1723 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1727 const size_t M( A.rows() );
1728 const size_t N( A.columns() );
1730 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
1735 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1746 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1747 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1748 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1749 SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1750 SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1751 SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1752 SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1754 for(
size_t i=ibegin; i<iend; ++i ) {
1756 xmm1 -= x1 * A.load(i,j );
1757 xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1758 xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1759 xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1760 xmm5 -= x1 * A.load(i,j+SIMDSIZE*4UL);
1761 xmm6 -= x1 * A.load(i,j+SIMDSIZE*5UL);
1762 xmm7 -= x1 * A.load(i,j+SIMDSIZE*6UL);
1763 xmm8 -= x1 * A.load(i,j+SIMDSIZE*7UL);
1766 y.store( j , xmm1 );
1767 y.store( j+SIMDSIZE , xmm2 );
1768 y.store( j+SIMDSIZE*2UL, xmm3 );
1769 y.store( j+SIMDSIZE*3UL, xmm4 );
1770 y.store( j+SIMDSIZE*4UL, xmm5 );
1771 y.store( j+SIMDSIZE*5UL, xmm6 );
1772 y.store( j+SIMDSIZE*6UL, xmm7 );
1773 y.store( j+SIMDSIZE*7UL, xmm8 );
1776 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1787 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1788 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1789 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1791 for(
size_t i=ibegin; i<iend; ++i ) {
1793 xmm1 -= x1 * A.load(i,j );
1794 xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1795 xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1796 xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1799 y.store( j , xmm1 );
1800 y.store( j+SIMDSIZE , xmm2 );
1801 y.store( j+SIMDSIZE*2UL, xmm3 );
1802 y.store( j+SIMDSIZE*3UL, xmm4 );
1805 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1816 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1817 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1819 for(
size_t i=ibegin; i<iend; ++i ) {
1821 xmm1 -= x1 * A.load(i,j );
1822 xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1823 xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1826 y.store( j , xmm1 );
1827 y.store( j+SIMDSIZE , xmm2 );
1828 y.store( j+SIMDSIZE*2UL, xmm3 );
1831 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1842 SIMDType xmm2( y.load(j+SIMDSIZE) );
1844 for(
size_t i=ibegin; i<iend; ++i ) {
1846 xmm1 -= x1 * A.load(i,j );
1847 xmm2 -= x1 * A.load(i,j+SIMDSIZE);
1850 y.store( j , xmm1 );
1851 y.store( j+SIMDSIZE, xmm2 );
1854 for( ; j<jpos; j+=SIMDSIZE )
1866 for(
size_t i=ibegin; i<iend; ++i ) {
1867 xmm1 -=
set( x[i] ) * A.load(i,j);
1873 for( ; remainder && j<N; ++j )
1885 for(
size_t i=ibegin; i<iend; ++i ) {
1886 value += x[i] * A(i,j);
1909 template<
typename VT1
1913 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1915 selectDefaultSubAssignKernel( y, x, A );
1935 template<
typename VT1
1939 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1943 const size_t M( A.rows() );
1944 const size_t N( A.columns() );
1946 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
1947 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1951 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1952 for(
size_t ii=0UL; ii<M; ii+=iblock )
1954 const size_t iend(
min( ii+iblock, M ) );
1955 const size_t jtmp(
min( jj+jblock, N ) );
1960 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1961 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
1967 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1969 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1971 for(
size_t i=ii; i<iend; ++i ) {
1973 xmm1 += x1 * A.load(i,j );
1974 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1975 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1976 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1977 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1978 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1979 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1980 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1983 y.store( j , y.load(j ) - xmm1 );
1984 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
1985 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
1986 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
1987 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5 );
1988 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6 );
1989 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7 );
1990 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8 );
1993 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1997 for(
size_t i=ii; i<iend; ++i ) {
1999 xmm1 += x1 * A.load(i,j );
2000 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2001 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2002 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2005 y.store( j , y.load(j ) - xmm1 );
2006 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2007 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2008 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
2011 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2015 for(
size_t i=ii; i<iend; ++i ) {
2017 xmm1 += x1 * A.load(i,j );
2018 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2019 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2022 y.store( j , y.load(j ) - xmm1 );
2023 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2024 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2027 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2031 for(
size_t i=ii; i<iend; ++i ) {
2033 xmm1 += x1 * A.load(i,j );
2034 xmm2 += x1 * A.load(i,j+SIMDSIZE);
2037 y.store( j , y.load(j ) - xmm1 );
2038 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2 );
2041 for( ; j<jpos; j+=SIMDSIZE )
2045 for(
size_t i=ii; i<iend; ++i ) {
2046 xmm1 +=
set( x[i] ) * A.load(i,j);
2049 y.store( j, y.load(j) - xmm1 );
2052 for( ; remainder && j<jend; ++j )
2056 for(
size_t i=ii; i<iend; ++i ) {
2057 value += x[i] * A(i,j);
2082 template<
typename VT1
2086 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2088 selectLargeSubAssignKernel( y, x, A );
2094 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2108 template<
typename VT1
2112 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2119 subAssign( y, tmp );
2122 gemv( y, x, A, ET(-1), ET(1) );
2146 template<
typename VT1 >
2158 multAssign( ~lhs, tmp );
2180 template<
typename VT1 >
2192 divAssign( ~lhs, tmp );
2216 template<
typename VT1 >
2224 if( rhs.mat_.rows() == 0UL ) {
2228 else if( rhs.mat_.columns() == 0UL ) {
2260 template<
typename VT1 >
2293 template<
typename VT1 >
2301 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2337 template<
typename VT1 >
2345 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2381 template<
typename VT1 >
2418 template<
typename VT1 >
2469 template<
typename VT
2473 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true > >
2504 template<
typename T1 >
2505 struct UseSMPAssign {
2506 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
2514 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2515 struct UseBlasKernel {
2521 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2536 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2537 struct UseVectorizedDefaultKernel {
2538 enum :
bool { value = useOptimizedKernels &&
2540 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2576 VT::simdEnabled && MT::simdEnabled &&
2582 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2583 !evaluateMatrix && MT::smpAssignable };
2611 return vector_[index] * scalar_;
2623 if( index >= vector_.size() ) {
2626 return (*
this)[index];
2635 inline size_t size()
const {
2636 return vector_.size();
2666 template<
typename T >
2667 inline bool canAlias(
const T* alias )
const {
2668 return vector_.canAlias( alias );
2678 template<
typename T >
2679 inline bool isAliased(
const T* alias )
const {
2680 return vector_.isAliased( alias );
2690 return vector_.isAligned();
2705 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2706 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
2728 template<
typename VT1 >
2738 if( right.rows() == 0UL ) {
2742 else if( right.columns() == 0UL ) {
2754 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.
scalar_ );
2769 template<
typename VT1
2773 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2777 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2778 selectSmallAssignKernel( y, x, A, scalar );
2780 selectBlasAssignKernel( y, x, A, scalar );
2798 template<
typename VT1
2802 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2804 const size_t M( A.rows() );
2805 const size_t N( A.columns() );
2814 y[j] = x[0UL] * A(0UL,j);
2822 y[i] = x[i] * A(i,i) * scalar;
2834 const size_t jnum( jend - jbegin );
2835 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2837 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2838 y[j ] += x[i] * A(i,j );
2839 y[j+1UL] += x[i] * A(i,j+1UL);
2842 y[jpos] += x[i] * A(i,jpos);
2845 y[jend] = x[i] * A(i,jend);
2878 template<
typename VT1
2883 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2885 selectDefaultAssignKernel( y, x, A, scalar );
2903 template<
typename VT1
2908 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2912 const size_t M( A.rows() );
2913 const size_t N( A.columns() );
2915 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
2918 const SIMDType factor(
set( scalar ) );
2922 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
2932 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2934 for(
size_t i=ibegin; i<iend; ++i ) {
2936 xmm1 += x1 * A.load(i,j );
2937 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2938 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2939 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2940 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
2941 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
2942 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
2943 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
2946 y.store( j , xmm1*factor );
2947 y.store( j+SIMDSIZE , xmm2*factor );
2948 y.store( j+SIMDSIZE*2UL, xmm3*factor );
2949 y.store( j+SIMDSIZE*3UL, xmm4*factor );
2950 y.store( j+SIMDSIZE*4UL, xmm5*factor );
2951 y.store( j+SIMDSIZE*5UL, xmm6*factor );
2952 y.store( j+SIMDSIZE*6UL, xmm7*factor );
2953 y.store( j+SIMDSIZE*7UL, xmm8*factor );
2956 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2968 for(
size_t i=ibegin; i<iend; ++i ) {
2970 xmm1 += x1 * A.load(i,j );
2971 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2972 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2973 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2976 y.store( j , xmm1*factor );
2977 y.store( j+SIMDSIZE , xmm2*factor );
2978 y.store( j+SIMDSIZE*2UL, xmm3*factor );
2979 y.store( j+SIMDSIZE*3UL, xmm4*factor );
2982 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2994 for(
size_t i=ibegin; i<iend; ++i ) {
2996 xmm1 += x1 * A.load(i,j );
2997 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2998 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3001 y.store( j , xmm1*factor );
3002 y.store( j+SIMDSIZE , xmm2*factor );
3003 y.store( j+SIMDSIZE*2UL, xmm3*factor );
3006 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3018 for(
size_t i=ibegin; i<iend; ++i ) {
3020 xmm1 += x1 * A.load(i,j );
3021 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3024 y.store( j , xmm1*factor );
3025 y.store( j+SIMDSIZE, xmm2*factor );
3028 for( ; j<jpos; j+=SIMDSIZE )
3040 for(
size_t i=ibegin; i<iend; ++i ) {
3041 xmm1 +=
set( x[i] ) * A.load(i,j);
3044 y.store( j, xmm1*factor );
3047 for( ; remainder && j<N; ++j )
3059 for(
size_t i=ibegin; i<iend; ++i ) {
3060 value += x[i] * A(i,j);
3063 y[j] = value * scalar;
3082 template<
typename VT1
3087 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3089 selectDefaultAssignKernel( y, x, A, scalar );
3107 template<
typename VT1
3112 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3116 const size_t M( A.rows() );
3117 const size_t N( A.columns() );
3119 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
3120 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3122 const SIMDType factor(
set( scalar ) );
3128 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3129 for(
size_t ii=0UL; ii<M; ii+=iblock )
3131 const size_t iend(
min( ii+iblock, M ) );
3132 const size_t jtmp(
min( jj+jblock, N ) );
3137 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3138 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
3144 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3146 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3148 for(
size_t i=ii; i<iend; ++i ) {
3150 xmm1 += x1 * A.load(i,j );
3151 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3152 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3153 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3154 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3155 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3156 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3157 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3160 y.store( j , y.load(j ) + xmm1*factor );
3161 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3162 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3163 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3164 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3165 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3166 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3167 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3170 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3174 for(
size_t i=ii; i<iend; ++i ) {
3176 xmm1 += x1 * A.load(i,j );
3177 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3178 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3179 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3182 y.store( j , y.load(j ) + xmm1*factor );
3183 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3184 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3185 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3188 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3192 for(
size_t i=ii; i<iend; ++i ) {
3194 xmm1 += x1 * A.load(i,j );
3195 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3196 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3199 y.store( j , y.load(j ) + xmm1*factor );
3200 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3201 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3204 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3208 for(
size_t i=ii; i<iend; ++i ) {
3210 xmm1 += x1 * A.load(i,j );
3211 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3214 y.store( j , y.load(j ) + xmm1*factor );
3215 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3218 for( ; j<jpos; j+=SIMDSIZE )
3222 for(
size_t i=ii; i<iend; ++i ) {
3223 xmm1 +=
set( x[i] ) * A.load(i,j);
3226 y.store( j, y.load(j) + xmm1*factor );
3229 for( ; remainder && j<jend; ++j )
3233 for(
size_t i=ii; i<iend; ++i ) {
3234 value += x[i] * A(i,j);
3237 y[j] += value * scalar;
3257 template<
typename VT1
3262 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3264 selectLargeAssignKernel( y, x, A, scalar );
3269 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3283 template<
typename VT1
3288 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3293 assign( y, scalar * x );
3297 gemv( y, x, A, ET(scalar), ET(0) );
3315 template<
typename VT1 >
3327 assign( ~lhs, tmp );
3343 template<
typename VT1 >
3353 if( right.rows() == 0UL || right.columns() == 0UL ) {
3365 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.
scalar_ );
3380 template<
typename VT1
3384 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3388 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3389 selectSmallAddAssignKernel( y, x, A, scalar );
3391 selectBlasAddAssignKernel( y, x, A, scalar );
3409 template<
typename VT1
3413 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3415 y.addAssign( x * A * scalar );
3433 template<
typename VT1
3438 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3440 selectDefaultAddAssignKernel( y, x, A, scalar );
3459 template<
typename VT1
3464 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3468 const size_t M( A.rows() );
3469 const size_t N( A.columns() );
3471 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
3474 const SIMDType factor(
set( scalar ) );
3478 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3488 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3490 for(
size_t i=ibegin; i<iend; ++i ) {
3492 xmm1 += x1 * A.load(i,j );
3493 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3494 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3495 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3496 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3497 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3498 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3499 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3502 y.store( j , y.load(j ) + xmm1*factor );
3503 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3504 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3505 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3506 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3507 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3508 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3509 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3512 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3524 for(
size_t i=ibegin; i<iend; ++i ) {
3526 xmm1 += x1 * A.load(i,j );
3527 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3528 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3529 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3532 y.store( j , y.load(j ) + xmm1*factor );
3533 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3534 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3535 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3538 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3550 for(
size_t i=ibegin; i<iend; ++i ) {
3552 xmm1 += x1 * A.load(i,j );
3553 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3554 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3557 y.store( j , y.load(j ) + xmm1*factor );
3558 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3559 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3562 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3574 for(
size_t i=ibegin; i<iend; ++i ) {
3576 xmm1 += x1 * A.load(i,j );
3577 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3580 y.store( j , y.load(j ) + xmm1*factor );
3581 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3584 for( ; j<jpos; j+=SIMDSIZE )
3596 for(
size_t i=ibegin; i<iend; ++i ) {
3597 xmm1 +=
set( x[i] ) * A.load(i,j);
3600 y.store( j, y.load(j) + xmm1*factor );
3603 for( ; remainder && j<N; ++j )
3615 for(
size_t i=ibegin; i<iend; ++i ) {
3616 value += x[i] * A(i,j);
3619 y[j] += value * scalar;
3638 template<
typename VT1
3643 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3645 selectDefaultAddAssignKernel( y, x, A, scalar );
3664 template<
typename VT1
3669 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3673 const size_t M( A.rows() );
3674 const size_t N( A.columns() );
3676 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
3677 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3679 const SIMDType factor(
set( scalar ) );
3683 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3684 for(
size_t ii=0UL; ii<M; ii+=iblock )
3686 const size_t iend(
min( ii+iblock, M ) );
3687 const size_t jtmp(
min( jj+jblock, N ) );
3692 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3693 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
3699 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3701 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3703 for(
size_t i=ii; i<iend; ++i ) {
3705 xmm1 += x1 * A.load(i,j );
3706 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3707 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3708 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3709 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3710 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3711 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3712 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3715 y.store( j , y.load(j ) + xmm1*factor );
3716 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3717 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3718 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3719 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3720 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3721 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3722 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3725 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3729 for(
size_t i=ii; i<iend; ++i ) {
3731 xmm1 += x1 * A.load(i,j );
3732 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3733 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3734 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3737 y.store( j , y.load(j ) + xmm1*factor );
3738 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3739 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3740 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3743 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3747 for(
size_t i=ii; i<iend; ++i ) {
3749 xmm1 += x1 * A.load(i,j );
3750 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3751 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3754 y.store( j , y.load(j ) + xmm1*factor );
3755 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3756 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3759 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3763 for(
size_t i=ii; i<iend; ++i ) {
3765 xmm1 += x1 * A.load(i,j );
3766 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3769 y.store( j , y.load(j ) + xmm1*factor );
3770 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3773 for( ; j<jpos; j+=SIMDSIZE )
3777 for(
size_t i=ii; i<iend; ++i ) {
3778 xmm1 +=
set( x[i] ) * A.load(i,j);
3781 y.store( j, y.load(j) + xmm1*factor );
3784 for( ; remainder && j<jend; ++j )
3788 for(
size_t i=ii; i<iend; ++i ) {
3789 value += x[i] * A(i,j);
3792 y[j] += value * scalar;
3813 template<
typename VT1
3818 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3820 selectLargeAddAssignKernel( y, x, A, scalar );
3825 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3839 template<
typename VT1
3844 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3851 addAssign( y, tmp );
3854 gemv( y, x, A, ET(scalar), ET(1) );
3876 template<
typename VT1 >
3886 if( right.rows() == 0UL || right.columns() == 0UL ) {
3898 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.
scalar_ );
3913 template<
typename VT1
3917 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3921 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3922 selectSmallSubAssignKernel( y, x, A, scalar );
3924 selectBlasSubAssignKernel( y, x, A, scalar );
3942 template<
typename VT1
3946 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3948 y.subAssign( x * A * scalar );
3966 template<
typename VT1
3971 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3973 selectDefaultSubAssignKernel( y, x, A, scalar );
3992 template<
typename VT1
3997 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4001 const size_t M( A.rows() );
4002 const size_t N( A.columns() );
4004 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
4007 const SIMDType factor(
set( scalar ) );
4011 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4021 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4023 for(
size_t i=ibegin; i<iend; ++i ) {
4025 xmm1 += x1 * A.load(i,j );
4026 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4027 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4028 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4029 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4030 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4031 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4032 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4035 y.store( j , y.load(j ) - xmm1*factor );
4036 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4037 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4038 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4039 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4040 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4041 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4042 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4045 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4057 for(
size_t i=ibegin; i<iend; ++i ) {
4059 xmm1 += x1 * A.load(i,j );
4060 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4061 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4062 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4065 y.store( j , y.load(j ) - xmm1*factor );
4066 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4067 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4068 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4071 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4083 for(
size_t i=ibegin; i<iend; ++i ) {
4085 xmm1 += x1 * A.load(i,j );
4086 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4087 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4090 y.store( j , y.load(j ) - xmm1*factor );
4091 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4092 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4095 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4107 for(
size_t i=ibegin; i<iend; ++i ) {
4109 xmm1 += x1 * A.load(i,j );
4110 xmm2 += x1 * A.load(i,j+SIMDSIZE);
4113 y.store( j , y.load(j ) - xmm1*factor );
4114 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4117 for( ; j<jpos; j+=SIMDSIZE )
4129 for(
size_t i=ibegin; i<iend; ++i ) {
4130 xmm1 +=
set( x[i] ) * A.load(i,j);
4133 y.store( j, y.load(j) - xmm1*factor );
4136 for( ; remainder && j<N; ++j )
4148 for(
size_t i=ibegin; i<iend; ++i ) {
4149 value += x[i] * A(i,j);
4152 y[j] -= value * scalar;
4171 template<
typename VT1
4176 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4178 selectDefaultSubAssignKernel( y, x, A, scalar );
4197 template<
typename VT1
4202 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4206 const size_t M( A.rows() );
4207 const size_t N( A.columns() );
4209 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
4210 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4212 const SIMDType factor(
set( scalar ) );
4216 for(
size_t jj=0U; jj<N; jj+=jblock ) {
4217 for(
size_t ii=0UL; ii<M; ii+=iblock )
4219 const size_t iend(
min( ii+iblock, M ) );
4220 const size_t jtmp(
min( jj+jblock, N ) );
4225 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4226 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
4232 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4234 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4236 for(
size_t i=ii; i<iend; ++i ) {
4238 xmm1 += x1 * A.load(i,j );
4239 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4240 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4241 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4242 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4243 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4244 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4245 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4248 y.store( j , y.load(j ) - xmm1*factor );
4249 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4250 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4251 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4252 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4253 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4254 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4255 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4258 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4262 for(
size_t i=ii; i<iend; ++i ) {
4264 xmm1 += x1 * A.load(i,j );
4265 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4266 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4267 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4270 y.store( j , y.load(j ) - xmm1*factor );
4271 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4272 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4273 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4276 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4280 for(
size_t i=ii; i<iend; ++i ) {
4282 xmm1 += x1 * A.load(i,j );
4283 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4284 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4287 y.store( j , y.load(j ) - xmm1*factor );
4288 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4289 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4292 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4296 for(
size_t i=ii; i<iend; ++i ) {
4298 xmm1 += x1 * A.load(i,j );
4299 xmm2 += x1 * A.load(i,j+SIMDSIZE);
4302 y.store( j , y.load(j ) - xmm1*factor );
4303 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4306 for( ; j<jpos; j+=SIMDSIZE )
4310 for(
size_t i=ii; i<iend; ++i ) {
4311 xmm1 +=
set( x[i] ) * A.load(i,j);
4314 y.store( j, y.load(j) - xmm1*factor );
4317 for( ; remainder && j<jend; ++j )
4321 for(
size_t i=ii; i<iend; ++i ) {
4322 value += x[i] * A(i,j);
4325 y[j] -= value * scalar;
4346 template<
typename VT1
4351 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4353 selectLargeSubAssignKernel( y, x, A, scalar );
4358 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4372 template<
typename VT1
4377 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4384 subAssign( y, tmp );
4387 gemv( y, x, A, ET(-scalar), ET(1) );
4409 template<
typename VT1 >
4421 multAssign( ~lhs, tmp );
4441 template<
typename VT1 >
4453 divAssign( ~lhs, tmp );
4475 template<
typename VT1 >
4486 if( right.rows() == 0UL ) {
4490 else if( right.columns() == 0UL ) {
4520 template<
typename VT1 >
4551 template<
typename VT1 >
4562 if( right.rows() == 0UL || right.columns() == 0UL ) {
4596 template<
typename VT1 >
4607 if( right.rows() == 0UL || right.columns() == 0UL ) {
4642 template<
typename VT1 >
4677 template<
typename VT1 >
4752 template<
typename VT
4754 inline decltype(
auto)
4761 if( (~vec).
size() != (~mat).
rows() ) {
4793 template<
typename VT
4795 inline decltype(
auto)
4816 template<
typename VT,
typename MT >
4817 struct Size< TDVecDMatMultExpr<VT,MT> >
4834 template<
typename VT,
typename MT >
4835 struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4836 :
public BoolConstant< And< IsAligned<VT>, IsAligned<MT> >::value >
If_< IsExpression< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:213
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Header file for the generic min algorithm.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:206
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:129
Header file for the IsDiagonal type trait.
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:294
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:128
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:521
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1762
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:361
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:383
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Column< MT > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:124
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:327
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1809
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:351
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:131
Header file for the generic max algorithm.
Header file for the DisableIf class template.
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:132
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:208
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:262
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the Columns type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:207
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:88
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsTriangular type trait.
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:384
decltype(auto) operator*(const DenseMatrix< MT1, false > &lhs, const DenseMatrix< MT2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( ).
Definition: DMatDMatMultExpr.h:8893
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:133
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:248
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:222
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:593
If_< IsExpression< MT >, const MT, const MT &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:216
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:371
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:209
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:219
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:130
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Header file for the TVecMatMultExpr base class.
Expression object for transpose dense vector-dense matrix multiplications.The TDVecDMatMultExpr class...
Definition: Forward.h:156
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm).This compilation switch e...
Definition: BLAS.h:152
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:307
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:339
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:324
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:109
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:210
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:106
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of a vector.The Size type trait evaluates the size of the given v...
Definition: Size.h:74
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:317
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:75
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:205
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.