35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_ 122 template<
typename VT
124 class TDVecDMatMultExpr
125 :
public TVecMatMultExpr< DenseVector< TDVecDMatMultExpr<VT,MT>, true > >
126 ,
private Computation
155 template<
typename T1 >
156 struct UseSMPAssign {
157 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
167 template<
typename T1,
typename T2,
typename T3 >
168 struct UseBlasKernel {
174 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
190 template<
typename T1,
typename T2,
typename T3 >
191 struct UseVectorizedDefaultKernel {
192 enum :
bool { value = useOptimizedKernels &&
194 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
230 VT::simdEnabled && MT::simdEnabled &&
235 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
236 !evaluateMatrix && MT::smpAssignable };
269 return vec_[index] *
mat_(index,index);
299 if( index >=
mat_.columns() ) {
302 return (*
this)[index];
311 inline size_t size() const noexcept {
312 return mat_.columns();
342 template<
typename T >
343 inline bool canAlias(
const T* alias )
const noexcept {
344 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
354 template<
typename T >
355 inline bool isAliased(
const T* alias )
const noexcept {
356 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
366 return vec_.isAligned() &&
mat_.isAligned();
380 (
mat_.rows() *
mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
381 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
404 template<
typename VT1 >
411 if( rhs.mat_.rows() == 0UL ) {
415 else if( rhs.mat_.columns() == 0UL ) {
427 TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
443 template<
typename VT1
446 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
450 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
451 selectSmallAssignKernel( y, x, A );
453 selectBlasAssignKernel( y, x, A );
472 template<
typename VT1
475 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
477 const size_t M( A.rows() );
478 const size_t N( A.columns() );
487 for(
size_t j=jbegin; j<N; ++j ) {
488 y[j] = x[0UL] * A(0UL,j);
496 y[i] = x[i] * A(i,i);
508 const size_t jnum( jend - jbegin );
509 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
511 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
512 y[j ] += x[i] * A(i,j );
513 y[j+1UL] += x[i] * A(i,j+1UL);
516 y[jpos] += x[i] * A(i,jpos);
519 y[jend] = x[i] * A(i,jend);
545 template<
typename VT1
549 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
551 selectDefaultAssignKernel( y, x, A );
570 template<
typename VT1
574 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
578 const size_t M( A.rows() );
579 const size_t N( A.columns() );
581 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
586 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
596 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
598 for(
size_t i=ibegin; i<iend; ++i ) {
600 xmm1 += x1 * A.load(i,j );
601 xmm2 += x1 * A.load(i,j+SIMDSIZE );
602 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
603 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
604 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
605 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
606 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
607 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
611 y.store( j+SIMDSIZE , xmm2 );
612 y.store( j+SIMDSIZE*2UL, xmm3 );
613 y.store( j+SIMDSIZE*3UL, xmm4 );
614 y.store( j+SIMDSIZE*4UL, xmm5 );
615 y.store( j+SIMDSIZE*5UL, xmm6 );
616 y.store( j+SIMDSIZE*6UL, xmm7 );
617 y.store( j+SIMDSIZE*7UL, xmm8 );
620 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
632 for(
size_t i=ibegin; i<iend; ++i ) {
634 xmm1 += x1 * A.load(i,j );
635 xmm2 += x1 * A.load(i,j+SIMDSIZE );
636 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
637 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
641 y.store( j+SIMDSIZE , xmm2 );
642 y.store( j+SIMDSIZE*2UL, xmm3 );
643 y.store( j+SIMDSIZE*3UL, xmm4 );
646 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
658 for(
size_t i=ibegin; i<iend; ++i ) {
660 xmm1 += x1 * A.load(i,j );
661 xmm2 += x1 * A.load(i,j+SIMDSIZE );
662 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
666 y.store( j+SIMDSIZE , xmm2 );
667 y.store( j+SIMDSIZE*2UL, xmm3 );
670 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
682 for(
size_t i=ibegin; i<iend; ++i ) {
684 xmm1 += x1 * A.load(i,j );
685 xmm2 += x1 * A.load(i,j+SIMDSIZE);
689 y.store( j+SIMDSIZE, xmm2 );
692 for( ; j<jpos; j+=SIMDSIZE )
704 for(
size_t i=ibegin; i<iend; ++i ) {
705 xmm1 +=
set( x[i] ) * A.load(i,j);
711 for( ; remainder && j<N; ++j )
723 for(
size_t i=ibegin; i<iend; ++i ) {
724 value += x[i] * A(i,j);
747 template<
typename VT1
751 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
753 selectDefaultAssignKernel( y, x, A );
772 template<
typename VT1
776 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
780 const size_t M( A.rows() );
781 const size_t N( A.columns() );
783 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
784 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
790 for(
size_t jj=0U; jj<N; jj+=jblock ) {
791 for(
size_t ii=0UL; ii<M; ii+=iblock )
793 const size_t iend(
min( ii+iblock, M ) );
794 const size_t jtmp(
min( jj+jblock, N ) );
799 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
800 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
806 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
808 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
810 for(
size_t i=ii; i<iend; ++i ) {
812 xmm1 += x1 * A.load(i,j );
813 xmm2 += x1 * A.load(i,j+SIMDSIZE );
814 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
815 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
816 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
817 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
818 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
819 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
822 y.store( j , y.load(j ) + xmm1 );
823 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
824 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
825 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
826 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
827 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
828 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
829 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
832 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
836 for(
size_t i=ii; i<iend; ++i ) {
838 xmm1 += x1 * A.load(i,j );
839 xmm2 += x1 * A.load(i,j+SIMDSIZE );
840 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
841 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
844 y.store( j , y.load(j ) + xmm1 );
845 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
846 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
847 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
850 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
854 for(
size_t i=ii; i<iend; ++i ) {
856 xmm1 += x1 * A.load(i,j );
857 xmm2 += x1 * A.load(i,j+SIMDSIZE );
858 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
861 y.store( j , y.load(j ) + xmm1 );
862 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
863 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
866 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
870 for(
size_t i=ii; i<iend; ++i ) {
872 xmm1 += x1 * A.load(i,j );
873 xmm2 += x1 * A.load(i,j+SIMDSIZE);
876 y.store( j , y.load(j ) + xmm1 );
877 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
880 for( ; j<jpos; j+=SIMDSIZE )
884 for(
size_t i=ii; i<iend; ++i ) {
885 xmm1 +=
set( x[i] ) * A.load(i,j);
888 y.store( j, y.load(j) + xmm1 );
891 for( ; remainder && j<jend; ++j )
895 for(
size_t i=ii; i<iend; ++i ) {
896 value += x[i] * A(i,j);
921 template<
typename VT1
925 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
927 selectLargeAssignKernel( y, x, A );
933 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 947 template<
typename VT1
951 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
960 gemv( y, x, A, ET(1), ET(0) );
980 template<
typename VT1 >
1010 template<
typename VT1 >
1017 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1029 TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1045 template<
typename VT1
1048 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1052 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1053 selectSmallAddAssignKernel( y, x, A );
1055 selectBlasAddAssignKernel( y, x, A );
1074 template<
typename VT1
1077 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1079 const size_t M( A.rows() );
1080 const size_t N( A.columns() );
1082 for(
size_t i=0UL; i<M; ++i )
1086 y[i] += x[i] * A(i,i);
1098 const size_t jnum( jend - jbegin );
1099 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1101 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1102 y[j ] += x[i] * A(i,j );
1103 y[j+1UL] += x[i] * A(i,j+1UL);
1106 y[jpos] += x[i] * A(i,jpos);
1128 template<
typename VT1
1132 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1134 selectDefaultAddAssignKernel( y, x, A );
1153 template<
typename VT1
1157 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1161 const size_t M( A.rows() );
1162 const size_t N( A.columns() );
1164 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
1169 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1180 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1181 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1182 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1183 SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1184 SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1185 SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1186 SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1188 for(
size_t i=ibegin; i<iend; ++i ) {
1190 xmm1 += x1 * A.load(i,j );
1191 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1192 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1193 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1194 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1195 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1196 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1197 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1200 y.store( j , xmm1 );
1201 y.store( j+SIMDSIZE , xmm2 );
1202 y.store( j+SIMDSIZE*2UL, xmm3 );
1203 y.store( j+SIMDSIZE*3UL, xmm4 );
1204 y.store( j+SIMDSIZE*4UL, xmm5 );
1205 y.store( j+SIMDSIZE*5UL, xmm6 );
1206 y.store( j+SIMDSIZE*6UL, xmm7 );
1207 y.store( j+SIMDSIZE*7UL, xmm8 );
1210 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1221 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1222 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1223 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1225 for(
size_t i=ibegin; i<iend; ++i ) {
1227 xmm1 += x1 * A.load(i,j );
1228 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1229 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1230 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1233 y.store( j , xmm1 );
1234 y.store( j+SIMDSIZE , xmm2 );
1235 y.store( j+SIMDSIZE*2UL, xmm3 );
1236 y.store( j+SIMDSIZE*3UL, xmm4 );
1239 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1250 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1251 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1253 for(
size_t i=ibegin; i<iend; ++i ) {
1255 xmm1 += x1 * A.load(i,j );
1256 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1257 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1260 y.store( j , xmm1 );
1261 y.store( j+SIMDSIZE , xmm2 );
1262 y.store( j+SIMDSIZE*2UL, xmm3 );
1265 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1276 SIMDType xmm2( y.load(j+SIMDSIZE) );
1278 for(
size_t i=ibegin; i<iend; ++i ) {
1280 xmm1 += x1 * A.load(i,j );
1281 xmm2 += x1 * A.load(i,j+SIMDSIZE);
1284 y.store( j , xmm1 );
1285 y.store( j+SIMDSIZE, xmm2 );
1288 for( ; j<jpos; j+=SIMDSIZE )
1300 for(
size_t i=ibegin; i<iend; ++i ) {
1301 xmm1 +=
set( x[i] ) * A.load(i,j);
1307 for( ; remainder && j<N; ++j )
1319 for(
size_t i=ibegin; i<iend; ++i ) {
1320 value += x[i] * A(i,j);
1343 template<
typename VT1
1347 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1349 selectDefaultAddAssignKernel( y, x, A );
1368 template<
typename VT1
1372 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1376 const size_t M( A.rows() );
1377 const size_t N( A.columns() );
1379 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
1380 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1384 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1385 for(
size_t ii=0UL; ii<M; ii+=iblock )
1387 const size_t iend(
min( ii+iblock, M ) );
1388 const size_t jtmp(
min( jj+jblock, N ) );
1393 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1394 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
1400 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1402 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1404 for(
size_t i=ii; i<iend; ++i ) {
1406 xmm1 += x1 * A.load(i,j );
1407 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1408 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1409 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1410 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1411 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1412 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1413 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1416 y.store( j , y.load(j ) + xmm1 );
1417 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1418 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1419 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1420 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
1421 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
1422 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
1423 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
1426 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1430 for(
size_t i=ii; i<iend; ++i ) {
1432 xmm1 += x1 * A.load(i,j );
1433 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1434 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1435 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1438 y.store( j , y.load(j ) + xmm1 );
1439 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1440 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1441 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1444 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1448 for(
size_t i=ii; i<iend; ++i ) {
1450 xmm1 += x1 * A.load(i,j );
1451 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1452 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1455 y.store( j , y.load(j ) + xmm1 );
1456 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1457 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1460 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1464 for(
size_t i=ii; i<iend; ++i ) {
1466 xmm1 += x1 * A.load(i,j );
1467 xmm2 += x1 * A.load(i,j+SIMDSIZE);
1470 y.store( j , y.load(j ) + xmm1 );
1471 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
1474 for( ; j<jpos; j+=SIMDSIZE )
1478 for(
size_t i=ii; i<iend; ++i ) {
1479 xmm1 +=
set( x[i] ) * A.load(i,j);
1482 y.store( j, y.load(j) + xmm1 );
1485 for( ; remainder && j<jend; ++j )
1489 for(
size_t i=ii; i<iend; ++i ) {
1490 value += x[i] * A(i,j);
1515 template<
typename VT1
1519 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1521 selectLargeAddAssignKernel( y, x, A );
1527 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1541 template<
typename VT1
1545 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1552 addAssign( y, tmp );
1555 gemv( y, x, A, ET(1), ET(1) );
1579 template<
typename VT1 >
1586 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1598 TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1614 template<
typename VT1
1617 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1621 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1622 selectSmallSubAssignKernel( y, x, A );
1624 selectBlasSubAssignKernel( y, x, A );
1643 template<
typename VT1
1646 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1648 const size_t M( A.rows() );
1649 const size_t N( A.columns() );
1651 for(
size_t i=0UL; i<M; ++i )
1655 y[i] -= x[i] * A(i,i);
1667 const size_t jnum( jend - jbegin );
1668 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1670 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1671 y[j ] -= x[i] * A(i,j );
1672 y[j+1UL] -= x[i] * A(i,j+1UL);
1675 y[jpos] -= x[i] * A(i,jpos);
1697 template<
typename VT1
1701 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1703 selectDefaultSubAssignKernel( y, x, A );
1723 template<
typename VT1
1727 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1731 const size_t M( A.rows() );
1732 const size_t N( A.columns() );
1734 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
1739 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1750 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1751 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1752 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1753 SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1754 SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1755 SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1756 SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1758 for(
size_t i=ibegin; i<iend; ++i ) {
1760 xmm1 -= x1 * A.load(i,j );
1761 xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1762 xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1763 xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1764 xmm5 -= x1 * A.load(i,j+SIMDSIZE*4UL);
1765 xmm6 -= x1 * A.load(i,j+SIMDSIZE*5UL);
1766 xmm7 -= x1 * A.load(i,j+SIMDSIZE*6UL);
1767 xmm8 -= x1 * A.load(i,j+SIMDSIZE*7UL);
1770 y.store( j , xmm1 );
1771 y.store( j+SIMDSIZE , xmm2 );
1772 y.store( j+SIMDSIZE*2UL, xmm3 );
1773 y.store( j+SIMDSIZE*3UL, xmm4 );
1774 y.store( j+SIMDSIZE*4UL, xmm5 );
1775 y.store( j+SIMDSIZE*5UL, xmm6 );
1776 y.store( j+SIMDSIZE*6UL, xmm7 );
1777 y.store( j+SIMDSIZE*7UL, xmm8 );
1780 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1791 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1792 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1793 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1795 for(
size_t i=ibegin; i<iend; ++i ) {
1797 xmm1 -= x1 * A.load(i,j );
1798 xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1799 xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1800 xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1803 y.store( j , xmm1 );
1804 y.store( j+SIMDSIZE , xmm2 );
1805 y.store( j+SIMDSIZE*2UL, xmm3 );
1806 y.store( j+SIMDSIZE*3UL, xmm4 );
1809 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1820 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1821 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1823 for(
size_t i=ibegin; i<iend; ++i ) {
1825 xmm1 -= x1 * A.load(i,j );
1826 xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1827 xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1830 y.store( j , xmm1 );
1831 y.store( j+SIMDSIZE , xmm2 );
1832 y.store( j+SIMDSIZE*2UL, xmm3 );
1835 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1846 SIMDType xmm2( y.load(j+SIMDSIZE) );
1848 for(
size_t i=ibegin; i<iend; ++i ) {
1850 xmm1 -= x1 * A.load(i,j );
1851 xmm2 -= x1 * A.load(i,j+SIMDSIZE);
1854 y.store( j , xmm1 );
1855 y.store( j+SIMDSIZE, xmm2 );
1858 for( ; j<jpos; j+=SIMDSIZE )
1870 for(
size_t i=ibegin; i<iend; ++i ) {
1871 xmm1 -=
set( x[i] ) * A.load(i,j);
1877 for( ; remainder && j<N; ++j )
1889 for(
size_t i=ibegin; i<iend; ++i ) {
1890 value += x[i] * A(i,j);
1913 template<
typename VT1
1917 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1919 selectDefaultSubAssignKernel( y, x, A );
1939 template<
typename VT1
1943 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1947 const size_t M( A.rows() );
1948 const size_t N( A.columns() );
1950 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
1951 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1955 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1956 for(
size_t ii=0UL; ii<M; ii+=iblock )
1958 const size_t iend(
min( ii+iblock, M ) );
1959 const size_t jtmp(
min( jj+jblock, N ) );
1964 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1965 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
1971 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1973 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1975 for(
size_t i=ii; i<iend; ++i ) {
1977 xmm1 += x1 * A.load(i,j );
1978 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1979 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1980 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1981 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1982 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1983 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1984 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1987 y.store( j , y.load(j ) - xmm1 );
1988 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
1989 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
1990 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
1991 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5 );
1992 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6 );
1993 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7 );
1994 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8 );
1997 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2001 for(
size_t i=ii; i<iend; ++i ) {
2003 xmm1 += x1 * A.load(i,j );
2004 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2005 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2006 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2009 y.store( j , y.load(j ) - xmm1 );
2010 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2011 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2012 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
2015 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2019 for(
size_t i=ii; i<iend; ++i ) {
2021 xmm1 += x1 * A.load(i,j );
2022 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2023 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2026 y.store( j , y.load(j ) - xmm1 );
2027 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2028 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2031 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2035 for(
size_t i=ii; i<iend; ++i ) {
2037 xmm1 += x1 * A.load(i,j );
2038 xmm2 += x1 * A.load(i,j+SIMDSIZE);
2041 y.store( j , y.load(j ) - xmm1 );
2042 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2 );
2045 for( ; j<jpos; j+=SIMDSIZE )
2049 for(
size_t i=ii; i<iend; ++i ) {
2050 xmm1 +=
set( x[i] ) * A.load(i,j);
2053 y.store( j, y.load(j) - xmm1 );
2056 for( ; remainder && j<jend; ++j )
2060 for(
size_t i=ii; i<iend; ++i ) {
2061 value += x[i] * A(i,j);
2086 template<
typename VT1
2090 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2092 selectLargeSubAssignKernel( y, x, A );
2098 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2112 template<
typename VT1
2116 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2123 subAssign( y, tmp );
2126 gemv( y, x, A, ET(-1), ET(1) );
2150 template<
typename VT1 >
2162 multAssign( ~lhs, tmp );
2184 template<
typename VT1 >
2196 divAssign( ~lhs, tmp );
2220 template<
typename VT1 >
2228 if( rhs.mat_.rows() == 0UL ) {
2232 else if( rhs.mat_.columns() == 0UL ) {
2264 template<
typename VT1 >
2297 template<
typename VT1 >
2305 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2341 template<
typename VT1 >
2349 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2385 template<
typename VT1 >
2422 template<
typename VT1 >
2473 template<
typename VT
2477 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true > >
2508 template<
typename T1 >
2509 struct UseSMPAssign {
2510 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
2518 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2519 struct UseBlasKernel {
2525 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2540 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2541 struct UseVectorizedDefaultKernel {
2542 enum :
bool { value = useOptimizedKernels &&
2544 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2580 VT::simdEnabled && MT::simdEnabled &&
2586 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2587 !evaluateMatrix && MT::smpAssignable };
2615 return vector_[index] * scalar_;
2627 if( index >= vector_.size() ) {
2630 return (*
this)[index];
2639 inline size_t size()
const {
2640 return vector_.size();
2670 template<
typename T >
2671 inline bool canAlias(
const T* alias )
const {
2672 return vector_.canAlias( alias );
2682 template<
typename T >
2683 inline bool isAliased(
const T* alias )
const {
2684 return vector_.isAliased( alias );
2694 return vector_.isAligned();
2709 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2710 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
2732 template<
typename VT1 >
2742 if( right.rows() == 0UL ) {
2746 else if( right.columns() == 0UL ) {
2758 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.
scalar_ );
2773 template<
typename VT1
2777 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2781 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2782 selectSmallAssignKernel( y, x, A, scalar );
2784 selectBlasAssignKernel( y, x, A, scalar );
2802 template<
typename VT1
2806 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2808 const size_t M( A.rows() );
2809 const size_t N( A.columns() );
2818 y[j] = x[0UL] * A(0UL,j);
2826 y[i] = x[i] * A(i,i) * scalar;
2838 const size_t jnum( jend - jbegin );
2839 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2841 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2842 y[j ] += x[i] * A(i,j );
2843 y[j+1UL] += x[i] * A(i,j+1UL);
2846 y[jpos] += x[i] * A(i,jpos);
2849 y[jend] = x[i] * A(i,jend);
2882 template<
typename VT1
2887 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2889 selectDefaultAssignKernel( y, x, A, scalar );
2907 template<
typename VT1
2912 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2916 const size_t M( A.rows() );
2917 const size_t N( A.columns() );
2919 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
2922 const SIMDType factor(
set( scalar ) );
2926 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
2936 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2938 for(
size_t i=ibegin; i<iend; ++i ) {
2940 xmm1 += x1 * A.load(i,j );
2941 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2942 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2943 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2944 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
2945 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
2946 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
2947 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
2950 y.store( j , xmm1*factor );
2951 y.store( j+SIMDSIZE , xmm2*factor );
2952 y.store( j+SIMDSIZE*2UL, xmm3*factor );
2953 y.store( j+SIMDSIZE*3UL, xmm4*factor );
2954 y.store( j+SIMDSIZE*4UL, xmm5*factor );
2955 y.store( j+SIMDSIZE*5UL, xmm6*factor );
2956 y.store( j+SIMDSIZE*6UL, xmm7*factor );
2957 y.store( j+SIMDSIZE*7UL, xmm8*factor );
2960 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2972 for(
size_t i=ibegin; i<iend; ++i ) {
2974 xmm1 += x1 * A.load(i,j );
2975 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2976 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2977 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2980 y.store( j , xmm1*factor );
2981 y.store( j+SIMDSIZE , xmm2*factor );
2982 y.store( j+SIMDSIZE*2UL, xmm3*factor );
2983 y.store( j+SIMDSIZE*3UL, xmm4*factor );
2986 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2998 for(
size_t i=ibegin; i<iend; ++i ) {
3000 xmm1 += x1 * A.load(i,j );
3001 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3002 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3005 y.store( j , xmm1*factor );
3006 y.store( j+SIMDSIZE , xmm2*factor );
3007 y.store( j+SIMDSIZE*2UL, xmm3*factor );
3010 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3022 for(
size_t i=ibegin; i<iend; ++i ) {
3024 xmm1 += x1 * A.load(i,j );
3025 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3028 y.store( j , xmm1*factor );
3029 y.store( j+SIMDSIZE, xmm2*factor );
3032 for( ; j<jpos; j+=SIMDSIZE )
3044 for(
size_t i=ibegin; i<iend; ++i ) {
3045 xmm1 +=
set( x[i] ) * A.load(i,j);
3048 y.store( j, xmm1*factor );
3051 for( ; remainder && j<N; ++j )
3063 for(
size_t i=ibegin; i<iend; ++i ) {
3064 value += x[i] * A(i,j);
3067 y[j] = value * scalar;
3086 template<
typename VT1
3091 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3093 selectDefaultAssignKernel( y, x, A, scalar );
3111 template<
typename VT1
3116 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3120 const size_t M( A.rows() );
3121 const size_t N( A.columns() );
3123 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
3124 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3126 const SIMDType factor(
set( scalar ) );
3132 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3133 for(
size_t ii=0UL; ii<M; ii+=iblock )
3135 const size_t iend(
min( ii+iblock, M ) );
3136 const size_t jtmp(
min( jj+jblock, N ) );
3141 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3142 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
3148 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3150 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3152 for(
size_t i=ii; i<iend; ++i ) {
3154 xmm1 += x1 * A.load(i,j );
3155 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3156 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3157 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3158 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3159 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3160 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3161 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3164 y.store( j , y.load(j ) + xmm1*factor );
3165 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3166 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3167 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3168 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3169 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3170 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3171 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3174 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3178 for(
size_t i=ii; i<iend; ++i ) {
3180 xmm1 += x1 * A.load(i,j );
3181 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3182 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3183 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3186 y.store( j , y.load(j ) + xmm1*factor );
3187 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3188 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3189 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3192 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3196 for(
size_t i=ii; i<iend; ++i ) {
3198 xmm1 += x1 * A.load(i,j );
3199 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3200 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3203 y.store( j , y.load(j ) + xmm1*factor );
3204 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3205 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3208 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3212 for(
size_t i=ii; i<iend; ++i ) {
3214 xmm1 += x1 * A.load(i,j );
3215 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3218 y.store( j , y.load(j ) + xmm1*factor );
3219 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3222 for( ; j<jpos; j+=SIMDSIZE )
3226 for(
size_t i=ii; i<iend; ++i ) {
3227 xmm1 +=
set( x[i] ) * A.load(i,j);
3230 y.store( j, y.load(j) + xmm1*factor );
3233 for( ; remainder && j<jend; ++j )
3237 for(
size_t i=ii; i<iend; ++i ) {
3238 value += x[i] * A(i,j);
3241 y[j] += value * scalar;
3261 template<
typename VT1
3266 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3268 selectLargeAssignKernel( y, x, A, scalar );
3273 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3287 template<
typename VT1
3292 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3297 assign( y, scalar * x );
3301 gemv( y, x, A, ET(scalar), ET(0) );
3319 template<
typename VT1 >
3331 assign( ~lhs, tmp );
3347 template<
typename VT1 >
3357 if( right.rows() == 0UL || right.columns() == 0UL ) {
3369 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.
scalar_ );
3384 template<
typename VT1
3388 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3392 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3393 selectSmallAddAssignKernel( y, x, A, scalar );
3395 selectBlasAddAssignKernel( y, x, A, scalar );
3413 template<
typename VT1
3417 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3419 y.addAssign( x * A * scalar );
3437 template<
typename VT1
3442 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3444 selectDefaultAddAssignKernel( y, x, A, scalar );
3463 template<
typename VT1
3468 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3472 const size_t M( A.rows() );
3473 const size_t N( A.columns() );
3475 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
3478 const SIMDType factor(
set( scalar ) );
3482 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3492 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3494 for(
size_t i=ibegin; i<iend; ++i ) {
3496 xmm1 += x1 * A.load(i,j );
3497 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3498 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3499 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3500 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3501 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3502 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3503 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3506 y.store( j , y.load(j ) + xmm1*factor );
3507 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3508 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3509 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3510 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3511 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3512 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3513 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3516 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3528 for(
size_t i=ibegin; i<iend; ++i ) {
3530 xmm1 += x1 * A.load(i,j );
3531 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3532 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3533 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3536 y.store( j , y.load(j ) + xmm1*factor );
3537 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3538 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3539 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3542 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3554 for(
size_t i=ibegin; i<iend; ++i ) {
3556 xmm1 += x1 * A.load(i,j );
3557 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3558 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3561 y.store( j , y.load(j ) + xmm1*factor );
3562 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3563 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3566 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3578 for(
size_t i=ibegin; i<iend; ++i ) {
3580 xmm1 += x1 * A.load(i,j );
3581 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3584 y.store( j , y.load(j ) + xmm1*factor );
3585 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3588 for( ; j<jpos; j+=SIMDSIZE )
3600 for(
size_t i=ibegin; i<iend; ++i ) {
3601 xmm1 +=
set( x[i] ) * A.load(i,j);
3604 y.store( j, y.load(j) + xmm1*factor );
3607 for( ; remainder && j<N; ++j )
3619 for(
size_t i=ibegin; i<iend; ++i ) {
3620 value += x[i] * A(i,j);
3623 y[j] += value * scalar;
3642 template<
typename VT1
3647 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3649 selectDefaultAddAssignKernel( y, x, A, scalar );
3668 template<
typename VT1
3673 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3677 const size_t M( A.rows() );
3678 const size_t N( A.columns() );
3680 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
3681 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3683 const SIMDType factor(
set( scalar ) );
3687 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3688 for(
size_t ii=0UL; ii<M; ii+=iblock )
3690 const size_t iend(
min( ii+iblock, M ) );
3691 const size_t jtmp(
min( jj+jblock, N ) );
3696 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3697 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
3703 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3705 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3707 for(
size_t i=ii; i<iend; ++i ) {
3709 xmm1 += x1 * A.load(i,j );
3710 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3711 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3712 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3713 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3714 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3715 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3716 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3719 y.store( j , y.load(j ) + xmm1*factor );
3720 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3721 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3722 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3723 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3724 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3725 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3726 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3729 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3733 for(
size_t i=ii; i<iend; ++i ) {
3735 xmm1 += x1 * A.load(i,j );
3736 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3737 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3738 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3741 y.store( j , y.load(j ) + xmm1*factor );
3742 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3743 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3744 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3747 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3751 for(
size_t i=ii; i<iend; ++i ) {
3753 xmm1 += x1 * A.load(i,j );
3754 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3755 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3758 y.store( j , y.load(j ) + xmm1*factor );
3759 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3760 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3763 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3767 for(
size_t i=ii; i<iend; ++i ) {
3769 xmm1 += x1 * A.load(i,j );
3770 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3773 y.store( j , y.load(j ) + xmm1*factor );
3774 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3777 for( ; j<jpos; j+=SIMDSIZE )
3781 for(
size_t i=ii; i<iend; ++i ) {
3782 xmm1 +=
set( x[i] ) * A.load(i,j);
3785 y.store( j, y.load(j) + xmm1*factor );
3788 for( ; remainder && j<jend; ++j )
3792 for(
size_t i=ii; i<iend; ++i ) {
3793 value += x[i] * A(i,j);
3796 y[j] += value * scalar;
3817 template<
typename VT1
3822 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3824 selectLargeAddAssignKernel( y, x, A, scalar );
3829 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3843 template<
typename VT1
3848 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3855 addAssign( y, tmp );
3858 gemv( y, x, A, ET(scalar), ET(1) );
3880 template<
typename VT1 >
3890 if( right.rows() == 0UL || right.columns() == 0UL ) {
3902 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.
scalar_ );
3917 template<
typename VT1
3921 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3925 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3926 selectSmallSubAssignKernel( y, x, A, scalar );
3928 selectBlasSubAssignKernel( y, x, A, scalar );
3946 template<
typename VT1
3950 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3952 y.subAssign( x * A * scalar );
3970 template<
typename VT1
3975 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3977 selectDefaultSubAssignKernel( y, x, A, scalar );
3996 template<
typename VT1
4001 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4005 const size_t M( A.rows() );
4006 const size_t N( A.columns() );
4008 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
4011 const SIMDType factor(
set( scalar ) );
4015 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4025 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4027 for(
size_t i=ibegin; i<iend; ++i ) {
4029 xmm1 += x1 * A.load(i,j );
4030 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4031 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4032 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4033 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4034 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4035 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4036 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4039 y.store( j , y.load(j ) - xmm1*factor );
4040 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4041 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4042 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4043 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4044 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4045 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4046 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4049 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4061 for(
size_t i=ibegin; i<iend; ++i ) {
4063 xmm1 += x1 * A.load(i,j );
4064 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4065 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4066 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4069 y.store( j , y.load(j ) - xmm1*factor );
4070 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4071 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4072 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4075 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4087 for(
size_t i=ibegin; i<iend; ++i ) {
4089 xmm1 += x1 * A.load(i,j );
4090 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4091 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4094 y.store( j , y.load(j ) - xmm1*factor );
4095 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4096 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4099 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4111 for(
size_t i=ibegin; i<iend; ++i ) {
4113 xmm1 += x1 * A.load(i,j );
4114 xmm2 += x1 * A.load(i,j+SIMDSIZE);
4117 y.store( j , y.load(j ) - xmm1*factor );
4118 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4121 for( ; j<jpos; j+=SIMDSIZE )
4133 for(
size_t i=ibegin; i<iend; ++i ) {
4134 xmm1 +=
set( x[i] ) * A.load(i,j);
4137 y.store( j, y.load(j) - xmm1*factor );
4140 for( ; remainder && j<N; ++j )
4152 for(
size_t i=ibegin; i<iend; ++i ) {
4153 value += x[i] * A(i,j);
4156 y[j] -= value * scalar;
4175 template<
typename VT1
4180 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4182 selectDefaultSubAssignKernel( y, x, A, scalar );
4201 template<
typename VT1
4206 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4210 const size_t M( A.rows() );
4211 const size_t N( A.columns() );
4213 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
4214 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4216 const SIMDType factor(
set( scalar ) );
4220 for(
size_t jj=0U; jj<N; jj+=jblock ) {
4221 for(
size_t ii=0UL; ii<M; ii+=iblock )
4223 const size_t iend(
min( ii+iblock, M ) );
4224 const size_t jtmp(
min( jj+jblock, N ) );
4229 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4230 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
4236 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4238 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4240 for(
size_t i=ii; i<iend; ++i ) {
4242 xmm1 += x1 * A.load(i,j );
4243 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4244 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4245 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4246 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4247 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4248 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4249 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4252 y.store( j , y.load(j ) - xmm1*factor );
4253 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4254 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4255 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4256 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4257 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4258 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4259 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4262 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4266 for(
size_t i=ii; i<iend; ++i ) {
4268 xmm1 += x1 * A.load(i,j );
4269 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4270 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4271 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4274 y.store( j , y.load(j ) - xmm1*factor );
4275 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4276 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4277 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4280 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4284 for(
size_t i=ii; i<iend; ++i ) {
4286 xmm1 += x1 * A.load(i,j );
4287 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4288 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4291 y.store( j , y.load(j ) - xmm1*factor );
4292 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4293 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4296 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4300 for(
size_t i=ii; i<iend; ++i ) {
4302 xmm1 += x1 * A.load(i,j );
4303 xmm2 += x1 * A.load(i,j+SIMDSIZE);
4306 y.store( j , y.load(j ) - xmm1*factor );
4307 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4310 for( ; j<jpos; j+=SIMDSIZE )
4314 for(
size_t i=ii; i<iend; ++i ) {
4315 xmm1 +=
set( x[i] ) * A.load(i,j);
4318 y.store( j, y.load(j) - xmm1*factor );
4321 for( ; remainder && j<jend; ++j )
4325 for(
size_t i=ii; i<iend; ++i ) {
4326 value += x[i] * A(i,j);
4329 y[j] -= value * scalar;
4350 template<
typename VT1
4355 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4357 selectLargeSubAssignKernel( y, x, A, scalar );
4362 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4376 template<
typename VT1
4381 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4388 subAssign( y, tmp );
4391 gemv( y, x, A, ET(-scalar), ET(1) );
4413 template<
typename VT1 >
4425 multAssign( ~lhs, tmp );
4445 template<
typename VT1 >
4457 divAssign( ~lhs, tmp );
4479 template<
typename VT1 >
4490 if( right.rows() == 0UL ) {
4494 else if( right.columns() == 0UL ) {
4524 template<
typename VT1 >
4555 template<
typename VT1 >
4566 if( right.rows() == 0UL || right.columns() == 0UL ) {
4600 template<
typename VT1 >
4611 if( right.rows() == 0UL || right.columns() == 0UL ) {
4646 template<
typename VT1 >
4681 template<
typename VT1 >
4756 template<
typename VT
4758 inline decltype(
auto)
4765 if( (~vec).
size() != (~mat).
rows() ) {
4797 template<
typename VT
4799 inline decltype(
auto)
4820 template<
typename VT,
typename MT >
4821 struct Size< TDVecDMatMultExpr<VT,MT>, 0UL >
4822 :
public Size<MT,1UL>
4838 template<
typename VT,
typename MT >
4839 struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4840 :
public And< IsAligned<VT>, IsAligned<MT> >
If_< IsExpression< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:215
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:131
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:71
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:208
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:131
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:298
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:130
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:519
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1903
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:87
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:365
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:387
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
Constraints on the storage order of matrix types.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:331
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1950
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:355
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for the memory layout of data types.This type trait tests whether the given data t...
Definition: IsContiguous.h:86
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:133
Header file for the generic max algorithm.
Header file for the DisableIf class template.
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:134
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:210
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:264
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:209
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsTriangular type trait.
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:388
decltype(auto) operator*(const DenseMatrix< MT1, false > &lhs, const DenseMatrix< MT2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( ).
Definition: DMatDMatMultExpr.h:8893
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:590
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:135
Header file for the IsPadded type trait.
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:250
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:224
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
If_< IsExpression< MT >, const MT, const MT &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:218
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:375
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:211
Header file for the IsContiguous type trait.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:221
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:132
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Header file for the TVecMatMultExpr base class.
Expression object for transpose dense vector-dense matrix multiplications.The TDVecDMatMultExpr class...
Definition: Forward.h:156
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm).This compilation switch e...
Definition: BLAS.h:152
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:311
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:343
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:490
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:108
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:212
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:104
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of vectors and matrices.The Size type trait evaluates the size of...
Definition: Size.h:80
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:321
Header file for the IsComplexFloat type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation.The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
Constraint on the data type.
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:207
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.