// Include guard and head of the TDVecDMatMultExpr class template, which
// represents the expression "transpose dense vector * dense matrix".
// NOTE(review): this view is a fragmentary extraction -- the original
// source line numbers are fused into the text and interior lines are
// missing; all code tokens are preserved verbatim below.
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_ 124 template<
typename VT
126 class TDVecDMatMultExpr :
public DenseVector< TDVecDMatMultExpr<VT,MT>, true >
127 ,
private TVecMatMultExpr
128 ,
private Computation
// Helper trait: per the visible condition, SMP assignment is used whenever
// either operand must be evaluated into a temporary first.
157 template<
typename T1 >
158 struct UseSMPAssign {
159 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
// Helper trait for selecting the external BLAS kernel; the visible part of
// the condition requires SIMD support on all three participating types
// (further conditions are outside this view).
169 template<
typename T1,
typename T2,
typename T3 >
170 struct UseBlasKernel {
176 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Helper trait for selecting the vectorized default kernel; same visible
// SIMD requirement on all three types.
192 template<
typename T1,
typename T2,
typename T3 >
193 struct UseVectorizedDefaultKernel {
196 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time flags of the expression: vectorization requires both
// operands to be SIMD-enabled; the expression is SMP-assignable only if
// neither operand requires evaluation and both are SMP-assignable.
232 VT::simdEnabled && MT::simdEnabled &&
237 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
238 !evaluateMatrix && MT::smpAssignable };
// Subscript operator fragment: the visible expression multiplies vector
// element `index` with matrix element (index,index). NOTE(review): this is
// presumably one summand/special case of the inner product forming result
// element `index` -- interior lines of this operator are missing from this
// extraction, so the full accumulation is not visible.
271 return vec_[index] *
mat_(index,index);
// Checked element access: rejects indices at or beyond the matrix column
// count (the result vector's length), then delegates to the unchecked
// subscript operator.
298 inline ReturnType
at(
size_t index )
const {
299 if( index >=
mat_.columns() ) {
302 return (*
this)[index];
// The size of the result vector equals the number of matrix columns.
311 inline size_t size() const noexcept {
312 return mat_.columns();
// canAlias: conservative alias query -- true if either operand reports an
// alias with the given address.
342 template<
typename T >
343 inline bool canAlias(
const T* alias )
const noexcept {
344 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// isAliased: exact alias query; the visible condition is identical to
// canAlias (both forward to the operands' isAliased).
354 template<
typename T >
355 inline bool isAliased(
const T* alias )
const noexcept {
356 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// The expression operands are properly aligned only if both the vector and
// the matrix are aligned.
366 return vec_.isAligned() &&
mat_.isAligned();
// canSMPAssign fragment: per the visible condition, parallel execution is
// only chosen when the result size exceeds SMP_TDVECDMATMULT_THRESHOLD
// (combined with a small-matrix condition whose surrounding lines are
// missing from this view).
378 (
mat_.rows() *
mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
379 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
// assign( dense vector ) fragment: early-outs for a matrix with zero rows
// or zero columns, then serially evaluates both operands into the LT/RT
// composite types and forwards to the kernel dispatch.
402 template<
typename VT1 >
409 if( rhs.mat_.rows() == 0UL ) {
413 else if( rhs.mat_.columns() == 0UL ) {
417 LT x(
serial( rhs.vec_ ) );
418 RT A(
serial( rhs.mat_ ) );
425 TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
// Kernel dispatch: matrices below TDVECDMATMULT_THRESHOLD take the small
// (in-register) kernel, otherwise the BLAS-capable path is used.
441 template<
typename VT1
444 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
448 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
449 selectSmallAssignKernel( y, x, A );
451 selectBlasAssignKernel( y, x, A );
// Scalar default kernel for y = x * A: initializes the result from the
// first row, then accumulates the remaining rows with a 2x-unrolled column
// loop plus scalar tail. NOTE(review): the loop headers and the
// jbegin/jend bound computations (presumably restricting the column range
// for triangular/band matrices) are missing from this extraction.
470 template<
typename VT1
473 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
475 const size_t M( A.rows() );
476 const size_t N( A.columns() );
485 for(
size_t j=jbegin; j<N; ++j ) {
486 y[j] = x[0UL] * A(0UL,j);
494 y[i] = x[i] * A(i,i);
506 const size_t jnum( jend - jbegin );
// jpos rounds the trip count down to an even number for the 2x unrolling.
507 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
509 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
510 y[j ] += x[i] * A(i,j );
511 y[j+1UL] += x[i] * A(i,j+1UL);
514 y[jpos] += x[i] * A(i,jpos);
517 y[jend] = x[i] * A(i,jend);
// Small-kernel fallback: without SIMD support it simply reuses the scalar
// default kernel above.
543 template<
typename VT1
547 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
549 selectDefaultAssignKernel( y, x, A );
// Vectorized small kernel for y = x * A: columns are processed in SIMD
// panels of 8, 4, 3, 2 and 1 vectors; within each panel, the row loop
// broadcasts x[i] and accumulates products of the broadcast with aligned
// matrix loads; a scalar loop handles the column remainder. NOTE(review):
// ibegin/iend row bounds and several loop headers are missing from this
// extraction.
568 template<
typename VT1
572 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
576 const size_t M( A.rows() );
577 const size_t N( A.columns() );
// jpos: column count rounded down to a SIMDSIZE multiple when a remainder
// treatment is required.
579 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Panel of 8 SIMD vectors per iteration.
584 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
594 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
596 for(
size_t i=ibegin; i<iend; ++i ) {
597 const SIMDType x1(
set( x[i] ) );
598 xmm1 += x1 * A.load(i,j );
599 xmm2 += x1 * A.load(i,j+SIMDSIZE );
600 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
601 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
602 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
603 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
604 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
605 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
609 y.store( j+SIMDSIZE , xmm2 );
610 y.store( j+SIMDSIZE*2UL, xmm3 );
611 y.store( j+SIMDSIZE*3UL, xmm4 );
612 y.store( j+SIMDSIZE*4UL, xmm5 );
613 y.store( j+SIMDSIZE*5UL, xmm6 );
614 y.store( j+SIMDSIZE*6UL, xmm7 );
615 y.store( j+SIMDSIZE*7UL, xmm8 );
// Panel of 4 SIMD vectors.
618 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
628 SIMDType xmm1, xmm2, xmm3, xmm4;
630 for(
size_t i=ibegin; i<iend; ++i ) {
631 const SIMDType x1(
set( x[i] ) );
632 xmm1 += x1 * A.load(i,j );
633 xmm2 += x1 * A.load(i,j+SIMDSIZE );
634 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
635 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
639 y.store( j+SIMDSIZE , xmm2 );
640 y.store( j+SIMDSIZE*2UL, xmm3 );
641 y.store( j+SIMDSIZE*3UL, xmm4 );
// Panel of 3 SIMD vectors.
644 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
654 SIMDType xmm1, xmm2, xmm3;
656 for(
size_t i=ibegin; i<iend; ++i ) {
657 const SIMDType x1(
set( x[i] ) );
658 xmm1 += x1 * A.load(i,j );
659 xmm2 += x1 * A.load(i,j+SIMDSIZE );
660 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
664 y.store( j+SIMDSIZE , xmm2 );
665 y.store( j+SIMDSIZE*2UL, xmm3 );
// Panel of 2 SIMD vectors.
668 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
680 for(
size_t i=ibegin; i<iend; ++i ) {
681 const SIMDType x1(
set( x[i] ) );
682 xmm1 += x1 * A.load(i,j );
683 xmm2 += x1 * A.load(i,j+SIMDSIZE);
687 y.store( j+SIMDSIZE, xmm2 );
// Single SIMD vector per column group.
690 for( ; j<jpos; j+=SIMDSIZE )
702 for(
size_t i=ibegin; i<iend; ++i ) {
703 xmm1 +=
set( x[i] ) * A.load(i,j);
// Scalar remainder loop for the last (N % SIMDSIZE) columns.
709 for( ; remainder && j<N; ++j )
721 for(
size_t i=ibegin; i<iend; ++i ) {
722 value += x[i] * A(i,j);
// Large-kernel fallback: without SIMD support the blocked kernel reduces
// to the scalar default kernel.
745 template<
typename VT1
749 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
751 selectDefaultAssignKernel( y, x, A );
// Vectorized large (cache-blocked) kernel for y = x * A: iterates over
// column blocks of jblock elements (sized so a block's working set is
// 32 KiB of ElementType) and row blocks of iblock rows, accumulating each
// SIMD panel in registers and adding it onto the already-stored y values.
// NOTE(review): y is presumably reset to zero before the block loops --
// those lines are missing from this extraction, as are j/jend setup lines.
770 template<
typename VT1
774 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
778 const size_t M( A.rows() );
779 const size_t N( A.columns() );
781 const size_t jblock( 32768UL /
sizeof( ElementType ) );
// Fewer rows per block for wide matrices, more for narrow ones.
782 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
788 for(
size_t jj=0U; jj<N; jj+=jblock ) {
789 for(
size_t ii=0UL; ii<M; ii+=iblock )
791 const size_t iend(
min( ii+iblock, M ) );
792 const size_t jtmp(
min( jj+jblock, N ) );
// jpos: block-local column end rounded down to a SIMDSIZE multiple.
797 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
798 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// Panel of 8 SIMD vectors, added onto the current y contents.
804 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
806 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
808 for(
size_t i=ii; i<iend; ++i ) {
809 const SIMDType x1(
set( x[i] ) );
810 xmm1 += x1 * A.load(i,j );
811 xmm2 += x1 * A.load(i,j+SIMDSIZE );
812 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
813 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
814 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
815 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
816 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
817 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
820 y.store( j , y.load(j ) + xmm1 );
821 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
822 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
823 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
824 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
825 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
826 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
827 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
// Panel of 4 SIMD vectors.
830 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
832 SIMDType xmm1, xmm2, xmm3, xmm4;
834 for(
size_t i=ii; i<iend; ++i ) {
835 const SIMDType x1(
set( x[i] ) );
836 xmm1 += x1 * A.load(i,j );
837 xmm2 += x1 * A.load(i,j+SIMDSIZE );
838 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
839 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
842 y.store( j , y.load(j ) + xmm1 );
843 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
844 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
845 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
// Panel of 3 SIMD vectors.
848 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
850 SIMDType xmm1, xmm2, xmm3;
852 for(
size_t i=ii; i<iend; ++i ) {
853 const SIMDType x1(
set( x[i] ) );
854 xmm1 += x1 * A.load(i,j );
855 xmm2 += x1 * A.load(i,j+SIMDSIZE );
856 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
859 y.store( j , y.load(j ) + xmm1 );
860 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
861 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
// Panel of 2 SIMD vectors.
864 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
868 for(
size_t i=ii; i<iend; ++i ) {
869 const SIMDType x1(
set( x[i] ) );
870 xmm1 += x1 * A.load(i,j );
871 xmm2 += x1 * A.load(i,j+SIMDSIZE);
874 y.store( j , y.load(j ) + xmm1 );
875 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
// Single SIMD vector.
878 for( ; j<jpos; j+=SIMDSIZE )
882 for(
size_t i=ii; i<iend; ++i ) {
883 xmm1 +=
set( x[i] ) * A.load(i,j);
886 y.store( j, y.load(j) + xmm1 );
// Scalar remainder columns of the block.
889 for( ; remainder && j<jend; ++j )
893 for(
size_t i=ii; i<iend; ++i ) {
894 value += x[i] * A(i,j);
// Default BLAS kernel: when no external BLAS routine applies, fall back to
// the blocked large kernel.
919 template<
typename VT1
923 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
925 selectLargeAssignKernel( y, x, A );
// BLAS-based kernel (compiled only when BLAS mode and matrix/vector
// multiplication via BLAS are enabled): y = 1*(x*A) + 0*y via gemv.
931 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 945 template<
typename VT1
949 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
958 gemv( y, x, A, ET(1), ET(0) );
// assign( sparse vector ) fragment: serially evaluates the whole
// expression into a ResultType temporary (assignment lines are missing
// from this extraction).
978 template<
typename VT1 >
989 const ResultType tmp(
serial( rhs ) );
// addAssign( dense vector ) fragment: no-op for an empty matrix, otherwise
// serially evaluates both operands and forwards to the add-assign kernel
// dispatch.
1008 template<
typename VT1 >
1015 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1019 LT x(
serial( rhs.vec_ ) );
1020 RT A(
serial( rhs.mat_ ) );
1027 TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
// Add-assign kernel dispatch: small matrices use the small kernel,
// otherwise the BLAS-capable path.
1043 template<
typename VT1
1046 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1050 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1051 selectSmallAddAssignKernel( y, x, A );
1053 selectBlasAddAssignKernel( y, x, A );
// Scalar default kernel for y += x * A: row loop with a diagonal special
// case and a 2x-unrolled column loop plus scalar tail. NOTE(review): the
// jbegin/jend bound computations are missing from this extraction.
1072 template<
typename VT1
1075 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1077 const size_t M( A.rows() );
1078 const size_t N( A.columns() );
1080 for(
size_t i=0UL; i<M; ++i )
1084 y[i] += x[i] * A(i,i);
1096 const size_t jnum( jend - jbegin );
// jpos rounds the trip count down to an even number for the unrolling.
1097 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1099 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1100 y[j ] += x[i] * A(i,j );
1101 y[j+1UL] += x[i] * A(i,j+1UL);
1104 y[jpos] += x[i] * A(i,jpos);
// Small add-assign fallback: without SIMD support reuse the scalar
// default kernel above.
1126 template<
typename VT1
1130 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1132 selectDefaultAddAssignKernel( y, x, A );
// Vectorized small kernel for y += x * A: unlike the plain-assign variant,
// each SIMD accumulator is initialized from the existing y values, the row
// loop accumulates products on top, and the result is stored back --
// implementing the += semantics. Panels of 8/4/3/2/1 vectors plus scalar
// remainder. NOTE(review): ibegin/iend bounds are missing from this view.
1151 template<
typename VT1
1155 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1159 const size_t M( A.rows() );
1160 const size_t N( A.columns() );
// jpos: columns rounded down to a SIMDSIZE multiple when remainder
// handling is required.
1162 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Panel of 8 SIMD vectors, seeded with the current y contents.
1167 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1177 SIMDType xmm1( y.load(j ) );
1178 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1179 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1180 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1181 SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1182 SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1183 SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1184 SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1186 for(
size_t i=ibegin; i<iend; ++i ) {
1187 const SIMDType x1(
set( x[i] ) );
1188 xmm1 += x1 * A.load(i,j );
1189 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1190 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1191 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1192 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1193 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1194 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1195 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1198 y.store( j , xmm1 );
1199 y.store( j+SIMDSIZE , xmm2 );
1200 y.store( j+SIMDSIZE*2UL, xmm3 );
1201 y.store( j+SIMDSIZE*3UL, xmm4 );
1202 y.store( j+SIMDSIZE*4UL, xmm5 );
1203 y.store( j+SIMDSIZE*5UL, xmm6 );
1204 y.store( j+SIMDSIZE*6UL, xmm7 );
1205 y.store( j+SIMDSIZE*7UL, xmm8 );
// Panel of 4 SIMD vectors.
1208 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1218 SIMDType xmm1( y.load(j ) );
1219 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1220 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1221 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1223 for(
size_t i=ibegin; i<iend; ++i ) {
1224 const SIMDType x1(
set( x[i] ) );
1225 xmm1 += x1 * A.load(i,j );
1226 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1227 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1228 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1231 y.store( j , xmm1 );
1232 y.store( j+SIMDSIZE , xmm2 );
1233 y.store( j+SIMDSIZE*2UL, xmm3 );
1234 y.store( j+SIMDSIZE*3UL, xmm4 );
// Panel of 3 SIMD vectors.
1237 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1247 SIMDType xmm1( y.load(j ) );
1248 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1249 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1251 for(
size_t i=ibegin; i<iend; ++i ) {
1252 const SIMDType x1(
set( x[i] ) );
1253 xmm1 += x1 * A.load(i,j );
1254 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1255 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1258 y.store( j , xmm1 );
1259 y.store( j+SIMDSIZE , xmm2 );
1260 y.store( j+SIMDSIZE*2UL, xmm3 );
// Panel of 2 SIMD vectors.
1263 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1273 SIMDType xmm1( y.load(j ) );
1274 SIMDType xmm2( y.load(j+SIMDSIZE) );
1276 for(
size_t i=ibegin; i<iend; ++i ) {
1277 const SIMDType x1(
set( x[i] ) );
1278 xmm1 += x1 * A.load(i,j );
1279 xmm2 += x1 * A.load(i,j+SIMDSIZE);
1282 y.store( j , xmm1 );
1283 y.store( j+SIMDSIZE, xmm2 );
// Single SIMD vector.
1286 for( ; j<jpos; j+=SIMDSIZE )
1296 SIMDType xmm1( y.load(j) );
1298 for(
size_t i=ibegin; i<iend; ++i ) {
1299 xmm1 +=
set( x[i] ) * A.load(i,j);
// Scalar remainder columns.
1305 for( ; remainder && j<N; ++j )
1317 for(
size_t i=ibegin; i<iend; ++i ) {
1318 value += x[i] * A(i,j);
// Large add-assign fallback: without SIMD support reuse the scalar
// default add-assign kernel.
1341 template<
typename VT1
1345 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1347 selectDefaultAddAssignKernel( y, x, A );
// Vectorized large (cache-blocked) kernel for y += x * A: same blocking
// scheme as the large plain-assign kernel (jblock columns sized to 32 KiB
// of ElementType, iblock rows). Accumulators start at zero and the panel
// sum is added onto y at store time, which directly realizes the +=
// semantics. NOTE(review): j/jend setup lines are missing from this view.
1366 template<
typename VT1
1370 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1374 const size_t M( A.rows() );
1375 const size_t N( A.columns() );
1377 const size_t jblock( 32768UL /
sizeof( ElementType ) );
1378 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1382 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1383 for(
size_t ii=0UL; ii<M; ii+=iblock )
1385 const size_t iend(
min( ii+iblock, M ) );
1386 const size_t jtmp(
min( jj+jblock, N ) );
// jpos: block-local column end rounded down to a SIMDSIZE multiple.
1391 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1392 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// Panel of 8 SIMD vectors.
1398 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1400 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1402 for(
size_t i=ii; i<iend; ++i ) {
1403 const SIMDType x1(
set( x[i] ) );
1404 xmm1 += x1 * A.load(i,j );
1405 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1406 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1407 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1408 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1409 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1410 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1411 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1414 y.store( j , y.load(j ) + xmm1 );
1415 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1416 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1417 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1418 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
1419 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
1420 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
1421 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
// Panel of 4 SIMD vectors.
1424 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1426 SIMDType xmm1, xmm2, xmm3, xmm4;
1428 for(
size_t i=ii; i<iend; ++i ) {
1429 const SIMDType x1(
set( x[i] ) );
1430 xmm1 += x1 * A.load(i,j );
1431 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1432 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1433 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1436 y.store( j , y.load(j ) + xmm1 );
1437 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1438 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1439 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
// Panel of 3 SIMD vectors.
1442 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1444 SIMDType xmm1, xmm2, xmm3;
1446 for(
size_t i=ii; i<iend; ++i ) {
1447 const SIMDType x1(
set( x[i] ) );
1448 xmm1 += x1 * A.load(i,j );
1449 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1450 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1453 y.store( j , y.load(j ) + xmm1 );
1454 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1455 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
// Panel of 2 SIMD vectors.
1458 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1460 SIMDType xmm1, xmm2;
1462 for(
size_t i=ii; i<iend; ++i ) {
1463 const SIMDType x1(
set( x[i] ) );
1464 xmm1 += x1 * A.load(i,j );
1465 xmm2 += x1 * A.load(i,j+SIMDSIZE);
1468 y.store( j , y.load(j ) + xmm1 );
1469 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
// Single SIMD vector.
1472 for( ; j<jpos; j+=SIMDSIZE )
1476 for(
size_t i=ii; i<iend; ++i ) {
1477 xmm1 +=
set( x[i] ) * A.load(i,j);
1480 y.store( j, y.load(j) + xmm1 );
// Scalar remainder columns of the block.
1483 for( ; remainder && j<jend; ++j )
1487 for(
size_t i=ii; i<iend; ++i ) {
1488 value += x[i] * A(i,j);
// Default BLAS add-assign kernel: falls back to the blocked large kernel.
1513 template<
typename VT1
1517 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1519 selectLargeAddAssignKernel( y, x, A );
// BLAS-based add-assign kernel: y = 1*(x*A) + 1*y via gemv. NOTE(review):
// the visible "addAssign( y, tmp )" suggests a temporary-based special
// path (presumably for triangular matrices) whose surrounding lines are
// missing from this extraction.
1525 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1539 template<
typename VT1
1543 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1550 addAssign( y, tmp );
1553 gemv( y, x, A, ET(1), ET(1) );
// subAssign( dense vector ) fragment: no-op for an empty matrix, otherwise
// serially evaluates both operands and forwards to the sub-assign kernel
// dispatch.
1577 template<
typename VT1 >
1584 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1588 LT x(
serial( rhs.vec_ ) );
1589 RT A(
serial( rhs.mat_ ) );
1596 TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
// Sub-assign kernel dispatch: small matrices use the small kernel,
// otherwise the BLAS-capable path.
1612 template<
typename VT1
1615 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1619 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1620 selectSmallSubAssignKernel( y, x, A );
1622 selectBlasSubAssignKernel( y, x, A );
// Scalar default kernel for y -= x * A: row loop with a diagonal special
// case and a 2x-unrolled column loop plus scalar tail (all updates use
// the subtracting form). NOTE(review): jbegin/jend bound computations are
// missing from this extraction.
1641 template<
typename VT1
1644 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1646 const size_t M( A.rows() );
1647 const size_t N( A.columns() );
1649 for(
size_t i=0UL; i<M; ++i )
1653 y[i] -= x[i] * A(i,i);
1665 const size_t jnum( jend - jbegin );
// jpos rounds the trip count down to an even number for the unrolling.
1666 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1668 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1669 y[j ] -= x[i] * A(i,j );
1670 y[j+1UL] -= x[i] * A(i,j+1UL);
1673 y[jpos] -= x[i] * A(i,jpos);
// Small sub-assign fallback: without SIMD support reuse the scalar
// default kernel above.
1695 template<
typename VT1
1699 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1701 selectDefaultSubAssignKernel( y, x, A );
// Vectorized small kernel for y -= x * A: each SIMD accumulator is seeded
// with the current y values and the row loop SUBTRACTS the products
// (note the -= in contrast to the add-assign variant), then the result is
// stored back. Panels of 8/4/3/2/1 vectors plus scalar remainder.
// NOTE(review): ibegin/iend bounds and the remainder's final update line
// are missing from this view.
1721 template<
typename VT1
1725 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1729 const size_t M( A.rows() );
1730 const size_t N( A.columns() );
// jpos: columns rounded down to a SIMDSIZE multiple when remainder
// handling is required.
1732 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Panel of 8 SIMD vectors, seeded from y.
1737 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1747 SIMDType xmm1( y.load(j ) );
1748 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1749 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1750 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1751 SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1752 SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1753 SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1754 SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1756 for(
size_t i=ibegin; i<iend; ++i ) {
1757 const SIMDType x1(
set( x[i] ) );
1758 xmm1 -= x1 * A.load(i,j );
1759 xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1760 xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1761 xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1762 xmm5 -= x1 * A.load(i,j+SIMDSIZE*4UL);
1763 xmm6 -= x1 * A.load(i,j+SIMDSIZE*5UL);
1764 xmm7 -= x1 * A.load(i,j+SIMDSIZE*6UL);
1765 xmm8 -= x1 * A.load(i,j+SIMDSIZE*7UL);
1768 y.store( j , xmm1 );
1769 y.store( j+SIMDSIZE , xmm2 );
1770 y.store( j+SIMDSIZE*2UL, xmm3 );
1771 y.store( j+SIMDSIZE*3UL, xmm4 );
1772 y.store( j+SIMDSIZE*4UL, xmm5 );
1773 y.store( j+SIMDSIZE*5UL, xmm6 );
1774 y.store( j+SIMDSIZE*6UL, xmm7 );
1775 y.store( j+SIMDSIZE*7UL, xmm8 );
// Panel of 4 SIMD vectors.
1778 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1788 SIMDType xmm1( y.load(j ) );
1789 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1790 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1791 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1793 for(
size_t i=ibegin; i<iend; ++i ) {
1794 const SIMDType x1(
set( x[i] ) );
1795 xmm1 -= x1 * A.load(i,j );
1796 xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1797 xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1798 xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1801 y.store( j , xmm1 );
1802 y.store( j+SIMDSIZE , xmm2 );
1803 y.store( j+SIMDSIZE*2UL, xmm3 );
1804 y.store( j+SIMDSIZE*3UL, xmm4 );
// Panel of 3 SIMD vectors.
1807 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1817 SIMDType xmm1( y.load(j ) );
1818 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1819 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1821 for(
size_t i=ibegin; i<iend; ++i ) {
1822 const SIMDType x1(
set( x[i] ) );
1823 xmm1 -= x1 * A.load(i,j );
1824 xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1825 xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1828 y.store( j , xmm1 );
1829 y.store( j+SIMDSIZE , xmm2 );
1830 y.store( j+SIMDSIZE*2UL, xmm3 );
// Panel of 2 SIMD vectors.
1833 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1843 SIMDType xmm1( y.load(j ) );
1844 SIMDType xmm2( y.load(j+SIMDSIZE) );
1846 for(
size_t i=ibegin; i<iend; ++i ) {
1847 const SIMDType x1(
set( x[i] ) );
1848 xmm1 -= x1 * A.load(i,j );
1849 xmm2 -= x1 * A.load(i,j+SIMDSIZE);
1852 y.store( j , xmm1 );
1853 y.store( j+SIMDSIZE, xmm2 );
// Single SIMD vector.
1856 for( ; j<jpos; j+=SIMDSIZE )
1866 SIMDType xmm1( y.load(j) );
1868 for(
size_t i=ibegin; i<iend; ++i ) {
1869 xmm1 -=
set( x[i] ) * A.load(i,j);
// Scalar remainder columns: accumulates x*A into `value` (the final
// subtracting update of y[j] is missing from this extraction).
1875 for( ; remainder && j<N; ++j )
1887 for(
size_t i=ibegin; i<iend; ++i ) {
1888 value += x[i] * A(i,j);
// Large sub-assign fallback: without SIMD support reuse the scalar
// default sub-assign kernel.
1911 template<
typename VT1
1915 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1917 selectDefaultSubAssignKernel( y, x, A );
// Vectorized large (cache-blocked) kernel for y -= x * A: same blocking
// scheme as the other large kernels. Note the sign handling differs from
// the small kernel: accumulators start at zero and accumulate with +=,
// and the subtraction happens once at store time (y.load(...) - xmm).
1937 template<
typename VT1
1941 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1945 const size_t M( A.rows() );
1946 const size_t N( A.columns() );
1948 const size_t jblock( 32768UL /
sizeof( ElementType ) );
1949 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1953 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1954 for(
size_t ii=0UL; ii<M; ii+=iblock )
1956 const size_t iend(
min( ii+iblock, M ) );
1957 const size_t jtmp(
min( jj+jblock, N ) );
// jpos: block-local column end rounded down to a SIMDSIZE multiple.
1962 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1963 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// Panel of 8 SIMD vectors, subtracted from y at store time.
1969 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1971 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1973 for(
size_t i=ii; i<iend; ++i ) {
1974 const SIMDType x1(
set( x[i] ) );
1975 xmm1 += x1 * A.load(i,j );
1976 xmm2 += x1 * A.load(i,j+SIMDSIZE );
1977 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1978 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1979 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1980 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1981 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1982 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1985 y.store( j , y.load(j ) - xmm1 );
1986 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
1987 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
1988 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
1989 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5 );
1990 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6 );
1991 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7 );
1992 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8 );
// Panel of 4 SIMD vectors.
1995 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1997 SIMDType xmm1, xmm2, xmm3, xmm4;
1999 for(
size_t i=ii; i<iend; ++i ) {
2000 const SIMDType x1(
set( x[i] ) );
2001 xmm1 += x1 * A.load(i,j );
2002 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2003 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2004 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2007 y.store( j , y.load(j ) - xmm1 );
2008 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2009 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2010 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
// Panel of 3 SIMD vectors.
2013 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2015 SIMDType xmm1, xmm2, xmm3;
2017 for(
size_t i=ii; i<iend; ++i ) {
2018 const SIMDType x1(
set( x[i] ) );
2019 xmm1 += x1 * A.load(i,j );
2020 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2021 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2024 y.store( j , y.load(j ) - xmm1 );
2025 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2026 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
// Panel of 2 SIMD vectors.
2029 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2031 SIMDType xmm1, xmm2;
2033 for(
size_t i=ii; i<iend; ++i ) {
2034 const SIMDType x1(
set( x[i] ) );
2035 xmm1 += x1 * A.load(i,j );
2036 xmm2 += x1 * A.load(i,j+SIMDSIZE);
2039 y.store( j , y.load(j ) - xmm1 );
2040 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2 );
// Single SIMD vector.
2043 for( ; j<jpos; j+=SIMDSIZE )
2047 for(
size_t i=ii; i<iend; ++i ) {
2048 xmm1 +=
set( x[i] ) * A.load(i,j);
2051 y.store( j, y.load(j) - xmm1 );
// Scalar remainder columns of the block (the final subtracting update of
// y[j] is missing from this extraction).
2054 for( ; remainder && j<jend; ++j )
2058 for(
size_t i=ii; i<iend; ++i ) {
2059 value += x[i] * A(i,j);
// Default BLAS sub-assign kernel: falls back to the blocked large kernel.
2084 template<
typename VT1
2088 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2090 selectLargeSubAssignKernel( y, x, A );
// BLAS-based sub-assign kernel: y = -1*(x*A) + 1*y via gemv (negative
// alpha realizes the subtraction). NOTE(review): the visible
// "subAssign( y, tmp )" suggests a temporary-based special path whose
// surrounding lines are missing from this extraction.
2096 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2110 template<
typename VT1
2114 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2121 subAssign( y, tmp );
2124 gemv( y, x, A, ET(-1), ET(1) );
// multAssign( dense vector ) fragment: evaluates the whole expression into
// a ResultType temporary and performs an element-wise multiplication
// assignment with it.
2148 template<
typename VT1 >
2159 const ResultType tmp(
serial( rhs ) );
2160 multAssign( ~lhs, tmp );
// divAssign( dense vector ) fragment: same temporary-based scheme for
// element-wise division assignment.
2182 template<
typename VT1 >
2193 const ResultType tmp(
serial( rhs ) );
2194 divAssign( ~lhs, tmp );
// SMP assignment family fragments. NOTE(review): only early-out checks
// and temporaries are visible; the actual smpAssign/smpAddAssign/... call
// lines are missing from this extraction.
// smpAssign fragment: early-outs for zero rows / zero columns.
2218 template<
typename VT1 >
2226 if( rhs.mat_.rows() == 0UL ) {
2230 else if( rhs.mat_.columns() == 0UL ) {
// smpAssign( sparse vector ) fragment: evaluates into a temporary.
2262 template<
typename VT1 >
2274 const ResultType tmp( rhs );
// smpAddAssign fragment: no-op for an empty matrix.
2295 template<
typename VT1 >
2303 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// smpSubAssign fragment: no-op for an empty matrix.
2339 template<
typename VT1 >
2347 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// smpMultAssign fragment: temporary-based.
2383 template<
typename VT1 >
2395 const ResultType tmp( rhs );
// smpDivAssign fragment: temporary-based.
2420 template<
typename VT1 >
2432 const ResultType tmp( rhs );
// Specialization head of DVecScalarMultExpr for a scaled transpose dense
// vector / dense matrix multiplication: ( x * A ) * scalar. Mirrors the
// trait and query structure of TDVecDMatMultExpr above; interior lines
// are missing from this extraction.
2471 template<
typename VT
2475 :
public DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true >
// Helper trait: SMP assignment is used whenever either operand of the
// inner multiplication must be evaluated first.
2507 template<
typename T1 >
2508 struct UseSMPAssign {
2509 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
// Helper trait for the BLAS kernel (four participating types, including
// the scalar); visible condition requires SIMD support throughout.
2517 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2518 struct UseBlasKernel {
2524 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Helper trait for the vectorized default kernel.
2539 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2540 struct UseVectorizedDefaultKernel {
2543 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time flags: same SIMD / SMP-assignability conditions as the
// unscaled expression.
2579 VT::simdEnabled && MT::simdEnabled &&
2585 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2586 !evaluateMatrix && MT::smpAssignable };
// Subscript operator: element of the inner expression times the scalar.
2612 inline ReturnType
operator[](
size_t index )
const {
2614 return vector_[index] * scalar_;
// Checked element access: bound check against the inner vector size,
// then delegate to the unchecked subscript operator.
2625 inline ReturnType
at(
size_t index )
const {
2626 if( index >= vector_.size() ) {
2629 return (*
this)[index];
// Size of the result equals the inner expression's size.
2638 inline size_t size()
const {
2639 return vector_.size();
// canAlias / isAliased: forwarded to the inner expression.
2669 template<
typename T >
2670 inline bool canAlias(
const T* alias )
const {
2671 return vector_.canAlias( alias );
2681 template<
typename T >
2682 inline bool isAliased(
const T* alias )
const {
2683 return vector_.isAliased( alias );
// Alignment: forwarded to the inner expression.
2693 return vector_.isAligned();
// canSMPAssign fragment: same threshold-based condition as the unscaled
// expression (surrounding lines missing).
2706 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2707 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
// Data members: the inner multiplication expression and the scalar factor.
2713 LeftOperand vector_;
2714 RightOperand scalar_;
// Scaled assign fragment: early-outs for empty matrices, then forwards
// the evaluated operands together with the scalar to the kernel dispatch.
2729 template<
typename VT1 >
2739 if( right.rows() == 0UL ) {
2743 else if( right.columns() == 0UL ) {
2755 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.
scalar_ );
// Scaled kernel dispatch: small matrices use the small kernel, otherwise
// the BLAS-capable path; the scalar is passed through.
2770 template<
typename VT1
2774 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2778 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2779 selectSmallAssignKernel( y, x, A, scalar );
2781 selectBlasAssignKernel( y, x, A, scalar );
// Scalar default kernel for y = (x * A) * scalar. NOTE(review): only the
// diagonal case applies the scalar inline; the general path presumably
// applies the scalar in a final pass whose lines are missing from this
// extraction.
2799 template<
typename VT1
2803 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2805 const size_t M( A.rows() );
2806 const size_t N( A.columns() );
2815 y[j] = x[0UL] * A(0UL,j);
2823 y[i] = x[i] * A(i,i) * scalar;
2835 const size_t jnum( jend - jbegin );
// jpos rounds the trip count down to an even number for the unrolling.
2836 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2838 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2839 y[j ] += x[i] * A(i,j );
2840 y[j+1UL] += x[i] * A(i,j+1UL);
2843 y[jpos] += x[i] * A(i,jpos);
2846 y[jend] = x[i] * A(i,jend);
// Small scaled-assign fallback: without SIMD support reuse the scalar
// default kernel, forwarding the scalar.
2879 template<
typename VT1
2884 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2886 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized small kernel for y = (x * A) * scalar: identical panel
// structure to the unscaled variant (8/4/3/2/1 SIMD vectors plus scalar
// remainder), with the broadcast `factor` applied once at store time
// (xmm*factor) and `scalar` applied to the scalar-remainder results.
2904 template<
typename VT1
2909 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2913 const size_t M( A.rows() );
2914 const size_t N( A.columns() );
// jpos: columns rounded down to a SIMDSIZE multiple when remainder
// handling is required.
2916 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Broadcast of the scalar factor for the SIMD stores.
2919 const SIMDType factor(
set( scalar ) );
// Panel of 8 SIMD vectors.
2923 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
2933 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2935 for(
size_t i=ibegin; i<iend; ++i ) {
2936 const SIMDType x1(
set( x[i] ) );
2937 xmm1 += x1 * A.load(i,j );
2938 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2939 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2940 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2941 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
2942 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
2943 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
2944 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
2947 y.store( j , xmm1*factor );
2948 y.store( j+SIMDSIZE , xmm2*factor );
2949 y.store( j+SIMDSIZE*2UL, xmm3*factor );
2950 y.store( j+SIMDSIZE*3UL, xmm4*factor );
2951 y.store( j+SIMDSIZE*4UL, xmm5*factor );
2952 y.store( j+SIMDSIZE*5UL, xmm6*factor );
2953 y.store( j+SIMDSIZE*6UL, xmm7*factor );
2954 y.store( j+SIMDSIZE*7UL, xmm8*factor );
// Panel of 4 SIMD vectors.
2957 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2967 SIMDType xmm1, xmm2, xmm3, xmm4;
2969 for(
size_t i=ibegin; i<iend; ++i ) {
2970 const SIMDType x1(
set( x[i] ) );
2971 xmm1 += x1 * A.load(i,j );
2972 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2973 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2974 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2977 y.store( j , xmm1*factor );
2978 y.store( j+SIMDSIZE , xmm2*factor );
2979 y.store( j+SIMDSIZE*2UL, xmm3*factor );
2980 y.store( j+SIMDSIZE*3UL, xmm4*factor );
// Panel of 3 SIMD vectors.
2983 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2993 SIMDType xmm1, xmm2, xmm3;
2995 for(
size_t i=ibegin; i<iend; ++i ) {
2996 const SIMDType x1(
set( x[i] ) );
2997 xmm1 += x1 * A.load(i,j );
2998 xmm2 += x1 * A.load(i,j+SIMDSIZE );
2999 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3002 y.store( j , xmm1*factor );
3003 y.store( j+SIMDSIZE , xmm2*factor );
3004 y.store( j+SIMDSIZE*2UL, xmm3*factor );
// Panel of 2 SIMD vectors.
3007 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3017 SIMDType xmm1, xmm2;
3019 for(
size_t i=ibegin; i<iend; ++i ) {
3020 const SIMDType x1(
set( x[i] ) );
3021 xmm1 += x1 * A.load(i,j );
3022 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3025 y.store( j , xmm1*factor );
3026 y.store( j+SIMDSIZE, xmm2*factor );
// Single SIMD vector.
3029 for( ; j<jpos; j+=SIMDSIZE )
3041 for(
size_t i=ibegin; i<iend; ++i ) {
3042 xmm1 +=
set( x[i] ) * A.load(i,j);
3045 y.store( j, xmm1*factor );
// Scalar remainder columns: the scalar factor is applied to the
// accumulated value on assignment.
3048 for( ; remainder && j<N; ++j )
3060 for(
size_t i=ibegin; i<iend; ++i ) {
3061 value += x[i] * A(i,j);
3064 y[j] = value * scalar;
// Large scaled-assign fallback: without SIMD support reuse the scalar
// default kernel, forwarding the scalar.
3083 template<
typename VT1
3088 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3090 selectDefaultAssignKernel( y, x, A, scalar );
3108 template<
typename VT1
3113 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3117 const size_t M( A.rows() );
3118 const size_t N( A.columns() );
3120 const size_t jblock( 32768UL /
sizeof( ElementType ) );
3121 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3123 const SIMDType factor(
set( scalar ) );
3129 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3130 for(
size_t ii=0UL; ii<M; ii+=iblock )
3132 const size_t iend(
min( ii+iblock, M ) );
3133 const size_t jtmp(
min( jj+jblock, N ) );
3138 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3139 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
3145 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3147 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3149 for(
size_t i=ii; i<iend; ++i ) {
3150 const SIMDType x1(
set( x[i] ) );
3151 xmm1 += x1 * A.load(i,j );
3152 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3153 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3154 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3155 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3156 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3157 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3158 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3161 y.store( j , y.load(j ) + xmm1*factor );
3162 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3163 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3164 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3165 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3166 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3167 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3168 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3171 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3173 SIMDType xmm1, xmm2, xmm3, xmm4;
3175 for(
size_t i=ii; i<iend; ++i ) {
3176 const SIMDType x1(
set( x[i] ) );
3177 xmm1 += x1 * A.load(i,j );
3178 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3179 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3180 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3183 y.store( j , y.load(j ) + xmm1*factor );
3184 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3185 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3186 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3189 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3191 SIMDType xmm1, xmm2, xmm3;
3193 for(
size_t i=ii; i<iend; ++i ) {
3194 const SIMDType x1(
set( x[i] ) );
3195 xmm1 += x1 * A.load(i,j );
3196 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3197 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3200 y.store( j , y.load(j ) + xmm1*factor );
3201 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3202 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3205 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3207 SIMDType xmm1, xmm2;
3209 for(
size_t i=ii; i<iend; ++i ) {
3210 const SIMDType x1(
set( x[i] ) );
3211 xmm1 += x1 * A.load(i,j );
3212 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3215 y.store( j , y.load(j ) + xmm1*factor );
3216 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3219 for( ; j<jpos; j+=SIMDSIZE )
3223 for(
size_t i=ii; i<iend; ++i ) {
3224 xmm1 +=
set( x[i] ) * A.load(i,j);
3227 y.store( j, y.load(j) + xmm1*factor );
3230 for( ; remainder && j<jend; ++j )
3234 for(
size_t i=ii; i<iend; ++i ) {
3235 value += x[i] * A(i,j);
3238 y[j] += value * scalar;
3258 template<
typename VT1
3263 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3265 selectLargeAssignKernel( y, x, A, scalar );
3270 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3284 template<
typename VT1
3289 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3294 assign( y, scalar * x );
3298 gemv( y, x, A, ET(scalar), ET(0) );
3316 template<
typename VT1 >
3327 const ResultType tmp(
serial( rhs ) );
3328 assign( ~lhs, tmp );
3344 template<
typename VT1 >
3354 if( right.rows() == 0UL || right.columns() == 0UL ) {
3366 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.
scalar_ );
3381 template<
typename VT1
3385 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3389 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3390 selectSmallAddAssignKernel( y, x, A, scalar );
3392 selectBlasAddAssignKernel( y, x, A, scalar );
3410 template<
typename VT1
3414 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3416 y.addAssign( x * A * scalar );
3434 template<
typename VT1
3439 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3441 selectDefaultAddAssignKernel( y, x, A, scalar );
3460 template<
typename VT1
3465 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3469 const size_t M( A.rows() );
3470 const size_t N( A.columns() );
3472 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
3475 const SIMDType factor(
set( scalar ) );
3479 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3489 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3491 for(
size_t i=ibegin; i<iend; ++i ) {
3492 const SIMDType x1(
set( x[i] ) );
3493 xmm1 += x1 * A.load(i,j );
3494 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3495 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3496 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3497 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3498 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3499 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3500 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3503 y.store( j , y.load(j ) + xmm1*factor );
3504 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3505 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3506 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3507 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3508 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3509 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3510 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3513 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3523 SIMDType xmm1, xmm2, xmm3, xmm4;
3525 for(
size_t i=ibegin; i<iend; ++i ) {
3526 const SIMDType x1(
set( x[i] ) );
3527 xmm1 += x1 * A.load(i,j );
3528 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3529 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3530 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3533 y.store( j , y.load(j ) + xmm1*factor );
3534 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3535 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3536 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3539 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3549 SIMDType xmm1, xmm2, xmm3;
3551 for(
size_t i=ibegin; i<iend; ++i ) {
3552 const SIMDType x1(
set( x[i] ) );
3553 xmm1 += x1 * A.load(i,j );
3554 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3555 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3558 y.store( j , y.load(j ) + xmm1*factor );
3559 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3560 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3563 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3573 SIMDType xmm1, xmm2;
3575 for(
size_t i=ibegin; i<iend; ++i ) {
3576 const SIMDType x1(
set( x[i] ) );
3577 xmm1 += x1 * A.load(i,j );
3578 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3581 y.store( j , y.load(j ) + xmm1*factor );
3582 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3585 for( ; j<jpos; j+=SIMDSIZE )
3597 for(
size_t i=ibegin; i<iend; ++i ) {
3598 xmm1 +=
set( x[i] ) * A.load(i,j);
3601 y.store( j, y.load(j) + xmm1*factor );
3604 for( ; remainder && j<N; ++j )
3616 for(
size_t i=ibegin; i<iend; ++i ) {
3617 value += x[i] * A(i,j);
3620 y[j] += value * scalar;
3639 template<
typename VT1
3644 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3646 selectDefaultAddAssignKernel( y, x, A, scalar );
3665 template<
typename VT1
3670 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3674 const size_t M( A.rows() );
3675 const size_t N( A.columns() );
3677 const size_t jblock( 32768UL /
sizeof( ElementType ) );
3678 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3680 const SIMDType factor(
set( scalar ) );
3684 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3685 for(
size_t ii=0UL; ii<M; ii+=iblock )
3687 const size_t iend(
min( ii+iblock, M ) );
3688 const size_t jtmp(
min( jj+jblock, N ) );
3693 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3694 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
3700 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3702 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3704 for(
size_t i=ii; i<iend; ++i ) {
3705 const SIMDType x1(
set( x[i] ) );
3706 xmm1 += x1 * A.load(i,j );
3707 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3708 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3709 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3710 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3711 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3712 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3713 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3716 y.store( j , y.load(j ) + xmm1*factor );
3717 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3718 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3719 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3720 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3721 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3722 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3723 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3726 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3728 SIMDType xmm1, xmm2, xmm3, xmm4;
3730 for(
size_t i=ii; i<iend; ++i ) {
3731 const SIMDType x1(
set( x[i] ) );
3732 xmm1 += x1 * A.load(i,j );
3733 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3734 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3735 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3738 y.store( j , y.load(j ) + xmm1*factor );
3739 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3740 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3741 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3744 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3746 SIMDType xmm1, xmm2, xmm3;
3748 for(
size_t i=ii; i<iend; ++i ) {
3749 const SIMDType x1(
set( x[i] ) );
3750 xmm1 += x1 * A.load(i,j );
3751 xmm2 += x1 * A.load(i,j+SIMDSIZE );
3752 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3755 y.store( j , y.load(j ) + xmm1*factor );
3756 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3757 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3760 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3762 SIMDType xmm1, xmm2;
3764 for(
size_t i=ii; i<iend; ++i ) {
3765 const SIMDType x1(
set( x[i] ) );
3766 xmm1 += x1 * A.load(i,j );
3767 xmm2 += x1 * A.load(i,j+SIMDSIZE);
3770 y.store( j , y.load(j ) + xmm1*factor );
3771 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3774 for( ; j<jpos; j+=SIMDSIZE )
3778 for(
size_t i=ii; i<iend; ++i ) {
3779 xmm1 +=
set( x[i] ) * A.load(i,j);
3782 y.store( j, y.load(j) + xmm1*factor );
3785 for( ; remainder && j<jend; ++j )
3789 for(
size_t i=ii; i<iend; ++i ) {
3790 value += x[i] * A(i,j);
3793 y[j] += value * scalar;
3814 template<
typename VT1
3819 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3821 selectLargeAddAssignKernel( y, x, A, scalar );
3826 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3840 template<
typename VT1
3845 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3852 addAssign( y, tmp );
3855 gemv( y, x, A, ET(scalar), ET(1) );
3877 template<
typename VT1 >
3887 if( right.rows() == 0UL || right.columns() == 0UL ) {
3899 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.
scalar_ );
3914 template<
typename VT1
3918 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3922 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3923 selectSmallSubAssignKernel( y, x, A, scalar );
3925 selectBlasSubAssignKernel( y, x, A, scalar );
3943 template<
typename VT1
3947 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3949 y.subAssign( x * A * scalar );
3967 template<
typename VT1
3972 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3974 selectDefaultSubAssignKernel( y, x, A, scalar );
3993 template<
typename VT1
3998 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4002 const size_t M( A.rows() );
4003 const size_t N( A.columns() );
4005 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
4008 const SIMDType factor(
set( scalar ) );
4012 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4022 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4024 for(
size_t i=ibegin; i<iend; ++i ) {
4025 const SIMDType x1(
set( x[i] ) );
4026 xmm1 += x1 * A.load(i,j );
4027 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4028 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4029 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4030 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4031 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4032 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4033 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4036 y.store( j , y.load(j ) - xmm1*factor );
4037 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4038 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4039 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4040 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4041 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4042 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4043 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4046 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4056 SIMDType xmm1, xmm2, xmm3, xmm4;
4058 for(
size_t i=ibegin; i<iend; ++i ) {
4059 const SIMDType x1(
set( x[i] ) );
4060 xmm1 += x1 * A.load(i,j );
4061 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4062 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4063 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4066 y.store( j , y.load(j ) - xmm1*factor );
4067 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4068 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4069 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4072 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4082 SIMDType xmm1, xmm2, xmm3;
4084 for(
size_t i=ibegin; i<iend; ++i ) {
4085 const SIMDType x1(
set( x[i] ) );
4086 xmm1 += x1 * A.load(i,j );
4087 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4088 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4091 y.store( j , y.load(j ) - xmm1*factor );
4092 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4093 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4096 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4106 SIMDType xmm1, xmm2;
4108 for(
size_t i=ibegin; i<iend; ++i ) {
4109 const SIMDType x1(
set( x[i] ) );
4110 xmm1 += x1 * A.load(i,j );
4111 xmm2 += x1 * A.load(i,j+SIMDSIZE);
4114 y.store( j , y.load(j ) - xmm1*factor );
4115 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4118 for( ; j<jpos; j+=SIMDSIZE )
4130 for(
size_t i=ibegin; i<iend; ++i ) {
4131 xmm1 +=
set( x[i] ) * A.load(i,j);
4134 y.store( j, y.load(j) - xmm1*factor );
4137 for( ; remainder && j<N; ++j )
4149 for(
size_t i=ibegin; i<iend; ++i ) {
4150 value += x[i] * A(i,j);
4153 y[j] -= value * scalar;
4172 template<
typename VT1
4177 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4179 selectDefaultSubAssignKernel( y, x, A, scalar );
4198 template<
typename VT1
4203 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4207 const size_t M( A.rows() );
4208 const size_t N( A.columns() );
4210 const size_t jblock( 32768UL /
sizeof( ElementType ) );
4211 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4213 const SIMDType factor(
set( scalar ) );
4217 for(
size_t jj=0U; jj<N; jj+=jblock ) {
4218 for(
size_t ii=0UL; ii<M; ii+=iblock )
4220 const size_t iend(
min( ii+iblock, M ) );
4221 const size_t jtmp(
min( jj+jblock, N ) );
4226 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4227 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
4233 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4235 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4237 for(
size_t i=ii; i<iend; ++i ) {
4238 const SIMDType x1(
set( x[i] ) );
4239 xmm1 += x1 * A.load(i,j );
4240 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4241 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4242 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4243 xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4244 xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4245 xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4246 xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4249 y.store( j , y.load(j ) - xmm1*factor );
4250 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4251 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4252 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4253 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4254 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4255 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4256 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4259 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4261 SIMDType xmm1, xmm2, xmm3, xmm4;
4263 for(
size_t i=ii; i<iend; ++i ) {
4264 const SIMDType x1(
set( x[i] ) );
4265 xmm1 += x1 * A.load(i,j );
4266 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4267 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4268 xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4271 y.store( j , y.load(j ) - xmm1*factor );
4272 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4273 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4274 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4277 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4279 SIMDType xmm1, xmm2, xmm3;
4281 for(
size_t i=ii; i<iend; ++i ) {
4282 const SIMDType x1(
set( x[i] ) );
4283 xmm1 += x1 * A.load(i,j );
4284 xmm2 += x1 * A.load(i,j+SIMDSIZE );
4285 xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4288 y.store( j , y.load(j ) - xmm1*factor );
4289 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4290 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4293 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4295 SIMDType xmm1, xmm2;
4297 for(
size_t i=ii; i<iend; ++i ) {
4298 const SIMDType x1(
set( x[i] ) );
4299 xmm1 += x1 * A.load(i,j );
4300 xmm2 += x1 * A.load(i,j+SIMDSIZE);
4303 y.store( j , y.load(j ) - xmm1*factor );
4304 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4307 for( ; j<jpos; j+=SIMDSIZE )
4311 for(
size_t i=ii; i<iend; ++i ) {
4312 xmm1 +=
set( x[i] ) * A.load(i,j);
4315 y.store( j, y.load(j) - xmm1*factor );
4318 for( ; remainder && j<jend; ++j )
4322 for(
size_t i=ii; i<iend; ++i ) {
4323 value += x[i] * A(i,j);
4326 y[j] -= value * scalar;
4347 template<
typename VT1
4352 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4354 selectLargeSubAssignKernel( y, x, A, scalar );
4359 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4373 template<
typename VT1
4378 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4385 subAssign( y, tmp );
4388 gemv( y, x, A, ET(-scalar), ET(1) );
4410 template<
typename VT1 >
4421 const ResultType tmp(
serial( rhs ) );
4422 multAssign( ~lhs, tmp );
4442 template<
typename VT1 >
4453 const ResultType tmp(
serial( rhs ) );
4454 divAssign( ~lhs, tmp );
4476 template<
typename VT1 >
4487 if( right.rows() == 0UL ) {
4491 else if( right.columns() == 0UL ) {
4521 template<
typename VT1 >
4533 const ResultType tmp( rhs );
4552 template<
typename VT1 >
4563 if( right.rows() == 0UL || right.columns() == 0UL ) {
4597 template<
typename VT1 >
4608 if( right.rows() == 0UL || right.columns() == 0UL ) {
4643 template<
typename VT1 >
4655 const ResultType tmp( rhs );
4678 template<
typename VT1 >
4690 const ResultType tmp( rhs );
4753 template<
typename T1
4760 if( (~vec).
size() != (~mat).
rows() ) {
4790 template<
typename T1
4815 template<
typename VT,
typename MT >
4816 struct Size< TDVecDMatMultExpr<VT,MT> > :
public Columns<MT>
4832 template<
typename VT,
typename MT >
4833 struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4834 :
public BoolConstant< And< IsAligned<VT>, IsAligned<MT> >::value >
4850 template<
typename VT,
typename MT,
bool AF >
typename SubmatrixExprTrait< MT, AF >::Type SubmatrixExprTrait_
Auxiliary alias declaration for the SubmatrixExprTrait type trait.The SubmatrixExprTrait_ alias decla...
Definition: SubmatrixExprTrait.h:134
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:213
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Evaluation of the expression type of a subvector operation. Via this type trait it is possible to...
Definition: SubvectorExprTrait.h:79
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:211
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:194
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:298
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:532
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1755
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:365
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:163
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:385
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:331
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1802
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:133
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the IsFloat type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:355
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:71
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:214
Header file for the IsComplexDouble type trait.
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:136
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
typename MultExprTrait< T1, T2 >::Type MultExprTrait_
Auxiliary alias declaration for the MultExprTrait class template.The MultExprTrait_ alias declaration...
Definition: MultExprTrait.h:344
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:135
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
If_< IsExpression< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:217
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:266
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
If_< IsExpression< MT >, const MT, const MT &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:220
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the Columns type trait.
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:137
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type.In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraint on the data type.
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:134
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:386
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:603
Header file for all forward declarations for expression class templates.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:128
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:252
TDVecDMatMultExpr< VT, MT > This
Type of this TDVecDMatMultExpr instance.
Definition: TDVecDMatMultExpr.h:208
Header file for the IsNumeric type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:604
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:66
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:375
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:212
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:93
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:226
Header file for the TVecMatMultExpr base class.
Constraint on the data type.
Expression object for transpose dense vector-dense matrix multiplications.The TDVecDMatMultExpr class...
Definition: Forward.h:146
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:74
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:311
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:223
Header file for BLAS general matrix/vector multiplication functions (gemv)
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:343
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:320
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:110
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:209
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:223
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:132
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:117
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Compile time evaluation of the size of a vector.The Size type trait evaluates the size of the given v...
Definition: Size.h:75
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:120
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:210
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:321
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:76
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:363
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
const DMatDMatMultExpr< T1, T2, false, false, false, false > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( ).
Definition: DMatDMatMultExpr.h:7505
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.