// Include guard for this header, followed by the head of the DMatDVecMultExpr class template.
// NOTE(review): this excerpt is a lossy listing (original-file line numbers are fused into the
// text and intermediate lines are missing). Only the template parameter MT is visible here, yet
// the base class below also names VT - presumably declared on a missing line; confirm.
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_ 119 template<
typename MT
122 :
// Base class: MatVecMultExpr over a DenseVector of this expression type, with 'false' as the
// second DenseVector template argument.
public MatVecMultExpr< DenseVector< DMatDVecMultExpr<MT,VT>, false > >
// Compile-time helper whose 'value' is true when at least one of evaluateMatrix or
// evaluateVector is set. The template parameter T1 does not appear in the visible expression;
// evaluateMatrix / evaluateVector are defined outside this excerpt.
152 template<
typename T1 >
153 struct UseSMPAssign {
154 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
// Compile-time helper parameterized on three types.
// NOTE(review): only one term of its condition is visible in this excerpt; the remaining terms
// are on missing lines - confirm against the full header.
164 template<
typename T1,
typename T2,
typename T3 >
165 struct UseBlasKernel {
// Visible term: all three types must report simdEnabled.
171 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time helper parameterized on three types. The visible part of 'value' requires
// useOptimizedKernels and simdEnabled on all three types; the expression continues with
// further terms on lines missing from this excerpt.
187 template<
typename T1,
typename T2,
typename T3 >
188 struct UseVectorizedDefaultKernel {
189 enum :
bool { value = useOptimizedKernels &&
191 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Fragment of a compile-time flag that requires simdEnabled on both MT and VT.
// NOTE(review): the enum this term belongs to and its remaining terms are not visible.
227 MT::simdEnabled && VT::simdEnabled &&
// smpAssignable: true only when neither operand needs evaluation (evaluateMatrix and
// evaluateVector both false) and both MT and VT are themselves smpAssignable.
232 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
233 !evaluateVector && VT::smpAssignable };
// Returns the product of the diagonal matrix element at (index,index) and the vector element
// at index.
// NOTE(review): the condition guarding this return is on a missing line - presumably the
// diagonal-matrix case; confirm.
266 return mat_(index,index) *
vec_[index];
// n = number of columns from 'begin' up to the end of the row. 'begin' is defined on a line
// missing from this excerpt.
276 const size_t n (
mat_.columns() -
begin );
// Bounds check against the number of matrix rows. The body of this branch is on lines missing
// from this excerpt (NOTE(review): presumably it throws - confirm).
294 if( index >=
mat_.rows() ) {
// Otherwise forwards to this object's subscript operator.
297 return (*
this)[index];
// Size accessor, declared const and noexcept. Its body is on a line missing from this excerpt.
306 inline size_t size() const noexcept {
// Reports whether the expression may alias the given address: true if either the matrix
// operand or the vector operand reports being aliased with 'alias'.
// Note that this calls isAliased() (not canAlias()) on both operands.
337 template<
typename T >
338 inline bool canAlias(
const T* alias )
const noexcept {
339 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Reports whether the expression is aliased with the given address: true if either the matrix
// operand or the vector operand reports being aliased with 'alias'.
349 template<
typename T >
350 inline bool isAliased(
const T* alias )
const noexcept {
351 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// True only if both the matrix operand and the vector operand report being aligned.
// NOTE(review): the enclosing function signature is on a line missing from this excerpt.
361 return mat_.isAligned() &&
vec_.isAligned();
// Tail of a boolean expression whose beginning is on lines missing from this excerpt.
// Visible parts: one alternative is that rows*columns of the matrix is below
// DMATDVECMULT_THRESHOLD; the overall result additionally requires size() to exceed
// SMP_DMATDVECMULT_THRESHOLD.
375 (
mat_.rows() *
mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
376 (
size() > SMP_DMATDVECMULT_THRESHOLD );
// Fragment of an assignment routine taking a target 'lhs' and this expression as 'rhs'.
// NOTE(review): the function signature, both branch bodies and the definitions of A and x are
// on lines missing from this excerpt.
399 template<
typename VT1 >
// Special case: the matrix has no rows.
406 if( rhs.
mat_.rows() == 0UL ) {
// Special case: the matrix has no columns.
409 else if( rhs.
mat_.columns() == 0UL ) {
// General case: dispatch to the kernel selection with the unwrapped target (~lhs).
422 DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
// Kernel selection for the assignment y = A * x: picks the "small" kernel or the BLAS kernel.
// NOTE(review): the start of the condition (listing line 444) and the 'else' between the two
// calls are on missing lines; the visible term compares rows*columns of A against
// DMATDVECMULT_THRESHOLD.
438 template<
typename VT1
441 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
445 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
446 selectSmallAssignKernel( y, A, x );
448 selectBlasAssignKernel( y, A, x );
// Default assignment kernel taking the target y, the matrix A and the vector x.
// NOTE(review): only the signature is visible; the body is on lines missing from this excerpt.
467 template<
typename VT1
470 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// selectSmallAssignKernel overload that simply forwards to selectDefaultAssignKernel.
// NOTE(review): the return type / condition that selects this overload over the other
// selectSmallAssignKernel is on lines missing from this excerpt.
491 template<
typename VT1
495 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
497 selectDefaultAssignKernel( y, A, x );
// selectSmallAssignKernel overload using explicit SIMD loads.
// Visible structure: rows of A are handled in groups of 8, 4, 3 and 2, followed by a
// single-row section. Within each group the columns are walked in SIMDSIZE-wide steps up to
// jpos (accumulating into xmm registers), the reduced sums are stored into y with '=', and a
// scalar tail from jpos to jend adds the leftover columns with '+=' when 'remainder' is set.
// NOTE(review): 'remainder', 'jend', 'i', 'j', 'x1' and the store for the first row of each
// group are on lines missing from this excerpt (the embedded listing numbers skip) - confirm
// against the full header.
516 template<
typename VT1
520 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
524 const size_t M( A.rows() );
525 const size_t N( A.columns() );
// ---- Row groups of eight ----
529 for( ; (i+8UL) <= M; i+=8UL )
// jpos: jend rounded down to a multiple of SIMDSIZE via bitmask when 'remainder' is set,
// otherwise jend itself. The assertion cross-checks the mask against the modulo form.
539 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
540 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One SIMD accumulator per row of the group.
542 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Vectorized column loop: each row's SIMD chunk is multiplied by x1 and accumulated.
545 for( ; j<jpos; j+=SIMDSIZE ) {
547 xmm1 += A.load(i ,j) * x1;
548 xmm2 += A.load(i+1UL,j) * x1;
549 xmm3 += A.load(i+2UL,j) * x1;
550 xmm4 += A.load(i+3UL,j) * x1;
551 xmm5 += A.load(i+4UL,j) * x1;
552 xmm6 += A.load(i+5UL,j) * x1;
553 xmm7 += A.load(i+6UL,j) * x1;
554 xmm8 += A.load(i+7UL,j) * x1;
// Horizontal reduction of each accumulator, stored (not added) into y.
558 y[i+1UL] =
sum( xmm2 );
559 y[i+2UL] =
sum( xmm3 );
560 y[i+3UL] =
sum( xmm4 );
561 y[i+4UL] =
sum( xmm5 );
562 y[i+5UL] =
sum( xmm6 );
563 y[i+6UL] =
sum( xmm7 );
564 y[i+7UL] =
sum( xmm8 );
// Scalar tail for the columns between jpos and jend (only when 'remainder' is set).
566 for( ; remainder && j<jend; ++j ) {
567 y[i ] += A(i ,j) * x[j];
568 y[i+1UL] += A(i+1UL,j) * x[j];
569 y[i+2UL] += A(i+2UL,j) * x[j];
570 y[i+3UL] += A(i+3UL,j) * x[j];
571 y[i+4UL] += A(i+4UL,j) * x[j];
572 y[i+5UL] += A(i+5UL,j) * x[j];
573 y[i+6UL] += A(i+6UL,j) * x[j];
574 y[i+7UL] += A(i+7UL,j) * x[j];
// ---- Row groups of four (same scheme as above) ----
578 for( ; (i+4UL) <= M; i+=4UL )
588 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
589 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
594 for( ; j<jpos; j+=SIMDSIZE ) {
596 xmm1 += A.load(i ,j) * x1;
597 xmm2 += A.load(i+1UL,j) * x1;
598 xmm3 += A.load(i+2UL,j) * x1;
599 xmm4 += A.load(i+3UL,j) * x1;
603 y[i+1UL] =
sum( xmm2 );
604 y[i+2UL] =
sum( xmm3 );
605 y[i+3UL] =
sum( xmm4 );
607 for( ; remainder && j<jend; ++j ) {
608 y[i ] += A(i ,j) * x[j];
609 y[i+1UL] += A(i+1UL,j) * x[j];
610 y[i+2UL] += A(i+2UL,j) * x[j];
611 y[i+3UL] += A(i+3UL,j) * x[j];
// ---- Row groups of three ----
615 for( ; (i+3UL) <= M; i+=3UL )
625 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
626 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
631 for( ; j<jpos; j+=SIMDSIZE ) {
633 xmm1 += A.load(i ,j) * x1;
634 xmm2 += A.load(i+1UL,j) * x1;
635 xmm3 += A.load(i+2UL,j) * x1;
639 y[i+1UL] =
sum( xmm2 );
640 y[i+2UL] =
sum( xmm3 );
642 for( ; remainder && j<jend; ++j ) {
643 y[i ] += A(i ,j) * x[j];
644 y[i+1UL] += A(i+1UL,j) * x[j];
645 y[i+2UL] += A(i+2UL,j) * x[j];
// ---- Row groups of two ----
649 for( ; (i+2UL) <= M; i+=2UL )
659 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
660 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
665 for( ; j<jpos; j+=SIMDSIZE ) {
667 xmm1 += A.load(i ,j) * x1;
668 xmm2 += A.load(i+1UL,j) * x1;
672 y[i+1UL] =
sum( xmm2 );
674 for( ; remainder && j<jend; ++j ) {
675 y[i ] += A(i ,j) * x[j];
676 y[i+1UL] += A(i+1UL,j) * x[j];
// ---- Single-row section (its enclosing statement is on a missing line) ----
// Here the vector chunk is loaded directly via x.load(j) instead of going through x1.
690 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
691 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
696 for( ; j<jpos; j+=SIMDSIZE ) {
697 xmm1 += A.load(i,j) * x.load(j);
702 for( ; remainder && j<jend; ++j ) {
703 y[i] += A(i,j) * x[j];
// selectLargeAssignKernel overload that simply forwards to selectDefaultAssignKernel.
// NOTE(review): the return type / condition that selects this overload is on lines missing
// from this excerpt.
724 template<
typename VT1
728 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
730 selectDefaultAssignKernel( y, A, x );
// selectLargeAssignKernel overload using explicit SIMD loads.
// Visible structure: rows of A are handled in groups of 8, 4 and 2, followed by a single-row
// section (there is no 3-row group here). For every row group the columns are processed by
// three successive SIMD loops - four SIMD vectors per iteration, then two, then one - and a
// scalar tail from jpos to jend when 'remainder' is set.
// Every visible store into y uses '+='.
// NOTE(review): y is presumably zero-initialized on a line missing from this excerpt (the
// listing numbers jump from 758 to 764); 'remainder', 'jend', 'i', 'j' and x1..x4 are also
// defined on missing lines - confirm against the full header.
749 template<
typename VT1
753 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
757 const size_t M( A.rows() );
758 const size_t N( A.columns() );
// ---- Row groups of eight ----
764 for( ; (i+8UL) <= M; i+=8UL )
// jpos: jend rounded down to a multiple of SIMDSIZE via bitmask when 'remainder' is set,
// otherwise jend itself. The assertion cross-checks the mask against the modulo form.
774 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
775 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD vectors per iteration: runs while the start of the fourth vector (j3) is below jpos.
779 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
780 const size_t j1( j+SIMDSIZE );
781 const size_t j2( j+SIMDSIZE*2UL );
782 const size_t j3( j+SIMDSIZE*3UL );
787 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
788 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
789 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
790 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
791 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
792 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
793 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
794 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two SIMD vectors per iteration.
797 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
798 const size_t j1( j+SIMDSIZE );
801 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
802 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
803 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
804 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
805 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
806 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
807 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
808 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One SIMD vector per iteration.
811 for( ; j<jpos; j+=SIMDSIZE ) {
813 y[i ] +=
sum( A.load(i ,j) * x1 );
814 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
815 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
816 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
817 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
818 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
819 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
820 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar tail for the columns between jpos and jend (only when 'remainder' is set).
823 for( ; remainder && j<jend; ++j ) {
824 y[i ] += A(i ,j) * x[j];
825 y[i+1UL] += A(i+1UL,j) * x[j];
826 y[i+2UL] += A(i+2UL,j) * x[j];
827 y[i+3UL] += A(i+3UL,j) * x[j];
828 y[i+4UL] += A(i+4UL,j) * x[j];
829 y[i+5UL] += A(i+5UL,j) * x[j];
830 y[i+6UL] += A(i+6UL,j) * x[j];
831 y[i+7UL] += A(i+7UL,j) * x[j];
// ---- Row groups of four (same column scheme as above) ----
835 for( ; (i+4UL) <= M; i+=4UL )
845 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
846 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
850 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
851 const size_t j1( j+SIMDSIZE );
852 const size_t j2( j+SIMDSIZE*2UL );
853 const size_t j3( j+SIMDSIZE*3UL );
858 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
859 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
860 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
861 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
864 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
865 const size_t j1( j+SIMDSIZE );
868 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
869 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
870 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
871 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
874 for( ; j<jpos; j+=SIMDSIZE ) {
876 y[i ] +=
sum( A.load(i ,j) * x1 );
877 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
878 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
879 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
882 for( ; remainder && j<jend; ++j ) {
883 y[i ] += A(i ,j) * x[j];
884 y[i+1UL] += A(i+1UL,j) * x[j];
885 y[i+2UL] += A(i+2UL,j) * x[j];
886 y[i+3UL] += A(i+3UL,j) * x[j];
// ---- Row groups of two ----
890 for( ; (i+2UL) <= M; i+=2UL )
900 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
901 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
905 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
906 const size_t j1( j+SIMDSIZE );
907 const size_t j2( j+SIMDSIZE*2UL );
908 const size_t j3( j+SIMDSIZE*3UL );
913 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
914 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
917 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
918 const size_t j1( j+SIMDSIZE );
921 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
922 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
925 for( ; j<jpos; j+=SIMDSIZE ) {
927 y[i ] +=
sum( A.load(i ,j) * x1 );
928 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
931 for( ; remainder && j<jend; ++j ) {
932 y[i ] += A(i ,j) * x[j];
933 y[i+1UL] += A(i+1UL,j) * x[j];
// ---- Single-row section (its enclosing statement is on a missing line) ----
947 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
948 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
952 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
953 const size_t j1( j+SIMDSIZE );
954 const size_t j2( j+SIMDSIZE*2UL );
955 const size_t j3( j+SIMDSIZE*3UL );
960 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
963 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
964 const size_t j1( j+SIMDSIZE );
967 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
970 for( ; j<jpos; j+=SIMDSIZE ) {
972 y[i] +=
sum( A.load(i,j) * x1 );
975 for( ; remainder && j<jend; ++j ) {
976 y[i] += A(i,j) * x[j];
// selectBlasAssignKernel overload that forwards to selectLargeAssignKernel.
// NOTE(review): the return type / condition that selects this overload is on lines missing
// from this excerpt.
997 template<
typename VT1
1001 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1003 selectLargeAssignKernel( y, A, x );
// selectBlasAssignKernel overload compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
// NOTE(review): ET and the rest of the body are on lines missing from this excerpt.
1009 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1023 template<
typename VT1
1027 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Calls gemv with the scalar arguments ET(1) and ET(0) - presumably the multiplier of A*x and
// the multiplier of the existing y, i.e. y = A*x; confirm against the gemv wrapper's signature.
1036 gemv( y, A, x, ET(1), ET(0) );
// Fragment of an assignment routine that goes through a temporary: 'tmp' (built on lines
// missing from this excerpt) is assigned to the unwrapped target (~lhs).
1056 template<
typename VT1 >
1068 assign( ~lhs, tmp );
// Fragment of an addition-assignment routine taking a target 'lhs' and this expression as 'rhs'.
// NOTE(review): the signature, the body of the empty-matrix branch and the definitions of A
// and x are on lines missing from this excerpt.
1086 template<
typename VT1 >
// Special case: the matrix has no rows or no columns.
1093 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
// General case: dispatch to the add-assign kernel selection with the unwrapped target.
1105 DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
// Kernel selection for the addition assignment y += A * x: picks the "small" kernel or the
// BLAS kernel.
// NOTE(review): the start of the condition and the 'else' between the two calls are on missing
// lines; the visible term compares rows*columns of A against DMATDVECMULT_THRESHOLD.
1121 template<
typename VT1
1124 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1128 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1129 selectSmallAddAssignKernel( y, A, x );
1131 selectBlasAddAssignKernel( y, A, x );
// Default addition-assignment kernel: delegates to y.addAssign() with the product
// expression A * x.
1150 template<
typename VT1
1153 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1155 y.addAssign( A * x );
// selectSmallAddAssignKernel overload that simply forwards to selectDefaultAddAssignKernel.
// NOTE(review): the return type / condition that selects this overload is on lines missing
// from this excerpt.
1174 template<
typename VT1
1178 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1180 selectDefaultAddAssignKernel( y, A, x );
// selectSmallAddAssignKernel overload using explicit SIMD loads.
// Visible structure: rows of A are handled in groups of 8, 4, 3 and 2, followed by a
// single-row section. Within each group the columns are walked in SIMDSIZE-wide steps up to
// jpos (accumulating into xmm registers), the reduced sums are added to y with '+=', and a
// scalar tail from jpos to jend adds the leftover columns when 'remainder' is set.
// NOTE(review): 'remainder', 'jend', 'i', 'j' and 'x1' are defined on lines missing from this
// excerpt (the embedded listing numbers skip) - confirm against the full header.
1199 template<
typename VT1
1203 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1207 const size_t M( A.rows() );
1208 const size_t N( A.columns() );
// ---- Row groups of eight ----
1212 for( ; (i+8UL) <= M; i+=8UL )
// jpos: jend rounded down to a multiple of SIMDSIZE via bitmask when 'remainder' is set,
// otherwise jend itself. The assertion cross-checks the mask against the modulo form.
1222 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1223 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One SIMD accumulator per row of the group.
1225 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1228 for( ; j<jpos; j+=SIMDSIZE ) {
1230 xmm1 += A.load(i ,j) * x1;
1231 xmm2 += A.load(i+1UL,j) * x1;
1232 xmm3 += A.load(i+2UL,j) * x1;
1233 xmm4 += A.load(i+3UL,j) * x1;
1234 xmm5 += A.load(i+4UL,j) * x1;
1235 xmm6 += A.load(i+5UL,j) * x1;
1236 xmm7 += A.load(i+6UL,j) * x1;
1237 xmm8 += A.load(i+7UL,j) * x1;
// Horizontal reduction of each accumulator, added onto the existing y values.
1240 y[i ] +=
sum( xmm1 );
1241 y[i+1UL] +=
sum( xmm2 );
1242 y[i+2UL] +=
sum( xmm3 );
1243 y[i+3UL] +=
sum( xmm4 );
1244 y[i+4UL] +=
sum( xmm5 );
1245 y[i+5UL] +=
sum( xmm6 );
1246 y[i+6UL] +=
sum( xmm7 );
1247 y[i+7UL] +=
sum( xmm8 );
// Scalar tail for the columns between jpos and jend (only when 'remainder' is set).
1249 for( ; remainder && j<jend; ++j ) {
1250 y[i ] += A(i ,j) * x[j];
1251 y[i+1UL] += A(i+1UL,j) * x[j];
1252 y[i+2UL] += A(i+2UL,j) * x[j];
1253 y[i+3UL] += A(i+3UL,j) * x[j];
1254 y[i+4UL] += A(i+4UL,j) * x[j];
1255 y[i+5UL] += A(i+5UL,j) * x[j];
1256 y[i+6UL] += A(i+6UL,j) * x[j];
1257 y[i+7UL] += A(i+7UL,j) * x[j];
// ---- Row groups of four (same scheme as above) ----
1261 for( ; (i+4UL) <= M; i+=4UL )
1271 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1272 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1277 for( ; j<jpos; j+=SIMDSIZE ) {
1279 xmm1 += A.load(i ,j) * x1;
1280 xmm2 += A.load(i+1UL,j) * x1;
1281 xmm3 += A.load(i+2UL,j) * x1;
1282 xmm4 += A.load(i+3UL,j) * x1;
1285 y[i ] +=
sum( xmm1 );
1286 y[i+1UL] +=
sum( xmm2 );
1287 y[i+2UL] +=
sum( xmm3 );
1288 y[i+3UL] +=
sum( xmm4 );
1290 for( ; remainder && j<jend; ++j ) {
1291 y[i ] += A(i ,j) * x[j];
1292 y[i+1UL] += A(i+1UL,j) * x[j];
1293 y[i+2UL] += A(i+2UL,j) * x[j];
1294 y[i+3UL] += A(i+3UL,j) * x[j];
// ---- Row groups of three ----
1298 for( ; (i+3UL) <= M; i+=3UL )
1308 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1309 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1314 for( ; j<jpos; j+=SIMDSIZE ) {
1316 xmm1 += A.load(i ,j) * x1;
1317 xmm2 += A.load(i+1UL,j) * x1;
1318 xmm3 += A.load(i+2UL,j) * x1;
1321 y[i ] +=
sum( xmm1 );
1322 y[i+1UL] +=
sum( xmm2 );
1323 y[i+2UL] +=
sum( xmm3 );
1325 for( ; remainder && j<jend; ++j ) {
1326 y[i ] += A(i ,j) * x[j];
1327 y[i+1UL] += A(i+1UL,j) * x[j];
1328 y[i+2UL] += A(i+2UL,j) * x[j];
// ---- Row groups of two ----
1332 for( ; (i+2UL) <= M; i+=2UL )
1342 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1343 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1348 for( ; j<jpos; j+=SIMDSIZE ) {
1350 xmm1 += A.load(i ,j) * x1;
1351 xmm2 += A.load(i+1UL,j) * x1;
1354 y[i ] +=
sum( xmm1 );
1355 y[i+1UL] +=
sum( xmm2 );
1357 for( ; remainder && j<jend; ++j ) {
1358 y[i ] += A(i ,j) * x[j];
1359 y[i+1UL] += A(i+1UL,j) * x[j];
// ---- Single-row section (its enclosing statement is on a missing line) ----
// Here the vector chunk is loaded directly via x.load(j) instead of going through x1.
1373 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1374 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1379 for( ; j<jpos; j+=SIMDSIZE ) {
1380 xmm1 += A.load(i,j) * x.load(j);
1383 y[i] +=
sum( xmm1 );
1385 for( ; remainder && j<jend; ++j ) {
1386 y[i] += A(i,j) * x[j];
// selectLargeAddAssignKernel overload that simply forwards to selectDefaultAddAssignKernel.
// NOTE(review): the return type / condition that selects this overload is on lines missing
// from this excerpt.
1407 template<
typename VT1
1411 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1413 selectDefaultAddAssignKernel( y, A, x );
// selectLargeAddAssignKernel overload using explicit SIMD loads.
// Visible structure: rows of A are handled in groups of 8, 4 and 2, followed by a single-row
// section (there is no 3-row group here). For every row group the columns are processed by
// three successive SIMD loops - four SIMD vectors per iteration, then two, then one - and a
// scalar tail from jpos to jend when 'remainder' is set. All results are added onto y ('+=').
// NOTE(review): 'remainder', 'jend', 'i', 'j' and x1..x4 are defined on lines missing from
// this excerpt (the embedded listing numbers skip) - confirm against the full header.
1432 template<
typename VT1
1436 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1440 const size_t M( A.rows() );
1441 const size_t N( A.columns() );
// ---- Row groups of eight ----
1445 for( ; (i+8UL) <= M; i+=8UL )
// jpos: jend rounded down to a multiple of SIMDSIZE via bitmask when 'remainder' is set,
// otherwise jend itself. The assertion cross-checks the mask against the modulo form.
1455 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1456 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD vectors per iteration: runs while the start of the fourth vector (j3) is below jpos.
1460 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1461 const size_t j1( j+SIMDSIZE );
1462 const size_t j2( j+SIMDSIZE*2UL );
1463 const size_t j3( j+SIMDSIZE*3UL );
1468 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1469 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1470 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1471 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1472 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1473 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1474 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1475 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two SIMD vectors per iteration.
1478 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1479 const size_t j1( j+SIMDSIZE );
1482 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1483 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1484 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1485 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1486 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1487 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1488 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1489 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One SIMD vector per iteration.
1492 for( ; j<jpos; j+=SIMDSIZE ) {
1494 y[i ] +=
sum( A.load(i ,j) * x1 );
1495 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1496 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1497 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1498 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
1499 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
1500 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
1501 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar tail for the columns between jpos and jend (only when 'remainder' is set).
1504 for( ; remainder && j<jend; ++j ) {
1505 y[i ] += A(i ,j) * x[j];
1506 y[i+1UL] += A(i+1UL,j) * x[j];
1507 y[i+2UL] += A(i+2UL,j) * x[j];
1508 y[i+3UL] += A(i+3UL,j) * x[j];
1509 y[i+4UL] += A(i+4UL,j) * x[j];
1510 y[i+5UL] += A(i+5UL,j) * x[j];
1511 y[i+6UL] += A(i+6UL,j) * x[j];
1512 y[i+7UL] += A(i+7UL,j) * x[j];
// ---- Row groups of four (same column scheme as above) ----
1516 for( ; (i+4UL) <= M; i+=4UL )
1526 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1527 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1531 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1532 const size_t j1( j+SIMDSIZE );
1533 const size_t j2( j+SIMDSIZE*2UL );
1534 const size_t j3( j+SIMDSIZE*3UL );
1539 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1540 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1541 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1542 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1545 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1546 const size_t j1( j+SIMDSIZE );
1549 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1550 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1551 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1552 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1555 for( ; j<jpos; j+=SIMDSIZE ) {
1557 y[i ] +=
sum( A.load(i ,j) * x1 );
1558 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1559 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1560 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1563 for( ; remainder && j<jend; ++j ) {
1564 y[i ] += A(i ,j) * x[j];
1565 y[i+1UL] += A(i+1UL,j) * x[j];
1566 y[i+2UL] += A(i+2UL,j) * x[j];
1567 y[i+3UL] += A(i+3UL,j) * x[j];
// ---- Row groups of two ----
1571 for( ; (i+2UL) <= M; i+=2UL )
1581 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1582 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1586 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1587 const size_t j1( j+SIMDSIZE );
1588 const size_t j2( j+SIMDSIZE*2UL );
1589 const size_t j3( j+SIMDSIZE*3UL );
1594 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1595 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1598 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1599 const size_t j1( j+SIMDSIZE );
1602 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1603 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1606 for( ; j<jpos; j+=SIMDSIZE ) {
1608 y[i ] +=
sum( A.load(i ,j) * x1 );
1609 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1612 for( ; remainder && j<jend; ++j ) {
1613 y[i ] += A(i ,j) * x[j];
1614 y[i+1UL] += A(i+1UL,j) * x[j];
// ---- Single-row section (its enclosing statement is on a missing line) ----
1628 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1629 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1633 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1634 const size_t j1( j+SIMDSIZE );
1635 const size_t j2( j+SIMDSIZE*2UL );
1636 const size_t j3( j+SIMDSIZE*3UL );
1641 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1644 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1645 const size_t j1( j+SIMDSIZE );
1648 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1651 for( ; j<jpos; j+=SIMDSIZE ) {
1653 y[i] +=
sum( A.load(i,j) * x1 );
1656 for( ; remainder && j<jend; ++j ) {
1657 y[i] += A(i,j) * x[j];
// selectBlasAddAssignKernel overload that forwards to selectLargeAddAssignKernel.
// NOTE(review): the return type / condition that selects this overload is on lines missing
// from this excerpt.
1678 template<
typename VT1
1682 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1684 selectLargeAddAssignKernel( y, A, x );
// selectBlasAddAssignKernel overload compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
// NOTE(review): the branch condition separating the two visible calls, and the definitions of
// 'tmp' and ET, are on lines missing from this excerpt.
1690 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1704 template<
typename VT1
1708 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Path 1: add a separately built temporary onto y.
1715 addAssign( y, tmp );
// Path 2: call gemv with the scalar arguments ET(1) and ET(1) - presumably the multiplier of
// A*x and the multiplier of the existing y, i.e. y = A*x + y; confirm against the gemv wrapper.
1718 gemv( y, A, x, ET(1), ET(1) );
// Fragment of a subtraction-assignment routine taking a target 'lhs' and this expression as 'rhs'.
// NOTE(review): the signature, the body of the empty-matrix branch and the definitions of A
// and x are on lines missing from this excerpt.
1742 template<
typename VT1 >
// Special case: the matrix has no rows or no columns.
1749 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
// General case: dispatch to the sub-assign kernel selection with the unwrapped target.
1761 DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
// Kernel selection for the subtraction assignment y -= A * x: picks the "small" kernel or the
// BLAS kernel.
// NOTE(review): the start of the condition and the 'else' between the two calls are on missing
// lines; the visible term compares rows*columns of A against DMATDVECMULT_THRESHOLD.
1777 template<
typename VT1
1780 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1784 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1785 selectSmallSubAssignKernel( y, A, x );
1787 selectBlasSubAssignKernel( y, A, x );
// Default subtraction-assignment kernel: delegates to y.subAssign() with the product
// expression A * x.
1806 template<
typename VT1
1809 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1811 y.subAssign( A * x );
// selectSmallSubAssignKernel overload that simply forwards to selectDefaultSubAssignKernel.
// NOTE(review): the return type / condition that selects this overload is on lines missing
// from this excerpt.
1830 template<
typename VT1
1834 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1836 selectDefaultSubAssignKernel( y, A, x );
// selectSmallSubAssignKernel overload using explicit SIMD loads.
// Visible structure: rows of A are handled in groups of 8, 4, 3 and 2, followed by a
// single-row section. Within each group the columns are walked in SIMDSIZE-wide steps up to
// jpos (accumulating into xmm registers), the reduced sums are subtracted from y with '-=',
// and a scalar tail from jpos to jend subtracts the leftover columns when 'remainder' is set.
// NOTE(review): 'remainder', 'jend', 'i', 'j' and 'x1' are defined on lines missing from this
// excerpt (the embedded listing numbers skip) - confirm against the full header.
1855 template<
typename VT1
1859 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1863 const size_t M( A.rows() );
1864 const size_t N( A.columns() );
// ---- Row groups of eight ----
1868 for( ; (i+8UL) <= M; i+=8UL )
// jpos: jend rounded down to a multiple of SIMDSIZE via bitmask when 'remainder' is set,
// otherwise jend itself. The assertion cross-checks the mask against the modulo form.
1878 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1879 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One SIMD accumulator per row of the group.
1881 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1884 for( ; j<jpos; j+=SIMDSIZE ) {
1886 xmm1 += A.load(i ,j) * x1;
1887 xmm2 += A.load(i+1UL,j) * x1;
1888 xmm3 += A.load(i+2UL,j) * x1;
1889 xmm4 += A.load(i+3UL,j) * x1;
1890 xmm5 += A.load(i+4UL,j) * x1;
1891 xmm6 += A.load(i+5UL,j) * x1;
1892 xmm7 += A.load(i+6UL,j) * x1;
1893 xmm8 += A.load(i+7UL,j) * x1;
// Horizontal reduction of each accumulator, subtracted from the existing y values.
1896 y[i ] -=
sum( xmm1 );
1897 y[i+1UL] -=
sum( xmm2 );
1898 y[i+2UL] -=
sum( xmm3 );
1899 y[i+3UL] -=
sum( xmm4 );
1900 y[i+4UL] -=
sum( xmm5 );
1901 y[i+5UL] -=
sum( xmm6 );
1902 y[i+6UL] -=
sum( xmm7 );
1903 y[i+7UL] -=
sum( xmm8 );
// Scalar tail for the columns between jpos and jend (only when 'remainder' is set).
1905 for( ; remainder && j<jend; ++j ) {
1906 y[i ] -= A(i ,j) * x[j];
1907 y[i+1UL] -= A(i+1UL,j) * x[j];
1908 y[i+2UL] -= A(i+2UL,j) * x[j];
1909 y[i+3UL] -= A(i+3UL,j) * x[j];
1910 y[i+4UL] -= A(i+4UL,j) * x[j];
1911 y[i+5UL] -= A(i+5UL,j) * x[j];
1912 y[i+6UL] -= A(i+6UL,j) * x[j];
1913 y[i+7UL] -= A(i+7UL,j) * x[j];
// ---- Row groups of four (same scheme as above) ----
1917 for( ; (i+4UL) <= M; i+=4UL )
1927 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1928 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1933 for( ; j<jpos; j+=SIMDSIZE ) {
1935 xmm1 += A.load(i ,j) * x1;
1936 xmm2 += A.load(i+1UL,j) * x1;
1937 xmm3 += A.load(i+2UL,j) * x1;
1938 xmm4 += A.load(i+3UL,j) * x1;
1941 y[i ] -=
sum( xmm1 );
1942 y[i+1UL] -=
sum( xmm2 );
1943 y[i+2UL] -=
sum( xmm3 );
1944 y[i+3UL] -=
sum( xmm4 );
1946 for( ; remainder && j<jend; ++j ) {
1947 y[i ] -= A(i ,j) * x[j];
1948 y[i+1UL] -= A(i+1UL,j) * x[j];
1949 y[i+2UL] -= A(i+2UL,j) * x[j];
1950 y[i+3UL] -= A(i+3UL,j) * x[j];
// ---- Row groups of three ----
1954 for( ; (i+3UL) <= M; i+=3UL )
1964 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1965 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1970 for( ; j<jpos; j+=SIMDSIZE ) {
1972 xmm1 += A.load(i ,j) * x1;
1973 xmm2 += A.load(i+1UL,j) * x1;
1974 xmm3 += A.load(i+2UL,j) * x1;
1977 y[i ] -=
sum( xmm1 );
1978 y[i+1UL] -=
sum( xmm2 );
1979 y[i+2UL] -=
sum( xmm3 );
1981 for( ; remainder && j<jend; ++j ) {
1982 y[i ] -= A(i ,j) * x[j];
1983 y[i+1UL] -= A(i+1UL,j) * x[j];
1984 y[i+2UL] -= A(i+2UL,j) * x[j];
// ---- Row groups of two ----
1988 for( ; (i+2UL) <= M; i+=2UL )
1998 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1999 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2004 for( ; j<jpos; j+=SIMDSIZE ) {
2006 xmm1 += A.load(i ,j) * x1;
2007 xmm2 += A.load(i+1UL,j) * x1;
2010 y[i ] -=
sum( xmm1 );
2011 y[i+1UL] -=
sum( xmm2 );
2013 for( ; remainder && j<jend; ++j ) {
2014 y[i ] -= A(i ,j) * x[j];
2015 y[i+1UL] -= A(i+1UL,j) * x[j];
// ---- Single-row section (its enclosing statement is on a missing line) ----
// Here the vector chunk is loaded directly via x.load(j) instead of going through x1.
2029 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2030 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2035 for( ; j<jpos; j+=SIMDSIZE ) {
2036 xmm1 += A.load(i,j) * x.load(j);
2039 y[i] -=
sum( xmm1 );
2041 for( ; remainder && j<jend; ++j ) {
2042 y[i] -= A(i,j) * x[j];
// selectLargeSubAssignKernel overload that simply forwards to selectDefaultSubAssignKernel.
// NOTE(review): the return type / condition that selects this overload is on lines missing
// from this excerpt.
2063 template<
typename VT1
2067 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2069 selectDefaultSubAssignKernel( y, A, x );
// selectLargeSubAssignKernel overload using explicit SIMD loads.
// Visible structure: rows of A are handled in groups of 8, 4 and 2, followed by a single-row
// section (there is no 3-row group here). For every row group the columns are processed by
// three successive SIMD loops - four SIMD vectors per iteration, then two, then one - and a
// scalar tail from jpos to jend when 'remainder' is set. All results are subtracted from y ('-=').
// NOTE(review): 'remainder', 'jend', 'i', 'j' and x1..x4 are defined on lines missing from
// this excerpt (the embedded listing numbers skip) - confirm against the full header.
2088 template<
typename VT1
2092 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2096 const size_t M( A.rows() );
2097 const size_t N( A.columns() );
// ---- Row groups of eight ----
2101 for( ; (i+8UL) <= M; i+=8UL )
// jpos: jend rounded down to a multiple of SIMDSIZE via bitmask when 'remainder' is set,
// otherwise jend itself. The assertion cross-checks the mask against the modulo form.
2111 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2112 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD vectors per iteration: runs while the start of the fourth vector (j3) is below jpos.
2116 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2117 const size_t j1( j+SIMDSIZE );
2118 const size_t j2( j+SIMDSIZE*2UL );
2119 const size_t j3( j+SIMDSIZE*3UL );
2124 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2125 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2126 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2127 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2128 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2129 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2130 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2131 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two SIMD vectors per iteration.
2134 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2135 const size_t j1( j+SIMDSIZE );
2138 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2139 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2140 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2141 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2142 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2143 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2144 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2145 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One SIMD vector per iteration.
2148 for( ; j<jpos; j+=SIMDSIZE ) {
2150 y[i ] -=
sum( A.load(i ,j) * x1 );
2151 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2152 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2153 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2154 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 );
2155 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 );
2156 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 );
2157 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 );
// Scalar tail for the columns between jpos and jend (only when 'remainder' is set).
2160 for( ; remainder && j<jend; ++j ) {
2161 y[i ] -= A(i ,j) * x[j];
2162 y[i+1UL] -= A(i+1UL,j) * x[j];
2163 y[i+2UL] -= A(i+2UL,j) * x[j];
2164 y[i+3UL] -= A(i+3UL,j) * x[j];
2165 y[i+4UL] -= A(i+4UL,j) * x[j];
2166 y[i+5UL] -= A(i+5UL,j) * x[j];
2167 y[i+6UL] -= A(i+6UL,j) * x[j];
2168 y[i+7UL] -= A(i+7UL,j) * x[j];
// ---- Row groups of four (same column scheme as above) ----
2172 for( ; (i+4UL) <= M; i+=4UL )
2182 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2183 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2187 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2188 const size_t j1( j+SIMDSIZE );
2189 const size_t j2( j+SIMDSIZE*2UL );
2190 const size_t j3( j+SIMDSIZE*3UL );
2195 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2196 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2197 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2198 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2201 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2202 const size_t j1( j+SIMDSIZE );
2205 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2206 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2207 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2208 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2211 for( ; j<jpos; j+=SIMDSIZE ) {
2213 y[i ] -=
sum( A.load(i ,j) * x1 );
2214 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2215 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2216 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2219 for( ; remainder && j<jend; ++j ) {
2220 y[i ] -= A(i ,j) * x[j];
2221 y[i+1UL] -= A(i+1UL,j) * x[j];
2222 y[i+2UL] -= A(i+2UL,j) * x[j];
2223 y[i+3UL] -= A(i+3UL,j) * x[j];
// ---- Row groups of two ----
2227 for( ; (i+2UL) <= M; i+=2UL )
2237 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2238 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2242 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2243 const size_t j1( j+SIMDSIZE );
2244 const size_t j2( j+SIMDSIZE*2UL );
2245 const size_t j3( j+SIMDSIZE*3UL );
2250 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2251 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2254 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2255 const size_t j1( j+SIMDSIZE );
2258 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2259 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2262 for( ; j<jpos; j+=SIMDSIZE ) {
2264 y[i ] -=
sum( A.load(i ,j) * x1 );
2265 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2268 for( ; remainder && j<jend; ++j ) {
2269 y[i ] -= A(i ,j) * x[j];
2270 y[i+1UL] -= A(i+1UL,j) * x[j];
// ---- Single-row section (its enclosing statement is on a missing line) ----
2284 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2285 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2289 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2290 const size_t j1( j+SIMDSIZE );
2291 const size_t j2( j+SIMDSIZE*2UL );
2292 const size_t j3( j+SIMDSIZE*3UL );
2297 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2300 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2301 const size_t j1( j+SIMDSIZE );
2304 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2307 for( ; j<jpos; j+=SIMDSIZE ) {
2309 y[i] -=
sum( A.load(i,j) * x1 );
2312 for( ; remainder && j<jend; ++j ) {
2313 y[i] -= A(i,j) * x[j];
// selectBlasSubAssignKernel overload that forwards to selectLargeSubAssignKernel.
// NOTE(review): the return type / condition that selects this overload is on lines missing
// from this excerpt.
2334 template<
typename VT1
2338 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2340 selectLargeSubAssignKernel( y, A, x );
// BLAS-backed overload of selectBlasSubAssignKernel. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION evaluate to non-zero.
2346 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2360 template<
typename VT1
2364 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// One visible path subtracts a temporary `tmp` from y (construction of tmp and the
// condition guarding this path are not visible here).
2371 subAssign( y, tmp );
// Other visible path: gemv called with the factors ET(-1) and ET(1).
// NOTE(review): presumably this computes y = -1*(A*x) + 1*y -- confirm against gemv's
// parameter order.
2374 gemv( y, A, x, ET(-1), ET(1) );
// Multiplication-assignment fragment: the visible statement applies multAssign to the
// target (~lhs) using a temporary `tmp`.
// NOTE(review): the enclosing signature and the creation of tmp are not visible here.
2398 template<
typename VT1 >
2410 multAssign( ~lhs, tmp );
// Division-assignment fragment: the visible statement applies divAssign to the
// target (~lhs) using a temporary `tmp`.
// NOTE(review): the enclosing signature and the creation of tmp are not visible here.
2432 template<
typename VT1 >
2444 divAssign( ~lhs, tmp );
2468 template<
typename VT1 >
2476 if( rhs.
mat_.rows() == 0UL ) {
2479 else if( rhs.
mat_.columns() == 0UL ) {
2512 template<
typename VT1 >
2545 template<
typename VT1 >
2553 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
2589 template<
typename VT1 >
2597 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
2633 template<
typename VT1 >
2670 template<
typename VT1 >
2721 template<
typename MT
2725 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false > >
// Helper trait: `value` is true when evaluateMatrix or evaluateVector is set.
// The template parameter T1 does not appear in the visible expression.
// NOTE(review): presumably consulted to decide whether the SMP assignment path is
// used -- confirm at the use sites.
2756 template<
typename T1 >
2757 struct UseSMPAssign {
2758 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
// Helper trait over four type parameters (T1..T4).
// Only part of its condition is visible: it requires T1, T2 and T3 to have
// simdEnabled set. The remaining terms of the condition are elided in this excerpt.
2766 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2767 struct UseBlasKernel {
2773 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Helper trait over four type parameters (T1..T4).
// The visible part of `value` requires useOptimizedKernels and simdEnabled on T1, T2
// and T3; further terms of the condition are elided in this excerpt.
2788 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2789 struct UseVectorizedDefaultKernel {
2790 enum :
bool { value = useOptimizedKernels &&
2792 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2828 MT::simdEnabled && VT::simdEnabled &&
// Compile-time flag: smpAssignable is set only if neither operand needs intermediate
// evaluation (evaluateMatrix / evaluateVector both false) and both MT and VT are
// themselves smpAssignable.
2834 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2835 !evaluateVector && VT::smpAssignable };
// Element access: yields element `index` of the wrapped vector_ multiplied by scalar_.
// NOTE(review): the enclosing signature is not visible; presumably the subscript operator.
2863 return vector_[index] * scalar_;
// Checked element access fragment: tests index against vector_.size() first, then
// delegates to (*this)[index].
// NOTE(review): the body of the out-of-range branch is elided here -- presumably it
// throws; confirm.
2875 if( index >= vector_.size() ) {
2878 return (*
this)[index];
// Returns the size of the expression, which is the size reported by the wrapped vector_.
2887 inline size_t size()
const {
2888 return vector_.size();
// Reports whether the expression can alias with the object at address `alias`.
// The query is forwarded unchanged to vector_.canAlias().
2918 template<
typename T >
2919 inline bool canAlias(
const T* alias )
const {
2920 return vector_.canAlias( alias );
// Reports whether the expression is aliased with the object at address `alias`.
// The query is forwarded unchanged to vector_.isAliased().
2930 template<
typename T >
2931 inline bool isAliased(
const T* alias )
const {
2932 return vector_.isAliased( alias );
// Forwards the alignment query to the wrapped vector_.
// NOTE(review): the enclosing signature is not visible in this excerpt.
2942 return vector_.isAligned();
// Tail of a boolean condition. The visible terms compare the element count
// A.rows() * A.columns() against DMATDVECMULT_THRESHOLD and require size() to exceed
// SMP_DMATDVECMULT_THRESHOLD.
// NOTE(review): the start of the expression and the enclosing signature are elided;
// presumably this decides whether SMP assignment is worthwhile -- confirm.
2957 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2958 (
size() > SMP_DMATDVECMULT_THRESHOLD );
// Assignment entry fragment for the scaled matrix/vector product.
// Visible steps: separate guards for a left operand with zero rows and with zero
// columns (their bodies are elided), followed by a dispatch to selectAssignKernel
// with the target (~lhs), A, x and the expression's scalar_.
// NOTE(review): how `left`, A and x are obtained is not visible in this excerpt.
2980 template<
typename VT1 >
2990 if( left.rows() == 0UL ) {
2993 else if( left.columns() == 0UL ) {
3006 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatcher for the scaled assignment y = A * x * scalar.
// The (partially visible) condition includes A.rows() * A.columns() <
// DMATDVECMULT_THRESHOLD; when it holds the small kernel is called. The call to
// selectBlasAssignKernel follows, presumably as the else branch (the `else` itself
// is elided in this excerpt).
3021 template<
typename VT1
3025 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3029 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3030 selectSmallAssignKernel( y, A, x, scalar );
3032 selectBlasAssignKernel( y, A, x, scalar );
// Default (non-specialized) assignment kernel: hands the expression A * x * scalar
// to y.assign().
3050 template<
typename VT1
3055 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3057 y.assign( A * x * scalar );
// Fallback overload of selectSmallAssignKernel: forwards all arguments unchanged to
// selectDefaultAssignKernel.
// NOTE(review): the condition selecting this overload is elided; presumably it is
// taken when the vectorized kernel cannot be used -- confirm.
3075 template<
typename VT1
3080 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3082 selectDefaultAssignKernel( y, A, x, scalar );
// Vectorized overload of selectSmallAssignKernel for the scaled product.
// For each output row the visible code accumulates A.load(i,j) * x-chunk products in
// SIMD accumulators, writes y[i] = sum( accumulator ) * scalar, and then adds the
// leftover columns element by element. Rows are processed in groups of 8, 4, 3 and 2;
// the final section handles a single row.
// NOTE(review): many lines of this definition are elided in this excerpt (loop index
// declarations, the definitions of jend/remainder, the loads of x1, and the
// accumulator declarations of the smaller row groups).
3100 template<
typename VT1
3105 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3109 const size_t M( A.rows() );
3110 const size_t N( A.columns() );
// --- Groups of 8 rows ---
3114 for( ; (i+8UL) <= M; i+=8UL )
// jpos: end of the SIMD-processed column range. With `remainder` set it is jend rounded
// down to a multiple of SIMDSIZE (checked by the assertion below), otherwise jend itself.
3124 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3125 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One accumulator per row of the group.
// NOTE(review): the code relies on these starting at zero -- presumably SIMDType
// default-constructs to zero; confirm.
3127 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3130 for( ; j<jpos; j+=SIMDSIZE ) {
3132 xmm1 += A.load(i ,j) * x1;
3133 xmm2 += A.load(i+1UL,j) * x1;
3134 xmm3 += A.load(i+2UL,j) * x1;
3135 xmm4 += A.load(i+3UL,j) * x1;
3136 xmm5 += A.load(i+4UL,j) * x1;
3137 xmm6 += A.load(i+5UL,j) * x1;
3138 xmm7 += A.load(i+6UL,j) * x1;
3139 xmm8 += A.load(i+7UL,j) * x1;
// Horizontal reduction of each accumulator; the scalar is applied once per row.
3142 y[i ] =
sum( xmm1 ) * scalar;
3143 y[i+1UL] =
sum( xmm2 ) * scalar;
3144 y[i+2UL] =
sum( xmm3 ) * scalar;
3145 y[i+3UL] =
sum( xmm4 ) * scalar;
3146 y[i+4UL] =
sum( xmm5 ) * scalar;
3147 y[i+5UL] =
sum( xmm6 ) * scalar;
3148 y[i+6UL] =
sum( xmm7 ) * scalar;
3149 y[i+7UL] =
sum( xmm8 ) * scalar;
// Element-wise tail for columns [j, jend); only runs when `remainder` is set.
3151 for( ; remainder && j<jend; ++j ) {
3152 y[i ] += A(i ,j) * x[j] * scalar;
3153 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3154 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3155 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3156 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3157 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3158 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3159 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- Groups of 4 rows (same scheme) ---
3163 for( ; (i+4UL) <= M; i+=4UL )
3173 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3174 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3179 for( ; j<jpos; j+=SIMDSIZE ) {
3181 xmm1 += A.load(i ,j) * x1;
3182 xmm2 += A.load(i+1UL,j) * x1;
3183 xmm3 += A.load(i+2UL,j) * x1;
3184 xmm4 += A.load(i+3UL,j) * x1;
3187 y[i ] =
sum( xmm1 ) * scalar;
3188 y[i+1UL] =
sum( xmm2 ) * scalar;
3189 y[i+2UL] =
sum( xmm3 ) * scalar;
3190 y[i+3UL] =
sum( xmm4 ) * scalar;
3192 for( ; remainder && j<jend; ++j ) {
3193 y[i ] += A(i ,j) * x[j] * scalar;
3194 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3195 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3196 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- Groups of 3 rows (same scheme) ---
3200 for( ; (i+3UL) <= M; i+=3UL )
3210 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3211 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3216 for( ; j<jpos; j+=SIMDSIZE ) {
3218 xmm1 += A.load(i ,j) * x1;
3219 xmm2 += A.load(i+1UL,j) * x1;
3220 xmm3 += A.load(i+2UL,j) * x1;
3223 y[i ] =
sum( xmm1 ) * scalar;
3224 y[i+1UL] =
sum( xmm2 ) * scalar;
3225 y[i+2UL] =
sum( xmm3 ) * scalar;
3227 for( ; remainder && j<jend; ++j ) {
3228 y[i ] += A(i ,j) * x[j] * scalar;
3229 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3230 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
// --- Groups of 2 rows (same scheme) ---
3234 for( ; (i+2UL) <= M; i+=2UL )
3244 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3245 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3250 for( ; j<jpos; j+=SIMDSIZE ) {
3252 xmm1 += A.load(i ,j) * x1;
3253 xmm2 += A.load(i+1UL,j) * x1;
3256 y[i ] =
sum( xmm1 ) * scalar;
3257 y[i+1UL] =
sum( xmm2 ) * scalar;
3259 for( ; remainder && j<jend; ++j ) {
3260 y[i ] += A(i ,j) * x[j] * scalar;
3261 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- Single row: x is loaded directly inside the loop instead of via x1 ---
// NOTE(review): the statement guarding this section is elided in this excerpt.
3275 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3276 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3281 for( ; j<jpos; j+=SIMDSIZE ) {
3282 xmm1 += A.load(i,j) * x.load(j);
3285 y[i] =
sum( xmm1 ) * scalar;
3287 for( ; remainder && j<jend; ++j ) {
3288 y[i] += A(i,j) * x[j] * scalar;
// Fallback overload of selectLargeAssignKernel: forwards all arguments unchanged to
// selectDefaultAssignKernel.
// NOTE(review): the condition selecting this overload is elided; presumably it is
// taken when the vectorized kernel cannot be used -- confirm.
3308 template<
typename VT1
3313 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3315 selectDefaultAssignKernel( y, A, x, scalar );
// Vectorized overload of selectLargeAssignKernel for the scaled product.
// Rows are processed in groups of 8, 4 and 2, with a final single-row section.
// Within each group the column range is traversed by three SIMD loops of decreasing
// width (4*SIMDSIZE, 2*SIMDSIZE, SIMDSIZE columns per iteration), each adding its
// partial sums directly into y, followed by an element-wise tail loop.
// NOTE(review): none of the visible accumulation statements applies `scalar`, and y is
// only ever updated with +=. Presumably y is reset beforehand and scaled afterwards in
// lines elided from this excerpt -- confirm against the full file. The loads of
// x1..x4 and the definitions of jend/remainder are elided as well.
3333 template<
typename VT1
3338 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3342 const size_t M( A.rows() );
3343 const size_t N( A.columns() );
// --- Groups of 8 rows ---
3349 for( ; (i+8UL) <= M; i+=8UL )
// jpos: end of the SIMD-processed column range. With `remainder` set it is jend rounded
// down to a multiple of SIMDSIZE (checked by the assertion below), otherwise jend itself.
3359 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3360 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD chunks per iteration; j1..j3 are the column offsets of chunks 2..4.
3364 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3365 const size_t j1( j+SIMDSIZE );
3366 const size_t j2( j+SIMDSIZE*2UL );
3367 const size_t j3( j+SIMDSIZE*3UL );
3372 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3373 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3374 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3375 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3376 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3377 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3378 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3379 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two SIMD chunks per iteration.
3382 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3383 const size_t j1( j+SIMDSIZE );
3386 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3387 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3388 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3389 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3390 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3391 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3392 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3393 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One SIMD chunk per iteration.
3396 for( ; j<jpos; j+=SIMDSIZE ) {
3398 y[i ] +=
sum( A.load(i ,j) * x1 );
3399 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3400 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3401 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3402 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
3403 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
3404 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
3405 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Element-wise tail for columns [j, jend); only runs when `remainder` is set.
3408 for( ; remainder && j<jend; ++j ) {
3409 y[i ] += A(i ,j) * x[j];
3410 y[i+1UL] += A(i+1UL,j) * x[j];
3411 y[i+2UL] += A(i+2UL,j) * x[j];
3412 y[i+3UL] += A(i+3UL,j) * x[j];
3413 y[i+4UL] += A(i+4UL,j) * x[j];
3414 y[i+5UL] += A(i+5UL,j) * x[j];
3415 y[i+6UL] += A(i+6UL,j) * x[j];
3416 y[i+7UL] += A(i+7UL,j) * x[j];
// --- Groups of 4 rows (same scheme) ---
3429 for( ; (i+4UL) <= M; i+=4UL )
3439 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3440 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3444 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3445 const size_t j1( j+SIMDSIZE );
3446 const size_t j2( j+SIMDSIZE*2UL );
3447 const size_t j3( j+SIMDSIZE*3UL );
3452 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3453 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3454 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3455 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3458 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3459 const size_t j1( j+SIMDSIZE );
3462 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3463 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3464 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3465 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3468 for( ; j<jpos; j+=SIMDSIZE ) {
3470 y[i ] +=
sum( A.load(i ,j) * x1 );
3471 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3472 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3473 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3476 for( ; remainder && j<jend; ++j ) {
3477 y[i ] += A(i ,j) * x[j];
3478 y[i+1UL] += A(i+1UL,j) * x[j];
3479 y[i+2UL] += A(i+2UL,j) * x[j];
3480 y[i+3UL] += A(i+3UL,j) * x[j];
// --- Groups of 2 rows (same scheme) ---
3489 for( ; (i+2UL) <= M; i+=2UL )
3499 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3500 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3504 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3505 const size_t j1( j+SIMDSIZE );
3506 const size_t j2( j+SIMDSIZE*2UL );
3507 const size_t j3( j+SIMDSIZE*3UL );
3512 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3513 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3516 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3517 const size_t j1( j+SIMDSIZE );
3520 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3521 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3524 for( ; j<jpos; j+=SIMDSIZE ) {
3526 y[i ] +=
sum( A.load(i ,j) * x1 );
3527 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3530 for( ; remainder && j<jend; ++j ) {
3531 y[i ] += A(i ,j) * x[j];
3532 y[i+1UL] += A(i+1UL,j) * x[j];
// --- Single row (same scheme) ---
// NOTE(review): the statement guarding this section is elided in this excerpt.
3549 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3550 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3554 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3555 const size_t j1( j+SIMDSIZE );
3556 const size_t j2( j+SIMDSIZE*2UL );
3557 const size_t j3( j+SIMDSIZE*3UL );
3562 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3565 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3566 const size_t j1( j+SIMDSIZE );
3569 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3572 for( ; j<jpos; j+=SIMDSIZE ) {
3574 y[i] +=
sum( A.load(i,j) * x1 );
3577 for( ; remainder && j<jend; ++j ) {
3578 y[i] += A(i,j) * x[j];
// Fallback overload of selectBlasAssignKernel: forwards all arguments unchanged to
// selectLargeAssignKernel.
// NOTE(review): the condition selecting this overload is elided; presumably it is
// taken when the BLAS kernel is not applicable -- confirm.
3600 template<
typename VT1
3605 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3607 selectLargeAssignKernel( y, A, x, scalar );
// BLAS-backed overload of selectBlasAssignKernel. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION evaluate to non-zero.
3612 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3626 template<
typename VT1
3631 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// One visible path first stores the scaled vector (scalar * x) in y.
// NOTE(review): the condition guarding this path and the statements following it are
// elided in this excerpt.
3636 assign( y, scalar * x );
// Other visible path: gemv called with the factors ET(scalar) and ET(0).
// NOTE(review): presumably this computes y = scalar*(A*x) + 0*y -- confirm against
// gemv's parameter order.
3640 gemv( y, A, x, ET(scalar), ET(0) );
// Assignment fragment that goes through a temporary: the visible statement assigns
// `tmp` to the target (~lhs).
// NOTE(review): the enclosing signature and the creation of tmp are not visible here.
3658 template<
typename VT1 >
3670 assign( ~lhs, tmp );
// Addition-assignment entry fragment for the scaled matrix/vector product.
// Visible steps: a guard for a left operand with zero rows or zero columns (its body is
// elided), followed by a dispatch to selectAddAssignKernel with the target (~lhs), A, x
// and the expression's scalar_.
// NOTE(review): how `left`, A and x are obtained is not visible in this excerpt.
3686 template<
typename VT1 >
3696 if( left.rows() == 0UL || left.columns() == 0UL ) {
3708 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatcher for the scaled addition assignment y += A * x * scalar.
// The (partially visible) condition includes A.rows() * A.columns() <
// DMATDVECMULT_THRESHOLD; when it holds the small kernel is called. The call to
// selectBlasAddAssignKernel follows, presumably as the else branch (the `else` itself
// is elided in this excerpt).
3723 template<
typename VT1
3727 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3731 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3732 selectSmallAddAssignKernel( y, A, x, scalar );
3734 selectBlasAddAssignKernel( y, A, x, scalar );
// Default (non-specialized) addition-assignment kernel: hands the expression
// A * x * scalar to y.addAssign().
3752 template<
typename VT1
3756 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3758 y.addAssign( A * x * scalar );
// Fallback overload of selectSmallAddAssignKernel: forwards all arguments unchanged to
// selectDefaultAddAssignKernel.
// NOTE(review): the condition selecting this overload is elided; presumably it is
// taken when the vectorized kernel cannot be used -- confirm.
3776 template<
typename VT1
3781 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3783 selectDefaultAddAssignKernel( y, A, x, scalar );
// Vectorized overload of selectSmallAddAssignKernel for the scaled product.
// For each output row the visible code accumulates A.load(i,j) * x-chunk products in
// SIMD accumulators, adds sum( accumulator ) * scalar to y[i], and then adds the
// leftover columns element by element. Rows are processed in groups of 8, 4, 3 and 2;
// the final section handles a single row.
// NOTE(review): many lines of this definition are elided in this excerpt (loop index
// declarations, the definitions of jend/remainder, the loads of x1, and the
// accumulator declarations of the smaller row groups).
3801 template<
typename VT1
3806 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3810 const size_t M( A.rows() );
3811 const size_t N( A.columns() );
// --- Groups of 8 rows ---
3815 for( ; (i+8UL) <= M; i+=8UL )
// jpos: end of the SIMD-processed column range. With `remainder` set it is jend rounded
// down to a multiple of SIMDSIZE (checked by the assertion below), otherwise jend itself.
3825 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3826 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One accumulator per row of the group.
// NOTE(review): the code relies on these starting at zero -- presumably SIMDType
// default-constructs to zero; confirm.
3828 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3831 for( ; j<jpos; j+=SIMDSIZE ) {
3833 xmm1 += A.load(i ,j) * x1;
3834 xmm2 += A.load(i+1UL,j) * x1;
3835 xmm3 += A.load(i+2UL,j) * x1;
3836 xmm4 += A.load(i+3UL,j) * x1;
3837 xmm5 += A.load(i+4UL,j) * x1;
3838 xmm6 += A.load(i+5UL,j) * x1;
3839 xmm7 += A.load(i+6UL,j) * x1;
3840 xmm8 += A.load(i+7UL,j) * x1;
// Horizontal reduction of each accumulator, scaled once and added to the existing y.
3843 y[i ] +=
sum( xmm1 ) * scalar;
3844 y[i+1UL] +=
sum( xmm2 ) * scalar;
3845 y[i+2UL] +=
sum( xmm3 ) * scalar;
3846 y[i+3UL] +=
sum( xmm4 ) * scalar;
3847 y[i+4UL] +=
sum( xmm5 ) * scalar;
3848 y[i+5UL] +=
sum( xmm6 ) * scalar;
3849 y[i+6UL] +=
sum( xmm7 ) * scalar;
3850 y[i+7UL] +=
sum( xmm8 ) * scalar;
// Element-wise tail for columns [j, jend); only runs when `remainder` is set.
3852 for( ; remainder && j<jend; ++j ) {
3853 y[i ] += A(i ,j) * x[j] * scalar;
3854 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3855 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3856 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3857 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3858 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3859 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3860 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- Groups of 4 rows (same scheme) ---
3864 for( ; (i+4UL) <= M; i+=4UL )
3874 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3875 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3880 for( ; j<jpos; j+=SIMDSIZE ) {
3882 xmm1 += A.load(i ,j) * x1;
3883 xmm2 += A.load(i+1UL,j) * x1;
3884 xmm3 += A.load(i+2UL,j) * x1;
3885 xmm4 += A.load(i+3UL,j) * x1;
3888 y[i ] +=
sum( xmm1 ) * scalar;
3889 y[i+1UL] +=
sum( xmm2 ) * scalar;
3890 y[i+2UL] +=
sum( xmm3 ) * scalar;
3891 y[i+3UL] +=
sum( xmm4 ) * scalar;
3893 for( ; remainder && j<jend; ++j ) {
3894 y[i ] += A(i ,j) * x[j] * scalar;
3895 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3896 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3897 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- Groups of 3 rows (same scheme) ---
3901 for( ; (i+3UL) <= M; i+=3UL )
3911 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3912 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3917 for( ; j<jpos; j+=SIMDSIZE ) {
3919 xmm1 += A.load(i ,j) * x1;
3920 xmm2 += A.load(i+1UL,j) * x1;
3921 xmm3 += A.load(i+2UL,j) * x1;
3924 y[i ] +=
sum( xmm1 ) * scalar;
3925 y[i+1UL] +=
sum( xmm2 ) * scalar;
3926 y[i+2UL] +=
sum( xmm3 ) * scalar;
3928 for( ; remainder && j<jend; ++j ) {
3929 y[i ] += A(i ,j) * x[j] * scalar;
3930 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3931 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
// --- Groups of 2 rows (same scheme) ---
3935 for( ; (i+2UL) <= M; i+=2UL )
3945 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3946 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3951 for( ; j<jpos; j+=SIMDSIZE ) {
3953 xmm1 += A.load(i ,j) * x1;
3954 xmm2 += A.load(i+1UL,j) * x1;
3957 y[i ] +=
sum( xmm1 ) * scalar;
3958 y[i+1UL] +=
sum( xmm2 ) * scalar;
3960 for( ; remainder && j<jend; ++j ) {
3961 y[i ] += A(i ,j) * x[j] * scalar;
3962 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- Single row: x is loaded directly inside the loop instead of via x1 ---
// NOTE(review): the statement guarding this section is elided in this excerpt.
3976 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3977 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3982 for( ; j<jpos; j+=SIMDSIZE ) {
3983 xmm1 += A.load(i,j) * x.load(j);
3986 y[i] +=
sum( xmm1 ) * scalar;
3988 for( ; remainder && j<jend; ++j ) {
3989 y[i] += A(i,j) * x[j] * scalar;
// Fallback overload of selectLargeAddAssignKernel: forwards all arguments unchanged to
// selectDefaultAddAssignKernel.
// NOTE(review): the condition selecting this overload is elided; presumably it is
// taken when the vectorized kernel cannot be used -- confirm.
4009 template<
typename VT1
4014 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4016 selectDefaultAddAssignKernel( y, A, x, scalar );
// Vectorized overload of selectLargeAddAssignKernel for the scaled product.
// Rows are processed in groups of 8, 4 and 2, with a final single-row section.
// Within each group the column range is traversed by three SIMD loops of decreasing
// width (4*SIMDSIZE, 2*SIMDSIZE, SIMDSIZE columns per iteration), followed by an
// element-wise tail loop. Every partial sum is multiplied by `scalar` before being
// added to y, so the existing content of y is never scaled.
// NOTE(review): the loads of x1..x4 and the definitions of jend/remainder are elided
// in this excerpt.
4034 template<
typename VT1
4039 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4043 const size_t M( A.rows() );
4044 const size_t N( A.columns() );
// --- Groups of 8 rows ---
4048 for( ; (i+8UL) <= M; i+=8UL )
// jpos: end of the SIMD-processed column range. With `remainder` set it is jend rounded
// down to a multiple of SIMDSIZE (checked by the assertion below), otherwise jend itself.
4058 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4059 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD chunks per iteration; j1..j3 are the column offsets of chunks 2..4.
4063 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4064 const size_t j1( j+SIMDSIZE );
4065 const size_t j2( j+SIMDSIZE*2UL );
4066 const size_t j3( j+SIMDSIZE*3UL );
4071 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4072 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4073 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4074 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4075 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4076 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4077 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4078 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
// Two SIMD chunks per iteration.
4081 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4082 const size_t j1( j+SIMDSIZE );
4085 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4086 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4087 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4088 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4089 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4090 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4091 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4092 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
// One SIMD chunk per iteration.
4095 for( ; j<jpos; j+=SIMDSIZE ) {
4097 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4098 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4099 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4100 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4101 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4102 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4103 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4104 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 ) * scalar;
// Element-wise tail for columns [j, jend); only runs when `remainder` is set.
4107 for( ; remainder && j<jend; ++j ) {
4108 y[i ] += A(i ,j) * x[j] * scalar;
4109 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4110 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4111 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4112 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4113 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4114 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4115 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- Groups of 4 rows (same scheme) ---
4119 for( ; (i+4UL) <= M; i+=4UL )
4129 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4130 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4134 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4135 const size_t j1( j+SIMDSIZE );
4136 const size_t j2( j+SIMDSIZE*2UL );
4137 const size_t j3( j+SIMDSIZE*3UL );
4142 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4143 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4144 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4145 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4148 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4149 const size_t j1( j+SIMDSIZE );
4152 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4153 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4154 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4155 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4158 for( ; j<jpos; j+=SIMDSIZE ) {
4160 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4161 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4162 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4163 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4166 for( ; remainder && j<jend; ++j ) {
4167 y[i ] += A(i ,j) * x[j] * scalar;
4168 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4169 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4170 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- Groups of 2 rows (same scheme) ---
4174 for( ; (i+2UL) <= M; i+=2UL )
4184 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4185 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4189 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4190 const size_t j1( j+SIMDSIZE );
4191 const size_t j2( j+SIMDSIZE*2UL );
4192 const size_t j3( j+SIMDSIZE*3UL );
4197 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4198 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4201 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4202 const size_t j1( j+SIMDSIZE );
4205 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4206 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4209 for( ; j<jpos; j+=SIMDSIZE ) {
4211 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4212 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4215 for( ; remainder && j<jend; ++j ) {
4216 y[i ] += A(i ,j) * x[j] * scalar;
4217 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- Single row (same scheme) ---
// NOTE(review): the statement guarding this section is elided in this excerpt.
4231 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4232 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4236 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4237 const size_t j1( j+SIMDSIZE );
4238 const size_t j2( j+SIMDSIZE*2UL );
4239 const size_t j3( j+SIMDSIZE*3UL );
4244 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4247 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4248 const size_t j1( j+SIMDSIZE );
4251 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4254 for( ; j<jpos; j+=SIMDSIZE ) {
4256 y[i] +=
sum( A.load(i,j) * x1 ) * scalar;
4259 for( ; remainder && j<jend; ++j ) {
4260 y[i] += A(i,j) * x[j] * scalar;
// Fallback overload of selectBlasAddAssignKernel: forwards all arguments unchanged to
// selectLargeAddAssignKernel.
// NOTE(review): the condition selecting this overload is elided; presumably it is
// taken when the BLAS kernel is not applicable -- confirm.
4280 template<
typename VT1
4285 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4287 selectLargeAddAssignKernel( y, A, x, scalar );
// BLAS-backed overload of selectBlasAddAssignKernel. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION evaluate to non-zero.
4292 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4306 template<
typename VT1
4311 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// One visible path adds a temporary `tmp` to y (construction of tmp and the condition
// guarding this path are not visible here).
4318 addAssign( y, tmp );
// Other visible path: gemv called with the factors ET(scalar) and ET(1).
// NOTE(review): presumably this computes y = scalar*(A*x) + 1*y -- confirm against
// gemv's parameter order.
4321 gemv( y, A, x, ET(scalar), ET(1) );
// Subtraction-assignment entry fragment for the scaled matrix/vector product.
// Visible steps: a guard for a left operand with zero rows or zero columns (its body is
// elided), followed by a dispatch to selectSubAssignKernel with the target (~lhs), A, x
// and the expression's scalar_.
// NOTE(review): how `left`, A and x are obtained is not visible in this excerpt.
4343 template<
typename VT1 >
4353 if( left.rows() == 0UL || left.columns() == 0UL ) {
4365 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatcher for the scaled subtraction assignment y -= A * x * scalar.
// The (partially visible) condition includes A.rows() * A.columns() <
// DMATDVECMULT_THRESHOLD; when it holds the small kernel is called. The call to
// selectBlasSubAssignKernel follows, presumably as the else branch (the `else` itself
// is elided in this excerpt).
4380 template<
typename VT1
4384 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4388 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4389 selectSmallSubAssignKernel( y, A, x, scalar );
4391 selectBlasSubAssignKernel( y, A, x, scalar );
// Default (non-specialized) subtraction-assignment kernel: hands the expression
// A * x * scalar to y.subAssign().
4409 template<
typename VT1
4413 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4415 y.subAssign( A * x * scalar );
// Fallback overload of selectSmallSubAssignKernel: forwards all arguments unchanged to
// selectDefaultSubAssignKernel.
// NOTE(review): the condition selecting this overload is elided; presumably it is
// taken when the vectorized kernel cannot be used -- confirm.
4433 template<
typename VT1
4438 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4440 selectDefaultSubAssignKernel( y, A, x, scalar );
// Vectorized overload of selectSmallSubAssignKernel for the scaled product.
// For each output row the visible code accumulates A.load(i,j) * x-chunk products in
// SIMD accumulators, subtracts sum( accumulator ) * scalar from y[i], and then
// subtracts the leftover columns element by element. Rows are processed in groups of
// 8, 4, 3 and 2; the final section handles a single row.
// NOTE(review): many lines of this definition are elided in this excerpt (loop index
// declarations, the definitions of jend/remainder, the loads of x1, and the
// accumulator declarations of the smaller row groups).
4458 template<
typename VT1
4463 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4467 const size_t M( A.rows() );
4468 const size_t N( A.columns() );
// --- Groups of 8 rows ---
4472 for( ; (i+8UL) <= M; i+=8UL )
// jpos: end of the SIMD-processed column range. With `remainder` set it is jend rounded
// down to a multiple of SIMDSIZE (checked by the assertion below), otherwise jend itself.
4482 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4483 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One accumulator per row of the group.
// NOTE(review): the code relies on these starting at zero -- presumably SIMDType
// default-constructs to zero; confirm.
4485 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4488 for( ; j<jpos; j+=SIMDSIZE ) {
4490 xmm1 += A.load(i ,j) * x1;
4491 xmm2 += A.load(i+1UL,j) * x1;
4492 xmm3 += A.load(i+2UL,j) * x1;
4493 xmm4 += A.load(i+3UL,j) * x1;
4494 xmm5 += A.load(i+4UL,j) * x1;
4495 xmm6 += A.load(i+5UL,j) * x1;
4496 xmm7 += A.load(i+6UL,j) * x1;
4497 xmm8 += A.load(i+7UL,j) * x1;
// Horizontal reduction of each accumulator, scaled once and subtracted from y.
4500 y[i ] -=
sum( xmm1 ) * scalar;
4501 y[i+1UL] -=
sum( xmm2 ) * scalar;
4502 y[i+2UL] -=
sum( xmm3 ) * scalar;
4503 y[i+3UL] -=
sum( xmm4 ) * scalar;
4504 y[i+4UL] -=
sum( xmm5 ) * scalar;
4505 y[i+5UL] -=
sum( xmm6 ) * scalar;
4506 y[i+6UL] -=
sum( xmm7 ) * scalar;
4507 y[i+7UL] -=
sum( xmm8 ) * scalar;
// Element-wise tail for columns [j, jend); only runs when `remainder` is set.
4509 for( ; remainder && j<jend; ++j ) {
4510 y[i ] -= A(i ,j) * x[j] * scalar;
4511 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4512 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4513 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4514 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4515 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4516 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4517 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
// --- Groups of 4 rows (same scheme) ---
4521 for( ; (i+4UL) <= M; i+=4UL )
4531 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4532 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4537 for( ; j<jpos; j+=SIMDSIZE ) {
4539 xmm1 += A.load(i ,j) * x1;
4540 xmm2 += A.load(i+1UL,j) * x1;
4541 xmm3 += A.load(i+2UL,j) * x1;
4542 xmm4 += A.load(i+3UL,j) * x1;
4545 y[i ] -=
sum( xmm1 ) * scalar;
4546 y[i+1UL] -=
sum( xmm2 ) * scalar;
4547 y[i+2UL] -=
sum( xmm3 ) * scalar;
4548 y[i+3UL] -=
sum( xmm4 ) * scalar;
4550 for( ; remainder && j<jend; ++j ) {
4551 y[i ] -= A(i ,j) * x[j] * scalar;
4552 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4553 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4554 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
// --- Groups of 3 rows (same scheme) ---
4558 for( ; (i+3UL) <= M; i+=3UL )
4568 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4569 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4574 for( ; j<jpos; j+=SIMDSIZE ) {
4576 xmm1 += A.load(i ,j) * x1;
4577 xmm2 += A.load(i+1UL,j) * x1;
4578 xmm3 += A.load(i+2UL,j) * x1;
4581 y[i ] -=
sum( xmm1 ) * scalar;
4582 y[i+1UL] -=
sum( xmm2 ) * scalar;
4583 y[i+2UL] -=
sum( xmm3 ) * scalar;
4585 for( ; remainder && j<jend; ++j ) {
4586 y[i ] -= A(i ,j) * x[j] * scalar;
4587 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4588 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
// --- Groups of 2 rows (same scheme) ---
4592 for( ; (i+2UL) <= M; i+=2UL )
4602 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4603 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4608 for( ; j<jpos; j+=SIMDSIZE ) {
4610 xmm1 += A.load(i ,j) * x1;
4611 xmm2 += A.load(i+1UL,j) * x1;
4614 y[i ] -=
sum( xmm1 ) * scalar;
4615 y[i+1UL] -=
sum( xmm2 ) * scalar;
4617 for( ; remainder && j<jend; ++j ) {
4618 y[i ] -= A(i ,j) * x[j] * scalar;
4619 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
// --- Single row: x is loaded directly inside the loop instead of via x1 ---
// NOTE(review): the statement guarding this section is elided in this excerpt.
4633 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4634 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4639 for( ; j<jpos; j+=SIMDSIZE ) {
4640 xmm1 += A.load(i,j) * x.load(j);
4643 y[i] -=
sum( xmm1 ) * scalar;
4645 for( ; remainder && j<jend; ++j ) {
4646 y[i] -= A(i,j) * x[j] * scalar;
// Fallback overload of selectLargeSubAssignKernel: forwards all arguments unchanged to
// selectDefaultSubAssignKernel.
// NOTE(review): the condition selecting this overload is elided; presumably it is
// taken when the vectorized kernel cannot be used -- confirm.
4666 template<
typename VT1
4671 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4673 selectDefaultSubAssignKernel( y, A, x, scalar );
4691 template<
typename VT1
4696 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4700 const size_t M( A.rows() );
4701 const size_t N( A.columns() );
4705 for( ; (i+8UL) <= M; i+=8UL )
4715 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4716 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4720 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4721 const size_t j1( j+SIMDSIZE );
4722 const size_t j2( j+SIMDSIZE*2UL );
4723 const size_t j3( j+SIMDSIZE*3UL );
4728 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4729 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4730 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4731 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4732 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4733 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4734 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4735 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4738 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4739 const size_t j1( j+SIMDSIZE );
4742 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4743 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4744 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4745 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4746 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4747 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4748 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4749 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4752 for( ; j<jpos; j+=SIMDSIZE ) {
4754 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4755 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4756 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4757 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4758 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4759 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4760 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4761 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 ) * scalar;
4764 for( ; remainder && j<jend; ++j ) {
4765 y[i ] -= A(i ,j) * x[j] * scalar;
4766 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4767 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4768 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4769 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4770 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4771 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4772 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4776 for( ; (i+4UL) <= M; i+=4UL )
4786 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4787 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4791 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4792 const size_t j1( j+SIMDSIZE );
4793 const size_t j2( j+SIMDSIZE*2UL );
4794 const size_t j3( j+SIMDSIZE*3UL );
4799 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4800 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4801 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4802 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4805 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4806 const size_t j1( j+SIMDSIZE );
4809 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4810 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4811 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4812 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4815 for( ; j<jpos; j+=SIMDSIZE ) {
4817 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4818 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4819 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4820 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4823 for( ; remainder && j<jend; ++j ) {
4824 y[i ] -= A(i ,j) * x[j] * scalar;
4825 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4826 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4827 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4831 for( ; (i+2UL) <= M; i+=2UL )
4841 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4842 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4846 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4847 const size_t j1( j+SIMDSIZE );
4848 const size_t j2( j+SIMDSIZE*2UL );
4849 const size_t j3( j+SIMDSIZE*3UL );
4854 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4855 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4858 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4859 const size_t j1( j+SIMDSIZE );
4862 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4863 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4866 for( ; j<jpos; j+=SIMDSIZE ) {
4868 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4869 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4872 for( ; remainder && j<jend; ++j ) {
4873 y[i ] -= A(i ,j) * x[j] * scalar;
4874 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4888 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4889 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4893 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4894 const size_t j1( j+SIMDSIZE );
4895 const size_t j2( j+SIMDSIZE*2UL );
4896 const size_t j3( j+SIMDSIZE*3UL );
4901 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4904 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4905 const size_t j1( j+SIMDSIZE );
4908 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4911 for( ; j<jpos; j+=SIMDSIZE ) {
4913 y[i] -=
sum( A.load(i,j) * x1 ) * scalar;
4916 for( ; remainder && j<jend; ++j ) {
4917 y[i] -= A(i,j) * x[j] * scalar;
4937 template<
typename VT1
4942 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4944 selectLargeSubAssignKernel( y, A, x, scalar );
4949 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4963 template<
typename VT1
4968 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4975 subAssign( y, tmp );
4978 gemv( y, A, x, ET(-scalar), ET(1) );
5000 template<
typename VT1 >
5012 multAssign( ~lhs, tmp );
5032 template<
typename VT1 >
5044 divAssign( ~lhs, tmp );
5066 template<
typename VT1 >
5077 if( left.rows() == 0UL ) {
5080 else if( left.columns() == 0UL ) {
5111 template<
typename VT1 >
5142 template<
typename VT1 >
5153 if( left.rows() == 0UL || left.columns() == 0UL ) {
5187 template<
typename VT1 >
5198 if( left.rows() == 0UL || left.columns() == 0UL ) {
5232 template<
typename VT1 >
5267 template<
typename VT1 >
5341 template<
typename MT
5343 inline decltype(
auto)
5382 template<
typename MT
5384 inline decltype(
auto)
5389 return (~mat).leftOperand() * ( (~mat).
rightOperand() * vec );
5405 template<
typename MT,
typename VT >
5406 struct Size< DMatDVecMultExpr<MT,VT> >
5423 template<
typename MT,
typename VT >
5424 struct IsAligned< DMatDVecMultExpr<MT,VT> >
5425 :
public BoolConstant< And< IsAligned<MT>, IsAligned<VT> >::value >
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
If_< IsExpression< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:212
Header file for the Rows type trait.
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:221
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:261
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:218
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:129
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Expression object for dense matrix-dense vector multiplications. The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:121
Generic wrapper for a compile time constant integral value. The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
If_< IsExpression< VT >, const VT, const VT &> RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:215
Header file for the IsSame and IsStrictlySame type traits.
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:127
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:172
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDVecMultExpr.h:207
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:132
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template. The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:521
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:338
System settings for performance optimizations.
Compile time check for data types. This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:350
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:204
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:370
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data. This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Row< MT > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:124
Compile time check for the alignment of data types. This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:206
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:130
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for data types with padding. This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:306
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:340
Base class for all matrix/vector multiplication expression templates. The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:67
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type. In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:88
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:109
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:326
decltype(auto) operator*(const DenseMatrix< MT1, false > &lhs, const DenseMatrix< MT2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A=B*C$ ).
Definition: DMatDMatMultExpr.h:8893
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:382
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:209
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions. The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:593
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates. The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
BLAZE_ALWAYS_INLINE ValueType_< T > sum(const SIMDi8< T > &a) noexcept
Returns the sum of all elements in the 8-bit integral SIMD vector.
Definition: Reduction.h:65
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:360
Header file for run time assertion macros.
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:131
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:383
DMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:247
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:154
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:128
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time check for built-in data types. This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template. The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:205
Compile time check for complex types. This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications. The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:106
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of a vector. The Size type trait evaluates the size of the given v...
Definition: Size.h:74
Base class for sparse vectors. The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of rows of a matrix. The Rows type trait evaluates the number of...
Definition: Rows.h:75
Header file for the IsComplex type trait.
Constraint on the data type.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions. The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:316
Header file for the MatVecMultExpr base class.
Constraint on the data type.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:293
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:208
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the function trace functionality.