35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_ 120 template<
typename MT
123 :
public MatVecMultExpr< DenseVector< DMatDVecMultExpr<MT,VT>, false > >
// Compile-time helper: `value` is true when at least one of the enclosing
// expression's flags evaluateMatrix / evaluateVector is true.
// The template parameter T1 does not appear in the visible expression.
153 template<
typename T1 >
154 struct UseSMPAssign {
155 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
165 template<
typename T1,
typename T2,
typename T3 >
166 struct UseBlasKernel {
172 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
188 template<
typename T1,
typename T2,
typename T3 >
189 struct UseVectorizedDefaultKernel {
190 enum :
bool { value = useOptimizedKernels &&
192 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
228 MT::simdEnabled && VT::simdEnabled &&
// Compile-time flag: smpAssignable is true only when evaluateMatrix and
// evaluateVector are both false AND both operand types (MT and VT) report
// smpAssignable themselves.
233 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
234 !evaluateVector && VT::smpAssignable };
267 return mat_(index,index) *
vec_[index];
278 const size_t n (
mat_.columns() -
begin );
297 if( index >=
mat_.rows() ) {
300 return (*
this)[index];
309 inline size_t size() const noexcept {
// Reports whether this expression can alias with the object at address `alias`.
// Returns true if either operand (mat_ or vec_) reports isAliased( alias ).
// Note that the visible body queries isAliased() on the operands, not canAlias().
// Declared noexcept.
340 template<
typename T >
341 inline bool canAlias(
const T* alias )
const noexcept {
342 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Reports whether this expression is aliased with the object at address `alias`.
// Returns true if either operand (mat_ or vec_) reports isAliased( alias ).
// Declared noexcept.
352 template<
typename T >
353 inline bool isAliased(
const T* alias )
const noexcept {
354 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
364 return mat_.isAligned() &&
vec_.isAligned();
378 (
mat_.rows() *
mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
379 (
size() > SMP_DMATDVECMULT_THRESHOLD );
402 template<
typename VT1 >
409 if( rhs.
mat_.rows() == 0UL ) {
412 else if( rhs.
mat_.columns() == 0UL ) {
425 DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
// Kernel dispatch for the assign path: given target y, matrix A and vector x,
// calls either selectSmallAssignKernel or selectBlasAssignKernel.
// The visible part of the condition compares the element count
// A.rows() * A.columns() against DMATDVECMULT_THRESHOLD.
// NOTE(review): the leading part of the condition and the branch keywords are
// not visible in this excerpt — confirm the full predicate against the header.
441 template<
typename VT1
444 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
448 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
449 selectSmallAssignKernel( y, A, x );
451 selectBlasAssignKernel( y, A, x );
470 template<
typename VT1
473 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Overload of selectSmallAssignKernel whose visible body forwards y, A and x
// unchanged to selectDefaultAssignKernel.
// NOTE(review): the return-type / enabling-condition line is not visible here;
// presumably this is the fallback used when the vectorized kernel does not apply — confirm.
494 template<
typename VT1
498 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
500 selectDefaultAssignKernel( y, A, x );
519 template<
typename VT1
523 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
527 const size_t M( A.rows() );
528 const size_t N( A.columns() );
532 for( ; (i+8UL) <= M; i+=8UL )
542 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
543 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
545 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
548 for( ; j<jpos; j+=SIMDSIZE ) {
550 xmm1 += A.load(i ,j) * x1;
551 xmm2 += A.load(i+1UL,j) * x1;
552 xmm3 += A.load(i+2UL,j) * x1;
553 xmm4 += A.load(i+3UL,j) * x1;
554 xmm5 += A.load(i+4UL,j) * x1;
555 xmm6 += A.load(i+5UL,j) * x1;
556 xmm7 += A.load(i+6UL,j) * x1;
557 xmm8 += A.load(i+7UL,j) * x1;
561 y[i+1UL] =
sum( xmm2 );
562 y[i+2UL] =
sum( xmm3 );
563 y[i+3UL] =
sum( xmm4 );
564 y[i+4UL] =
sum( xmm5 );
565 y[i+5UL] =
sum( xmm6 );
566 y[i+6UL] =
sum( xmm7 );
567 y[i+7UL] =
sum( xmm8 );
569 for( ; remainder && j<jend; ++j ) {
570 y[i ] += A(i ,j) * x[j];
571 y[i+1UL] += A(i+1UL,j) * x[j];
572 y[i+2UL] += A(i+2UL,j) * x[j];
573 y[i+3UL] += A(i+3UL,j) * x[j];
574 y[i+4UL] += A(i+4UL,j) * x[j];
575 y[i+5UL] += A(i+5UL,j) * x[j];
576 y[i+6UL] += A(i+6UL,j) * x[j];
577 y[i+7UL] += A(i+7UL,j) * x[j];
581 for( ; (i+4UL) <= M; i+=4UL )
591 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
592 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
597 for( ; j<jpos; j+=SIMDSIZE ) {
599 xmm1 += A.load(i ,j) * x1;
600 xmm2 += A.load(i+1UL,j) * x1;
601 xmm3 += A.load(i+2UL,j) * x1;
602 xmm4 += A.load(i+3UL,j) * x1;
606 y[i+1UL] =
sum( xmm2 );
607 y[i+2UL] =
sum( xmm3 );
608 y[i+3UL] =
sum( xmm4 );
610 for( ; remainder && j<jend; ++j ) {
611 y[i ] += A(i ,j) * x[j];
612 y[i+1UL] += A(i+1UL,j) * x[j];
613 y[i+2UL] += A(i+2UL,j) * x[j];
614 y[i+3UL] += A(i+3UL,j) * x[j];
618 for( ; (i+3UL) <= M; i+=3UL )
628 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
629 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
634 for( ; j<jpos; j+=SIMDSIZE ) {
636 xmm1 += A.load(i ,j) * x1;
637 xmm2 += A.load(i+1UL,j) * x1;
638 xmm3 += A.load(i+2UL,j) * x1;
642 y[i+1UL] =
sum( xmm2 );
643 y[i+2UL] =
sum( xmm3 );
645 for( ; remainder && j<jend; ++j ) {
646 y[i ] += A(i ,j) * x[j];
647 y[i+1UL] += A(i+1UL,j) * x[j];
648 y[i+2UL] += A(i+2UL,j) * x[j];
652 for( ; (i+2UL) <= M; i+=2UL )
662 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
663 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
668 for( ; j<jpos; j+=SIMDSIZE ) {
670 xmm1 += A.load(i ,j) * x1;
671 xmm2 += A.load(i+1UL,j) * x1;
675 y[i+1UL] =
sum( xmm2 );
677 for( ; remainder && j<jend; ++j ) {
678 y[i ] += A(i ,j) * x[j];
679 y[i+1UL] += A(i+1UL,j) * x[j];
693 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
694 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
699 for( ; j<jpos; j+=SIMDSIZE ) {
700 xmm1 += A.load(i,j) * x.load(j);
705 for( ; remainder && j<jend; ++j ) {
706 y[i] += A(i,j) * x[j];
// Overload of selectLargeAssignKernel whose visible body forwards y, A and x
// unchanged to selectDefaultAssignKernel.
// NOTE(review): the return-type / enabling-condition line is not visible here;
// presumably this is the fallback used when the vectorized kernel does not apply — confirm.
727 template<
typename VT1
731 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
733 selectDefaultAssignKernel( y, A, x );
752 template<
typename VT1
756 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
760 const size_t M( A.rows() );
761 const size_t N( A.columns() );
767 for( ; (i+8UL) <= M; i+=8UL )
777 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
778 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
782 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
783 const size_t j1( j+SIMDSIZE );
784 const size_t j2( j+SIMDSIZE*2UL );
785 const size_t j3( j+SIMDSIZE*3UL );
790 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
791 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
792 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
793 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
794 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
795 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
796 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
797 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
800 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
801 const size_t j1( j+SIMDSIZE );
804 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
805 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
806 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
807 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
808 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
809 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
810 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
811 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
814 for( ; j<jpos; j+=SIMDSIZE ) {
816 y[i ] +=
sum( A.load(i ,j) * x1 );
817 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
818 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
819 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
820 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
821 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
822 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
823 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
826 for( ; remainder && j<jend; ++j ) {
827 y[i ] += A(i ,j) * x[j];
828 y[i+1UL] += A(i+1UL,j) * x[j];
829 y[i+2UL] += A(i+2UL,j) * x[j];
830 y[i+3UL] += A(i+3UL,j) * x[j];
831 y[i+4UL] += A(i+4UL,j) * x[j];
832 y[i+5UL] += A(i+5UL,j) * x[j];
833 y[i+6UL] += A(i+6UL,j) * x[j];
834 y[i+7UL] += A(i+7UL,j) * x[j];
838 for( ; (i+4UL) <= M; i+=4UL )
848 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
849 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
853 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
854 const size_t j1( j+SIMDSIZE );
855 const size_t j2( j+SIMDSIZE*2UL );
856 const size_t j3( j+SIMDSIZE*3UL );
861 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
862 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
863 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
864 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
867 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
868 const size_t j1( j+SIMDSIZE );
871 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
872 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
873 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
874 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
877 for( ; j<jpos; j+=SIMDSIZE ) {
879 y[i ] +=
sum( A.load(i ,j) * x1 );
880 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
881 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
882 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
885 for( ; remainder && j<jend; ++j ) {
886 y[i ] += A(i ,j) * x[j];
887 y[i+1UL] += A(i+1UL,j) * x[j];
888 y[i+2UL] += A(i+2UL,j) * x[j];
889 y[i+3UL] += A(i+3UL,j) * x[j];
893 for( ; (i+2UL) <= M; i+=2UL )
903 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
904 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
908 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
909 const size_t j1( j+SIMDSIZE );
910 const size_t j2( j+SIMDSIZE*2UL );
911 const size_t j3( j+SIMDSIZE*3UL );
916 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
917 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
920 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
921 const size_t j1( j+SIMDSIZE );
924 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
925 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
928 for( ; j<jpos; j+=SIMDSIZE ) {
930 y[i ] +=
sum( A.load(i ,j) * x1 );
931 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
934 for( ; remainder && j<jend; ++j ) {
935 y[i ] += A(i ,j) * x[j];
936 y[i+1UL] += A(i+1UL,j) * x[j];
950 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
951 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
955 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
956 const size_t j1( j+SIMDSIZE );
957 const size_t j2( j+SIMDSIZE*2UL );
958 const size_t j3( j+SIMDSIZE*3UL );
963 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
966 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
967 const size_t j1( j+SIMDSIZE );
970 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
973 for( ; j<jpos; j+=SIMDSIZE ) {
975 y[i] +=
sum( A.load(i,j) * x1 );
978 for( ; remainder && j<jend; ++j ) {
979 y[i] += A(i,j) * x[j];
// Overload of selectBlasAssignKernel whose visible body forwards y, A and x
// unchanged to selectLargeAssignKernel.
// NOTE(review): the enabling condition is not visible here; presumably this
// overload is chosen when the BLAS kernel cannot be used — confirm.
1000 template<
typename VT1
1004 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1006 selectLargeAssignKernel( y, A, x );
// BLAS-based overload of selectBlasAssignKernel; only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are non-zero.
// The visible call passes y, A, x and the scalars ET(1) and ET(0) to gemv.
// NOTE(review): presumably the scalars are the alpha/beta factors, i.e.
// y = 1*A*x + 0*y — confirm against the gemv wrapper's signature.
1012 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1026 template<
typename VT1
1030 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1039 gemv( y, A, x, ET(1), ET(0) );
1059 template<
typename VT1 >
1071 assign( ~lhs, tmp );
1089 template<
typename VT1 >
1096 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
1108 DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
// Kernel dispatch for the addition-assign path: given target y, matrix A and
// vector x, calls either selectSmallAddAssignKernel or selectBlasAddAssignKernel.
// The visible part of the condition compares the element count
// A.rows() * A.columns() against DMATDVECMULT_THRESHOLD.
// NOTE(review): the leading part of the condition and the branch keywords are
// not visible in this excerpt — confirm the full predicate against the header.
1124 template<
typename VT1
1127 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1131 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1132 selectSmallAddAssignKernel( y, A, x );
1134 selectBlasAddAssignKernel( y, A, x );
// Default addition-assign kernel: builds the product expression A * x and
// hands it to the target vector via y.addAssign().
1153 template<
typename VT1
1156 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1158 y.addAssign( A * x );
// Overload of selectSmallAddAssignKernel whose visible body forwards y, A and x
// unchanged to selectDefaultAddAssignKernel.
// NOTE(review): the return-type / enabling-condition line is not visible here;
// presumably this is the fallback used when the vectorized kernel does not apply — confirm.
1177 template<
typename VT1
1181 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1183 selectDefaultAddAssignKernel( y, A, x );
1202 template<
typename VT1
1206 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1210 const size_t M( A.rows() );
1211 const size_t N( A.columns() );
1215 for( ; (i+8UL) <= M; i+=8UL )
1225 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1226 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1228 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1231 for( ; j<jpos; j+=SIMDSIZE ) {
1233 xmm1 += A.load(i ,j) * x1;
1234 xmm2 += A.load(i+1UL,j) * x1;
1235 xmm3 += A.load(i+2UL,j) * x1;
1236 xmm4 += A.load(i+3UL,j) * x1;
1237 xmm5 += A.load(i+4UL,j) * x1;
1238 xmm6 += A.load(i+5UL,j) * x1;
1239 xmm7 += A.load(i+6UL,j) * x1;
1240 xmm8 += A.load(i+7UL,j) * x1;
1243 y[i ] +=
sum( xmm1 );
1244 y[i+1UL] +=
sum( xmm2 );
1245 y[i+2UL] +=
sum( xmm3 );
1246 y[i+3UL] +=
sum( xmm4 );
1247 y[i+4UL] +=
sum( xmm5 );
1248 y[i+5UL] +=
sum( xmm6 );
1249 y[i+6UL] +=
sum( xmm7 );
1250 y[i+7UL] +=
sum( xmm8 );
1252 for( ; remainder && j<jend; ++j ) {
1253 y[i ] += A(i ,j) * x[j];
1254 y[i+1UL] += A(i+1UL,j) * x[j];
1255 y[i+2UL] += A(i+2UL,j) * x[j];
1256 y[i+3UL] += A(i+3UL,j) * x[j];
1257 y[i+4UL] += A(i+4UL,j) * x[j];
1258 y[i+5UL] += A(i+5UL,j) * x[j];
1259 y[i+6UL] += A(i+6UL,j) * x[j];
1260 y[i+7UL] += A(i+7UL,j) * x[j];
1264 for( ; (i+4UL) <= M; i+=4UL )
1274 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1275 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1280 for( ; j<jpos; j+=SIMDSIZE ) {
1282 xmm1 += A.load(i ,j) * x1;
1283 xmm2 += A.load(i+1UL,j) * x1;
1284 xmm3 += A.load(i+2UL,j) * x1;
1285 xmm4 += A.load(i+3UL,j) * x1;
1288 y[i ] +=
sum( xmm1 );
1289 y[i+1UL] +=
sum( xmm2 );
1290 y[i+2UL] +=
sum( xmm3 );
1291 y[i+3UL] +=
sum( xmm4 );
1293 for( ; remainder && j<jend; ++j ) {
1294 y[i ] += A(i ,j) * x[j];
1295 y[i+1UL] += A(i+1UL,j) * x[j];
1296 y[i+2UL] += A(i+2UL,j) * x[j];
1297 y[i+3UL] += A(i+3UL,j) * x[j];
1301 for( ; (i+3UL) <= M; i+=3UL )
1311 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1312 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1317 for( ; j<jpos; j+=SIMDSIZE ) {
1319 xmm1 += A.load(i ,j) * x1;
1320 xmm2 += A.load(i+1UL,j) * x1;
1321 xmm3 += A.load(i+2UL,j) * x1;
1324 y[i ] +=
sum( xmm1 );
1325 y[i+1UL] +=
sum( xmm2 );
1326 y[i+2UL] +=
sum( xmm3 );
1328 for( ; remainder && j<jend; ++j ) {
1329 y[i ] += A(i ,j) * x[j];
1330 y[i+1UL] += A(i+1UL,j) * x[j];
1331 y[i+2UL] += A(i+2UL,j) * x[j];
1335 for( ; (i+2UL) <= M; i+=2UL )
1345 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1346 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1351 for( ; j<jpos; j+=SIMDSIZE ) {
1353 xmm1 += A.load(i ,j) * x1;
1354 xmm2 += A.load(i+1UL,j) * x1;
1357 y[i ] +=
sum( xmm1 );
1358 y[i+1UL] +=
sum( xmm2 );
1360 for( ; remainder && j<jend; ++j ) {
1361 y[i ] += A(i ,j) * x[j];
1362 y[i+1UL] += A(i+1UL,j) * x[j];
1376 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1377 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1382 for( ; j<jpos; j+=SIMDSIZE ) {
1383 xmm1 += A.load(i,j) * x.load(j);
1386 y[i] +=
sum( xmm1 );
1388 for( ; remainder && j<jend; ++j ) {
1389 y[i] += A(i,j) * x[j];
// Overload of selectLargeAddAssignKernel whose visible body forwards y, A and x
// unchanged to selectDefaultAddAssignKernel.
// NOTE(review): the return-type / enabling-condition line is not visible here;
// presumably this is the fallback used when the vectorized kernel does not apply — confirm.
1410 template<
typename VT1
1414 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1416 selectDefaultAddAssignKernel( y, A, x );
1435 template<
typename VT1
1439 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1443 const size_t M( A.rows() );
1444 const size_t N( A.columns() );
1448 for( ; (i+8UL) <= M; i+=8UL )
1458 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1459 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1463 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1464 const size_t j1( j+SIMDSIZE );
1465 const size_t j2( j+SIMDSIZE*2UL );
1466 const size_t j3( j+SIMDSIZE*3UL );
1471 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1472 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1473 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1474 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1475 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1476 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1477 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1478 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
1481 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1482 const size_t j1( j+SIMDSIZE );
1485 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1486 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1487 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1488 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1489 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1490 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1491 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1492 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
1495 for( ; j<jpos; j+=SIMDSIZE ) {
1497 y[i ] +=
sum( A.load(i ,j) * x1 );
1498 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1499 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1500 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1501 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
1502 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
1503 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
1504 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
1507 for( ; remainder && j<jend; ++j ) {
1508 y[i ] += A(i ,j) * x[j];
1509 y[i+1UL] += A(i+1UL,j) * x[j];
1510 y[i+2UL] += A(i+2UL,j) * x[j];
1511 y[i+3UL] += A(i+3UL,j) * x[j];
1512 y[i+4UL] += A(i+4UL,j) * x[j];
1513 y[i+5UL] += A(i+5UL,j) * x[j];
1514 y[i+6UL] += A(i+6UL,j) * x[j];
1515 y[i+7UL] += A(i+7UL,j) * x[j];
1519 for( ; (i+4UL) <= M; i+=4UL )
1529 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1530 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1534 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1535 const size_t j1( j+SIMDSIZE );
1536 const size_t j2( j+SIMDSIZE*2UL );
1537 const size_t j3( j+SIMDSIZE*3UL );
1542 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1543 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1544 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1545 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1548 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1549 const size_t j1( j+SIMDSIZE );
1552 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1553 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1554 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1555 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1558 for( ; j<jpos; j+=SIMDSIZE ) {
1560 y[i ] +=
sum( A.load(i ,j) * x1 );
1561 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1562 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1563 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1566 for( ; remainder && j<jend; ++j ) {
1567 y[i ] += A(i ,j) * x[j];
1568 y[i+1UL] += A(i+1UL,j) * x[j];
1569 y[i+2UL] += A(i+2UL,j) * x[j];
1570 y[i+3UL] += A(i+3UL,j) * x[j];
1574 for( ; (i+2UL) <= M; i+=2UL )
1584 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1585 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1589 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1590 const size_t j1( j+SIMDSIZE );
1591 const size_t j2( j+SIMDSIZE*2UL );
1592 const size_t j3( j+SIMDSIZE*3UL );
1597 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1598 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1601 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1602 const size_t j1( j+SIMDSIZE );
1605 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1606 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1609 for( ; j<jpos; j+=SIMDSIZE ) {
1611 y[i ] +=
sum( A.load(i ,j) * x1 );
1612 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1615 for( ; remainder && j<jend; ++j ) {
1616 y[i ] += A(i ,j) * x[j];
1617 y[i+1UL] += A(i+1UL,j) * x[j];
1631 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1632 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1636 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1637 const size_t j1( j+SIMDSIZE );
1638 const size_t j2( j+SIMDSIZE*2UL );
1639 const size_t j3( j+SIMDSIZE*3UL );
1644 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1647 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1648 const size_t j1( j+SIMDSIZE );
1651 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1654 for( ; j<jpos; j+=SIMDSIZE ) {
1656 y[i] +=
sum( A.load(i,j) * x1 );
1659 for( ; remainder && j<jend; ++j ) {
1660 y[i] += A(i,j) * x[j];
// Overload of selectBlasAddAssignKernel whose visible body forwards y, A and x
// unchanged to selectLargeAddAssignKernel.
// NOTE(review): the enabling condition is not visible here; presumably this
// overload is chosen when the BLAS kernel cannot be used — confirm.
1681 template<
typename VT1
1685 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1687 selectLargeAddAssignKernel( y, A, x );
// BLAS-based overload of selectBlasAddAssignKernel; only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are non-zero.
// Two calls are visible: addAssign( y, tmp ) and gemv with scalars ET(1), ET(1).
// NOTE(review): the condition choosing between them and the definition of
// `tmp` are not visible in this excerpt. Presumably the gemv scalars are
// alpha/beta, i.e. y = 1*A*x + 1*y — confirm against the gemv wrapper.
1693 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1707 template<
typename VT1
1711 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1718 addAssign( y, tmp );
1721 gemv( y, A, x, ET(1), ET(1) );
1745 template<
typename VT1 >
1752 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
1764 DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
// Kernel dispatch for the subtraction-assign path: given target y, matrix A and
// vector x, calls either selectSmallSubAssignKernel or selectBlasSubAssignKernel.
// The visible part of the condition compares the element count
// A.rows() * A.columns() against DMATDVECMULT_THRESHOLD.
// NOTE(review): the leading part of the condition and the branch keywords are
// not visible in this excerpt — confirm the full predicate against the header.
1780 template<
typename VT1
1783 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1787 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1788 selectSmallSubAssignKernel( y, A, x );
1790 selectBlasSubAssignKernel( y, A, x );
// Default subtraction-assign kernel: builds the product expression A * x and
// hands it to the target vector via y.subAssign().
1809 template<
typename VT1
1812 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1814 y.subAssign( A * x );
// Overload of selectSmallSubAssignKernel whose visible body forwards y, A and x
// unchanged to selectDefaultSubAssignKernel.
// NOTE(review): the return-type / enabling-condition line is not visible here;
// presumably this is the fallback used when the vectorized kernel does not apply — confirm.
1833 template<
typename VT1
1837 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1839 selectDefaultSubAssignKernel( y, A, x );
1858 template<
typename VT1
1862 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1866 const size_t M( A.rows() );
1867 const size_t N( A.columns() );
1871 for( ; (i+8UL) <= M; i+=8UL )
1881 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1882 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1884 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1887 for( ; j<jpos; j+=SIMDSIZE ) {
1889 xmm1 += A.load(i ,j) * x1;
1890 xmm2 += A.load(i+1UL,j) * x1;
1891 xmm3 += A.load(i+2UL,j) * x1;
1892 xmm4 += A.load(i+3UL,j) * x1;
1893 xmm5 += A.load(i+4UL,j) * x1;
1894 xmm6 += A.load(i+5UL,j) * x1;
1895 xmm7 += A.load(i+6UL,j) * x1;
1896 xmm8 += A.load(i+7UL,j) * x1;
1899 y[i ] -=
sum( xmm1 );
1900 y[i+1UL] -=
sum( xmm2 );
1901 y[i+2UL] -=
sum( xmm3 );
1902 y[i+3UL] -=
sum( xmm4 );
1903 y[i+4UL] -=
sum( xmm5 );
1904 y[i+5UL] -=
sum( xmm6 );
1905 y[i+6UL] -=
sum( xmm7 );
1906 y[i+7UL] -=
sum( xmm8 );
1908 for( ; remainder && j<jend; ++j ) {
1909 y[i ] -= A(i ,j) * x[j];
1910 y[i+1UL] -= A(i+1UL,j) * x[j];
1911 y[i+2UL] -= A(i+2UL,j) * x[j];
1912 y[i+3UL] -= A(i+3UL,j) * x[j];
1913 y[i+4UL] -= A(i+4UL,j) * x[j];
1914 y[i+5UL] -= A(i+5UL,j) * x[j];
1915 y[i+6UL] -= A(i+6UL,j) * x[j];
1916 y[i+7UL] -= A(i+7UL,j) * x[j];
1920 for( ; (i+4UL) <= M; i+=4UL )
1930 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1931 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1936 for( ; j<jpos; j+=SIMDSIZE ) {
1938 xmm1 += A.load(i ,j) * x1;
1939 xmm2 += A.load(i+1UL,j) * x1;
1940 xmm3 += A.load(i+2UL,j) * x1;
1941 xmm4 += A.load(i+3UL,j) * x1;
1944 y[i ] -=
sum( xmm1 );
1945 y[i+1UL] -=
sum( xmm2 );
1946 y[i+2UL] -=
sum( xmm3 );
1947 y[i+3UL] -=
sum( xmm4 );
1949 for( ; remainder && j<jend; ++j ) {
1950 y[i ] -= A(i ,j) * x[j];
1951 y[i+1UL] -= A(i+1UL,j) * x[j];
1952 y[i+2UL] -= A(i+2UL,j) * x[j];
1953 y[i+3UL] -= A(i+3UL,j) * x[j];
1957 for( ; (i+3UL) <= M; i+=3UL )
1967 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1968 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1973 for( ; j<jpos; j+=SIMDSIZE ) {
1975 xmm1 += A.load(i ,j) * x1;
1976 xmm2 += A.load(i+1UL,j) * x1;
1977 xmm3 += A.load(i+2UL,j) * x1;
1980 y[i ] -=
sum( xmm1 );
1981 y[i+1UL] -=
sum( xmm2 );
1982 y[i+2UL] -=
sum( xmm3 );
1984 for( ; remainder && j<jend; ++j ) {
1985 y[i ] -= A(i ,j) * x[j];
1986 y[i+1UL] -= A(i+1UL,j) * x[j];
1987 y[i+2UL] -= A(i+2UL,j) * x[j];
1991 for( ; (i+2UL) <= M; i+=2UL )
2001 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2002 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2007 for( ; j<jpos; j+=SIMDSIZE ) {
2009 xmm1 += A.load(i ,j) * x1;
2010 xmm2 += A.load(i+1UL,j) * x1;
2013 y[i ] -=
sum( xmm1 );
2014 y[i+1UL] -=
sum( xmm2 );
2016 for( ; remainder && j<jend; ++j ) {
2017 y[i ] -= A(i ,j) * x[j];
2018 y[i+1UL] -= A(i+1UL,j) * x[j];
2032 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2033 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2038 for( ; j<jpos; j+=SIMDSIZE ) {
2039 xmm1 += A.load(i,j) * x.load(j);
2042 y[i] -=
sum( xmm1 );
2044 for( ; remainder && j<jend; ++j ) {
2045 y[i] -= A(i,j) * x[j];
// Overload of selectLargeSubAssignKernel whose visible body forwards y, A and x
// unchanged to selectDefaultSubAssignKernel.
// NOTE(review): the return-type / enabling-condition line is not visible here;
// presumably this is the fallback used when the vectorized kernel does not apply — confirm.
2066 template<
typename VT1
2070 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2072 selectDefaultSubAssignKernel( y, A, x );
2091 template<
typename VT1
2095 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2099 const size_t M( A.rows() );
2100 const size_t N( A.columns() );
2104 for( ; (i+8UL) <= M; i+=8UL )
2114 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2115 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2119 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2120 const size_t j1( j+SIMDSIZE );
2121 const size_t j2( j+SIMDSIZE*2UL );
2122 const size_t j3( j+SIMDSIZE*3UL );
2127 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2128 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2129 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2130 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2131 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2132 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2133 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2134 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
2137 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2138 const size_t j1( j+SIMDSIZE );
2141 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2142 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2143 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2144 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2145 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2146 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2147 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2148 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
2151 for( ; j<jpos; j+=SIMDSIZE ) {
2153 y[i ] -=
sum( A.load(i ,j) * x1 );
2154 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2155 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2156 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2157 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 );
2158 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 );
2159 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 );
2160 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 );
2163 for( ; remainder && j<jend; ++j ) {
2164 y[i ] -= A(i ,j) * x[j];
2165 y[i+1UL] -= A(i+1UL,j) * x[j];
2166 y[i+2UL] -= A(i+2UL,j) * x[j];
2167 y[i+3UL] -= A(i+3UL,j) * x[j];
2168 y[i+4UL] -= A(i+4UL,j) * x[j];
2169 y[i+5UL] -= A(i+5UL,j) * x[j];
2170 y[i+6UL] -= A(i+6UL,j) * x[j];
2171 y[i+7UL] -= A(i+7UL,j) * x[j];
2175 for( ; (i+4UL) <= M; i+=4UL )
2185 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2186 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2190 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2191 const size_t j1( j+SIMDSIZE );
2192 const size_t j2( j+SIMDSIZE*2UL );
2193 const size_t j3( j+SIMDSIZE*3UL );
2198 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2199 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2200 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2201 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2204 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2205 const size_t j1( j+SIMDSIZE );
2208 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2209 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2210 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2211 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2214 for( ; j<jpos; j+=SIMDSIZE ) {
2216 y[i ] -=
sum( A.load(i ,j) * x1 );
2217 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2218 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2219 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2222 for( ; remainder && j<jend; ++j ) {
2223 y[i ] -= A(i ,j) * x[j];
2224 y[i+1UL] -= A(i+1UL,j) * x[j];
2225 y[i+2UL] -= A(i+2UL,j) * x[j];
2226 y[i+3UL] -= A(i+3UL,j) * x[j];
2230 for( ; (i+2UL) <= M; i+=2UL )
2240 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2241 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2245 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2246 const size_t j1( j+SIMDSIZE );
2247 const size_t j2( j+SIMDSIZE*2UL );
2248 const size_t j3( j+SIMDSIZE*3UL );
2253 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2254 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2257 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2258 const size_t j1( j+SIMDSIZE );
2261 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2262 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2265 for( ; j<jpos; j+=SIMDSIZE ) {
2267 y[i ] -=
sum( A.load(i ,j) * x1 );
2268 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2271 for( ; remainder && j<jend; ++j ) {
2272 y[i ] -= A(i ,j) * x[j];
2273 y[i+1UL] -= A(i+1UL,j) * x[j];
2287 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2288 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2292 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2293 const size_t j1( j+SIMDSIZE );
2294 const size_t j2( j+SIMDSIZE*2UL );
2295 const size_t j3( j+SIMDSIZE*3UL );
2300 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2303 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2304 const size_t j1( j+SIMDSIZE );
2307 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2310 for( ; j<jpos; j+=SIMDSIZE ) {
2312 y[i] -=
sum( A.load(i,j) * x1 );
2315 for( ; remainder && j<jend; ++j ) {
2316 y[i] -= A(i,j) * x[j];
// Overload of selectBlasSubAssignKernel whose visible body forwards y, A and x
// unchanged to selectLargeSubAssignKernel.
// NOTE(review): the enabling condition is not visible here; presumably this
// overload is chosen when the BLAS kernel cannot be used — confirm.
2337 template<
typename VT1
2341 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2343 selectLargeSubAssignKernel( y, A, x );
// BLAS-backed overload of selectBlasSubAssignKernel. It is only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are
// non-zero. The condition that chooses between the two visible statements is
// outside this excerpt.
2349 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2363 template<
typename VT1
2367 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// One path subtracts a temporary `tmp` (built in elided lines) from y.
2374 subAssign( y, tmp );
// The other path calls gemv with the factors ET(-1) and ET(1).
// NOTE(review): presumably this computes y = (-1)*A*x + 1*y - confirm against the gemv wrapper.
2377 gemv( y, A, x, ET(-1), ET(1) );
// Fragments of several assignment routines, each templated on VT1. Their names,
// signatures and most of their bodies are outside this excerpt, so only the
// visible statements are described.
//
// Routine 1: a temporary `tmp` (created in elided lines) is applied to ~lhs
// with multAssign.
2401 template<
typename VT1 >
2413 multAssign( ~lhs, tmp );
// Routine 2: same shape, but the temporary is applied to ~lhs with divAssign.
2435 template<
typename VT1 >
2447 divAssign( ~lhs, tmp );
// Routine 3: checks the matrix operand of rhs for zero rows first and then,
// in a separate branch, for zero columns. What each branch does is not visible.
2471 template<
typename VT1 >
2479 if( rhs.
mat_.rows() == 0UL ) {
2482 else if( rhs.
mat_.columns() == 0UL ) {
// Routine 4: only the template header is visible.
2515 template<
typename VT1 >
// Routine 5: a single combined check - zero rows OR zero columns of rhs.mat_.
2548 template<
typename VT1 >
2556 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
// Routine 6: the same combined empty-operand check as routine 5.
2592 template<
typename VT1 >
2600 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
// Routines 7 and 8: only the template headers are visible.
2636 template<
typename VT1 >
2673 template<
typename VT1 >
// Head of a class template whose public base is
// VecScalarMultExpr< DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false > >.
// NOTE(review): the class name itself is elided; from the base-class argument it
// looks like the DVecScalarMultExpr specialization for a scaled
// DMatDVecMultExpr<MT,VT> - confirm against the full file.
2724 template<
typename MT
2728 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false > >
// Compile-time switch: `value` is true when evaluateMatrix or evaluateVector
// is set. The template parameter T1 does not appear in the visible expression.
2759 template<
typename T1 >
2760 struct UseSMPAssign {
2761 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
// Compile-time switch over four types T1..T4. Only one line of its condition is
// visible: it requires T1, T2 and T3 to all report simdEnabled. The rest of the
// condition, including any use of T4, is outside this excerpt.
2769 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2770 struct UseBlasKernel {
2776 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time switch over four types T1..T4. The visible part of `value`
// requires useOptimizedKernels and requires T1, T2 and T3 to all report
// simdEnabled. Further terms of the condition are outside this excerpt.
2791 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2792 struct UseVectorizedDefaultKernel {
2793 enum :
bool { value = useOptimizedKernels &&
2795 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Middle of a condition whose start and end are elided: both MT and VT must
// report simdEnabled.
2831 MT::simdEnabled && VT::simdEnabled &&
// smpAssignable is true only when neither operand needs to be evaluated
// (!evaluateMatrix, !evaluateVector) and both MT and VT are smpAssignable.
2837 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2838 !evaluateVector && VT::smpAssignable };
// Element access fragment: the element of vector_ at `index`, multiplied by scalar_.
2866 return vector_[index] * scalar_;
// Checked access fragment: an index at or beyond vector_.size() enters the
// branch below (its body is elided); otherwise the call is forwarded to
// (*this)[index].
2878 if( index >= vector_.size() ) {
2881 return (*
this)[index];
// Returns the size of the wrapped vector_ expression.
2890 inline size_t size()
const {
2891 return vector_.size();
// Forwards the alias query for `alias` to vector_.canAlias and returns its result.
2921 template<
typename T >
2922 inline bool canAlias(
const T* alias )
const {
2923 return vector_.canAlias( alias );
// Forwards the alias query for `alias` to vector_.isAliased and returns its result.
2933 template<
typename T >
2934 inline bool isAliased(
const T* alias )
const {
2935 return vector_.isAliased( alias );
// Alignment query fragment: forwards to vector_.isAligned().
2945 return vector_.isAligned();
// Tail of a separate boolean expression whose start is elided. The visible
// terms compare the element count A.rows() * A.columns() with
// DMATDVECMULT_THRESHOLD and require size() to exceed SMP_DMATDVECMULT_THRESHOLD.
2960 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2961 (
size() > SMP_DMATDVECMULT_THRESHOLD );
// Entry fragment of an assignment routine templated on VT1 (name elided).
// It checks `left` for zero rows and, in a separate branch, for zero columns;
// the branch bodies are not visible. For the remaining case it dispatches to
// selectAssignKernel with ~lhs, A, x and rhs.scalar_. A and x are set up in
// elided lines.
2983 template<
typename VT1 >
2993 if( left.rows() == 0UL ) {
2996 else if( left.columns() == 0UL ) {
3009 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatcher for the scaled assignment. One branch calls
// selectSmallAssignKernel, the other selectBlasAssignKernel. The visible part
// of the condition compares A.rows() * A.columns() with DMATDVECMULT_THRESHOLD;
// the first part of the condition is elided.
3024 template<
typename VT1
3028 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3032 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3033 selectSmallAssignKernel( y, A, x, scalar );
3035 selectBlasAssignKernel( y, A, x, scalar );
// Default kernel for the scaled assignment: hands the expression A * x * scalar
// to y.assign(). Return type and remaining template parameters are elided.
3053 template<
typename VT1
3058 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3060 y.assign( A * x * scalar );
// Fallback overload of selectSmallAssignKernel: forwards all four arguments to
// selectDefaultAssignKernel. The condition selecting this overload is elided.
3078 template<
typename VT1
3083 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3085 selectDefaultAssignKernel( y, A, x, scalar );
// SIMD overload of selectSmallAssignKernel for the scaled assignment.
// Rows of A are handled in blocks of 8, then 4, 3 and 2, then one row at a time.
// For each block the row/vector products are accumulated in SIMD registers,
// reduced with sum(), multiplied by scalar and stored to y with '='. A scalar
// loop then adds the columns between jpos and jend.
// The declarations of i, j, jend, remainder and x1 are outside this excerpt.
// NOTE(review): x1 is presumably x.load(j), as written out in the single-row
// section - confirm.
3103 template<
typename VT1
3108 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3112 const size_t M( A.rows() );
3113 const size_t N( A.columns() );
// --- blocks of 8 rows ---
3117 for( ; (i+8UL) <= M; i+=8UL )
// jpos is the end of the SIMD-processed column range: jend with its low bits
// masked off when `remainder` is set (the assertion below states this must
// equal jend - jend % SIMDSIZE), otherwise jend itself.
3127 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3128 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One SIMD accumulator per row of the block.
3130 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3133 for( ; j<jpos; j+=SIMDSIZE ) {
3135 xmm1 += A.load(i ,j) * x1;
3136 xmm2 += A.load(i+1UL,j) * x1;
3137 xmm3 += A.load(i+2UL,j) * x1;
3138 xmm4 += A.load(i+3UL,j) * x1;
3139 xmm5 += A.load(i+4UL,j) * x1;
3140 xmm6 += A.load(i+5UL,j) * x1;
3141 xmm7 += A.load(i+6UL,j) * x1;
3142 xmm8 += A.load(i+7UL,j) * x1;
// Reduce each accumulator, scale it and overwrite the target element.
3145 y[i ] =
sum( xmm1 ) * scalar;
3146 y[i+1UL] =
sum( xmm2 ) * scalar;
3147 y[i+2UL] =
sum( xmm3 ) * scalar;
3148 y[i+3UL] =
sum( xmm4 ) * scalar;
3149 y[i+4UL] =
sum( xmm5 ) * scalar;
3150 y[i+5UL] =
sum( xmm6 ) * scalar;
3151 y[i+6UL] =
sum( xmm7 ) * scalar;
3152 y[i+7UL] =
sum( xmm8 ) * scalar;
// Scalar tail: only runs when `remainder` is set; adds the scaled products of
// the columns from j up to jend.
3154 for( ; remainder && j<jend; ++j ) {
3155 y[i ] += A(i ,j) * x[j] * scalar;
3156 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3157 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3158 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3159 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3160 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3161 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3162 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- blocks of 4 rows: same scheme with four accumulators ---
3166 for( ; (i+4UL) <= M; i+=4UL )
3176 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3177 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3182 for( ; j<jpos; j+=SIMDSIZE ) {
3184 xmm1 += A.load(i ,j) * x1;
3185 xmm2 += A.load(i+1UL,j) * x1;
3186 xmm3 += A.load(i+2UL,j) * x1;
3187 xmm4 += A.load(i+3UL,j) * x1;
3190 y[i ] =
sum( xmm1 ) * scalar;
3191 y[i+1UL] =
sum( xmm2 ) * scalar;
3192 y[i+2UL] =
sum( xmm3 ) * scalar;
3193 y[i+3UL] =
sum( xmm4 ) * scalar;
3195 for( ; remainder && j<jend; ++j ) {
3196 y[i ] += A(i ,j) * x[j] * scalar;
3197 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3198 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3199 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- blocks of 3 rows ---
3203 for( ; (i+3UL) <= M; i+=3UL )
3213 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3214 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3219 for( ; j<jpos; j+=SIMDSIZE ) {
3221 xmm1 += A.load(i ,j) * x1;
3222 xmm2 += A.load(i+1UL,j) * x1;
3223 xmm3 += A.load(i+2UL,j) * x1;
3226 y[i ] =
sum( xmm1 ) * scalar;
3227 y[i+1UL] =
sum( xmm2 ) * scalar;
3228 y[i+2UL] =
sum( xmm3 ) * scalar;
3230 for( ; remainder && j<jend; ++j ) {
3231 y[i ] += A(i ,j) * x[j] * scalar;
3232 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3233 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
// --- blocks of 2 rows ---
3237 for( ; (i+2UL) <= M; i+=2UL )
3247 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3248 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3253 for( ; j<jpos; j+=SIMDSIZE ) {
3255 xmm1 += A.load(i ,j) * x1;
3256 xmm2 += A.load(i+1UL,j) * x1;
3259 y[i ] =
sum( xmm1 ) * scalar;
3260 y[i+1UL] =
sum( xmm2 ) * scalar;
3262 for( ; remainder && j<jend; ++j ) {
3263 y[i ] += A(i ,j) * x[j] * scalar;
3264 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- single row (the enclosing condition on i is elided); x is loaded inline ---
3278 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3279 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3284 for( ; j<jpos; j+=SIMDSIZE ) {
3285 xmm1 += A.load(i,j) * x.load(j);
3288 y[i] =
sum( xmm1 ) * scalar;
3290 for( ; remainder && j<jend; ++j ) {
3291 y[i] += A(i,j) * x[j] * scalar;
// Fallback overload of selectLargeAssignKernel: forwards all four arguments to
// selectDefaultAssignKernel. The condition selecting this overload is elided.
3311 template<
typename VT1
3316 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3318 selectDefaultAssignKernel( y, A, x, scalar );
// SIMD overload of selectLargeAssignKernel for the scaled assignment.
// Rows of A are handled in blocks of 8, then 4 and 2, then one row at a time.
// Within a block the columns are consumed four SIMD vectors per step, then two,
// then one, and finally element by element up to jend. Every step adds its
// partial dot product straight into y with '+='.
// The declarations of i, j, jend, remainder and x1..x4 are outside this excerpt.
// NOTE(review): y is only ever updated with '+=' here, so its starting value
// must be set in elided lines. `scalar` does not appear in any visible
// accumulation line; presumably it is applied to y in elided lines - confirm.
3336 template<
typename VT1
3341 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3345 const size_t M( A.rows() );
3346 const size_t N( A.columns() );
// --- blocks of 8 rows ---
3352 for( ; (i+8UL) <= M; i+=8UL )
// jpos is the end of the SIMD-processed column range: jend with its low bits
// masked off when `remainder` is set (the assertion below states this must
// equal jend - jend % SIMDSIZE), otherwise jend itself.
3362 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3363 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD vectors per step; runs while the fourth vector's start index
// (j + 3*SIMDSIZE) is still below jpos.
3367 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3368 const size_t j1( j+SIMDSIZE );
3369 const size_t j2( j+SIMDSIZE*2UL );
3370 const size_t j3( j+SIMDSIZE*3UL );
3375 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3376 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3377 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3378 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3379 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3380 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3381 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3382 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two SIMD vectors per step.
3385 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3386 const size_t j1( j+SIMDSIZE );
3389 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3390 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3391 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3392 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3393 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3394 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3395 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3396 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One SIMD vector per step.
3399 for( ; j<jpos; j+=SIMDSIZE ) {
3401 y[i ] +=
sum( A.load(i ,j) * x1 );
3402 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3403 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3404 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3405 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
3406 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
3407 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
3408 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar tail: only runs when `remainder` is set; covers columns from j to jend.
3411 for( ; remainder && j<jend; ++j ) {
3412 y[i ] += A(i ,j) * x[j];
3413 y[i+1UL] += A(i+1UL,j) * x[j];
3414 y[i+2UL] += A(i+2UL,j) * x[j];
3415 y[i+3UL] += A(i+3UL,j) * x[j];
3416 y[i+4UL] += A(i+4UL,j) * x[j];
3417 y[i+5UL] += A(i+5UL,j) * x[j];
3418 y[i+6UL] += A(i+6UL,j) * x[j];
3419 y[i+7UL] += A(i+7UL,j) * x[j];
// --- blocks of 4 rows: same column scheme ---
3432 for( ; (i+4UL) <= M; i+=4UL )
3442 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3443 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3447 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3448 const size_t j1( j+SIMDSIZE );
3449 const size_t j2( j+SIMDSIZE*2UL );
3450 const size_t j3( j+SIMDSIZE*3UL );
3455 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3456 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3457 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3458 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3461 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3462 const size_t j1( j+SIMDSIZE );
3465 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3466 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3467 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3468 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3471 for( ; j<jpos; j+=SIMDSIZE ) {
3473 y[i ] +=
sum( A.load(i ,j) * x1 );
3474 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3475 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3476 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3479 for( ; remainder && j<jend; ++j ) {
3480 y[i ] += A(i ,j) * x[j];
3481 y[i+1UL] += A(i+1UL,j) * x[j];
3482 y[i+2UL] += A(i+2UL,j) * x[j];
3483 y[i+3UL] += A(i+3UL,j) * x[j];
// --- blocks of 2 rows ---
3492 for( ; (i+2UL) <= M; i+=2UL )
3502 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3503 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3507 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3508 const size_t j1( j+SIMDSIZE );
3509 const size_t j2( j+SIMDSIZE*2UL );
3510 const size_t j3( j+SIMDSIZE*3UL );
3515 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3516 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3519 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3520 const size_t j1( j+SIMDSIZE );
3523 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3524 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3527 for( ; j<jpos; j+=SIMDSIZE ) {
3529 y[i ] +=
sum( A.load(i ,j) * x1 );
3530 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3533 for( ; remainder && j<jend; ++j ) {
3534 y[i ] += A(i ,j) * x[j];
3535 y[i+1UL] += A(i+1UL,j) * x[j];
// --- single row (the enclosing condition on i is elided) ---
3552 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3553 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3557 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3558 const size_t j1( j+SIMDSIZE );
3559 const size_t j2( j+SIMDSIZE*2UL );
3560 const size_t j3( j+SIMDSIZE*3UL );
3565 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3568 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3569 const size_t j1( j+SIMDSIZE );
3572 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3575 for( ; j<jpos; j+=SIMDSIZE ) {
3577 y[i] +=
sum( A.load(i,j) * x1 );
3580 for( ; remainder && j<jend; ++j ) {
3581 y[i] += A(i,j) * x[j];
// Fallback overload of selectBlasAssignKernel: forwards all four arguments to
// selectLargeAssignKernel. The condition selecting this overload is elided.
3603 template<
typename VT1
3608 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3610 selectLargeAssignKernel( y, A, x, scalar );
// BLAS-backed overload of selectBlasAssignKernel. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are non-zero.
// The condition that chooses between the two visible statements is elided.
3615 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3629 template<
typename VT1
3634 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// One path first stores the scaled vector scalar * x in y; whatever is then
// done with A happens in elided lines.
3639 assign( y, scalar * x );
// The other path calls gemv with the factors ET(scalar) and ET(0).
// NOTE(review): presumably this computes y = scalar*A*x + 0*y - confirm against the gemv wrapper.
3643 gemv( y, A, x, ET(scalar), ET(0) );
// Fragment of an assignment routine templated on VT1 (name and signature
// elided): a temporary `tmp`, created in elided lines, is assigned to ~lhs.
3661 template<
typename VT1 >
3673 assign( ~lhs, tmp );
// Entry fragment of an addition-assignment routine templated on VT1 (name
// elided). It checks `left` for zero rows or zero columns (branch body not
// visible) and otherwise dispatches to selectAddAssignKernel with ~lhs, A, x
// and rhs.scalar_. A and x are set up in elided lines.
3689 template<
typename VT1 >
3699 if( left.rows() == 0UL || left.columns() == 0UL ) {
3711 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatcher for the scaled addition assignment. One branch calls
// selectSmallAddAssignKernel, the other selectBlasAddAssignKernel. The visible
// part of the condition compares A.rows() * A.columns() with
// DMATDVECMULT_THRESHOLD; the first part of the condition is elided.
3726 template<
typename VT1
3730 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3734 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3735 selectSmallAddAssignKernel( y, A, x, scalar );
3737 selectBlasAddAssignKernel( y, A, x, scalar );
// Default kernel for the scaled addition assignment: hands the expression
// A * x * scalar to y.addAssign().
3755 template<
typename VT1
3759 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3761 y.addAssign( A * x * scalar );
// Fallback overload of selectSmallAddAssignKernel: forwards all four arguments
// to selectDefaultAddAssignKernel. The condition selecting this overload is elided.
3779 template<
typename VT1
3784 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3786 selectDefaultAddAssignKernel( y, A, x, scalar );
// SIMD overload of selectSmallAddAssignKernel for the scaled addition assignment.
// Rows of A are handled in blocks of 8, then 4, 3 and 2, then one row at a time.
// For each block the row/vector products are accumulated in SIMD registers,
// reduced with sum(), multiplied by scalar and added to y with '+='. A scalar
// loop then adds the columns between jpos and jend.
// The declarations of i, j, jend, remainder and x1 are outside this excerpt.
// NOTE(review): x1 is presumably x.load(j), as written out in the single-row
// section - confirm.
3804 template<
typename VT1
3809 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3813 const size_t M( A.rows() );
3814 const size_t N( A.columns() );
// --- blocks of 8 rows ---
3818 for( ; (i+8UL) <= M; i+=8UL )
// jpos is the end of the SIMD-processed column range: jend with its low bits
// masked off when `remainder` is set (the assertion below states this must
// equal jend - jend % SIMDSIZE), otherwise jend itself.
3828 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3829 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One SIMD accumulator per row of the block.
3831 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3834 for( ; j<jpos; j+=SIMDSIZE ) {
3836 xmm1 += A.load(i ,j) * x1;
3837 xmm2 += A.load(i+1UL,j) * x1;
3838 xmm3 += A.load(i+2UL,j) * x1;
3839 xmm4 += A.load(i+3UL,j) * x1;
3840 xmm5 += A.load(i+4UL,j) * x1;
3841 xmm6 += A.load(i+5UL,j) * x1;
3842 xmm7 += A.load(i+6UL,j) * x1;
3843 xmm8 += A.load(i+7UL,j) * x1;
// Reduce each accumulator, scale it and add it to the target element.
3846 y[i ] +=
sum( xmm1 ) * scalar;
3847 y[i+1UL] +=
sum( xmm2 ) * scalar;
3848 y[i+2UL] +=
sum( xmm3 ) * scalar;
3849 y[i+3UL] +=
sum( xmm4 ) * scalar;
3850 y[i+4UL] +=
sum( xmm5 ) * scalar;
3851 y[i+5UL] +=
sum( xmm6 ) * scalar;
3852 y[i+6UL] +=
sum( xmm7 ) * scalar;
3853 y[i+7UL] +=
sum( xmm8 ) * scalar;
// Scalar tail: only runs when `remainder` is set; adds the scaled products of
// the columns from j up to jend.
3855 for( ; remainder && j<jend; ++j ) {
3856 y[i ] += A(i ,j) * x[j] * scalar;
3857 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3858 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3859 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3860 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3861 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3862 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3863 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- blocks of 4 rows: same scheme with four accumulators ---
3867 for( ; (i+4UL) <= M; i+=4UL )
3877 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3878 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3883 for( ; j<jpos; j+=SIMDSIZE ) {
3885 xmm1 += A.load(i ,j) * x1;
3886 xmm2 += A.load(i+1UL,j) * x1;
3887 xmm3 += A.load(i+2UL,j) * x1;
3888 xmm4 += A.load(i+3UL,j) * x1;
3891 y[i ] +=
sum( xmm1 ) * scalar;
3892 y[i+1UL] +=
sum( xmm2 ) * scalar;
3893 y[i+2UL] +=
sum( xmm3 ) * scalar;
3894 y[i+3UL] +=
sum( xmm4 ) * scalar;
3896 for( ; remainder && j<jend; ++j ) {
3897 y[i ] += A(i ,j) * x[j] * scalar;
3898 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3899 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3900 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- blocks of 3 rows ---
3904 for( ; (i+3UL) <= M; i+=3UL )
3914 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3915 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3920 for( ; j<jpos; j+=SIMDSIZE ) {
3922 xmm1 += A.load(i ,j) * x1;
3923 xmm2 += A.load(i+1UL,j) * x1;
3924 xmm3 += A.load(i+2UL,j) * x1;
3927 y[i ] +=
sum( xmm1 ) * scalar;
3928 y[i+1UL] +=
sum( xmm2 ) * scalar;
3929 y[i+2UL] +=
sum( xmm3 ) * scalar;
3931 for( ; remainder && j<jend; ++j ) {
3932 y[i ] += A(i ,j) * x[j] * scalar;
3933 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3934 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
// --- blocks of 2 rows ---
3938 for( ; (i+2UL) <= M; i+=2UL )
3948 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3949 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3954 for( ; j<jpos; j+=SIMDSIZE ) {
3956 xmm1 += A.load(i ,j) * x1;
3957 xmm2 += A.load(i+1UL,j) * x1;
3960 y[i ] +=
sum( xmm1 ) * scalar;
3961 y[i+1UL] +=
sum( xmm2 ) * scalar;
3963 for( ; remainder && j<jend; ++j ) {
3964 y[i ] += A(i ,j) * x[j] * scalar;
3965 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- single row (the enclosing condition on i is elided); x is loaded inline ---
3979 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3980 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3985 for( ; j<jpos; j+=SIMDSIZE ) {
3986 xmm1 += A.load(i,j) * x.load(j);
3989 y[i] +=
sum( xmm1 ) * scalar;
3991 for( ; remainder && j<jend; ++j ) {
3992 y[i] += A(i,j) * x[j] * scalar;
// Fallback overload of selectLargeAddAssignKernel: forwards all four arguments
// to selectDefaultAddAssignKernel. The condition selecting this overload is elided.
4012 template<
typename VT1
4017 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4019 selectDefaultAddAssignKernel( y, A, x, scalar );
// SIMD overload of selectLargeAddAssignKernel for the scaled addition assignment.
// Rows of A are handled in blocks of 8, then 4 and 2, then one row at a time.
// Within a block the columns are consumed four SIMD vectors per step, then two,
// then one, and finally element by element up to jend. Every step multiplies
// its partial dot product by scalar and adds it to y with '+='.
// The declarations of i, j, jend, remainder and x1..x4 are outside this excerpt.
4037 template<
typename VT1
4042 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4046 const size_t M( A.rows() );
4047 const size_t N( A.columns() );
// --- blocks of 8 rows ---
4051 for( ; (i+8UL) <= M; i+=8UL )
// jpos is the end of the SIMD-processed column range: jend with its low bits
// masked off when `remainder` is set (the assertion below states this must
// equal jend - jend % SIMDSIZE), otherwise jend itself.
4061 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4062 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD vectors per step; runs while the fourth vector's start index
// (j + 3*SIMDSIZE) is still below jpos.
4066 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4067 const size_t j1( j+SIMDSIZE );
4068 const size_t j2( j+SIMDSIZE*2UL );
4069 const size_t j3( j+SIMDSIZE*3UL );
4074 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4075 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4076 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4077 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4078 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4079 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4080 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4081 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
// Two SIMD vectors per step.
4084 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4085 const size_t j1( j+SIMDSIZE );
4088 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4089 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4090 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4091 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4092 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4093 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4094 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4095 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
// One SIMD vector per step.
4098 for( ; j<jpos; j+=SIMDSIZE ) {
4100 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4101 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4102 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4103 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4104 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4105 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4106 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4107 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 ) * scalar;
// Scalar tail: only runs when `remainder` is set; covers columns from j to jend.
4110 for( ; remainder && j<jend; ++j ) {
4111 y[i ] += A(i ,j) * x[j] * scalar;
4112 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4113 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4114 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4115 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4116 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4117 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4118 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- blocks of 4 rows: same column scheme ---
4122 for( ; (i+4UL) <= M; i+=4UL )
4132 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4133 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4137 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4138 const size_t j1( j+SIMDSIZE );
4139 const size_t j2( j+SIMDSIZE*2UL );
4140 const size_t j3( j+SIMDSIZE*3UL );
4145 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4146 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4147 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4148 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4151 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4152 const size_t j1( j+SIMDSIZE );
4155 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4156 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4157 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4158 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4161 for( ; j<jpos; j+=SIMDSIZE ) {
4163 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4164 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4165 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4166 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4169 for( ; remainder && j<jend; ++j ) {
4170 y[i ] += A(i ,j) * x[j] * scalar;
4171 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4172 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4173 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- blocks of 2 rows ---
4177 for( ; (i+2UL) <= M; i+=2UL )
4187 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4188 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4192 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4193 const size_t j1( j+SIMDSIZE );
4194 const size_t j2( j+SIMDSIZE*2UL );
4195 const size_t j3( j+SIMDSIZE*3UL );
4200 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4201 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4204 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4205 const size_t j1( j+SIMDSIZE );
4208 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4209 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4212 for( ; j<jpos; j+=SIMDSIZE ) {
4214 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4215 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4218 for( ; remainder && j<jend; ++j ) {
4219 y[i ] += A(i ,j) * x[j] * scalar;
4220 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- single row (the enclosing condition on i is elided) ---
4234 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4235 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4239 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4240 const size_t j1( j+SIMDSIZE );
4241 const size_t j2( j+SIMDSIZE*2UL );
4242 const size_t j3( j+SIMDSIZE*3UL );
4247 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4250 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4251 const size_t j1( j+SIMDSIZE );
4254 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4257 for( ; j<jpos; j+=SIMDSIZE ) {
4259 y[i] +=
sum( A.load(i,j) * x1 ) * scalar;
4262 for( ; remainder && j<jend; ++j ) {
4263 y[i] += A(i,j) * x[j] * scalar;
// Fallback overload of selectBlasAddAssignKernel: forwards all four arguments
// to selectLargeAddAssignKernel. The condition selecting this overload is elided.
4283 template<
typename VT1
4288 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4290 selectLargeAddAssignKernel( y, A, x, scalar );
// BLAS-backed overload of selectBlasAddAssignKernel. It is only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are
// non-zero. The condition that chooses between the two visible statements is elided.
4295 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4309 template<
typename VT1
4314 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// One path adds a temporary `tmp` (built in elided lines) to y.
4321 addAssign( y, tmp );
// The other path calls gemv with the factors ET(scalar) and ET(1).
// NOTE(review): presumably this computes y = scalar*A*x + 1*y - confirm against the gemv wrapper.
4324 gemv( y, A, x, ET(scalar), ET(1) );
// Entry fragment of a subtraction-assignment routine templated on VT1 (name
// elided). It checks `left` for zero rows or zero columns (branch body not
// visible) and otherwise dispatches to selectSubAssignKernel with ~lhs, A, x
// and rhs.scalar_. A and x are set up in elided lines.
4346 template<
typename VT1 >
4356 if( left.rows() == 0UL || left.columns() == 0UL ) {
4368 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatcher for the scaled subtraction assignment. One branch calls
// selectSmallSubAssignKernel, the other selectBlasSubAssignKernel. The visible
// part of the condition compares A.rows() * A.columns() with
// DMATDVECMULT_THRESHOLD; the first part of the condition is elided.
4383 template<
typename VT1
4387 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4391 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4392 selectSmallSubAssignKernel( y, A, x, scalar );
4394 selectBlasSubAssignKernel( y, A, x, scalar );
// Default kernel for the scaled subtraction assignment: hands the expression
// A * x * scalar to y.subAssign().
4412 template<
typename VT1
4416 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4418 y.subAssign( A * x * scalar );
// Fallback overload of selectSmallSubAssignKernel: forwards all four arguments
// to selectDefaultSubAssignKernel. The condition selecting this overload is elided.
4436 template<
typename VT1
4441 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4443 selectDefaultSubAssignKernel( y, A, x, scalar );
// SIMD overload of selectSmallSubAssignKernel for the scaled subtraction assignment.
// Rows of A are handled in blocks of 8, then 4, 3 and 2, then one row at a time.
// For each block the row/vector products are accumulated in SIMD registers,
// reduced with sum(), multiplied by scalar and subtracted from y with '-='.
// A scalar loop then subtracts the columns between jpos and jend.
// The declarations of i, j, jend, remainder and x1 are outside this excerpt.
// NOTE(review): x1 is presumably x.load(j), as written out in the single-row
// section - confirm.
4461 template<
typename VT1
4466 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4470 const size_t M( A.rows() );
4471 const size_t N( A.columns() );
// --- blocks of 8 rows ---
4475 for( ; (i+8UL) <= M; i+=8UL )
// jpos is the end of the SIMD-processed column range: jend with its low bits
// masked off when `remainder` is set (the assertion below states this must
// equal jend - jend % SIMDSIZE), otherwise jend itself.
4485 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4486 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One SIMD accumulator per row of the block.
4488 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4491 for( ; j<jpos; j+=SIMDSIZE ) {
4493 xmm1 += A.load(i ,j) * x1;
4494 xmm2 += A.load(i+1UL,j) * x1;
4495 xmm3 += A.load(i+2UL,j) * x1;
4496 xmm4 += A.load(i+3UL,j) * x1;
4497 xmm5 += A.load(i+4UL,j) * x1;
4498 xmm6 += A.load(i+5UL,j) * x1;
4499 xmm7 += A.load(i+6UL,j) * x1;
4500 xmm8 += A.load(i+7UL,j) * x1;
// Reduce each accumulator, scale it and subtract it from the target element.
4503 y[i ] -=
sum( xmm1 ) * scalar;
4504 y[i+1UL] -=
sum( xmm2 ) * scalar;
4505 y[i+2UL] -=
sum( xmm3 ) * scalar;
4506 y[i+3UL] -=
sum( xmm4 ) * scalar;
4507 y[i+4UL] -=
sum( xmm5 ) * scalar;
4508 y[i+5UL] -=
sum( xmm6 ) * scalar;
4509 y[i+6UL] -=
sum( xmm7 ) * scalar;
4510 y[i+7UL] -=
sum( xmm8 ) * scalar;
// Scalar tail: only runs when `remainder` is set; subtracts the scaled
// products of the columns from j up to jend.
4512 for( ; remainder && j<jend; ++j ) {
4513 y[i ] -= A(i ,j) * x[j] * scalar;
4514 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4515 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4516 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4517 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4518 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4519 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4520 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
// --- blocks of 4 rows: same scheme with four accumulators ---
4524 for( ; (i+4UL) <= M; i+=4UL )
4534 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4535 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4540 for( ; j<jpos; j+=SIMDSIZE ) {
4542 xmm1 += A.load(i ,j) * x1;
4543 xmm2 += A.load(i+1UL,j) * x1;
4544 xmm3 += A.load(i+2UL,j) * x1;
4545 xmm4 += A.load(i+3UL,j) * x1;
4548 y[i ] -=
sum( xmm1 ) * scalar;
4549 y[i+1UL] -=
sum( xmm2 ) * scalar;
4550 y[i+2UL] -=
sum( xmm3 ) * scalar;
4551 y[i+3UL] -=
sum( xmm4 ) * scalar;
4553 for( ; remainder && j<jend; ++j ) {
4554 y[i ] -= A(i ,j) * x[j] * scalar;
4555 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4556 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4557 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
// --- blocks of 3 rows ---
4561 for( ; (i+3UL) <= M; i+=3UL )
4571 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4572 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4577 for( ; j<jpos; j+=SIMDSIZE ) {
4579 xmm1 += A.load(i ,j) * x1;
4580 xmm2 += A.load(i+1UL,j) * x1;
4581 xmm3 += A.load(i+2UL,j) * x1;
4584 y[i ] -=
sum( xmm1 ) * scalar;
4585 y[i+1UL] -=
sum( xmm2 ) * scalar;
4586 y[i+2UL] -=
sum( xmm3 ) * scalar;
4588 for( ; remainder && j<jend; ++j ) {
4589 y[i ] -= A(i ,j) * x[j] * scalar;
4590 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4591 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
// --- blocks of 2 rows ---
4595 for( ; (i+2UL) <= M; i+=2UL )
4605 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4606 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4611 for( ; j<jpos; j+=SIMDSIZE ) {
4613 xmm1 += A.load(i ,j) * x1;
4614 xmm2 += A.load(i+1UL,j) * x1;
4617 y[i ] -=
sum( xmm1 ) * scalar;
4618 y[i+1UL] -=
sum( xmm2 ) * scalar;
4620 for( ; remainder && j<jend; ++j ) {
4621 y[i ] -= A(i ,j) * x[j] * scalar;
4622 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
// --- single row (the enclosing condition on i is elided); x is loaded inline ---
4636 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4637 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4642 for( ; j<jpos; j+=SIMDSIZE ) {
4643 xmm1 += A.load(i,j) * x.load(j);
4646 y[i] -=
sum( xmm1 ) * scalar;
4648 for( ; remainder && j<jend; ++j ) {
4649 y[i] -= A(i,j) * x[j] * scalar;
// Fallback overload of selectLargeSubAssignKernel: forwards all four arguments
// to selectDefaultSubAssignKernel. The condition selecting this overload is elided.
4669 template<
typename VT1
4674 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4676 selectDefaultSubAssignKernel( y, A, x, scalar );
4694 template<
typename VT1
4699 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4703 const size_t M( A.rows() );
4704 const size_t N( A.columns() );
4708 for( ; (i+8UL) <= M; i+=8UL )
4718 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4719 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4723 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4724 const size_t j1( j+SIMDSIZE );
4725 const size_t j2( j+SIMDSIZE*2UL );
4726 const size_t j3( j+SIMDSIZE*3UL );
4731 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4732 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4733 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4734 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4735 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4736 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4737 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4738 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4741 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4742 const size_t j1( j+SIMDSIZE );
4745 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4746 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4747 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4748 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4749 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4750 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4751 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4752 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4755 for( ; j<jpos; j+=SIMDSIZE ) {
4757 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4758 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4759 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4760 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4761 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4762 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4763 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4764 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 ) * scalar;
4767 for( ; remainder && j<jend; ++j ) {
4768 y[i ] -= A(i ,j) * x[j] * scalar;
4769 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4770 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4771 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4772 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4773 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4774 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4775 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4779 for( ; (i+4UL) <= M; i+=4UL )
4789 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4790 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4794 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4795 const size_t j1( j+SIMDSIZE );
4796 const size_t j2( j+SIMDSIZE*2UL );
4797 const size_t j3( j+SIMDSIZE*3UL );
4802 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4803 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4804 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4805 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4808 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4809 const size_t j1( j+SIMDSIZE );
4812 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4813 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4814 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4815 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4818 for( ; j<jpos; j+=SIMDSIZE ) {
4820 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4821 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4822 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4823 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4826 for( ; remainder && j<jend; ++j ) {
4827 y[i ] -= A(i ,j) * x[j] * scalar;
4828 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4829 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4830 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4834 for( ; (i+2UL) <= M; i+=2UL )
4844 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4845 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4849 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4850 const size_t j1( j+SIMDSIZE );
4851 const size_t j2( j+SIMDSIZE*2UL );
4852 const size_t j3( j+SIMDSIZE*3UL );
4857 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4858 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4861 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4862 const size_t j1( j+SIMDSIZE );
4865 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4866 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4869 for( ; j<jpos; j+=SIMDSIZE ) {
4871 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4872 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4875 for( ; remainder && j<jend; ++j ) {
4876 y[i ] -= A(i ,j) * x[j] * scalar;
4877 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4891 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4892 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4896 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4897 const size_t j1( j+SIMDSIZE );
4898 const size_t j2( j+SIMDSIZE*2UL );
4899 const size_t j3( j+SIMDSIZE*3UL );
4904 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4907 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4908 const size_t j1( j+SIMDSIZE );
4911 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4914 for( ; j<jpos; j+=SIMDSIZE ) {
4916 y[i] -=
sum( A.load(i,j) * x1 ) * scalar;
4919 for( ; remainder && j<jend; ++j ) {
4920 y[i] -= A(i,j) * x[j] * scalar;
4940 template<
typename VT1
4945 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4947 selectLargeSubAssignKernel( y, A, x, scalar );
4952 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4966 template<
typename VT1
4971 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4978 subAssign( y, tmp );
4981 gemv( y, A, x, ET(-scalar), ET(1) );
5003 template<
typename VT1 >
5015 multAssign( ~lhs, tmp );
5035 template<
typename VT1 >
5047 divAssign( ~lhs, tmp );
5069 template<
typename VT1 >
5080 if( left.rows() == 0UL ) {
5083 else if( left.columns() == 0UL ) {
5114 template<
typename VT1 >
5145 template<
typename VT1 >
5156 if( left.rows() == 0UL || left.columns() == 0UL ) {
5190 template<
typename VT1 >
5201 if( left.rows() == 0UL || left.columns() == 0UL ) {
5235 template<
typename VT1 >
5270 template<
typename VT1 >
5344 template<
typename MT
5346 inline decltype(
auto)
5385 template<
typename MT
5387 inline decltype(
auto)
5392 return (~mat).leftOperand() * ( (~mat).
rightOperand() * vec );
5408 template<
typename MT,
typename VT >
5409 struct Size< DMatDVecMultExpr<MT,VT>, 0UL >
5410 :
public Size<MT,0UL>
5426 template<
typename MT,
typename VT >
5427 struct IsAligned< DMatDVecMultExpr<MT,VT> >
5428 :
public And< IsAligned<MT>, IsAligned<VT> >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:71
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
If_< IsExpression< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:213
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:222
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:262
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:219
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:130
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Expression object for dense matrix-dense vector multiplications. The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:122
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
If_< IsExpression< VT >, const VT, const VT &> RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:216
Header file for the IsSame and IsStrictlySame type traits.
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:128
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:172
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDVecMultExpr.h:208
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:133
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template. The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:519
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:87
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:341
System settings for performance optimizations.
Compile time check for data types. This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:353
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:205
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:373
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data. This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types. This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:207
Compile time check for the memory layout of data types. This type trait tests whether the given data t...
Definition: IsContiguous.h:86
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:131
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for data types with padding. This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:309
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:506
Base class for all matrix/vector multiplication expression templates. The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:67
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type. In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:87
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:108
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:329
decltype(auto) operator*(const DenseMatrix< MT1, false > &lhs, const DenseMatrix< MT2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A=B*C$).
Definition: DMatDMatMultExpr.h:8893
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:590
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:385
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:210
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions. The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates. The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
BLAZE_ALWAYS_INLINE ValueType_< T > sum(const SIMDi8< T > &a) noexcept
Returns the sum of all elements in the 8-bit integral SIMD vector.
Definition: Reduction.h:65
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:363
Header file for run time assertion macros.
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:132
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:386
DMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:248
Header file for the IsContiguous type trait.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:154
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:131
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:129
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time check for built-in data types. This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template. The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:206
Compile time check for complex types. This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications. The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:104
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of vectors and matrices. The Size type trait evaluates the size of...
Definition: Size.h:80
Base class for sparse vectors. The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation. The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Constraint on the data type.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions. The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:319
Header file for the MatVecMultExpr base class.
Constraint on the data type.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:296
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:209
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the function trace functionality.