35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_ 123 template<
typename MT
// Helper trait: `value` is true when evaluateMatrix or evaluateVector (or both) is set.
// NOTE(review): T1 is not referenced in the visible part of this struct; the closing
// brace of the struct lies outside this excerpt.
156 template<
typename T1 >
157 struct UseSMPAssign {
158 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
// Helper trait parameterized on three types (T1, T2, T3).
// Only one line of its condition is visible here: it requires simdEnabled on all three
// types. The remaining terms of the condition are outside this excerpt.
168 template<
typename T1,
typename T2,
typename T3 >
169 struct UseBlasKernel {
175 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Helper trait parameterized on three types (T1, T2, T3).
// The single visible condition line requires simdEnabled on T1, T2 and T3; any further
// terms of the condition are not shown in this excerpt.
191 template<
typename T1,
typename T2,
typename T3 >
192 struct UseVectorizedDefaultKernel {
195 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
231 MT::simdEnabled && VT::simdEnabled &&
// smpAssignable is true only when neither operand is flagged for evaluation
// (evaluateMatrix / evaluateVector both false) and both MT and VT report smpAssignable.
236 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
237 !evaluateVector && VT::smpAssignable };
// NOTE(review): the enclosing function header and the branch conditions are not visible
// in this excerpt; the comments below describe only the two visible statements.
// Returns the product of the matrix element at (index,index) and the vector element at index.
270 return mat_(index,index) *
vec_[index];
// n is the number of columns from `begin` to the end of the matrix row range.
280 const size_t n (
mat_.columns() -
begin );
// Element access by index with a range test against the number of matrix rows.
// The body of the out-of-range branch is not visible here (presumably it reports an
// error - confirm against the full file). For a valid index the access is delegated
// to the subscript operator of this object.
297 inline ReturnType
at(
size_t index )
const {
298 if( index >=
mat_.rows() ) {
301 return (*
this)[index];
// Size query; declared const and noexcept. The body is outside this excerpt.
310 inline size_t size() const noexcept {
// Reports whether the expression can alias with the given address: true if either the
// matrix operand or the vector operand reports isAliased() for `alias`.
341 template<
typename T >
342 inline bool canAlias(
const T* alias )
const noexcept {
343 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Reports whether the expression is aliased with the given address: true if either the
// matrix operand or the vector operand reports isAliased() for `alias`.
353 template<
typename T >
354 inline bool isAliased(
const T* alias )
const noexcept {
355 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// NOTE(review): the enclosing function header is not visible in this excerpt.
// The result is true only if both the matrix operand and the vector operand are aligned.
365 return mat_.isAligned() &&
vec_.isAligned();
// NOTE(review): this is the tail of a larger boolean expression whose beginning (and the
// enclosing function header) is not visible in this excerpt.
// Visible terms: the matrix element count (rows * columns) being below DMATDVECMULT_THRESHOLD
// closes one parenthesized group, which is then AND-ed with size() exceeding
// SMP_DMATDVECMULT_THRESHOLD.
377 (
mat_.rows() *
mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
378 (
size() > SMP_DMATDVECMULT_THRESHOLD );
// Assignment entry point fragment. NOTE(review): the function header and the bodies of the
// two branches are not visible in this excerpt.
401 template<
typename VT1 >
// Special case: the matrix operand has no rows.
408 if( rhs.
mat_.rows() == 0UL ) {
// Special case: the matrix operand has no columns.
411 else if( rhs.
mat_.columns() == 0UL ) {
// General case: dispatch to the kernel selection with the target (~lhs) and the operands
// A and x (their definitions are not shown here).
424 DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
// Kernel dispatch for the assignment y = A * x.
// Visible part of the condition: the element count A.rows() * A.columns() being below
// DMATDVECMULT_THRESHOLD (other terms of the condition are not shown). When the condition
// holds the small kernel is used; the BLAS kernel selection is the other visible call
// (presumably the else branch - confirm against the full file).
440 template<
typename VT1
443 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
447 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
448 selectSmallAssignKernel( y, A, x );
450 selectBlasAssignKernel( y, A, x );
// Default assignment kernel taking the target vector y, the matrix A and the vector x.
// Only the signature is visible in this excerpt; the body is not shown.
469 template<
typename VT1
472 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Small-matrix assignment kernel overload that simply forwards to the default kernel.
// NOTE(review): the template constraint selecting this overload is not visible here.
493 template<
typename VT1
497 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
499 selectDefaultAssignKernel( y, A, x );
// Small-matrix assignment kernel, SIMD variant: computes y = A * x row block by row block.
// NOTE(review): this excerpt omits several original lines (e.g. the definitions of `i`, `j`,
// `jend`, `remainder`, and some stores), so the comments describe only what is visible.
518 template<
typename VT1
522 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// M and N cache the row and column counts of A.
526 const size_t M( A.rows() );
527 const size_t N( A.columns() );
// Row block of 8: iterates while at least eight rows remain.
531 for( ; (i+8UL) <= M; i+=8UL )
// jpos: when `remainder` is set, jend is masked with size_t(-SIMDSIZE); the assertion below
// checks that this equals jend rounded down to a multiple of SIMDSIZE. Otherwise jpos is jend.
541 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
542 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One SIMD accumulator per row of the block.
544 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Vectorized column loop: one load of x per step is shared by all eight rows.
547 for( ; j<jpos; j+=SIMDSIZE ) {
548 const SIMDType x1( x.load(j) );
549 xmm1 += A.load(i ,j) * x1;
550 xmm2 += A.load(i+1UL,j) * x1;
551 xmm3 += A.load(i+2UL,j) * x1;
552 xmm4 += A.load(i+3UL,j) * x1;
553 xmm5 += A.load(i+4UL,j) * x1;
554 xmm6 += A.load(i+5UL,j) * x1;
555 xmm7 += A.load(i+6UL,j) * x1;
556 xmm8 += A.load(i+7UL,j) * x1;
// Store the result of sum() of each accumulator into y (plain assignment, overwriting y).
// sum() presumably reduces the SIMD lanes to a scalar - confirm.
560 y[i+1UL] =
sum( xmm2 );
561 y[i+2UL] =
sum( xmm3 );
562 y[i+3UL] =
sum( xmm4 );
563 y[i+4UL] =
sum( xmm5 );
564 y[i+5UL] =
sum( xmm6 );
565 y[i+6UL] =
sum( xmm7 );
566 y[i+7UL] =
sum( xmm8 );
// Scalar tail: only runs when `remainder` is set, covering columns up to jend.
568 for( ; remainder && j<jend; ++j ) {
569 y[i ] += A(i ,j) * x[j];
570 y[i+1UL] += A(i+1UL,j) * x[j];
571 y[i+2UL] += A(i+2UL,j) * x[j];
572 y[i+3UL] += A(i+3UL,j) * x[j];
573 y[i+4UL] += A(i+4UL,j) * x[j];
574 y[i+5UL] += A(i+5UL,j) * x[j];
575 y[i+6UL] += A(i+6UL,j) * x[j];
576 y[i+7UL] += A(i+7UL,j) * x[j];
// Row block of 4: same scheme with four accumulators.
580 for( ; (i+4UL) <= M; i+=4UL )
590 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
591 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
593 SIMDType xmm1, xmm2, xmm3, xmm4;
596 for( ; j<jpos; j+=SIMDSIZE ) {
597 const SIMDType x1( x.load(j) );
598 xmm1 += A.load(i ,j) * x1;
599 xmm2 += A.load(i+1UL,j) * x1;
600 xmm3 += A.load(i+2UL,j) * x1;
601 xmm4 += A.load(i+3UL,j) * x1;
605 y[i+1UL] =
sum( xmm2 );
606 y[i+2UL] =
sum( xmm3 );
607 y[i+3UL] =
sum( xmm4 );
609 for( ; remainder && j<jend; ++j ) {
610 y[i ] += A(i ,j) * x[j];
611 y[i+1UL] += A(i+1UL,j) * x[j];
612 y[i+2UL] += A(i+2UL,j) * x[j];
613 y[i+3UL] += A(i+3UL,j) * x[j];
// Row block of 3: same scheme with three accumulators.
617 for( ; (i+3UL) <= M; i+=3UL )
627 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
628 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
630 SIMDType xmm1, xmm2, xmm3;
633 for( ; j<jpos; j+=SIMDSIZE ) {
634 const SIMDType x1( x.load(j) );
635 xmm1 += A.load(i ,j) * x1;
636 xmm2 += A.load(i+1UL,j) * x1;
637 xmm3 += A.load(i+2UL,j) * x1;
641 y[i+1UL] =
sum( xmm2 );
642 y[i+2UL] =
sum( xmm3 );
644 for( ; remainder && j<jend; ++j ) {
645 y[i ] += A(i ,j) * x[j];
646 y[i+1UL] += A(i+1UL,j) * x[j];
647 y[i+2UL] += A(i+2UL,j) * x[j];
// Row block of 2 (the accumulator declaration is not visible in this excerpt).
651 for( ; (i+2UL) <= M; i+=2UL )
661 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
662 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
667 for( ; j<jpos; j+=SIMDSIZE ) {
668 const SIMDType x1( x.load(j) );
669 xmm1 += A.load(i ,j) * x1;
670 xmm2 += A.load(i+1UL,j) * x1;
674 y[i+1UL] =
sum( xmm2 );
676 for( ; remainder && j<jend; ++j ) {
677 y[i ] += A(i ,j) * x[j];
678 y[i+1UL] += A(i+1UL,j) * x[j];
// Single remaining row (the guarding condition is not visible in this excerpt).
692 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
693 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
698 for( ; j<jpos; j+=SIMDSIZE ) {
699 xmm1 += A.load(i,j) * x.load(j);
704 for( ; remainder && j<jend; ++j ) {
705 y[i] += A(i,j) * x[j];
// Large-matrix assignment kernel overload that simply forwards to the default kernel.
// NOTE(review): the template constraint selecting this overload is not visible here.
726 template<
typename VT1
730 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
732 selectDefaultAssignKernel( y, A, x );
// Large-matrix assignment kernel, SIMD variant. Rows are processed in blocks of 8, 4, 2 and 1;
// within each block the column loop is unrolled over 4, 2 and 1 SIMD vectors, followed by a
// scalar tail. NOTE(review): every visible store accumulates into y with +=, so y is presumably
// reset in a line not shown in this excerpt - confirm. The definitions of `i`, `j`, `jend` and
// `remainder` are likewise not visible.
751 template<
typename VT1
755 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// M and N cache the row and column counts of A.
759 const size_t M( A.rows() );
760 const size_t N( A.columns() );
// Row block of 8.
766 for( ; (i+8UL) <= M; i+=8UL )
// jpos: when `remainder` is set, jend masked with size_t(-SIMDSIZE) (asserted to equal jend
// rounded down to a multiple of SIMDSIZE); otherwise jend itself.
776 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
777 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD vectors of x per step (j advances by 4*SIMDSIZE).
781 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
782 const size_t j1( j+SIMDSIZE );
783 const size_t j2( j+SIMDSIZE*2UL );
784 const size_t j3( j+SIMDSIZE*3UL );
785 const SIMDType x1( x.load(j ) );
786 const SIMDType x2( x.load(j1) );
787 const SIMDType x3( x.load(j2) );
788 const SIMDType x4( x.load(j3) );
789 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
790 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
791 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
792 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
793 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
794 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
795 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
796 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two SIMD vectors of x per step.
799 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
800 const size_t j1( j+SIMDSIZE );
801 const SIMDType x1( x.load(j ) );
802 const SIMDType x2( x.load(j1) );
803 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
804 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
805 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
806 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
807 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
808 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
809 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
810 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One SIMD vector of x per step.
813 for( ; j<jpos; j+=SIMDSIZE ) {
814 const SIMDType x1( x.load(j) );
815 y[i ] +=
sum( A.load(i ,j) * x1 );
816 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
817 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
818 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
819 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
820 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
821 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
822 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar tail: only runs when `remainder` is set.
825 for( ; remainder && j<jend; ++j ) {
826 y[i ] += A(i ,j) * x[j];
827 y[i+1UL] += A(i+1UL,j) * x[j];
828 y[i+2UL] += A(i+2UL,j) * x[j];
829 y[i+3UL] += A(i+3UL,j) * x[j];
830 y[i+4UL] += A(i+4UL,j) * x[j];
831 y[i+5UL] += A(i+5UL,j) * x[j];
832 y[i+6UL] += A(i+6UL,j) * x[j];
833 y[i+7UL] += A(i+7UL,j) * x[j];
// Row block of 4: same column scheme (4x, 2x, 1x SIMD, then scalar tail).
837 for( ; (i+4UL) <= M; i+=4UL )
847 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
848 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
852 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
853 const size_t j1( j+SIMDSIZE );
854 const size_t j2( j+SIMDSIZE*2UL );
855 const size_t j3( j+SIMDSIZE*3UL );
856 const SIMDType x1( x.load(j ) );
857 const SIMDType x2( x.load(j1) );
858 const SIMDType x3( x.load(j2) );
859 const SIMDType x4( x.load(j3) );
860 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
861 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
862 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
863 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
866 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
867 const size_t j1( j+SIMDSIZE );
868 const SIMDType x1( x.load(j ) );
869 const SIMDType x2( x.load(j1) );
870 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
871 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
872 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
873 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
876 for( ; j<jpos; j+=SIMDSIZE ) {
877 const SIMDType x1( x.load(j) );
878 y[i ] +=
sum( A.load(i ,j) * x1 );
879 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
880 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
881 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
884 for( ; remainder && j<jend; ++j ) {
885 y[i ] += A(i ,j) * x[j];
886 y[i+1UL] += A(i+1UL,j) * x[j];
887 y[i+2UL] += A(i+2UL,j) * x[j];
888 y[i+3UL] += A(i+3UL,j) * x[j];
// Row block of 2.
892 for( ; (i+2UL) <= M; i+=2UL )
902 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
903 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
907 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
908 const size_t j1( j+SIMDSIZE );
909 const size_t j2( j+SIMDSIZE*2UL );
910 const size_t j3( j+SIMDSIZE*3UL );
911 const SIMDType x1( x.load(j ) );
912 const SIMDType x2( x.load(j1) );
913 const SIMDType x3( x.load(j2) );
914 const SIMDType x4( x.load(j3) );
915 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
916 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
919 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
920 const size_t j1( j+SIMDSIZE );
921 const SIMDType x1( x.load(j ) );
922 const SIMDType x2( x.load(j1) );
923 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
924 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
927 for( ; j<jpos; j+=SIMDSIZE ) {
928 const SIMDType x1( x.load(j) );
929 y[i ] +=
sum( A.load(i ,j) * x1 );
930 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
933 for( ; remainder && j<jend; ++j ) {
934 y[i ] += A(i ,j) * x[j];
935 y[i+1UL] += A(i+1UL,j) * x[j];
// Single remaining row (the guarding condition is not visible in this excerpt).
949 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
950 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
954 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
955 const size_t j1( j+SIMDSIZE );
956 const size_t j2( j+SIMDSIZE*2UL );
957 const size_t j3( j+SIMDSIZE*3UL );
958 const SIMDType x1( x.load(j ) );
959 const SIMDType x2( x.load(j1) );
960 const SIMDType x3( x.load(j2) );
961 const SIMDType x4( x.load(j3) );
962 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
965 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
966 const size_t j1( j+SIMDSIZE );
967 const SIMDType x1( x.load(j ) );
968 const SIMDType x2( x.load(j1) );
969 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
972 for( ; j<jpos; j+=SIMDSIZE ) {
973 const SIMDType x1( x.load(j) );
974 y[i] +=
sum( A.load(i,j) * x1 );
977 for( ; remainder && j<jend; ++j ) {
978 y[i] += A(i,j) * x[j];
// BLAS assignment kernel overload that forwards to the large-matrix kernel.
// NOTE(review): the template constraint selecting this overload is not visible here.
999 template<
typename VT1
1003 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1005 selectLargeAssignKernel( y, A, x );
// BLAS-backed assignment kernel; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
// The visible call is gemv() with scalar factors ET(1) and ET(0). NOTE(review): in the
// conventional BLAS gemv contract this means y = 1*A*x + 0*y - confirm against the wrapper
// used here. Other statements of the body are not shown in this excerpt.
1011 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1025 template<
typename VT1
1029 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1038 gemv( y, A, x, ET(1), ET(0) );
// Assignment via a temporary. NOTE(review): the function header is not visible in this excerpt.
// The expression is first evaluated into a ResultType temporary through serial( rhs ), and
// that temporary is then assigned to the target (~lhs).
1058 template<
typename VT1 >
1069 const ResultType tmp(
serial( rhs ) );
1070 assign( ~lhs, tmp );
// Addition-assignment entry point fragment. NOTE(review): the function header and the body of
// the early-exit branch are not visible in this excerpt.
1088 template<
typename VT1 >
// Special case: the matrix operand has no rows or no columns.
1095 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
// General case: dispatch to the kernel selection with the target (~lhs) and operands A and x.
1107 DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
// Kernel dispatch for the addition assignment y += A * x.
// Visible part of the condition: A.rows() * A.columns() below DMATDVECMULT_THRESHOLD (other
// terms are not shown). When it holds the small kernel is used; the BLAS kernel selection is
// the other visible call (presumably the else branch - confirm against the full file).
1123 template<
typename VT1
1126 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1130 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1131 selectSmallAddAssignKernel( y, A, x );
1133 selectBlasAddAssignKernel( y, A, x );
// Default addition-assignment kernel: hands the product expression A * x to the
// addAssign() member of the target vector y.
1152 template<
typename VT1
1155 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1157 y.addAssign( A * x );
// Small-matrix addition-assignment kernel overload that forwards to the default kernel.
// NOTE(review): the template constraint selecting this overload is not visible here.
1176 template<
typename VT1
1180 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1182 selectDefaultAddAssignKernel( y, A, x );
// Small-matrix addition-assignment kernel, SIMD variant: computes y += A * x row block by
// row block (blocks of 8, 4, 3, 2 and 1 rows).
// NOTE(review): this excerpt omits several original lines (e.g. the definitions of `i`, `j`,
// `jend`, `remainder`), so the comments describe only what is visible.
1201 template<
typename VT1
1205 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// M and N cache the row and column counts of A.
1209 const size_t M( A.rows() );
1210 const size_t N( A.columns() );
// Row block of 8.
1214 for( ; (i+8UL) <= M; i+=8UL )
// jpos: when `remainder` is set, jend masked with size_t(-SIMDSIZE) (asserted to equal jend
// rounded down to a multiple of SIMDSIZE); otherwise jend itself.
1224 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1225 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One SIMD accumulator per row of the block.
1227 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Vectorized column loop: one load of x per step is shared by all eight rows.
1230 for( ; j<jpos; j+=SIMDSIZE ) {
1231 const SIMDType x1( x.load(j) );
1232 xmm1 += A.load(i ,j) * x1;
1233 xmm2 += A.load(i+1UL,j) * x1;
1234 xmm3 += A.load(i+2UL,j) * x1;
1235 xmm4 += A.load(i+3UL,j) * x1;
1236 xmm5 += A.load(i+4UL,j) * x1;
1237 xmm6 += A.load(i+5UL,j) * x1;
1238 xmm7 += A.load(i+6UL,j) * x1;
1239 xmm8 += A.load(i+7UL,j) * x1;
// Add the result of sum() of each accumulator onto the existing value of y.
1242 y[i ] +=
sum( xmm1 );
1243 y[i+1UL] +=
sum( xmm2 );
1244 y[i+2UL] +=
sum( xmm3 );
1245 y[i+3UL] +=
sum( xmm4 );
1246 y[i+4UL] +=
sum( xmm5 );
1247 y[i+5UL] +=
sum( xmm6 );
1248 y[i+6UL] +=
sum( xmm7 );
1249 y[i+7UL] +=
sum( xmm8 );
// Scalar tail: only runs when `remainder` is set, covering columns up to jend.
1251 for( ; remainder && j<jend; ++j ) {
1252 y[i ] += A(i ,j) * x[j];
1253 y[i+1UL] += A(i+1UL,j) * x[j];
1254 y[i+2UL] += A(i+2UL,j) * x[j];
1255 y[i+3UL] += A(i+3UL,j) * x[j];
1256 y[i+4UL] += A(i+4UL,j) * x[j];
1257 y[i+5UL] += A(i+5UL,j) * x[j];
1258 y[i+6UL] += A(i+6UL,j) * x[j];
1259 y[i+7UL] += A(i+7UL,j) * x[j];
// Row block of 4.
1263 for( ; (i+4UL) <= M; i+=4UL )
1273 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1274 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1276 SIMDType xmm1, xmm2, xmm3, xmm4;
1279 for( ; j<jpos; j+=SIMDSIZE ) {
1280 const SIMDType x1( x.load(j) );
1281 xmm1 += A.load(i ,j) * x1;
1282 xmm2 += A.load(i+1UL,j) * x1;
1283 xmm3 += A.load(i+2UL,j) * x1;
1284 xmm4 += A.load(i+3UL,j) * x1;
1287 y[i ] +=
sum( xmm1 );
1288 y[i+1UL] +=
sum( xmm2 );
1289 y[i+2UL] +=
sum( xmm3 );
1290 y[i+3UL] +=
sum( xmm4 );
1292 for( ; remainder && j<jend; ++j ) {
1293 y[i ] += A(i ,j) * x[j];
1294 y[i+1UL] += A(i+1UL,j) * x[j];
1295 y[i+2UL] += A(i+2UL,j) * x[j];
1296 y[i+3UL] += A(i+3UL,j) * x[j];
// Row block of 3.
1300 for( ; (i+3UL) <= M; i+=3UL )
1310 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1311 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1313 SIMDType xmm1, xmm2, xmm3;
1316 for( ; j<jpos; j+=SIMDSIZE ) {
1317 const SIMDType x1( x.load(j) );
1318 xmm1 += A.load(i ,j) * x1;
1319 xmm2 += A.load(i+1UL,j) * x1;
1320 xmm3 += A.load(i+2UL,j) * x1;
1323 y[i ] +=
sum( xmm1 );
1324 y[i+1UL] +=
sum( xmm2 );
1325 y[i+2UL] +=
sum( xmm3 );
1327 for( ; remainder && j<jend; ++j ) {
1328 y[i ] += A(i ,j) * x[j];
1329 y[i+1UL] += A(i+1UL,j) * x[j];
1330 y[i+2UL] += A(i+2UL,j) * x[j];
// Row block of 2.
1334 for( ; (i+2UL) <= M; i+=2UL )
1344 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1345 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1347 SIMDType xmm1, xmm2;
1350 for( ; j<jpos; j+=SIMDSIZE ) {
1351 const SIMDType x1( x.load(j) );
1352 xmm1 += A.load(i ,j) * x1;
1353 xmm2 += A.load(i+1UL,j) * x1;
1356 y[i ] +=
sum( xmm1 );
1357 y[i+1UL] +=
sum( xmm2 );
1359 for( ; remainder && j<jend; ++j ) {
1360 y[i ] += A(i ,j) * x[j];
1361 y[i+1UL] += A(i+1UL,j) * x[j];
// Single remaining row (the guarding condition and accumulator declaration are not visible).
1375 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1376 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1381 for( ; j<jpos; j+=SIMDSIZE ) {
1382 xmm1 += A.load(i,j) * x.load(j);
1385 y[i] +=
sum( xmm1 );
1387 for( ; remainder && j<jend; ++j ) {
1388 y[i] += A(i,j) * x[j];
// Large-matrix addition-assignment kernel overload that forwards to the default kernel.
// NOTE(review): the template constraint selecting this overload is not visible here.
1409 template<
typename VT1
1413 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1415 selectDefaultAddAssignKernel( y, A, x );
// Large-matrix addition-assignment kernel, SIMD variant: accumulates A * x onto y.
// Rows are processed in blocks of 8, 4, 2 and 1; within each block the column loop is unrolled
// over 4, 2 and 1 SIMD vectors, followed by a scalar tail.
// NOTE(review): the definitions of `i`, `j`, `jend` and `remainder` are not visible in this excerpt.
1434 template<
typename VT1
1438 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// M and N cache the row and column counts of A.
1442 const size_t M( A.rows() );
1443 const size_t N( A.columns() );
// Row block of 8.
1447 for( ; (i+8UL) <= M; i+=8UL )
// jpos: when `remainder` is set, jend masked with size_t(-SIMDSIZE) (asserted to equal jend
// rounded down to a multiple of SIMDSIZE); otherwise jend itself.
1457 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1458 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD vectors of x per step (j advances by 4*SIMDSIZE).
1462 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1463 const size_t j1( j+SIMDSIZE );
1464 const size_t j2( j+SIMDSIZE*2UL );
1465 const size_t j3( j+SIMDSIZE*3UL );
1466 const SIMDType x1( x.load(j ) );
1467 const SIMDType x2( x.load(j1) );
1468 const SIMDType x3( x.load(j2) );
1469 const SIMDType x4( x.load(j3) );
1470 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1471 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1472 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1473 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1474 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1475 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1476 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1477 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two SIMD vectors of x per step.
1480 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1481 const size_t j1( j+SIMDSIZE );
1482 const SIMDType x1( x.load(j ) );
1483 const SIMDType x2( x.load(j1) );
1484 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1485 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1486 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1487 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1488 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1489 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1490 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1491 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One SIMD vector of x per step.
1494 for( ; j<jpos; j+=SIMDSIZE ) {
1495 const SIMDType x1( x.load(j) );
1496 y[i ] +=
sum( A.load(i ,j) * x1 );
1497 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1498 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1499 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1500 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
1501 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
1502 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
1503 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar tail: only runs when `remainder` is set.
1506 for( ; remainder && j<jend; ++j ) {
1507 y[i ] += A(i ,j) * x[j];
1508 y[i+1UL] += A(i+1UL,j) * x[j];
1509 y[i+2UL] += A(i+2UL,j) * x[j];
1510 y[i+3UL] += A(i+3UL,j) * x[j];
1511 y[i+4UL] += A(i+4UL,j) * x[j];
1512 y[i+5UL] += A(i+5UL,j) * x[j];
1513 y[i+6UL] += A(i+6UL,j) * x[j];
1514 y[i+7UL] += A(i+7UL,j) * x[j];
// Row block of 4: same column scheme (4x, 2x, 1x SIMD, then scalar tail).
1518 for( ; (i+4UL) <= M; i+=4UL )
1528 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1529 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1533 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1534 const size_t j1( j+SIMDSIZE );
1535 const size_t j2( j+SIMDSIZE*2UL );
1536 const size_t j3( j+SIMDSIZE*3UL );
1537 const SIMDType x1( x.load(j ) );
1538 const SIMDType x2( x.load(j1) );
1539 const SIMDType x3( x.load(j2) );
1540 const SIMDType x4( x.load(j3) );
1541 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1542 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1543 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1544 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1547 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1548 const size_t j1( j+SIMDSIZE );
1549 const SIMDType x1( x.load(j ) );
1550 const SIMDType x2( x.load(j1) );
1551 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1552 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1553 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1554 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1557 for( ; j<jpos; j+=SIMDSIZE ) {
1558 const SIMDType x1( x.load(j) );
1559 y[i ] +=
sum( A.load(i ,j) * x1 );
1560 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1561 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1562 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1565 for( ; remainder && j<jend; ++j ) {
1566 y[i ] += A(i ,j) * x[j];
1567 y[i+1UL] += A(i+1UL,j) * x[j];
1568 y[i+2UL] += A(i+2UL,j) * x[j];
1569 y[i+3UL] += A(i+3UL,j) * x[j];
// Row block of 2.
1573 for( ; (i+2UL) <= M; i+=2UL )
1583 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1584 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1588 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1589 const size_t j1( j+SIMDSIZE );
1590 const size_t j2( j+SIMDSIZE*2UL );
1591 const size_t j3( j+SIMDSIZE*3UL );
1592 const SIMDType x1( x.load(j ) );
1593 const SIMDType x2( x.load(j1) );
1594 const SIMDType x3( x.load(j2) );
1595 const SIMDType x4( x.load(j3) );
1596 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1597 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1600 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1601 const size_t j1( j+SIMDSIZE );
1602 const SIMDType x1( x.load(j ) );
1603 const SIMDType x2( x.load(j1) );
1604 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1605 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1608 for( ; j<jpos; j+=SIMDSIZE ) {
1609 const SIMDType x1( x.load(j) );
1610 y[i ] +=
sum( A.load(i ,j) * x1 );
1611 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1614 for( ; remainder && j<jend; ++j ) {
1615 y[i ] += A(i ,j) * x[j];
1616 y[i+1UL] += A(i+1UL,j) * x[j];
// Single remaining row (the guarding condition is not visible in this excerpt).
1630 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1631 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1635 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1636 const size_t j1( j+SIMDSIZE );
1637 const size_t j2( j+SIMDSIZE*2UL );
1638 const size_t j3( j+SIMDSIZE*3UL );
1639 const SIMDType x1( x.load(j ) );
1640 const SIMDType x2( x.load(j1) );
1641 const SIMDType x3( x.load(j2) );
1642 const SIMDType x4( x.load(j3) );
1643 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1646 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1647 const size_t j1( j+SIMDSIZE );
1648 const SIMDType x1( x.load(j ) );
1649 const SIMDType x2( x.load(j1) );
1650 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1653 for( ; j<jpos; j+=SIMDSIZE ) {
1654 const SIMDType x1( x.load(j) );
1655 y[i] +=
sum( A.load(i,j) * x1 );
1658 for( ; remainder && j<jend; ++j ) {
1659 y[i] += A(i,j) * x[j];
// BLAS addition-assignment kernel overload that forwards to the large-matrix kernel.
// NOTE(review): the template constraint selecting this overload is not visible here.
1680 template<
typename VT1
1684 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1686 selectLargeAddAssignKernel( y, A, x );
// BLAS-backed addition-assignment kernel; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
// Two paths are visible: one adds a temporary `tmp` onto y via addAssign(), the other calls
// gemv() with scalar factors ET(1) and ET(1). NOTE(review): the condition choosing between
// them and the construction of `tmp` are not shown in this excerpt; in the conventional BLAS
// gemv contract the factors mean y = 1*A*x + 1*y - confirm against the wrapper used here.
1692 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1706 template<
typename VT1
1710 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1717 addAssign( y, tmp );
1720 gemv( y, A, x, ET(1), ET(1) );
// Subtraction-assignment entry point fragment. NOTE(review): the function header and the body
// of the early-exit branch are not visible in this excerpt.
1744 template<
typename VT1 >
// Special case: the matrix operand has no rows or no columns.
1751 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
// General case: dispatch to the kernel selection with the target (~lhs) and operands A and x.
1763 DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
// Kernel dispatch for the subtraction assignment y -= A * x.
// Visible part of the condition: A.rows() * A.columns() below DMATDVECMULT_THRESHOLD (other
// terms are not shown). When it holds the small kernel is used; the BLAS kernel selection is
// the other visible call (presumably the else branch - confirm against the full file).
1779 template<
typename VT1
1782 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1786 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1787 selectSmallSubAssignKernel( y, A, x );
1789 selectBlasSubAssignKernel( y, A, x );
// Default subtraction-assignment kernel: hands the product expression A * x to the
// subAssign() member of the target vector y.
1808 template<
typename VT1
1811 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1813 y.subAssign( A * x );
// Small-matrix subtraction-assignment kernel overload that forwards to the default kernel.
// NOTE(review): the template constraint selecting this overload is not visible here.
1832 template<
typename VT1
1836 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1838 selectDefaultSubAssignKernel( y, A, x );
// Small-matrix subtraction-assignment kernel, SIMD variant: computes y -= A * x row block by
// row block (blocks of 8, 4, 3, 2 and 1 rows). Products are accumulated positively in SIMD
// registers and subtracted from y once per row block.
// NOTE(review): this excerpt omits several original lines (e.g. the definitions of `i`, `j`,
// `jend`, `remainder`), so the comments describe only what is visible.
1857 template<
typename VT1
1861 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// M and N cache the row and column counts of A.
1865 const size_t M( A.rows() );
1866 const size_t N( A.columns() );
// Row block of 8.
1870 for( ; (i+8UL) <= M; i+=8UL )
// jpos: when `remainder` is set, jend masked with size_t(-SIMDSIZE) (asserted to equal jend
// rounded down to a multiple of SIMDSIZE); otherwise jend itself.
1880 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1881 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One SIMD accumulator per row of the block.
1883 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Vectorized column loop: one load of x per step is shared by all eight rows.
1886 for( ; j<jpos; j+=SIMDSIZE ) {
1887 const SIMDType x1( x.load(j) );
1888 xmm1 += A.load(i ,j) * x1;
1889 xmm2 += A.load(i+1UL,j) * x1;
1890 xmm3 += A.load(i+2UL,j) * x1;
1891 xmm4 += A.load(i+3UL,j) * x1;
1892 xmm5 += A.load(i+4UL,j) * x1;
1893 xmm6 += A.load(i+5UL,j) * x1;
1894 xmm7 += A.load(i+6UL,j) * x1;
1895 xmm8 += A.load(i+7UL,j) * x1;
// Subtract the result of sum() of each accumulator from the existing value of y.
1898 y[i ] -=
sum( xmm1 );
1899 y[i+1UL] -=
sum( xmm2 );
1900 y[i+2UL] -=
sum( xmm3 );
1901 y[i+3UL] -=
sum( xmm4 );
1902 y[i+4UL] -=
sum( xmm5 );
1903 y[i+5UL] -=
sum( xmm6 );
1904 y[i+6UL] -=
sum( xmm7 );
1905 y[i+7UL] -=
sum( xmm8 );
// Scalar tail: only runs when `remainder` is set, covering columns up to jend.
1907 for( ; remainder && j<jend; ++j ) {
1908 y[i ] -= A(i ,j) * x[j];
1909 y[i+1UL] -= A(i+1UL,j) * x[j];
1910 y[i+2UL] -= A(i+2UL,j) * x[j];
1911 y[i+3UL] -= A(i+3UL,j) * x[j];
1912 y[i+4UL] -= A(i+4UL,j) * x[j];
1913 y[i+5UL] -= A(i+5UL,j) * x[j];
1914 y[i+6UL] -= A(i+6UL,j) * x[j];
1915 y[i+7UL] -= A(i+7UL,j) * x[j];
// Row block of 4.
1919 for( ; (i+4UL) <= M; i+=4UL )
1929 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1930 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1932 SIMDType xmm1, xmm2, xmm3, xmm4;
1935 for( ; j<jpos; j+=SIMDSIZE ) {
1936 const SIMDType x1( x.load(j) );
1937 xmm1 += A.load(i ,j) * x1;
1938 xmm2 += A.load(i+1UL,j) * x1;
1939 xmm3 += A.load(i+2UL,j) * x1;
1940 xmm4 += A.load(i+3UL,j) * x1;
1943 y[i ] -=
sum( xmm1 );
1944 y[i+1UL] -=
sum( xmm2 );
1945 y[i+2UL] -=
sum( xmm3 );
1946 y[i+3UL] -=
sum( xmm4 );
1948 for( ; remainder && j<jend; ++j ) {
1949 y[i ] -= A(i ,j) * x[j];
1950 y[i+1UL] -= A(i+1UL,j) * x[j];
1951 y[i+2UL] -= A(i+2UL,j) * x[j];
1952 y[i+3UL] -= A(i+3UL,j) * x[j];
// Row block of 3.
1956 for( ; (i+3UL) <= M; i+=3UL )
1966 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1967 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
1969 SIMDType xmm1, xmm2, xmm3;
1972 for( ; j<jpos; j+=SIMDSIZE ) {
1973 const SIMDType x1( x.load(j) );
1974 xmm1 += A.load(i ,j) * x1;
1975 xmm2 += A.load(i+1UL,j) * x1;
1976 xmm3 += A.load(i+2UL,j) * x1;
1979 y[i ] -=
sum( xmm1 );
1980 y[i+1UL] -=
sum( xmm2 );
1981 y[i+2UL] -=
sum( xmm3 );
1983 for( ; remainder && j<jend; ++j ) {
1984 y[i ] -= A(i ,j) * x[j];
1985 y[i+1UL] -= A(i+1UL,j) * x[j];
1986 y[i+2UL] -= A(i+2UL,j) * x[j];
// Row block of 2.
1990 for( ; (i+2UL) <= M; i+=2UL )
2000 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2001 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2003 SIMDType xmm1, xmm2;
2006 for( ; j<jpos; j+=SIMDSIZE ) {
2007 const SIMDType x1( x.load(j) );
2008 xmm1 += A.load(i ,j) * x1;
2009 xmm2 += A.load(i+1UL,j) * x1;
2012 y[i ] -=
sum( xmm1 );
2013 y[i+1UL] -=
sum( xmm2 );
2015 for( ; remainder && j<jend; ++j ) {
2016 y[i ] -= A(i ,j) * x[j];
2017 y[i+1UL] -= A(i+1UL,j) * x[j];
// Single remaining row (the guarding condition and accumulator declaration are not visible).
2031 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2032 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2037 for( ; j<jpos; j+=SIMDSIZE ) {
2038 xmm1 += A.load(i,j) * x.load(j);
2041 y[i] -=
sum( xmm1 );
2043 for( ; remainder && j<jend; ++j ) {
2044 y[i] -= A(i,j) * x[j];
// Large-matrix subtraction-assignment kernel overload that forwards to the default kernel.
// NOTE(review): the template constraint selecting this overload is not visible here.
2065 template<
typename VT1
2069 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2071 selectDefaultSubAssignKernel( y, A, x );
// Large-matrix subtraction-assignment kernel (SIMD overload): y -= A * x.
//
// y : target vector, updated in place element by element.
// A : matrix operand, read through A.load(row,col) (SIMD) and A(row,col) (scalar).
// x : vector operand, read through x.load(col) (SIMD) and x[col] (scalar).
//
// Rows are handled in blocks of 8, 4 and 2, followed by a single-row section.
// Inside every row block the columns are walked with 4x, 2x and 1x SIMDSIZE
// steps; each step subtracts the horizontal sum of the element-wise products
// directly from y. A scalar loop handles the leftover columns when `remainder`
// is set.
// NOTE(review): the declarations of i, j, jend and remainder are on source
// lines missing from this excerpt.
2090 template<
typename VT1
2094 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2098 const size_t M( A.rows() );
2099 const size_t N( A.columns() );
// --- Row blocks of eight ---
2103 for( ; (i+8UL) <= M; i+=8UL )
// jpos marks the end of the SIMD-processable columns. With a remainder, jend
// is rounded down by masking with size_t(-SIMDSIZE); the assertion checks that
// this equals jend - (jend % SIMDSIZE).
2113 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2114 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD vectors of x per iteration.
2118 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2119 const size_t j1( j+SIMDSIZE );
2120 const size_t j2( j+SIMDSIZE*2UL );
2121 const size_t j3( j+SIMDSIZE*3UL );
2122 const SIMDType x1( x.load(j ) );
2123 const SIMDType x2( x.load(j1) );
2124 const SIMDType x3( x.load(j2) );
2125 const SIMDType x4( x.load(j3) );
2126 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2127 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2128 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2129 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2130 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2131 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2132 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2133 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two SIMD vectors of x per iteration.
2136 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2137 const size_t j1( j+SIMDSIZE );
2138 const SIMDType x1( x.load(j ) );
2139 const SIMDType x2( x.load(j1) );
2140 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2141 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2142 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2143 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2144 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2145 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2146 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2147 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One SIMD vector of x per iteration.
2150 for( ; j<jpos; j+=SIMDSIZE ) {
2151 const SIMDType x1( x.load(j) );
2152 y[i ] -=
sum( A.load(i ,j) * x1 );
2153 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2154 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2155 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2156 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 );
2157 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 );
2158 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 );
2159 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 );
// Scalar clean-up for the columns between jpos and jend.
2162 for( ; remainder && j<jend; ++j ) {
2163 y[i ] -= A(i ,j) * x[j];
2164 y[i+1UL] -= A(i+1UL,j) * x[j];
2165 y[i+2UL] -= A(i+2UL,j) * x[j];
2166 y[i+3UL] -= A(i+3UL,j) * x[j];
2167 y[i+4UL] -= A(i+4UL,j) * x[j];
2168 y[i+5UL] -= A(i+5UL,j) * x[j];
2169 y[i+6UL] -= A(i+6UL,j) * x[j];
2170 y[i+7UL] -= A(i+7UL,j) * x[j];
// --- Row blocks of four (same column scheme as above) ---
2174 for( ; (i+4UL) <= M; i+=4UL )
2184 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2185 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2189 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2190 const size_t j1( j+SIMDSIZE );
2191 const size_t j2( j+SIMDSIZE*2UL );
2192 const size_t j3( j+SIMDSIZE*3UL );
2193 const SIMDType x1( x.load(j ) );
2194 const SIMDType x2( x.load(j1) );
2195 const SIMDType x3( x.load(j2) );
2196 const SIMDType x4( x.load(j3) );
2197 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2198 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2199 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2200 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2203 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2204 const size_t j1( j+SIMDSIZE );
2205 const SIMDType x1( x.load(j ) );
2206 const SIMDType x2( x.load(j1) );
2207 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2208 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2209 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2210 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2213 for( ; j<jpos; j+=SIMDSIZE ) {
2214 const SIMDType x1( x.load(j) );
2215 y[i ] -=
sum( A.load(i ,j) * x1 );
2216 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2217 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2218 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2221 for( ; remainder && j<jend; ++j ) {
2222 y[i ] -= A(i ,j) * x[j];
2223 y[i+1UL] -= A(i+1UL,j) * x[j];
2224 y[i+2UL] -= A(i+2UL,j) * x[j];
2225 y[i+3UL] -= A(i+3UL,j) * x[j];
// --- Row blocks of two ---
2229 for( ; (i+2UL) <= M; i+=2UL )
2239 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2240 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2244 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2245 const size_t j1( j+SIMDSIZE );
2246 const size_t j2( j+SIMDSIZE*2UL );
2247 const size_t j3( j+SIMDSIZE*3UL );
2248 const SIMDType x1( x.load(j ) );
2249 const SIMDType x2( x.load(j1) );
2250 const SIMDType x3( x.load(j2) );
2251 const SIMDType x4( x.load(j3) );
2252 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2253 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2256 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2257 const size_t j1( j+SIMDSIZE );
2258 const SIMDType x1( x.load(j ) );
2259 const SIMDType x2( x.load(j1) );
2260 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2261 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2264 for( ; j<jpos; j+=SIMDSIZE ) {
2265 const SIMDType x1( x.load(j) );
2266 y[i ] -=
sum( A.load(i ,j) * x1 );
2267 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2270 for( ; remainder && j<jend; ++j ) {
2271 y[i ] -= A(i ,j) * x[j];
2272 y[i+1UL] -= A(i+1UL,j) * x[j];
// --- Single-row section ---
// NOTE(review): the statement that opens this section (source lines before
// 2286) is not visible in this excerpt.
2286 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
2287 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
2291 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2292 const size_t j1( j+SIMDSIZE );
2293 const size_t j2( j+SIMDSIZE*2UL );
2294 const size_t j3( j+SIMDSIZE*3UL );
2295 const SIMDType x1( x.load(j ) );
2296 const SIMDType x2( x.load(j1) );
2297 const SIMDType x3( x.load(j2) );
2298 const SIMDType x4( x.load(j3) );
2299 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2302 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2303 const size_t j1( j+SIMDSIZE );
2304 const SIMDType x1( x.load(j ) );
2305 const SIMDType x2( x.load(j1) );
2306 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2309 for( ; j<jpos; j+=SIMDSIZE ) {
2310 const SIMDType x1( x.load(j) );
2311 y[i] -=
sum( A.load(i,j) * x1 );
2314 for( ; remainder && j<jend; ++j ) {
2315 y[i] -= A(i,j) * x[j];
// BLAS subtraction-assignment kernel (fallback overload).
// Forwards (y, A, x) unchanged to selectLargeSubAssignKernel.
// NOTE(review): the condition that selects this overload is not visible in
// this excerpt.
2336 template<
typename VT1
2340 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2342 selectLargeSubAssignKernel( y, A, x );
// BLAS subtraction-assignment kernel; compiled only when both BLAZE_BLAS_MODE
// and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
// Two actions are visible: subtracting a temporary `tmp` from y via
// subAssign(), and a gemv() call with the factors ET(-1) and ET(1).
// NOTE(review): presumably gemv's last two arguments are alpha and beta, which
// would give y = y - A*x; the branch choosing between the two actions and the
// construction of `tmp` are on lines missing from this excerpt - confirm.
2348 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2362 template<
typename VT1
2366 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2373 subAssign( y, tmp );
2376 gemv( y, A, x, ET(-1), ET(1) );
// Multiplication assignment of the product expression to a target vector.
// The expression `rhs` is first evaluated through serial() into a ResultType
// temporary; that temporary is then applied to the target with multAssign().
// NOTE(review): the function signature is not visible in this excerpt.
2400 template<
typename VT1 >
2411 const ResultType tmp(
serial( rhs ) );
2412 multAssign( ~lhs, tmp );
// Division assignment of the product expression to a target vector.
// The expression `rhs` is first evaluated through serial() into a ResultType
// temporary; that temporary is then applied to the target with divAssign().
// NOTE(review): the function signature is not visible in this excerpt.
2434 template<
typename VT1 >
2445 const ResultType tmp(
serial( rhs ) );
2446 divAssign( ~lhs, tmp );
// Fragment of an assignment routine taking one target-vector type parameter.
// Visible logic: special handling when the matrix operand of `rhs` has zero
// rows, and otherwise when it has zero columns.
// NOTE(review): presumably the SMP assignment (smpAssign) of the expression;
// the signature and both branch bodies are missing from this excerpt.
2470 template<
typename VT1 >
2478 if( rhs.
mat_.rows() == 0UL ) {
2481 else if( rhs.
mat_.columns() == 0UL ) {
// Fragment of an assignment routine that materialises the expression `rhs`
// into a ResultType temporary before using it.
// NOTE(review): signature and the statement consuming `tmp` are not visible.
2514 template<
typename VT1 >
2526 const ResultType tmp( rhs );
// Fragment of an assignment routine: special-cases an empty matrix operand
// (zero rows or zero columns).
// NOTE(review): presumably smpAddAssign; the signature and the branch body are
// on lines missing from this excerpt.
2547 template<
typename VT1 >
2555 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
// Fragment of an assignment routine: special-cases an empty matrix operand
// (zero rows or zero columns).
// NOTE(review): presumably smpSubAssign; the signature and the branch body are
// on lines missing from this excerpt.
2591 template<
typename VT1 >
2599 if( rhs.
mat_.rows() == 0UL || rhs.
mat_.columns() == 0UL ) {
// Fragment of an assignment routine that evaluates `rhs` into a ResultType
// temporary.
// NOTE(review): presumably smpMultAssign; signature and the use of `tmp` are
// not visible in this excerpt.
2635 template<
typename VT1 >
2647 const ResultType tmp( rhs );
// Fragment of an assignment routine that evaluates `rhs` into a ResultType
// temporary.
// NOTE(review): presumably smpDivAssign; signature and the use of `tmp` are
// not visible in this excerpt.
2672 template<
typename VT1 >
2684 const ResultType tmp( rhs );
// Header of the DVecScalarMultExpr specialization whose vector operand is a
// DMatDVecMultExpr<MT,VT>, i.e. a matrix/vector product multiplied by a scalar
// of type ST. It derives from DenseVector, passing its own type as the first
// template argument.
// NOTE(review): the remaining template parameters and the `class` keyword are
// on lines missing from this excerpt.
2723 template<
typename MT
2727 :
public DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false >
// Compile-time switch: `value` is true when at least one of the two operands
// has to be evaluated first (evaluateMatrix || evaluateVector).
// The template parameter T1 does not appear in the visible expression.
2759 template<
typename T1 >
2760 struct UseSMPAssign {
2761 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
// Compile-time switch deciding whether a BLAS kernel may be used for the four
// involved types. The only visible part of the condition requires the first
// three types to be SIMD-enabled.
// NOTE(review): further terms of the condition are on lines missing from this
// excerpt.
2769 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2770 struct UseBlasKernel {
2776 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time switch deciding whether the vectorized default kernels may be
// used for the four involved types. The only visible part of the condition
// requires the first three types to be SIMD-enabled.
// NOTE(review): further terms of the condition are on lines missing from this
// excerpt.
2791 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2792 struct UseVectorizedDefaultKernel {
2795 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Part of a compile-time flag that requires both operand types to be
// SIMD-enabled. NOTE(review): the enum it belongs to and the remaining terms
// are not visible in this excerpt.
2831 MT::simdEnabled && VT::simdEnabled &&
// smpAssignable: true only if neither operand needs prior evaluation and both
// operand types are themselves SMP-assignable.
2837 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2838 !evaluateVector && VT::smpAssignable };
// Subscript operator: returns element `index` of the wrapped vector expression
// multiplied by the stored scalar. No bounds check appears in the visible
// lines.
2864 inline ReturnType
operator[](
size_t index )
const {
2866 return vector_[index] * scalar_;
// Checked element access: an index at or beyond vector_.size() takes the error
// branch; otherwise the access is delegated to operator[].
// NOTE(review): the body of the error branch is on lines missing from this
// excerpt.
2877 inline ReturnType
at(
size_t index )
const {
2878 if( index >= vector_.size() ) {
2881 return (*
this)[index];
// Returns the size of the expression, which is the size reported by the
// wrapped vector expression.
2890 inline size_t size()
const {
2891 return vector_.size();
// Reports whether the expression can alias with the object at address `alias`.
// The answer is delegated entirely to the wrapped vector expression's
// canAlias().
2921 template<
typename T >
2922 inline bool canAlias(
const T* alias )
const {
2923 return vector_.canAlias( alias );
// Reports whether the expression is aliased with the object at address
// `alias`. The answer is delegated entirely to the wrapped vector expression's
// isAliased().
2933 template<
typename T >
2934 inline bool isAliased(
const T* alias )
const {
2935 return vector_.isAliased( alias );
// Returns the alignment status reported by the wrapped vector expression.
// NOTE(review): the enclosing function signature is not visible here.
2945 return vector_.isAligned();
// Tail of a boolean expression: one alternative is that the matrix has fewer
// than DMATDVECMULT_THRESHOLD elements (rows * columns); in addition the
// expression size must exceed SMP_DMATDVECMULT_THRESHOLD.
// NOTE(review): presumably the return statement of canSMPAssign(); the
// function header, the definition of A and the earlier alternatives are on
// lines missing from this excerpt.
2958 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2959 (
size() > SMP_DMATDVECMULT_THRESHOLD );
// The vector expression being scaled (left operand of the multiplication).
2965 LeftOperand vector_;
// The scalar factor (right operand of the multiplication).
2966 RightOperand scalar_;
// Assignment of the scaled matrix/vector product to a target vector.
// Empty-matrix cases (zero rows, otherwise zero columns) are handled
// separately; the regular path dispatches to selectAssignKernel with the
// target, the operands A and x, and the stored scalar.
// NOTE(review): the signature, the definitions of `left`, A and x, and the
// bodies of the empty-matrix branches are missing from this excerpt.
2981 template<
typename VT1 >
2991 if( left.rows() == 0UL ) {
2994 else if( left.columns() == 0UL ) {
3007 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatcher for the scaled assignment y = A * x * scalar.
// One visible criterion for the small kernel is a matrix with fewer than
// DMATDVECMULT_THRESHOLD elements; the other visible call goes to the BLAS
// kernel selector.
// NOTE(review): the leading part of the condition and the `else` keyword are
// on lines missing from this excerpt.
3022 template<
typename VT1
3026 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3030 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3031 selectSmallAssignKernel( y, A, x, scalar );
3033 selectBlasAssignKernel( y, A, x, scalar );
// Default kernel for the scaled assignment: rebuilds the expression
// A * x * scalar and hands it to y.assign().
3051 template<
typename VT1
3056 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3058 y.assign( A * x * scalar );
// Small-matrix scaled assignment kernel (fallback overload).
// Forwards all four arguments unchanged to selectDefaultAssignKernel.
// NOTE(review): the condition that selects this overload is not visible in
// this excerpt.
3076 template<
typename VT1
3081 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3083 selectDefaultAssignKernel( y, A, x, scalar );
// Small-matrix scaled assignment kernel (SIMD overload): y = A * x * scalar.
//
// Rows are handled in blocks of 8, 4, 3 and 2, followed by a single-row
// section. For every row one SIMD accumulator (xmmN) gathers the element-wise
// products over the SIMD-processable columns; its horizontal sum times
// `scalar` is then written to y with plain assignment. Leftover columns are
// added afterwards with scalar arithmetic, each term multiplied by `scalar`.
// NOTE(review): the accumulators are declared without a visible initializer,
// so they presumably start at zero via SIMDType's default constructor -
// confirm. The declarations of i, j, jend and remainder are on lines missing
// from this excerpt.
3101 template<
typename VT1
3106 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3110 const size_t M( A.rows() );
3111 const size_t N( A.columns() );
// --- Row blocks of eight ---
3115 for( ; (i+8UL) <= M; i+=8UL )
// jpos marks the end of the SIMD-processable columns. With a remainder, jend
// is rounded down by masking with size_t(-SIMDSIZE); the assertion checks that
// this equals jend - (jend % SIMDSIZE).
3125 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3126 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3128 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Each x vector is loaded once and reused for all eight rows.
3131 for( ; j<jpos; j+=SIMDSIZE ) {
3132 const SIMDType x1( x.load(j) );
3133 xmm1 += A.load(i ,j) * x1;
3134 xmm2 += A.load(i+1UL,j) * x1;
3135 xmm3 += A.load(i+2UL,j) * x1;
3136 xmm4 += A.load(i+3UL,j) * x1;
3137 xmm5 += A.load(i+4UL,j) * x1;
3138 xmm6 += A.load(i+5UL,j) * x1;
3139 xmm7 += A.load(i+6UL,j) * x1;
3140 xmm8 += A.load(i+7UL,j) * x1;
// Reduce the accumulators and store the scaled results (overwrites y).
3143 y[i ] =
sum( xmm1 ) * scalar;
3144 y[i+1UL] =
sum( xmm2 ) * scalar;
3145 y[i+2UL] =
sum( xmm3 ) * scalar;
3146 y[i+3UL] =
sum( xmm4 ) * scalar;
3147 y[i+4UL] =
sum( xmm5 ) * scalar;
3148 y[i+5UL] =
sum( xmm6 ) * scalar;
3149 y[i+6UL] =
sum( xmm7 ) * scalar;
3150 y[i+7UL] =
sum( xmm8 ) * scalar;
// Scalar clean-up for the columns between jpos and jend.
3152 for( ; remainder && j<jend; ++j ) {
3153 y[i ] += A(i ,j) * x[j] * scalar;
3154 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3155 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3156 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3157 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3158 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3159 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3160 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- Row blocks of four ---
3164 for( ; (i+4UL) <= M; i+=4UL )
3174 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3175 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3177 SIMDType xmm1, xmm2, xmm3, xmm4;
3180 for( ; j<jpos; j+=SIMDSIZE ) {
3181 const SIMDType x1( x.load(j) );
3182 xmm1 += A.load(i ,j) * x1;
3183 xmm2 += A.load(i+1UL,j) * x1;
3184 xmm3 += A.load(i+2UL,j) * x1;
3185 xmm4 += A.load(i+3UL,j) * x1;
3188 y[i ] =
sum( xmm1 ) * scalar;
3189 y[i+1UL] =
sum( xmm2 ) * scalar;
3190 y[i+2UL] =
sum( xmm3 ) * scalar;
3191 y[i+3UL] =
sum( xmm4 ) * scalar;
3193 for( ; remainder && j<jend; ++j ) {
3194 y[i ] += A(i ,j) * x[j] * scalar;
3195 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3196 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3197 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- Row blocks of three ---
3201 for( ; (i+3UL) <= M; i+=3UL )
3211 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3212 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3214 SIMDType xmm1, xmm2, xmm3;
3217 for( ; j<jpos; j+=SIMDSIZE ) {
3218 const SIMDType x1( x.load(j) );
3219 xmm1 += A.load(i ,j) * x1;
3220 xmm2 += A.load(i+1UL,j) * x1;
3221 xmm3 += A.load(i+2UL,j) * x1;
3224 y[i ] =
sum( xmm1 ) * scalar;
3225 y[i+1UL] =
sum( xmm2 ) * scalar;
3226 y[i+2UL] =
sum( xmm3 ) * scalar;
3228 for( ; remainder && j<jend; ++j ) {
3229 y[i ] += A(i ,j) * x[j] * scalar;
3230 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3231 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
// --- Row blocks of two ---
3235 for( ; (i+2UL) <= M; i+=2UL )
3245 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3246 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3248 SIMDType xmm1, xmm2;
3251 for( ; j<jpos; j+=SIMDSIZE ) {
3252 const SIMDType x1( x.load(j) );
3253 xmm1 += A.load(i ,j) * x1;
3254 xmm2 += A.load(i+1UL,j) * x1;
3257 y[i ] =
sum( xmm1 ) * scalar;
3258 y[i+1UL] =
sum( xmm2 ) * scalar;
3260 for( ; remainder && j<jend; ++j ) {
3261 y[i ] += A(i ,j) * x[j] * scalar;
3262 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- Single-row section ---
// NOTE(review): the statement opening this section and the declaration of
// xmm1 are on lines missing from this excerpt.
3276 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3277 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3282 for( ; j<jpos; j+=SIMDSIZE ) {
3283 xmm1 += A.load(i,j) * x.load(j);
3286 y[i] =
sum( xmm1 ) * scalar;
3288 for( ; remainder && j<jend; ++j ) {
3289 y[i] += A(i,j) * x[j] * scalar;
// Large-matrix scaled assignment kernel (fallback overload).
// Forwards all four arguments unchanged to selectDefaultAssignKernel.
// NOTE(review): the condition that selects this overload is not visible in
// this excerpt.
3309 template<
typename VT1
3314 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3316 selectDefaultAssignKernel( y, A, x, scalar );
// Large-matrix scaled assignment kernel (SIMD overload).
//
// Rows are handled in blocks of 8, 4 and 2, followed by a single-row section.
// Inside every row block the columns are walked with 4x, 2x and 1x SIMDSIZE
// steps; each step adds the horizontal sum of the element-wise products to y.
// A scalar loop handles leftover columns when `remainder` is set.
// NOTE(review): every visible statement accumulates into y with `+=` and none
// of them uses `scalar`. The embedded source line numbers jump after each row
// block (e.g. 3417 -> 3430), so the initialisation of y and the scaling by
// `scalar` presumably sit on lines missing from this excerpt - confirm against
// the full file. The declarations of i, j, jend and remainder are missing too.
3334 template<
typename VT1
3339 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3343 const size_t M( A.rows() );
3344 const size_t N( A.columns() );
// --- Row blocks of eight ---
3350 for( ; (i+8UL) <= M; i+=8UL )
// jpos marks the end of the SIMD-processable columns. With a remainder, jend
// is rounded down by masking with size_t(-SIMDSIZE); the assertion checks that
// this equals jend - (jend % SIMDSIZE).
3360 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3361 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD vectors of x per iteration.
3365 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3366 const size_t j1( j+SIMDSIZE );
3367 const size_t j2( j+SIMDSIZE*2UL );
3368 const size_t j3( j+SIMDSIZE*3UL );
3369 const SIMDType x1( x.load(j ) );
3370 const SIMDType x2( x.load(j1) );
3371 const SIMDType x3( x.load(j2) );
3372 const SIMDType x4( x.load(j3) );
3373 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3374 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3375 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3376 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3377 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3378 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3379 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3380 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two SIMD vectors of x per iteration.
3383 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3384 const size_t j1( j+SIMDSIZE );
3385 const SIMDType x1( x.load(j ) );
3386 const SIMDType x2( x.load(j1) );
3387 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3388 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3389 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3390 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3391 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3392 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3393 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3394 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One SIMD vector of x per iteration.
3397 for( ; j<jpos; j+=SIMDSIZE ) {
3398 const SIMDType x1( x.load(j) );
3399 y[i ] +=
sum( A.load(i ,j) * x1 );
3400 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3401 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3402 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3403 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
3404 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
3405 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
3406 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar clean-up for the columns between jpos and jend.
3409 for( ; remainder && j<jend; ++j ) {
3410 y[i ] += A(i ,j) * x[j];
3411 y[i+1UL] += A(i+1UL,j) * x[j];
3412 y[i+2UL] += A(i+2UL,j) * x[j];
3413 y[i+3UL] += A(i+3UL,j) * x[j];
3414 y[i+4UL] += A(i+4UL,j) * x[j];
3415 y[i+5UL] += A(i+5UL,j) * x[j];
3416 y[i+6UL] += A(i+6UL,j) * x[j];
3417 y[i+7UL] += A(i+7UL,j) * x[j];
// --- Row blocks of four ---
3430 for( ; (i+4UL) <= M; i+=4UL )
3440 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3441 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3445 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3446 const size_t j1( j+SIMDSIZE );
3447 const size_t j2( j+SIMDSIZE*2UL );
3448 const size_t j3( j+SIMDSIZE*3UL );
3449 const SIMDType x1( x.load(j ) );
3450 const SIMDType x2( x.load(j1) );
3451 const SIMDType x3( x.load(j2) );
3452 const SIMDType x4( x.load(j3) );
3453 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3454 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3455 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3456 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3459 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3460 const size_t j1( j+SIMDSIZE );
3461 const SIMDType x1( x.load(j ) );
3462 const SIMDType x2( x.load(j1) );
3463 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3464 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3465 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3466 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3469 for( ; j<jpos; j+=SIMDSIZE ) {
3470 const SIMDType x1( x.load(j) );
3471 y[i ] +=
sum( A.load(i ,j) * x1 );
3472 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3473 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3474 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3477 for( ; remainder && j<jend; ++j ) {
3478 y[i ] += A(i ,j) * x[j];
3479 y[i+1UL] += A(i+1UL,j) * x[j];
3480 y[i+2UL] += A(i+2UL,j) * x[j];
3481 y[i+3UL] += A(i+3UL,j) * x[j];
// --- Row blocks of two ---
3490 for( ; (i+2UL) <= M; i+=2UL )
3500 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3501 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3505 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3506 const size_t j1( j+SIMDSIZE );
3507 const size_t j2( j+SIMDSIZE*2UL );
3508 const size_t j3( j+SIMDSIZE*3UL );
3509 const SIMDType x1( x.load(j ) );
3510 const SIMDType x2( x.load(j1) );
3511 const SIMDType x3( x.load(j2) );
3512 const SIMDType x4( x.load(j3) );
3513 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3514 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3517 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3518 const size_t j1( j+SIMDSIZE );
3519 const SIMDType x1( x.load(j ) );
3520 const SIMDType x2( x.load(j1) );
3521 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3522 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3525 for( ; j<jpos; j+=SIMDSIZE ) {
3526 const SIMDType x1( x.load(j) );
3527 y[i ] +=
sum( A.load(i ,j) * x1 );
3528 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3531 for( ; remainder && j<jend; ++j ) {
3532 y[i ] += A(i ,j) * x[j];
3533 y[i+1UL] += A(i+1UL,j) * x[j];
// --- Single-row section ---
// NOTE(review): the statement that opens this section is not visible in this
// excerpt.
3550 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3551 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3555 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3556 const size_t j1( j+SIMDSIZE );
3557 const size_t j2( j+SIMDSIZE*2UL );
3558 const size_t j3( j+SIMDSIZE*3UL );
3559 const SIMDType x1( x.load(j ) );
3560 const SIMDType x2( x.load(j1) );
3561 const SIMDType x3( x.load(j2) );
3562 const SIMDType x4( x.load(j3) );
3563 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3566 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3567 const size_t j1( j+SIMDSIZE );
3568 const SIMDType x1( x.load(j ) );
3569 const SIMDType x2( x.load(j1) );
3570 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3573 for( ; j<jpos; j+=SIMDSIZE ) {
3574 const SIMDType x1( x.load(j) );
3575 y[i] +=
sum( A.load(i,j) * x1 );
3578 for( ; remainder && j<jend; ++j ) {
3579 y[i] += A(i,j) * x[j];
// BLAS scaled assignment kernel (fallback overload).
// Forwards all four arguments unchanged to selectLargeAssignKernel.
// NOTE(review): the condition that selects this overload is not visible in
// this excerpt.
3601 template<
typename VT1
3606 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3608 selectLargeAssignKernel( y, A, x, scalar );
// BLAS scaled assignment kernel; compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
// Two actions are visible: assigning the scaled vector `scalar * x` to y, and
// a gemv() call with the factors ET(scalar) and ET(0).
// NOTE(review): presumably gemv's last two arguments are alpha and beta, which
// would give y = scalar * A * x; the branch choosing between the two actions
// and whatever follows the assign() are on lines missing from this excerpt -
// confirm.
3613 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3627 template<
typename VT1
3632 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3637 assign( y, scalar * x );
3641 gemv( y, A, x, ET(scalar), ET(0) );
// Assignment of the scaled product expression through a temporary.
// `rhs` is evaluated via serial() into a ResultType temporary, which is then
// assigned to the target with assign().
// NOTE(review): the function signature (and hence the kind of target vector
// this overload serves) is not visible in this excerpt.
3659 template<
typename VT1 >
3670 const ResultType tmp(
serial( rhs ) );
3671 assign( ~lhs, tmp );
// Addition assignment of the scaled matrix/vector product to a target vector.
// An empty matrix operand (zero rows or zero columns) is special-cased; the
// regular path dispatches to selectAddAssignKernel with the target, the
// operands A and x, and the stored scalar.
// NOTE(review): the signature, the definitions of `left`, A and x, and the
// body of the empty-matrix branch are missing from this excerpt.
3687 template<
typename VT1 >
3697 if( left.rows() == 0UL || left.columns() == 0UL ) {
3709 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel dispatcher for the scaled addition assignment y += A * x * scalar.
// One visible criterion for the small kernel is a matrix with fewer than
// DMATDVECMULT_THRESHOLD elements; the other visible call goes to the BLAS
// kernel selector.
// NOTE(review): the leading part of the condition and the `else` keyword are
// on lines missing from this excerpt.
3724 template<
typename VT1
3728 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3732 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3733 selectSmallAddAssignKernel( y, A, x, scalar );
3735 selectBlasAddAssignKernel( y, A, x, scalar );
// Default kernel for the scaled addition assignment: rebuilds the expression
// A * x * scalar and hands it to y.addAssign().
3753 template<
typename VT1
3757 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3759 y.addAssign( A * x * scalar );
// Small-matrix scaled addition-assignment kernel (fallback overload).
// Forwards all four arguments unchanged to selectDefaultAddAssignKernel.
// NOTE(review): the condition that selects this overload is not visible in
// this excerpt.
3777 template<
typename VT1
3782 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3784 selectDefaultAddAssignKernel( y, A, x, scalar );
// Small-matrix scaled addition-assignment kernel (SIMD overload):
// y += A * x * scalar.
//
// Rows are handled in blocks of 8, 4, 3 and 2, followed by a single-row
// section. For every row one SIMD accumulator (xmmN) gathers the element-wise
// products over the SIMD-processable columns; its horizontal sum times
// `scalar` is then added to y. Leftover columns are added afterwards with
// scalar arithmetic, each term multiplied by `scalar`.
// NOTE(review): the accumulators are declared without a visible initializer,
// so they presumably start at zero via SIMDType's default constructor -
// confirm. The declarations of i, j, jend and remainder are on lines missing
// from this excerpt.
3802 template<
typename VT1
3807 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3811 const size_t M( A.rows() );
3812 const size_t N( A.columns() );
// --- Row blocks of eight ---
3816 for( ; (i+8UL) <= M; i+=8UL )
// jpos marks the end of the SIMD-processable columns. With a remainder, jend
// is rounded down by masking with size_t(-SIMDSIZE); the assertion checks that
// this equals jend - (jend % SIMDSIZE).
3826 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3827 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3829 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Each x vector is loaded once and reused for all eight rows.
3832 for( ; j<jpos; j+=SIMDSIZE ) {
3833 const SIMDType x1( x.load(j) );
3834 xmm1 += A.load(i ,j) * x1;
3835 xmm2 += A.load(i+1UL,j) * x1;
3836 xmm3 += A.load(i+2UL,j) * x1;
3837 xmm4 += A.load(i+3UL,j) * x1;
3838 xmm5 += A.load(i+4UL,j) * x1;
3839 xmm6 += A.load(i+5UL,j) * x1;
3840 xmm7 += A.load(i+6UL,j) * x1;
3841 xmm8 += A.load(i+7UL,j) * x1;
// Reduce the accumulators and add the scaled results to y.
3844 y[i ] +=
sum( xmm1 ) * scalar;
3845 y[i+1UL] +=
sum( xmm2 ) * scalar;
3846 y[i+2UL] +=
sum( xmm3 ) * scalar;
3847 y[i+3UL] +=
sum( xmm4 ) * scalar;
3848 y[i+4UL] +=
sum( xmm5 ) * scalar;
3849 y[i+5UL] +=
sum( xmm6 ) * scalar;
3850 y[i+6UL] +=
sum( xmm7 ) * scalar;
3851 y[i+7UL] +=
sum( xmm8 ) * scalar;
// Scalar clean-up for the columns between jpos and jend.
3853 for( ; remainder && j<jend; ++j ) {
3854 y[i ] += A(i ,j) * x[j] * scalar;
3855 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3856 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3857 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3858 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3859 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3860 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3861 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- Row blocks of four ---
3865 for( ; (i+4UL) <= M; i+=4UL )
3875 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3876 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3878 SIMDType xmm1, xmm2, xmm3, xmm4;
3881 for( ; j<jpos; j+=SIMDSIZE ) {
3882 const SIMDType x1( x.load(j) );
3883 xmm1 += A.load(i ,j) * x1;
3884 xmm2 += A.load(i+1UL,j) * x1;
3885 xmm3 += A.load(i+2UL,j) * x1;
3886 xmm4 += A.load(i+3UL,j) * x1;
3889 y[i ] +=
sum( xmm1 ) * scalar;
3890 y[i+1UL] +=
sum( xmm2 ) * scalar;
3891 y[i+2UL] +=
sum( xmm3 ) * scalar;
3892 y[i+3UL] +=
sum( xmm4 ) * scalar;
3894 for( ; remainder && j<jend; ++j ) {
3895 y[i ] += A(i ,j) * x[j] * scalar;
3896 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3897 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3898 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- Row blocks of three ---
3902 for( ; (i+3UL) <= M; i+=3UL )
3912 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3913 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3915 SIMDType xmm1, xmm2, xmm3;
3918 for( ; j<jpos; j+=SIMDSIZE ) {
3919 const SIMDType x1( x.load(j) );
3920 xmm1 += A.load(i ,j) * x1;
3921 xmm2 += A.load(i+1UL,j) * x1;
3922 xmm3 += A.load(i+2UL,j) * x1;
3925 y[i ] +=
sum( xmm1 ) * scalar;
3926 y[i+1UL] +=
sum( xmm2 ) * scalar;
3927 y[i+2UL] +=
sum( xmm3 ) * scalar;
3929 for( ; remainder && j<jend; ++j ) {
3930 y[i ] += A(i ,j) * x[j] * scalar;
3931 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3932 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
// --- Row blocks of two ---
3936 for( ; (i+2UL) <= M; i+=2UL )
3946 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3947 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3949 SIMDType xmm1, xmm2;
3952 for( ; j<jpos; j+=SIMDSIZE ) {
3953 const SIMDType x1( x.load(j) );
3954 xmm1 += A.load(i ,j) * x1;
3955 xmm2 += A.load(i+1UL,j) * x1;
3958 y[i ] +=
sum( xmm1 ) * scalar;
3959 y[i+1UL] +=
sum( xmm2 ) * scalar;
3961 for( ; remainder && j<jend; ++j ) {
3962 y[i ] += A(i ,j) * x[j] * scalar;
3963 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- Single-row section ---
// NOTE(review): the statement opening this section and the declaration of
// xmm1 are on lines missing from this excerpt.
3977 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3978 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3983 for( ; j<jpos; j+=SIMDSIZE ) {
3984 xmm1 += A.load(i,j) * x.load(j);
3987 y[i] +=
sum( xmm1 ) * scalar;
3989 for( ; remainder && j<jend; ++j ) {
3990 y[i] += A(i,j) * x[j] * scalar;
// Large-matrix scaled addition-assignment kernel (fallback overload).
// Forwards all four arguments unchanged to selectDefaultAddAssignKernel.
// NOTE(review): the condition that selects this overload is not visible in
// this excerpt.
4010 template<
typename VT1
4015 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4017 selectDefaultAddAssignKernel( y, A, x, scalar );
// Large-matrix scaled addition-assignment kernel (SIMD overload):
// y += A * x * scalar.
//
// Rows are handled in blocks of 8, 4 and 2, followed by a single-row section.
// Inside every row block the columns are walked with 4x, 2x and 1x SIMDSIZE
// steps; each step adds the horizontal sum of the element-wise products,
// multiplied by `scalar`, directly to y. A scalar loop handles leftover
// columns when `remainder` is set, again scaling every term.
// NOTE(review): the declarations of i, j, jend and remainder are on lines
// missing from this excerpt.
4035 template<
typename VT1
4040 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4044 const size_t M( A.rows() );
4045 const size_t N( A.columns() );
// --- Row blocks of eight ---
4049 for( ; (i+8UL) <= M; i+=8UL )
// jpos marks the end of the SIMD-processable columns. With a remainder, jend
// is rounded down by masking with size_t(-SIMDSIZE); the assertion checks that
// this equals jend - (jend % SIMDSIZE).
4059 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4060 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Four SIMD vectors of x per iteration.
4064 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4065 const size_t j1( j+SIMDSIZE );
4066 const size_t j2( j+SIMDSIZE*2UL );
4067 const size_t j3( j+SIMDSIZE*3UL );
4068 const SIMDType x1( x.load(j ) );
4069 const SIMDType x2( x.load(j1) );
4070 const SIMDType x3( x.load(j2) );
4071 const SIMDType x4( x.load(j3) );
4072 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4073 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4074 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4075 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4076 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4077 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4078 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4079 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
// Two SIMD vectors of x per iteration.
4082 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4083 const size_t j1( j+SIMDSIZE );
4084 const SIMDType x1( x.load(j ) );
4085 const SIMDType x2( x.load(j1) );
4086 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4087 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4088 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4089 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4090 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4091 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4092 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4093 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
// One SIMD vector of x per iteration.
4096 for( ; j<jpos; j+=SIMDSIZE ) {
4097 const SIMDType x1( x.load(j) );
4098 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4099 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4100 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4101 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4102 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4103 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4104 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4105 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 ) * scalar;
// Scalar clean-up for the columns between jpos and jend.
4108 for( ; remainder && j<jend; ++j ) {
4109 y[i ] += A(i ,j) * x[j] * scalar;
4110 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4111 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4112 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4113 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4114 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4115 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4116 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- Row blocks of four ---
4120 for( ; (i+4UL) <= M; i+=4UL )
4130 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4131 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4135 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4136 const size_t j1( j+SIMDSIZE );
4137 const size_t j2( j+SIMDSIZE*2UL );
4138 const size_t j3( j+SIMDSIZE*3UL );
4139 const SIMDType x1( x.load(j ) );
4140 const SIMDType x2( x.load(j1) );
4141 const SIMDType x3( x.load(j2) );
4142 const SIMDType x4( x.load(j3) );
4143 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4144 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4145 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4146 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4149 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4150 const size_t j1( j+SIMDSIZE );
4151 const SIMDType x1( x.load(j ) );
4152 const SIMDType x2( x.load(j1) );
4153 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4154 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4155 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4156 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4159 for( ; j<jpos; j+=SIMDSIZE ) {
4160 const SIMDType x1( x.load(j) );
4161 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4162 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4163 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4164 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4167 for( ; remainder && j<jend; ++j ) {
4168 y[i ] += A(i ,j) * x[j] * scalar;
4169 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4170 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4171 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- Row blocks of two ---
4175 for( ; (i+2UL) <= M; i+=2UL )
4185 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4186 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4190 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4191 const size_t j1( j+SIMDSIZE );
4192 const size_t j2( j+SIMDSIZE*2UL );
4193 const size_t j3( j+SIMDSIZE*3UL );
4194 const SIMDType x1( x.load(j ) );
4195 const SIMDType x2( x.load(j1) );
4196 const SIMDType x3( x.load(j2) );
4197 const SIMDType x4( x.load(j3) );
4198 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4199 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4202 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4203 const size_t j1( j+SIMDSIZE );
4204 const SIMDType x1( x.load(j ) );
4205 const SIMDType x2( x.load(j1) );
4206 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4207 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4210 for( ; j<jpos; j+=SIMDSIZE ) {
4211 const SIMDType x1( x.load(j) );
4212 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4213 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4216 for( ; remainder && j<jend; ++j ) {
4217 y[i ] += A(i ,j) * x[j] * scalar;
4218 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- Single-row section ---
// NOTE(review): the statement that opens this section is not visible in this
// excerpt.
4232 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4233 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4237 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4238 const size_t j1( j+SIMDSIZE );
4239 const size_t j2( j+SIMDSIZE*2UL );
4240 const size_t j3( j+SIMDSIZE*3UL );
4241 const SIMDType x1( x.load(j ) );
4242 const SIMDType x2( x.load(j1) );
4243 const SIMDType x3( x.load(j2) );
4244 const SIMDType x4( x.load(j3) );
4245 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4248 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4249 const size_t j1( j+SIMDSIZE );
4250 const SIMDType x1( x.load(j ) );
4251 const SIMDType x2( x.load(j1) );
4252 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4255 for( ; j<jpos; j+=SIMDSIZE ) {
4256 const SIMDType x1( x.load(j) );
4257 y[i] +=
sum( A.load(i,j) * x1 ) * scalar;
4260 for( ; remainder && j<jend; ++j ) {
4261 y[i] += A(i,j) * x[j] * scalar;
// BLAS scaled addition-assignment kernel (fallback overload).
// Forwards all four arguments unchanged to selectLargeAddAssignKernel.
// NOTE(review): the condition that selects this overload is not visible in
// this excerpt.
4281 template<
typename VT1
4286 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4288 selectLargeAddAssignKernel( y, A, x, scalar );
// BLAS-backed overload of the addition-assignment kernel. It is only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are non-zero.
// NOTE(review): the branch condition separating the two statements below and the
// construction of 'tmp' are elided from this listing.
4293 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4307 template<
typename VT1
4312 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// Path 1: a previously computed temporary 'tmp' is added to y.
4319 addAssign( y, tmp );
// Path 2: general matrix/vector multiplication via gemv with the scalar converted to ET.
// Presumably ET(scalar) is the factor applied to A*x and ET(1) the factor applied to
// the existing y (i.e. y is accumulated into) - TODO confirm against the gemv wrapper.
4322 gemv( y, A, x, ET(scalar), ET(1) );
// Entry point of the subtraction assignment of a scaled dense matrix/dense vector
// product to a dense vector.
// NOTE(review): the function signature is elided from this listing; presumably this is
// the subAssign() friend of the DVecScalarMultExpr specialization - confirm.
4344 template<
typename VT1 >
// Guard for an empty left operand (no rows or no columns); the guarded statement is
// not visible in this listing.
4354 if( left.rows() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection with the unwrapped target vector (~lhs), the
// operands A and x (set up on elided lines) and the scalar stored in the expression.
4366 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.
scalar_ );
// Kernel selection for the subtraction assignment y -= A * x * scalar.
//   y       target dense vector
//   A       left-hand side dense matrix operand
//   x       right-hand side dense vector operand
//   scalar  scaling factor
// Chooses between the "small" kernel and the BLAS kernel entry point.
// NOTE(review): the first part of the condition and the line between the two calls
// (presumably an 'else') are elided from this listing.
4381 template<
typename VT1
4385 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// Visible part of the condition: the matrix has fewer elements than DMATDVECMULT_THRESHOLD.
4389 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4390 selectSmallSubAssignKernel( y, A, x, scalar );
4392 selectBlasSubAssignKernel( y, A, x, scalar );
// Default subtraction-assignment kernel for y -= A * x * scalar.
// It does not loop over the elements itself; it builds the expression A * x * scalar
// and passes it to the subAssign() member of the target vector y.
4410 template<
typename VT1
4414 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4416 y.subAssign( A * x * scalar );
// Non-vectorized overload of the "small" subtraction-assignment kernel: it simply
// forwards all arguments to selectDefaultSubAssignKernel().
// NOTE(review): the condition that selects this overload over the vectorized one is
// on lines elided from this listing.
4434 template<
typename VT1
4439 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4441 selectDefaultSubAssignKernel( y, A, x, scalar );
// Vectorized "small" subtraction-assignment kernel for y -= A * x * scalar.
// The rows of A are processed in blocks of 8, 4, 3, 2 and finally single rows. For each
// block the dot products of the rows with x are accumulated in SIMD registers
// (xmm1..xmmN), reduced with sum(), scaled by 'scalar' and subtracted from y once per
// block. Columns that do not fill a whole SIMD vector are handled by a scalar loop.
// NOTE(review): the declarations of i, j, jend and 'remainder', the braces, and the
// loop header of the final single-row section are elided from this listing.
4459 template<
typename VT1
4464 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4468 const size_t M( A.rows() );
4469 const size_t N( A.columns() );
// ---- Blocks of eight rows ----
4473 for( ; (i+8UL) <= M; i+=8UL )
// jpos is the end of the SIMD-processed column range: with 'remainder' set it is jend
// rounded down to a multiple of SIMDSIZE (checked by the assertion), otherwise jend.
4483 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4484 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// One SIMD accumulator per row of the block.
4486 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// SIMD part: each chunk of x is loaded once and reused for all eight rows.
4489 for( ; j<jpos; j+=SIMDSIZE ) {
4490 const SIMDType x1( x.load(j) );
4491 xmm1 += A.load(i ,j) * x1;
4492 xmm2 += A.load(i+1UL,j) * x1;
4493 xmm3 += A.load(i+2UL,j) * x1;
4494 xmm4 += A.load(i+3UL,j) * x1;
4495 xmm5 += A.load(i+4UL,j) * x1;
4496 xmm6 += A.load(i+5UL,j) * x1;
4497 xmm7 += A.load(i+6UL,j) * x1;
4498 xmm8 += A.load(i+7UL,j) * x1;
// Horizontal reduction of each accumulator, scaling and subtraction from y.
4501 y[i ] -=
sum( xmm1 ) * scalar;
4502 y[i+1UL] -=
sum( xmm2 ) * scalar;
4503 y[i+2UL] -=
sum( xmm3 ) * scalar;
4504 y[i+3UL] -=
sum( xmm4 ) * scalar;
4505 y[i+4UL] -=
sum( xmm5 ) * scalar;
4506 y[i+5UL] -=
sum( xmm6 ) * scalar;
4507 y[i+6UL] -=
sum( xmm7 ) * scalar;
4508 y[i+7UL] -=
sum( xmm8 ) * scalar;
// Scalar tail for the columns in [jpos, jend); only runs when 'remainder' is set.
4510 for( ; remainder && j<jend; ++j ) {
4511 y[i ] -= A(i ,j) * x[j] * scalar;
4512 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4513 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4514 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4515 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4516 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4517 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4518 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
// ---- Blocks of four rows (same scheme as above) ----
4522 for( ; (i+4UL) <= M; i+=4UL )
4532 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4533 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4535 SIMDType xmm1, xmm2, xmm3, xmm4;
4538 for( ; j<jpos; j+=SIMDSIZE ) {
4539 const SIMDType x1( x.load(j) );
4540 xmm1 += A.load(i ,j) * x1;
4541 xmm2 += A.load(i+1UL,j) * x1;
4542 xmm3 += A.load(i+2UL,j) * x1;
4543 xmm4 += A.load(i+3UL,j) * x1;
4546 y[i ] -=
sum( xmm1 ) * scalar;
4547 y[i+1UL] -=
sum( xmm2 ) * scalar;
4548 y[i+2UL] -=
sum( xmm3 ) * scalar;
4549 y[i+3UL] -=
sum( xmm4 ) * scalar;
4551 for( ; remainder && j<jend; ++j ) {
4552 y[i ] -= A(i ,j) * x[j] * scalar;
4553 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4554 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4555 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
// ---- Blocks of three rows ----
4559 for( ; (i+3UL) <= M; i+=3UL )
4569 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4570 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4572 SIMDType xmm1, xmm2, xmm3;
4575 for( ; j<jpos; j+=SIMDSIZE ) {
4576 const SIMDType x1( x.load(j) );
4577 xmm1 += A.load(i ,j) * x1;
4578 xmm2 += A.load(i+1UL,j) * x1;
4579 xmm3 += A.load(i+2UL,j) * x1;
4582 y[i ] -=
sum( xmm1 ) * scalar;
4583 y[i+1UL] -=
sum( xmm2 ) * scalar;
4584 y[i+2UL] -=
sum( xmm3 ) * scalar;
4586 for( ; remainder && j<jend; ++j ) {
4587 y[i ] -= A(i ,j) * x[j] * scalar;
4588 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4589 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
// ---- Blocks of two rows ----
4593 for( ; (i+2UL) <= M; i+=2UL )
4603 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4604 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4606 SIMDType xmm1, xmm2;
4609 for( ; j<jpos; j+=SIMDSIZE ) {
4610 const SIMDType x1( x.load(j) );
4611 xmm1 += A.load(i ,j) * x1;
4612 xmm2 += A.load(i+1UL,j) * x1;
4615 y[i ] -=
sum( xmm1 ) * scalar;
4616 y[i+1UL] -=
sum( xmm2 ) * scalar;
4618 for( ; remainder && j<jend; ++j ) {
4619 y[i ] -= A(i ,j) * x[j] * scalar;
4620 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
// ---- Single row ----
// NOTE(review): the enclosing row loop/condition and the declaration of xmm1 for this
// section are elided from this listing.
4634 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4635 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4640 for( ; j<jpos; j+=SIMDSIZE ) {
4641 xmm1 += A.load(i,j) * x.load(j);
4644 y[i] -=
sum( xmm1 ) * scalar;
4646 for( ; remainder && j<jend; ++j ) {
4647 y[i] -= A(i,j) * x[j] * scalar;
// Non-vectorized overload of the "large" subtraction-assignment kernel: it simply
// forwards all arguments to selectDefaultSubAssignKernel().
// NOTE(review): the condition that selects this overload over the vectorized one is
// on lines elided from this listing.
4667 template<
typename VT1
4672 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4674 selectDefaultSubAssignKernel( y, A, x, scalar );
// Vectorized "large" subtraction-assignment kernel for y -= A * x * scalar.
// The rows of A are processed in blocks of 8, 4, 2 and finally single rows. Unlike the
// small kernel, no per-row SIMD accumulators are kept: for every column chunk the
// partial dot products are reduced with sum(), scaled by 'scalar' and subtracted from y
// immediately. The column loop is unrolled in three stages (4, 2 and 1 SIMD vectors per
// iteration) and followed by a scalar loop for the columns that do not fill a SIMD vector.
// NOTE(review): the declarations of i, j, jend and 'remainder', the braces, and the
// loop header of the final single-row section are elided from this listing.
4692 template<
typename VT1
4697 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4701 const size_t M( A.rows() );
4702 const size_t N( A.columns() );
// ---- Blocks of eight rows ----
4706 for( ; (i+8UL) <= M; i+=8UL )
// jpos is the end of the SIMD-processed column range: with 'remainder' set it is jend
// rounded down to a multiple of SIMDSIZE (checked by the assertion), otherwise jend.
4716 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4717 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
// Stage 1: four SIMD vectors of x per iteration, shared by all rows of the block.
4721 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4722 const size_t j1( j+SIMDSIZE );
4723 const size_t j2( j+SIMDSIZE*2UL );
4724 const size_t j3( j+SIMDSIZE*3UL );
4725 const SIMDType x1( x.load(j ) );
4726 const SIMDType x2( x.load(j1) );
4727 const SIMDType x3( x.load(j2) );
4728 const SIMDType x4( x.load(j3) );
4729 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4730 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4731 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4732 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4733 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4734 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4735 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4736 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
// Stage 2: two SIMD vectors of x per iteration.
4739 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4740 const size_t j1( j+SIMDSIZE );
4741 const SIMDType x1( x.load(j ) );
4742 const SIMDType x2( x.load(j1) );
4743 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4744 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4745 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4746 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4747 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4748 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4749 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4750 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
// Stage 3: one SIMD vector of x per iteration.
4753 for( ; j<jpos; j+=SIMDSIZE ) {
4754 const SIMDType x1( x.load(j) );
4755 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4756 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4757 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4758 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4759 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4760 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4761 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4762 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 ) * scalar;
// Scalar tail for the columns in [jpos, jend); only runs when 'remainder' is set.
4765 for( ; remainder && j<jend; ++j ) {
4766 y[i ] -= A(i ,j) * x[j] * scalar;
4767 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4768 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4769 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4770 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4771 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4772 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4773 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
// ---- Blocks of four rows (same three-stage scheme) ----
4777 for( ; (i+4UL) <= M; i+=4UL )
4787 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4788 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4792 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4793 const size_t j1( j+SIMDSIZE );
4794 const size_t j2( j+SIMDSIZE*2UL );
4795 const size_t j3( j+SIMDSIZE*3UL );
4796 const SIMDType x1( x.load(j ) );
4797 const SIMDType x2( x.load(j1) );
4798 const SIMDType x3( x.load(j2) );
4799 const SIMDType x4( x.load(j3) );
4800 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4801 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4802 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4803 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4806 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4807 const size_t j1( j+SIMDSIZE );
4808 const SIMDType x1( x.load(j ) );
4809 const SIMDType x2( x.load(j1) );
4810 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4811 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4812 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4813 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4816 for( ; j<jpos; j+=SIMDSIZE ) {
4817 const SIMDType x1( x.load(j) );
4818 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4819 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4820 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4821 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4824 for( ; remainder && j<jend; ++j ) {
4825 y[i ] -= A(i ,j) * x[j] * scalar;
4826 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4827 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4828 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
// ---- Blocks of two rows ----
4832 for( ; (i+2UL) <= M; i+=2UL )
4842 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4843 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4847 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4848 const size_t j1( j+SIMDSIZE );
4849 const size_t j2( j+SIMDSIZE*2UL );
4850 const size_t j3( j+SIMDSIZE*3UL );
4851 const SIMDType x1( x.load(j ) );
4852 const SIMDType x2( x.load(j1) );
4853 const SIMDType x3( x.load(j2) );
4854 const SIMDType x4( x.load(j3) );
4855 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4856 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4859 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4860 const size_t j1( j+SIMDSIZE );
4861 const SIMDType x1( x.load(j ) );
4862 const SIMDType x2( x.load(j1) );
4863 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4864 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4867 for( ; j<jpos; j+=SIMDSIZE ) {
4868 const SIMDType x1( x.load(j) );
4869 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4870 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4873 for( ; remainder && j<jend; ++j ) {
4874 y[i ] -= A(i ,j) * x[j] * scalar;
4875 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
// ---- Single row ----
// NOTE(review): the enclosing row loop/condition for this section is elided from
// this listing.
4889 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4890 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4894 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4895 const size_t j1( j+SIMDSIZE );
4896 const size_t j2( j+SIMDSIZE*2UL );
4897 const size_t j3( j+SIMDSIZE*3UL );
4898 const SIMDType x1( x.load(j ) );
4899 const SIMDType x2( x.load(j1) );
4900 const SIMDType x3( x.load(j2) );
4901 const SIMDType x4( x.load(j3) );
4902 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4905 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4906 const size_t j1( j+SIMDSIZE );
4907 const SIMDType x1( x.load(j ) );
4908 const SIMDType x2( x.load(j1) );
4909 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4912 for( ; j<jpos; j+=SIMDSIZE ) {
4913 const SIMDType x1( x.load(j) );
4914 y[i] -=
sum( A.load(i,j) * x1 ) * scalar;
4917 for( ; remainder && j<jend; ++j ) {
4918 y[i] -= A(i,j) * x[j] * scalar;
// Overload of the BLAS-style subtraction-assignment kernel for scaled matrix/vector
// products that performs no BLAS call itself: it forwards y, A, x and scalar
// unchanged to selectLargeSubAssignKernel().
// NOTE(review): the remaining template parameters, the return type/enable condition
// and the braces are elided from this listing - confirm against the header.
4938 template<
typename VT1
4943 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4945 selectLargeSubAssignKernel( y, A, x, scalar );
// BLAS-backed overload of the subtraction-assignment kernel. It is only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are non-zero.
// NOTE(review): the branch condition separating the two statements below and the
// construction of 'tmp' are elided from this listing.
4950 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4964 template<
typename VT1
4969 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// Path 1: a previously computed temporary 'tmp' is subtracted from y.
4976 subAssign( y, tmp );
// Path 2: gemv is called with the negated scalar, so the product is subtracted.
// Presumably ET(-scalar) is the factor applied to A*x and ET(1) the factor applied to
// the existing y - TODO confirm against the gemv wrapper.
4979 gemv( y, A, x, ET(-scalar), ET(1) );
// Multiplication assignment of the scaled matrix/vector product to a dense vector.
// NOTE(review): the function signature is elided from this listing; the name is
// inferred from the multAssign() call below - confirm.
5001 template<
typename VT1 >
// The expression is first evaluated into a temporary of type ResultType, wrapped in
// serial(), and the temporary is then passed on to multAssign() together with the
// unwrapped target vector.
5012 const ResultType tmp(
serial( rhs ) );
5013 multAssign( ~lhs, tmp );
// Division assignment of the scaled matrix/vector product to a dense vector.
// NOTE(review): the function signature is elided from this listing; the name is
// inferred from the divAssign() call below - confirm.
5033 template<
typename VT1 >
// The expression is first evaluated into a temporary of type ResultType, wrapped in
// serial(), and the temporary is then passed on to divAssign() together with the
// unwrapped target vector.
5044 const ResultType tmp(
serial( rhs ) );
5045 divAssign( ~lhs, tmp );
5067 template<
typename VT1 >
5078 if( left.rows() == 0UL ) {
5081 else if( left.columns() == 0UL ) {
5112 template<
typename VT1 >
5124 const ResultType tmp( rhs );
5143 template<
typename VT1 >
5154 if( left.rows() == 0UL || left.columns() == 0UL ) {
5188 template<
typename VT1 >
5199 if( left.rows() == 0UL || left.columns() == 0UL ) {
5233 template<
typename VT1 >
5245 const ResultType tmp( rhs );
5268 template<
typename VT1 >
5280 const ResultType tmp( rhs );
5342 template<
typename T1
5379 template<
typename T1
5389 return (~mat).leftOperand() * ( (~mat).
rightOperand() * vec );
// Specialization of the Size type trait for DMatDVecMultExpr. It derives from
// Rows<MT>, i.e. the compile time size of the product is taken from the number of
// rows of the matrix operand type MT. (The class body is elided from this listing.)
5404 template<
typename MT,
typename VT >
5405 struct Size< DMatDVecMultExpr<MT,VT> > :
public Rows<MT>
// Specialization of the IsAligned type trait for DMatDVecMultExpr. It derives from
// BoolConstant with the value of And< IsAligned<MT>, IsAligned<VT> >, i.e. the
// expression counts as aligned only if both operand types are aligned.
// (The class body is elided from this listing.)
5421 template<
typename MT,
typename VT >
5422 struct IsAligned< DMatDVecMultExpr<MT,VT> >
5423 :
public BoolConstant< And< IsAligned<MT>, IsAligned<VT> >::value >
5439 template<
typename MT,
typename VT,
bool AF >
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
If_< IsExpression< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:216
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDVecMultExpr.h:211
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Evaluation of the expression type of a subvector operation. Via this type trait it is possible to...
Definition: SubvectorExprTrait.h:79
Header file for the Rows type trait.
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
BLAZE_ALWAYS_INLINE const complex< int8_t > sum(const SIMDcint8 &a) noexcept
Returns the sum of all elements in the 8-bit integral complex SIMD vector.
Definition: Reduction.h:63
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:265
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Expression object for dense matrix-dense vector multiplications. The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:125
Generic wrapper for a compile time constant integral value. The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:194
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:136
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template. The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:532
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:163
If_< IsExpression< VT >, const VT, const VT &> RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:219
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:342
System settings for performance optimizations.
Compile time check for data types. This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:354
DMatDVecMultExpr< MT, VT > This
Type of this DMatDVecMultExpr instance.
Definition: DMatDVecMultExpr.h:207
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:133
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:71
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:374
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:225
Header file for the IsComplexDouble type trait.
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:135
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data. This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types. This type trait tests whether the given data type ...
Definition: IsAligned.h:87
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:210
Constraint on the data type.
typename MultExprTrait< T1, T2 >::Type MultExprTrait_
Auxiliary alias declaration for the MultExprTrait class template. The MultExprTrait_ alias declaration...
Definition: MultExprTrait.h:344
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for data types with padding. This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:310
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
typename SubvectorExprTrait< VT, AF >::Type SubvectorExprTrait_
Auxiliary alias declaration for the SubvectorExprTrait type trait. The SubvectorExprTrait_ alias decla...
Definition: SubvectorExprTrait.h:133
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:336
Base class for all matrix/vector multiplication expression templates. The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:66
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:128
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:110
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:330
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:603
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:384
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:208
Header file for the IsNumeric type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions. The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:604
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates. The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:66
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:364
Header file for run time assertion macros.
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:385
DMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:251
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:160
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:134
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:209
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:213
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:93
Compile time check for built-in data types. This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Constraint on the data type.
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:74
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template. The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:223
Header file for BLAS general matrix/vector multiplication functions (gemv)
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:132
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
Compile time check for complex types. This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications. The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:117
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Compile time evaluation of the size of a vector. The Size type trait evaluates the size of the given v...
Definition: Size.h:75
Base class for sparse vectors. The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:120
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:212
Compile time evaluation of the number of rows of a matrix. The Rows type trait evaluates the number of...
Definition: Rows.h:76
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Constraint on the data type.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions. The RightOperand_ alias declaration provid...
Definition: Aliases.h:363
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:320
Header file for the MatVecMultExpr base class.
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:222
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:297
const DMatDMatMultExpr< T1, T2, false, false, false, false > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:7505
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the function trace functionality.
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:131