#ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
#define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_

template< typename MT

   template< typename T1 >
   struct UseSMPAssign {
      enum : bool { value = ( evaluateMatrix || evaluateVector ) };

   template< typename T1, typename T2, typename T3 >
   struct UseBlasKernel {
      T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&

   template< typename T1, typename T2, typename T3 >
   struct UseVectorizedDefaultKernel {
      T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&

   MT::simdEnabled && VT::simdEnabled &&

   enum : bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
                                 !evaluateVector && VT::smpAssignable };
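   // DMatDVecMultExpr is the expression template for the multiplication of a row-major dense
   // matrix with a non-transpose dense vector. A minimal usage sketch (assuming the standard
   // Blaze containers; illustrative only, not part of this header):
   //
   //    \code
   //    #include <blaze/Math.h>
   //
   //    blaze::DynamicMatrix<double,blaze::rowMajor> A( 100UL, 100UL, 1.0 );
   //    blaze::DynamicVector<double,blaze::columnVector> x( 100UL, 2.0 ), y;
   //
   //    // 'A * x' merely creates a DMatDVecMultExpr object; the actual multiplication
   //    // kernel is selected and executed when the expression is assigned to 'y'.
   //    y = A * x;
   //    \endcode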
      return mat_(index,index) * vec_[index];

      const size_t n( mat_.columns() - begin );

   inline ReturnType at( size_t index ) const {
      if( index >= mat_.rows() ) {
      return (*this)[index];

   inline size_t size() const noexcept {

   template< typename T >
   inline bool canAlias( const T* alias ) const noexcept {
      return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );

   template< typename T >
   inline bool isAliased( const T* alias ) const noexcept {
      return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );

      return mat_.isAligned() && vec_.isAligned();

               ( mat_.rows() * mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
             ( size() > SMP_DMATDVECMULT_THRESHOLD );

   template< typename VT1 >
      if( rhs.mat_.rows() == 0UL ) {
      else if( rhs.mat_.columns() == 0UL ) {
      DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );

   template< typename VT1
   static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
          ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
         selectSmallAssignKernel( y, A, x );
         selectBlasAssignKernel( y, A, x );
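   // All assignment kernels below compute the same result, y = A * x; they differ only in how
   // the operands are traversed (scalar, SIMD-vectorized, or via BLAS). As a point of reference,
   // a plain scalar version of the operation (an illustrative sketch assuming at least one
   // column, not one of the kernels in this header) looks as follows:
   //
   //    \code
   //    for( size_t i=0UL; i<A.rows(); ++i ) {
   //       y[i] = A(i,0UL) * x[0UL];
   //       for( size_t j=1UL; j<A.columns(); ++j ) {
   //          y[i] += A(i,j) * x[j];
   //       }
   //    }
   //    \endcode
   //
   // The dispatch above selects between the small (SIMD) kernel and the BLAS-based kernel based
   // on the matrix properties and the DMATDVECMULT_THRESHOLD size limit.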
   template< typename VT1
   static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )

   template< typename VT1
   static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
      selectDefaultAssignKernel( y, A, x );

   template< typename VT1
   static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );

      for( ; (i+8UL) <= M; i+=8UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
         SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;
            xmm3 = xmm3 + A.load(i+2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+3UL,j) * x1;
            xmm5 = xmm5 + A.load(i+4UL,j) * x1;
            xmm6 = xmm6 + A.load(i+5UL,j) * x1;
            xmm7 = xmm7 + A.load(i+6UL,j) * x1;
            xmm8 = xmm8 + A.load(i+7UL,j) * x1;

         y[i+1UL] = sum( xmm2 );
         y[i+2UL] = sum( xmm3 );
         y[i+3UL] = sum( xmm4 );
         y[i+4UL] = sum( xmm5 );
         y[i+5UL] = sum( xmm6 );
         y[i+6UL] = sum( xmm7 );
         y[i+7UL] = sum( xmm8 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];
            y[i+3UL] += A(i+3UL,j) * x[j];
            y[i+4UL] += A(i+4UL,j) * x[j];
            y[i+5UL] += A(i+5UL,j) * x[j];
            y[i+6UL] += A(i+6UL,j) * x[j];
            y[i+7UL] += A(i+7UL,j) * x[j];

      for( ; (i+4UL) <= M; i+=4UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         SIMDType xmm1, xmm2, xmm3, xmm4;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;
            xmm3 = xmm3 + A.load(i+2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+3UL,j) * x1;

         y[i+1UL] = sum( xmm2 );
         y[i+2UL] = sum( xmm3 );
         y[i+3UL] = sum( xmm4 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];
            y[i+3UL] += A(i+3UL,j) * x[j];

      for( ; (i+3UL) <= M; i+=3UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         SIMDType xmm1, xmm2, xmm3;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;
            xmm3 = xmm3 + A.load(i+2UL,j) * x1;

         y[i+1UL] = sum( xmm2 );
         y[i+2UL] = sum( xmm3 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];

      for( ; (i+2UL) <= M; i+=2UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;

         y[i+1UL] = sum( xmm2 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];

         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i : i+1UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; j<jpos; j+=SIMDSIZE ) {
            xmm1 = xmm1 + A.load(i,j) * x.load(j);

         for( ; remainder && j<jend; ++j ) {
            y[i] += A(i,j) * x[j];

   template< typename VT1
   static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
      selectDefaultAssignKernel( y, A, x );

   template< typename VT1
   static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );

      for( ; (i+8UL) <= M; i+=8UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i ] += sum( A.load(i ,j) * x1 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];
            y[i+3UL] += A(i+3UL,j) * x[j];
            y[i+4UL] += A(i+4UL,j) * x[j];
            y[i+5UL] += A(i+5UL,j) * x[j];
            y[i+6UL] += A(i+6UL,j) * x[j];
            y[i+7UL] += A(i+7UL,j) * x[j];

      for( ; (i+4UL) <= M; i+=4UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i ] += sum( A.load(i ,j) * x1 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];
            y[i+3UL] += A(i+3UL,j) * x[j];

      for( ; (i+2UL) <= M; i+=2UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i ] += sum( A.load(i ,j) * x1 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];

         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i : i+1UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i] += sum( A.load(i,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i] += A(i,j) * x[j];

   template< typename VT1
   static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
      selectLargeAssignKernel( y, A, x );

#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   template< typename VT1
   static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
      typedef ElementType_<VT1>  ET;

      if( IsTriangular<MT1>::value ) {
         trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );

      gemv( y, A, x, ET(1), ET(0) );
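      // The BLAS-based kernel maps the expression onto standard BLAS routines. For general
      // matrices, gemv() computes y := alpha*A*x + beta*y; with alpha = ET(1) and beta = ET(0)
      // as above this is a plain y = A*x. For triangular matrices, trmv() performs the product
      // in place on the target vector (which is presumably initialized with x in the lines
      // elided from this excerpt), i.e. conceptually:
      //
      //    \code
      //    // General matrix:    y = 1*A*x + 0*y
      //    // Triangular matrix: y = x;  y = A*y;   // performed in place by trmv()
      //    \endcode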
   template< typename VT1 >
   friend inline void assign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
      const ResultType tmp( serial( rhs ) );
      assign( ~lhs, tmp );

   template< typename VT1 >
   friend inline void addAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {

      LT A( serial( rhs.mat_ ) );
      RT x( serial( rhs.vec_ ) );

      DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );

   template< typename VT1
   static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
      if( ( IsDiagonal<MT1>::value ) ||
          ( IsComputation<MT>::value && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
         selectSmallAddAssignKernel( y, A, x );
         selectBlasAddAssignKernel( y, A, x );

   template< typename VT1
   static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
      y.addAssign( A * x );
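   // The addition assignment kernels implement the compound operation y += A * x. From the
   // user's perspective this path is taken by expressions such as the following (an
   // illustrative sketch using the standard Blaze containers):
   //
   //    \code
   //    blaze::DynamicMatrix<double,blaze::rowMajor> A( 50UL, 50UL, 1.0 );
   //    blaze::DynamicVector<double,blaze::columnVector> x( 50UL, 2.0 ), y( 50UL, 0.0 );
   //
   //    y += A * x;  // Dispatches to selectAddAssignKernel()
   //    \endcode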
   template< typename VT1
   static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
      selectDefaultAddAssignKernel( y, A, x );

   template< typename VT1
   static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );

      for( ; (i+8UL) <= M; i+=8UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;
            xmm3 = xmm3 + A.load(i+2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+3UL,j) * x1;
            xmm5 = xmm5 + A.load(i+4UL,j) * x1;
            xmm6 = xmm6 + A.load(i+5UL,j) * x1;
            xmm7 = xmm7 + A.load(i+6UL,j) * x1;
            xmm8 = xmm8 + A.load(i+7UL,j) * x1;

         y[i ] += sum( xmm1 );
         y[i+1UL] += sum( xmm2 );
         y[i+2UL] += sum( xmm3 );
         y[i+3UL] += sum( xmm4 );
         y[i+4UL] += sum( xmm5 );
         y[i+5UL] += sum( xmm6 );
         y[i+6UL] += sum( xmm7 );
         y[i+7UL] += sum( xmm8 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];
            y[i+3UL] += A(i+3UL,j) * x[j];
            y[i+4UL] += A(i+4UL,j) * x[j];
            y[i+5UL] += A(i+5UL,j) * x[j];
            y[i+6UL] += A(i+6UL,j) * x[j];
            y[i+7UL] += A(i+7UL,j) * x[j];

      for( ; (i+4UL) <= M; i+=4UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         SIMDType xmm1, xmm2, xmm3, xmm4;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;
            xmm3 = xmm3 + A.load(i+2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+3UL,j) * x1;

         y[i ] += sum( xmm1 );
         y[i+1UL] += sum( xmm2 );
         y[i+2UL] += sum( xmm3 );
         y[i+3UL] += sum( xmm4 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];
            y[i+3UL] += A(i+3UL,j) * x[j];

      for( ; (i+3UL) <= M; i+=3UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         SIMDType xmm1, xmm2, xmm3;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;
            xmm3 = xmm3 + A.load(i+2UL,j) * x1;

         y[i ] += sum( xmm1 );
         y[i+1UL] += sum( xmm2 );
         y[i+2UL] += sum( xmm3 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];

      for( ; (i+2UL) <= M; i+=2UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         SIMDType xmm1, xmm2;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;

         y[i ] += sum( xmm1 );
         y[i+1UL] += sum( xmm2 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];

         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i : i+1UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; j<jpos; j+=SIMDSIZE ) {
            xmm1 = xmm1 + A.load(i,j) * x.load(j);

         y[i] += sum( xmm1 );

         for( ; remainder && j<jend; ++j ) {
            y[i] += A(i,j) * x[j];

   template< typename VT1
   static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
      selectDefaultAddAssignKernel( y, A, x );

   template< typename VT1
   static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );

      for( ; (i+8UL) <= M; i+=8UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i ] += sum( A.load(i ,j) * x1 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];
            y[i+3UL] += A(i+3UL,j) * x[j];
            y[i+4UL] += A(i+4UL,j) * x[j];
            y[i+5UL] += A(i+5UL,j) * x[j];
            y[i+6UL] += A(i+6UL,j) * x[j];
            y[i+7UL] += A(i+7UL,j) * x[j];

      for( ; (i+4UL) <= M; i+=4UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i ] += sum( A.load(i ,j) * x1 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];
            y[i+3UL] += A(i+3UL,j) * x[j];

      for( ; (i+2UL) <= M; i+=2UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i ] += sum( A.load(i ,j) * x1 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] += A(i ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];

         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i : i+1UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i] += sum( A.load(i,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i] += A(i,j) * x[j];

   template< typename VT1
   static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
      selectLargeAddAssignKernel( y, A, x );

#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   template< typename VT1
   static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
      typedef ElementType_<VT1>  ET;

      if( IsTriangular<MT1>::value ) {
         ResultType_<VT1> tmp( serial( x ) );
         trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
         addAssign( y, tmp );

      gemv( y, A, x, ET(1), ET(1) );
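      // For the addition assignment the BLAS call uses alpha = ET(1) and beta = ET(1):
      //
      //    \code
      //    // y := alpha*A*x + beta*y   with alpha = 1, beta = 1   =>   y += A*x
      //    \endcode
      //
      // In the triangular case, trmv() computes tmp = A*x on a copy of x, and tmp is then
      // added to the target vector via addAssign().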
   template< typename VT1 >
   friend inline void subAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {

      LT A( serial( rhs.mat_ ) );
      RT x( serial( rhs.vec_ ) );

      DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );

   template< typename VT1
   static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
      if( ( IsDiagonal<MT1>::value ) ||
          ( IsComputation<MT>::value && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
         selectSmallSubAssignKernel( y, A, x );
         selectBlasSubAssignKernel( y, A, x );

   template< typename VT1
   static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
      y.subAssign( A * x );

   template< typename VT1
   static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
      selectDefaultSubAssignKernel( y, A, x );

   template< typename VT1
   static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );

      for( ; (i+8UL) <= M; i+=8UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;
            xmm3 = xmm3 + A.load(i+2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+3UL,j) * x1;
            xmm5 = xmm5 + A.load(i+4UL,j) * x1;
            xmm6 = xmm6 + A.load(i+5UL,j) * x1;
            xmm7 = xmm7 + A.load(i+6UL,j) * x1;
            xmm8 = xmm8 + A.load(i+7UL,j) * x1;

         y[i ] -= sum( xmm1 );
         y[i+1UL] -= sum( xmm2 );
         y[i+2UL] -= sum( xmm3 );
         y[i+3UL] -= sum( xmm4 );
         y[i+4UL] -= sum( xmm5 );
         y[i+5UL] -= sum( xmm6 );
         y[i+6UL] -= sum( xmm7 );
         y[i+7UL] -= sum( xmm8 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] -= A(i ,j) * x[j];
            y[i+1UL] -= A(i+1UL,j) * x[j];
            y[i+2UL] -= A(i+2UL,j) * x[j];
            y[i+3UL] -= A(i+3UL,j) * x[j];
            y[i+4UL] -= A(i+4UL,j) * x[j];
            y[i+5UL] -= A(i+5UL,j) * x[j];
            y[i+6UL] -= A(i+6UL,j) * x[j];
            y[i+7UL] -= A(i+7UL,j) * x[j];

      for( ; (i+4UL) <= M; i+=4UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         SIMDType xmm1, xmm2, xmm3, xmm4;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;
            xmm3 = xmm3 + A.load(i+2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+3UL,j) * x1;

         y[i ] -= sum( xmm1 );
         y[i+1UL] -= sum( xmm2 );
         y[i+2UL] -= sum( xmm3 );
         y[i+3UL] -= sum( xmm4 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] -= A(i ,j) * x[j];
            y[i+1UL] -= A(i+1UL,j) * x[j];
            y[i+2UL] -= A(i+2UL,j) * x[j];
            y[i+3UL] -= A(i+3UL,j) * x[j];

      for( ; (i+3UL) <= M; i+=3UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         SIMDType xmm1, xmm2, xmm3;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;
            xmm3 = xmm3 + A.load(i+2UL,j) * x1;

         y[i ] -= sum( xmm1 );
         y[i+1UL] -= sum( xmm2 );
         y[i+2UL] -= sum( xmm3 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] -= A(i ,j) * x[j];
            y[i+1UL] -= A(i+1UL,j) * x[j];
            y[i+2UL] -= A(i+2UL,j) * x[j];

      for( ; (i+2UL) <= M; i+=2UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         SIMDType xmm1, xmm2;

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            xmm1 = xmm1 + A.load(i ,j) * x1;
            xmm2 = xmm2 + A.load(i+1UL,j) * x1;

         y[i ] -= sum( xmm1 );
         y[i+1UL] -= sum( xmm2 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] -= A(i ,j) * x[j];
            y[i+1UL] -= A(i+1UL,j) * x[j];

         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i : i+1UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; j<jpos; j+=SIMDSIZE ) {
            xmm1 = xmm1 + A.load(i,j) * x.load(j);

         y[i] -= sum( xmm1 );

         for( ; remainder && j<jend; ++j ) {
            y[i] -= A(i,j) * x[j];

   template< typename VT1
   static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
      selectDefaultSubAssignKernel( y, A, x );

   template< typename VT1
   static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
      selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );

      for( ; (i+8UL) <= M; i+=8UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
            y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
            y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
            y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
            y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
            y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
            y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
            y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
            y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
            y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
            y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
            y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
            y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
            y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
            y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i ] -= sum( A.load(i ,j) * x1 );
            y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
            y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
            y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
            y[i+4UL] -= sum( A.load(i+4UL,j) * x1 );
            y[i+5UL] -= sum( A.load(i+5UL,j) * x1 );
            y[i+6UL] -= sum( A.load(i+6UL,j) * x1 );
            y[i+7UL] -= sum( A.load(i+7UL,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] -= A(i ,j) * x[j];
            y[i+1UL] -= A(i+1UL,j) * x[j];
            y[i+2UL] -= A(i+2UL,j) * x[j];
            y[i+3UL] -= A(i+3UL,j) * x[j];
            y[i+4UL] -= A(i+4UL,j) * x[j];
            y[i+5UL] -= A(i+5UL,j) * x[j];
            y[i+6UL] -= A(i+6UL,j) * x[j];
            y[i+7UL] -= A(i+7UL,j) * x[j];

      for( ; (i+4UL) <= M; i+=4UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
            y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
            y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
            y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
            y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
            y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
            y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i ] -= sum( A.load(i ,j) * x1 );
            y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
            y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
            y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] -= A(i ,j) * x[j];
            y[i+1UL] -= A(i+1UL,j) * x[j];
            y[i+2UL] -= A(i+2UL,j) * x[j];
            y[i+3UL] -= A(i+3UL,j) * x[j];

      for( ; (i+2UL) <= M; i+=2UL )
         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
            y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
            y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i ] -= sum( A.load(i ,j) * x1 );
            y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i ] -= A(i ,j) * x[j];
            y[i+1UL] -= A(i+1UL,j) * x[j];

         const size_t jbegin( ( IsUpper<MT1>::value )
                              ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
         const size_t jend( ( IsLower<MT1>::value )
                            ?( IsStrictlyLower<MT1>::value ? i : i+1UL )

         const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
         BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i] -= sum( A.load(i,j) * x1 );

         for( ; remainder && j<jend; ++j ) {
            y[i] -= A(i,j) * x[j];

   template< typename VT1
   static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
      selectLargeSubAssignKernel( y, A, x );

#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   template< typename VT1
   static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
      typedef ElementType_<VT1>  ET;

      if( IsTriangular<MT1>::value ) {
         ResultType_<VT1> tmp( serial( x ) );
         trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
         subAssign( y, tmp );

      gemv( y, A, x, ET(-1), ET(1) );
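      // For the subtraction assignment the BLAS call uses alpha = ET(-1) and beta = ET(1):
      //
      //    \code
      //    // y := alpha*A*x + beta*y   with alpha = -1, beta = 1   =>   y -= A*x
      //    \endcode
      //
      // In the triangular case, trmv() computes tmp = A*x on a copy of x, and tmp is then
      // subtracted from the target vector via subAssign().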
   template< typename VT1 >
   friend inline void multAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
      const ResultType tmp( serial( rhs ) );
      multAssign( ~lhs, tmp );

   template< typename VT1 >
   friend inline void divAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
      const ResultType tmp( serial( rhs ) );
      divAssign( ~lhs, tmp );

   template< typename VT1 >
   friend inline EnableIf_< UseSMPAssign<VT1> >
      if( rhs.mat_.rows() == 0UL ) {
      else if( rhs.mat_.columns() == 0UL ) {

   template< typename VT1 >
   friend inline EnableIf_< UseSMPAssign<VT1> >
      const ResultType tmp( rhs );

   template< typename VT1 >
   friend inline EnableIf_< UseSMPAssign<VT1> >
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {

   template< typename VT1 >
   friend inline EnableIf_< UseSMPAssign<VT1> >
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {

   template< typename VT1 >
   friend inline EnableIf_< UseSMPAssign<VT1> >
      const ResultType tmp( rhs );

   template< typename VT1 >
   friend inline EnableIf_< UseSMPAssign<VT1> >
      const ResultType tmp( rhs );

template< typename MT
   : public DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false >
   , private VecScalarMultExpr
   , private Computation

   typedef DMatDVecMultExpr<MT,VT>  MVM;
   typedef ResultType_<MVM>         RES;
   typedef ResultType_<MT>          MRT;
   typedef ResultType_<VT>          VRT;
   typedef ElementType_<MRT>        MET;
   typedef ElementType_<VRT>        VET;
   typedef CompositeType_<MT>       MCT;
   typedef CompositeType_<VT>       VCT;

   enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
                                    IsBLASCompatible<MET>::value ) || RequiresEvaluation<MT>::value };

   enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<MT>::value };

   template< typename T1 >
   struct UseSMPAssign {
      enum : bool { value = ( evaluateMatrix || evaluateVector ) };

   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseBlasKernel {
      HasMutableDataAccess<T1>::value &&
      HasConstDataAccess<T2>::value &&
      HasConstDataAccess<T3>::value &&
      !IsDiagonal<T2>::value &&
      T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
      IsBLASCompatible< ElementType_<T1> >::value &&
      IsBLASCompatible< ElementType_<T2> >::value &&
      IsBLASCompatible< ElementType_<T3> >::value &&
      IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
      IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
      !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };

   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseVectorizedDefaultKernel {
      !IsDiagonal<T2>::value &&
      T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
      AreSIMDCombinable< ElementType_<T1>
      HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
      HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };

   typedef DVecScalarMultExpr<MVM,ST,false>  This;
   typedef SIMDTrait_<ElementType>           SIMDType;
   typedef const DMatDVecMultExpr<MT,VT>     LeftOperand;
   typedef IfTrue_< evaluateMatrix, const MRT, MCT >  LT;
   typedef IfTrue_< evaluateVector, const VRT, VCT >  RT;

   enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
                               MT::simdEnabled && VT::simdEnabled &&
                               AreSIMDCombinable<MET,VET,ST>::value &&
                               HasSIMDAdd<MET,VET>::value &&
                               HasSIMDMult<MET,VET>::value };

   enum : bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
                                 !evaluateVector && VT::smpAssignable };

   explicit inline DVecScalarMultExpr( const MVM& vector, ST scalar )
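   // This specialization of DVecScalarMultExpr represents a scaled matrix/vector multiplication
   // of the form s * ( A * x ). A minimal usage sketch (assuming the standard Blaze containers;
   // illustrative only, not part of this header):
   //
   //    \code
   //    blaze::DynamicMatrix<double,blaze::rowMajor> A( 20UL, 20UL, 1.0 );
   //    blaze::DynamicVector<double,blaze::columnVector> x( 20UL, 1.0 ), y;
   //
   //    y = 2.0 * ( A * x );  // Evaluated via this expression class: the scalar is folded
   //                          // into the matrix/vector multiplication kernels below.
   //    \endcode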
2864 inline ReturnType
operator[](
size_t index )
const {
2866 return vector_[index] * scalar_;
2877 inline ReturnType
at(
size_t index )
const {
2878 if( index >= vector_.size() ) {
2881 return (*
this)[index];
2890 inline size_t size()
const {
2891 return vector_.size();
2921 template<
typename T >
2922 inline bool canAlias(
const T* alias )
const {
2923 return vector_.canAlias( alias );
2933 template<
typename T >
2934 inline bool isAliased(
const T* alias )
const {
2935 return vector_.isAliased( alias );
2945 return vector_.isAligned();
2955 LeftOperand_<MVM> A( vector_.leftOperand() );
2957 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2958 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2959 (
size() > SMP_DMATDVECMULT_THRESHOLD );
2965 LeftOperand vector_;
2966 RightOperand scalar_;
2981 template<
typename VT1 >
2982 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2988 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
2989 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
2991 if( left.rows() == 0UL ) {
2994 else if( left.columns() == 0UL ) {
3007 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
3022 template<
typename VT1
3026 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3028 if( ( IsDiagonal<MT1>::value ) ||
3029 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3030 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3031 selectSmallAssignKernel( y, A, x, scalar );
3033 selectBlasAssignKernel( y, A, x, scalar );
3051 template<
typename VT1
3055 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3056 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3058 y.assign( A * x * scalar );
3076 template<
typename VT1
3080 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3081 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3083 selectDefaultAssignKernel( y, A, x, scalar );
3101 template<
typename VT1
3105 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3106 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3108 const size_t M( A.rows() );
3109 const size_t N( A.columns() );
3111 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3115 for( ; (i+8UL) <= M; i+=8UL )
3117 const size_t jbegin( ( IsUpper<MT1>::value )
3118 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
3120 const size_t jend( ( IsLower<MT1>::value )
3121 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3125 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3126 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3128 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3131 for( ; j<jpos; j+=SIMDSIZE ) {
3132 const SIMDType x1( x.load(j) );
3133 xmm1 = xmm1 + A.load(i ,j) * x1;
3134 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3135 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3136 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3137 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
3138 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
3139 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
3140 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
3143 y[i ] =
sum( xmm1 ) * scalar;
3144 y[i+1UL] =
sum( xmm2 ) * scalar;
3145 y[i+2UL] =
sum( xmm3 ) * scalar;
3146 y[i+3UL] =
sum( xmm4 ) * scalar;
3147 y[i+4UL] =
sum( xmm5 ) * scalar;
3148 y[i+5UL] =
sum( xmm6 ) * scalar;
3149 y[i+6UL] =
sum( xmm7 ) * scalar;
3150 y[i+7UL] =
sum( xmm8 ) * scalar;
3152 for( ; remainder && j<jend; ++j ) {
3153 y[i ] += A(i ,j) * x[j] * scalar;
3154 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3155 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3156 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3157 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3158 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3159 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3160 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3164 for( ; (i+4UL) <= M; i+=4UL )
3166 const size_t jbegin( ( IsUpper<MT1>::value )
3167 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
3169 const size_t jend( ( IsLower<MT1>::value )
3170 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3174 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3175 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3177 SIMDType xmm1, xmm2, xmm3, xmm4;
3180 for( ; j<jpos; j+=SIMDSIZE ) {
3181 const SIMDType x1( x.load(j) );
3182 xmm1 = xmm1 + A.load(i ,j) * x1;
3183 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3184 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3185 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3188 y[i ] =
sum( xmm1 ) * scalar;
3189 y[i+1UL] =
sum( xmm2 ) * scalar;
3190 y[i+2UL] =
sum( xmm3 ) * scalar;
3191 y[i+3UL] =
sum( xmm4 ) * scalar;
3193 for( ; remainder && j<jend; ++j ) {
3194 y[i ] += A(i ,j) * x[j] * scalar;
3195 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3196 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3197 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3201 for( ; (i+3UL) <= M; i+=3UL )
3203 const size_t jbegin( ( IsUpper<MT1>::value )
3204 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
3206 const size_t jend( ( IsLower<MT1>::value )
3207 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3211 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3212 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3214 SIMDType xmm1, xmm2, xmm3;
3217 for( ; j<jpos; j+=SIMDSIZE ) {
3218 const SIMDType x1( x.load(j) );
3219 xmm1 = xmm1 + A.load(i ,j) * x1;
3220 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3221 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3224 y[i ] =
sum( xmm1 ) * scalar;
3225 y[i+1UL] =
sum( xmm2 ) * scalar;
3226 y[i+2UL] =
sum( xmm3 ) * scalar;
3228 for( ; remainder && j<jend; ++j ) {
3229 y[i ] += A(i ,j) * x[j] * scalar;
3230 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3231 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3235 for( ; (i+2UL) <= M; i+=2UL )
3237 const size_t jbegin( ( IsUpper<MT1>::value )
3238 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
3240 const size_t jend( ( IsLower<MT1>::value )
3241 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3245 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3246 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3248 SIMDType xmm1, xmm2;
3251 for( ; j<jpos; j+=SIMDSIZE ) {
3252 const SIMDType x1( x.load(j) );
3253 xmm1 = xmm1 + A.load(i ,j) * x1;
3254 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3257 y[i ] = sum( xmm1 ) * scalar;
3258 y[i+1UL] = sum( xmm2 ) * scalar;
3260 for( ; remainder && j<jend; ++j ) {
3261 y[i ] += A(i ,j) * x[j] * scalar;
3262 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3268 const size_t jbegin( ( IsUpper<MT1>::value )
3269 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3271 const size_t jend( ( IsLower<MT1>::value )
3272 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3276 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3277 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3282 for( ; j<jpos; j+=SIMDSIZE ) {
3283 xmm1 = xmm1 + A.load(i,j) * x.load(j);
3286 y[i] = sum( xmm1 ) * scalar;
3288 for( ; remainder && j<jend; ++j ) {
3289 y[i] += A(i,j) * x[j] * scalar;
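// Side note: disregarding the unrolling and SIMD details, every branch of the kernel above
// computes the same result as this scalar reference loop for y = ( A * x ) * scalar (up to the
// usual floating-point reordering):
//
//    for( size_t i=0UL; i<A.rows(); ++i ) {
//       ElementType_<VT1> acc{};
//       for( size_t j=0UL; j<A.columns(); ++j )
//          acc += A(i,j) * x[j];
//       y[i] = acc * scalar;
//    }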
3309 template< typename VT1
3313 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3314 selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3316 selectDefaultAssignKernel( y, A, x, scalar );
3334 template< typename VT1
3338 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3339 selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3341 const size_t M( A.rows() );
3342 const size_t N( A.columns() );
3344 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3350 for( ; (i+8UL) <= M; i+=8UL )
3352 const size_t jbegin( ( IsUpper<MT1>::value )
3353 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3355 const size_t jend( ( IsLower<MT1>::value )
3356 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3360 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3361 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3365 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3366 const size_t j1( j+SIMDSIZE );
3367 const size_t j2( j+SIMDSIZE*2UL );
3368 const size_t j3( j+SIMDSIZE*3UL );
3369 const SIMDType x1( x.load(j ) );
3370 const SIMDType x2( x.load(j1) );
3371 const SIMDType x3( x.load(j2) );
3372 const SIMDType x4( x.load(j3) );
3373 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3374 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3375 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3376 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3377 y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3378 y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3379 y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3380 y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
3383 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3384 const size_t j1( j+SIMDSIZE );
3385 const SIMDType x1( x.load(j ) );
3386 const SIMDType x2( x.load(j1) );
3387 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3388 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3389 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3390 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3391 y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3392 y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3393 y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3394 y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
3397 for( ; j<jpos; j+=SIMDSIZE ) {
3398 const SIMDType x1( x.load(j) );
3399 y[i ] += sum( A.load(i ,j) * x1 );
3400 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3401 y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3402 y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3403 y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
3404 y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
3405 y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
3406 y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
3409 for( ; remainder && j<jend; ++j ) {
3410 y[i ] += A(i ,j) * x[j];
3411 y[i+1UL] += A(i+1UL,j) * x[j];
3412 y[i+2UL] += A(i+2UL,j) * x[j];
3413 y[i+3UL] += A(i+3UL,j) * x[j];
3414 y[i+4UL] += A(i+4UL,j) * x[j];
3415 y[i+5UL] += A(i+5UL,j) * x[j];
3416 y[i+6UL] += A(i+6UL,j) * x[j];
3417 y[i+7UL] += A(i+7UL,j) * x[j];
3430 for( ; (i+4UL) <= M; i+=4UL )
3432 const size_t jbegin( ( IsUpper<MT1>::value )
3433 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3435 const size_t jend( ( IsLower<MT1>::value )
3436 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3440 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3441 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3445 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3446 const size_t j1( j+SIMDSIZE );
3447 const size_t j2( j+SIMDSIZE*2UL );
3448 const size_t j3( j+SIMDSIZE*3UL );
3449 const SIMDType x1( x.load(j ) );
3450 const SIMDType x2( x.load(j1) );
3451 const SIMDType x3( x.load(j2) );
3452 const SIMDType x4( x.load(j3) );
3453 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3454 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3455 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3456 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3459 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3460 const size_t j1( j+SIMDSIZE );
3461 const SIMDType x1( x.load(j ) );
3462 const SIMDType x2( x.load(j1) );
3463 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3464 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3465 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3466 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3469 for( ; j<jpos; j+=SIMDSIZE ) {
3470 const SIMDType x1( x.load(j) );
3471 y[i ] += sum( A.load(i ,j) * x1 );
3472 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3473 y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3474 y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3477 for( ; remainder && j<jend; ++j ) {
3478 y[i ] += A(i ,j) * x[j];
3479 y[i+1UL] += A(i+1UL,j) * x[j];
3480 y[i+2UL] += A(i+2UL,j) * x[j];
3481 y[i+3UL] += A(i+3UL,j) * x[j];
3490 for( ; (i+2UL) <= M; i+=2UL )
3492 const size_t jbegin( ( IsUpper<MT1>::value )
3493 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3495 const size_t jend( ( IsLower<MT1>::value )
3496 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3500 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3501 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3505 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3506 const size_t j1( j+SIMDSIZE );
3507 const size_t j2( j+SIMDSIZE*2UL );
3508 const size_t j3( j+SIMDSIZE*3UL );
3509 const SIMDType x1( x.load(j ) );
3510 const SIMDType x2( x.load(j1) );
3511 const SIMDType x3( x.load(j2) );
3512 const SIMDType x4( x.load(j3) );
3513 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3514 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3517 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3518 const size_t j1( j+SIMDSIZE );
3519 const SIMDType x1( x.load(j ) );
3520 const SIMDType x2( x.load(j1) );
3521 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3522 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3525 for( ; j<jpos; j+=SIMDSIZE ) {
3526 const SIMDType x1( x.load(j) );
3527 y[i ] += sum( A.load(i ,j) * x1 );
3528 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3531 for( ; remainder && j<jend; ++j ) {
3532 y[i ] += A(i ,j) * x[j];
3533 y[i+1UL] += A(i+1UL,j) * x[j];
3542 const size_t jbegin( ( IsUpper<MT1>::value )
3543 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3545 const size_t jend( ( IsLower<MT1>::value )
3546 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3550 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3551 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3555 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3556 const size_t j1( j+SIMDSIZE );
3557 const size_t j2( j+SIMDSIZE*2UL );
3558 const size_t j3( j+SIMDSIZE*3UL );
3559 const SIMDType x1( x.load(j ) );
3560 const SIMDType x2( x.load(j1) );
3561 const SIMDType x3( x.load(j2) );
3562 const SIMDType x4( x.load(j3) );
3563 y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3566 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3567 const size_t j1( j+SIMDSIZE );
3568 const SIMDType x1( x.load(j ) );
3569 const SIMDType x2( x.load(j1) );
3570 y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3573 for( ; j<jpos; j+=SIMDSIZE ) {
3574 const SIMDType x1( x.load(j) );
3575 y[i] += sum( A.load(i,j) * x1 );
3578 for( ; remainder && j<jend; ++j ) {
3579 y[i] += A(i,j) * x[j];
3601 template< typename VT1
3605 static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3606 selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3608 selectLargeAssignKernel( y, A, x, scalar );
3613 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3627 template< typename VT1
3631 static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3632 selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3634 typedef ElementType_<VT1> ET;
3636 if( IsTriangular<MT1>::value ) {
3637 assign( y, scalar * x );
3638 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3641 gemv( y, A, x, ET(scalar), ET(0) );
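// Side note: for a row-major matrix of doubles the gemv() call above corresponds roughly to the
// following raw CBLAS call (a sketch only; the wrapper also covers float and complex element
// types, column-major storage, and the proper leading dimension 'lda'):
//
//    // y = scalar * ( A * x ) + 0 * y
//    cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, scalar,
//                 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
//
// The triangular branch cannot fold the scaling into trmv(), which is why it first assigns
// scalar*x to y and then multiplies by A in place.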
3659 template< typename VT1 >
3660 friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3670 const ResultType tmp( serial( rhs ) );
3671 assign( ~lhs, tmp );
3687 template< typename VT1 >
3688 friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3694 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
3695 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
3697 if( left.rows() == 0UL || left.columns() == 0UL ) {
3709 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
3724 template< typename VT1
3728 static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3730 if( ( IsDiagonal<MT1>::value ) ||
3731 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3732 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3733 selectSmallAddAssignKernel( y, A, x, scalar );
3735 selectBlasAddAssignKernel( y, A, x, scalar );
3753 template< typename VT1
3757 static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3759 y.addAssign( A * x * scalar );
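// Side note: the default kernel above simply re-enters the expression machinery; it behaves like
// the plain statement
//
//    y += A * x * scalar;
//
// while the small/large kernels that follow spell the same update out manually with SIMD loads
// and horizontal sums.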
3777 template< typename VT1
3781 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3782 selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3784 selectDefaultAddAssignKernel( y, A, x, scalar );
3802 template< typename VT1
3806 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3807 selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3809 const size_t M( A.rows() );
3810 const size_t N( A.columns() );
3812 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3816 for( ; (i+8UL) <= M; i+=8UL )
3818 const size_t jbegin( ( IsUpper<MT1>::value )
3819 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
3821 const size_t jend( ( IsLower<MT1>::value )
3822 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3826 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3827 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3829 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3832 for( ; j<jpos; j+=SIMDSIZE ) {
3833 const SIMDType x1( x.load(j) );
3834 xmm1 = xmm1 + A.load(i ,j) * x1;
3835 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3836 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3837 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3838 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
3839 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
3840 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
3841 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
3844 y[i ] +=
sum( xmm1 ) * scalar;
3845 y[i+1UL] +=
sum( xmm2 ) * scalar;
3846 y[i+2UL] +=
sum( xmm3 ) * scalar;
3847 y[i+3UL] +=
sum( xmm4 ) * scalar;
3848 y[i+4UL] +=
sum( xmm5 ) * scalar;
3849 y[i+5UL] +=
sum( xmm6 ) * scalar;
3850 y[i+6UL] +=
sum( xmm7 ) * scalar;
3851 y[i+7UL] +=
sum( xmm8 ) * scalar;
3853 for( ; remainder && j<jend; ++j ) {
3854 y[i ] += A(i ,j) * x[j] * scalar;
3855 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3856 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3857 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3858 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3859 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3860 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3861 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3865 for( ; (i+4UL) <= M; i+=4UL )
3867 const size_t jbegin( ( IsUpper<MT1>::value )
3868 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
3870 const size_t jend( ( IsLower<MT1>::value )
3871 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3875 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3876 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3878 SIMDType xmm1, xmm2, xmm3, xmm4;
3881 for( ; j<jpos; j+=SIMDSIZE ) {
3882 const SIMDType x1( x.load(j) );
3883 xmm1 = xmm1 + A.load(i ,j) * x1;
3884 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3885 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3886 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3889 y[i ] +=
sum( xmm1 ) * scalar;
3890 y[i+1UL] +=
sum( xmm2 ) * scalar;
3891 y[i+2UL] +=
sum( xmm3 ) * scalar;
3892 y[i+3UL] +=
sum( xmm4 ) * scalar;
3894 for( ; remainder && j<jend; ++j ) {
3895 y[i ] += A(i ,j) * x[j] * scalar;
3896 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3897 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3898 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3902 for( ; (i+3UL) <= M; i+=3UL )
3904 const size_t jbegin( ( IsUpper<MT1>::value )
3905 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
3907 const size_t jend( ( IsLower<MT1>::value )
3908 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3912 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3913 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3915 SIMDType xmm1, xmm2, xmm3;
3918 for( ; j<jpos; j+=SIMDSIZE ) {
3919 const SIMDType x1( x.load(j) );
3920 xmm1 = xmm1 + A.load(i ,j) * x1;
3921 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3922 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3925 y[i ] +=
sum( xmm1 ) * scalar;
3926 y[i+1UL] +=
sum( xmm2 ) * scalar;
3927 y[i+2UL] +=
sum( xmm3 ) * scalar;
3929 for( ; remainder && j<jend; ++j ) {
3930 y[i ] += A(i ,j) * x[j] * scalar;
3931 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3932 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3936 for( ; (i+2UL) <= M; i+=2UL )
3938 const size_t jbegin( ( IsUpper<MT1>::value )
3939 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
3941 const size_t jend( ( IsLower<MT1>::value )
3942 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3946 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3947 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3949 SIMDType xmm1, xmm2;
3952 for( ; j<jpos; j+=SIMDSIZE ) {
3953 const SIMDType x1( x.load(j) );
3954 xmm1 = xmm1 + A.load(i ,j) * x1;
3955 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3958 y[i ] +=
sum( xmm1 ) * scalar;
3959 y[i+1UL] +=
sum( xmm2 ) * scalar;
3961 for( ; remainder && j<jend; ++j ) {
3962 y[i ] += A(i ,j) * x[j] * scalar;
3963 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3969 const size_t jbegin( ( IsUpper<MT1>::value )
3970 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
3972 const size_t jend( ( IsLower<MT1>::value )
3973 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3977 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3978 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
3983 for( ; j<jpos; j+=SIMDSIZE ) {
3984 xmm1 = xmm1 + A.load(i,j) * x.load(j);
3987 y[i] +=
sum( xmm1 ) * scalar;
3989 for( ; remainder && j<jend; ++j ) {
3990 y[i] += A(i,j) * x[j] * scalar;
4010 template< typename VT1
4014 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4015 selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4017 selectDefaultAddAssignKernel( y, A, x, scalar );
4035 template< typename VT1
4039 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4040 selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4042 const size_t M( A.rows() );
4043 const size_t N( A.columns() );
4045 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4049 for( ; (i+8UL) <= M; i+=8UL )
4051 const size_t jbegin( ( IsUpper<MT1>::value )
4052 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4054 const size_t jend( ( IsLower<MT1>::value )
4055 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4059 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4060 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4064 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4065 const size_t j1( j+SIMDSIZE );
4066 const size_t j2( j+SIMDSIZE*2UL );
4067 const size_t j3( j+SIMDSIZE*3UL );
4068 const SIMDType x1( x.load(j ) );
4069 const SIMDType x2( x.load(j1) );
4070 const SIMDType x3( x.load(j2) );
4071 const SIMDType x4( x.load(j3) );
4072 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4073 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4074 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4075 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4076 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4077 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4078 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4079 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4082 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4083 const size_t j1( j+SIMDSIZE );
4084 const SIMDType x1( x.load(j ) );
4085 const SIMDType x2( x.load(j1) );
4086 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4087 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4088 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4089 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4090 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4091 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4092 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4093 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4096 for( ; j<jpos; j+=SIMDSIZE ) {
4097 const SIMDType x1( x.load(j) );
4098 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4099 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4100 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4101 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4102 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4103 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4104 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4105 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 ) * scalar;
4108 for( ; remainder && j<jend; ++j ) {
4109 y[i ] += A(i ,j) * x[j] * scalar;
4110 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4111 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4112 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4113 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4114 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4115 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4116 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
4120 for( ; (i+4UL) <= M; i+=4UL )
4122 const size_t jbegin( ( IsUpper<MT1>::value )
4123 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4125 const size_t jend( ( IsLower<MT1>::value )
4126 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4130 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4131 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4135 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4136 const size_t j1( j+SIMDSIZE );
4137 const size_t j2( j+SIMDSIZE*2UL );
4138 const size_t j3( j+SIMDSIZE*3UL );
4139 const SIMDType x1( x.load(j ) );
4140 const SIMDType x2( x.load(j1) );
4141 const SIMDType x3( x.load(j2) );
4142 const SIMDType x4( x.load(j3) );
4143 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4144 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4145 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4146 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4149 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4150 const size_t j1( j+SIMDSIZE );
4151 const SIMDType x1( x.load(j ) );
4152 const SIMDType x2( x.load(j1) );
4153 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4154 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4155 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4156 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4159 for( ; j<jpos; j+=SIMDSIZE ) {
4160 const SIMDType x1( x.load(j) );
4161 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4162 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4163 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4164 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4167 for( ; remainder && j<jend; ++j ) {
4168 y[i ] += A(i ,j) * x[j] * scalar;
4169 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4170 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4171 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4175 for( ; (i+2UL) <= M; i+=2UL )
4177 const size_t jbegin( ( IsUpper<MT1>::value )
4178 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4180 const size_t jend( ( IsLower<MT1>::value )
4181 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4185 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4186 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4190 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4191 const size_t j1( j+SIMDSIZE );
4192 const size_t j2( j+SIMDSIZE*2UL );
4193 const size_t j3( j+SIMDSIZE*3UL );
4194 const SIMDType x1( x.load(j ) );
4195 const SIMDType x2( x.load(j1) );
4196 const SIMDType x3( x.load(j2) );
4197 const SIMDType x4( x.load(j3) );
4198 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4199 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4202 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4203 const size_t j1( j+SIMDSIZE );
4204 const SIMDType x1( x.load(j ) );
4205 const SIMDType x2( x.load(j1) );
4206 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4207 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4210 for( ; j<jpos; j+=SIMDSIZE ) {
4211 const SIMDType x1( x.load(j) );
4212 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4213 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4216 for( ; remainder && j<jend; ++j ) {
4217 y[i ] += A(i ,j) * x[j] * scalar;
4218 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4224 const size_t jbegin( ( IsUpper<MT1>::value )
4225 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4227 const size_t jend( ( IsLower<MT1>::value )
4228 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4232 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4233 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4237 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4238 const size_t j1( j+SIMDSIZE );
4239 const size_t j2( j+SIMDSIZE*2UL );
4240 const size_t j3( j+SIMDSIZE*3UL );
4241 const SIMDType x1( x.load(j ) );
4242 const SIMDType x2( x.load(j1) );
4243 const SIMDType x3( x.load(j2) );
4244 const SIMDType x4( x.load(j3) );
4245 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4248 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4249 const size_t j1( j+SIMDSIZE );
4250 const SIMDType x1( x.load(j ) );
4251 const SIMDType x2( x.load(j1) );
4252 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4255 for( ; j<jpos; j+=SIMDSIZE ) {
4256 const SIMDType x1( x.load(j) );
4257 y[i] +=
sum( A.load(i,j) * x1 ) * scalar;
4260 for( ; remainder && j<jend; ++j ) {
4261 y[i] += A(i,j) * x[j] * scalar;
4281 template< typename VT1
4285 static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4286 selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4288 selectLargeAddAssignKernel( y, A, x, scalar );
4293 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4307 template< typename VT1
4311 static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4312 selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4314 typedef ElementType_<VT1> ET;
4316 if( IsTriangular<MT1>::value ) {
4317 ResultType_<VT1> tmp( serial( scalar * x ) );
4318 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4319 addAssign( y, tmp );
4322 gemv( y, A, x, ET(scalar), ET(1) );
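// Side note: BLAS trmv() computes x <- A*x strictly in place and has no accumulating form, which
// is why the triangular branch above needs the temporary; conceptually the three steps are
//
//    ResultType_<VT1> tmp( scalar * x );   // tmp  = scalar * x
//    trmv( tmp, A, ... );                  // tmp  = A * tmp
//    addAssign( y, tmp );                  // y   += tmp
//
// whereas the general branch folds the accumulation directly into gemv() via beta = 1.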
4344 template< typename VT1 >
4345 friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4351 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
4352 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
4354 if( left.rows() == 0UL || left.columns() == 0UL ) {
4366 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
4381 template< typename VT1
4385 static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4387 if( ( IsDiagonal<MT1>::value ) ||
4388 ( IsComputation<MT>::value && !evaluateMatrix ) ||
4389 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4390 selectSmallSubAssignKernel( y, A, x, scalar );
4392 selectBlasSubAssignKernel( y, A, x, scalar );
4410 template< typename VT1
4414 static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4416 y.subAssign( A * x * scalar );
4434 template< typename VT1
4438 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4439 selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4441 selectDefaultSubAssignKernel( y, A, x, scalar );
4459 template< typename VT1
4463 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4464 selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4466 const size_t M( A.rows() );
4467 const size_t N( A.columns() );
4469 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4473 for( ; (i+8UL) <= M; i+=8UL )
4475 const size_t jbegin( ( IsUpper<MT1>::value )
4476 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4478 const size_t jend( ( IsLower<MT1>::value )
4479 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4483 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4484 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4486 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4489 for( ; j<jpos; j+=SIMDSIZE ) {
4490 const SIMDType x1( x.load(j) );
4491 xmm1 = xmm1 + A.load(i ,j) * x1;
4492 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4493 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4494 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
4495 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
4496 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
4497 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
4498 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
4501 y[i ] -=
sum( xmm1 ) * scalar;
4502 y[i+1UL] -=
sum( xmm2 ) * scalar;
4503 y[i+2UL] -=
sum( xmm3 ) * scalar;
4504 y[i+3UL] -=
sum( xmm4 ) * scalar;
4505 y[i+4UL] -=
sum( xmm5 ) * scalar;
4506 y[i+5UL] -=
sum( xmm6 ) * scalar;
4507 y[i+6UL] -=
sum( xmm7 ) * scalar;
4508 y[i+7UL] -=
sum( xmm8 ) * scalar;
4510 for( ; remainder && j<jend; ++j ) {
4511 y[i ] -= A(i ,j) * x[j] * scalar;
4512 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4513 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4514 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4515 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4516 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4517 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4518 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4522 for( ; (i+4UL) <= M; i+=4UL )
4524 const size_t jbegin( ( IsUpper<MT1>::value )
4525 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4527 const size_t jend( ( IsLower<MT1>::value )
4528 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4532 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4533 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4535 SIMDType xmm1, xmm2, xmm3, xmm4;
4538 for( ; j<jpos; j+=SIMDSIZE ) {
4539 const SIMDType x1( x.load(j) );
4540 xmm1 = xmm1 + A.load(i ,j) * x1;
4541 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4542 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4543 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
4546 y[i ] -=
sum( xmm1 ) * scalar;
4547 y[i+1UL] -=
sum( xmm2 ) * scalar;
4548 y[i+2UL] -=
sum( xmm3 ) * scalar;
4549 y[i+3UL] -=
sum( xmm4 ) * scalar;
4551 for( ; remainder && j<jend; ++j ) {
4552 y[i ] -= A(i ,j) * x[j] * scalar;
4553 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4554 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4555 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4559 for( ; (i+3UL) <= M; i+=3UL )
4561 const size_t jbegin( ( IsUpper<MT1>::value )
4562 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4564 const size_t jend( ( IsLower<MT1>::value )
4565 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
4569 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4570 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4572 SIMDType xmm1, xmm2, xmm3;
4575 for( ; j<jpos; j+=SIMDSIZE ) {
4576 const SIMDType x1( x.load(j) );
4577 xmm1 = xmm1 + A.load(i ,j) * x1;
4578 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4579 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4582 y[i ] -=
sum( xmm1 ) * scalar;
4583 y[i+1UL] -=
sum( xmm2 ) * scalar;
4584 y[i+2UL] -=
sum( xmm3 ) * scalar;
4586 for( ; remainder && j<jend; ++j ) {
4587 y[i ] -= A(i ,j) * x[j] * scalar;
4588 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4589 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4593 for( ; (i+2UL) <= M; i+=2UL )
4595 const size_t jbegin( ( IsUpper<MT1>::value )
4596 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4598 const size_t jend( ( IsLower<MT1>::value )
4599 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4603 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4604 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4606 SIMDType xmm1, xmm2;
4609 for( ; j<jpos; j+=SIMDSIZE ) {
4610 const SIMDType x1( x.load(j) );
4611 xmm1 = xmm1 + A.load(i ,j) * x1;
4612 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4615 y[i ] -=
sum( xmm1 ) * scalar;
4616 y[i+1UL] -=
sum( xmm2 ) * scalar;
4618 for( ; remainder && j<jend; ++j ) {
4619 y[i ] -= A(i ,j) * x[j] * scalar;
4620 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4626 const size_t jbegin( ( IsUpper<MT1>::value )
4627 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4629 const size_t jend( ( IsLower<MT1>::value )
4630 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4634 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4635 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4640 for( ; j<jpos; j+=SIMDSIZE ) {
4641 xmm1 = xmm1 + A.load(i,j) * x.load(j);
4644 y[i] -=
sum( xmm1 ) * scalar;
4646 for( ; remainder && j<jend; ++j ) {
4647 y[i] -= A(i,j) * x[j] * scalar;
4667 template< typename VT1
4671 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4672 selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4674 selectDefaultSubAssignKernel( y, A, x, scalar );
4692 template< typename VT1
4696 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4697 selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4699 const size_t M( A.rows() );
4700 const size_t N( A.columns() );
4702 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4706 for( ; (i+8UL) <= M; i+=8UL )
4708 const size_t jbegin( ( IsUpper<MT1>::value )
4709 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4711 const size_t jend( ( IsLower<MT1>::value )
4712 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4716 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4717 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4721 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4722 const size_t j1( j+SIMDSIZE );
4723 const size_t j2( j+SIMDSIZE*2UL );
4724 const size_t j3( j+SIMDSIZE*3UL );
4725 const SIMDType x1( x.load(j ) );
4726 const SIMDType x2( x.load(j1) );
4727 const SIMDType x3( x.load(j2) );
4728 const SIMDType x4( x.load(j3) );
4729 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4730 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4731 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4732 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4733 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4734 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4735 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4736 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4739 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4740 const size_t j1( j+SIMDSIZE );
4741 const SIMDType x1( x.load(j ) );
4742 const SIMDType x2( x.load(j1) );
4743 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4744 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4745 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4746 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4747 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4748 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4749 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4750 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4753 for( ; j<jpos; j+=SIMDSIZE ) {
4754 const SIMDType x1( x.load(j) );
4755 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4756 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4757 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4758 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4759 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4760 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4761 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4762 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 ) * scalar;
4765 for( ; remainder && j<jend; ++j ) {
4766 y[i ] -= A(i ,j) * x[j] * scalar;
4767 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4768 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4769 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4770 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4771 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4772 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4773 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4777 for( ; (i+4UL) <= M; i+=4UL )
4779 const size_t jbegin( ( IsUpper<MT1>::value )
4780 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4782 const size_t jend( ( IsLower<MT1>::value )
4783 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4787 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4788 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4792 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4793 const size_t j1( j+SIMDSIZE );
4794 const size_t j2( j+SIMDSIZE*2UL );
4795 const size_t j3( j+SIMDSIZE*3UL );
4796 const SIMDType x1( x.load(j ) );
4797 const SIMDType x2( x.load(j1) );
4798 const SIMDType x3( x.load(j2) );
4799 const SIMDType x4( x.load(j3) );
4800 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4801 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4802 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4803 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4806 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4807 const size_t j1( j+SIMDSIZE );
4808 const SIMDType x1( x.load(j ) );
4809 const SIMDType x2( x.load(j1) );
4810 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4811 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4812 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4813 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4816 for( ; j<jpos; j+=SIMDSIZE ) {
4817 const SIMDType x1( x.load(j) );
4818 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4819 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4820 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4821 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4824 for( ; remainder && j<jend; ++j ) {
4825 y[i ] -= A(i ,j) * x[j] * scalar;
4826 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4827 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4828 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4832 for( ; (i+2UL) <= M; i+=2UL )
4834 const size_t jbegin( ( IsUpper<MT1>::value )
4835 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4837 const size_t jend( ( IsLower<MT1>::value )
4838 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4842 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4843 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4847 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4848 const size_t j1( j+SIMDSIZE );
4849 const size_t j2( j+SIMDSIZE*2UL );
4850 const size_t j3( j+SIMDSIZE*3UL );
4851 const SIMDType x1( x.load(j ) );
4852 const SIMDType x2( x.load(j1) );
4853 const SIMDType x3( x.load(j2) );
4854 const SIMDType x4( x.load(j3) );
4855 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4856 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4859 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4860 const size_t j1( j+SIMDSIZE );
4861 const SIMDType x1( x.load(j ) );
4862 const SIMDType x2( x.load(j1) );
4863 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4864 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4867 for( ; j<jpos; j+=SIMDSIZE ) {
4868 const SIMDType x1( x.load(j) );
4869 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4870 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4873 for( ; remainder && j<jend; ++j ) {
4874 y[i ] -= A(i ,j) * x[j] * scalar;
4875 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4881 const size_t jbegin( ( IsUpper<MT1>::value )
4882 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-SIMDSIZE) )
4884 const size_t jend( ( IsLower<MT1>::value )
4885 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4889 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4890 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos,
"Invalid end calculation" );
4894 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4895 const size_t j1( j+SIMDSIZE );
4896 const size_t j2( j+SIMDSIZE*2UL );
4897 const size_t j3( j+SIMDSIZE*3UL );
4898 const SIMDType x1( x.load(j ) );
4899 const SIMDType x2( x.load(j1) );
4900 const SIMDType x3( x.load(j2) );
4901 const SIMDType x4( x.load(j3) );
4902 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4905 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4906 const size_t j1( j+SIMDSIZE );
4907 const SIMDType x1( x.load(j ) );
4908 const SIMDType x2( x.load(j1) );
4909 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4912 for( ; j<jpos; j+=SIMDSIZE ) {
4913 const SIMDType x1( x.load(j) );
4914 y[i] -=
sum( A.load(i,j) * x1 ) * scalar;
4917 for( ; remainder && j<jend; ++j ) {
4918 y[i] -= A(i,j) * x[j] * scalar;
4938 template< typename VT1
4942 static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4943 selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4945 selectLargeSubAssignKernel( y, A, x, scalar );
4950 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4964 template< typename VT1
4968 static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4969 selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4971 typedef ElementType_<VT1> ET;
4973 if( IsTriangular<MT1>::value ) {
4974 ResultType_<VT1> tmp( serial( scalar * x ) );
4975 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4976 subAssign( y, tmp );
4979 gemv( y, A, x, ET(-scalar), ET(1) );
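// Side note: the subtraction variant needs no temporary in the general case; negating alpha turns
// the gemv() update y = alpha*A*x + beta*y into a subtraction:
//
//    gemv( y, A, x, ET(-scalar), ET(1) );   // y = (-scalar)*(A*x) + 1*y, i.e. y -= scalar*(A*x)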
5001 template< typename VT1 >
5002 friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5012 const ResultType tmp( serial( rhs ) );
5013 multAssign( ~lhs, tmp );
5033 template< typename VT1 >
5034 friend inline void divAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5044 const ResultType tmp( serial( rhs ) );
5045 divAssign( ~lhs, tmp );
5067 template< typename VT1 >
5068 friend inline EnableIf_< UseSMPAssign<VT1> >
5069 smpAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5075 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5076 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5078 if( left.rows() == 0UL ) {
5081 else if( left.columns() == 0UL ) {
5112 template< typename VT1 >
5113 friend inline EnableIf_< UseSMPAssign<VT1> >
5114 smpAssign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5124 const ResultType tmp( rhs );
5143 template< typename VT1 >
5144 friend inline EnableIf_< UseSMPAssign<VT1> >
5145 smpAddAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5151 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5152 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5154 if( left.rows() == 0UL || left.columns() == 0UL ) {
5188 template< typename VT1 >
5189 friend inline EnableIf_< UseSMPAssign<VT1> >
5190 smpSubAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5196 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5197 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5199 if( left.rows() == 0UL || left.columns() == 0UL ) {
5233 template< typename VT1 >
5234 friend inline EnableIf_< UseSMPAssign<VT1> >
5235 smpMultAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5245 const ResultType tmp( rhs );
5268 template< typename VT1 >
5269 friend inline EnableIf_< UseSMPAssign<VT1> >
5270 smpDivAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5280 const ResultType tmp( rhs );
5342 template< typename T1
5344 inline const DisableIf_< IsMatMatMultExpr<T1>, DMatDVecMultExpr<T1,T2> >
5379 template< typename T1
5382 inline const EnableIf_< IsMatMatMultExpr<T1>, MultExprTrait_<T1,T2> >
5389 return (~mat).leftOperand() * ( (~mat).rightOperand() * vec );
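// Side note: the second overload above restructures a matrix/matrix/vector chain so that no
// matrix-matrix product is ever materialized. A rough usage sketch with the common dynamic types:
//
//    blaze::DynamicMatrix<double> A( 50UL, 70UL ), B( 70UL, 30UL );
//    blaze::DynamicVector<double> v( 30UL );
//    blaze::DynamicVector<double> y( A * B * v );   // evaluated as A * ( B * v )
//
// This replaces one O(n^3) matrix-matrix multiplication by two O(n^2) matrix-vector products.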
5404 template< typename MT, typename VT >
5421 template< typename MT, typename VT >
5423 : public BoolConstant< And< IsAligned<MT>, IsAligned<VT> >::value >
5439 template< typename MT, typename VT, bool AF >
5444 using Type = MultExprTrait_< SubmatrixExprTrait_<const MT,AF>
5445 , SubvectorExprTrait_<const VT,AF> >;
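// Side note: the SubvectorExprTrait specialization above is intended to let a subvector view of a
// matrix/vector product be formed without evaluating the full product first, so that only the
// rows of the matrix contributing to the selected elements are touched. A rough usage sketch:
//
//    blaze::DynamicMatrix<double> A( 100UL, 100UL );
//    blaze::DynamicVector<double> x( 100UL );
//    blaze::DynamicVector<double> y( blaze::subvector( A * x, 8UL, 16UL ) );  // elements 8..23 of A*x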
Definition: DMatDVecMultExpr.h:132
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
Header file for the AreSIMDCombinable type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:320
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:212
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:330
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Constraint on the data type.
Header file for the complex data type.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:265
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the MatVecMultExpr base class.
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:222
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:131