35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
// Expression class for the vector/matrix product implemented in this header.
// It derives publicly from DenseVector, passing its own type as the first
// template argument and `true` as the second, and privately from
// TVecMatMultExpr and Computation.
// NOTE(review): only the VT template parameter is visible here although MT is
// used in the base class list -- confirm the full parameter list.
122 template<
typename VT
124 class TDVecTDMatMultExpr :
public DenseVector< TDVecTDMatMultExpr<VT,MT>, true >
125 ,
private TVecMatMultExpr
126 ,
private Computation
// Compile-time switch: `value` is true when either operand has to be
// evaluated first (evaluateVector or evaluateMatrix is set). The template
// parameter T1 does not take part in the visible condition.
155 template<
typename T1 >
156 struct UseSMPAssign {
157 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
// Compile-time predicate deciding whether the BLAS kernels may be used for the
// type triple (T1 = target vector, T2 = left-hand vector, T3 = matrix; this is
// the order in which the select*Kernel functions instantiate it).
// NOTE(review): the start of the enum expression is not visible here; the
// conditions listed below are only the visible part of the conjunction.
167 template<
typename T1,
typename T2,
typename T3 >
168 struct UseBlasKernel {
// The target must offer mutable data access, both operands const data access.
170 HasMutableDataAccess<T1>::value &&
171 HasConstDataAccess<T2>::value &&
172 HasConstDataAccess<T3>::value &&
// Diagonal matrices are excluded.
173 !IsDiagonal<T3>::value &&
174 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// All three element types must be BLAS compatible ...
175 IsBLASCompatible< ElementType_<T1> >::value &&
176 IsBLASCompatible< ElementType_<T2> >::value &&
177 IsBLASCompatible< ElementType_<T3> >::value &&
// ... and identical to each other.
178 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
179 IsSame< ElementType_<T1>, ElementType_<T3> >::value };
// Compile-time predicate deciding whether the hand-vectorized small/large
// kernels may be used for the type triple (T1 = target vector, T2 = left-hand
// vector, T3 = matrix). The visible conditions require a non-diagonal matrix,
// SIMD-enabled operands, SIMD-combinable element types and SIMD addition and
// multiplication support for the element types of T2 and T3.
// NOTE(review): the start of the enum expression and one argument of
// AreSIMDCombinable are not visible in this excerpt.
190 template<
typename T1,
typename T2,
typename T3 >
191 struct UseVectorizedDefaultKernel {
193 !IsDiagonal<T3>::value &&
194 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
195 AreSIMDCombinable< ElementType_<T1>
197 , ElementType_<T3> >::value &&
198 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
199 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
// NOTE(review): the next line is a fragment of a compile-time flag whose
// beginning is not visible; it requires both operand types to be SIMD enabled.
230 VT::simdEnabled && MT::simdEnabled &&
// The expression is SMP assignable only if neither operand needs an
// intermediate evaluation and both operand types are SMP assignable themselves.
235 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
236 !evaluateMatrix && MT::smpAssignable };
// Returns the product of the vector element and the matrix's diagonal element
// at the same index.
// NOTE(review): the enclosing function header and condition are not visible;
// presumably this is the diagonal-matrix shortcut of the subscript operator
// that at() forwards to -- confirm.
269 return vec_[index] *
mat_(index,index);
// Checked element access. An index that is not smaller than the number of
// matrix columns takes the error branch (its body is not visible in this
// excerpt); otherwise the call forwards to the subscript operator.
296 inline ReturnType
at(
size_t index )
const {
297 if( index >=
mat_.columns() ) {
300 return (*
this)[index];
// Returns the size of the resulting vector, which equals the number of
// columns of the matrix operand.
309 inline size_t size() const noexcept {
310 return mat_.columns();
// Reports whether the expression can alias with the given address: true if
// either the vector operand or the matrix operand reports being aliased with
// `alias`.
340 template<
typename T >
341 inline bool canAlias(
const T* alias )
const noexcept {
342 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Reports whether the expression is aliased with the given address: true if
// either the vector operand or the matrix operand is aliased with `alias`.
352 template<
typename T >
353 inline bool isAliased(
const T* alias )
const noexcept {
354 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// True only if both the vector operand and the matrix operand report being
// aligned. (The enclosing function header is not visible in this excerpt.)
364 return vec_.isAligned() &&
mat_.isAligned();
// NOTE(review): tail of a boolean expression whose beginning and enclosing
// function are not visible (presumably the SMP-assignment query -- confirm).
// Visible part: the matrix element count is below TDVECTDMATMULT_THRESHOLD
// (last alternative of a parenthesized group), and additionally the result
// size exceeds SMP_TDVECTDMATMULT_THRESHOLD.
376 (
mat_.rows() *
mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
377 (
size() > SMP_TDVECTDMATMULT_THRESHOLD );
// Assignment of the product expression `rhs` to `lhs`.
// NOTE(review): the function signature and several statements are not visible
// in this excerpt; the name is inferred from the kernel dispatched at the end.
400 template<
typename VT1 >
// Degenerate shapes are handled first: a matrix without rows, then a matrix
// without columns (the actions taken in these branches are not visible here).
407 if( rhs.mat_.rows() == 0UL ) {
411 else if( rhs.mat_.columns() == 0UL ) {
// Bind both operands, each wrapped in serial(), to the local operand types.
415 LT x(
serial( rhs.vec_ ) );
416 RT A(
serial( rhs.mat_ ) );
// Hand over to the kernel selection with the underlying target vector.
423 TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
// Chooses the kernel for the assignment y = x * A.
// y: target vector, x: left-hand vector operand, A: right-hand matrix operand.
// The small kernel is taken when the visible condition holds (matrix element
// count below TDVECTDMATMULT_THRESHOLD); otherwise the BLAS kernel selection
// is used.
// NOTE(review): the first part of the condition is not visible in this excerpt.
439 template<
typename VT1
442 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
446 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
447 selectSmallAssignKernel( y, x, A );
449 selectBlasAssignKernel( y, x, A );
// Default (non-vectorized) assignment kernel; the fallbacks for the small and
// large kernels forward to it.
// y: target vector, x: left-hand vector operand, A: right-hand matrix operand.
// NOTE(review): the function body is not visible in this excerpt.
468 template<
typename VT1
471 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Small-operand assignment kernel for type combinations that do NOT satisfy
// UseVectorizedDefaultKernel (selected via DisableIf_): it simply forwards to
// the default assignment kernel.
492 template<
typename VT1
495 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
496 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
498 selectDefaultAssignKernel( y, x, A );
// Vectorized small-operand assignment kernel, selected via EnableIf_ when
// UseVectorizedDefaultKernel holds for the type triple.
// y: target vector, x: left-hand vector operand, A: right-hand matrix operand.
// Columns of A are processed in blocks of 8, 4, 3, 2 and finally 1. For each
// block the products x[i]*A(i,j+k) are accumulated in SIMD registers over the
// row range, reduced with sum() into y[j+k], and a scalar loop adds the rows
// that the SIMD loop did not cover.
// NOTE(review): several lines (braces, declarations of i and j, some stores
// and the `else` branches of the range computations) are not visible here.
517 template<
typename VT1
520 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
521 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
523 const size_t M( A.rows() );
524 const size_t N( A.columns() );
// Scalar clean-up is required unless both x and A are padded.
526 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// ---- Blocks of eight columns ----
530 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: first row is j (j+1 if strictly lower) with the low bits
// masked off by size_t(-SIMDSIZE).
532 const size_t ibegin( ( IsLower<MT1>::value )
533 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// Upper matrices: rows end at j+8 (j+7 if strictly upper).
535 const size_t iend( ( IsUpper<MT1>::value )
536 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the SIMD part: iend with the low bits masked off when a remainder
// loop is needed; the assertion checks this equals iend - iend % SIMDSIZE.
540 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
541 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// One accumulator per column of the block.
543 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
546 for( ; i<ipos; i+=SIMDSIZE ) {
547 const SIMDType x1( x.load(i) );
548 xmm1 = xmm1 + x1 * A.load(i,j );
549 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
550 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
551 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
552 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
553 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
554 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
555 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator into its target element.
559 y[j+1UL] =
sum( xmm2 );
560 y[j+2UL] =
sum( xmm3 );
561 y[j+3UL] =
sum( xmm4 );
562 y[j+4UL] =
sum( xmm5 );
563 y[j+5UL] =
sum( xmm6 );
564 y[j+6UL] =
sum( xmm7 );
565 y[j+7UL] =
sum( xmm8 );
// Scalar handling of the rows left over after the SIMD loop.
567 for( ; remainder && i<iend; ++i ) {
568 y[j ] += x[i] * A(i,j );
569 y[j+1UL] += x[i] * A(i,j+1UL);
570 y[j+2UL] += x[i] * A(i,j+2UL);
571 y[j+3UL] += x[i] * A(i,j+3UL);
572 y[j+4UL] += x[i] * A(i,j+4UL);
573 y[j+5UL] += x[i] * A(i,j+5UL);
574 y[j+6UL] += x[i] * A(i,j+6UL);
575 y[j+7UL] += x[i] * A(i,j+7UL);
// ---- Blocks of four columns ----
579 for( ; (j+4UL) <= N; j+=4UL )
581 const size_t ibegin( ( IsLower<MT1>::value )
582 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
584 const size_t iend( ( IsUpper<MT1>::value )
585 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
589 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
590 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
592 SIMDType xmm1, xmm2, xmm3, xmm4;
595 for( ; i<ipos; i+=SIMDSIZE ) {
596 const SIMDType x1( x.load(i) );
597 xmm1 = xmm1 + x1 * A.load(i,j );
598 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
599 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
600 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
604 y[j+1UL] =
sum( xmm2 );
605 y[j+2UL] =
sum( xmm3 );
606 y[j+3UL] =
sum( xmm4 );
608 for( ; remainder && i<iend; ++i ) {
609 y[j ] += x[i] * A(i,j );
610 y[j+1UL] += x[i] * A(i,j+1UL);
611 y[j+2UL] += x[i] * A(i,j+2UL);
612 y[j+3UL] += x[i] * A(i,j+3UL);
// ---- Blocks of three columns ----
616 for( ; (j+3UL) <= N; j+=3UL )
618 const size_t ibegin( ( IsLower<MT1>::value )
619 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
621 const size_t iend( ( IsUpper<MT1>::value )
622 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
626 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
627 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
629 SIMDType xmm1, xmm2, xmm3;
632 for( ; i<ipos; i+=SIMDSIZE ) {
633 const SIMDType x1( x.load(i) );
634 xmm1 = xmm1 + x1 * A.load(i,j );
635 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
636 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
640 y[j+1UL] =
sum( xmm2 );
641 y[j+2UL] =
sum( xmm3 );
643 for( ; remainder && i<iend; ++i ) {
644 y[j ] += x[i] * A(i,j );
645 y[j+1UL] += x[i] * A(i,j+1UL);
646 y[j+2UL] += x[i] * A(i,j+2UL);
// ---- Blocks of two columns ----
650 for( ; (j+2UL) <= N; j+=2UL )
652 const size_t ibegin( ( IsLower<MT1>::value )
653 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
655 const size_t iend( ( IsUpper<MT1>::value )
656 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
660 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
661 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
666 for( ; i<ipos; i+=SIMDSIZE ) {
667 const SIMDType x1( x.load(i) );
668 xmm1 = xmm1 + x1 * A.load(i,j );
669 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
673 y[j+1UL] =
sum( xmm2 );
675 for( ; remainder && i<iend; ++i ) {
676 y[j ] += x[i] * A(i,j );
677 y[j+1UL] += x[i] * A(i,j+1UL);
// ---- Single remaining column (its enclosing condition is not visible) ----
683 const size_t ibegin( ( IsLower<MT1>::value )
684 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
686 const size_t iend( ( IsUpper<MT1>::value )
687 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
691 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
692 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
697 for( ; i<ipos; i+=SIMDSIZE ) {
698 xmm1 = xmm1 + x.load(i) * A.load(i,j);
703 for( ; remainder && i<iend; ++i ) {
704 y[j] += x[i] * A(i,j);
// Large-operand assignment kernel for type combinations that do NOT satisfy
// UseVectorizedDefaultKernel (selected via DisableIf_): it simply forwards to
// the default assignment kernel.
725 template<
typename VT1
728 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
729 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
731 selectDefaultAssignKernel( y, x, A );
// Vectorized large-operand assignment kernel, selected via EnableIf_ when
// UseVectorizedDefaultKernel holds for the type triple.
// y: target vector, x: left-hand vector operand, A: right-hand matrix operand.
// Columns of A are processed in blocks of 8, 4, 2 and finally 1. Within each
// block the row loop is unrolled: four SIMD vectors per step, then two, then
// one, followed by a scalar remainder loop. Every step adds its reduced
// partial sums directly onto y[j+k] with +=.
// NOTE(review): several lines (braces, declarations of i and j, any initial
// reset of y and the `else` branches of the range computations) are not
// visible in this excerpt.
750 template<
typename VT1
753 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
754 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
756 const size_t M( A.rows() );
757 const size_t N( A.columns() );
// Scalar clean-up is required unless both x and A are padded.
759 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// ---- Blocks of eight columns ----
765 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: first row is j (j+1 if strictly lower) with the low bits
// masked off by size_t(-SIMDSIZE).
767 const size_t ibegin( ( IsLower<MT1>::value )
768 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// Upper matrices: rows end at j+8 (j+7 if strictly upper).
770 const size_t iend( ( IsUpper<MT1>::value )
771 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the SIMD part; the assertion checks it equals iend - iend % SIMDSIZE
// whenever a remainder loop is needed.
775 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
776 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// Four SIMD vectors of rows per step.
780 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
781 const size_t i1( i+SIMDSIZE );
782 const size_t i2( i+SIMDSIZE*2UL );
783 const size_t i3( i+SIMDSIZE*3UL );
784 const SIMDType x1( x.load(i ) );
785 const SIMDType x2( x.load(i1) );
786 const SIMDType x3( x.load(i2) );
787 const SIMDType x4( x.load(i3) );
788 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
789 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
790 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
791 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
792 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
793 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
794 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
795 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Two SIMD vectors of rows per step.
798 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
799 const size_t i1( i+SIMDSIZE );
800 const SIMDType x1( x.load(i ) );
801 const SIMDType x2( x.load(i1) );
802 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
803 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
804 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
805 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
806 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
807 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
808 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
809 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// One SIMD vector of rows per step.
812 for( ; i<ipos; i+=SIMDSIZE ) {
813 const SIMDType x1( x.load(i) );
814 y[j ] +=
sum( x1 * A.load(i,j ) );
815 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
816 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
817 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
818 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
819 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
820 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
821 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// Scalar handling of the rows left over after the SIMD loops.
824 for( ; remainder && i<iend; ++i ) {
825 y[j ] += x[i] * A(i,j );
826 y[j+1UL] += x[i] * A(i,j+1UL);
827 y[j+2UL] += x[i] * A(i,j+2UL);
828 y[j+3UL] += x[i] * A(i,j+3UL);
829 y[j+4UL] += x[i] * A(i,j+4UL);
830 y[j+5UL] += x[i] * A(i,j+5UL);
831 y[j+6UL] += x[i] * A(i,j+6UL);
832 y[j+7UL] += x[i] * A(i,j+7UL);
// ---- Blocks of four columns ----
836 for( ; (j+4UL) <= N; j+=4UL )
838 const size_t ibegin( ( IsLower<MT1>::value )
839 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
841 const size_t iend( ( IsUpper<MT1>::value )
842 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
846 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
847 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
851 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
852 const size_t i1( i+SIMDSIZE );
853 const size_t i2( i+SIMDSIZE*2UL );
854 const size_t i3( i+SIMDSIZE*3UL );
855 const SIMDType x1( x.load(i ) );
856 const SIMDType x2( x.load(i1) );
857 const SIMDType x3( x.load(i2) );
858 const SIMDType x4( x.load(i3) );
859 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
860 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
861 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
862 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
865 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
866 const size_t i1( i+SIMDSIZE );
867 const SIMDType x1( x.load(i ) );
868 const SIMDType x2( x.load(i1) );
869 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
870 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
871 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
872 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
875 for( ; i<ipos; i+=SIMDSIZE ) {
876 const SIMDType x1( x.load(i) );
877 y[j ] +=
sum( x1 * A.load(i,j ) );
878 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
879 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
880 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
883 for( ; remainder && i<iend; ++i ) {
884 y[j ] += x[i] * A(i,j );
885 y[j+1UL] += x[i] * A(i,j+1UL);
886 y[j+2UL] += x[i] * A(i,j+2UL);
887 y[j+3UL] += x[i] * A(i,j+3UL);
// ---- Blocks of two columns ----
891 for( ; (j+2UL) <= N; j+=2UL )
893 const size_t ibegin( ( IsLower<MT1>::value )
894 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
896 const size_t iend( ( IsUpper<MT1>::value )
897 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
901 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
902 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
906 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
907 const size_t i1( i+SIMDSIZE );
908 const size_t i2( i+SIMDSIZE*2UL );
909 const size_t i3( i+SIMDSIZE*3UL );
910 const SIMDType x1( x.load(i ) );
911 const SIMDType x2( x.load(i1) );
912 const SIMDType x3( x.load(i2) );
913 const SIMDType x4( x.load(i3) );
914 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
915 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
918 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
919 const size_t i1( i+SIMDSIZE );
920 const SIMDType x1( x.load(i ) );
921 const SIMDType x2( x.load(i1) );
922 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
923 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
926 for( ; i<ipos; i+=SIMDSIZE ) {
927 const SIMDType x1( x.load(i) );
928 y[j ] +=
sum( x1 * A.load(i,j ) );
929 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
932 for( ; remainder && i<iend; ++i ) {
933 y[j ] += x[i] * A(i,j );
934 y[j+1UL] += x[i] * A(i,j+1UL);
// ---- Single remaining column (its enclosing condition is not visible) ----
940 const size_t ibegin( ( IsLower<MT1>::value )
941 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
943 const size_t iend( ( IsUpper<MT1>::value )
944 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
948 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
949 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
953 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
954 const size_t i1( i+SIMDSIZE );
955 const size_t i2( i+SIMDSIZE*2UL );
956 const size_t i3( i+SIMDSIZE*3UL );
957 const SIMDType x1( x.load(i ) );
958 const SIMDType x2( x.load(i1) );
959 const SIMDType x3( x.load(i2) );
960 const SIMDType x4( x.load(i3) );
961 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
964 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
965 const size_t i1( i+SIMDSIZE );
966 const SIMDType x1( x.load(i ) );
967 const SIMDType x2( x.load(i1) );
968 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
971 for( ; i<ipos; i+=SIMDSIZE ) {
972 const SIMDType x1( x.load(i) );
973 y[j] +=
sum( x1 * A.load(i,j) );
976 for( ; remainder && i<iend; ++i ) {
977 y[j] += x[i] * A(i,j);
// BLAS assignment kernel for type combinations that do NOT satisfy
// UseBlasKernel (selected via DisableIf_): it falls back to the large
// assignment kernel.
998 template<
typename VT1
1001 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
1002 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1004 selectLargeAssignKernel( y, x, A );
1010 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based assignment kernel, selected via EnableIf_ when UseBlasKernel
// holds for the type triple.
// y: target vector, x: left-hand vector operand, A: right-hand matrix operand.
1024 template<
typename VT1
1027 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
1028 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1030 typedef ElementType_<VT1> ET;
// Triangular matrices use trmv on y, with the lower/upper flag chosen from
// IsLower. NOTE(review): the statement that prepares y before trmv is not
// visible in this excerpt.
1032 if( IsTriangular<MT1>::value ) {
1034 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// All other matrices use gemv with the scalar factors ET(1) and ET(0).
1037 gemv( y, x, A, ET(1), ET(0) );
// Assignment of the product expression to a sparse vector target: the
// expression, wrapped in serial(), is first evaluated into a temporary of
// type ResultType, which is then assigned to the target.
// NOTE(review): some statements of this function are not visible here.
1057 template<
typename VT1 >
1058 friend inline void assign( SparseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
1068 const ResultType tmp(
serial( rhs ) );
1069 assign( ~lhs, tmp );
// Addition assignment of the product expression `rhs` to the dense vector
// `lhs`.
// NOTE(review): some statements of this function are not visible here.
1087 template<
typename VT1 >
1088 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
// Degenerate case: the matrix has no rows or no columns (the action taken in
// this branch is not visible in this excerpt).
1094 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Bind both operands, each wrapped in serial(), to the local operand types.
1098 LT x(
serial( rhs.vec_ ) );
1099 RT A(
serial( rhs.mat_ ) );
// Hand over to the kernel selection with the underlying target vector.
1106 TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
// Chooses the kernel for the addition assignment y += x * A.
// y: target vector, x: left-hand vector operand, A: right-hand matrix operand.
// The small kernel is used for diagonal matrices, for matrix operands that
// are computations not evaluated beforehand, and for matrices with fewer than
// TDVECTDMATMULT_THRESHOLD elements; otherwise the BLAS kernel selection is
// used.
1122 template<
typename VT1
1125 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1127 if( ( IsDiagonal<MT1>::value ) ||
1128 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1129 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1130 selectSmallAddAssignKernel( y, x, A );
1132 selectBlasAddAssignKernel( y, x, A );
// Default addition assignment kernel: forwards the product x * A to the
// target's own addAssign() member.
// y: target vector, x: left-hand vector operand, A: right-hand matrix operand.
1151 template<
typename VT1
1154 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1156 y.addAssign( x * A );
// Small-operand addition assignment kernel for type combinations that do NOT
// satisfy UseVectorizedDefaultKernel (selected via DisableIf_): it simply
// forwards to the default addition assignment kernel.
1175 template<
typename VT1
1178 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1179 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1181 selectDefaultAddAssignKernel( y, x, A );
// Vectorized small-operand addition assignment kernel, selected via EnableIf_
// when UseVectorizedDefaultKernel holds for the type triple.
// y: target vector, x: left-hand vector operand, A: right-hand matrix operand.
// Columns of A are processed in blocks of 8, 4, 3, 2 and finally 1. For each
// block the products x[i]*A(i,j+k) are accumulated in SIMD registers over the
// row range, their sum() is added onto y[j+k], and a scalar loop adds the
// rows that the SIMD loop did not cover.
// NOTE(review): several lines (braces, declarations of i and j and the `else`
// branches of the range computations) are not visible in this excerpt.
1201 template<
typename VT1
1204 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1205 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1207 const size_t M( A.rows() );
1208 const size_t N( A.columns() );
// Scalar clean-up is required unless both x and A are padded.
1210 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// ---- Blocks of eight columns ----
1214 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: first row is j (j+1 if strictly lower) with the low bits
// masked off by size_t(-SIMDSIZE).
1216 const size_t ibegin( ( IsLower<MT1>::value )
1217 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// Upper matrices: rows end at j+8 (j+7 if strictly upper).
1219 const size_t iend( ( IsUpper<MT1>::value )
1220 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the SIMD part; the assertion checks it equals iend - iend % SIMDSIZE
// whenever a remainder loop is needed.
1224 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1225 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// One accumulator per column of the block.
1227 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1230 for( ; i<ipos; i+=SIMDSIZE ) {
1231 const SIMDType x1( x.load(i) );
1232 xmm1 = xmm1 + x1 * A.load(i,j );
1233 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1234 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1235 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1236 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1237 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1238 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1239 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Add the horizontally reduced accumulators onto the existing target values.
1242 y[j ] +=
sum( xmm1 );
1243 y[j+1UL] +=
sum( xmm2 );
1244 y[j+2UL] +=
sum( xmm3 );
1245 y[j+3UL] +=
sum( xmm4 );
1246 y[j+4UL] +=
sum( xmm5 );
1247 y[j+5UL] +=
sum( xmm6 );
1248 y[j+6UL] +=
sum( xmm7 );
1249 y[j+7UL] +=
sum( xmm8 );
// Scalar handling of the rows left over after the SIMD loop.
1251 for( ; remainder && i<iend; ++i ) {
1252 y[j ] += x[i] * A(i,j );
1253 y[j+1UL] += x[i] * A(i,j+1UL);
1254 y[j+2UL] += x[i] * A(i,j+2UL);
1255 y[j+3UL] += x[i] * A(i,j+3UL);
1256 y[j+4UL] += x[i] * A(i,j+4UL);
1257 y[j+5UL] += x[i] * A(i,j+5UL);
1258 y[j+6UL] += x[i] * A(i,j+6UL);
1259 y[j+7UL] += x[i] * A(i,j+7UL);
// ---- Blocks of four columns ----
1263 for( ; (j+4UL) <= N; j+=4UL )
1265 const size_t ibegin( ( IsLower<MT1>::value )
1266 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
1268 const size_t iend( ( IsUpper<MT1>::value )
1269 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1273 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1274 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1276 SIMDType xmm1, xmm2, xmm3, xmm4;
1279 for( ; i<ipos; i+=SIMDSIZE ) {
1280 const SIMDType x1( x.load(i) );
1281 xmm1 = xmm1 + x1 * A.load(i,j );
1282 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1283 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1284 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1287 y[j ] +=
sum( xmm1 );
1288 y[j+1UL] +=
sum( xmm2 );
1289 y[j+2UL] +=
sum( xmm3 );
1290 y[j+3UL] +=
sum( xmm4 );
1292 for( ; remainder && i<iend; ++i ) {
1293 y[j ] += x[i] * A(i,j );
1294 y[j+1UL] += x[i] * A(i,j+1UL);
1295 y[j+2UL] += x[i] * A(i,j+2UL);
1296 y[j+3UL] += x[i] * A(i,j+3UL);
// ---- Blocks of three columns ----
1300 for( ; (j+3UL) <= N; j+=3UL )
1302 const size_t ibegin( ( IsLower<MT1>::value )
1303 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
1305 const size_t iend( ( IsUpper<MT1>::value )
1306 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1310 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1311 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1313 SIMDType xmm1, xmm2, xmm3;
1316 for( ; i<ipos; i+=SIMDSIZE ) {
1317 const SIMDType x1( x.load(i) );
1318 xmm1 = xmm1 + x1 * A.load(i,j );
1319 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1320 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1323 y[j ] +=
sum( xmm1 );
1324 y[j+1UL] +=
sum( xmm2 );
1325 y[j+2UL] +=
sum( xmm3 );
1327 for( ; remainder && i<iend; ++i ) {
1328 y[j ] += x[i] * A(i,j );
1329 y[j+1UL] += x[i] * A(i,j+1UL);
1330 y[j+2UL] += x[i] * A(i,j+2UL);
// ---- Blocks of two columns ----
1334 for( ; (j+2UL) <= N; j+=2UL )
1336 const size_t ibegin( ( IsLower<MT1>::value )
1337 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
1339 const size_t iend( ( IsUpper<MT1>::value )
1340 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1344 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1345 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1347 SIMDType xmm1, xmm2;
1350 for( ; i<ipos; i+=SIMDSIZE ) {
1351 const SIMDType x1( x.load(i) );
1352 xmm1 = xmm1 + x1 * A.load(i,j );
1353 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1356 y[j ] +=
sum( xmm1 );
1357 y[j+1UL] +=
sum( xmm2 );
1359 for( ; remainder && i<iend; ++i ) {
1360 y[j ] += x[i] * A(i,j );
1361 y[j+1UL] += x[i] * A(i,j+1UL);
// ---- Single remaining column (its enclosing condition is not visible) ----
1367 const size_t ibegin( ( IsLower<MT1>::value )
1368 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
1370 const size_t iend( ( IsUpper<MT1>::value )
1371 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1375 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1376 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1381 for( ; i<ipos; i+=SIMDSIZE ) {
1382 xmm1 = xmm1 + A.load(i,j) * x.load(i);
1385 y[j] +=
sum( xmm1 );
1387 for( ; remainder && i<iend; ++i ) {
1388 y[j] += x[i] * A(i,j);
// Large-operand addition assignment kernel for type combinations that do NOT
// satisfy UseVectorizedDefaultKernel (selected via DisableIf_): it simply
// forwards to the default addition assignment kernel.
1409 template<
typename VT1
1412 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1413 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1415 selectDefaultAddAssignKernel( y, x, A );
// Vectorized large-operand addition assignment kernel, selected via EnableIf_
// when UseVectorizedDefaultKernel holds for the type triple.
// y: target vector, x: left-hand vector operand, A: right-hand matrix operand.
// Columns of A are processed in blocks of 8, 4, 2 and finally 1. Within each
// block the row loop is unrolled: four SIMD vectors per step, then two, then
// one, followed by a scalar remainder loop. Every step adds its reduced
// partial sums directly onto y[j+k] with +=.
// NOTE(review): several lines (braces, declarations of i and j and the `else`
// branches of the range computations) are not visible in this excerpt.
1435 template<
typename VT1
1438 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1439 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1441 const size_t M( A.rows() );
1442 const size_t N( A.columns() );
// Scalar clean-up is required unless both x and A are padded.
1444 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// ---- Blocks of eight columns ----
1448 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: first row is j (j+1 if strictly lower) with the low bits
// masked off by size_t(-SIMDSIZE).
1450 const size_t ibegin( ( IsLower<MT1>::value )
1451 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// Upper matrices: rows end at j+8 (j+7 if strictly upper).
1453 const size_t iend( ( IsUpper<MT1>::value )
1454 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the SIMD part; the assertion checks it equals iend - iend % SIMDSIZE
// whenever a remainder loop is needed.
1458 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1459 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// Four SIMD vectors of rows per step.
1463 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1464 const size_t i1( i+SIMDSIZE );
1465 const size_t i2( i+SIMDSIZE*2UL );
1466 const size_t i3( i+SIMDSIZE*3UL );
1467 const SIMDType x1( x.load(i ) );
1468 const SIMDType x2( x.load(i1) );
1469 const SIMDType x3( x.load(i2) );
1470 const SIMDType x4( x.load(i3) );
1471 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1472 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1473 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1474 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1475 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1476 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1477 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1478 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Two SIMD vectors of rows per step.
1481 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1482 const size_t i1( i+SIMDSIZE );
1483 const SIMDType x1( x.load(i ) );
1484 const SIMDType x2( x.load(i1) );
1485 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1486 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1487 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1488 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1489 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1490 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1491 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1492 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// One SIMD vector of rows per step.
1495 for( ; i<ipos; i+=SIMDSIZE ) {
1496 const SIMDType x1( x.load(i) );
1497 y[j ] +=
sum( x1 * A.load(i,j ) );
1498 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1499 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1500 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1501 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
1502 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
1503 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
1504 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// Scalar handling of the rows left over after the SIMD loops.
1507 for( ; remainder && i<iend; ++i ) {
1508 y[j ] += x[i] * A(i,j );
1509 y[j+1UL] += x[i] * A(i,j+1UL);
1510 y[j+2UL] += x[i] * A(i,j+2UL);
1511 y[j+3UL] += x[i] * A(i,j+3UL);
1512 y[j+4UL] += x[i] * A(i,j+4UL);
1513 y[j+5UL] += x[i] * A(i,j+5UL);
1514 y[j+6UL] += x[i] * A(i,j+6UL);
1515 y[j+7UL] += x[i] * A(i,j+7UL);
// ---- Blocks of four columns ----
1519 for( ; (j+4UL) <= N; j+=4UL )
1521 const size_t ibegin( ( IsLower<MT1>::value )
1522 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
1524 const size_t iend( ( IsUpper<MT1>::value )
1525 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1529 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1530 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1534 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1535 const size_t i1( i+SIMDSIZE );
1536 const size_t i2( i+SIMDSIZE*2UL );
1537 const size_t i3( i+SIMDSIZE*3UL );
1538 const SIMDType x1( x.load(i ) );
1539 const SIMDType x2( x.load(i1) );
1540 const SIMDType x3( x.load(i2) );
1541 const SIMDType x4( x.load(i3) );
1542 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1543 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1544 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1545 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1548 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1549 const size_t i1( i+SIMDSIZE );
1550 const SIMDType x1( x.load(i ) );
1551 const SIMDType x2( x.load(i1) );
1552 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1553 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1554 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1555 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1558 for( ; i<ipos; i+=SIMDSIZE ) {
1559 const SIMDType x1( x.load(i) );
1560 y[j ] +=
sum( x1 * A.load(i,j ) );
1561 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1562 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1563 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1566 for( ; remainder && i<iend; ++i ) {
1567 y[j ] += x[i] * A(i,j );
1568 y[j+1UL] += x[i] * A(i,j+1UL);
1569 y[j+2UL] += x[i] * A(i,j+2UL);
1570 y[j+3UL] += x[i] * A(i,j+3UL);
// ---- Blocks of two columns ----
1574 for( ; (j+2UL) <= N; j+=2UL )
1576 const size_t ibegin( ( IsLower<MT1>::value )
1577 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
1579 const size_t iend( ( IsUpper<MT1>::value )
1580 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1584 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1585 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1589 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1590 const size_t i1( i+SIMDSIZE );
1591 const size_t i2( i+SIMDSIZE*2UL );
1592 const size_t i3( i+SIMDSIZE*3UL );
1593 const SIMDType x1( x.load(i ) );
1594 const SIMDType x2( x.load(i1) );
1595 const SIMDType x3( x.load(i2) );
1596 const SIMDType x4( x.load(i3) );
1597 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1598 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1601 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1602 const size_t i1( i+SIMDSIZE );
1603 const SIMDType x1( x.load(i ) );
1604 const SIMDType x2( x.load(i1) );
1605 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1606 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1609 for( ; i<ipos; i+=SIMDSIZE ) {
1610 const SIMDType x1( x.load(i) );
1611 y[j ] +=
sum( x1 * A.load(i,j ) );
1612 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1615 for( ; remainder && i<iend; ++i ) {
1616 y[j ] += x[i] * A(i,j );
1617 y[j+1UL] += x[i] * A(i,j+1UL);
// ---- Single remaining column (its enclosing condition is not visible) ----
1623 const size_t ibegin( ( IsLower<MT1>::value )
1624 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
1626 const size_t iend( ( IsUpper<MT1>::value )
1627 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1631 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1632 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1636 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1637 const size_t i1( i+SIMDSIZE );
1638 const size_t i2( i+SIMDSIZE*2UL );
1639 const size_t i3( i+SIMDSIZE*3UL );
1640 const SIMDType x1( x.load(i ) );
1641 const SIMDType x2( x.load(i1) );
1642 const SIMDType x3( x.load(i2) );
1643 const SIMDType x4( x.load(i3) );
1644 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1647 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1648 const size_t i1( i+SIMDSIZE );
1649 const SIMDType x1( x.load(i ) );
1650 const SIMDType x2( x.load(i1) );
1651 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1654 for( ; i<ipos; i+=SIMDSIZE ) {
1655 const SIMDType x1( x.load(i) );
1656 y[j] +=
sum( x1 * A.load(i,j) );
1659 for( ; remainder && i<iend; ++i ) {
1660 y[j] += x[i] * A(i,j);
// BLAS addition assignment kernel for type combinations that do NOT satisfy
// UseBlasKernel (selected via DisableIf_): it falls back to the large
// addition assignment kernel.
1681 template<
typename VT1
1684 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
1685 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1687 selectLargeAddAssignKernel( y, x, A );
1693 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based addition-assignment kernel, guarded by EnableIf_< UseBlasKernel<VT1,VT2,MT1> >.
//   y : target vector that is added to
//   x : left-hand side vector operand
//   A : right-hand side matrix operand
// NOTE(review): the remaining template parameters, braces and the branch separating the
// trmv path from the gemv call are not visible in this excerpt.
1707 template<
typename VT1
1710 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
1711 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Element type of the target vector; used to build the gemv scaling factors.
1713 typedef ElementType_<VT1> ET;
// Triangular matrices: copy x into a temporary, multiply it in place via trmv and add the result to y.
1715 if( IsTriangular<MT1>::value ) {
1716 ResultType_<VT1> tmp(
serial( x ) );
1717 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1718 addAssign( y, tmp );
// General case: gemv with both scaling arguments equal to one
// (presumably alpha and beta, i.e. y = x*A + y; confirm against the gemv wrapper).
1721 gemv( y, x, A, ET(1), ET(1) );
// Subtraction assignment of a transpose dense vector / transpose dense matrix multiplication
// to a dense row vector (lhs -= rhs.vec_ * rhs.mat_).
//   lhs : target dense vector
//   rhs : multiplication expression to be subtracted
// NOTE(review): the body of the empty-matrix check is not visible in this excerpt.
1745 template<
typename VT1 >
1746 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
// Guard for matrices without rows or columns.
1752 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Serial evaluation of both operands into the composite types LT and RT.
1756 LT x(
serial( rhs.vec_ ) );
1757 RT A(
serial( rhs.mat_ ) );
// Kernel selection on the evaluated operands.
1764 TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
// Kernel dispatcher for the subtraction assignment. The small kernel is chosen for diagonal
// matrices, for matrix computations that are not evaluated up front, and for matrices with fewer
// than TDVECTDMATMULT_THRESHOLD elements; the BLAS kernel selection is the other call.
// NOTE(review): the keyword separating the two calls is not visible in this excerpt.
1780 template<
typename VT1
1783 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1785 if( ( IsDiagonal<MT1>::value ) ||
1786 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1787 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1788 selectSmallSubAssignKernel( y, x, A );
1790 selectBlasSubAssignKernel( y, x, A );
// Default subtraction-assignment kernel: hands the expression x * A to the subAssign()
// member of the target vector y.
1809 template<
typename VT1
1812 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1814 y.subAssign( x * A );
// Non-vectorized overload of the small-matrix subtraction-assignment kernel. Guarded by
// DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >; forwards to the default kernel.
1833 template<
typename VT1
1836 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1837 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1839 selectDefaultSubAssignKernel( y, x, A );
// Vectorized small-matrix kernel for the subtraction assignment y -= x * A, guarded by
// EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >.
// The columns of A are processed in blocks of 8, 4, 3 and 2, followed by a single-column part.
// For every block the products x.load(i) * A.load(i,j+k) are accumulated in one SIMD register
// per column, the horizontal sum of each register is subtracted from y, and a scalar loop
// handles the rows between ipos and iend when the operands are not padded.
// NOTE(review): several lines (braces, declarations of j, i and xmm1 for the last part, the
// alternative branches of the ibegin/iend conditionals) are not visible in this excerpt.
1859 template<
typename VT1
1862 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1863 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1865 const size_t M( A.rows() );
1866 const size_t N( A.columns() );
// A scalar clean-up loop is required unless both the vector and the matrix are padded.
1868 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Blocks of eight columns ---
1872 for( ; (j+8UL) <= N; j+=8UL )
// First row for lower matrices, rounded down to a multiple of SIMDSIZE.
1874 const size_t ibegin( ( IsLower<MT1>::value )
1875 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// End row for upper matrices, derived from the last column of the block.
1877 const size_t iend( ( IsUpper<MT1>::value )
1878 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the vectorized range: iend rounded down to a multiple of SIMDSIZE if a remainder loop is needed.
1882 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1883 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1885 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1888 for( ; i<ipos; i+=SIMDSIZE ) {
1889 const SIMDType x1( x.load(i) );
1890 xmm1 = xmm1 + x1 * A.load(i,j );
1891 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1892 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1893 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1894 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1895 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1896 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1897 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Horizontal reduction of the accumulators, subtracted from the target elements.
1900 y[j ] -=
sum( xmm1 );
1901 y[j+1UL] -=
sum( xmm2 );
1902 y[j+2UL] -=
sum( xmm3 );
1903 y[j+3UL] -=
sum( xmm4 );
1904 y[j+4UL] -=
sum( xmm5 );
1905 y[j+5UL] -=
sum( xmm6 );
1906 y[j+6UL] -=
sum( xmm7 );
1907 y[j+7UL] -=
sum( xmm8 );
// Scalar handling of the rows that did not fit into a full SIMD step.
1909 for( ; remainder && i<iend; ++i ) {
1910 y[j ] -= x[i] * A(i,j );
1911 y[j+1UL] -= x[i] * A(i,j+1UL);
1912 y[j+2UL] -= x[i] * A(i,j+2UL);
1913 y[j+3UL] -= x[i] * A(i,j+3UL);
1914 y[j+4UL] -= x[i] * A(i,j+4UL);
1915 y[j+5UL] -= x[i] * A(i,j+5UL);
1916 y[j+6UL] -= x[i] * A(i,j+6UL);
1917 y[j+7UL] -= x[i] * A(i,j+7UL);
// --- Blocks of four columns ---
1921 for( ; (j+4UL) <= N; j+=4UL )
1923 const size_t ibegin( ( IsLower<MT1>::value )
1924 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
1926 const size_t iend( ( IsUpper<MT1>::value )
1927 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1931 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1932 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1934 SIMDType xmm1, xmm2, xmm3, xmm4;
1937 for( ; i<ipos; i+=SIMDSIZE ) {
1938 const SIMDType x1( x.load(i) );
1939 xmm1 = xmm1 + x1 * A.load(i,j );
1940 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1941 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1942 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1945 y[j ] -=
sum( xmm1 );
1946 y[j+1UL] -=
sum( xmm2 );
1947 y[j+2UL] -=
sum( xmm3 );
1948 y[j+3UL] -=
sum( xmm4 );
1950 for( ; remainder && i<iend; ++i ) {
1951 y[j ] -= x[i] * A(i,j );
1952 y[j+1UL] -= x[i] * A(i,j+1UL);
1953 y[j+2UL] -= x[i] * A(i,j+2UL);
1954 y[j+3UL] -= x[i] * A(i,j+3UL);
// --- Blocks of three columns ---
1958 for( ; (j+3UL) <= N; j+=3UL )
1960 const size_t ibegin( ( IsLower<MT1>::value )
1961 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
1963 const size_t iend( ( IsUpper<MT1>::value )
1964 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1968 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1969 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1971 SIMDType xmm1, xmm2, xmm3;
1974 for( ; i<ipos; i+=SIMDSIZE ) {
1975 const SIMDType x1( x.load(i) );
1976 xmm1 = xmm1 + x1 * A.load(i,j );
1977 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1978 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1981 y[j ] -=
sum( xmm1 );
1982 y[j+1UL] -=
sum( xmm2 );
1983 y[j+2UL] -=
sum( xmm3 );
1985 for( ; remainder && i<iend; ++i ) {
1986 y[j ] -= x[i] * A(i,j );
1987 y[j+1UL] -= x[i] * A(i,j+1UL);
1988 y[j+2UL] -= x[i] * A(i,j+2UL);
// --- Blocks of two columns ---
1992 for( ; (j+2UL) <= N; j+=2UL )
1994 const size_t ibegin( ( IsLower<MT1>::value )
1995 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
1997 const size_t iend( ( IsUpper<MT1>::value )
1998 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2002 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2003 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2005 SIMDType xmm1, xmm2;
2008 for( ; i<ipos; i+=SIMDSIZE ) {
2009 const SIMDType x1( x.load(i) );
2010 xmm1 = xmm1 + x1 * A.load(i,j );
2011 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2014 y[j ] -=
sum( xmm1 );
2015 y[j+1UL] -=
sum( xmm2 );
2017 for( ; remainder && i<iend; ++i ) {
2018 y[j ] -= x[i] * A(i,j );
2019 y[j+1UL] -= x[i] * A(i,j+1UL);
// --- Single column (the guarding condition is not visible in this excerpt) ---
2025 const size_t ibegin( ( IsLower<MT1>::value )
2026 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
2028 const size_t iend( ( IsUpper<MT1>::value )
2029 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2033 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2034 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2039 for( ; i<ipos; i+=SIMDSIZE ) {
2040 xmm1 = xmm1 + A.load(i,j) * x.load(i);
2043 y[j] -=
sum( xmm1 );
2045 for( ; remainder && i<iend; ++i ) {
2046 y[j] -= x[i] * A(i,j);
// Non-vectorized overload of the large-matrix subtraction-assignment kernel. Guarded by
// DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >; forwards to the default kernel.
2067 template<
typename VT1
2070 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
2071 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2073 selectDefaultSubAssignKernel( y, x, A );
// Vectorized large-matrix kernel for the subtraction assignment y -= x * A, guarded by
// EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >.
// The columns of A are processed in blocks of 8, 4 and 2, followed by a single-column part.
// Within each block the rows are traversed with three SIMD loops of decreasing width
// (four, two and one SIMD vector per iteration); each iteration subtracts the horizontal sum of
// its partial products directly from y. A scalar loop handles the rows between ipos and iend
// when the operands are not padded.
// NOTE(review): several lines (braces, declarations of j and i, the alternative branches of the
// ibegin/iend conditionals, the guard of the single-column part) are not visible in this excerpt.
2093 template<
typename VT1
2096 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
2097 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2099 const size_t M( A.rows() );
2100 const size_t N( A.columns() );
// A scalar clean-up loop is required unless both the vector and the matrix are padded.
2102 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Blocks of eight columns ---
2106 for( ; (j+8UL) <= N; j+=8UL )
// First row for lower matrices, rounded down to a multiple of SIMDSIZE.
2108 const size_t ibegin( ( IsLower<MT1>::value )
2109 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// End row for upper matrices, derived from the last column of the block.
2111 const size_t iend( ( IsUpper<MT1>::value )
2112 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the vectorized range: iend rounded down to a multiple of SIMDSIZE if a remainder loop is needed.
2116 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2117 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// Four SIMD vectors of rows per iteration.
2121 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2122 const size_t i1( i+SIMDSIZE );
2123 const size_t i2( i+SIMDSIZE*2UL );
2124 const size_t i3( i+SIMDSIZE*3UL );
2125 const SIMDType x1( x.load(i ) );
2126 const SIMDType x2( x.load(i1) );
2127 const SIMDType x3( x.load(i2) );
2128 const SIMDType x4( x.load(i3) );
2129 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2130 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2131 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2132 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2133 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2134 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2135 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2136 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Two SIMD vectors of rows per iteration.
2139 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2140 const size_t i1( i+SIMDSIZE );
2141 const SIMDType x1( x.load(i ) );
2142 const SIMDType x2( x.load(i1) );
2143 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2144 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2145 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2146 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2147 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2148 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2149 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2150 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// One SIMD vector of rows per iteration.
2153 for( ; i<ipos; i+=SIMDSIZE ) {
2154 const SIMDType x1( x.load(i) );
2155 y[j ] -=
sum( x1 * A.load(i,j ) );
2156 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2157 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2158 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
2159 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) );
2160 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) );
2161 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) );
2162 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) );
// Scalar handling of the remaining rows.
2165 for( ; remainder && i<iend; ++i ) {
2166 y[j ] -= x[i] * A(i,j );
2167 y[j+1UL] -= x[i] * A(i,j+1UL);
2168 y[j+2UL] -= x[i] * A(i,j+2UL);
2169 y[j+3UL] -= x[i] * A(i,j+3UL);
2170 y[j+4UL] -= x[i] * A(i,j+4UL);
2171 y[j+5UL] -= x[i] * A(i,j+5UL);
2172 y[j+6UL] -= x[i] * A(i,j+6UL);
2173 y[j+7UL] -= x[i] * A(i,j+7UL);
// --- Blocks of four columns ---
2177 for( ; (j+4UL) <= N; j+=4UL )
2179 const size_t ibegin( ( IsLower<MT1>::value )
2180 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
2182 const size_t iend( ( IsUpper<MT1>::value )
2183 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
2187 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2188 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2192 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2193 const size_t i1( i+SIMDSIZE );
2194 const size_t i2( i+SIMDSIZE*2UL );
2195 const size_t i3( i+SIMDSIZE*3UL );
2196 const SIMDType x1( x.load(i ) );
2197 const SIMDType x2( x.load(i1) );
2198 const SIMDType x3( x.load(i2) );
2199 const SIMDType x4( x.load(i3) );
2200 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2201 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2202 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2203 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2206 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2207 const size_t i1( i+SIMDSIZE );
2208 const SIMDType x1( x.load(i ) );
2209 const SIMDType x2( x.load(i1) );
2210 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2211 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2212 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2213 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2216 for( ; i<ipos; i+=SIMDSIZE ) {
2217 const SIMDType x1( x.load(i) );
2218 y[j ] -=
sum( x1 * A.load(i,j ) );
2219 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2220 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2221 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
2224 for( ; remainder && i<iend; ++i ) {
2225 y[j ] -= x[i] * A(i,j );
2226 y[j+1UL] -= x[i] * A(i,j+1UL);
2227 y[j+2UL] -= x[i] * A(i,j+2UL);
2228 y[j+3UL] -= x[i] * A(i,j+3UL);
// --- Blocks of two columns ---
2232 for( ; (j+2UL) <= N; j+=2UL )
2234 const size_t ibegin( ( IsLower<MT1>::value )
2235 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
2237 const size_t iend( ( IsUpper<MT1>::value )
2238 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2242 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2243 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2247 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2248 const size_t i1( i+SIMDSIZE );
2249 const size_t i2( i+SIMDSIZE*2UL );
2250 const size_t i3( i+SIMDSIZE*3UL );
2251 const SIMDType x1( x.load(i ) );
2252 const SIMDType x2( x.load(i1) );
2253 const SIMDType x3( x.load(i2) );
2254 const SIMDType x4( x.load(i3) );
2255 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2256 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2259 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2260 const size_t i1( i+SIMDSIZE );
2261 const SIMDType x1( x.load(i ) );
2262 const SIMDType x2( x.load(i1) );
2263 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2264 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2267 for( ; i<ipos; i+=SIMDSIZE ) {
2268 const SIMDType x1( x.load(i) );
2269 y[j ] -=
sum( x1 * A.load(i,j ) );
2270 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2273 for( ; remainder && i<iend; ++i ) {
2274 y[j ] -= x[i] * A(i,j );
2275 y[j+1UL] -= x[i] * A(i,j+1UL);
// --- Single column (the guarding condition is not visible in this excerpt) ---
2281 const size_t ibegin( ( IsLower<MT1>::value )
2282 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
2284 const size_t iend( ( IsUpper<MT1>::value )
2285 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2289 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2290 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2294 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2295 const size_t i1( i+SIMDSIZE );
2296 const size_t i2( i+SIMDSIZE*2UL );
2297 const size_t i3( i+SIMDSIZE*3UL );
2298 const SIMDType x1( x.load(i ) );
2299 const SIMDType x2( x.load(i1) );
2300 const SIMDType x3( x.load(i2) );
2301 const SIMDType x4( x.load(i3) );
2302 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
2305 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2306 const size_t i1( i+SIMDSIZE );
2307 const SIMDType x1( x.load(i ) );
2308 const SIMDType x2( x.load(i1) );
2309 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
2312 for( ; i<ipos; i+=SIMDSIZE ) {
2313 const SIMDType x1( x.load(i) );
2314 y[j] -=
sum( x1 * A.load(i,j) );
2317 for( ; remainder && i<iend; ++i ) {
2318 y[j] -= x[i] * A(i,j);
// Fallback overload of the BLAS subtraction-assignment kernel. Guarded by
// DisableIf_< UseBlasKernel<VT1,VT2,MT1> >; forwards to selectLargeSubAssignKernel().
2339 template<
typename VT1
2342 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
2343 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2345 selectLargeSubAssignKernel( y, x, A );
2351 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based subtraction-assignment kernel, guarded by EnableIf_< UseBlasKernel<VT1,VT2,MT1> >.
//   y : target vector that is subtracted from
//   x : left-hand side vector operand
//   A : right-hand side matrix operand
// NOTE(review): the remaining template parameters, braces and the branch separating the
// trmv path from the gemv call are not visible in this excerpt.
2365 template<
typename VT1
2368 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
2369 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Element type of the target vector; used to build the gemv scaling factors.
2371 typedef ElementType_<VT1> ET;
// Triangular matrices: copy x into a temporary, multiply it in place via trmv and subtract the result from y.
2373 if( IsTriangular<MT1>::value ) {
2374 ResultType_<VT1> tmp(
serial( x ) );
2375 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2376 subAssign( y, tmp );
// General case: gemv with scaling arguments -1 and 1
// (presumably alpha and beta, i.e. y = y - x*A; confirm against the gemv wrapper).
2379 gemv( y, x, A, ET(-1), ET(1) );
// Multiplication assignment of the vector/matrix product to a dense row vector.
// The expression is first evaluated serially into a temporary of type ResultType, which is then
// passed to multAssign() together with the target vector.
2403 template<
typename VT1 >
2404 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
2414 const ResultType tmp(
serial( rhs ) );
2415 multAssign( ~lhs, tmp );
// Division assignment of the vector/matrix product to a dense row vector.
// The expression is first evaluated serially into a temporary of type ResultType, which is then
// passed to divAssign() together with the target vector.
2437 template<
typename VT1 >
2438 friend inline void divAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
2448 const ResultType tmp(
serial( rhs ) );
2449 divAssign( ~lhs, tmp );
2473 template<
typename VT1 >
2474 friend inline EnableIf_< UseSMPAssign<VT1> >
2481 if( rhs.mat_.rows() == 0UL ) {
2485 else if( rhs.mat_.columns() == 0UL ) {
2517 template<
typename VT1 >
2518 friend inline EnableIf_< UseSMPAssign<VT1> >
2529 const ResultType tmp( rhs );
2550 template<
typename VT1 >
2551 friend inline EnableIf_< UseSMPAssign<VT1> >
2558 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2594 template<
typename VT1 >
2595 friend inline EnableIf_< UseSMPAssign<VT1> >
2602 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2638 template<
typename VT1 >
2639 friend inline EnableIf_< UseSMPAssign<VT1> >
2650 const ResultType tmp( rhs );
2675 template<
typename VT1 >
2676 friend inline EnableIf_< UseSMPAssign<VT1> >
2687 const ResultType tmp( rhs );
2726 template<
typename VT
2730 :
public DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true >
2731 ,
private VecScalarMultExpr
2732 ,
private Computation
// Internal type shortcuts of the scaled vector/matrix multiplication expression.
2736 typedef TDVecTDMatMultExpr<VT,MT> VMM;  // the wrapped multiplication expression
2737 typedef ResultType_<VMM> RES;  // result type of the multiplication expression
2738 typedef ResultType_<VT>
VRT;  // result type of the vector operand
2739 typedef ResultType_<MT>
MRT;  // result type of the matrix operand
2740 typedef ElementType_<VRT>
VET;  // element type of the vector operand
2741 typedef ElementType_<MRT>
MET;  // element type of the matrix operand
2742 typedef CompositeType_<VT>
VCT;  // composite type of the vector operand
2743 typedef CompositeType_<MT>
MCT;  // composite type of the matrix operand
// evaluateVector: set when the vector operand is a computation or requires an evaluation.
2748 enum :
bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
// evaluateMatrix: set when the matrix operand requires an evaluation, or when it is a computation
// whose element type equals the vector element type and is BLAS compatible.
2753 enum :
bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2754 IsBLASCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// Helper trait for the SMP assignment overloads: value is set when the target type T1 is SMP
// assignable and at least one of the two operands has to be evaluated.
// NOTE(review): the closing of the struct is not visible in this excerpt.
2762 template<
typename T1 >
2763 struct UseSMPAssign {
2764 enum :
bool { value = T1::smpAssignable && ( evaluateVector || evaluateMatrix ) };
// Helper trait deciding whether a BLAS kernel can be used for target type T1, vector type T2,
// matrix type T3 and scalar type T4. The visible conditions require direct data access for all
// three operands, a non-diagonal matrix, SIMD-enabled operands, identical BLAS-compatible element
// types, and exclude the combination of a builtin element type with a complex scalar.
// NOTE(review): the line opening the enum and the closing of the struct are not visible in this
// excerpt, so further conditions may exist.
2772 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2773 struct UseBlasKernel {
2775 HasMutableDataAccess<T1>::value &&
2776 HasConstDataAccess<T2>::value &&
2777 HasConstDataAccess<T3>::value &&
2778 !IsDiagonal<T3>::value &&
2779 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2780 IsBLASCompatible< ElementType_<T1> >::value &&
2781 IsBLASCompatible< ElementType_<T2> >::value &&
2782 IsBLASCompatible< ElementType_<T3> >::value &&
2783 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
2784 IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2785 !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
// Helper trait deciding whether the vectorized default kernels can be used for target type T1,
// vector type T2, matrix type T3 and scalar type T4. The visible conditions require a
// non-diagonal matrix, SIMD-enabled operands, SIMD-combinable element types, and available SIMD
// addition and multiplication for the vector/matrix element types.
// NOTE(review): the line opening the enum, the remaining AreSIMDCombinable arguments and the
// closing of the struct are not visible in this excerpt.
2794 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2795 struct UseVectorizedDefaultKernel {
2797 !IsDiagonal<T3>::value &&
2798 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2799 AreSIMDCombinable< ElementType_<T1>
2803 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2804 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
// Type of this scaled multiplication expression.
2810 typedef DVecScalarMultExpr<VMM,ST,true>
This;
// SIMD type corresponding to ElementType.
2814 typedef SIMDTrait_<ElementType>
SIMDType;
// Type of the left-hand side operand: the wrapped vector/matrix multiplication expression.
2819 typedef const TDVecTDMatMultExpr<VT,MT>
LeftOperand;
// Type used for the vector operand inside the kernels: the result type if the vector has to be
// evaluated, its composite type otherwise.
2825 typedef IfTrue_< evaluateVector, const VRT, VCT >
LT;
// Type used for the matrix operand inside the kernels: the result type if the matrix has to be
// evaluated, its composite type otherwise.
2828 typedef IfTrue_< evaluateMatrix, const MRT, MCT >
RT;
// simdEnabled: set for non-diagonal matrices when both operands are SIMD enabled, the vector,
// matrix and scalar element types are SIMD combinable, and SIMD addition and multiplication are
// available for the vector/matrix element types.
2833 enum :
bool { simdEnabled = !IsDiagonal<MT>::value &&
2834 VT::simdEnabled && MT::simdEnabled &&
2835 AreSIMDCombinable<VET,MET,ST>::value &&
2836 HasSIMDAdd<VET,MET>::value &&
2837 HasSIMDMult<VET,MET>::value };
// smpAssignable: set when neither operand has to be evaluated and both are SMP assignable.
2840 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2841 !evaluateMatrix && MT::smpAssignable };
// Constructor taking the vector/matrix multiplication expression and the scalar factor.
// NOTE(review): the initializer list and the body are not visible in this excerpt.
2855 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
// Subscript operator: returns element `index` of the wrapped multiplication expression
// multiplied by the scalar factor.
2867 inline ReturnType
operator[](
size_t index )
const {
2869 return vector_[index] * scalar_;
// Checked element access: compares `index` against the size of the wrapped expression before
// delegating to the subscript operator.
// NOTE(review): the statement executed for an out-of-range index is not visible in this excerpt.
2880 inline ReturnType
at(
size_t index )
const {
2881 if( index >= vector_.size() ) {
2884 return (*
this)[index];
// Returns the size of the wrapped vector/matrix multiplication expression.
2893 inline size_t size()
const {
2894 return vector_.size();
// Returns whether the expression can alias with the given address; the query is forwarded to
// the wrapped multiplication expression.
2924 template<
typename T >
2925 inline bool canAlias(
const T* alias )
const {
2926 return vector_.canAlias( alias );
// Returns whether the expression is aliased with the given address; the query is forwarded to
// the wrapped multiplication expression.
2936 template<
typename T >
2937 inline bool isAliased(
const T* alias )
const {
2938 return vector_.isAliased( alias );
2948 return vector_.isAligned();
2958 RightOperand_<VMM> A( vector_.rightOperand() );
2960 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2961 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
2962 (
size() > SMP_TDVECTDMATMULT_THRESHOLD );
// Left-hand side operand: the wrapped vector/matrix multiplication expression.
2968 LeftOperand vector_;
// Right-hand side operand: the scalar factor applied to the product.
2969 RightOperand scalar_;
// Assignment of the scaled vector/matrix product to a dense vector.
// The operands of the wrapped multiplication are extracted, the empty-matrix cases are checked
// first, and the kernel selection is invoked with the scalar factor.
// NOTE(review): the handling inside both empty-matrix branches and the definitions of x and A
// are not visible in this excerpt.
2984 template<
typename VT1
2986 friend inline void assign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
2992 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
2993 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
2995 if( right.rows() == 0UL ) {
2999 else if( right.columns() == 0UL ) {
3011 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel dispatcher for the scaled assignment. The small kernel is chosen for diagonal matrices,
// for matrix computations that are not evaluated up front, and for matrices with fewer than
// TDVECTDMATMULT_THRESHOLD elements; the BLAS kernel selection is the other call.
// NOTE(review): the keyword separating the two calls is not visible in this excerpt.
3026 template<
typename VT1
3030 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3032 if( ( IsDiagonal<MT1>::value ) ||
3033 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3034 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3035 selectSmallAssignKernel( y, x, A, scalar );
3037 selectBlasAssignKernel( y, x, A, scalar );
// Default scaled assignment kernel: hands the expression x * A * scalar to the assign()
// member of the target vector y.
3055 template<
typename VT1
3059 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3061 y.assign( x * A * scalar );
// Non-vectorized overload of the small-matrix scaled assignment kernel. Guarded by
// DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >; forwards to the default kernel.
3079 template<
typename VT1
3083 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3084 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3086 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized small-matrix kernel for the scaled assignment y = x * A * scalar, guarded by
// EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >.
// The columns of A are processed in blocks of 8, 4, 3 and 2, followed by a single-column part.
// For every block the products x.load(i) * A.load(i,j+k) are accumulated in one SIMD register
// per column; the horizontal sum times scalar is stored in y, and a scalar loop then adds the
// scaled contributions of the rows between ipos and iend when the operands are not padded.
// NOTE(review): several lines (braces, declarations of j, i and xmm1 for the last part, the
// alternative branches of the ibegin/iend conditionals) are not visible in this excerpt.
3105 template<
typename VT1
3109 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3110 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3112 const size_t M( A.rows() );
3113 const size_t N( A.columns() );
// A scalar clean-up loop is required unless both the vector and the matrix are padded.
3115 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Blocks of eight columns ---
3119 for( ; (j+8UL) <= N; j+=8UL )
// First row for lower matrices, rounded down to a multiple of SIMDSIZE.
3121 const size_t ibegin( ( IsLower<MT1>::value )
3122 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// End row for upper matrices, derived from the last column of the block.
3124 const size_t iend( ( IsUpper<MT1>::value )
3125 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the vectorized range: iend rounded down to a multiple of SIMDSIZE if a remainder loop is needed.
3129 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3130 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3132 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3135 for( ; i<ipos; i+=SIMDSIZE ) {
3136 const SIMDType x1( x.load(i) );
3137 xmm1 = xmm1 + x1 * A.load(i,j );
3138 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3139 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3140 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3141 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3142 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3143 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3144 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Horizontal reduction of the accumulators, scaled and stored in the target elements.
3147 y[j ] =
sum( xmm1 ) * scalar;
3148 y[j+1UL] =
sum( xmm2 ) * scalar;
3149 y[j+2UL] =
sum( xmm3 ) * scalar;
3150 y[j+3UL] =
sum( xmm4 ) * scalar;
3151 y[j+4UL] =
sum( xmm5 ) * scalar;
3152 y[j+5UL] =
sum( xmm6 ) * scalar;
3153 y[j+6UL] =
sum( xmm7 ) * scalar;
3154 y[j+7UL] =
sum( xmm8 ) * scalar;
// Scalar handling of the rows that did not fit into a full SIMD step.
3156 for( ; remainder && i<iend; ++i ) {
3157 y[j ] += x[i] * A(i,j ) * scalar;
3158 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3159 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3160 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3161 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3162 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3163 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3164 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
// --- Blocks of four columns ---
3168 for( ; (j+4UL) <= N; j+=4UL )
3170 const size_t ibegin( ( IsLower<MT1>::value )
3171 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3173 const size_t iend( ( IsUpper<MT1>::value )
3174 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3178 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3179 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3181 SIMDType xmm1, xmm2, xmm3, xmm4;
3184 for( ; i<ipos; i+=SIMDSIZE ) {
3185 const SIMDType x1( x.load(i) );
3186 xmm1 = xmm1 + x1 * A.load(i,j );
3187 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3188 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3189 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3192 y[j ] =
sum( xmm1 ) * scalar;
3193 y[j+1UL] =
sum( xmm2 ) * scalar;
3194 y[j+2UL] =
sum( xmm3 ) * scalar;
3195 y[j+3UL] =
sum( xmm4 ) * scalar;
3197 for( ; remainder && i<iend; ++i ) {
3198 y[j ] += x[i] * A(i,j ) * scalar;
3199 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3200 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3201 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
// --- Blocks of three columns ---
3205 for( ; (j+3UL) <= N; j+=3UL )
3207 const size_t ibegin( ( IsLower<MT1>::value )
3208 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3210 const size_t iend( ( IsUpper<MT1>::value )
3211 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3215 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3216 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3218 SIMDType xmm1, xmm2, xmm3;
3221 for( ; i<ipos; i+=SIMDSIZE ) {
3222 const SIMDType x1( x.load(i) );
3223 xmm1 = xmm1 + x1 * A.load(i,j );
3224 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3225 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3228 y[j ] =
sum( xmm1 ) * scalar;
3229 y[j+1UL] =
sum( xmm2 ) * scalar;
3230 y[j+2UL] =
sum( xmm3 ) * scalar;
3232 for( ; remainder && i<iend; ++i ) {
3233 y[j ] += x[i] * A(i,j ) * scalar;
3234 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3235 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
// --- Blocks of two columns ---
3239 for( ; (j+2UL) <= N; j+=2UL )
3241 const size_t ibegin( ( IsLower<MT1>::value )
3242 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3244 const size_t iend( ( IsUpper<MT1>::value )
3245 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3249 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3250 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3252 SIMDType xmm1, xmm2;
3255 for( ; i<ipos; i+=SIMDSIZE ) {
3256 const SIMDType x1( x.load(i) );
3257 xmm1 = xmm1 + x1 * A.load(i,j );
3258 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3261 y[j ] =
sum( xmm1 ) * scalar;
3262 y[j+1UL] =
sum( xmm2 ) * scalar;
3264 for( ; remainder && i<iend; ++i ) {
3265 y[j ] += x[i] * A(i,j ) * scalar;
3266 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
// --- Single column (the guarding condition is not visible in this excerpt) ---
3272 const size_t ibegin( ( IsLower<MT1>::value )
3273 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3275 const size_t iend( ( IsUpper<MT1>::value )
3276 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3280 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3281 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3286 for( ; i<ipos; i+=SIMDSIZE ) {
3287 xmm1 = xmm1 + A.load(i,j) * x.load(i);
3290 y[j] =
sum( xmm1 ) * scalar;
3292 for( ; remainder && i<iend; ++i ) {
3293 y[j] += x[i] * A(i,j) * scalar;
// Non-vectorized overload of the large-matrix scaled assignment kernel. Guarded by
// DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >; forwards to the default kernel.
3313 template<
typename VT1
3317 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3318 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3320 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized large-matrix kernel for the scaled assignment, guarded by
// EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >.
// The columns of A are processed in blocks of 8, 4 and 2, followed by a single-column part.
// Within each block the rows are traversed with three SIMD loops of decreasing width
// (four, two and one SIMD vector per iteration); each iteration adds the horizontal sum of its
// partial products to y. A scalar loop handles the rows between ipos and iend when the operands
// are not padded.
// NOTE(review): `scalar` is not used in any line visible in this excerpt, and no initialization
// of y is visible either; presumably both happen in lines missing from this excerpt (confirm
// against the full file). Braces, the declarations of j and i, the alternative branches of the
// ibegin/iend conditionals and the guard of the single-column part are likewise not visible.
3339 template<
typename VT1
3343 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3344 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3346 const size_t M( A.rows() );
3347 const size_t N( A.columns() );
// A scalar clean-up loop is required unless both the vector and the matrix are padded.
3349 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Blocks of eight columns ---
3355 for( ; (j+8UL) <= N; j+=8UL )
// First row for lower matrices, rounded down to a multiple of SIMDSIZE.
3357 const size_t ibegin( ( IsLower<MT1>::value )
3358 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// End row for upper matrices, derived from the last column of the block.
3360 const size_t iend( ( IsUpper<MT1>::value )
3361 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the vectorized range: iend rounded down to a multiple of SIMDSIZE if a remainder loop is needed.
3365 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3366 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// Four SIMD vectors of rows per iteration.
3370 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3371 const size_t i1( i+SIMDSIZE );
3372 const size_t i2( i+SIMDSIZE*2UL );
3373 const size_t i3( i+SIMDSIZE*3UL );
3374 const SIMDType x1( x.load(i ) );
3375 const SIMDType x2( x.load(i1) );
3376 const SIMDType x3( x.load(i2) );
3377 const SIMDType x4( x.load(i3) );
3378 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3379 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3380 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3381 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3382 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3383 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3384 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3385 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Two SIMD vectors of rows per iteration.
3388 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3389 const size_t i1( i+SIMDSIZE );
3390 const SIMDType x1( x.load(i ) );
3391 const SIMDType x2( x.load(i1) );
3392 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3393 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3394 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3395 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3396 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3397 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3398 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3399 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// One SIMD vector of rows per iteration.
3402 for( ; i<ipos; i+=SIMDSIZE ) {
3403 const SIMDType x1( x.load(i) );
3404 y[j ] +=
sum( x1 * A.load(i,j ) );
3405 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3406 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3407 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
3408 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
3409 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
3410 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
3411 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// Scalar handling of the remaining rows.
3414 for( ; remainder && i<iend; ++i ) {
3415 y[j ] += x[i] * A(i,j );
3416 y[j+1UL] += x[i] * A(i,j+1UL);
3417 y[j+2UL] += x[i] * A(i,j+2UL);
3418 y[j+3UL] += x[i] * A(i,j+3UL);
3419 y[j+4UL] += x[i] * A(i,j+4UL);
3420 y[j+5UL] += x[i] * A(i,j+5UL);
3421 y[j+6UL] += x[i] * A(i,j+6UL);
3422 y[j+7UL] += x[i] * A(i,j+7UL);
// --- Blocks of four columns ---
3435 for( ; (j+4UL) <= N; j+=4UL )
3437 const size_t ibegin( ( IsLower<MT1>::value )
3438 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3440 const size_t iend( ( IsUpper<MT1>::value )
3441 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3445 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3446 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3450 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3451 const size_t i1( i+SIMDSIZE );
3452 const size_t i2( i+SIMDSIZE*2UL );
3453 const size_t i3( i+SIMDSIZE*3UL );
3454 const SIMDType x1( x.load(i ) );
3455 const SIMDType x2( x.load(i1) );
3456 const SIMDType x3( x.load(i2) );
3457 const SIMDType x4( x.load(i3) );
3458 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3459 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3460 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3461 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3464 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3465 const size_t i1( i+SIMDSIZE );
3466 const SIMDType x1( x.load(i ) );
3467 const SIMDType x2( x.load(i1) );
3468 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3469 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3470 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3471 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3474 for( ; i<ipos; i+=SIMDSIZE ) {
3475 const SIMDType x1( x.load(i) );
3476 y[j ] +=
sum( x1 * A.load(i,j ) );
3477 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3478 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3479 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
3482 for( ; remainder && i<iend; ++i ) {
3483 y[j ] += x[i] * A(i,j );
3484 y[j+1UL] += x[i] * A(i,j+1UL);
3485 y[j+2UL] += x[i] * A(i,j+2UL);
3486 y[j+3UL] += x[i] * A(i,j+3UL);
// --- Blocks of two columns ---
3495 for( ; (j+2UL) <= N; j+=2UL )
3497 const size_t ibegin( ( IsLower<MT1>::value )
3498 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3500 const size_t iend( ( IsUpper<MT1>::value )
3501 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3505 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3506 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3510 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3511 const size_t i1( i+SIMDSIZE );
3512 const size_t i2( i+SIMDSIZE*2UL );
3513 const size_t i3( i+SIMDSIZE*3UL );
3514 const SIMDType x1( x.load(i ) );
3515 const SIMDType x2( x.load(i1) );
3516 const SIMDType x3( x.load(i2) );
3517 const SIMDType x4( x.load(i3) );
3518 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3519 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3522 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3523 const size_t i1( i+SIMDSIZE );
3524 const SIMDType x1( x.load(i ) );
3525 const SIMDType x2( x.load(i1) );
3526 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3527 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3530 for( ; i<ipos; i+=SIMDSIZE ) {
3531 const SIMDType x1( x.load(i) );
3532 y[j ] +=
sum( x1 * A.load(i,j ) );
3533 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3536 for( ; remainder && i<iend; ++i ) {
3537 y[j ] += x[i] * A(i,j );
3538 y[j+1UL] += x[i] * A(i,j+1UL);
// --- Single column (the guarding condition is not visible in this excerpt) ---
3547 const size_t ibegin( ( IsLower<MT1>::value )
3548 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3550 const size_t iend( ( IsUpper<MT1>::value )
3551 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3555 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3556 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3560 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3561 const size_t i1( i+SIMDSIZE );
3562 const size_t i2( i+SIMDSIZE*2UL );
3563 const size_t i3( i+SIMDSIZE*3UL );
3564 const SIMDType x1( x.load(i ) );
3565 const SIMDType x2( x.load(i1) );
3566 const SIMDType x3( x.load(i2) );
3567 const SIMDType x4( x.load(i3) );
3568 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
3571 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3572 const size_t i1( i+SIMDSIZE );
3573 const SIMDType x1( x.load(i ) );
3574 const SIMDType x2( x.load(i1) );
3575 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
3578 for( ; i<ipos; i+=SIMDSIZE ) {
3579 const SIMDType x1( x.load(i) );
3580 y[j] +=
sum( x1 * A.load(i,j) );
3583 for( ; remainder && i<iend; ++i ) {
3584 y[j] += x[i] * A(i,j);
// Fallback overload of the BLAS scaled assignment kernel. Guarded by
// DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >; forwards to selectLargeAssignKernel().
3605 template<
typename VT1
3609 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3610 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3612 selectLargeAssignKernel( y, x, A, scalar );
// BLAS-based assign kernel for the scaled product. The definition is guarded
// by the preprocessor condition below, i.e. it requires both BLAZE_BLAS_MODE
// and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION to be non-zero.
3617 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// Enabled (through EnableIf_) only when UseBlasKernel<VT1,VT2,MT1,ST2> holds.
3631 template<
typename VT1
3635 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3636 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// ET is the element type of the target vector; the scalar is converted to it
// before being handed to gemv().
3638 typedef ElementType_<VT1> ET;
// Triangular matrix: first store the scaled vector (scalar * x) in y, then
// call trmv() on y and A with CblasLower or CblasUpper chosen by IsLower<MT1>
// (presumably an in-place triangular multiplication -- confirm against the
// trmv() wrapper).
3640 if( IsTriangular<MT1>::value ) {
3641 assign( y, scalar * x );
3642 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// General matrix: single gemv() call with ET(scalar) and ET(0) as trailing
// arguments (presumably the alpha/beta factors, so the previous content of y
// is discarded -- confirm against the gemv() wrapper).
3645 gemv( y, x, A, ET(scalar), ET(0) );
// Assignment of the scaled vector/matrix product to a sparse vector: the
// expression is evaluated serially (serial( rhs )) into a temporary of type
// ResultType, and that temporary is then assigned to the target (~lhs).
3663 template<
typename VT1
3665 friend inline void assign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
3675 const ResultType tmp(
serial( rhs ) );
3676 assign( ~lhs, tmp );
// Addition assignment of the scaled vector/matrix product to a dense vector.
// The two operands of the wrapped product (rhs.vector_) are extracted, a
// matrix with zero rows or zero columns is special-cased, and the actual work
// is delegated to selectAddAssignKernel() together with rhs.scalar_.
// NOTE(review): the body of the emptiness check and the definitions of x and
// A are not visible in this excerpt.
3692 template<
typename VT1
3694 friend inline void addAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
3700 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3701 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3703 if( right.rows() == 0UL || right.columns() == 0UL ) {
3715 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel selection for the addition assignment of the scaled product.
// selectSmallAddAssignKernel() is chosen when MT1 is diagonal, when MT is a
// computation that is not evaluated (IsComputation<MT> && !evaluateMatrix), or
// when A.rows() * A.columns() is below TDVECDMATMULT_THRESHOLD.
// selectBlasAddAssignKernel() is the alternative.
// NOTE(review): no `else` is visible between the two calls in this excerpt;
// presumably it sits on a line that was dropped -- confirm.
3730 template<
typename VT1
3734 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3736 if( ( IsDiagonal<MT1>::value ) ||
3737 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3738 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3739 selectSmallAddAssignKernel( y, x, A, scalar );
3741 selectBlasAddAssignKernel( y, x, A, scalar );
// Default (non-vectorized) addition assignment kernel: builds the expression
// x * A * scalar and passes it to y.addAssign().
3759 template<
typename VT1
3763 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3765 y.addAssign( x * A * scalar );
// Fallback overload of the small addition assignment kernel. It is enabled
// (through DisableIf_) only when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2>
// does not hold and forwards to selectDefaultAddAssignKernel().
3783 template<
typename VT1
3787 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3788 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3790 selectDefaultAddAssignKernel( y, x, A, scalar );
// Vectorized small addition assignment kernel (enabled through EnableIf_ when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds).
// Columns of A are handled in groups of 8, 4, 3, 2 and finally one at a time.
// For each group the products x.load(i) * A.load(i,j+k) are accumulated in one
// SIMD register per column; the horizontal sum of each register, multiplied by
// scalar, is then added to y[j+k].
// NOTE(review): the declarations/initializations of j and i and the else
// branches of the ibegin/iend ternaries are not visible in this excerpt.
3809 template<
typename VT1
3813 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3814 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3816 const size_t M( A.rows() );
3817 const size_t N( A.columns() );
// A scalar clean-up loop is only needed if x or A is not padded.
3819 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Groups of 8 columns ---------------------------------------------------
3823 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: first row is j (j+1 if strictly lower), masked with
// size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE (assuming
// SIMDSIZE is a power of two).
3825 const size_t ibegin( ( IsLower<MT1>::value )
3826 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// Upper matrices: the row range ends at the last column of the group
// (one less if strictly upper).
3828 const size_t iend( ( IsUpper<MT1>::value )
3829 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the SIMD part: iend rounded down to a multiple of SIMDSIZE when a
// remainder loop is required (checked by the assertion), otherwise iend.
3833 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3834 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// One accumulator per column of the group.
3836 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// SIMD part: one load of x is reused for all eight columns.
3839 for( ; i<ipos; i+=SIMDSIZE ) {
3840 const SIMDType x1( x.load(i) );
3841 xmm1 = xmm1 + x1 * A.load(i,j );
3842 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3843 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3844 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3845 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3846 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3847 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3848 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator, scaled and added to the target.
3851 y[j ] +=
sum( xmm1 ) * scalar;
3852 y[j+1UL] +=
sum( xmm2 ) * scalar;
3853 y[j+2UL] +=
sum( xmm3 ) * scalar;
3854 y[j+3UL] +=
sum( xmm4 ) * scalar;
3855 y[j+4UL] +=
sum( xmm5 ) * scalar;
3856 y[j+5UL] +=
sum( xmm6 ) * scalar;
3857 y[j+6UL] +=
sum( xmm7 ) * scalar;
3858 y[j+7UL] +=
sum( xmm8 ) * scalar;
// Scalar clean-up for the rows between ipos and iend (skipped when padded).
3860 for( ; remainder && i<iend; ++i ) {
3861 y[j ] += x[i] * A(i,j ) * scalar;
3862 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3863 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3864 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3865 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3866 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3867 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3868 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
// --- Groups of 4 columns (same scheme as above) ----------------------------
3872 for( ; (j+4UL) <= N; j+=4UL )
3874 const size_t ibegin( ( IsLower<MT1>::value )
3875 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3877 const size_t iend( ( IsUpper<MT1>::value )
3878 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3882 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3883 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3885 SIMDType xmm1, xmm2, xmm3, xmm4;
3888 for( ; i<ipos; i+=SIMDSIZE ) {
3889 const SIMDType x1( x.load(i) );
3890 xmm1 = xmm1 + x1 * A.load(i,j );
3891 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3892 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3893 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3896 y[j ] +=
sum( xmm1 ) * scalar;
3897 y[j+1UL] +=
sum( xmm2 ) * scalar;
3898 y[j+2UL] +=
sum( xmm3 ) * scalar;
3899 y[j+3UL] +=
sum( xmm4 ) * scalar;
3901 for( ; remainder && i<iend; ++i ) {
3902 y[j ] += x[i] * A(i,j ) * scalar;
3903 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3904 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3905 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
// --- Groups of 3 columns ---------------------------------------------------
3909 for( ; (j+3UL) <= N; j+=3UL )
3911 const size_t ibegin( ( IsLower<MT1>::value )
3912 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3914 const size_t iend( ( IsUpper<MT1>::value )
3915 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3919 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3920 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3922 SIMDType xmm1, xmm2, xmm3;
3925 for( ; i<ipos; i+=SIMDSIZE ) {
3926 const SIMDType x1( x.load(i) );
3927 xmm1 = xmm1 + x1 * A.load(i,j );
3928 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3929 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3932 y[j ] +=
sum( xmm1 ) * scalar;
3933 y[j+1UL] +=
sum( xmm2 ) * scalar;
3934 y[j+2UL] +=
sum( xmm3 ) * scalar;
3936 for( ; remainder && i<iend; ++i ) {
3937 y[j ] += x[i] * A(i,j ) * scalar;
3938 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3939 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
// --- Groups of 2 columns ---------------------------------------------------
3943 for( ; (j+2UL) <= N; j+=2UL )
3945 const size_t ibegin( ( IsLower<MT1>::value )
3946 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3948 const size_t iend( ( IsUpper<MT1>::value )
3949 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3953 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3954 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3956 SIMDType xmm1, xmm2;
3959 for( ; i<ipos; i+=SIMDSIZE ) {
3960 const SIMDType x1( x.load(i) );
3961 xmm1 = xmm1 + x1 * A.load(i,j );
3962 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3965 y[j ] +=
sum( xmm1 ) * scalar;
3966 y[j+1UL] +=
sum( xmm2 ) * scalar;
3968 for( ; remainder && i<iend; ++i ) {
3969 y[j ] += x[i] * A(i,j ) * scalar;
3970 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
// --- Single column ----------------------------------------------------------
// NOTE(review): the condition guarding this part and the declaration of xmm1
// are not visible in this excerpt.
3976 const size_t ibegin( ( IsLower<MT1>::value )
3977 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
3979 const size_t iend( ( IsUpper<MT1>::value )
3980 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3984 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3985 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3990 for( ; i<ipos; i+=SIMDSIZE ) {
3991 xmm1 = xmm1 + A.load(i,j) * x.load(i);
3994 y[j] +=
sum( xmm1 ) * scalar;
3996 for( ; remainder && i<iend; ++i ) {
3997 y[j] += x[i] * A(i,j) * scalar;
// Fallback overload of the large addition assignment kernel. It is enabled
// (through DisableIf_) only when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2>
// does not hold and forwards to selectDefaultAddAssignKernel().
4017 template<
typename VT1
4021 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4022 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4024 selectDefaultAddAssignKernel( y, x, A, scalar );
// Vectorized large addition assignment kernel (enabled through EnableIf_ when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds).
// Columns of A are handled in groups of 8, 4, 2 and finally one at a time.
// Within a group the rows are traversed with a 4x unrolled SIMD loop, then a
// 2x unrolled SIMD loop, then a single-vector SIMD loop and finally a scalar
// remainder loop. Unlike the small kernel, every loop iteration reduces its
// partial result with sum(), scales it and adds it directly to y[j+k].
// NOTE(review): the declarations/initializations of j and i and the else
// branches of the ibegin/iend ternaries are not visible in this excerpt.
4043 template<
typename VT1
4047 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4048 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4050 const size_t M( A.rows() );
4051 const size_t N( A.columns() );
// A scalar clean-up loop is only needed if x or A is not padded.
4053 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Groups of 8 columns ---------------------------------------------------
4057 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: first row is j (j+1 if strictly lower), masked with
// size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE (assuming
// SIMDSIZE is a power of two).
4059 const size_t ibegin( ( IsLower<MT1>::value )
4060 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// Upper matrices: the row range ends at the last column of the group
// (one less if strictly upper).
4062 const size_t iend( ( IsUpper<MT1>::value )
4063 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the SIMD part: iend rounded down to a multiple of SIMDSIZE when a
// remainder loop is required (checked by the assertion), otherwise iend.
4067 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4068 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// 4x unrolled SIMD loop: four consecutive vectors of x per iteration.
4072 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4073 const size_t i1( i+SIMDSIZE );
4074 const size_t i2( i+SIMDSIZE*2UL );
4075 const size_t i3( i+SIMDSIZE*3UL );
4076 const SIMDType x1( x.load(i ) );
4077 const SIMDType x2( x.load(i1) );
4078 const SIMDType x3( x.load(i2) );
4079 const SIMDType x4( x.load(i3) );
4080 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4081 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4082 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4083 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4084 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4085 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4086 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4087 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
// 2x unrolled SIMD loop.
4090 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4091 const size_t i1( i+SIMDSIZE );
4092 const SIMDType x1( x.load(i ) );
4093 const SIMDType x2( x.load(i1) );
4094 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4095 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4096 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4097 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4098 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4099 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4100 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4101 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
// Single-vector SIMD loop.
4104 for( ; i<ipos; i+=SIMDSIZE ) {
4105 const SIMDType x1( x.load(i) );
4106 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4107 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4108 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4109 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4110 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4111 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4112 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4113 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) ) * scalar;
// Scalar clean-up for the rows between ipos and iend (skipped when padded).
4116 for( ; remainder && i<iend; ++i ) {
4117 y[j ] += x[i] * A(i,j ) * scalar;
4118 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4119 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4120 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4121 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4122 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4123 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4124 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
// --- Groups of 4 columns (same scheme as above) ----------------------------
4128 for( ; (j+4UL) <= N; j+=4UL )
4130 const size_t ibegin( ( IsLower<MT1>::value )
4131 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
4133 const size_t iend( ( IsUpper<MT1>::value )
4134 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4138 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4139 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4143 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4144 const size_t i1( i+SIMDSIZE );
4145 const size_t i2( i+SIMDSIZE*2UL );
4146 const size_t i3( i+SIMDSIZE*3UL );
4147 const SIMDType x1( x.load(i ) );
4148 const SIMDType x2( x.load(i1) );
4149 const SIMDType x3( x.load(i2) );
4150 const SIMDType x4( x.load(i3) );
4151 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4152 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4153 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4154 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4157 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4158 const size_t i1( i+SIMDSIZE );
4159 const SIMDType x1( x.load(i ) );
4160 const SIMDType x2( x.load(i1) );
4161 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4162 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4163 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4164 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4167 for( ; i<ipos; i+=SIMDSIZE ) {
4168 const SIMDType x1( x.load(i) );
4169 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4170 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4171 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4172 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4175 for( ; remainder && i<iend; ++i ) {
4176 y[j ] += x[i] * A(i,j ) * scalar;
4177 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4178 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4179 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
// --- Groups of 2 columns ---------------------------------------------------
4183 for( ; (j+2UL) <= N; j+=2UL )
4185 const size_t ibegin( ( IsLower<MT1>::value )
4186 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
4188 const size_t iend( ( IsUpper<MT1>::value )
4189 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4193 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4194 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4198 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4199 const size_t i1( i+SIMDSIZE );
4200 const size_t i2( i+SIMDSIZE*2UL );
4201 const size_t i3( i+SIMDSIZE*3UL );
4202 const SIMDType x1( x.load(i ) );
4203 const SIMDType x2( x.load(i1) );
4204 const SIMDType x3( x.load(i2) );
4205 const SIMDType x4( x.load(i3) );
4206 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4207 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4210 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4211 const size_t i1( i+SIMDSIZE );
4212 const SIMDType x1( x.load(i ) );
4213 const SIMDType x2( x.load(i1) );
4214 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4215 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4218 for( ; i<ipos; i+=SIMDSIZE ) {
4219 const SIMDType x1( x.load(i) );
4220 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4221 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4224 for( ; remainder && i<iend; ++i ) {
4225 y[j ] += x[i] * A(i,j ) * scalar;
4226 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
// --- Single column ----------------------------------------------------------
// NOTE(review): the condition guarding this part is not visible in this
// excerpt.
4232 const size_t ibegin( ( IsLower<MT1>::value )
4233 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
4235 const size_t iend( ( IsUpper<MT1>::value )
4236 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4240 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4241 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4245 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4246 const size_t i1( i+SIMDSIZE );
4247 const size_t i2( i+SIMDSIZE*2UL );
4248 const size_t i3( i+SIMDSIZE*3UL );
4249 const SIMDType x1( x.load(i ) );
4250 const SIMDType x2( x.load(i1) );
4251 const SIMDType x3( x.load(i2) );
4252 const SIMDType x4( x.load(i3) );
4253 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4256 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4257 const size_t i1( i+SIMDSIZE );
4258 const SIMDType x1( x.load(i ) );
4259 const SIMDType x2( x.load(i1) );
4260 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4263 for( ; i<ipos; i+=SIMDSIZE ) {
4264 const SIMDType x1( x.load(i) );
4265 y[j] +=
sum( x1 * A.load(i,j) ) * scalar;
4268 for( ; remainder && i<iend; ++i ) {
4269 y[j] += x[i] * A(i,j) * scalar;
// Fallback overload of the BLAS addition assignment kernel. It is enabled
// (through DisableIf_) only when UseBlasKernel<VT1,VT2,MT1,ST2> does not hold
// and forwards to selectLargeAddAssignKernel().
4290 template<
typename VT1
4294 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4295 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4297 selectLargeAddAssignKernel( y, x, A, scalar );
// BLAS-based addition assignment kernel for the scaled product. The definition
// is guarded by the preprocessor condition below, i.e. it requires both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION to be
// non-zero.
4302 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// Enabled (through EnableIf_) only when UseBlasKernel<VT1,VT2,MT1,ST2> holds.
4316 template<
typename VT1
4320 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4321 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4323 typedef ElementType_<VT1> ET;
// Triangular matrix: y must not be overwritten, so the scaled vector is
// evaluated serially into a temporary, trmv() is applied to that temporary
// (CblasLower/CblasUpper chosen by IsLower<MT1>), and the result is added to y.
4325 if( IsTriangular<MT1>::value ) {
4326 ResultType_<VT1> tmp(
serial( scalar * x ) );
4327 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4328 addAssign( y, tmp );
// General matrix: single gemv() call with ET(scalar) and ET(1) as trailing
// arguments (presumably alpha/beta, so the product is accumulated onto y --
// confirm against the gemv() wrapper).
4331 gemv( y, x, A, ET(scalar), ET(1) );
// Subtraction assignment of the scaled vector/matrix product to a dense
// vector. The two operands of the wrapped product (rhs.vector_) are extracted,
// a matrix with zero rows or zero columns is special-cased, and the actual
// work is delegated to selectSubAssignKernel() together with rhs.scalar_.
// NOTE(review): the body of the emptiness check and the definitions of x and
// A are not visible in this excerpt.
4353 template<
typename VT1
4355 friend inline void subAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
4361 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4362 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4364 if( right.rows() == 0UL || right.columns() == 0UL ) {
4376 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel selection for the subtraction assignment of the scaled product.
// selectSmallSubAssignKernel() is chosen when MT1 is diagonal, when MT is a
// computation that is not evaluated (IsComputation<MT> && !evaluateMatrix), or
// when A.rows() * A.columns() is below TDVECDMATMULT_THRESHOLD.
// selectBlasSubAssignKernel() is the alternative.
// NOTE(review): no `else` is visible between the two calls in this excerpt;
// presumably it sits on a line that was dropped -- confirm.
4391 template<
typename VT1
4395 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4397 if( ( IsDiagonal<MT1>::value ) ||
4398 ( IsComputation<MT>::value && !evaluateMatrix ) ||
4399 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
4400 selectSmallSubAssignKernel( y, x, A, scalar );
4402 selectBlasSubAssignKernel( y, x, A, scalar );
// Default (non-vectorized) subtraction assignment kernel: builds the
// expression x * A * scalar and passes it to y.subAssign().
4420 template<
typename VT1
4424 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4426 y.subAssign( x * A * scalar );
// Fallback overload of the small subtraction assignment kernel. It is enabled
// (through DisableIf_) only when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2>
// does not hold and forwards to selectDefaultSubAssignKernel().
4444 template<
typename VT1
4448 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4449 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4451 selectDefaultSubAssignKernel( y, x, A, scalar );
// Vectorized small subtraction assignment kernel (enabled through EnableIf_
// when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds).
// Columns of A are handled in groups of 8, 4, 3, 2 and finally one at a time.
// For each group the products x.load(i) * A.load(i,j+k) are accumulated in one
// SIMD register per column; the horizontal sum of each register, multiplied by
// scalar, is then subtracted from y[j+k].
// NOTE(review): the declarations/initializations of j and i and the else
// branches of the ibegin/iend ternaries are not visible in this excerpt.
4470 template<
typename VT1
4474 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4475 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4477 const size_t M( A.rows() );
4478 const size_t N( A.columns() );
// A scalar clean-up loop is only needed if x or A is not padded.
4480 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Groups of 8 columns ---------------------------------------------------
4484 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: first row is j (j+1 if strictly lower), masked with
// size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE (assuming
// SIMDSIZE is a power of two).
4486 const size_t ibegin( ( IsLower<MT1>::value )
4487 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// Upper matrices: the row range ends at the last column of the group
// (one less if strictly upper).
4489 const size_t iend( ( IsUpper<MT1>::value )
4490 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the SIMD part: iend rounded down to a multiple of SIMDSIZE when a
// remainder loop is required (checked by the assertion), otherwise iend.
4494 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4495 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// One accumulator per column of the group.
4497 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// SIMD part: one load of x is reused for all eight columns.
4500 for( ; i<ipos; i+=SIMDSIZE ) {
4501 const SIMDType x1( x.load(i) );
4502 xmm1 = xmm1 + x1 * A.load(i,j );
4503 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4504 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4505 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
4506 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
4507 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
4508 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
4509 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator, scaled and subtracted from the
// target.
4512 y[j ] -=
sum( xmm1 ) * scalar;
4513 y[j+1UL] -=
sum( xmm2 ) * scalar;
4514 y[j+2UL] -=
sum( xmm3 ) * scalar;
4515 y[j+3UL] -=
sum( xmm4 ) * scalar;
4516 y[j+4UL] -=
sum( xmm5 ) * scalar;
4517 y[j+5UL] -=
sum( xmm6 ) * scalar;
4518 y[j+6UL] -=
sum( xmm7 ) * scalar;
4519 y[j+7UL] -=
sum( xmm8 ) * scalar;
// Scalar clean-up for the rows between ipos and iend (skipped when padded).
4521 for( ; remainder && i<iend; ++i ) {
4522 y[j ] -= x[i] * A(i,j ) * scalar;
4523 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4524 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4525 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4526 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4527 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4528 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4529 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
// --- Groups of 4 columns (same scheme as above) ----------------------------
4533 for( ; (j+4UL) <= N; j+=4UL )
4535 const size_t ibegin( ( IsLower<MT1>::value )
4536 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
4538 const size_t iend( ( IsUpper<MT1>::value )
4539 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4543 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4544 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4546 SIMDType xmm1, xmm2, xmm3, xmm4;
4549 for( ; i<ipos; i+=SIMDSIZE ) {
4550 const SIMDType x1( x.load(i) );
4551 xmm1 = xmm1 + x1 * A.load(i,j );
4552 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4553 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4554 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
4557 y[j ] -=
sum( xmm1 ) * scalar;
4558 y[j+1UL] -=
sum( xmm2 ) * scalar;
4559 y[j+2UL] -=
sum( xmm3 ) * scalar;
4560 y[j+3UL] -=
sum( xmm4 ) * scalar;
4562 for( ; remainder && i<iend; ++i ) {
4563 y[j ] -= x[i] * A(i,j ) * scalar;
4564 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4565 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4566 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
// --- Groups of 3 columns ---------------------------------------------------
4570 for( ; (j+3UL) <= N; j+=3UL )
4572 const size_t ibegin( ( IsLower<MT1>::value )
4573 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
4575 const size_t iend( ( IsUpper<MT1>::value )
4576 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
4580 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4581 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4583 SIMDType xmm1, xmm2, xmm3;
4586 for( ; i<ipos; i+=SIMDSIZE ) {
4587 const SIMDType x1( x.load(i) );
4588 xmm1 = xmm1 + x1 * A.load(i,j );
4589 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4590 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4593 y[j ] -=
sum( xmm1 ) * scalar;
4594 y[j+1UL] -=
sum( xmm2 ) * scalar;
4595 y[j+2UL] -=
sum( xmm3 ) * scalar;
4597 for( ; remainder && i<iend; ++i ) {
4598 y[j ] -= x[i] * A(i,j ) * scalar;
4599 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4600 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
// --- Groups of 2 columns ---------------------------------------------------
4604 for( ; (j+2UL) <= N; j+=2UL )
4606 const size_t ibegin( ( IsLower<MT1>::value )
4607 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
4609 const size_t iend( ( IsUpper<MT1>::value )
4610 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4614 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4615 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4617 SIMDType xmm1, xmm2;
4620 for( ; i<ipos; i+=SIMDSIZE ) {
4621 const SIMDType x1( x.load(i) );
4622 xmm1 = xmm1 + x1 * A.load(i,j );
4623 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4626 y[j ] -=
sum( xmm1 ) * scalar;
4627 y[j+1UL] -=
sum( xmm2 ) * scalar;
4629 for( ; remainder && i<iend; ++i ) {
4630 y[j ] -= x[i] * A(i,j ) * scalar;
4631 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
// --- Single column ----------------------------------------------------------
// NOTE(review): the condition guarding this part and the declaration of xmm1
// are not visible in this excerpt.
4637 const size_t ibegin( ( IsLower<MT1>::value )
4638 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
4640 const size_t iend( ( IsUpper<MT1>::value )
4641 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4645 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4646 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4651 for( ; i<ipos; i+=SIMDSIZE ) {
4652 xmm1 = xmm1 + A.load(i,j) * x.load(i);
4655 y[j] -=
sum( xmm1 ) * scalar;
4657 for( ; remainder && i<iend; ++i ) {
4658 y[j] -= x[i] * A(i,j) * scalar;
// Fallback overload of the large subtraction assignment kernel. It is enabled
// (through DisableIf_) only when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2>
// does not hold and forwards to selectDefaultSubAssignKernel().
4678 template<
typename VT1
4682 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4683 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4685 selectDefaultSubAssignKernel( y, x, A, scalar );
// Vectorized large subtraction assignment kernel (enabled through EnableIf_
// when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds).
// Columns of A are handled in groups of 8, 4, 2 and finally one at a time.
// Within a group the rows are traversed with a 4x unrolled SIMD loop, then a
// 2x unrolled SIMD loop, then a single-vector SIMD loop and finally a scalar
// remainder loop. Every loop iteration reduces its partial result with sum(),
// scales it and subtracts it directly from y[j+k].
// NOTE(review): the declarations/initializations of j and i and the else
// branches of the ibegin/iend ternaries are not visible in this excerpt.
4704 template<
typename VT1
4708 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4709 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4711 const size_t M( A.rows() );
4712 const size_t N( A.columns() );
// A scalar clean-up loop is only needed if x or A is not padded.
4714 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Groups of 8 columns ---------------------------------------------------
4718 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: first row is j (j+1 if strictly lower), masked with
// size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE (assuming
// SIMDSIZE is a power of two).
4720 const size_t ibegin( ( IsLower<MT1>::value )
4721 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
// Upper matrices: the row range ends at the last column of the group
// (one less if strictly upper).
4723 const size_t iend( ( IsUpper<MT1>::value )
4724 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the SIMD part: iend rounded down to a multiple of SIMDSIZE when a
// remainder loop is required (checked by the assertion), otherwise iend.
4728 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4729 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// 4x unrolled SIMD loop: four consecutive vectors of x per iteration.
4733 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4734 const size_t i1( i+SIMDSIZE );
4735 const size_t i2( i+SIMDSIZE*2UL );
4736 const size_t i3( i+SIMDSIZE*3UL );
4737 const SIMDType x1( x.load(i ) );
4738 const SIMDType x2( x.load(i1) );
4739 const SIMDType x3( x.load(i2) );
4740 const SIMDType x4( x.load(i3) );
4741 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4742 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4743 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4744 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4745 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4746 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4747 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4748 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
// 2x unrolled SIMD loop.
4751 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4752 const size_t i1( i+SIMDSIZE );
4753 const SIMDType x1( x.load(i ) );
4754 const SIMDType x2( x.load(i1) );
4755 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4756 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4757 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4758 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4759 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4760 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4761 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4762 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
// Single-vector SIMD loop.
4765 for( ; i<ipos; i+=SIMDSIZE ) {
4766 const SIMDType x1( x.load(i) );
4767 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4768 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4769 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4770 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4771 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4772 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4773 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4774 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) ) * scalar;
// Scalar clean-up for the rows between ipos and iend (skipped when padded).
4777 for( ; remainder && i<iend; ++i ) {
4778 y[j ] -= x[i] * A(i,j ) * scalar;
4779 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4780 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4781 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4782 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4783 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4784 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4785 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
// --- Groups of 4 columns (same scheme as above) ----------------------------
4789 for( ; (j+4UL) <= N; j+=4UL )
4791 const size_t ibegin( ( IsLower<MT1>::value )
4792 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
4794 const size_t iend( ( IsUpper<MT1>::value )
4795 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4799 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4800 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4804 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4805 const size_t i1( i+SIMDSIZE );
4806 const size_t i2( i+SIMDSIZE*2UL );
4807 const size_t i3( i+SIMDSIZE*3UL );
4808 const SIMDType x1( x.load(i ) );
4809 const SIMDType x2( x.load(i1) );
4810 const SIMDType x3( x.load(i2) );
4811 const SIMDType x4( x.load(i3) );
4812 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4813 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4814 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4815 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4818 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4819 const size_t i1( i+SIMDSIZE );
4820 const SIMDType x1( x.load(i ) );
4821 const SIMDType x2( x.load(i1) );
4822 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4823 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4824 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4825 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4828 for( ; i<ipos; i+=SIMDSIZE ) {
4829 const SIMDType x1( x.load(i) );
4830 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4831 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4832 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4833 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4836 for( ; remainder && i<iend; ++i ) {
4837 y[j ] -= x[i] * A(i,j ) * scalar;
4838 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4839 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4840 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
// --- Groups of 2 columns ---------------------------------------------------
4844 for( ; (j+2UL) <= N; j+=2UL )
4846 const size_t ibegin( ( IsLower<MT1>::value )
4847 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
4849 const size_t iend( ( IsUpper<MT1>::value )
4850 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4854 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4855 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4859 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4860 const size_t i1( i+SIMDSIZE );
4861 const size_t i2( i+SIMDSIZE*2UL );
4862 const size_t i3( i+SIMDSIZE*3UL );
4863 const SIMDType x1( x.load(i ) );
4864 const SIMDType x2( x.load(i1) );
4865 const SIMDType x3( x.load(i2) );
4866 const SIMDType x4( x.load(i3) );
4867 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4868 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4871 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4872 const size_t i1( i+SIMDSIZE );
4873 const SIMDType x1( x.load(i ) );
4874 const SIMDType x2( x.load(i1) );
4875 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4876 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4879 for( ; i<ipos; i+=SIMDSIZE ) {
4880 const SIMDType x1( x.load(i) );
4881 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4882 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4885 for( ; remainder && i<iend; ++i ) {
4886 y[j ] -= x[i] * A(i,j ) * scalar;
4887 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
// --- Single column ----------------------------------------------------------
// NOTE(review): the condition guarding this part is not visible in this
// excerpt.
4893 const size_t ibegin( ( IsLower<MT1>::value )
4894 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-SIMDSIZE) )
4896 const size_t iend( ( IsUpper<MT1>::value )
4897 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4901 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4902 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4906 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4907 const size_t i1( i+SIMDSIZE );
4908 const size_t i2( i+SIMDSIZE*2UL );
4909 const size_t i3( i+SIMDSIZE*3UL );
4910 const SIMDType x1( x.load(i ) );
4911 const SIMDType x2( x.load(i1) );
4912 const SIMDType x3( x.load(i2) );
4913 const SIMDType x4( x.load(i3) );
4914 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4917 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4918 const size_t i1( i+SIMDSIZE );
4919 const SIMDType x1( x.load(i ) );
4920 const SIMDType x2( x.load(i1) );
4921 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4924 for( ; i<ipos; i+=SIMDSIZE ) {
4925 const SIMDType x1( x.load(i) );
4926 y[j] -=
sum( x1 * A.load(i,j) ) * scalar;
4929 for( ; remainder && i<iend; ++i ) {
4930 y[j] -= x[i] * A(i,j) * scalar;
// Fallback overload of the BLAS subtraction assignment kernel. It is enabled
// (through DisableIf_) only when UseBlasKernel<VT1,VT2,MT1,ST2> does not hold
// and forwards to selectLargeSubAssignKernel().
4951 template<
typename VT1
4955 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4956 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4958 selectLargeSubAssignKernel( y, x, A, scalar );
4963 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4977 template<
typename VT1
4981 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4982 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4984 typedef ElementType_<VT1> ET;
4986 if( IsTriangular<MT1>::value ) {
4987 ResultType_<VT1> tmp(
serial( scalar * x ) );
4988 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4989 subAssign( y, tmp );
4992 gemv( y, x, A, ET(-scalar), ET(1) );
5014 template<
typename VT1
5016 friend inline void multAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5026 const ResultType tmp(
serial( rhs ) );
5027 multAssign( ~lhs, tmp );
5047 template<
typename VT1
5049 friend inline void divAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5059 const ResultType tmp(
serial( rhs ) );
5060 divAssign( ~lhs, tmp );
5082 template<
typename VT1
5084 friend inline EnableIf_< UseSMPAssign<VT1> >
5085 smpAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5091 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5092 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5094 if( right.rows() == 0UL ) {
5098 else if( right.columns() == 0UL ) {
5128 template<
typename VT1
5130 friend inline EnableIf_< UseSMPAssign<VT1> >
5131 smpAssign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5141 const ResultType tmp( rhs );
5160 template<
typename VT1
5162 friend inline EnableIf_< UseSMPAssign<VT1> >
5163 smpAddAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5169 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5170 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5172 if( right.rows() == 0UL || right.columns() == 0UL ) {
5206 template<
typename VT1
5208 friend inline EnableIf_< UseSMPAssign<VT1> >
5209 smpSubAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5215 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5216 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5218 if( right.rows() == 0UL || right.columns() == 0UL ) {
5252 template<
typename VT1
5254 friend inline EnableIf_< UseSMPAssign<VT1> >
5255 smpMultAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5265 const ResultType tmp( rhs );
5288 template<
typename VT1
5290 friend inline EnableIf_< UseSMPAssign<VT1> >
5291 smpDivAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5301 const ResultType tmp( rhs );
5364 template<
typename T1
5366 inline const DisableIf_< IsMatMatMultExpr<T2>, TDVecTDMatMultExpr<T1,T2> >
5371 if( (~vec).
size() != (~mat).
rows() ) {
5390 template<
typename VT,
typename MT >
5407 template<
typename VT,
typename MT >
5409 :
public BoolConstant< And< IsAligned<VT>, IsAligned<MT> >::value >
5425 template<
typename VT,
typename MT,
bool AF >
5430 using Type = MultExprTrait_< SubvectorExprTrait_<const VT,AF>
5431 , SubmatrixExprTrait_<const MT,AF> >;
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:353
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:341
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:138
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A = B * C$ ).
Definition: DMatDMatMultExpr.h:7800
Header file for basic type definitions.
BLAZE_ALWAYS_INLINE const complex< int8_t > sum(const SIMDcint8 &a) noexcept
Returns the sum of all elements in the 8-bit integral complex SIMD vector.
Definition: Reduction.h:63
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:130
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
TDVecTDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:250
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:212
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:383
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:211
If_< IsExpression< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:215
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:363
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:264
Header file for the If class template.
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:221
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:135
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the Columns type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
TDVecTDMatMultExpr< VT, MT > This
Type of this TDVecTDMatMultExpr instance.
Definition: TDVecTDMatMultExpr.h:206
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraint on the data type.
Header file for the exception macros of the math module.
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:207
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:133
If_< IsExpression< MT >, const MT, const MT & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:218
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecTDMatMultExpr.h:296
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:131
Header file for the SubmatrixExprTrait class template.
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:384
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecTDMatMultExpr.h:210
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:160
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:132
Header file for the TVecMatMultExpr base class.
Constraints on the storage order of matrix types.
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:74
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:319
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:314
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:110
Header file for the AreSIMDCombinable type trait.
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:134
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:208
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:309
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:209
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:373
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:329
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.