Blaze 3.9
TDVecTDMatMultExpr.h
1//=================================================================================================
33//=================================================================================================
34
35#ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
37
38
39//*************************************************************************************************
40// Includes
41//*************************************************************************************************
42
45#include <blaze/math/Aliases.h>
63#include <blaze/math/SIMD.h>
85#include <blaze/system/BLAS.h>
89#include <blaze/util/Assert.h>
90#include <blaze/util/Complex.h>
92#include <blaze/util/EnableIf.h>
95#include <blaze/util/mpl/If.h>
96#include <blaze/util/Types.h>
104
105
106namespace blaze {
107
108//=================================================================================================
109//
110// CLASS TDVECTDMATMULTEXPR
111//
112//=================================================================================================
113
114//*************************************************************************************************
121template< typename VT // Type of the left-hand side dense vector
122 , typename MT > // Type of the right-hand side dense matrix
123class TDVecTDMatMultExpr
124 : public TVecMatMultExpr< DenseVector< TDVecTDMatMultExpr<VT,MT>, true > >
125 , private Computation
126{
127 private:
128 //**Type definitions****************************************************************************
129 using VRT = ResultType_t<VT>;
130 using MRT = ResultType_t<MT>;
131 using VET = ElementType_t<VRT>;
132 using MET = ElementType_t<MRT>;
133 using VCT = CompositeType_t<VT>;
134 using MCT = CompositeType_t<MT>;
135 //**********************************************************************************************
136
137 //**********************************************************************************************
139 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
140 //**********************************************************************************************
141
142 //**********************************************************************************************
144 static constexpr bool evaluateMatrix =
145 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
146 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
147 //**********************************************************************************************
148
149 //**********************************************************************************************
151
155 template< typename T1 >
156 static constexpr bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
158 //**********************************************************************************************
159
160 //**********************************************************************************************
162
165 template< typename T1, typename T2, typename T3 >
166 static constexpr bool UseBlasKernel_v =
167 ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION &&
168 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
169 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
170 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
171 !IsDiagonal_v<T3> &&
172 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
173 IsBLASCompatible_v< ElementType_t<T1> > &&
174 IsBLASCompatible_v< ElementType_t<T2> > &&
175 IsBLASCompatible_v< ElementType_t<T3> > &&
176 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
177 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
179 //**********************************************************************************************
180
181 //**********************************************************************************************
183
187 template< typename T1, typename T2, typename T3 >
188 static constexpr bool UseVectorizedDefaultKernel_v =
189 ( useOptimizedKernels &&
190 !IsDiagonal_v<T3> &&
191 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
192 IsSIMDCombinable_v< ElementType_t<T1>
193 , ElementType_t<T2>
194 , ElementType_t<T3> > &&
195 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
196 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
198 //**********************************************************************************************
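   //**********************************************************************************************
   // Note: The helper variable templates above steer the evaluation strategy of this expression:
   // UseSMPAssign_v restricts the SMP (parallel) assignment overloads to the case that at least
   // one operand requires an intermediate evaluation, UseBlasKernel_v gates the BLAS-based
   // kernels, and UseVectorizedDefaultKernel_v gates the hand-written SIMD kernels. If none of
   // these conditions hold, the scalar default kernels are used.
   //**********************************************************************************************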
199
200 public:
201 //**Type definitions****************************************************************************
204
207
208 using ResultType = MultTrait_t<VRT,MRT>;
209 using TransposeType = TransposeType_t<ResultType>;
210 using ElementType = ElementType_t<ResultType>;
211 using SIMDType = SIMDTrait_t<ElementType>;
212 using ReturnType = const ElementType;
213 using CompositeType = const ResultType;
214
216 using LeftOperand = If_t< IsExpression_v<VT>, const VT, const VT& >;
217
219 using RightOperand = If_t< IsExpression_v<MT>, const MT, const MT& >;
220
220
222 using LT = If_t< evaluateVector, const VRT, VCT >;
223
225 using RT = If_t< evaluateMatrix, const MRT, MCT >;
226 //**********************************************************************************************
227
228 //**Compilation flags***************************************************************************
230 static constexpr bool simdEnabled =
231 ( !IsDiagonal_v<MT> &&
232 VT::simdEnabled && MT::simdEnabled &&
233 HasSIMDAdd_v<VET,MET> &&
234 HasSIMDMult_v<VET,MET> );
235
237 static constexpr bool smpAssignable =
238 ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
239 //**********************************************************************************************
240
241 //**SIMD properties*****************************************************************************
243 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
244 //**********************************************************************************************
245
246 //**Constructor*********************************************************************************
252 inline TDVecTDMatMultExpr( const VT& vec, const MT& mat ) noexcept
253 : vec_( vec ) // Left-hand side dense vector of the multiplication expression
254 , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
255 {
256 BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
257 }
258 //**********************************************************************************************
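   //**********************************************************************************************
   // For illustration only: this expression is not created directly but via the multiplication
   // operator for a transpose (row) dense vector and a column-major dense matrix. A minimal
   // usage sketch, assuming blaze::DynamicVector and blaze::DynamicMatrix (which are not part
   // of this header):
   //
   //    blaze::DynamicVector<double,blaze::rowVector>   x( 100UL, 2.0 );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 100UL, 50UL, 1.0 );
   //    blaze::DynamicVector<double,blaze::rowVector>   y;
   //
   //    y = x * A;  // Yields a TDVecTDMatMultExpr; evaluated during the assignment to y
   //**********************************************************************************************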
259
260 //**Subscript operator**************************************************************************
266 inline ReturnType operator[]( size_t index ) const {
267 BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
268
269 if( IsDiagonal_v<MT> )
270 {
271 return vec_[index] * mat_(index,index);
272 }
273 else if( IsLower_v<MT> && ( index > 8UL ) )
274 {
275 const size_t begin( IsStrictlyLower_v<MT> ? index+1UL : index );
276 const size_t n ( mat_.rows() - begin );
277 return subvector( vec_, begin, n, unchecked ) *
278 subvector( column( mat_, index, unchecked ), begin, n, unchecked );
279 }
280 else if( IsUpper_v<MT> && ( index + 8UL < mat_.rows() ) )
281 {
282 const size_t n( IsStrictlyUpper_v<MT> ? index : index+1UL );
283 return subvector( vec_, 0UL, n, unchecked ) *
284 subvector( column( mat_, index, unchecked ), 0UL, n, unchecked );
285 }
286 else
287 {
288 return vec_ * column( mat_, index, unchecked );
289 }
290 }
291 //**********************************************************************************************
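   //**********************************************************************************************
   // Note: Element j of the expression is the dot product of the left-hand side vector with the
   // j-th column of the matrix, i.e. (x*A)[j] = sum_i x[i] * A(i,j). The branches above only
   // shrink the reduction range for diagonal, lower, and upper matrices so that structurally
   // zero entries are skipped.
   //**********************************************************************************************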
292
293 //**At function*********************************************************************************
300 inline ReturnType at( size_t index ) const {
301 if( index >= mat_.columns() ) {
302 BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
303 }
304 return (*this)[index];
305 }
306 //**********************************************************************************************
307
308 //**Size function*******************************************************************************
313 inline size_t size() const noexcept {
314 return mat_.columns();
315 }
316 //**********************************************************************************************
317
318 //**Left operand access*************************************************************************
323 inline LeftOperand leftOperand() const noexcept {
324 return vec_;
325 }
326 //**********************************************************************************************
327
328 //**Right operand access************************************************************************
333 inline RightOperand rightOperand() const noexcept {
334 return mat_;
335 }
336 //**********************************************************************************************
337
338 //**********************************************************************************************
344 template< typename T >
345 inline bool canAlias( const T* alias ) const noexcept {
346 return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
347 }
348 //**********************************************************************************************
349
350 //**********************************************************************************************
356 template< typename T >
357 inline bool isAliased( const T* alias ) const noexcept {
358 return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
359 }
360 //**********************************************************************************************
361
362 //**********************************************************************************************
367 inline bool isAligned() const noexcept {
368 return vec_.isAligned() && mat_.isAligned();
369 }
370 //**********************************************************************************************
371
372 //**********************************************************************************************
377 inline bool canSMPAssign() const noexcept {
378 return ( !BLAZE_BLAS_MODE ||
381 ( IsComputation_v<MT> && !evaluateMatrix ) ||
382 ( mat_.rows() * mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
383 ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
384 }
385 //**********************************************************************************************
386
387 private:
388 //**Member variables****************************************************************************
389 LeftOperand vec_;
390 RightOperand mat_;
391 //**********************************************************************************************
392
393 //**Assignment to dense vectors*****************************************************************
406 template< typename VT1 > // Type of the target dense vector
407 friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
408 {
410
411 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
412
413 if( rhs.mat_.rows() == 0UL ||
414 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
415 reset( *lhs );
416 return;
417 }
418 else if( rhs.mat_.columns() == 0UL ) {
419 return;
420 }
421
422 LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
423 RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
424
425 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
426 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
427 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
428 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
429
430 TDVecTDMatMultExpr::selectAssignKernel( *lhs, x, A );
431 }
433 //**********************************************************************************************
434
435 //**Assignment to dense vectors (kernel selection)**********************************************
446 template< typename VT1 // Type of the left-hand side target vector
447 , typename VT2 // Type of the left-hand side vector operand
448 , typename MT1 > // Type of the right-hand side matrix operand
449 static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
450 {
451 if( ( IsDiagonal_v<MT1> ) ||
452 ( IsComputation_v<MT> && !evaluateMatrix ) ||
453 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
454 selectSmallAssignKernel( y, x, A );
455 else
456 selectBlasAssignKernel( y, x, A );
457 }
459 //**********************************************************************************************
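   //**********************************************************************************************
   // Note: The kernel selection is size based: diagonal matrices, matrix computations that are
   // not evaluated beforehand, and matrices with fewer than TDVECTDMATMULT_THRESHOLD elements
   // are handled by the small (SIMD or scalar) kernels; all other cases are forwarded to the
   // BLAS-based kernel, which itself falls back to the large default kernel if no suitable BLAS
   // routine is available.
   //**********************************************************************************************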
460
461 //**Default assignment to dense vectors*********************************************************
475 template< typename VT1 // Type of the left-hand side target vector
476 , typename VT2 // Type of the left-hand side vector operand
477 , typename MT1 > // Type of the right-hand side matrix operand
478 static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
479 {
480 y.assign( x * A );
481 }
483 //**********************************************************************************************
484
485 //**Default assignment to dense vectors (small matrices)****************************************
499 template< typename VT1 // Type of the left-hand side target vector
500 , typename VT2 // Type of the left-hand side vector operand
501 , typename MT1 > // Type of the right-hand side matrix operand
502 static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
503 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
504 {
505 selectDefaultAssignKernel( y, x, A );
506 }
508 //**********************************************************************************************
509
510 //**Vectorized default assignment to dense vectors (small matrices)*****************************
524 template< typename VT1 // Type of the left-hand side target vector
525 , typename VT2 // Type of the left-hand side vector operand
526 , typename MT1 > // Type of the right-hand side matrix operand
527 static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
528 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
529 {
530 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
531
532 const size_t M( A.rows() );
533 const size_t N( A.columns() );
534
535 size_t j( 0UL );
536
537 for( ; (j+8UL) <= N; j+=8UL )
538 {
539 const size_t ibegin( ( IsLower_v<MT1> )
540 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
541 :( 0UL ) );
542 const size_t iend( ( IsUpper_v<MT1> )
543 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
544 :( M ) );
545 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
546
547 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
548 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
549
550 size_t i( ibegin );
551
552 if( i < ipos )
553 {
554 SIMDType x1( x.load(i) );
555 SIMDType xmm1( x1 * A.load(i,j ) );
556 SIMDType xmm2( x1 * A.load(i,j+1UL) );
557 SIMDType xmm3( x1 * A.load(i,j+2UL) );
558 SIMDType xmm4( x1 * A.load(i,j+3UL) );
559 SIMDType xmm5( x1 * A.load(i,j+4UL) );
560 SIMDType xmm6( x1 * A.load(i,j+5UL) );
561 SIMDType xmm7( x1 * A.load(i,j+6UL) );
562 SIMDType xmm8( x1 * A.load(i,j+7UL) );
563
564 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
565 x1 = x.load(i);
566 xmm1 += x1 * A.load(i,j );
567 xmm2 += x1 * A.load(i,j+1UL);
568 xmm3 += x1 * A.load(i,j+2UL);
569 xmm4 += x1 * A.load(i,j+3UL);
570 xmm5 += x1 * A.load(i,j+4UL);
571 xmm6 += x1 * A.load(i,j+5UL);
572 xmm7 += x1 * A.load(i,j+6UL);
573 xmm8 += x1 * A.load(i,j+7UL);
574 }
575
576 y[j ] = sum( xmm1 );
577 y[j+1UL] = sum( xmm2 );
578 y[j+2UL] = sum( xmm3 );
579 y[j+3UL] = sum( xmm4 );
580 y[j+4UL] = sum( xmm5 );
581 y[j+5UL] = sum( xmm6 );
582 y[j+6UL] = sum( xmm7 );
583 y[j+7UL] = sum( xmm8 );
584
585 for( ; remainder && i<iend; ++i ) {
586 y[j ] += x[i] * A(i,j );
587 y[j+1UL] += x[i] * A(i,j+1UL);
588 y[j+2UL] += x[i] * A(i,j+2UL);
589 y[j+3UL] += x[i] * A(i,j+3UL);
590 y[j+4UL] += x[i] * A(i,j+4UL);
591 y[j+5UL] += x[i] * A(i,j+5UL);
592 y[j+6UL] += x[i] * A(i,j+6UL);
593 y[j+7UL] += x[i] * A(i,j+7UL);
594 }
595 }
596 else
597 {
598 ElementType value1( x[i] * A(i,j ) );
599 ElementType value2( x[i] * A(i,j+1UL) );
600 ElementType value3( x[i] * A(i,j+2UL) );
601 ElementType value4( x[i] * A(i,j+3UL) );
602 ElementType value5( x[i] * A(i,j+4UL) );
603 ElementType value6( x[i] * A(i,j+5UL) );
604 ElementType value7( x[i] * A(i,j+6UL) );
605 ElementType value8( x[i] * A(i,j+7UL) );
606
607 for( ++i; i<iend; ++i ) {
608 value1 += x[i] * A(i,j );
609 value2 += x[i] * A(i,j+1UL);
610 value3 += x[i] * A(i,j+2UL);
611 value4 += x[i] * A(i,j+3UL);
612 value5 += x[i] * A(i,j+4UL);
613 value6 += x[i] * A(i,j+5UL);
614 value7 += x[i] * A(i,j+6UL);
615 value8 += x[i] * A(i,j+7UL);
616 }
617
618 y[j ] = value1;
619 y[j+1UL] = value2;
620 y[j+2UL] = value3;
621 y[j+3UL] = value4;
622 y[j+4UL] = value5;
623 y[j+5UL] = value6;
624 y[j+6UL] = value7;
625 y[j+7UL] = value8;
626 }
627 }
628
629 for( ; (j+4UL) <= N; j+=4UL )
630 {
631 const size_t ibegin( ( IsLower_v<MT1> )
632 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
633 :( 0UL ) );
634 const size_t iend( ( IsUpper_v<MT1> )
635 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
636 :( M ) );
637 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
638
639 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
640 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
641
642 size_t i( ibegin );
643
644 if( i < ipos )
645 {
646 SIMDType x1( x.load(i) );
647 SIMDType xmm1( x1 * A.load(i,j ) );
648 SIMDType xmm2( x1 * A.load(i,j+1UL) );
649 SIMDType xmm3( x1 * A.load(i,j+2UL) );
650 SIMDType xmm4( x1 * A.load(i,j+3UL) );
651
652 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
653 x1 = x.load(i);
654 xmm1 += x1 * A.load(i,j );
655 xmm2 += x1 * A.load(i,j+1UL);
656 xmm3 += x1 * A.load(i,j+2UL);
657 xmm4 += x1 * A.load(i,j+3UL);
658 }
659
660 y[j ] = sum( xmm1 );
661 y[j+1UL] = sum( xmm2 );
662 y[j+2UL] = sum( xmm3 );
663 y[j+3UL] = sum( xmm4 );
664
665 for( ; remainder && i<iend; ++i ) {
666 y[j ] += x[i] * A(i,j );
667 y[j+1UL] += x[i] * A(i,j+1UL);
668 y[j+2UL] += x[i] * A(i,j+2UL);
669 y[j+3UL] += x[i] * A(i,j+3UL);
670 }
671 }
672 else
673 {
674 ElementType value1( x[i] * A(i,j ) );
675 ElementType value2( x[i] * A(i,j+1UL) );
676 ElementType value3( x[i] * A(i,j+2UL) );
677 ElementType value4( x[i] * A(i,j+3UL) );
678
679 for( ++i; i<iend; ++i ) {
680 value1 += x[i] * A(i,j );
681 value2 += x[i] * A(i,j+1UL);
682 value3 += x[i] * A(i,j+2UL);
683 value4 += x[i] * A(i,j+3UL);
684 }
685
686 y[j ] = value1;
687 y[j+1UL] = value2;
688 y[j+2UL] = value3;
689 y[j+3UL] = value4;
690 }
691 }
692
693 for( ; (j+3UL) <= N; j+=3UL )
694 {
695 const size_t ibegin( ( IsLower_v<MT1> )
696 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
697 :( 0UL ) );
698 const size_t iend( ( IsUpper_v<MT1> )
699 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
700 :( M ) );
701 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
702
703 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
704 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
705
706 size_t i( ibegin );
707
708 if( i < ipos )
709 {
710 SIMDType x1( x.load(i) );
711 SIMDType xmm1( x1 * A.load(i,j ) );
712 SIMDType xmm2( x1 * A.load(i,j+1UL) );
713 SIMDType xmm3( x1 * A.load(i,j+2UL) );
714
715 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
716 x1 = x.load(i);
717 xmm1 += x1 * A.load(i,j );
718 xmm2 += x1 * A.load(i,j+1UL);
719 xmm3 += x1 * A.load(i,j+2UL);
720 }
721
722 y[j ] = sum( xmm1 );
723 y[j+1UL] = sum( xmm2 );
724 y[j+2UL] = sum( xmm3 );
725
726 for( ; remainder && i<iend; ++i ) {
727 y[j ] += x[i] * A(i,j );
728 y[j+1UL] += x[i] * A(i,j+1UL);
729 y[j+2UL] += x[i] * A(i,j+2UL);
730 }
731 }
732 else
733 {
734 ElementType value1( x[i] * A(i,j ) );
735 ElementType value2( x[i] * A(i,j+1UL) );
736 ElementType value3( x[i] * A(i,j+2UL) );
737
738 for( ++i; i<iend; ++i ) {
739 value1 += x[i] * A(i,j );
740 value2 += x[i] * A(i,j+1UL);
741 value3 += x[i] * A(i,j+2UL);
742 }
743
744 y[j ] = value1;
745 y[j+1UL] = value2;
746 y[j+2UL] = value3;
747 }
748 }
749
750 for( ; (j+2UL) <= N; j+=2UL )
751 {
752 const size_t ibegin( ( IsLower_v<MT1> )
753 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
754 :( 0UL ) );
755 const size_t iend( ( IsUpper_v<MT1> )
756 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
757 :( M ) );
758 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
759
760 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
761 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
762
763 size_t i( ibegin );
764
765 if( i < ipos )
766 {
767 SIMDType x1( x.load(i) );
768 SIMDType xmm1( x1 * A.load(i,j ) );
769 SIMDType xmm2( x1 * A.load(i,j+1UL) );
770
771 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
772 x1 = x.load(i);
773 xmm1 += x1 * A.load(i,j );
774 xmm2 += x1 * A.load(i,j+1UL);
775 }
776
777 y[j ] = sum( xmm1 );
778 y[j+1UL] = sum( xmm2 );
779
780 for( ; remainder && i<iend; ++i ) {
781 y[j ] += x[i] * A(i,j );
782 y[j+1UL] += x[i] * A(i,j+1UL);
783 }
784 }
785 else
786 {
787 ElementType value1( x[i] * A(i,j ) );
788 ElementType value2( x[i] * A(i,j+1UL) );
789
790 for( ++i; i<iend; ++i ) {
791 value1 += x[i] * A(i,j );
792 value2 += x[i] * A(i,j+1UL);
793 }
794
795 y[j ] = value1;
796 y[j+1UL] = value2;
797 }
798 }
799
800 if( j < N )
801 {
802 const size_t ibegin( ( IsLower_v<MT1> )
803 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
804 :( 0UL ) );
805 const size_t iend( ( IsUpper_v<MT1> )
806 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
807 :( M ) );
808 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
809
810 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
811 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
812
813 size_t i( ibegin );
814
815 if( i < ipos )
816 {
817 SIMDType xmm1( x.load(i) * A.load(i,j) );
818
819 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
820 xmm1 += x.load(i) * A.load(i,j);
821 }
822
823 y[j] = sum( xmm1 );
824
825 for( ; remainder && i<iend; ++i ) {
826 y[j] += x[i] * A(i,j);
827 }
828 }
829 else
830 {
831 ElementType value( x[i] * A(i,j) );
832
833 for( ++i; i<iend; ++i ) {
834 value += x[i] * A(i,j);
835 }
836
837 y[j] = value;
838 }
839 }
840 }
842 //**********************************************************************************************
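   //**********************************************************************************************
   // Note: The vectorized small kernel above traverses the result in column blocks of 8, 4, 3,
   // 2, and finally a single column. For each block it accumulates SIMD partial products over
   // the common index i, reduces them via sum(), and finishes with scalar loops for the
   // non-SIMD remainder and for blocks that are smaller than a single SIMD vector.
   //**********************************************************************************************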
843
844 //**Default assignment to dense vectors (large matrices)****************************************
858 template< typename VT1 // Type of the left-hand side target vector
859 , typename VT2 // Type of the left-hand side vector operand
860 , typename MT1 > // Type of the right-hand side matrix operand
861 static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
862 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
863 {
864 selectDefaultAssignKernel( y, x, A );
865 }
867 //**********************************************************************************************
868
869 //**Vectorized default assignment to dense vectors (large matrices)*****************************
883 template< typename VT1 // Type of the left-hand side target vector
884 , typename VT2 // Type of the left-hand side vector operand
885 , typename MT1 > // Type of the right-hand side matrix operand
886 static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
887 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
888 {
889 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
890
891 const size_t M( A.rows() );
892 const size_t N( A.columns() );
893
894 reset( y );
895
896 size_t j( 0UL );
897
898 for( ; (j+8UL) <= N; j+=8UL )
899 {
900 const size_t ibegin( ( IsLower_v<MT1> )
901 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
902 :( 0UL ) );
903 const size_t iend( ( IsUpper_v<MT1> )
904 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
905 :( M ) );
906 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
907
908 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
909 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
910
911 size_t i( ibegin );
912
913 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
914 const size_t i1( i+SIMDSIZE );
915 const size_t i2( i+SIMDSIZE*2UL );
916 const size_t i3( i+SIMDSIZE*3UL );
917 const SIMDType x1( x.load(i ) );
918 const SIMDType x2( x.load(i1) );
919 const SIMDType x3( x.load(i2) );
920 const SIMDType x4( x.load(i3) );
921 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
922 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
923 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
924 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
925 y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
926 y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
927 y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
928 y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
929 }
930
931 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
932 const size_t i1( i+SIMDSIZE );
933 const SIMDType x1( x.load(i ) );
934 const SIMDType x2( x.load(i1) );
935 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
936 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
937 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
938 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
939 y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
940 y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
941 y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
942 y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
943 }
944
945 for( ; i<ipos; i+=SIMDSIZE ) {
946 const SIMDType x1( x.load(i) );
947 y[j ] += sum( x1 * A.load(i,j ) );
948 y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
949 y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
950 y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
951 y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
952 y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
953 y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
954 y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
955 }
956
957 for( ; remainder && i<iend; ++i ) {
958 y[j ] += x[i] * A(i,j );
959 y[j+1UL] += x[i] * A(i,j+1UL);
960 y[j+2UL] += x[i] * A(i,j+2UL);
961 y[j+3UL] += x[i] * A(i,j+3UL);
962 y[j+4UL] += x[i] * A(i,j+4UL);
963 y[j+5UL] += x[i] * A(i,j+5UL);
964 y[j+6UL] += x[i] * A(i,j+6UL);
965 y[j+7UL] += x[i] * A(i,j+7UL);
966 }
967 }
968
969 for( ; (j+4UL) <= N; j+=4UL )
970 {
971 const size_t ibegin( ( IsLower_v<MT1> )
972 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
973 :( 0UL ) );
974 const size_t iend( ( IsUpper_v<MT1> )
975 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
976 :( M ) );
977 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
978
979 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
980 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
981
982 size_t i( ibegin );
983
984 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
985 const size_t i1( i+SIMDSIZE );
986 const size_t i2( i+SIMDSIZE*2UL );
987 const size_t i3( i+SIMDSIZE*3UL );
988 const SIMDType x1( x.load(i ) );
989 const SIMDType x2( x.load(i1) );
990 const SIMDType x3( x.load(i2) );
991 const SIMDType x4( x.load(i3) );
992 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
993 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
994 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
995 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
996 }
997
998 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
999 const size_t i1( i+SIMDSIZE );
1000 const SIMDType x1( x.load(i ) );
1001 const SIMDType x2( x.load(i1) );
1002 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1003 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1004 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1005 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1006 }
1007
1008 for( ; i<ipos; i+=SIMDSIZE ) {
1009 const SIMDType x1( x.load(i) );
1010 y[j ] += sum( x1 * A.load(i,j ) );
1011 y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1012 y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1013 y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1014 }
1015
1016 for( ; remainder && i<iend; ++i ) {
1017 y[j ] += x[i] * A(i,j );
1018 y[j+1UL] += x[i] * A(i,j+1UL);
1019 y[j+2UL] += x[i] * A(i,j+2UL);
1020 y[j+3UL] += x[i] * A(i,j+3UL);
1021 }
1022 }
1023
1024 for( ; (j+2UL) <= N; j+=2UL )
1025 {
1026 const size_t ibegin( ( IsLower_v<MT1> )
1027 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1028 :( 0UL ) );
1029 const size_t iend( ( IsUpper_v<MT1> )
1030 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
1031 :( M ) );
1032 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1033
1034 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1035 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1036
1037 size_t i( ibegin );
1038
1039 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1040 const size_t i1( i+SIMDSIZE );
1041 const size_t i2( i+SIMDSIZE*2UL );
1042 const size_t i3( i+SIMDSIZE*3UL );
1043 const SIMDType x1( x.load(i ) );
1044 const SIMDType x2( x.load(i1) );
1045 const SIMDType x3( x.load(i2) );
1046 const SIMDType x4( x.load(i3) );
1047 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1048 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1049 }
1050
1051 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1052 const size_t i1( i+SIMDSIZE );
1053 const SIMDType x1( x.load(i ) );
1054 const SIMDType x2( x.load(i1) );
1055 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1056 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1057 }
1058
1059 for( ; i<ipos; i+=SIMDSIZE ) {
1060 const SIMDType x1( x.load(i) );
1061 y[j ] += sum( x1 * A.load(i,j ) );
1062 y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1063 }
1064
1065 for( ; remainder && i<iend; ++i ) {
1066 y[j ] += x[i] * A(i,j );
1067 y[j+1UL] += x[i] * A(i,j+1UL);
1068 }
1069 }
1070
1071 if( j < N )
1072 {
1073 const size_t ibegin( ( IsLower_v<MT1> )
1074 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1075 :( 0UL ) );
1076 const size_t iend( ( IsUpper_v<MT1> )
1077 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
1078 :( M ) );
1079 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1080
1081 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1082 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1083
1084 size_t i( ibegin );
1085
1086 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1087 const size_t i1( i+SIMDSIZE );
1088 const size_t i2( i+SIMDSIZE*2UL );
1089 const size_t i3( i+SIMDSIZE*3UL );
1090 const SIMDType x1( x.load(i ) );
1091 const SIMDType x2( x.load(i1) );
1092 const SIMDType x3( x.load(i2) );
1093 const SIMDType x4( x.load(i3) );
1094 y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1095 }
1096
1097 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1098 const size_t i1( i+SIMDSIZE );
1099 const SIMDType x1( x.load(i ) );
1100 const SIMDType x2( x.load(i1) );
1101 y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1102 }
1103
1104 for( ; i<ipos; i+=SIMDSIZE ) {
1105 const SIMDType x1( x.load(i) );
1106 y[j] += sum( x1 * A.load(i,j) );
1107 }
1108
1109 for( ; remainder && i<iend; ++i ) {
1110 y[j] += x[i] * A(i,j);
1111 }
1112 }
1113 }
1115 //**********************************************************************************************
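   //**********************************************************************************************
   // Note: In contrast to the small kernel, the large kernel first resets the target vector and
   // then accumulates partial sums directly into y[j], unrolling the SIMD loop over i by factors
   // of 4, 2, and 1. This layout is presumably chosen to improve cache behavior for matrices
   // above the TDVECTDMATMULT_THRESHOLD.
   //**********************************************************************************************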
1116
1117 //**BLAS-based assignment to dense vectors (default)********************************************
1131 template< typename VT1 // Type of the left-hand side target vector
1132 , typename VT2 // Type of the left-hand side vector operand
1133 , typename MT1 > // Type of the right-hand side matrix operand
1134 static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1135 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1136 {
1137 selectLargeAssignKernel( y, x, A );
1138 }
1140 //**********************************************************************************************
1141
1142 //**BLAS-based assignment to dense vectors******************************************************
1143#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1157 template< typename VT1 // Type of the left-hand side target vector
1158 , typename VT2 // Type of the left-hand side vector operand
1159 , typename MT1 > // Type of the right-hand side matrix operand
1160 static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1161 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1162 {
1163 using ET = ElementType_t<VT1>;
1164
1165 if( IsTriangular_v<MT1> ) {
1166 assign( y, x );
1167 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1168 }
1169 else {
1170 gemv( y, x, A, ET(1), ET(0) );
1171 }
1172 }
1174#endif
1175 //**********************************************************************************************
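   //**********************************************************************************************
   // Note: In the BLAS-based kernel above, a general matrix operand is mapped to a gemv() call
   // with alpha = 1 and beta = 0, whereas a triangular matrix operand is handled by copying x
   // into y and applying trmv() in place. The kernel is only compiled if both BLAZE_BLAS_MODE
   // and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled, hence the surrounding #if.
   //**********************************************************************************************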
1176
1177 //**Assignment to sparse vectors****************************************************************
1190 template< typename VT1 > // Type of the target sparse vector
1191 friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1192 {
1194
1198
1199 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
1200
1201 const ResultType tmp( serial( rhs ) );
1202 assign( *lhs, tmp );
1203 }
1205 //**********************************************************************************************
1206
1207 //**Addition assignment to dense vectors********************************************************
1220 template< typename VT1 > // Type of the target dense vector
1221 friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1222 {
1224
1225 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
1226
1227 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
1228 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
1229 return;
1230 }
1231
1232 LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1233 RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1234
1235 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1236 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1237 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1238 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
1239
1240 TDVecTDMatMultExpr::selectAddAssignKernel( *lhs, x, A );
1241 }
1243 //**********************************************************************************************
1244
1245 //**Addition assignment to dense vectors (kernel selection)*************************************
1256 template< typename VT1 // Type of the left-hand side target vector
1257 , typename VT2 // Type of the left-hand side vector operand
1258 , typename MT1 > // Type of the right-hand side matrix operand
1259 static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1260 {
1261 if( ( IsDiagonal_v<MT1> ) ||
1262 ( IsComputation_v<MT> && !evaluateMatrix ) ||
1263 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1264 selectSmallAddAssignKernel( y, x, A );
1265 else
1266 selectBlasAddAssignKernel( y, x, A );
1267 }
1269 //**********************************************************************************************
1270
1271 //**Default addition assignment to dense vectors************************************************
1285 template< typename VT1 // Type of the left-hand side target vector
1286 , typename VT2 // Type of the left-hand side vector operand
1287 , typename MT1 > // Type of the right-hand side matrix operand
1288 static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1289 {
1290 y.addAssign( x * A );
1291 }
1293 //**********************************************************************************************
1294
1295 //**Default addition assignment to dense vectors (small matrices)*******************************
1309 template< typename VT1 // Type of the left-hand side target vector
1310 , typename VT2 // Type of the left-hand side vector operand
1311 , typename MT1 > // Type of the right-hand side matrix operand
1312 static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1313 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1314 {
1315 selectDefaultAddAssignKernel( y, x, A );
1316 }
1318 //**********************************************************************************************
1319
1320 //**Vectorized default addition assignment to dense vectors (small matrices)********************
1335 template< typename VT1 // Type of the left-hand side target vector
1336 , typename VT2 // Type of the left-hand side vector operand
1337 , typename MT1 > // Type of the right-hand side matrix operand
1338 static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1339 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1340 {
1341 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
1342
1343 const size_t M( A.rows() );
1344 const size_t N( A.columns() );
1345
1346 size_t j( 0UL );
1347
1348 for( ; (j+8UL) <= N; j+=8UL )
1349 {
1350 const size_t ibegin( ( IsLower_v<MT1> )
1351 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1352 :( 0UL ) );
1353 const size_t iend( ( IsUpper_v<MT1> )
1354 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
1355 :( M ) );
1356 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1357
1358 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1359 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1360
1361 size_t i( ibegin );
1362
1363 if( i < ipos )
1364 {
1365 SIMDType x1( x.load(i) );
1366 SIMDType xmm1( x1 * A.load(i,j ) );
1367 SIMDType xmm2( x1 * A.load(i,j+1UL) );
1368 SIMDType xmm3( x1 * A.load(i,j+2UL) );
1369 SIMDType xmm4( x1 * A.load(i,j+3UL) );
1370 SIMDType xmm5( x1 * A.load(i,j+4UL) );
1371 SIMDType xmm6( x1 * A.load(i,j+5UL) );
1372 SIMDType xmm7( x1 * A.load(i,j+6UL) );
1373 SIMDType xmm8( x1 * A.load(i,j+7UL) );
1374
1375 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
1376 x1 = x.load(i);
1377 xmm1 += x1 * A.load(i,j );
1378 xmm2 += x1 * A.load(i,j+1UL);
1379 xmm3 += x1 * A.load(i,j+2UL);
1380 xmm4 += x1 * A.load(i,j+3UL);
1381 xmm5 += x1 * A.load(i,j+4UL);
1382 xmm6 += x1 * A.load(i,j+5UL);
1383 xmm7 += x1 * A.load(i,j+6UL);
1384 xmm8 += x1 * A.load(i,j+7UL);
1385 }
1386
1387 y[j ] += sum( xmm1 );
1388 y[j+1UL] += sum( xmm2 );
1389 y[j+2UL] += sum( xmm3 );
1390 y[j+3UL] += sum( xmm4 );
1391 y[j+4UL] += sum( xmm5 );
1392 y[j+5UL] += sum( xmm6 );
1393 y[j+6UL] += sum( xmm7 );
1394 y[j+7UL] += sum( xmm8 );
1395
1396 for( ; remainder && i<iend; ++i ) {
1397 y[j ] += x[i] * A(i,j );
1398 y[j+1UL] += x[i] * A(i,j+1UL);
1399 y[j+2UL] += x[i] * A(i,j+2UL);
1400 y[j+3UL] += x[i] * A(i,j+3UL);
1401 y[j+4UL] += x[i] * A(i,j+4UL);
1402 y[j+5UL] += x[i] * A(i,j+5UL);
1403 y[j+6UL] += x[i] * A(i,j+6UL);
1404 y[j+7UL] += x[i] * A(i,j+7UL);
1405 }
1406 }
1407 else
1408 {
1409 ElementType value1( x[i] * A(i,j ) );
1410 ElementType value2( x[i] * A(i,j+1UL) );
1411 ElementType value3( x[i] * A(i,j+2UL) );
1412 ElementType value4( x[i] * A(i,j+3UL) );
1413 ElementType value5( x[i] * A(i,j+4UL) );
1414 ElementType value6( x[i] * A(i,j+5UL) );
1415 ElementType value7( x[i] * A(i,j+6UL) );
1416 ElementType value8( x[i] * A(i,j+7UL) );
1417
1418 for( ++i; i<iend; ++i ) {
1419 value1 += x[i] * A(i,j );
1420 value2 += x[i] * A(i,j+1UL);
1421 value3 += x[i] * A(i,j+2UL);
1422 value4 += x[i] * A(i,j+3UL);
1423 value5 += x[i] * A(i,j+4UL);
1424 value6 += x[i] * A(i,j+5UL);
1425 value7 += x[i] * A(i,j+6UL);
1426 value8 += x[i] * A(i,j+7UL);
1427 }
1428
1429 y[j ] += value1;
1430 y[j+1UL] += value2;
1431 y[j+2UL] += value3;
1432 y[j+3UL] += value4;
1433 y[j+4UL] += value5;
1434 y[j+5UL] += value6;
1435 y[j+6UL] += value7;
1436 y[j+7UL] += value8;
1437 }
1438 }
1439
1440 for( ; (j+4UL) <= N; j+=4UL )
1441 {
1442 const size_t ibegin( ( IsLower_v<MT1> )
1443 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1444 :( 0UL ) );
1445 const size_t iend( ( IsUpper_v<MT1> )
1446 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
1447 :( M ) );
1448 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1449
1450 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1451 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1452
1453 size_t i( ibegin );
1454
1455 if( i < ipos )
1456 {
1457 SIMDType x1( x.load(i) );
1458 SIMDType xmm1( x1 * A.load(i,j ) );
1459 SIMDType xmm2( x1 * A.load(i,j+1UL) );
1460 SIMDType xmm3( x1 * A.load(i,j+2UL) );
1461 SIMDType xmm4( x1 * A.load(i,j+3UL) );
1462
1463 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
1464 x1 = x.load(i);
1465 xmm1 += x1 * A.load(i,j );
1466 xmm2 += x1 * A.load(i,j+1UL);
1467 xmm3 += x1 * A.load(i,j+2UL);
1468 xmm4 += x1 * A.load(i,j+3UL);
1469 }
1470
1471 y[j ] += sum( xmm1 );
1472 y[j+1UL] += sum( xmm2 );
1473 y[j+2UL] += sum( xmm3 );
1474 y[j+3UL] += sum( xmm4 );
1475
1476 for( ; remainder && i<iend; ++i ) {
1477 y[j ] += x[i] * A(i,j );
1478 y[j+1UL] += x[i] * A(i,j+1UL);
1479 y[j+2UL] += x[i] * A(i,j+2UL);
1480 y[j+3UL] += x[i] * A(i,j+3UL);
1481 }
1482 }
1483 else
1484 {
1485 ElementType value1( x[i] * A(i,j ) );
1486 ElementType value2( x[i] * A(i,j+1UL) );
1487 ElementType value3( x[i] * A(i,j+2UL) );
1488 ElementType value4( x[i] * A(i,j+3UL) );
1489
1490 for( ++i; i<iend; ++i ) {
1491 value1 += x[i] * A(i,j );
1492 value2 += x[i] * A(i,j+1UL);
1493 value3 += x[i] * A(i,j+2UL);
1494 value4 += x[i] * A(i,j+3UL);
1495 }
1496
1497 y[j ] += value1;
1498 y[j+1UL] += value2;
1499 y[j+2UL] += value3;
1500 y[j+3UL] += value4;
1501 }
1502 }
1503
1504 for( ; (j+3UL) <= N; j+=3UL )
1505 {
1506 const size_t ibegin( ( IsLower_v<MT1> )
1507 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1508 :( 0UL ) );
1509 const size_t iend( ( IsUpper_v<MT1> )
1510 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
1511 :( M ) );
1512 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1513
1514 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1515 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1516
1517 size_t i( ibegin );
1518
1519 if( i < ipos )
1520 {
1521 SIMDType x1( x.load(i) );
1522 SIMDType xmm1( x1 * A.load(i,j ) );
1523 SIMDType xmm2( x1 * A.load(i,j+1UL) );
1524 SIMDType xmm3( x1 * A.load(i,j+2UL) );
1525
1526 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
1527 x1 = x.load(i);
1528 xmm1 += x1 * A.load(i,j );
1529 xmm2 += x1 * A.load(i,j+1UL);
1530 xmm3 += x1 * A.load(i,j+2UL);
1531 }
1532
1533 y[j ] += sum( xmm1 );
1534 y[j+1UL] += sum( xmm2 );
1535 y[j+2UL] += sum( xmm3 );
1536
1537 for( ; remainder && i<iend; ++i ) {
1538 y[j ] += x[i] * A(i,j );
1539 y[j+1UL] += x[i] * A(i,j+1UL);
1540 y[j+2UL] += x[i] * A(i,j+2UL);
1541 }
1542 }
1543 else
1544 {
1545 ElementType value1( x[i] * A(i,j ) );
1546 ElementType value2( x[i] * A(i,j+1UL) );
1547 ElementType value3( x[i] * A(i,j+2UL) );
1548
1549 for( ++i; i<iend; ++i ) {
1550 value1 += x[i] * A(i,j );
1551 value2 += x[i] * A(i,j+1UL);
1552 value3 += x[i] * A(i,j+2UL);
1553 }
1554
1555 y[j ] += value1;
1556 y[j+1UL] += value2;
1557 y[j+2UL] += value3;
1558 }
1559 }
1560
1561 for( ; (j+2UL) <= N; j+=2UL )
1562 {
1563 const size_t ibegin( ( IsLower_v<MT1> )
1564 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1565 :( 0UL ) );
1566 const size_t iend( ( IsUpper_v<MT1> )
1567 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
1568 :( M ) );
1569 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1570
1571 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1572 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1573
1574 size_t i( ibegin );
1575
1576 if( i < ipos )
1577 {
1578 SIMDType x1( x.load(i) );
1579 SIMDType xmm1( x1 * A.load(i,j ) );
1580 SIMDType xmm2( x1 * A.load(i,j+1UL) );
1581
1582 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
1583 x1 = x.load(i);
1584 xmm1 += x1 * A.load(i,j );
1585 xmm2 += x1 * A.load(i,j+1UL);
1586 }
1587
1588 y[j ] += sum( xmm1 );
1589 y[j+1UL] += sum( xmm2 );
1590
1591 for( ; remainder && i<iend; ++i ) {
1592 y[j ] += x[i] * A(i,j );
1593 y[j+1UL] += x[i] * A(i,j+1UL);
1594 }
1595 }
1596 else
1597 {
1598 ElementType value1( x[i] * A(i,j ) );
1599 ElementType value2( x[i] * A(i,j+1UL) );
1600
1601 for( ++i; i<iend; ++i ) {
1602 value1 += x[i] * A(i,j );
1603 value2 += x[i] * A(i,j+1UL);
1604 }
1605
1606 y[j ] += value1;
1607 y[j+1UL] += value2;
1608 }
1609 }
1610
1611 if( j < N )
1612 {
1613 const size_t ibegin( ( IsLower_v<MT1> )
1614 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1615 :( 0UL ) );
1616 const size_t iend( ( IsUpper_v<MT1> )
1617 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
1618 :( M ) );
1619 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1620
1621 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1622 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1623
1624 size_t i( ibegin );
1625
1626 if( i < ipos )
1627 {
1628 SIMDType xmm1( x.load(i) * A.load(i,j) );
1629
1630 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
1631 xmm1 += x.load(i) * A.load(i,j);
1632 }
1633
1634 y[j] += sum( xmm1 );
1635
1636 for( ; remainder && i<iend; ++i ) {
1637 y[j] += x[i] * A(i,j);
1638 }
1639 }
1640 else
1641 {
1642 ElementType value( x[i] * A(i,j) );
1643
1644 for( ++i; i<iend; ++i ) {
1645 value += x[i] * A(i,j);
1646 }
1647
1648 y[j] += value;
1649 }
1650 }
1651 }
1653 //**********************************************************************************************
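   //**********************************************************************************************
   // Note: The addition assignment kernels mirror the plain assignment kernels above; the only
   // difference is that the computed partial results are accumulated into the existing target
   // elements via += instead of overwriting them, so the target vector is not reset.
   //**********************************************************************************************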
1654
1655 //**Default addition assignment to dense vectors (large matrices)*******************************
1669 template< typename VT1 // Type of the left-hand side target vector
1670 , typename VT2 // Type of the left-hand side vector operand
1671 , typename MT1 > // Type of the right-hand side matrix operand
1672 static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1673 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1674 {
1675 selectDefaultAddAssignKernel( y, x, A );
1676 }
1678 //**********************************************************************************************
1679
1680 //**Vectorized default addition assignment to dense vectors (large matrices)********************
1695 template< typename VT1 // Type of the left-hand side target vector
1696 , typename VT2 // Type of the left-hand side vector operand
1697 , typename MT1 > // Type of the right-hand side matrix operand
1698 static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1699 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1700 {
1701 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
1702
1703 const size_t M( A.rows() );
1704 const size_t N( A.columns() );
1705
1706 size_t j( 0UL );
1707
1708 for( ; (j+8UL) <= N; j+=8UL )
1709 {
1710 const size_t ibegin( ( IsLower_v<MT1> )
1711 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1712 :( 0UL ) );
1713 const size_t iend( ( IsUpper_v<MT1> )
1714 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
1715 :( M ) );
1716 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1717
1718 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1719 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1720
1721 size_t i( ibegin );
1722
1723 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1724 const size_t i1( i+SIMDSIZE );
1725 const size_t i2( i+SIMDSIZE*2UL );
1726 const size_t i3( i+SIMDSIZE*3UL );
1727 const SIMDType x1( x.load(i ) );
1728 const SIMDType x2( x.load(i1) );
1729 const SIMDType x3( x.load(i2) );
1730 const SIMDType x4( x.load(i3) );
1731 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1732 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1733 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1734 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1735 y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1736 y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1737 y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1738 y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
1739 }
1740
1741 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1742 const size_t i1( i+SIMDSIZE );
1743 const SIMDType x1( x.load(i ) );
1744 const SIMDType x2( x.load(i1) );
1745 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1746 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1747 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1748 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1749 y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1750 y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1751 y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1752 y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
1753 }
1754
1755 for( ; i<ipos; i+=SIMDSIZE ) {
1756 const SIMDType x1( x.load(i) );
1757 y[j ] += sum( x1 * A.load(i,j ) );
1758 y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1759 y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1760 y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1761 y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
1762 y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
1763 y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
1764 y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
1765 }
1766
1767 for( ; remainder && i<iend; ++i ) {
1768 y[j ] += x[i] * A(i,j );
1769 y[j+1UL] += x[i] * A(i,j+1UL);
1770 y[j+2UL] += x[i] * A(i,j+2UL);
1771 y[j+3UL] += x[i] * A(i,j+3UL);
1772 y[j+4UL] += x[i] * A(i,j+4UL);
1773 y[j+5UL] += x[i] * A(i,j+5UL);
1774 y[j+6UL] += x[i] * A(i,j+6UL);
1775 y[j+7UL] += x[i] * A(i,j+7UL);
1776 }
1777 }
1778
1779 for( ; (j+4UL) <= N; j+=4UL )
1780 {
1781 const size_t ibegin( ( IsLower_v<MT1> )
1782 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1783 :( 0UL ) );
1784 const size_t iend( ( IsUpper_v<MT1> )
1785 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
1786 :( M ) );
1787 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1788
1789 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1790 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1791
1792 size_t i( ibegin );
1793
1794 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1795 const size_t i1( i+SIMDSIZE );
1796 const size_t i2( i+SIMDSIZE*2UL );
1797 const size_t i3( i+SIMDSIZE*3UL );
1798 const SIMDType x1( x.load(i ) );
1799 const SIMDType x2( x.load(i1) );
1800 const SIMDType x3( x.load(i2) );
1801 const SIMDType x4( x.load(i3) );
1802 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1803 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1804 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1805 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1806 }
1807
1808 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1809 const size_t i1( i+SIMDSIZE );
1810 const SIMDType x1( x.load(i ) );
1811 const SIMDType x2( x.load(i1) );
1812 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1813 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1814 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1815 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1816 }
1817
1818 for( ; i<ipos; i+=SIMDSIZE ) {
1819 const SIMDType x1( x.load(i) );
1820 y[j ] += sum( x1 * A.load(i,j ) );
1821 y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1822 y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1823 y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1824 }
1825
1826 for( ; remainder && i<iend; ++i ) {
1827 y[j ] += x[i] * A(i,j );
1828 y[j+1UL] += x[i] * A(i,j+1UL);
1829 y[j+2UL] += x[i] * A(i,j+2UL);
1830 y[j+3UL] += x[i] * A(i,j+3UL);
1831 }
1832 }
1833
1834 for( ; (j+2UL) <= N; j+=2UL )
1835 {
1836 const size_t ibegin( ( IsLower_v<MT1> )
1837 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1838 :( 0UL ) );
1839 const size_t iend( ( IsUpper_v<MT1> )
1840 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
1841 :( M ) );
1842 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1843
1844 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1845 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1846
1847 size_t i( ibegin );
1848
1849 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1850 const size_t i1( i+SIMDSIZE );
1851 const size_t i2( i+SIMDSIZE*2UL );
1852 const size_t i3( i+SIMDSIZE*3UL );
1853 const SIMDType x1( x.load(i ) );
1854 const SIMDType x2( x.load(i1) );
1855 const SIMDType x3( x.load(i2) );
1856 const SIMDType x4( x.load(i3) );
1857 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1858 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1859 }
1860
1861 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1862 const size_t i1( i+SIMDSIZE );
1863 const SIMDType x1( x.load(i ) );
1864 const SIMDType x2( x.load(i1) );
1865 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1866 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1867 }
1868
1869 for( ; i<ipos; i+=SIMDSIZE ) {
1870 const SIMDType x1( x.load(i) );
1871 y[j ] += sum( x1 * A.load(i,j ) );
1872 y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1873 }
1874
1875 for( ; remainder && i<iend; ++i ) {
1876 y[j ] += x[i] * A(i,j );
1877 y[j+1UL] += x[i] * A(i,j+1UL);
1878 }
1879 }
1880
1881 if( j < N )
1882 {
1883 const size_t ibegin( ( IsLower_v<MT1> )
1884 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
1885 :( 0UL ) );
1886 const size_t iend( ( IsUpper_v<MT1> )
1887 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
1888 :( M ) );
1889 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
1890
1891 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
1892 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
1893
1894 size_t i( ibegin );
1895
1896 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1897 const size_t i1( i+SIMDSIZE );
1898 const size_t i2( i+SIMDSIZE*2UL );
1899 const size_t i3( i+SIMDSIZE*3UL );
1900 const SIMDType x1( x.load(i ) );
1901 const SIMDType x2( x.load(i1) );
1902 const SIMDType x3( x.load(i2) );
1903 const SIMDType x4( x.load(i3) );
1904 y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1905 }
1906
1907 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1908 const size_t i1( i+SIMDSIZE );
1909 const SIMDType x1( x.load(i ) );
1910 const SIMDType x2( x.load(i1) );
1911 y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1912 }
1913
1914 for( ; i<ipos; i+=SIMDSIZE ) {
1915 const SIMDType x1( x.load(i) );
1916 y[j] += sum( x1 * A.load(i,j) );
1917 }
1918
1919 for( ; remainder && i<iend; ++i ) {
1920 y[j] += x[i] * A(i,j);
1921 }
1922 }
1923 }
1925 //**********************************************************************************************
1926
1927 //**BLAS-based addition assignment to dense vectors (default)***********************************
1941 template< typename VT1 // Type of the left-hand side target vector
1942 , typename VT2 // Type of the left-hand side vector operand
1943 , typename MT1 > // Type of the right-hand side matrix operand
1944 static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1945 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1946 {
1947 selectLargeAddAssignKernel( y, x, A );
1948 }
1950 //**********************************************************************************************
1951
1952 //**BLAS-based addition assignment to dense vectors*********************************************
1953#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1967 template< typename VT1 // Type of the left-hand side target vector
1968 , typename VT2 // Type of the left-hand side vector operand
1969 , typename MT1 > // Type of the right-hand side matrix operand
1970 static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1971 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1972 {
1973 using ET = ElementType_t<VT1>;
1974
1975 if( IsTriangular_v<MT1> ) {
1976 ResultType_t<VT1> tmp( serial( x ) );
1977 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1978 addAssign( y, tmp );
1979 }
1980 else {
1981 gemv( y, x, A, ET(1), ET(1) );
1982 }
1983 }
1985#endif
1986 //**********************************************************************************************
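   // Note (illustrative): in the BLAS kernel above, the two trailing ET arguments of gemv()
   // act as the alpha and beta scaling factors, i.e. the call evaluates
   // y = alpha * ( x * A ) + beta * y. With ET(1), ET(1) this realizes the addition assignment
   // y += x * A; for triangular matrices the product is instead formed by trmv() on a temporary
   // copy of x, which is subsequently added to y.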
1987
1988 //**Addition assignment to sparse vectors*******************************************************
1989 // No special implementation for the addition assignment to sparse vectors.
1990 //**********************************************************************************************
1991
1992 //**Subtraction assignment to dense vectors*****************************************************
2005 template< typename VT1 > // Type of the target dense vector
2006 friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2007 {
2009
2010 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2011
2012 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2013 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2014 return;
2015 }
2016
2017 LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
2018 RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
2019
2020 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2021 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2022 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2023 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
2024
2025 TDVecTDMatMultExpr::selectSubAssignKernel( *lhs, x, A );
2026 }
2028 //**********************************************************************************************
2029
2030 //**Subtraction assignment to dense vectors (kernel selection)**********************************
2041 template< typename VT1 // Type of the left-hand side target vector
2042 , typename VT2 // Type of the left-hand side vector operand
2043 , typename MT1 > // Type of the right-hand side matrix operand
2044 static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2045 {
2046 if( ( IsDiagonal_v<MT1> ) ||
2047 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2048 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
2049 selectSmallSubAssignKernel( y, x, A );
2050 else
2051 selectBlasSubAssignKernel( y, x, A );
2052 }
2054 //**********************************************************************************************
2055
2056 //**Default subtraction assignment to dense vectors*********************************************
2070 template< typename VT1 // Type of the left-hand side target vector
2071 , typename VT2 // Type of the left-hand side vector operand
2072 , typename MT1 > // Type of the right-hand side matrix operand
2073 static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2074 {
2075 y.subAssign( x * A );
2076 }
2078 //**********************************************************************************************
2079
2080 //**Default subtraction assignment to dense vectors (small matrices)****************************
2094 template< typename VT1 // Type of the left-hand side target vector
2095 , typename VT2 // Type of the left-hand side vector operand
2096 , typename MT1 > // Type of the right-hand side matrix operand
2097 static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2098 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
2099 {
2100 selectDefaultSubAssignKernel( y, x, A );
2101 }
2103 //**********************************************************************************************
2104
2105 //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
2120 template< typename VT1 // Type of the left-hand side target vector
2121 , typename VT2 // Type of the left-hand side vector operand
2122 , typename MT1 > // Type of the right-hand side matrix operand
2123 static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2124 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
2125 {
2126 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
2127
2128 const size_t M( A.rows() );
2129 const size_t N( A.columns() );
2130
2131 size_t j( 0UL );
2132
2133 for( ; (j+8UL) <= N; j+=8UL )
2134 {
2135 const size_t ibegin( ( IsLower_v<MT1> )
2136 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
2137 :( 0UL ) );
2138 const size_t iend( ( IsUpper_v<MT1> )
2139 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
2140 :( M ) );
2141 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
2142
2143 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
2144 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
2145
2146 size_t i( ibegin );
2147
2148 if( i < ipos )
2149 {
2150 SIMDType x1( x.load(i) );
2151 SIMDType xmm1( x1 * A.load(i,j ) );
2152 SIMDType xmm2( x1 * A.load(i,j+1UL) );
2153 SIMDType xmm3( x1 * A.load(i,j+2UL) );
2154 SIMDType xmm4( x1 * A.load(i,j+3UL) );
2155 SIMDType xmm5( x1 * A.load(i,j+4UL) );
2156 SIMDType xmm6( x1 * A.load(i,j+5UL) );
2157 SIMDType xmm7( x1 * A.load(i,j+6UL) );
2158 SIMDType xmm8( x1 * A.load(i,j+7UL) );
2159
2160 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
2161 x1 = x.load(i);
2162 xmm1 += x1 * A.load(i,j );
2163 xmm2 += x1 * A.load(i,j+1UL);
2164 xmm3 += x1 * A.load(i,j+2UL);
2165 xmm4 += x1 * A.load(i,j+3UL);
2166 xmm5 += x1 * A.load(i,j+4UL);
2167 xmm6 += x1 * A.load(i,j+5UL);
2168 xmm7 += x1 * A.load(i,j+6UL);
2169 xmm8 += x1 * A.load(i,j+7UL);
2170 }
2171
2172 y[j ] -= sum( xmm1 );
2173 y[j+1UL] -= sum( xmm2 );
2174 y[j+2UL] -= sum( xmm3 );
2175 y[j+3UL] -= sum( xmm4 );
2176 y[j+4UL] -= sum( xmm5 );
2177 y[j+5UL] -= sum( xmm6 );
2178 y[j+6UL] -= sum( xmm7 );
2179 y[j+7UL] -= sum( xmm8 );
2180
2181 for( ; remainder && i<iend; ++i ) {
2182 y[j ] -= x[i] * A(i,j );
2183 y[j+1UL] -= x[i] * A(i,j+1UL);
2184 y[j+2UL] -= x[i] * A(i,j+2UL);
2185 y[j+3UL] -= x[i] * A(i,j+3UL);
2186 y[j+4UL] -= x[i] * A(i,j+4UL);
2187 y[j+5UL] -= x[i] * A(i,j+5UL);
2188 y[j+6UL] -= x[i] * A(i,j+6UL);
2189 y[j+7UL] -= x[i] * A(i,j+7UL);
2190 }
2191 }
2192 else
2193 {
2194 ElementType value1( x[i] * A(i,j ) );
2195 ElementType value2( x[i] * A(i,j+1UL) );
2196 ElementType value3( x[i] * A(i,j+2UL) );
2197 ElementType value4( x[i] * A(i,j+3UL) );
2198 ElementType value5( x[i] * A(i,j+4UL) );
2199 ElementType value6( x[i] * A(i,j+5UL) );
2200 ElementType value7( x[i] * A(i,j+6UL) );
2201 ElementType value8( x[i] * A(i,j+7UL) );
2202
2203 for( ++i; i<iend; ++i ) {
2204 value1 += x[i] * A(i,j );
2205 value2 += x[i] * A(i,j+1UL);
2206 value3 += x[i] * A(i,j+2UL);
2207 value4 += x[i] * A(i,j+3UL);
2208 value5 += x[i] * A(i,j+4UL);
2209 value6 += x[i] * A(i,j+5UL);
2210 value7 += x[i] * A(i,j+6UL);
2211 value8 += x[i] * A(i,j+7UL);
2212 }
2213
2214 y[j ] -= value1;
2215 y[j+1UL] -= value2;
2216 y[j+2UL] -= value3;
2217 y[j+3UL] -= value4;
2218 y[j+4UL] -= value5;
2219 y[j+5UL] -= value6;
2220 y[j+6UL] -= value7;
2221 y[j+7UL] -= value8;
2222 }
2223 }
2224
2225 for( ; (j+4UL) <= N; j+=4UL )
2226 {
2227 const size_t ibegin( ( IsLower_v<MT1> )
2228 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
2229 :( 0UL ) );
2230 const size_t iend( ( IsUpper_v<MT1> )
2231 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
2232 :( M ) );
2233 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
2234
2235 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
2236 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
2237
2238 size_t i( ibegin );
2239
2240 if( i < ipos )
2241 {
2242 SIMDType x1( x.load(i) );
2243 SIMDType xmm1( x1 * A.load(i,j ) );
2244 SIMDType xmm2( x1 * A.load(i,j+1UL) );
2245 SIMDType xmm3( x1 * A.load(i,j+2UL) );
2246 SIMDType xmm4( x1 * A.load(i,j+3UL) );
2247
2248 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
2249 x1 = x.load(i);
2250 xmm1 += x1 * A.load(i,j );
2251 xmm2 += x1 * A.load(i,j+1UL);
2252 xmm3 += x1 * A.load(i,j+2UL);
2253 xmm4 += x1 * A.load(i,j+3UL);
2254 }
2255
2256 y[j ] -= sum( xmm1 );
2257 y[j+1UL] -= sum( xmm2 );
2258 y[j+2UL] -= sum( xmm3 );
2259 y[j+3UL] -= sum( xmm4 );
2260
2261 for( ; remainder && i<iend; ++i ) {
2262 y[j ] -= x[i] * A(i,j );
2263 y[j+1UL] -= x[i] * A(i,j+1UL);
2264 y[j+2UL] -= x[i] * A(i,j+2UL);
2265 y[j+3UL] -= x[i] * A(i,j+3UL);
2266 }
2267 }
2268 else
2269 {
2270 ElementType value1( x[i] * A(i,j ) );
2271 ElementType value2( x[i] * A(i,j+1UL) );
2272 ElementType value3( x[i] * A(i,j+2UL) );
2273 ElementType value4( x[i] * A(i,j+3UL) );
2274
2275 for( ++i; i<iend; ++i ) {
2276 value1 += x[i] * A(i,j );
2277 value2 += x[i] * A(i,j+1UL);
2278 value3 += x[i] * A(i,j+2UL);
2279 value4 += x[i] * A(i,j+3UL);
2280 }
2281
2282 y[j ] -= value1;
2283 y[j+1UL] -= value2;
2284 y[j+2UL] -= value3;
2285 y[j+3UL] -= value4;
2286 }
2287 }
2288
2289 for( ; (j+3UL) <= N; j+=3UL )
2290 {
2291 const size_t ibegin( ( IsLower_v<MT1> )
2292 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
2293 :( 0UL ) );
2294 const size_t iend( ( IsUpper_v<MT1> )
2295 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
2296 :( M ) );
2297 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
2298
2299 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
2300 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
2301
2302 size_t i( ibegin );
2303
2304 if( i < ipos )
2305 {
2306 SIMDType x1( x.load(i) );
2307 SIMDType xmm1( x1 * A.load(i,j ) );
2308 SIMDType xmm2( x1 * A.load(i,j+1UL) );
2309 SIMDType xmm3( x1 * A.load(i,j+2UL) );
2310
2311 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
2312 x1 = x.load(i);
2313 xmm1 += x1 * A.load(i,j );
2314 xmm2 += x1 * A.load(i,j+1UL);
2315 xmm3 += x1 * A.load(i,j+2UL);
2316 }
2317
2318 y[j ] -= sum( xmm1 );
2319 y[j+1UL] -= sum( xmm2 );
2320 y[j+2UL] -= sum( xmm3 );
2321
2322 for( ; remainder && i<iend; ++i ) {
2323 y[j ] -= x[i] * A(i,j );
2324 y[j+1UL] -= x[i] * A(i,j+1UL);
2325 y[j+2UL] -= x[i] * A(i,j+2UL);
2326 }
2327 }
2328 else
2329 {
2330 ElementType value1( x[i] * A(i,j ) );
2331 ElementType value2( x[i] * A(i,j+1UL) );
2332 ElementType value3( x[i] * A(i,j+2UL) );
2333
2334 for( ++i; i<iend; ++i ) {
2335 value1 += x[i] * A(i,j );
2336 value2 += x[i] * A(i,j+1UL);
2337 value3 += x[i] * A(i,j+2UL);
2338 }
2339
2340 y[j ] -= value1;
2341 y[j+1UL] -= value2;
2342 y[j+2UL] -= value3;
2343 }
2344 }
2345
2346 for( ; (j+2UL) <= N; j+=2UL )
2347 {
2348 const size_t ibegin( ( IsLower_v<MT1> )
2349 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
2350 :( 0UL ) );
2351 const size_t iend( ( IsUpper_v<MT1> )
2352 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
2353 :( M ) );
2354 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
2355
2356 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
2357 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
2358
2359 size_t i( ibegin );
2360
2361 if( i < ipos )
2362 {
2363 SIMDType x1( x.load(i) );
2364 SIMDType xmm1( x1 * A.load(i,j ) );
2365 SIMDType xmm2( x1 * A.load(i,j+1UL) );
2366
2367 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
2368 x1 = x.load(i);
2369 xmm1 += x1 * A.load(i,j );
2370 xmm2 += x1 * A.load(i,j+1UL);
2371 }
2372
2373 y[j ] -= sum( xmm1 );
2374 y[j+1UL] -= sum( xmm2 );
2375
2376 for( ; remainder && i<iend; ++i ) {
2377 y[j ] -= x[i] * A(i,j );
2378 y[j+1UL] -= x[i] * A(i,j+1UL);
2379 }
2380 }
2381 else
2382 {
2383 ElementType value1( x[i] * A(i,j ) );
2384 ElementType value2( x[i] * A(i,j+1UL) );
2385
2386 for( ++i; i<iend; ++i ) {
2387 value1 += x[i] * A(i,j );
2388 value2 += x[i] * A(i,j+1UL);
2389 }
2390
2391 y[j ] -= value1;
2392 y[j+1UL] -= value2;
2393 }
2394 }
2395
2396 if( j < N )
2397 {
2398 const size_t ibegin( ( IsLower_v<MT1> )
2399 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
2400 :( 0UL ) );
2401 const size_t iend( ( IsUpper_v<MT1> )
2402 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
2403 :( M ) );
2404 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
2405
2406 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
2407 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
2408
2409 size_t i( ibegin );
2410
2411 if( i < ipos )
2412 {
2413 SIMDType xmm1( x.load(i) * A.load(i,j) );
2414
2415 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
2416             xmm1 += x.load(i) * A.load(i,j);
2417 }
2418
2419 y[j] -= sum( xmm1 );
2420
2421 for( ; remainder && i<iend; ++i ) {
2422 y[j] -= x[i] * A(i,j);
2423 }
2424 }
2425 else
2426 {
2427 ElementType value( x[i] * A(i,j) );
2428
2429 for( ++i; i<iend; ++i ) {
2430 value += x[i] * A(i,j);
2431 }
2432
2433 y[j] -= value;
2434 }
2435 }
2436 }
2438 //**********************************************************************************************
2439
2440 //**Default subtraction assignment to dense vectors (large matrices)****************************
2454 template< typename VT1 // Type of the left-hand side target vector
2455 , typename VT2 // Type of the left-hand side vector operand
2456 , typename MT1 > // Type of the right-hand side matrix operand
2457 static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2458 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
2459 {
2460 selectDefaultSubAssignKernel( y, x, A );
2461 }
2463 //**********************************************************************************************
2464
2465 //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
2480 template< typename VT1 // Type of the left-hand side target vector
2481 , typename VT2 // Type of the left-hand side vector operand
2482 , typename MT1 > // Type of the right-hand side matrix operand
2483 static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2484 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
2485 {
2486 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
2487
2488 const size_t M( A.rows() );
2489 const size_t N( A.columns() );
2490
2491 size_t j( 0UL );
2492
2493 for( ; (j+8UL) <= N; j+=8UL )
2494 {
2495 const size_t ibegin( ( IsLower_v<MT1> )
2496 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
2497 :( 0UL ) );
2498 const size_t iend( ( IsUpper_v<MT1> )
2499 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
2500 :( M ) );
2501 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
2502
2503 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
2504 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
2505
2506 size_t i( ibegin );
2507
2508 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2509 const size_t i1( i+SIMDSIZE );
2510 const size_t i2( i+SIMDSIZE*2UL );
2511 const size_t i3( i+SIMDSIZE*3UL );
2512 const SIMDType x1( x.load(i ) );
2513 const SIMDType x2( x.load(i1) );
2514 const SIMDType x3( x.load(i2) );
2515 const SIMDType x4( x.load(i3) );
2516 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2517 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2518 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2519 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2520 y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2521 y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2522 y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2523 y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
2524 }
2525
2526 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2527 const size_t i1( i+SIMDSIZE );
2528 const SIMDType x1( x.load(i ) );
2529 const SIMDType x2( x.load(i1) );
2530 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2531 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2532 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2533 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2534 y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2535 y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2536 y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2537 y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
2538 }
2539
2540 for( ; i<ipos; i+=SIMDSIZE ) {
2541 const SIMDType x1( x.load(i) );
2542 y[j ] -= sum( x1 * A.load(i,j ) );
2543 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2544 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2545 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2546 y[j+4UL] -= sum( x1 * A.load(i,j+4UL) );
2547 y[j+5UL] -= sum( x1 * A.load(i,j+5UL) );
2548 y[j+6UL] -= sum( x1 * A.load(i,j+6UL) );
2549 y[j+7UL] -= sum( x1 * A.load(i,j+7UL) );
2550 }
2551
2552 for( ; remainder && i<iend; ++i ) {
2553 y[j ] -= x[i] * A(i,j );
2554 y[j+1UL] -= x[i] * A(i,j+1UL);
2555 y[j+2UL] -= x[i] * A(i,j+2UL);
2556 y[j+3UL] -= x[i] * A(i,j+3UL);
2557 y[j+4UL] -= x[i] * A(i,j+4UL);
2558 y[j+5UL] -= x[i] * A(i,j+5UL);
2559 y[j+6UL] -= x[i] * A(i,j+6UL);
2560 y[j+7UL] -= x[i] * A(i,j+7UL);
2561 }
2562 }
2563
2564 for( ; (j+4UL) <= N; j+=4UL )
2565 {
2566 const size_t ibegin( ( IsLower_v<MT1> )
2567 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
2568 :( 0UL ) );
2569 const size_t iend( ( IsUpper_v<MT1> )
2570 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
2571 :( M ) );
2572 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
2573
2574 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
2575 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
2576
2577 size_t i( ibegin );
2578
2579 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2580 const size_t i1( i+SIMDSIZE );
2581 const size_t i2( i+SIMDSIZE*2UL );
2582 const size_t i3( i+SIMDSIZE*3UL );
2583 const SIMDType x1( x.load(i ) );
2584 const SIMDType x2( x.load(i1) );
2585 const SIMDType x3( x.load(i2) );
2586 const SIMDType x4( x.load(i3) );
2587 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2588 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2589 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2590 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2591 }
2592
2593 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2594 const size_t i1( i+SIMDSIZE );
2595 const SIMDType x1( x.load(i ) );
2596 const SIMDType x2( x.load(i1) );
2597 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2598 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2599 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2600 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2601 }
2602
2603 for( ; i<ipos; i+=SIMDSIZE ) {
2604 const SIMDType x1( x.load(i) );
2605 y[j ] -= sum( x1 * A.load(i,j ) );
2606 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2607 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2608 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2609 }
2610
2611 for( ; remainder && i<iend; ++i ) {
2612 y[j ] -= x[i] * A(i,j );
2613 y[j+1UL] -= x[i] * A(i,j+1UL);
2614 y[j+2UL] -= x[i] * A(i,j+2UL);
2615 y[j+3UL] -= x[i] * A(i,j+3UL);
2616 }
2617 }
2618
2619 for( ; (j+2UL) <= N; j+=2UL )
2620 {
2621 const size_t ibegin( ( IsLower_v<MT1> )
2622 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
2623 :( 0UL ) );
2624 const size_t iend( ( IsUpper_v<MT1> )
2625 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
2626 :( M ) );
2627 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
2628
2629 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
2630 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
2631
2632 size_t i( ibegin );
2633
2634 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2635 const size_t i1( i+SIMDSIZE );
2636 const size_t i2( i+SIMDSIZE*2UL );
2637 const size_t i3( i+SIMDSIZE*3UL );
2638 const SIMDType x1( x.load(i ) );
2639 const SIMDType x2( x.load(i1) );
2640 const SIMDType x3( x.load(i2) );
2641 const SIMDType x4( x.load(i3) );
2642 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2643 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2644 }
2645
2646 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2647 const size_t i1( i+SIMDSIZE );
2648 const SIMDType x1( x.load(i ) );
2649 const SIMDType x2( x.load(i1) );
2650 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2651 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2652 }
2653
2654 for( ; i<ipos; i+=SIMDSIZE ) {
2655 const SIMDType x1( x.load(i) );
2656 y[j ] -= sum( x1 * A.load(i,j ) );
2657 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2658 }
2659
2660 for( ; remainder && i<iend; ++i ) {
2661 y[j ] -= x[i] * A(i,j );
2662 y[j+1UL] -= x[i] * A(i,j+1UL);
2663 }
2664 }
2665
2666 if( j < N )
2667 {
2668 const size_t ibegin( ( IsLower_v<MT1> )
2669 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
2670 :( 0UL ) );
2671 const size_t iend( ( IsUpper_v<MT1> )
2672 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
2673 :( M ) );
2674 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
2675
2676 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
2677 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
2678
2679 size_t i( ibegin );
2680
2681 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2682 const size_t i1( i+SIMDSIZE );
2683 const size_t i2( i+SIMDSIZE*2UL );
2684 const size_t i3( i+SIMDSIZE*3UL );
2685 const SIMDType x1( x.load(i ) );
2686 const SIMDType x2( x.load(i1) );
2687 const SIMDType x3( x.load(i2) );
2688 const SIMDType x4( x.load(i3) );
2689 y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
2690 }
2691
2692 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2693 const size_t i1( i+SIMDSIZE );
2694 const SIMDType x1( x.load(i ) );
2695 const SIMDType x2( x.load(i1) );
2696 y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
2697 }
2698
2699 for( ; i<ipos; i+=SIMDSIZE ) {
2700 const SIMDType x1( x.load(i) );
2701 y[j] -= sum( x1 * A.load(i,j) );
2702 }
2703
2704 for( ; remainder && i<iend; ++i ) {
2705 y[j] -= x[i] * A(i,j);
2706 }
2707 }
2708 }
2710 //**********************************************************************************************
2711
2712 //**BLAS-based subtraction assignment to dense vectors (default)********************************
2726 template< typename VT1 // Type of the left-hand side target vector
2727 , typename VT2 // Type of the left-hand side vector operand
2728 , typename MT1 > // Type of the right-hand side matrix operand
2729 static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2730 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2731 {
2732 selectLargeSubAssignKernel( y, x, A );
2733 }
2735 //**********************************************************************************************
2736
2737 //**BLAS-based subtraction assignment to dense vectors******************************************
2738#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2752 template< typename VT1 // Type of the left-hand side target vector
2753 , typename VT2 // Type of the left-hand side vector operand
2754 , typename MT1 > // Type of the right-hand side matrix operand
2755 static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2756 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2757 {
2758 using ET = ElementType_t<VT1>;
2759
2760 if( IsTriangular_v<MT1> ) {
2761 ResultType_t<VT1> tmp( serial( x ) );
2762 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2763 subAssign( y, tmp );
2764 }
2765 else {
2766 gemv( y, x, A, ET(-1), ET(1) );
2767 }
2768 }
2770#endif
2771 //**********************************************************************************************
2772
2773 //**Subtraction assignment to sparse vectors****************************************************
2774 // No special implementation for the subtraction assignment to sparse vectors.
2775 //**********************************************************************************************
2776
2777 //**Multiplication assignment to dense vectors**************************************************
2790 template< typename VT1 > // Type of the target dense vector
2791 friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2792 {
2794
2798
2799 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2800
2801 const ResultType tmp( serial( rhs ) );
2802 multAssign( *lhs, tmp );
2803 }
2805 //**********************************************************************************************
2806
2807 //**Multiplication assignment to sparse vectors*************************************************
2808 // No special implementation for the multiplication assignment to sparse vectors.
2809 //**********************************************************************************************
2810
2811 //**Division assignment to dense vectors********************************************************
2824 template< typename VT1 > // Type of the target dense vector
2825 friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2826 {
2828
2832
2833 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2834
2835 const ResultType tmp( serial( rhs ) );
2836 divAssign( *lhs, tmp );
2837 }
2839 //**********************************************************************************************
2840
2841 //**Division assignment to sparse vectors*******************************************************
2842 // No special implementation for the division assignment to sparse vectors.
2843 //**********************************************************************************************
2844
2845 //**SMP assignment to dense vectors*************************************************************
2860 template< typename VT1 > // Type of the target dense vector
2861 friend inline auto smpAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2862 -> EnableIf_t< UseSMPAssign_v<VT1> >
2863 {
2865
2866 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2867
2868 if( rhs.mat_.rows() == 0UL ||
2869 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2870 reset( *lhs );
2871 return;
2872 }
2873 else if( rhs.mat_.columns() == 0UL ) {
2874 return;
2875 }
2876
2877 LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2878 RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2879
2880 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2881 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2882 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2883 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
2884
2885 smpAssign( *lhs, x * A );
2886 }
2888 //**********************************************************************************************
2889
2890 //**SMP assignment to sparse vectors************************************************************
2905 template< typename VT1 > // Type of the target sparse vector
2906 friend inline auto smpAssign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2907 -> EnableIf_t< UseSMPAssign_v<VT1> >
2908 {
2910
2914
2915 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2916
2917 const ResultType tmp( rhs );
2918 smpAssign( *lhs, tmp );
2919 }
2921 //**********************************************************************************************
2922
2923 //**SMP addition assignment to dense vectors****************************************************
2938 template< typename VT1 > // Type of the target dense vector
2939 friend inline auto smpAddAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2940 -> EnableIf_t< UseSMPAssign_v<VT1> >
2941 {
2943
2944 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2945
2946 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2947 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2948 return;
2949 }
2950
2951 LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2952 RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2953
2954 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2955 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2956 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2957 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
2958
2959 smpAddAssign( *lhs, x * A );
2960 }
2962 //**********************************************************************************************
2963
2964 //**SMP addition assignment to sparse vectors***************************************************
2965 // No special implementation for the SMP addition assignment to sparse vectors.
2966 //**********************************************************************************************
2967
2968 //**SMP subtraction assignment to dense vectors*************************************************
2983 template< typename VT1 > // Type of the target dense vector
2984 friend inline auto smpSubAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2985 -> EnableIf_t< UseSMPAssign_v<VT1> >
2986 {
2988
2989 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2990
2991 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2992 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2993 return;
2994 }
2995
2996 LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2997 RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2998
2999 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
3000 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
3001 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
3002 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
3003
3004 smpSubAssign( *lhs, x * A );
3005 }
3007 //**********************************************************************************************
3008
3009 //**SMP subtraction assignment to sparse vectors************************************************
3010 // No special implementation for the SMP subtraction assignment to sparse vectors.
3011 //**********************************************************************************************
3012
3013 //**SMP multiplication assignment to dense vectors**********************************************
3028 template< typename VT1 > // Type of the target dense vector
3029 friend inline auto smpMultAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
3030 -> EnableIf_t< UseSMPAssign_v<VT1> >
3031 {
3033
3037
3038 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
3039
3040 const ResultType tmp( rhs );
3041 smpMultAssign( *lhs, tmp );
3042 }
3044 //**********************************************************************************************
3045
3046 //**SMP multiplication assignment to sparse vectors*********************************************
3047 // No special implementation for the SMP multiplication assignment to sparse vectors.
3048 //**********************************************************************************************
3049
3050 //**SMP division assignment to dense vectors****************************************************
3065 template< typename VT1 > // Type of the target dense vector
3066 friend inline auto smpDivAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
3067 -> EnableIf_t< UseSMPAssign_v<VT1> >
3068 {
3070
3074
3075 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
3076
3077 const ResultType tmp( rhs );
3078 smpDivAssign( *lhs, tmp );
3079 }
3081 //**********************************************************************************************
3082
3083 //**SMP division assignment to sparse vectors***************************************************
3084 // No special implementation for the SMP division assignment to sparse vectors.
3085 //**********************************************************************************************
3086
3087 //**Compile time checks*************************************************************************
3095 //**********************************************************************************************
3096};
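// Usage sketch (illustrative): this expression type results from multiplying a transpose (row)
// dense vector with a column-major dense matrix. The assignment operators of the target vector
// dispatch into the kernels defined above, for example:
//
//    blaze::DynamicVector<double,blaze::rowVector>   x( 100UL, 1.0 ), y;
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 100UL, 50UL, 2.0 );
//
//    y  = x * A;   // plain assignment
//    y += x * A;   // addition assignment, handled by the addition assignment kernels above
//    y -= x * A;   // subtraction assignment, handled by the subAssign() overload above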
3097//*************************************************************************************************
3098
3099
3100
3101
3102//=================================================================================================
3103//
3104// DVECSCALARMULTEXPR SPECIALIZATION
3105//
3106//=================================================================================================
3107
3108//*************************************************************************************************
3116template< typename VT // Type of the left-hand side dense vector
3117 , typename MT // Type of the right-hand side dense matrix
3118        , typename ST >  // Type of the right-hand side scalar value
3119class DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >
3120 : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true > >
3121 , private Computation
3122{
3123 private:
3124 //**Type definitions****************************************************************************
3125 using VMM = TDVecTDMatMultExpr<VT,MT>;
3126 using RES = ResultType_t<VMM>;
3127 using VRT = ResultType_t<VT>;
3128 using MRT = ResultType_t<MT>;
3129 using VET = ElementType_t<VRT>;
3130 using MET = ElementType_t<MRT>;
3131 using VCT = CompositeType_t<VT>;
3132 using MCT = CompositeType_t<MT>;
3133 //**********************************************************************************************
3134
3135 //**********************************************************************************************
3137 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
3138 //**********************************************************************************************
3139
3140 //**********************************************************************************************
3142 static constexpr bool evaluateMatrix =
3143 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
3144 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
3145 //**********************************************************************************************
3146
3147 //**********************************************************************************************
3149
3151 template< typename T1 >
3152 static constexpr bool UseSMPAssign_v =
3153 ( T1::smpAssignable && ( evaluateVector || evaluateMatrix ) );
3154 //**********************************************************************************************
3155
3156 //**********************************************************************************************
3158
3160 template< typename T1, typename T2, typename T3, typename T4 >
3161 static constexpr bool UseBlasKernel_v =
3163 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
3164 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
3165 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
3166 !IsDiagonal_v<T3> &&
3167 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
3168 IsBLASCompatible_v< ElementType_t<T1> > &&
3169 IsBLASCompatible_v< ElementType_t<T2> > &&
3170 IsBLASCompatible_v< ElementType_t<T3> > &&
3171 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
3172 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
3173 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
3174 //**********************************************************************************************
3175
3176 //**********************************************************************************************
3178
3181 template< typename T1, typename T2, typename T3, typename T4 >
3182 static constexpr bool UseVectorizedDefaultKernel_v =
3183 ( useOptimizedKernels &&
3184 !IsDiagonal_v<T3> &&
3185 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
3186 IsSIMDCombinable_v< ElementType_t<T1>
3187 , ElementType_t<T2>
3188 , ElementType_t<T3>
3189 , T4 > &&
3190 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
3191 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
3192 //**********************************************************************************************
3193
3194 public:
3195 //**Type definitions****************************************************************************
3197 using This = DVecScalarMultExpr<VMM,ST,true>;
3198
3200 using BaseType = VecScalarMultExpr< DenseVector<This,true> >;
3201
3202 using ResultType = MultTrait_t<RES,ST>;
3203 using TransposeType = TransposeType_t<ResultType>;
3204 using ElementType = ElementType_t<ResultType>;
3205 using SIMDType = SIMDTrait_t<ElementType>;
3206 using ReturnType = const ElementType;
3207 using CompositeType = const ResultType;
3208
3210 using LeftOperand = const TDVecTDMatMultExpr<VT,MT>;
3211
3213 using RightOperand = ST;
3214
3216 using LT = If_t< evaluateVector, const VRT, VCT >;
3217
3219 using RT = If_t< evaluateMatrix, const MRT, MCT >;
3220 //**********************************************************************************************
3221
3222 //**Compilation flags***************************************************************************
3224 static constexpr bool simdEnabled =
3225 ( !IsDiagonal_v<MT> &&
3226 VT::simdEnabled && MT::simdEnabled &&
3227 IsSIMDCombinable_v<VET,MET,ST> &&
3228 HasSIMDAdd_v<VET,MET> &&
3229 HasSIMDMult_v<VET,MET> );
3230
3232 static constexpr bool smpAssignable =
3233 ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
3234 //**********************************************************************************************
3235
3236 //**SIMD properties*****************************************************************************
3238 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
3239 //**********************************************************************************************
3240
3241 //**Constructor*********************************************************************************
3247 inline DVecScalarMultExpr( const VMM& vector, ST scalar )
3248 : vector_( vector ) // Left-hand side dense vector of the multiplication expression
3249 , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
3250 {}
3251 //**********************************************************************************************
3252
3253 //**Subscript operator**************************************************************************
3259 inline ReturnType operator[]( size_t index ) const {
3260 BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
3261 return vector_[index] * scalar_;
3262 }
3263 //**********************************************************************************************
3264
3265 //**At function*********************************************************************************
3272 inline ReturnType at( size_t index ) const {
3273 if( index >= vector_.size() ) {
3274 BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
3275 }
3276 return (*this)[index];
3277 }
3278 //**********************************************************************************************
3279
3280 //**Size function*******************************************************************************
3285 inline size_t size() const {
3286 return vector_.size();
3287 }
3288 //**********************************************************************************************
3289
3290 //**Left operand access*************************************************************************
3295 inline LeftOperand leftOperand() const {
3296 return vector_;
3297 }
3298 //**********************************************************************************************
3299
3300 //**Right operand access************************************************************************
3305 inline RightOperand rightOperand() const {
3306 return scalar_;
3307 }
3308 //**********************************************************************************************
3309
3310 //**********************************************************************************************
3316 template< typename T >
3317 inline bool canAlias( const T* alias ) const {
3318 return vector_.canAlias( alias );
3319 }
3320 //**********************************************************************************************
3321
3322 //**********************************************************************************************
3328 template< typename T >
3329 inline bool isAliased( const T* alias ) const {
3330 return vector_.isAliased( alias );
3331 }
3332 //**********************************************************************************************
3333
3334 //**********************************************************************************************
3339 inline bool isAligned() const {
3340 return vector_.isAligned();
3341 }
3342 //**********************************************************************************************
3343
3344 //**********************************************************************************************
3349 inline bool canSMPAssign() const noexcept {
3350 RightOperand_t<VMM> A( vector_.rightOperand() );
3351 return ( !BLAZE_BLAS_MODE ||
3354 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3355 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
3356 ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
3357 }
3358 //**********************************************************************************************
3359
3360 private:
3361 //**Member variables****************************************************************************
3362    LeftOperand  vector_;  //!< Left-hand side dense vector of the multiplication expression.
3363    RightOperand scalar_;  //!< Right-hand side scalar of the multiplication expression.
3364    //**********************************************************************************************
3365
3366 //**Assignment to dense vectors*****************************************************************
3378 template< typename VT1 // Type of the target dense vector
3379 , bool TF > // Transpose flag of the target dense vector
3380 friend inline void assign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3381 {
3383
3384 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
3385
3386 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3387 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3388
3389 if( right.rows() == 0UL ||
3390 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
3391 reset( *lhs );
3392 return;
3393 }
3394 else if( right.columns() == 0UL ) {
3395 return;
3396 }
3397
3398 LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3399 RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3400
3401 BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3402 BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3403 BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3404 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
3405
3406 DVecScalarMultExpr::selectAssignKernel( *lhs, x, A, rhs.scalar_ );
3407 }
3408 //**********************************************************************************************
3409
3410 //**Assignment to dense vectors (kernel selection)**********************************************
3421 template< typename VT1 // Type of the left-hand side target vector
3422 , typename VT2 // Type of the left-hand side vector operand
3423 , typename MT1 // Type of the right-hand side matrix operand
3424 , typename ST2 > // Type of the scalar value
3425 static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3426 {
3427 if( ( IsDiagonal_v<MT1> ) ||
3428 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3429 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3430 selectSmallAssignKernel( y, x, A, scalar );
3431 else
3432 selectBlasAssignKernel( y, x, A, scalar );
3433 }
3434 //**********************************************************************************************
3435
3436 //**Default assignment to dense vectors*********************************************************
3450 template< typename VT1 // Type of the left-hand side target vector
3451 , typename VT2 // Type of the left-hand side vector operand
3452 , typename MT1 // Type of the right-hand side matrix operand
3453 , typename ST2 > // Type of the scalar value
3454 static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3455 {
3456 y.assign( x * A * scalar );
3457 }
3458 //**********************************************************************************************
3459
3460 //**Default assignment to dense vectors (small matrices)****************************************
3474 template< typename VT1 // Type of the left-hand side target vector
3475 , typename VT2 // Type of the left-hand side vector operand
3476 , typename MT1 // Type of the right-hand side matrix operand
3477 , typename ST2 > // Type of the scalar value
3478 static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3479 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3480 {
3481 selectDefaultAssignKernel( y, x, A, scalar );
3482 }
3483 //**********************************************************************************************
3484
3485 //**Vectorized default assignment to dense vectors (small matrices)*****************************
3500 template< typename VT1 // Type of the left-hand side target vector
3501 , typename VT2 // Type of the left-hand side vector operand
3502 , typename MT1 // Type of the right-hand side matrix operand
3503 , typename ST2 > // Type of the scalar value
3504 static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3505 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3506 {
3507 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
3508
3509 const size_t M( A.rows() );
3510 const size_t N( A.columns() );
3511
3512 size_t j( 0UL );
3513
3514 for( ; (j+8UL) <= N; j+=8UL )
3515 {
3516 const size_t ibegin( ( IsLower_v<MT1> )
3517 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
3518 :( 0UL ) );
3519 const size_t iend( ( IsUpper_v<MT1> )
3520 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
3521 :( M ) );
3522 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3523
3524 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
3525 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
3526
3527 size_t i( ibegin );
3528
3529 if( i < ipos )
3530 {
3531 SIMDType x1( x.load(i) );
3532 SIMDType xmm1( x1 * A.load(i,j ) );
3533 SIMDType xmm2( x1 * A.load(i,j+1UL) );
3534 SIMDType xmm3( x1 * A.load(i,j+2UL) );
3535 SIMDType xmm4( x1 * A.load(i,j+3UL) );
3536 SIMDType xmm5( x1 * A.load(i,j+4UL) );
3537 SIMDType xmm6( x1 * A.load(i,j+5UL) );
3538 SIMDType xmm7( x1 * A.load(i,j+6UL) );
3539 SIMDType xmm8( x1 * A.load(i,j+7UL) );
3540
3541 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
3542 x1 = x.load(i);
3543 xmm1 += x1 * A.load(i,j );
3544 xmm2 += x1 * A.load(i,j+1UL);
3545 xmm3 += x1 * A.load(i,j+2UL);
3546 xmm4 += x1 * A.load(i,j+3UL);
3547 xmm5 += x1 * A.load(i,j+4UL);
3548 xmm6 += x1 * A.load(i,j+5UL);
3549 xmm7 += x1 * A.load(i,j+6UL);
3550 xmm8 += x1 * A.load(i,j+7UL);
3551 }
3552
3553 y[j ] = sum( xmm1 ) * scalar;
3554 y[j+1UL] = sum( xmm2 ) * scalar;
3555 y[j+2UL] = sum( xmm3 ) * scalar;
3556 y[j+3UL] = sum( xmm4 ) * scalar;
3557 y[j+4UL] = sum( xmm5 ) * scalar;
3558 y[j+5UL] = sum( xmm6 ) * scalar;
3559 y[j+6UL] = sum( xmm7 ) * scalar;
3560 y[j+7UL] = sum( xmm8 ) * scalar;
3561
3562 for( ; remainder && i<iend; ++i ) {
3563 y[j ] += x[i] * A(i,j ) * scalar;
3564 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3565 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3566 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3567 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3568 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3569 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3570 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3571 }
3572 }
3573 else
3574 {
3575 ElementType value1( x[i] * A(i,j ) );
3576 ElementType value2( x[i] * A(i,j+1UL) );
3577 ElementType value3( x[i] * A(i,j+2UL) );
3578 ElementType value4( x[i] * A(i,j+3UL) );
3579 ElementType value5( x[i] * A(i,j+4UL) );
3580 ElementType value6( x[i] * A(i,j+5UL) );
3581 ElementType value7( x[i] * A(i,j+6UL) );
3582 ElementType value8( x[i] * A(i,j+7UL) );
3583
3584 for( ++i; i<iend; ++i ) {
3585 value1 += x[i] * A(i,j );
3586 value2 += x[i] * A(i,j+1UL);
3587 value3 += x[i] * A(i,j+2UL);
3588 value4 += x[i] * A(i,j+3UL);
3589 value5 += x[i] * A(i,j+4UL);
3590 value6 += x[i] * A(i,j+5UL);
3591 value7 += x[i] * A(i,j+6UL);
3592 value8 += x[i] * A(i,j+7UL);
3593 }
3594
3595 y[j ] = value1 * scalar;
3596 y[j+1UL] = value2 * scalar;
3597 y[j+2UL] = value3 * scalar;
3598 y[j+3UL] = value4 * scalar;
3599 y[j+4UL] = value5 * scalar;
3600 y[j+5UL] = value6 * scalar;
3601 y[j+6UL] = value7 * scalar;
3602 y[j+7UL] = value8 * scalar;
3603 }
3604 }
3605
3606 for( ; (j+4UL) <= N; j+=4UL )
3607 {
3608 const size_t ibegin( ( IsLower_v<MT1> )
3609 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
3610 :( 0UL ) );
3611 const size_t iend( ( IsUpper_v<MT1> )
3612 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
3613 :( M ) );
3614 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3615
3616 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
3617 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
3618
3619 size_t i( ibegin );
3620
3621 if( i < ipos )
3622 {
3623 SIMDType x1( x.load(i) );
3624 SIMDType xmm1( x1 * A.load(i,j ) );
3625 SIMDType xmm2( x1 * A.load(i,j+1UL) );
3626 SIMDType xmm3( x1 * A.load(i,j+2UL) );
3627 SIMDType xmm4( x1 * A.load(i,j+3UL) );
3628
3629 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
3630 x1 = x.load(i);
3631 xmm1 += x1 * A.load(i,j );
3632 xmm2 += x1 * A.load(i,j+1UL);
3633 xmm3 += x1 * A.load(i,j+2UL);
3634 xmm4 += x1 * A.load(i,j+3UL);
3635 }
3636
3637 y[j ] = sum( xmm1 ) * scalar;
3638 y[j+1UL] = sum( xmm2 ) * scalar;
3639 y[j+2UL] = sum( xmm3 ) * scalar;
3640 y[j+3UL] = sum( xmm4 ) * scalar;
3641
3642 for( ; remainder && i<iend; ++i ) {
3643 y[j ] += x[i] * A(i,j ) * scalar;
3644 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3645 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3646 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3647 }
3648 }
3649 else
3650 {
3651 ElementType value1( x[i] * A(i,j ) );
3652 ElementType value2( x[i] * A(i,j+1UL) );
3653 ElementType value3( x[i] * A(i,j+2UL) );
3654 ElementType value4( x[i] * A(i,j+3UL) );
3655
3656 for( ++i; i<iend; ++i ) {
3657 value1 += x[i] * A(i,j );
3658 value2 += x[i] * A(i,j+1UL);
3659 value3 += x[i] * A(i,j+2UL);
3660 value4 += x[i] * A(i,j+3UL);
3661 }
3662
3663 y[j ] = value1 * scalar;
3664 y[j+1UL] = value2 * scalar;
3665 y[j+2UL] = value3 * scalar;
3666 y[j+3UL] = value4 * scalar;
3667 }
3668 }
3669
3670 for( ; (j+3UL) <= N; j+=3UL )
3671 {
3672 const size_t ibegin( ( IsLower_v<MT1> )
3673 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
3674 :( 0UL ) );
3675 const size_t iend( ( IsUpper_v<MT1> )
3676 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
3677 :( M ) );
3678 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3679
3680 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
3681 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
3682
3683 size_t i( ibegin );
3684
3685 if( i < ipos )
3686 {
3687 SIMDType x1( x.load(i) );
3688 SIMDType xmm1( x1 * A.load(i,j ) );
3689 SIMDType xmm2( x1 * A.load(i,j+1UL) );
3690 SIMDType xmm3( x1 * A.load(i,j+2UL) );
3691
3692 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
3693 x1 = x.load(i);
3694 xmm1 += x1 * A.load(i,j );
3695 xmm2 += x1 * A.load(i,j+1UL);
3696 xmm3 += x1 * A.load(i,j+2UL);
3697 }
3698
3699 y[j ] = sum( xmm1 ) * scalar;
3700 y[j+1UL] = sum( xmm2 ) * scalar;
3701 y[j+2UL] = sum( xmm3 ) * scalar;
3702
3703 for( ; remainder && i<iend; ++i ) {
3704 y[j ] += x[i] * A(i,j ) * scalar;
3705 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3706 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3707 }
3708 }
3709 else
3710 {
3711 ElementType value1( x[i] * A(i,j ) );
3712 ElementType value2( x[i] * A(i,j+1UL) );
3713 ElementType value3( x[i] * A(i,j+2UL) );
3714
3715 for( ++i; i<iend; ++i ) {
3716 value1 += x[i] * A(i,j );
3717 value2 += x[i] * A(i,j+1UL);
3718 value3 += x[i] * A(i,j+2UL);
3719 }
3720
3721 y[j ] = value1 * scalar;
3722 y[j+1UL] = value2 * scalar;
3723 y[j+2UL] = value3 * scalar;
3724 }
3725 }
3726
3727 for( ; (j+2UL) <= N; j+=2UL )
3728 {
3729 const size_t ibegin( ( IsLower_v<MT1> )
3730 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
3731 :( 0UL ) );
3732 const size_t iend( ( IsUpper_v<MT1> )
3733 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
3734 :( M ) );
3735 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3736
3737 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
3738 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
3739
3740 size_t i( ibegin );
3741
3742 if( i < ipos )
3743 {
3744 SIMDType x1( x.load(i) );
3745 SIMDType xmm1( x1 * A.load(i,j ) );
3746 SIMDType xmm2( x1 * A.load(i,j+1UL) );
3747
3748 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
3749 x1 = x.load(i);
3750 xmm1 += x1 * A.load(i,j );
3751 xmm2 += x1 * A.load(i,j+1UL);
3752 }
3753
3754 y[j ] = sum( xmm1 ) * scalar;
3755 y[j+1UL] = sum( xmm2 ) * scalar;
3756
3757 for( ; remainder && i<iend; ++i ) {
3758 y[j ] += x[i] * A(i,j ) * scalar;
3759 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3760 }
3761 }
3762 else
3763 {
3764 ElementType value1( x[i] * A(i,j ) );
3765 ElementType value2( x[i] * A(i,j+1UL) );
3766
3767 for( ++i; i<iend; ++i ) {
3768 value1 += x[i] * A(i,j );
3769 value2 += x[i] * A(i,j+1UL);
3770 }
3771
3772 y[j ] = value1 * scalar;
3773 y[j+1UL] = value2 * scalar;
3774 }
3775 }
3776
3777 if( j < N )
3778 {
3779 const size_t ibegin( ( IsLower_v<MT1> )
3780 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
3781 :( 0UL ) );
3782 const size_t iend( ( IsUpper_v<MT1> )
3783 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
3784 :( M ) );
3785 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3786
3787 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
3788 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
3789
3790 size_t i( ibegin );
3791
3792 if( i < ipos )
3793 {
3794 SIMDType xmm1( x.load(i) * A.load(i,j ) );
3795
3796 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
3797             xmm1 += x.load(i) * A.load(i,j);
3798 }
3799
3800 y[j] = sum( xmm1 ) * scalar;
3801
3802 for( ; remainder && i<iend; ++i ) {
3803 y[j] += x[i] * A(i,j) * scalar;
3804 }
3805 }
3806 else
3807 {
3808 ElementType value( x[i] * A(i,j) );
3809
3810 for( ++i; i<iend; ++i ) {
3811 value += x[i] * A(i,j);
3812 }
3813
3814 y[j] = value * scalar;
3815 }
3816 }
3817 }
3818 //**********************************************************************************************
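   // Note on the vectorized small-matrix kernel above: the columns of A are traversed in
   // progressively smaller blocks (down to three, two, and finally a single column). For each
   // block the kernel accumulates SIMD-wide partial products of x with the respective columns,
   // reduces them via sum(), applies the scalar once per column, and finishes any unaligned
   // tail elements in a scalar remainder loop. For (strictly) lower or upper triangular
   // matrices the ibegin/iend bounds restrict the row range to the structurally non-zero part.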
3819
3820 //**Default assignment to dense vectors (large matrices)****************************************
3834 template< typename VT1 // Type of the left-hand side target vector
3835 , typename VT2 // Type of the left-hand side vector operand
3836 , typename MT1 // Type of the right-hand side matrix operand
3837 , typename ST2 > // Type of the scalar value
3838 static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3839 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3840 {
3841 selectDefaultAssignKernel( y, x, A, scalar );
3842 }
3843 //**********************************************************************************************
3844
3845 //**Vectorized default assignment to dense vectors (large matrices)*****************************
3860 template< typename VT1 // Type of the left-hand side target vector
3861 , typename VT2 // Type of the left-hand side vector operand
3862 , typename MT1 // Type of the right-hand side matrix operand
3863 , typename ST2 > // Type of the scalar value
3864 static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3865 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3866 {
3867 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
3868
3869 const size_t M( A.rows() );
3870 const size_t N( A.columns() );
3871
3872 reset( y );
3873
3874 size_t j( 0UL );
3875
3876 for( ; (j+8UL) <= N; j+=8UL )
3877 {
3878 const size_t ibegin( ( IsLower_v<MT1> )
3879 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
3880 :( 0UL ) );
3881 const size_t iend( ( IsUpper_v<MT1> )
3882 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
3883 :( M ) );
3884 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3885
3886 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
3887 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
3888
3889 size_t i( ibegin );
3890
3891 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3892 const size_t i1( i+SIMDSIZE );
3893 const size_t i2( i+SIMDSIZE*2UL );
3894 const size_t i3( i+SIMDSIZE*3UL );
3895 const SIMDType x1( x.load(i ) );
3896 const SIMDType x2( x.load(i1) );
3897 const SIMDType x3( x.load(i2) );
3898 const SIMDType x4( x.load(i3) );
3899 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3900 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3901 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3902 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3903 y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3904 y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3905 y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3906 y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
3907 }
3908
3909 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3910 const size_t i1( i+SIMDSIZE );
3911 const SIMDType x1( x.load(i ) );
3912 const SIMDType x2( x.load(i1) );
3913 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3914 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3915 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3916 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3917 y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3918 y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3919 y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3920 y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
3921 }
3922
3923 for( ; i<ipos; i+=SIMDSIZE ) {
3924 const SIMDType x1( x.load(i) );
3925 y[j ] += sum( x1 * A.load(i,j ) );
3926 y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3927 y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3928 y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3929 y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
3930 y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
3931 y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
3932 y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
3933 }
3934
3935 for( ; remainder && i<iend; ++i ) {
3936 y[j ] += x[i] * A(i,j );
3937 y[j+1UL] += x[i] * A(i,j+1UL);
3938 y[j+2UL] += x[i] * A(i,j+2UL);
3939 y[j+3UL] += x[i] * A(i,j+3UL);
3940 y[j+4UL] += x[i] * A(i,j+4UL);
3941 y[j+5UL] += x[i] * A(i,j+5UL);
3942 y[j+6UL] += x[i] * A(i,j+6UL);
3943 y[j+7UL] += x[i] * A(i,j+7UL);
3944 }
3945
3946 y[j ] *= scalar;
3947 y[j+1UL] *= scalar;
3948 y[j+2UL] *= scalar;
3949 y[j+3UL] *= scalar;
3950 y[j+4UL] *= scalar;
3951 y[j+5UL] *= scalar;
3952 y[j+6UL] *= scalar;
3953 y[j+7UL] *= scalar;
3954 }
3955
3956 for( ; (j+4UL) <= N; j+=4UL )
3957 {
3958 const size_t ibegin( ( IsLower_v<MT1> )
3959 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
3960 :( 0UL ) );
3961 const size_t iend( ( IsUpper_v<MT1> )
3962 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
3963 :( M ) );
3964 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
3965
3966 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
3967 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
3968
3969 size_t i( ibegin );
3970
3971 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3972 const size_t i1( i+SIMDSIZE );
3973 const size_t i2( i+SIMDSIZE*2UL );
3974 const size_t i3( i+SIMDSIZE*3UL );
3975 const SIMDType x1( x.load(i ) );
3976 const SIMDType x2( x.load(i1) );
3977 const SIMDType x3( x.load(i2) );
3978 const SIMDType x4( x.load(i3) );
3979 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3980 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3981 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3982 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3983 }
3984
3985 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3986 const size_t i1( i+SIMDSIZE );
3987 const SIMDType x1( x.load(i ) );
3988 const SIMDType x2( x.load(i1) );
3989 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3990 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3991 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3992 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3993 }
3994
3995 for( ; i<ipos; i+=SIMDSIZE ) {
3996 const SIMDType x1( x.load(i) );
3997 y[j ] += sum( x1 * A.load(i,j ) );
3998 y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3999 y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
4000 y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
4001 }
4002
4003 for( ; remainder && i<iend; ++i ) {
4004 y[j ] += x[i] * A(i,j );
4005 y[j+1UL] += x[i] * A(i,j+1UL);
4006 y[j+2UL] += x[i] * A(i,j+2UL);
4007 y[j+3UL] += x[i] * A(i,j+3UL);
4008 }
4009
4010 y[j ] *= scalar;
4011 y[j+1UL] *= scalar;
4012 y[j+2UL] *= scalar;
4013 y[j+3UL] *= scalar;
4014 }
4015
4016 for( ; (j+2UL) <= N; j+=2UL )
4017 {
4018 const size_t ibegin( ( IsLower_v<MT1> )
4019 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4020 :( 0UL ) );
4021 const size_t iend( ( IsUpper_v<MT1> )
4022 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
4023 :( M ) );
4024 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4025
4026 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4027 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4028
4029 size_t i( ibegin );
4030
4031 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4032 const size_t i1( i+SIMDSIZE );
4033 const size_t i2( i+SIMDSIZE*2UL );
4034 const size_t i3( i+SIMDSIZE*3UL );
4035 const SIMDType x1( x.load(i ) );
4036 const SIMDType x2( x.load(i1) );
4037 const SIMDType x3( x.load(i2) );
4038 const SIMDType x4( x.load(i3) );
4039 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
4040 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
4041 }
4042
4043 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4044 const size_t i1( i+SIMDSIZE );
4045 const SIMDType x1( x.load(i ) );
4046 const SIMDType x2( x.load(i1) );
4047 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
4048 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
4049 }
4050
4051 for( ; i<ipos; i+=SIMDSIZE ) {
4052 const SIMDType x1( x.load(i) );
4053 y[j ] += sum( x1 * A.load(i,j ) );
4054 y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
4055 }
4056
4057 for( ; remainder && i<iend; ++i ) {
4058 y[j ] += x[i] * A(i,j );
4059 y[j+1UL] += x[i] * A(i,j+1UL);
4060 }
4061
4062 y[j ] *= scalar;
4063 y[j+1UL] *= scalar;
4064 }
4065
4066 if( j < N )
4067 {
4068 const size_t ibegin( ( IsLower_v<MT1> )
4069 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4070 :( 0UL ) );
4071 const size_t iend( ( IsUpper_v<MT1> )
4072 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
4073 :( M ) );
4074 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4075
4076 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4077 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4078
4079 size_t i( ibegin );
4080
4081 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4082 const size_t i1( i+SIMDSIZE );
4083 const size_t i2( i+SIMDSIZE*2UL );
4084 const size_t i3( i+SIMDSIZE*3UL );
4085 const SIMDType x1( x.load(i ) );
4086 const SIMDType x2( x.load(i1) );
4087 const SIMDType x3( x.load(i2) );
4088 const SIMDType x4( x.load(i3) );
4089 y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
4090 }
4091
4092 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4093 const size_t i1( i+SIMDSIZE );
4094 const SIMDType x1( x.load(i ) );
4095 const SIMDType x2( x.load(i1) );
4096 y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
4097 }
4098
4099 for( ; i<ipos; i+=SIMDSIZE ) {
4100 const SIMDType x1( x.load(i) );
4101 y[j] += sum( x1 * A.load(i,j) );
4102 }
4103
4104 for( ; remainder && i<iend; ++i ) {
4105 y[j] += x[i] * A(i,j);
4106 }
4107
4108 y[j] *= scalar;
4109 }
4110 }
4111 //**********************************************************************************************
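   // Note on the vectorized large-matrix kernel above: in contrast to the small-matrix variant,
   // the target vector is reset first and the row dimension is traversed in unrolled steps of
   // four, two, and one SIMD registers. Each step adds the horizontally reduced partial products
   // directly into the y[j..] entries, and the scalar is applied only once per column block at
   // the end, so the number of scalar multiplications stays independent of the row count.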
4112
4113 //**BLAS-based assignment to dense vectors (default)********************************************
4126 template< typename VT1 // Type of the left-hand side target vector
4127 , typename VT2 // Type of the left-hand side vector operand
4128 , typename MT1 // Type of the right-hand side matrix operand
4129 , typename ST2 > // Type of the scalar value
4130 static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4131 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4132 {
4133 selectLargeAssignKernel( y, x, A, scalar );
4134 }
4135 //**********************************************************************************************
4136
4137 //**BLAS-based assignment to dense vectors******************************************************
4138#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4152 template< typename VT1 // Type of the left-hand side target vector
4153 , typename VT2 // Type of the left-hand side vector operand
4154 , typename MT1 // Type of the right-hand side matrix operand
4155 , typename ST2 > // Type of the scalar value
4156 static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4157 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4158 {
4159 using ET = ElementType_t<VT1>;
4160
4161 if( IsTriangular_v<MT1> ) {
4162 assign( y, scalar * x );
4163 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4164 }
4165 else {
4166 gemv( y, x, A, ET(scalar), ET(0) );
4167 }
4168 }
4169#endif
4170 //**********************************************************************************************
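   // Note on the BLAS-based assignment above: for triangular matrices the target is first set
   // to scalar*x and then multiplied in place via trmv(); for all other matrices a single
   // gemv() call with alpha = scalar and beta = 0 computes the complete result.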
4171
4172 //**Assignment to sparse vectors****************************************************************
4184 template< typename VT1 // Type of the target sparse vector
4185 , bool TF > // Transpose flag of the target sparse vector
4186 friend inline void assign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
4187 {
4189
4193
4194 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4195
4196 const ResultType tmp( serial( rhs ) );
4197 assign( *lhs, tmp );
4198 }
4199 //**********************************************************************************************
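   // Illustrative usage (not part of this header): the assign kernels above serve scaled
   // products of a transpose dense vector and a column-major dense matrix. The container
   // types below are merely examples:
   //
   //    #include <blaze/Math.h>
   //
   //    blaze::DynamicVector<double,blaze::rowVector>  x( 300UL );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 300UL, 280UL );
   //    blaze::DynamicVector<double,blaze::rowVector>  y;
   //
   //    y = ( x * A ) * 2.0;  // evaluated via the assignment kernels of this specialization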
4200
4201 //**Addition assignment to dense vectors********************************************************
4213 template< typename VT1 // Type of the target dense vector
4214 , bool TF > // Transpose flag of the target dense vector
4215 friend inline void addAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
4216 {
4218
4219 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4220
4221 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4222 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4223
4224 if( right.rows() == 0UL || right.columns() == 0UL ||
4225 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
4226 return;
4227 }
4228
4229 LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
4230 RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4231
4232 BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4233 BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4234 BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4235 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
4236
4237 DVecScalarMultExpr::selectAddAssignKernel( *lhs, x, A, rhs.scalar_ );
4238 }
4239 //**********************************************************************************************
4240
4241 //**Addition assignment to dense vectors (kernel selection)*************************************
4252 template< typename VT1 // Type of the left-hand side target vector
4253 , typename VT2 // Type of the left-hand side vector operand
4254 , typename MT1 // Type of the right-hand side matrix operand
4255 , typename ST2 > // Type of the scalar value
4256 static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4257 {
4258 if( ( IsDiagonal_v<MT1> ) ||
4259 ( IsComputation_v<MT> && !evaluateMatrix ) ||
4260 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
4261 selectSmallAddAssignKernel( y, x, A, scalar );
4262 else
4263 selectBlasAddAssignKernel( y, x, A, scalar );
4264 }
4265 //**********************************************************************************************
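   // Note on the kernel selection above: the small kernel is chosen for diagonal matrices, for
   // matrix operands that are non-evaluated computation expressions, and for matrices below the
   // size threshold; all remaining cases are routed to the BLAS kernel, which itself falls back
   // to the large default kernel whenever no suitable BLAS routine is available.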
4266
4267 //**Default addition assignment to dense vectors************************************************
4281 template< typename VT1 // Type of the left-hand side target vector
4282 , typename VT2 // Type of the left-hand side vector operand
4283 , typename MT1 // Type of the right-hand side matrix operand
4284 , typename ST2 > // Type of the scalar value
4285 static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4286 {
4287 y.addAssign( x * A * scalar );
4288 }
4289 //**********************************************************************************************
4290
4291 //**Default addition assignment to dense vectors (small matrices)*******************************
4305 template< typename VT1 // Type of the left-hand side target vector
4306 , typename VT2 // Type of the left-hand side vector operand
4307 , typename MT1 // Type of the right-hand side matrix operand
4308 , typename ST2 > // Type of the scalar value
4309 static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4310 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4311 {
4312 selectDefaultAddAssignKernel( y, x, A, scalar );
4313 }
4314 //**********************************************************************************************
4315
4316 //**Vectorized default addition assignment to dense vectors (small matrices)********************
4331 template< typename VT1 // Type of the left-hand side target vector
4332 , typename VT2 // Type of the left-hand side vector operand
4333 , typename MT1 // Type of the right-hand side matrix operand
4334 , typename ST2 > // Type of the scalar value
4335 static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4336 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4337 {
4338 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
4339
4340 const size_t M( A.rows() );
4341 const size_t N( A.columns() );
4342
4343 size_t j( 0UL );
4344
4345 for( ; (j+8UL) <= N; j+=8UL )
4346 {
4347 const size_t ibegin( ( IsLower_v<MT1> )
4348 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4349 :( 0UL ) );
4350 const size_t iend( ( IsUpper_v<MT1> )
4351 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
4352 :( M ) );
4353 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4354
4355 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4356 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4357
4358 size_t i( ibegin );
4359
4360 if( i < ipos )
4361 {
4362 SIMDType x1( x.load(i) );
4363 SIMDType xmm1( x1 * A.load(i,j ) );
4364 SIMDType xmm2( x1 * A.load(i,j+1UL) );
4365 SIMDType xmm3( x1 * A.load(i,j+2UL) );
4366 SIMDType xmm4( x1 * A.load(i,j+3UL) );
4367 SIMDType xmm5( x1 * A.load(i,j+4UL) );
4368 SIMDType xmm6( x1 * A.load(i,j+5UL) );
4369 SIMDType xmm7( x1 * A.load(i,j+6UL) );
4370 SIMDType xmm8( x1 * A.load(i,j+7UL) );
4371
4372 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
4373 x1 = x.load(i);
4374 xmm1 += x1 * A.load(i,j );
4375 xmm2 += x1 * A.load(i,j+1UL);
4376 xmm3 += x1 * A.load(i,j+2UL);
4377 xmm4 += x1 * A.load(i,j+3UL);
4378 xmm5 += x1 * A.load(i,j+4UL);
4379 xmm6 += x1 * A.load(i,j+5UL);
4380 xmm7 += x1 * A.load(i,j+6UL);
4381 xmm8 += x1 * A.load(i,j+7UL);
4382 }
4383
4384 y[j ] += sum( xmm1 ) * scalar;
4385 y[j+1UL] += sum( xmm2 ) * scalar;
4386 y[j+2UL] += sum( xmm3 ) * scalar;
4387 y[j+3UL] += sum( xmm4 ) * scalar;
4388 y[j+4UL] += sum( xmm5 ) * scalar;
4389 y[j+5UL] += sum( xmm6 ) * scalar;
4390 y[j+6UL] += sum( xmm7 ) * scalar;
4391 y[j+7UL] += sum( xmm8 ) * scalar;
4392
4393 for( ; remainder && i<iend; ++i ) {
4394 y[j ] += x[i] * A(i,j ) * scalar;
4395 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4396 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4397 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4398 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4399 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4400 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4401 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4402 }
4403 }
4404 else
4405 {
4406 ElementType value1( x[i] * A(i,j ) );
4407 ElementType value2( x[i] * A(i,j+1UL) );
4408 ElementType value3( x[i] * A(i,j+2UL) );
4409 ElementType value4( x[i] * A(i,j+3UL) );
4410 ElementType value5( x[i] * A(i,j+4UL) );
4411 ElementType value6( x[i] * A(i,j+5UL) );
4412 ElementType value7( x[i] * A(i,j+6UL) );
4413 ElementType value8( x[i] * A(i,j+7UL) );
4414
4415 for( ++i; i<iend; ++i ) {
4416 value1 += x[i] * A(i,j );
4417 value2 += x[i] * A(i,j+1UL);
4418 value3 += x[i] * A(i,j+2UL);
4419 value4 += x[i] * A(i,j+3UL);
4420 value5 += x[i] * A(i,j+4UL);
4421 value6 += x[i] * A(i,j+5UL);
4422 value7 += x[i] * A(i,j+6UL);
4423 value8 += x[i] * A(i,j+7UL);
4424 }
4425
4426 y[j ] += value1 * scalar;
4427 y[j+1UL] += value2 * scalar;
4428 y[j+2UL] += value3 * scalar;
4429 y[j+3UL] += value4 * scalar;
4430 y[j+4UL] += value5 * scalar;
4431 y[j+5UL] += value6 * scalar;
4432 y[j+6UL] += value7 * scalar;
4433 y[j+7UL] += value8 * scalar;
4434 }
4435 }
4436
4437 for( ; (j+4UL) <= N; j+=4UL )
4438 {
4439 const size_t ibegin( ( IsLower_v<MT1> )
4440 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4441 :( 0UL ) );
4442 const size_t iend( ( IsUpper_v<MT1> )
4443 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
4444 :( M ) );
4445 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4446
4447 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4448 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4449
4450 size_t i( ibegin );
4451
4452 if( i < ipos )
4453 {
4454 SIMDType x1( x.load(i) );
4455 SIMDType xmm1( x1 * A.load(i,j ) );
4456 SIMDType xmm2( x1 * A.load(i,j+1UL) );
4457 SIMDType xmm3( x1 * A.load(i,j+2UL) );
4458 SIMDType xmm4( x1 * A.load(i,j+3UL) );
4459
4460 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
4461 x1 = x.load(i);
4462 xmm1 += x1 * A.load(i,j );
4463 xmm2 += x1 * A.load(i,j+1UL);
4464 xmm3 += x1 * A.load(i,j+2UL);
4465 xmm4 += x1 * A.load(i,j+3UL);
4466 }
4467
4468 y[j ] += sum( xmm1 ) * scalar;
4469 y[j+1UL] += sum( xmm2 ) * scalar;
4470 y[j+2UL] += sum( xmm3 ) * scalar;
4471 y[j+3UL] += sum( xmm4 ) * scalar;
4472
4473 for( ; remainder && i<iend; ++i ) {
4474 y[j ] += x[i] * A(i,j ) * scalar;
4475 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4476 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4477 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4478 }
4479 }
4480 else
4481 {
4482 ElementType value1( x[i] * A(i,j ) );
4483 ElementType value2( x[i] * A(i,j+1UL) );
4484 ElementType value3( x[i] * A(i,j+2UL) );
4485 ElementType value4( x[i] * A(i,j+3UL) );
4486
4487 for( ++i; i<iend; ++i ) {
4488 value1 += x[i] * A(i,j );
4489 value2 += x[i] * A(i,j+1UL);
4490 value3 += x[i] * A(i,j+2UL);
4491 value4 += x[i] * A(i,j+3UL);
4492 }
4493
4494 y[j ] += value1 * scalar;
4495 y[j+1UL] += value2 * scalar;
4496 y[j+2UL] += value3 * scalar;
4497 y[j+3UL] += value4 * scalar;
4498 }
4499 }
4500
4501 for( ; (j+3UL) <= N; j+=3UL )
4502 {
4503 const size_t ibegin( ( IsLower_v<MT1> )
4504 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4505 :( 0UL ) );
4506 const size_t iend( ( IsUpper_v<MT1> )
4507 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
4508 :( M ) );
4509 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4510
4511 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4512 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4513
4514 size_t i( ibegin );
4515
4516 if( i < ipos )
4517 {
4518 SIMDType x1( x.load(i) );
4519 SIMDType xmm1( x1 * A.load(i,j ) );
4520 SIMDType xmm2( x1 * A.load(i,j+1UL) );
4521 SIMDType xmm3( x1 * A.load(i,j+2UL) );
4522
4523 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
4524 x1 = x.load(i);
4525 xmm1 += x1 * A.load(i,j );
4526 xmm2 += x1 * A.load(i,j+1UL);
4527 xmm3 += x1 * A.load(i,j+2UL);
4528 }
4529
4530 y[j ] += sum( xmm1 ) * scalar;
4531 y[j+1UL] += sum( xmm2 ) * scalar;
4532 y[j+2UL] += sum( xmm3 ) * scalar;
4533
4534 for( ; remainder && i<iend; ++i ) {
4535 y[j ] += x[i] * A(i,j ) * scalar;
4536 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4537 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4538 }
4539 }
4540 else
4541 {
4542 ElementType value1( x[i] * A(i,j ) );
4543 ElementType value2( x[i] * A(i,j+1UL) );
4544 ElementType value3( x[i] * A(i,j+2UL) );
4545
4546 for( ++i; i<iend; ++i ) {
4547 value1 += x[i] * A(i,j );
4548 value2 += x[i] * A(i,j+1UL);
4549 value3 += x[i] * A(i,j+2UL);
4550 }
4551
4552 y[j ] += value1 * scalar;
4553 y[j+1UL] += value2 * scalar;
4554 y[j+2UL] += value3 * scalar;
4555 }
4556 }
4557
4558 for( ; (j+2UL) <= N; j+=2UL )
4559 {
4560 const size_t ibegin( ( IsLower_v<MT1> )
4561 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4562 :( 0UL ) );
4563 const size_t iend( ( IsUpper_v<MT1> )
4564 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
4565 :( M ) );
4566 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4567
4568 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4569 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4570
4571 size_t i( ibegin );
4572
4573 if( i < ipos )
4574 {
4575 SIMDType x1( x.load(i) );
4576 SIMDType xmm1( x1 * A.load(i,j ) );
4577 SIMDType xmm2( x1 * A.load(i,j+1UL) );
4578
4579 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
4580 x1 = x.load(i);
4581 xmm1 += x1 * A.load(i,j );
4582 xmm2 += x1 * A.load(i,j+1UL);
4583 }
4584
4585 y[j ] += sum( xmm1 ) * scalar;
4586 y[j+1UL] += sum( xmm2 ) * scalar;
4587
4588 for( ; remainder && i<iend; ++i ) {
4589 y[j ] += x[i] * A(i,j ) * scalar;
4590 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4591 }
4592 }
4593 else
4594 {
4595 ElementType value1( x[i] * A(i,j ) );
4596 ElementType value2( x[i] * A(i,j+1UL) );
4597
4598 for( ++i; i<iend; ++i ) {
4599 value1 += x[i] * A(i,j );
4600 value2 += x[i] * A(i,j+1UL);
4601 }
4602
4603 y[j ] += value1 * scalar;
4604 y[j+1UL] += value2 * scalar;
4605 }
4606 }
4607
4608 if( j < N )
4609 {
4610 const size_t ibegin( ( IsLower_v<MT1> )
4611 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4612 :( 0UL ) );
4613 const size_t iend( ( IsUpper_v<MT1> )
4614 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
4615 :( M ) );
4616 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4617
4618 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4619 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4620
4621 size_t i( ibegin );
4622
4623 if( i < ipos )
4624 {
4625 SIMDType xmm1( x.load(i) * A.load(i,j) );
4626
4627 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
 4628 xmm1 += x.load(i) * A.load(i,j);
4629 }
4630
4631 y[j] += sum( xmm1 ) * scalar;
4632
4633 for( ; remainder && i<iend; ++i ) {
4634 y[j] += x[i] * A(i,j) * scalar;
4635 }
4636 }
4637 else
4638 {
4639 ElementType value( x[i] * A(i,j) );
4640
4641 for( ++i; i<iend; ++i ) {
4642 value += x[i] * A(i,j);
4643 }
4644
4645 y[j] += value * scalar;
4646 }
4647 }
4648 }
4649 //**********************************************************************************************
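   // Note on the vectorized small-matrix addition kernel above: it mirrors the structure of the
   // small assignment kernel but accumulates into the existing contents of y via +=, so the
   // target vector is intentionally not reset before the column blocks are processed.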
4650
4651 //**Default addition assignment to dense vectors (large matrices)*******************************
4665 template< typename VT1 // Type of the left-hand side target vector
4666 , typename VT2 // Type of the left-hand side vector operand
4667 , typename MT1 // Type of the right-hand side matrix operand
4668 , typename ST2 > // Type of the scalar value
4669 static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4670 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4671 {
4672 selectDefaultAddAssignKernel( y, x, A, scalar );
4673 }
4674 //**********************************************************************************************
4675
4676 //**Vectorized default addition assignment to dense vectors (large matrices)********************
4691 template< typename VT1 // Type of the left-hand side target vector
4692 , typename VT2 // Type of the left-hand side vector operand
4693 , typename MT1 // Type of the right-hand side matrix operand
4694 , typename ST2 > // Type of the scalar value
4695 static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4696 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4697 {
4698 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
4699
4700 const size_t M( A.rows() );
4701 const size_t N( A.columns() );
4702
4703 size_t j( 0UL );
4704
4705 for( ; (j+8UL) <= N; j+=8UL )
4706 {
4707 const size_t ibegin( ( IsLower_v<MT1> )
4708 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4709 :( 0UL ) );
4710 const size_t iend( ( IsUpper_v<MT1> )
4711 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
4712 :( M ) );
4713 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4714
4715 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4716 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4717
4718 size_t i( ibegin );
4719
4720 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4721 const size_t i1( i+SIMDSIZE );
4722 const size_t i2( i+SIMDSIZE*2UL );
4723 const size_t i3( i+SIMDSIZE*3UL );
4724 const SIMDType x1( x.load(i ) );
4725 const SIMDType x2( x.load(i1) );
4726 const SIMDType x3( x.load(i2) );
4727 const SIMDType x4( x.load(i3) );
4728 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4729 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4730 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4731 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4732 y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4733 y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4734 y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4735 y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4736 }
4737
4738 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4739 const size_t i1( i+SIMDSIZE );
4740 const SIMDType x1( x.load(i ) );
4741 const SIMDType x2( x.load(i1) );
4742 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4743 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4744 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4745 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4746 y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4747 y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4748 y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4749 y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4750 }
4751
4752 for( ; i<ipos; i+=SIMDSIZE ) {
4753 const SIMDType x1( x.load(i) );
4754 y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4755 y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4756 y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4757 y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4758 y[j+4UL] += sum( x1 * A.load(i,j+4UL) ) * scalar;
4759 y[j+5UL] += sum( x1 * A.load(i,j+5UL) ) * scalar;
4760 y[j+6UL] += sum( x1 * A.load(i,j+6UL) ) * scalar;
4761 y[j+7UL] += sum( x1 * A.load(i,j+7UL) ) * scalar;
4762 }
4763
4764 for( ; remainder && i<iend; ++i ) {
4765 y[j ] += x[i] * A(i,j ) * scalar;
4766 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4767 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4768 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4769 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4770 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4771 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4772 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4773 }
4774 }
4775
4776 for( ; (j+4UL) <= N; j+=4UL )
4777 {
4778 const size_t ibegin( ( IsLower_v<MT1> )
4779 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4780 :( 0UL ) );
4781 const size_t iend( ( IsUpper_v<MT1> )
4782 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
4783 :( M ) );
4784 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4785
4786 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4787 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4788
4789 size_t i( ibegin );
4790
4791 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4792 const size_t i1( i+SIMDSIZE );
4793 const size_t i2( i+SIMDSIZE*2UL );
4794 const size_t i3( i+SIMDSIZE*3UL );
4795 const SIMDType x1( x.load(i ) );
4796 const SIMDType x2( x.load(i1) );
4797 const SIMDType x3( x.load(i2) );
4798 const SIMDType x4( x.load(i3) );
4799 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4800 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4801 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4802 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4803 }
4804
4805 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4806 const size_t i1( i+SIMDSIZE );
4807 const SIMDType x1( x.load(i ) );
4808 const SIMDType x2( x.load(i1) );
4809 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4810 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4811 y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4812 y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4813 }
4814
4815 for( ; i<ipos; i+=SIMDSIZE ) {
4816 const SIMDType x1( x.load(i) );
4817 y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4818 y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4819 y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4820 y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4821 }
4822
4823 for( ; remainder && i<iend; ++i ) {
4824 y[j ] += x[i] * A(i,j ) * scalar;
4825 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4826 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4827 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4828 }
4829 }
4830
4831 for( ; (j+2UL) <= N; j+=2UL )
4832 {
4833 const size_t ibegin( ( IsLower_v<MT1> )
4834 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4835 :( 0UL ) );
4836 const size_t iend( ( IsUpper_v<MT1> )
4837 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
4838 :( M ) );
4839 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4840
4841 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4842 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4843
4844 size_t i( ibegin );
4845
4846 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4847 const size_t i1( i+SIMDSIZE );
4848 const size_t i2( i+SIMDSIZE*2UL );
4849 const size_t i3( i+SIMDSIZE*3UL );
4850 const SIMDType x1( x.load(i ) );
4851 const SIMDType x2( x.load(i1) );
4852 const SIMDType x3( x.load(i2) );
4853 const SIMDType x4( x.load(i3) );
4854 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4855 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4856 }
4857
4858 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4859 const size_t i1( i+SIMDSIZE );
4860 const SIMDType x1( x.load(i ) );
4861 const SIMDType x2( x.load(i1) );
4862 y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4863 y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4864 }
4865
4866 for( ; i<ipos; i+=SIMDSIZE ) {
4867 const SIMDType x1( x.load(i) );
4868 y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4869 y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4870 }
4871
4872 for( ; remainder && i<iend; ++i ) {
4873 y[j ] += x[i] * A(i,j ) * scalar;
4874 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4875 }
4876 }
4877
4878 if( j < N )
4879 {
4880 const size_t ibegin( ( IsLower_v<MT1> )
4881 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
4882 :( 0UL ) );
4883 const size_t iend( ( IsUpper_v<MT1> )
4884 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
4885 :( M ) );
4886 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
4887
4888 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
4889 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
4890
4891 size_t i( ibegin );
4892
4893 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4894 const size_t i1( i+SIMDSIZE );
4895 const size_t i2( i+SIMDSIZE*2UL );
4896 const size_t i3( i+SIMDSIZE*3UL );
4897 const SIMDType x1( x.load(i ) );
4898 const SIMDType x2( x.load(i1) );
4899 const SIMDType x3( x.load(i2) );
4900 const SIMDType x4( x.load(i3) );
4901 y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4902 }
4903
4904 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4905 const size_t i1( i+SIMDSIZE );
4906 const SIMDType x1( x.load(i ) );
4907 const SIMDType x2( x.load(i1) );
4908 y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4909 }
4910
4911 for( ; i<ipos; i+=SIMDSIZE ) {
4912 const SIMDType x1( x.load(i) );
4913 y[j] += sum( x1 * A.load(i,j) ) * scalar;
4914 }
4915
4916 for( ; remainder && i<iend; ++i ) {
4917 y[j] += x[i] * A(i,j) * scalar;
4918 }
4919 }
4920 }
4921 //**********************************************************************************************
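   // Note on the vectorized large-matrix addition kernel above: since y already holds valid
   // values, the scalar cannot be applied in a single final pass as in the large assignment
   // kernel; instead, every horizontally reduced partial sum is scaled before being added to y.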
4922
4923 //**BLAS-based addition assignment to dense vectors (default)***********************************
4938 template< typename VT1 // Type of the left-hand side target vector
4939 , typename VT2 // Type of the left-hand side vector operand
4940 , typename MT1 // Type of the right-hand side matrix operand
4941 , typename ST2 > // Type of the scalar value
4942 static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4943 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4944 {
4945 selectLargeAddAssignKernel( y, x, A, scalar );
4946 }
4947 //**********************************************************************************************
4948
4949 //**BLAS-based addition assignment to dense vectors*********************************************
4950#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4964 template< typename VT1 // Type of the left-hand side target vector
4965 , typename VT2 // Type of the left-hand side vector operand
4966 , typename MT1 // Type of the right-hand side matrix operand
4967 , typename ST2 > // Type of the scalar value
4968 static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4969 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4970 {
4971 using ET = ElementType_t<VT1>;
4972
4973 if( IsTriangular_v<MT1> ) {
4974 ResultType_t<VT1> tmp( serial( scalar * x ) );
4975 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4976 addAssign( y, tmp );
4977 }
4978 else {
4979 gemv( y, x, A, ET(scalar), ET(1) );
4980 }
4981 }
4982#endif
4983 //**********************************************************************************************
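   // Note on the BLAS-based addition assignment above: the triangular case computes scalar*x
   // into a temporary, applies trmv() in place, and adds the temporary to y; the general case
   // uses a single gemv() call with alpha = scalar and beta = 1 to accumulate into y directly.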
4984
4985 //**Addition assignment to sparse vectors*******************************************************
4986 // No special implementation for the addition assignment to sparse vectors.
4987 //**********************************************************************************************
4988
4989 //**Subtraction assignment to dense vectors*****************************************************
5001 template< typename VT1 // Type of the target dense vector
5002 , bool TF > // Transpose flag of the target dense vector
5003 friend inline void subAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5004 {
5006
5007 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5008
5009 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5010 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5011
5012 if( right.rows() == 0UL || right.columns() == 0UL ||
5013 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
5014 return;
5015 }
5016
5017 LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
5018 RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5019
5020 BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5021 BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5022 BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5023 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
5024
5025 DVecScalarMultExpr::selectSubAssignKernel( *lhs, x, A, rhs.scalar_ );
5026 }
5027 //**********************************************************************************************
5028
5029 //**Subtraction assignment to dense vectors (kernel selection)**********************************
5040 template< typename VT1 // Type of the left-hand side target vector
5041 , typename VT2 // Type of the left-hand side vector operand
5042 , typename MT1 // Type of the right-hand side matrix operand
5043 , typename ST2 > // Type of the scalar value
5044 static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
5045 {
5046 if( ( IsDiagonal_v<MT1> ) ||
5047 ( IsComputation_v<MT> && !evaluateMatrix ) ||
5048 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
5049 selectSmallSubAssignKernel( y, x, A, scalar );
5050 else
5051 selectBlasSubAssignKernel( y, x, A, scalar );
5052 }
5053 //**********************************************************************************************
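   // Note on the subtraction assignment path: the kernel selection and the default, small, and
   // large kernels below mirror the addition assignment variants above, with the accumulation
   // operator changed from += to -=.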
5054
5055 //**Default subtraction assignment to dense vectors*********************************************
5069 template< typename VT1 // Type of the left-hand side target vector
5070 , typename VT2 // Type of the left-hand side vector operand
5071 , typename MT1 // Type of the right-hand side matrix operand
5072 , typename ST2 > // Type of the scalar value
5073 static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
5074 {
5075 y.subAssign( x * A * scalar );
5076 }
5077 //**********************************************************************************************
5078
5079 //**Default subtraction assignment to dense vectors (small matrices)****************************
5093 template< typename VT1 // Type of the left-hand side target vector
5094 , typename VT2 // Type of the left-hand side vector operand
5095 , typename MT1 // Type of the right-hand side matrix operand
5096 , typename ST2 > // Type of the scalar value
5097 static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
5098 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
5099 {
5100 selectDefaultSubAssignKernel( y, x, A, scalar );
5101 }
5102 //**********************************************************************************************
5103
5104 //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
5119 template< typename VT1 // Type of the left-hand side target vector
5120 , typename VT2 // Type of the left-hand side vector operand
5121 , typename MT1 // Type of the right-hand side matrix operand
5122 , typename ST2 > // Type of the scalar value
5123 static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
5124 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
5125 {
5126 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
5127
5128 const size_t M( A.rows() );
5129 const size_t N( A.columns() );
5130
5131 size_t j( 0UL );
5132
5133 for( ; (j+8UL) <= N; j+=8UL )
5134 {
5135 const size_t ibegin( ( IsLower_v<MT1> )
5136 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
5137 :( 0UL ) );
5138 const size_t iend( ( IsUpper_v<MT1> )
5139 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
5140 :( M ) );
5141 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
5142
5143 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
5144 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
5145
5146 size_t i( ibegin );
5147
5148 if( i < ipos )
5149 {
5150 SIMDType x1( x.load(i) );
5151 SIMDType xmm1( x1 * A.load(i,j ) );
5152 SIMDType xmm2( x1 * A.load(i,j+1UL) );
5153 SIMDType xmm3( x1 * A.load(i,j+2UL) );
5154 SIMDType xmm4( x1 * A.load(i,j+3UL) );
5155 SIMDType xmm5( x1 * A.load(i,j+4UL) );
5156 SIMDType xmm6( x1 * A.load(i,j+5UL) );
5157 SIMDType xmm7( x1 * A.load(i,j+6UL) );
5158 SIMDType xmm8( x1 * A.load(i,j+7UL) );
5159
5160 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
5161 x1 = x.load(i);
5162 xmm1 += x1 * A.load(i,j );
5163 xmm2 += x1 * A.load(i,j+1UL);
5164 xmm3 += x1 * A.load(i,j+2UL);
5165 xmm4 += x1 * A.load(i,j+3UL);
5166 xmm5 += x1 * A.load(i,j+4UL);
5167 xmm6 += x1 * A.load(i,j+5UL);
5168 xmm7 += x1 * A.load(i,j+6UL);
5169 xmm8 += x1 * A.load(i,j+7UL);
5170 }
5171
5172 y[j ] -= sum( xmm1 ) * scalar;
5173 y[j+1UL] -= sum( xmm2 ) * scalar;
5174 y[j+2UL] -= sum( xmm3 ) * scalar;
5175 y[j+3UL] -= sum( xmm4 ) * scalar;
5176 y[j+4UL] -= sum( xmm5 ) * scalar;
5177 y[j+5UL] -= sum( xmm6 ) * scalar;
5178 y[j+6UL] -= sum( xmm7 ) * scalar;
5179 y[j+7UL] -= sum( xmm8 ) * scalar;
5180
5181 for( ; remainder && i<iend; ++i ) {
5182 y[j ] -= x[i] * A(i,j ) * scalar;
5183 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5184 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
5185 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
5186 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
5187 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
5188 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
5189 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
5190 }
5191 }
5192 else
5193 {
5194 ElementType value1( x[i] * A(i,j ) );
5195 ElementType value2( x[i] * A(i,j+1UL) );
5196 ElementType value3( x[i] * A(i,j+2UL) );
5197 ElementType value4( x[i] * A(i,j+3UL) );
5198 ElementType value5( x[i] * A(i,j+4UL) );
5199 ElementType value6( x[i] * A(i,j+5UL) );
5200 ElementType value7( x[i] * A(i,j+6UL) );
5201 ElementType value8( x[i] * A(i,j+7UL) );
5202
5203 for( ++i; i<iend; ++i ) {
5204 value1 += x[i] * A(i,j );
5205 value2 += x[i] * A(i,j+1UL);
5206 value3 += x[i] * A(i,j+2UL);
5207 value4 += x[i] * A(i,j+3UL);
5208 value5 += x[i] * A(i,j+4UL);
5209 value6 += x[i] * A(i,j+5UL);
5210 value7 += x[i] * A(i,j+6UL);
5211 value8 += x[i] * A(i,j+7UL);
5212 }
5213
5214 y[j ] -= value1 * scalar;
5215 y[j+1UL] -= value2 * scalar;
5216 y[j+2UL] -= value3 * scalar;
5217 y[j+3UL] -= value4 * scalar;
5218 y[j+4UL] -= value5 * scalar;
5219 y[j+5UL] -= value6 * scalar;
5220 y[j+6UL] -= value7 * scalar;
5221 y[j+7UL] -= value8 * scalar;
5222 }
5223 }
5224
5225 for( ; (j+4UL) <= N; j+=4UL )
5226 {
5227 const size_t ibegin( ( IsLower_v<MT1> )
5228 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
5229 :( 0UL ) );
5230 const size_t iend( ( IsUpper_v<MT1> )
5231 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
5232 :( M ) );
5233 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
5234
5235 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
5236 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
5237
5238 size_t i( ibegin );
5239
5240 if( i < ipos )
5241 {
5242 SIMDType x1( x.load(i) );
5243 SIMDType xmm1( x1 * A.load(i,j ) );
5244 SIMDType xmm2( x1 * A.load(i,j+1UL) );
5245 SIMDType xmm3( x1 * A.load(i,j+2UL) );
5246 SIMDType xmm4( x1 * A.load(i,j+3UL) );
5247
5248 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
5249 x1 = x.load(i);
5250 xmm1 += x1 * A.load(i,j );
5251 xmm2 += x1 * A.load(i,j+1UL);
5252 xmm3 += x1 * A.load(i,j+2UL);
5253 xmm4 += x1 * A.load(i,j+3UL);
5254 }
5255
5256 y[j ] -= sum( xmm1 ) * scalar;
5257 y[j+1UL] -= sum( xmm2 ) * scalar;
5258 y[j+2UL] -= sum( xmm3 ) * scalar;
5259 y[j+3UL] -= sum( xmm4 ) * scalar;
5260
5261 for( ; remainder && i<iend; ++i ) {
5262 y[j ] -= x[i] * A(i,j ) * scalar;
5263 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5264 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
5265 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
5266 }
5267 }
5268 else
5269 {
5270 ElementType value1( x[i] * A(i,j ) );
5271 ElementType value2( x[i] * A(i,j+1UL) );
5272 ElementType value3( x[i] * A(i,j+2UL) );
5273 ElementType value4( x[i] * A(i,j+3UL) );
5274
5275 for( ++i; i<iend; ++i ) {
5276 value1 += x[i] * A(i,j );
5277 value2 += x[i] * A(i,j+1UL);
5278 value3 += x[i] * A(i,j+2UL);
5279 value4 += x[i] * A(i,j+3UL);
5280 }
5281
5282 y[j ] -= value1 * scalar;
5283 y[j+1UL] -= value2 * scalar;
5284 y[j+2UL] -= value3 * scalar;
5285 y[j+3UL] -= value4 * scalar;
5286 }
5287 }
5288
5289 for( ; (j+3UL) <= N; j+=3UL )
5290 {
5291 const size_t ibegin( ( IsLower_v<MT1> )
5292 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
5293 :( 0UL ) );
5294 const size_t iend( ( IsUpper_v<MT1> )
5295 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
5296 :( M ) );
5297 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
5298
5299 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
5300 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
5301
5302 size_t i( ibegin );
5303
5304 if( i < ipos )
5305 {
5306 SIMDType x1( x.load(i) );
5307 SIMDType xmm1( x1 * A.load(i,j ) );
5308 SIMDType xmm2( x1 * A.load(i,j+1UL) );
5309 SIMDType xmm3( x1 * A.load(i,j+2UL) );
5310
5311 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
5312 x1 = x.load(i);
5313 xmm1 += x1 * A.load(i,j );
5314 xmm2 += x1 * A.load(i,j+1UL);
5315 xmm3 += x1 * A.load(i,j+2UL);
5316 }
5317
5318 y[j ] -= sum( xmm1 ) * scalar;
5319 y[j+1UL] -= sum( xmm2 ) * scalar;
5320 y[j+2UL] -= sum( xmm3 ) * scalar;
5321
5322 for( ; remainder && i<iend; ++i ) {
5323 y[j ] -= x[i] * A(i,j ) * scalar;
5324 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5325 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
5326 }
5327 }
5328 else
5329 {
5330 ElementType value1( x[i] * A(i,j ) );
5331 ElementType value2( x[i] * A(i,j+1UL) );
5332 ElementType value3( x[i] * A(i,j+2UL) );
5333
5334 for( ++i; i<iend; ++i ) {
5335 value1 += x[i] * A(i,j );
5336 value2 += x[i] * A(i,j+1UL);
5337 value3 += x[i] * A(i,j+2UL);
5338 }
5339
5340 y[j ] -= value1 * scalar;
5341 y[j+1UL] -= value2 * scalar;
5342 y[j+2UL] -= value3 * scalar;
5343 }
5344 }
5345
5346 for( ; (j+2UL) <= N; j+=2UL )
5347 {
5348 const size_t ibegin( ( IsLower_v<MT1> )
5349 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
5350 :( 0UL ) );
5351 const size_t iend( ( IsUpper_v<MT1> )
5352 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
5353 :( M ) );
5354 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
5355
5356 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
5357 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
5358
5359 size_t i( ibegin );
5360
5361 if( i < ipos )
5362 {
5363 SIMDType x1( x.load(i) );
5364 SIMDType xmm1( x1 * A.load(i,j ) );
5365 SIMDType xmm2( x1 * A.load(i,j+1UL) );
5366
5367 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
5368 x1 = x.load(i);
5369 xmm1 += x1 * A.load(i,j );
5370 xmm2 += x1 * A.load(i,j+1UL);
5371 }
5372
5373 y[j ] -= sum( xmm1 ) * scalar;
5374 y[j+1UL] -= sum( xmm2 ) * scalar;
5375
5376 for( ; remainder && i<iend; ++i ) {
5377 y[j ] -= x[i] * A(i,j ) * scalar;
5378 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5379 }
5380 }
5381 else
5382 {
5383 ElementType value1( x[i] * A(i,j ) );
5384 ElementType value2( x[i] * A(i,j+1UL) );
5385
5386 for( ++i; i<iend; ++i ) {
5387 value1 += x[i] * A(i,j );
5388 value2 += x[i] * A(i,j+1UL);
5389 }
5390
5391 y[j ] -= value1 * scalar;
5392 y[j+1UL] -= value2 * scalar;
5393 }
5394 }
5395
5396 if( j < N )
5397 {
5398 const size_t ibegin( ( IsLower_v<MT1> )
5399 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
5400 :( 0UL ) );
5401 const size_t iend( ( IsUpper_v<MT1> )
5402 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
5403 :( M ) );
5404 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
5405
5406 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
5407 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
5408
5409 size_t i( ibegin );
5410
5411 if( i < ipos )
5412 {
5413 SIMDType xmm1( x.load(i) * A.load(i,j ) );
5414
5415 for( i+=SIMDSIZE; i<ipos; i+=SIMDSIZE ) {
 5416 xmm1 += x.load(i) * A.load(i,j);
5417 }
5418
5419 y[j] -= sum( xmm1 ) * scalar;
5420
5421 for( ; remainder && i<iend; ++i ) {
5422 y[j] -= x[i] * A(i,j) * scalar;
5423 }
5424 }
5425 else
5426 {
5427 ElementType value( x[i] * A(i,j) );
5428
5429 for( ++i; i<iend; ++i ) {
5430 value += x[i] * A(i,j);
5431 }
5432
5433 y[j] -= value * scalar;
5434 }
5435 }
5436 }
5437 //**********************************************************************************************
5438
5439 //**Default subtraction assignment to dense vectors (large matrices)****************************
5453 template< typename VT1 // Type of the left-hand side target vector
5454 , typename VT2 // Type of the left-hand side vector operand
5455 , typename MT1 // Type of the right-hand side matrix operand
5456 , typename ST2 > // Type of the scalar value
5457 static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
5458 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
5459 {
5460 selectDefaultSubAssignKernel( y, x, A, scalar );
5461 }
5462 //**********************************************************************************************
5463
5464 //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
5479 template< typename VT1 // Type of the left-hand side target vector
5480 , typename VT2 // Type of the left-hand side vector operand
5481 , typename MT1 // Type of the right-hand side matrix operand
5482 , typename ST2 > // Type of the scalar value
5483 static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
5484 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
5485 {
5486 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
5487
5488 const size_t M( A.rows() );
5489 const size_t N( A.columns() );
5490
5491 size_t j( 0UL );
5492
5493 for( ; (j+8UL) <= N; j+=8UL )
5494 {
5495 const size_t ibegin( ( IsLower_v<MT1> )
5496 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
5497 :( 0UL ) );
5498 const size_t iend( ( IsUpper_v<MT1> )
5499 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
5500 :( M ) );
5501 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
5502
5503 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
5504 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
5505
5506 size_t i( ibegin );
5507
5508 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
5509 const size_t i1( i+SIMDSIZE );
5510 const size_t i2( i+SIMDSIZE*2UL );
5511 const size_t i3( i+SIMDSIZE*3UL );
5512 const SIMDType x1( x.load(i ) );
5513 const SIMDType x2( x.load(i1) );
5514 const SIMDType x3( x.load(i2) );
5515 const SIMDType x4( x.load(i3) );
5516 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
5517 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
5518 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
5519 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
5520 y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
5521 y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
5522 y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
5523 y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
5524 }
5525
5526 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
5527 const size_t i1( i+SIMDSIZE );
5528 const SIMDType x1( x.load(i ) );
5529 const SIMDType x2( x.load(i1) );
5530 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
5531 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
5532 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
5533 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
5534 y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
5535 y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
5536 y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
5537 y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
5538 }
5539
5540 for( ; i<ipos; i+=SIMDSIZE ) {
5541 const SIMDType x1( x.load(i) );
5542 y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
5543 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
5544 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
5545 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
5546 y[j+4UL] -= sum( x1 * A.load(i,j+4UL) ) * scalar;
5547 y[j+5UL] -= sum( x1 * A.load(i,j+5UL) ) * scalar;
5548 y[j+6UL] -= sum( x1 * A.load(i,j+6UL) ) * scalar;
5549 y[j+7UL] -= sum( x1 * A.load(i,j+7UL) ) * scalar;
5550 }
5551
5552 for( ; remainder && i<iend; ++i ) {
5553 y[j ] -= x[i] * A(i,j ) * scalar;
5554 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5555 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
5556 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
5557 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
5558 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
5559 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
5560 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
5561 }
5562 }
5563
5564 for( ; (j+4UL) <= N; j+=4UL )
5565 {
5566 const size_t ibegin( ( IsLower_v<MT1> )
5567 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
5568 :( 0UL ) );
5569 const size_t iend( ( IsUpper_v<MT1> )
5570 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
5571 :( M ) );
5572 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
5573
5574 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
5575 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
5576
5577 size_t i( ibegin );
5578
5579 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
5580 const size_t i1( i+SIMDSIZE );
5581 const size_t i2( i+SIMDSIZE*2UL );
5582 const size_t i3( i+SIMDSIZE*3UL );
5583 const SIMDType x1( x.load(i ) );
5584 const SIMDType x2( x.load(i1) );
5585 const SIMDType x3( x.load(i2) );
5586 const SIMDType x4( x.load(i3) );
5587 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
5588 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
5589 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
5590 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
5591 }
5592
5593 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
5594 const size_t i1( i+SIMDSIZE );
5595 const SIMDType x1( x.load(i ) );
5596 const SIMDType x2( x.load(i1) );
5597 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
5598 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
5599 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
5600 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
5601 }
5602
5603 for( ; i<ipos; i+=SIMDSIZE ) {
5604 const SIMDType x1( x.load(i) );
5605 y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
5606 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
5607 y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
5608 y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
5609 }
5610
5611 for( ; remainder && i<iend; ++i ) {
5612 y[j ] -= x[i] * A(i,j ) * scalar;
5613 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5614 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
5615 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
5616 }
5617 }
5618
5619 for( ; (j+2UL) <= N; j+=2UL )
5620 {
5621 const size_t ibegin( ( IsLower_v<MT1> )
5622 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
5623 :( 0UL ) );
5624 const size_t iend( ( IsUpper_v<MT1> )
5625 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
5626 :( M ) );
5627 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
5628
5629 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
5630 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
5631
5632 size_t i( ibegin );
5633
5634 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
5635 const size_t i1( i+SIMDSIZE );
5636 const size_t i2( i+SIMDSIZE*2UL );
5637 const size_t i3( i+SIMDSIZE*3UL );
5638 const SIMDType x1( x.load(i ) );
5639 const SIMDType x2( x.load(i1) );
5640 const SIMDType x3( x.load(i2) );
5641 const SIMDType x4( x.load(i3) );
5642 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
5643 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
5644 }
5645
5646 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
5647 const size_t i1( i+SIMDSIZE );
5648 const SIMDType x1( x.load(i ) );
5649 const SIMDType x2( x.load(i1) );
5650 y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
5651 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
5652 }
5653
5654 for( ; i<ipos; i+=SIMDSIZE ) {
5655 const SIMDType x1( x.load(i) );
5656 y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
5657 y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
5658 }
5659
5660 for( ; remainder && i<iend; ++i ) {
5661 y[j ] -= x[i] * A(i,j ) * scalar;
5662 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5663 }
5664 }
5665
5666 if( j < N )
5667 {
5668 const size_t ibegin( ( IsLower_v<MT1> )
5669 ?( prevMultiple( ( IsStrictlyLower_v<MT1> ? j+1UL : j ), SIMDSIZE ) )
5670 :( 0UL ) );
5671 const size_t iend( ( IsUpper_v<MT1> )
5672 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
5673 :( M ) );
5674 BLAZE_INTERNAL_ASSERT( ibegin < iend, "Invalid loop indices detected" );
5675
5676 const size_t ipos( remainder ? prevMultiple( iend, SIMDSIZE ) : iend );
5677 BLAZE_INTERNAL_ASSERT( ipos <= iend, "Invalid end calculation" );
5678
5679 size_t i( ibegin );
5680
5681 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
5682 const size_t i1( i+SIMDSIZE );
5683 const size_t i2( i+SIMDSIZE*2UL );
5684 const size_t i3( i+SIMDSIZE*3UL );
5685 const SIMDType x1( x.load(i ) );
5686 const SIMDType x2( x.load(i1) );
5687 const SIMDType x3( x.load(i2) );
5688 const SIMDType x4( x.load(i3) );
5689 y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
5690 }
5691
5692 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
5693 const size_t i1( i+SIMDSIZE );
5694 const SIMDType x1( x.load(i ) );
5695 const SIMDType x2( x.load(i1) );
5696 y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
5697 }
5698
5699 for( ; i<ipos; i+=SIMDSIZE ) {
5700 const SIMDType x1( x.load(i) );
5701 y[j] -= sum( x1 * A.load(i,j) ) * scalar;
5702 }
5703
5704 for( ; remainder && i<iend; ++i ) {
5705 y[j] -= x[i] * A(i,j) * scalar;
5706 }
5707 }
5708 }
5709 //**********************************************************************************************
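// -- Editorial sketch (not part of the original header): the vectorized kernel above is an
// unrolled SIMD form of the following scalar reference loop. Column by column it subtracts
// the scaled transpose-vector/matrix product, restricting the row range to the non-zero band
// of lower/upper triangular matrices (the kernel additionally aligns ibegin via prevMultiple
// and blocks eight, four, or two columns at a time). ElementType_t is the Blaze alias used
// throughout this file; the loop bounds mirror ibegin/iend of the kernel.
//
//    for( size_t j=0UL; j<N; ++j ) {
//       const size_t ibegin( IsLower_v<MT1> ? ( IsStrictlyLower_v<MT1> ? j+1UL : j     ) : 0UL );
//       const size_t iend  ( IsUpper_v<MT1> ? ( IsStrictlyUpper_v<MT1> ? j     : j+1UL ) : M   );
//       ElementType_t<VT1> acc{};
//       for( size_t i=ibegin; i<iend; ++i )
//          acc += x[i] * A(i,j);
//       y[j] -= acc * scalar;
//    }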
5710
5711 //**BLAS-based subtraction assignment to dense vectors (default)********************************
5726 template< typename VT1 // Type of the left-hand side target vector
5727 , typename VT2 // Type of the left-hand side vector operand
5728 , typename MT1 // Type of the right-hand side matrix operand
5729 , typename ST2 > // Type of the scalar value
5730 static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
5731 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
5732 {
5733 selectLargeSubAssignKernel( y, x, A, scalar );
5734 }
5735 //**********************************************************************************************
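// -- Editorial sketch (not part of the original header): this DisableIf_t overload and the
// EnableIf_t overload below form a SFINAE pair -- for a given set of template arguments
// exactly one of them participates in overload resolution, so the BLAS kernel is selected
// at compile time whenever UseBlasKernel_v holds and the large default kernel otherwise.
// A minimal stand-alone illustration of the same idiom (hypothetical names, plain
// <type_traits>, not Blaze API):
//
//    #include <type_traits>
//
//    template< typename T >
//    auto runKernel( T ) -> std::enable_if_t< std::is_floating_point_v<T> >
//    { /* fast path */ }
//
//    template< typename T >
//    auto runKernel( T ) -> std::enable_if_t< !std::is_floating_point_v<T> >
//    { /* generic fallback */ }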
5736
5737 //**BLAS-based subtraction assignment to dense vectors******************************************
5738#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
5752 template< typename VT1 // Type of the left-hand side target vector
5753 , typename VT2 // Type of the left-hand side vector operand
5754 , typename MT1 // Type of the right-hand side matrix operand
5755 , typename ST2 > // Type of the scalar value
5756 static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
5757 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
5758 {
5759 using ET = ElementType_t<VT1>;
5760
5761 if( IsTriangular_v<MT1> ) {
5762 ResultType_t<VT1> tmp( serial( scalar * x ) );
5763 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
5764 subAssign( y, tmp );
5765 }
5766 else {
5767 gemv( y, x, A, ET(-scalar), ET(1) );
5768 }
5769 }
5770#endif
5771 //**********************************************************************************************
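// -- Editorial sketch (not part of the original header): for double-precision operands the
// gemv() call above is expected to boil down to a single xGEMV invocation of the form
// sketched below, i.e. y := (-scalar) * A^T * x + y, which is y -= scalar * ( x * A ).
// The mapping to raw CBLAS, the use of A.spacing() as leading dimension, and the unit
// strides are assumptions made only to spell out the semantics:
//
//    #include <cblas.h>
//
//    cblas_dgemv( CblasColMajor, CblasTrans,
//                 A.rows(), A.columns(),      // M, N
//                 -scalar,                    // alpha
//                 A.data(), A.spacing(),      // A, lda
//                 x.data(), 1,                // x, incx
//                 1.0,                        // beta
//                 y.data(), 1 );              // y, incy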
5772
5773 //**Subtraction assignment to sparse vectors****************************************************
5774 // No special implementation for the subtraction assignment to sparse vectors.
5775 //**********************************************************************************************
5776
5777 //**Multiplication assignment to dense vectors**************************************************
5789 template< typename VT1 // Type of the target dense vector
5790 , bool TF > // Transpose flag of the target dense vector
5791 friend inline void multAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5792 {
5794
5798
5799 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5800
5801 const ResultType tmp( serial( rhs ) );
5802 multAssign( *lhs, tmp );
5803 }
5804 //**********************************************************************************************
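// -- Editorial note (not part of the original header): the multiplication assignment first
// evaluates the complete scaled product into a temporary via serial() and only then performs
// the elementwise multiplication. Each element of ( x * A ) * s is a full reduction over x
// and one matrix column, so evaluating the expression element by element during an in-place
// multiply would repeat that reduction for every access. Conceptually:
//
//    // y *= ( x * A ) * s;   is carried out as
//    const ResultType tmp( serial( ( x * A ) * s ) );   // one pass over A
//    y *= tmp;                                          // cheap elementwise multiply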
5805
5806 //**Multiplication assignment to sparse vectors*************************************************
5807 // No special implementation for the multiplication assignment to sparse vectors.
5808 //**********************************************************************************************
5809
5810 //**Division assignment to dense vectors********************************************************
5822 template< typename VT1 // Type of the target dense vector
5823 , bool TF > // Transpose flag of the target dense vector
5824 friend inline void divAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5825 {
5827
5831
5832 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5833
5834 const ResultType tmp( serial( rhs ) );
5835 divAssign( *lhs, tmp );
5836 }
5837 //**********************************************************************************************
5838
5839 //**Division assignment to sparse vectors*******************************************************
5840 // No special implementation for the division assignment to sparse vectors.
5841 //**********************************************************************************************
5842
5843 //**SMP assignment to dense vectors*************************************************************
5857 template< typename VT1 // Type of the target dense vector
5858 , bool TF > // Transpose flag of the target dense vector
5859 friend inline auto smpAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5860 -> EnableIf_t< UseSMPAssign_v<VT1> >
5861 {
5863
5864 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5865
5866 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5867 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5868
5869 if( right.rows() == 0UL ||
5870 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
5871 reset( *lhs );
5872 return;
5873 }
5874 else if( right.columns() == 0UL ) {
5875 return;
5876 }
5877
5878 LT x( left ); // Evaluation of the left-hand side dense vector operand
5879 RT A( right ); // Evaluation of the right-hand side dense matrix operand
5880
5881 BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5882 BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5883 BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5884 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
5885
5886 smpAssign( *lhs, x * A * rhs.scalar_ );
5887 }
5888 //**********************************************************************************************
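// -- Editorial note (not part of the original header): the early exits above cover the
// degenerate shapes. A strictly lower or strictly upper triangular matrix with a single row
// is 1x1 with its only element on the diagonal, hence zero, so x * A * s is the zero vector
// for every x and s and the target is reset rather than computed; if the matrix has no
// columns, the result is empty and there is nothing to do. Illustration (assuming the
// StrictlyLowerMatrix adaptor from <blaze/Math.h>):
//
//    blaze::StrictlyLowerMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > A( 1UL );
//    // A == ( 0 ), therefore ( x * A ) * s == ( 0 ) for any compatible x and scalar s.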
5889
5890 //**SMP assignment to sparse vectors************************************************************
5904 template< typename VT1 // Type of the target sparse vector
5905 , bool TF > // Transpose flag of the target sparse vector
5906 friend inline auto smpAssign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5907 -> EnableIf_t< UseSMPAssign_v<VT1> >
5908 {
5910
5914
5915 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5916
5917 const ResultType tmp( rhs );
5918 smpAssign( *lhs, tmp );
5919 }
5920 //**********************************************************************************************
5921
5922 //**SMP addition assignment to dense vectors****************************************************
5936 template< typename VT1 // Type of the target dense vector
5937 , bool TF > // Transpose flag of the target dense vector
5938 friend inline auto smpAddAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5939 -> EnableIf_t< UseSMPAssign_v<VT1> >
5940 {
5942
5943 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5944
5945 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5946 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5947
5948 if( right.rows() == 0UL || right.columns() == 0UL ||
5949 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
5950 return;
5951 }
5952
5953 LT x( left ); // Evaluation of the left-hand side dense vector operand
5954 RT A( right ); // Evaluation of the right-hand side dense matrix operand
5955
5956 BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5957 BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5958 BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5959 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
5960
5961 smpAddAssign( *lhs, x * A * rhs.scalar_ );
5962 }
5963 //**********************************************************************************************
5964
5965 //**SMP addition assignment to sparse vectors***************************************************
5966 // No special implementation for the SMP addition assignment to sparse vectors.
5967 //**********************************************************************************************
5968
5969 //**SMP subtraction assignment to dense vectors*************************************************
5983 template< typename VT1 // Type of the target dense vector
5984 , bool TF > // Transpose flag of the target dense vector
5985 friend inline auto smpSubAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5986 -> EnableIf_t< UseSMPAssign_v<VT1> >
5987 {
5989
5990 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5991
5992 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5993 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5994
5995 if( right.rows() == 0UL || right.columns() == 0UL ||
5996 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
5997 return;
5998 }
5999
6000 LT x( left ); // Evaluation of the left-hand side dense vector operand
6001 RT A( right ); // Evaluation of the right-hand side dense matrix operand
6002
6003 BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
6004 BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
6005 BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
6006 BLAZE_INTERNAL_ASSERT( A.columns() == (*lhs).size() , "Invalid vector size" );
6007
6008 smpSubAssign( *lhs, x * A * rhs.scalar_ );
6009 }
6010 //**********************************************************************************************
6011
6012 //**SMP subtraction assignment to sparse vectors************************************************
6013 // No special implementation for the SMP subtraction assignment to sparse vectors.
6014 //**********************************************************************************************
6015
6016 //**SMP multiplication assignment to dense vectors**********************************************
6030 template< typename VT1 // Type of the target dense vector
6031 , bool TF > // Transpose flag of the target dense vector
6032 friend inline auto smpMultAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
6033 -> EnableIf_t< UseSMPAssign_v<VT1> >
6034 {
6036
6040
6041 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
6042
6043 const ResultType tmp( rhs );
6044 smpMultAssign( *lhs, tmp );
6045 }
6046 //**********************************************************************************************
6047
6048 //**SMP multiplication assignment to sparse vectors*********************************************
6049 // No special implementation for the SMP multiplication assignment to sparse vectors.
6050 //**********************************************************************************************
6051
6052 //**SMP division assignment to dense vectors****************************************************
6066 template< typename VT1 // Type of the target dense vector
6067 , bool TF > // Transpose flag of the target dense vector
6068 friend inline auto smpDivAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
6069 -> EnableIf_t< UseSMPAssign_v<VT1> >
6070 {
6072
6076
6077 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
6078
6079 const ResultType tmp( rhs );
6080 smpDivAssign( *lhs, tmp );
6081 }
6082 //**********************************************************************************************
6083
6084 //**SMP division assignment to sparse vectors***************************************************
6085 // No special implementation for the SMP division assignment to sparse vectors.
6086 //**********************************************************************************************
6087
6088 //**Compile time checks*************************************************************************
6097 //**********************************************************************************************
6098};
6100//*************************************************************************************************
6101
6102
6103
6104
6105//=================================================================================================
6106//
6107// GLOBAL BINARY ARITHMETIC OPERATORS
6108//
6109//=================================================================================================
6110
6111//*************************************************************************************************
6142template< typename VT // Type of the left-hand side dense vector
6143 , typename MT > // Type of the right-hand side dense matrix
6144inline decltype(auto)
6145 operator*( const DenseVector<VT,true>& vec, const DenseMatrix<MT,true>& mat )
6146{
6148
6150
6151 if( (*vec).size() != (*mat).rows() ) {
6152 BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
6153 }
6154
6155 using ReturnType = const TDVecTDMatMultExpr<VT,MT>;
6156 return ReturnType( *vec, *mat );
6157}
6158//*************************************************************************************************
6159
6160
6161
6162
6163//=================================================================================================
6164//
6165// ISALIGNED SPECIALIZATIONS
6166//
6167//=================================================================================================
6168
6169//*************************************************************************************************
6171template< typename VT, typename MT >
6172struct IsAligned< TDVecTDMatMultExpr<VT,MT> >
6173 : public BoolConstant< IsAligned_v<VT> && IsAligned_v<MT> >
6174{};
6176//*************************************************************************************************
6177
6178} // namespace blaze
6179
6180#endif