Blaze 3.9
DMatDVecMultExpr.h
Go to the documentation of this file.
1//=================================================================================================
33//=================================================================================================
34
35#ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
37
38
39//*************************************************************************************************
40// Includes
41//*************************************************************************************************
42
45#include <blaze/math/Aliases.h>
64#include <blaze/math/SIMD.h>
86#include <blaze/system/BLAS.h>
90#include <blaze/util/Assert.h>
91#include <blaze/util/Complex.h>
93#include <blaze/util/EnableIf.h>
96#include <blaze/util/mpl/If.h>
97#include <blaze/util/Types.h>
105
106
107namespace blaze {
108
109//=================================================================================================
110//
111// CLASS DMATDVECMULTEXPR
112//
113//=================================================================================================
114
115//*************************************************************************************************
122template< typename MT // Type of the left-hand side dense matrix
123 , typename VT > // Type of the right-hand side dense vector
125 : public MatVecMultExpr< DenseVector< DMatDVecMultExpr<MT,VT>, false > >
126 , private Computation
127{
128 private:
129 //**Type definitions****************************************************************************
136 //**********************************************************************************************
137
138 //**********************************************************************************************
140 static constexpr bool evaluateMatrix =
141 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
142 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
143 //**********************************************************************************************
144
145 //**********************************************************************************************
147 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
148 //**********************************************************************************************
149
150 //**********************************************************************************************
152
156 template< typename T1 >
157 static constexpr bool UseSMPAssign_v = ( evaluateMatrix || evaluateVector );
159 //**********************************************************************************************
160
161 //**********************************************************************************************
163
166 template< typename T1, typename T2, typename T3 >
167 static constexpr bool UseBlasKernel_v =
169 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
170 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
171 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
172 !IsDiagonal_v<T2> &&
173 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
174 IsBLASCompatible_v< ElementType_t<T1> > &&
175 IsBLASCompatible_v< ElementType_t<T2> > &&
176 IsBLASCompatible_v< ElementType_t<T3> > &&
177 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
178 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
180 //**********************************************************************************************
181
182 //**********************************************************************************************
184
188 template< typename T1, typename T2, typename T3 >
189 static constexpr bool UseVectorizedDefaultKernel_v =
190 ( useOptimizedKernels &&
191 !IsDiagonal_v<T2> &&
192 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
193 IsSIMDCombinable_v< ElementType_t<T1>
195 , ElementType_t<T3> > &&
196 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
197 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
199 //**********************************************************************************************
200
201 public:
202 //**Type definitions****************************************************************************
205
208
213 using ReturnType = const ElementType;
214 using CompositeType = const ResultType;
215
217 using LeftOperand = If_t< IsExpression_v<MT>, const MT, const MT& >;
218
220 using RightOperand = If_t< IsExpression_v<VT>, const VT, const VT& >;
221
224
227 //**********************************************************************************************
228
229 //**Compilation flags***************************************************************************
231 static constexpr bool simdEnabled =
232 ( !IsDiagonal_v<MT> &&
233 MT::simdEnabled && VT::simdEnabled &&
234 HasSIMDAdd_v<MET,VET> &&
235 HasSIMDMult_v<MET,VET> );
236
238 static constexpr bool smpAssignable =
239 ( !evaluateMatrix && MT::smpAssignable && !evaluateVector && VT::smpAssignable );
240 //**********************************************************************************************
241
242 //**SIMD properties*****************************************************************************
244 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
245 //**********************************************************************************************
246
247 //**Constructor*********************************************************************************
   /*!\brief Constructor for the DMatDVecMultExpr class.
   //
   // \param mat The left-hand side matrix operand of the multiplication expression.
   // \param vec The right-hand side vector operand of the multiplication expression.
   //
   // \note The operands are only referenced (see \a LeftOperand / \a RightOperand); the
   // expression does not take ownership. The internal assert enforces the algebraic size
   // requirement columns(A) == size(x).
   */
   inline DMatDVecMultExpr( const MT& mat, const VT& vec ) noexcept
      : mat_( mat )  // Left-hand side dense matrix of the multiplication expression
      , vec_( vec )  // Right-hand side dense vector of the multiplication expression
   {
      BLAZE_INTERNAL_ASSERT( mat_.columns() == vec_.size(), "Invalid matrix and vector sizes" );
   }
259 //**********************************************************************************************
260
261 //**Subscript operator**************************************************************************
267 inline ReturnType operator[]( size_t index ) const {
268 BLAZE_INTERNAL_ASSERT( index < mat_.rows(), "Invalid vector access index" );
269
270 if( IsDiagonal_v<MT> )
271 {
272 return mat_(index,index) * vec_[index];
273 }
274 else if( IsLower_v<MT> && ( index + 8UL < mat_.rows() ) )
275 {
276 const size_t n( IsStrictlyLower_v<MT> ? index : index+1UL );
277 return subvector( row( mat_, index, unchecked ), 0UL, n, unchecked ) *
278 subvector( vec_, 0UL, n, unchecked );
279 }
280 else if( IsUpper_v<MT> && ( index > 8UL ) )
281 {
282 const size_t begin( IsStrictlyUpper_v<MT> ? index+1UL : index );
283 const size_t n ( mat_.columns() - begin );
284 return subvector( row( mat_, index, unchecked ), begin, n, unchecked ) *
286 }
287 else
288 {
289 return row( mat_, index, unchecked ) * vec_;
290 }
291 }
292 //**********************************************************************************************
293
294 //**At function*********************************************************************************
301 inline ReturnType at( size_t index ) const {
302 if( index >= mat_.rows() ) {
303 BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
304 }
305 return (*this)[index];
306 }
307 //**********************************************************************************************
308
309 //**Size function*******************************************************************************
314 inline size_t size() const noexcept {
315 return mat_.rows();
316 }
317 //**********************************************************************************************
318
319 //**Left operand access*************************************************************************
324 inline LeftOperand leftOperand() const noexcept {
325 return mat_;
326 }
327 //**********************************************************************************************
328
329 //**Right operand access************************************************************************
334 inline RightOperand rightOperand() const noexcept {
335 return vec_;
336 }
337 //**********************************************************************************************
338
339 //**********************************************************************************************
345 template< typename T >
346 inline bool canAlias( const T* alias ) const noexcept {
347 return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
348 }
349 //**********************************************************************************************
350
351 //**********************************************************************************************
357 template< typename T >
358 inline bool isAliased( const T* alias ) const noexcept {
359 return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
360 }
361 //**********************************************************************************************
362
363 //**********************************************************************************************
368 inline bool isAligned() const noexcept {
369 return mat_.isAligned() && vec_.isAligned();
370 }
371 //**********************************************************************************************
372
373 //**********************************************************************************************
   /*!\brief Returns whether the expression can be used in SMP assignments.
   //
   // \return \a true in case the expression can be used in SMP assignments, \a false if not.
   //
   // SMP assignment pays off only when the default (non-BLAS) kernel will run and the
   // result vector is large enough to amortize the parallelization overhead.
   //
   // NOTE(review): the condition list looks abridged here compared to the remainder of the
   // file (the BLAS-related header suggests an additional
   // !BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION disjunct may have been elided) — verify
   // against the upstream header before relying on this exact predicate.
   */
   inline bool canSMPAssign() const noexcept {
      return ( !BLAZE_BLAS_MODE ||
               ( IsComputation_v<MT> && !evaluateMatrix ) ||
               ( mat_.rows() * mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
             ( size() > SMP_DMATDVECMULT_THRESHOLD );
   }
386 //**********************************************************************************************
387
388 private:
389 //**Member variables****************************************************************************
392 //**********************************************************************************************
393
394 //**Assignment to dense vectors*****************************************************************
   /*!\brief Assignment of a dense matrix-dense vector multiplication to a dense vector
   //        (\f$ \vec{y}=A*\vec{x} \f$).
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side multiplication expression to be assigned.
   // \return void
   //
   // Evaluates both operands (serially) into \a LT / \a RT composites and dispatches to the
   // kernel selection. Degenerate shapes are short-circuited: an empty result needs no work,
   // and a zero-column matrix (or a 1x1 strictly triangular one, which is structurally zero)
   // yields an all-zero result via reset().
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline void assign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
   {
      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      if( rhs.mat_.rows() == 0UL ) {
         // Result has zero elements: nothing to assign.
         return;
      }
      else if( rhs.mat_.columns() == 0UL ||
               ( IsStrictlyTriangular_v<MT> && rhs.mat_.columns() == 1UL ) ) {
         // Inner dimension is (structurally) empty: the product is the zero vector.
         reset( *lhs );
         return;
      }

      LT A( serial( rhs.mat_ ) );  // Evaluation of the left-hand side dense matrix operand
      RT x( serial( rhs.vec_ ) );  // Evaluation of the right-hand side dense vector operand

      BLAZE_INTERNAL_ASSERT( A.rows()    == rhs.mat_.rows()   , "Invalid number of rows"    );
      BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( x.size()    == rhs.vec_.size()   , "Invalid vector size"       );
      BLAZE_INTERNAL_ASSERT( A.rows()    == (*lhs).size()     , "Invalid vector size"       );

      DMatDVecMultExpr::selectAssignKernel( *lhs, A, x );
   }
434 //**********************************************************************************************
435
436 //**Assignment to dense vectors (kernel selection)**********************************************
447 template< typename VT1 // Type of the left-hand side target vector
448 , typename MT1 // Type of the left-hand side matrix operand
449 , typename VT2 > // Type of the right-hand side vector operand
450 static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
451 {
452 if( ( IsDiagonal_v<MT1> ) ||
453 ( IsComputation_v<MT> && !evaluateMatrix ) ||
454 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
455 selectSmallAssignKernel( y, A, x );
456 else
457 selectBlasAssignKernel( y, A, x );
458 }
460 //**********************************************************************************************
461
462 //**Default assignment to dense vectors*********************************************************
   /*!\brief Default assignment of a dense matrix-dense vector multiplication
   //        (\f$ \vec{y}=A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // Falls back to the generic element-wise multiplication via the target's assign() member.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      y.assign( A * x );
   }
484 //**********************************************************************************************
485
486 //**Default assignment to dense vectors (small matrices)****************************************
   /*!\brief Default assignment of a small dense matrix-dense vector multiplication
   //        (\f$ \vec{y}=A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // Scalar fallback, selected (via SFINAE) when the vectorized small kernel is not
   // applicable for this operand combination.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
   {
      selectDefaultAssignKernel( y, A, x );
   }
509 //**********************************************************************************************
510
511 //**Vectorized default assignment to dense vectors (small matrices)*****************************
   /*!\brief Vectorized default assignment of a small dense matrix-dense vector multiplication
   //        (\f$ \vec{y}=A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // SIMD kernel for small problem sizes: rows are processed in groups of 8/4/3/2/1 so that
   // several independent accumulators keep the FMA pipelines busy; each group performs a
   // horizontal sum() per row at the end. For (strictly) triangular matrices [jbegin,jend)
   // is narrowed to the structurally non-zero column range, with jbegin rounded down to a
   // SIMD boundary so that aligned loads remain valid.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
   {
      // A scalar tail loop is only required when either operand is unpadded; padded
      // operands can safely be read past the logical end up to the next SIMD boundary.
      constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t i( 0UL );

      // Stage 1: blocks of 8 rows.
      for( ; (i+8UL) <= M; i+=8UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         // jpos: end of the SIMD-processable part of [jbegin,jend).
         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         if( j < jpos )
         {
            // SIMD path: one accumulator per row of the block.
            SIMDType x1( x.load(j) );
            SIMDType xmm1( A.load(i    ,j) * x1 );
            SIMDType xmm2( A.load(i+1UL,j) * x1 );
            SIMDType xmm3( A.load(i+2UL,j) * x1 );
            SIMDType xmm4( A.load(i+3UL,j) * x1 );
            SIMDType xmm5( A.load(i+4UL,j) * x1 );
            SIMDType xmm6( A.load(i+5UL,j) * x1 );
            SIMDType xmm7( A.load(i+6UL,j) * x1 );
            SIMDType xmm8( A.load(i+7UL,j) * x1 );

            for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
               x1 = x.load(j);
               xmm1 += A.load(i    ,j) * x1;
               xmm2 += A.load(i+1UL,j) * x1;
               xmm3 += A.load(i+2UL,j) * x1;
               xmm4 += A.load(i+3UL,j) * x1;
               xmm5 += A.load(i+4UL,j) * x1;
               xmm6 += A.load(i+5UL,j) * x1;
               xmm7 += A.load(i+6UL,j) * x1;
               xmm8 += A.load(i+7UL,j) * x1;
            }

            // Horizontal reduction of each accumulator into the result element.
            y[i    ] = sum( xmm1 );
            y[i+1UL] = sum( xmm2 );
            y[i+2UL] = sum( xmm3 );
            y[i+3UL] = sum( xmm4 );
            y[i+4UL] = sum( xmm5 );
            y[i+5UL] = sum( xmm6 );
            y[i+6UL] = sum( xmm7 );
            y[i+7UL] = sum( xmm8 );

            // Scalar tail for unpadded operands.
            for( ; remainder && j<jend; ++j ) {
               y[i    ] += A(i    ,j) * x[j];
               y[i+1UL] += A(i+1UL,j) * x[j];
               y[i+2UL] += A(i+2UL,j) * x[j];
               y[i+3UL] += A(i+3UL,j) * x[j];
               y[i+4UL] += A(i+4UL,j) * x[j];
               y[i+5UL] += A(i+5UL,j) * x[j];
               y[i+6UL] += A(i+6UL,j) * x[j];
               y[i+7UL] += A(i+7UL,j) * x[j];
            }
         }
         else
         {
            // Purely scalar path: the non-zero column range is shorter than one SIMD width.
            ElementType value1( A(i    ,j) * x[j] );
            ElementType value2( A(i+1UL,j) * x[j] );
            ElementType value3( A(i+2UL,j) * x[j] );
            ElementType value4( A(i+3UL,j) * x[j] );
            ElementType value5( A(i+4UL,j) * x[j] );
            ElementType value6( A(i+5UL,j) * x[j] );
            ElementType value7( A(i+6UL,j) * x[j] );
            ElementType value8( A(i+7UL,j) * x[j] );

            for( ++j; j<jend; ++j ) {
               value1 += A(i    ,j) * x[j];
               value2 += A(i+1UL,j) * x[j];
               value3 += A(i+2UL,j) * x[j];
               value4 += A(i+3UL,j) * x[j];
               value5 += A(i+4UL,j) * x[j];
               value6 += A(i+5UL,j) * x[j];
               value7 += A(i+6UL,j) * x[j];
               value8 += A(i+7UL,j) * x[j];
            }

            y[i    ] = value1;
            y[i+1UL] = value2;
            y[i+2UL] = value3;
            y[i+3UL] = value4;
            y[i+4UL] = value5;
            y[i+5UL] = value6;
            y[i+6UL] = value7;
            y[i+7UL] = value8;
         }
      }

      // Stage 2: blocks of 4 rows (same structure as stage 1).
      for( ; (i+4UL) <= M; i+=4UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         if( j < jpos )
         {
            SIMDType x1( x.load(j) );
            SIMDType xmm1( A.load(i    ,j) * x1 );
            SIMDType xmm2( A.load(i+1UL,j) * x1 );
            SIMDType xmm3( A.load(i+2UL,j) * x1 );
            SIMDType xmm4( A.load(i+3UL,j) * x1 );

            for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
               x1 = x.load(j);
               xmm1 += A.load(i    ,j) * x1;
               xmm2 += A.load(i+1UL,j) * x1;
               xmm3 += A.load(i+2UL,j) * x1;
               xmm4 += A.load(i+3UL,j) * x1;
            }

            y[i    ] = sum( xmm1 );
            y[i+1UL] = sum( xmm2 );
            y[i+2UL] = sum( xmm3 );
            y[i+3UL] = sum( xmm4 );

            for( ; remainder && j<jend; ++j ) {
               y[i    ] += A(i    ,j) * x[j];
               y[i+1UL] += A(i+1UL,j) * x[j];
               y[i+2UL] += A(i+2UL,j) * x[j];
               y[i+3UL] += A(i+3UL,j) * x[j];
            }
         }
         else
         {
            ElementType value1( A(i    ,j) * x[j] );
            ElementType value2( A(i+1UL,j) * x[j] );
            ElementType value3( A(i+2UL,j) * x[j] );
            ElementType value4( A(i+3UL,j) * x[j] );

            for( ++j; j<jend; ++j ) {
               value1 += A(i    ,j) * x[j];
               value2 += A(i+1UL,j) * x[j];
               value3 += A(i+2UL,j) * x[j];
               value4 += A(i+3UL,j) * x[j];
            }

            y[i    ] = value1;
            y[i+1UL] = value2;
            y[i+2UL] = value3;
            y[i+3UL] = value4;
         }
      }

      // Stage 3: blocks of 3 rows.
      for( ; (i+3UL) <= M; i+=3UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         if( j < jpos )
         {
            SIMDType x1( x.load(j) );
            SIMDType xmm1( A.load(i    ,j) * x1 );
            SIMDType xmm2( A.load(i+1UL,j) * x1 );
            SIMDType xmm3( A.load(i+2UL,j) * x1 );

            for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
               x1 = x.load(j);
               xmm1 += A.load(i    ,j) * x1;
               xmm2 += A.load(i+1UL,j) * x1;
               xmm3 += A.load(i+2UL,j) * x1;
            }

            y[i    ] = sum( xmm1 );
            y[i+1UL] = sum( xmm2 );
            y[i+2UL] = sum( xmm3 );

            for( ; remainder && j<jend; ++j ) {
               y[i    ] += A(i    ,j) * x[j];
               y[i+1UL] += A(i+1UL,j) * x[j];
               y[i+2UL] += A(i+2UL,j) * x[j];
            }
         }
         else
         {
            ElementType value1( A(i    ,j) * x[j] );
            ElementType value2( A(i+1UL,j) * x[j] );
            ElementType value3( A(i+2UL,j) * x[j] );

            for( ++j; j<jend; ++j ) {
               value1 += A(i    ,j) * x[j];
               value2 += A(i+1UL,j) * x[j];
               value3 += A(i+2UL,j) * x[j];
            }

            y[i    ] = value1;
            y[i+1UL] = value2;
            y[i+2UL] = value3;
         }
      }

      // Stage 4: blocks of 2 rows.
      for( ; (i+2UL) <= M; i+=2UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         if( j < jpos )
         {
            SIMDType x1( x.load(j) );
            SIMDType xmm1( A.load(i    ,j) * x1 );
            SIMDType xmm2( A.load(i+1UL,j) * x1 );

            for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
               x1 = x.load(j);
               xmm1 += A.load(i    ,j) * x1;
               xmm2 += A.load(i+1UL,j) * x1;
            }

            y[i    ] = sum( xmm1 );
            y[i+1UL] = sum( xmm2 );

            for( ; remainder && j<jend; ++j ) {
               y[i    ] += A(i    ,j) * x[j];
               y[i+1UL] += A(i+1UL,j) * x[j];
            }
         }
         else
         {
            ElementType value1( A(i    ,j) * x[j] );
            ElementType value2( A(i+1UL,j) * x[j] );

            for( ++j; j<jend; ++j ) {
               value1 += A(i    ,j) * x[j];
               value2 += A(i+1UL,j) * x[j];
            }

            y[i    ] = value1;
            y[i+1UL] = value2;
         }
      }

      // Stage 5: at most one leftover row.
      if( i < M )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         if( j < jpos )
         {
            SIMDType xmm1( A.load(i,j) * x.load(j) );

            for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
               xmm1 += A.load(i,j) * x.load(j);
            }

            y[i] = sum( xmm1 );

            for( ; remainder && j<jend; ++j ) {
               y[i] += A(i,j) * x[j];
            }
         }
         else
         {
            ElementType value( A(i,j) * x[j] );

            for( ++j; j<jend; ++j ) {
               value += A(i,j) * x[j];
            }

            y[i] = value;
         }
      }
   }
843 //**********************************************************************************************
844
845 //**Default assignment to dense vectors (large matrices)****************************************
   /*!\brief Default assignment of a large dense matrix-dense vector multiplication
   //        (\f$ \vec{y}=A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // Scalar fallback, selected (via SFINAE) when the vectorized large kernel is not
   // applicable for this operand combination.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
   {
      selectDefaultAssignKernel( y, A, x );
   }
868 //**********************************************************************************************
869
870 //**Vectorized default assignment to dense vectors (large matrices)*****************************
   /*!\brief Vectorized default assignment of a large dense matrix-dense vector multiplication
   //        (\f$ \vec{y}=A*\vec{x} \f$).
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \return void
   //
   // SIMD kernel for large problem sizes. Unlike the small kernel it first reset()s the
   // result and then accumulates per-row partial sums directly into y[], unrolling the
   // column loop over 4/2/1 SIMD widths within row blocks of 8/4/2/1. The per-iteration
   // horizontal sum() trades accumulator registers for fewer passes over the (cache-cold)
   // matrix rows.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
   {
      // A scalar tail loop is only required when either operand is unpadded.
      constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      // All updates below are "+="; start from a zeroed result.
      reset( y );

      size_t i( 0UL );

      // Stage 1: blocks of 8 rows.
      for( ; (i+8UL) <= M; i+=8UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         // 4-wide SIMD unrolling over the columns.
         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE     );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 + A.load(i    ,j2) * x3 + A.load(i    ,j3) * x4 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
         }

         // 2-wide SIMD unrolling.
         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
         }

         // Single SIMD-width columns.
         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i    ] += sum( A.load(i    ,j) * x1 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
         }

         // Scalar tail for unpadded operands.
         for( ; remainder && j<jend; ++j ) {
            y[i    ] += A(i    ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];
            y[i+3UL] += A(i+3UL,j) * x[j];
            y[i+4UL] += A(i+4UL,j) * x[j];
            y[i+5UL] += A(i+5UL,j) * x[j];
            y[i+6UL] += A(i+6UL,j) * x[j];
            y[i+7UL] += A(i+7UL,j) * x[j];
         }
      }

      // Stage 2: blocks of 4 rows (same structure as stage 1).
      for( ; (i+4UL) <= M; i+=4UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE     );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 + A.load(i    ,j2) * x3 + A.load(i    ,j3) * x4 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
         }

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
         }

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i    ] += sum( A.load(i    ,j) * x1 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
         }

         for( ; remainder && j<jend; ++j ) {
            y[i    ] += A(i    ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
            y[i+2UL] += A(i+2UL,j) * x[j];
            y[i+3UL] += A(i+3UL,j) * x[j];
         }
      }

      // Stage 3: blocks of 2 rows.
      for( ; (i+2UL) <= M; i+=2UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE     );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 + A.load(i    ,j2) * x3 + A.load(i    ,j3) * x4 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
         }

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
         }

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i    ] += sum( A.load(i    ,j) * x1 );
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
         }

         for( ; remainder && j<jend; ++j ) {
            y[i    ] += A(i    ,j) * x[j];
            y[i+1UL] += A(i+1UL,j) * x[j];
         }
      }

      // Stage 4: at most one leftover row.
      if( i < M )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE     );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
         }

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
         }

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i] += sum( A.load(i,j) * x1 );
         }

         for( ; remainder && j<jend; ++j ) {
            y[i] += A(i,j) * x[j];
         }
      }
   }
1116 //**********************************************************************************************
1117
1118 //**BLAS-based assignment to dense vectors (default)********************************************
   // Fallback assignment kernel, selected when no BLAS kernel is applicable for the given
   // operand types (DisableIf on UseBlasKernel_v). Simply delegates y = A * x to the
   // default large-matrix kernel.
1132 template< typename VT1 // Type of the left-hand side target vector
1133 , typename MT1 // Type of the left-hand side matrix operand
1134 , typename VT2 > // Type of the right-hand side vector operand
1135 static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1136 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1137 {
1138 selectLargeAssignKernel( y, A, x );
1139 }
1141 //**********************************************************************************************
1142
1143 //**BLAS-based assignment to dense vectors******************************************************
1144#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   // BLAS-based assignment kernel (y = A * x), enabled only when the operand types are
   // BLAS compatible (EnableIf on UseBlasKernel_v).
1158 template< typename VT1 // Type of the left-hand side target vector
1159 , typename MT1 // Type of the left-hand side matrix operand
1160 , typename VT2 > // Type of the right-hand side vector operand
1161 static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1162 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1163 {
1164 using ET = ElementType_t<VT1>;
1165
   // Triangular matrices use trmv, which works in place: copy x into y first,
   // then overwrite y with A * y. All other matrices use gemv with alpha=1, beta=0.
1166 if( IsTriangular_v<MT1> ) {
1167 assign( y, x );
1168 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1169 }
1170 else {
1171 gemv( y, A, x, ET(1), ET(0) );
1172 }
1173 }
1175#endif
1176 //**********************************************************************************************
1177
1178 //**Assignment to sparse vectors****************************************************************
   // Assignment of the matrix/vector product to a sparse vector. The product is first
   // evaluated serially into a dense temporary of ResultType, which is then assigned to
   // the sparse target (direct element-wise evaluation into a sparse vector is not
   // efficient). NOTE(review): lines 1195/1199 are hidden in this listing — presumably
   // trace/constraint macros; confirm against the original header.
1191 template< typename VT1 > // Type of the target sparse vector
1192 friend inline void assign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1193 {
1195
1199
1200 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
1201
1202 const ResultType tmp( serial( rhs ) );
1203 assign( *lhs, tmp );
1204 }
1206 //**********************************************************************************************
1207
1208 //**Addition assignment to dense vectors********************************************************
   // Addition assignment of the matrix/vector product to a dense vector (y += A * x).
   // Evaluates both operands (if necessary) and dispatches to the kernel selection.
   // NOTE(review): line 1225 is hidden in this listing — presumably a trace macro.
1221 template< typename VT1 > // Type of the target dense vector
1222 friend inline void addAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1223 {
1225
1226 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
1227
   // Early exit: the product contributes nothing for an empty matrix, or for a
   // strictly triangular 1x1 matrix (whose single element is structurally zero).
1228 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
1229 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
1230 return;
1231 }
1232
1233 LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1234 RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1235
1236 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1237 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1238 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1239 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
1240
1241 DMatDVecMultExpr::selectAddAssignKernel( *lhs, A, x );
1242 }
1244 //**********************************************************************************************
1245
1246 //**Addition assignment to dense vectors (kernel selection)*************************************
1257 template< typename VT1 // Type of the left-hand side target vector
1258 , typename MT1 // Type of the left-hand side matrix operand
1259 , typename VT2 > // Type of the right-hand side vector operand
1260 static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1261 {
1262 if( ( IsDiagonal_v<MT1> ) ||
1263 ( IsComputation_v<MT> && !evaluateMatrix ) ||
1264 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1265 selectSmallAddAssignKernel( y, A, x );
1266 else
1267 selectBlasAddAssignKernel( y, A, x );
1268 }
1270 //**********************************************************************************************
1271
1272 //**Default addition assignment to dense vectors************************************************
   // Default (non-vectorized) addition assignment kernel: delegates y += A * x to the
   // target vector's own addAssign of the plain product expression.
1286 template< typename VT1 // Type of the left-hand side target vector
1287 , typename MT1 // Type of the left-hand side matrix operand
1288 , typename VT2 > // Type of the right-hand side vector operand
1289 static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1290 {
1291 y.addAssign( A * x );
1292 }
1294 //**********************************************************************************************
1295
1296 //**Default addition assignment to dense vectors (small matrices)*******************************
   // Scalar fallback of the small addition assignment kernel, selected when the
   // vectorized default kernel is not applicable for the operand types.
1310 template< typename VT1 // Type of the left-hand side target vector
1311 , typename MT1 // Type of the left-hand side matrix operand
1312 , typename VT2 > // Type of the right-hand side vector operand
1313 static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1314 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1315 {
1316 selectDefaultAddAssignKernel( y, A, x );
1317 }
1319 //**********************************************************************************************
1320
1321 //**Vectorized default addition assignment to dense vectors (small matrices)********************
   // Vectorized small-matrix addition assignment kernel (y += A * x).
   // Rows are processed in unrolled tiers of 8, 4, 3, 2 and finally 1, accumulating one
   // SIMD register per row across the columns and reducing it with sum() at the end.
   // For triangular matrices jbegin/jend restrict each row to its structurally non-zero
   // band; jpos is the last SIMD-aligned column when a scalar remainder is required
   // (i.e. when either operand is unpadded).
1335 template< typename VT1 // Type of the left-hand side target vector
1336 , typename MT1 // Type of the left-hand side matrix operand
1337 , typename VT2 > // Type of the right-hand side vector operand
1338 static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1339 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1340 {
   // A scalar remainder loop is only needed when SIMD loads may not cover jend exactly.
1341 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
1342
1343 const size_t M( A.rows() );
1344 const size_t N( A.columns() );
1345
1346 size_t i( 0UL );
1347
   // Tier 1: eight rows at a time.
1348 for( ; (i+8UL) <= M; i+=8UL )
1349 {
1350 const size_t jbegin( ( IsUpper_v<MT1> )
1351 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
1352 :( 0UL ) );
1353 const size_t jend( ( IsLower_v<MT1> )
1354 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
1355 :( N ) );
1356 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
1357
1358 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
1359 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
1360
1361 size_t j( jbegin );
1362
   // SIMD path: at least one full SIMD chunk lies in [jbegin,jpos).
1363 if( j < jpos )
1364 {
1365 SIMDType x1( x.load(j) );
1366 SIMDType xmm1( A.load(i ,j) * x1 );
1367 SIMDType xmm2( A.load(i+1UL,j) * x1 );
1368 SIMDType xmm3( A.load(i+2UL,j) * x1 );
1369 SIMDType xmm4( A.load(i+3UL,j) * x1 );
1370 SIMDType xmm5( A.load(i+4UL,j) * x1 );
1371 SIMDType xmm6( A.load(i+5UL,j) * x1 );
1372 SIMDType xmm7( A.load(i+6UL,j) * x1 );
1373 SIMDType xmm8( A.load(i+7UL,j) * x1 );
1374
1375 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
1376 x1 = x.load(j);
1377 xmm1 += A.load(i ,j) * x1;
1378 xmm2 += A.load(i+1UL,j) * x1;
1379 xmm3 += A.load(i+2UL,j) * x1;
1380 xmm4 += A.load(i+3UL,j) * x1;
1381 xmm5 += A.load(i+4UL,j) * x1;
1382 xmm6 += A.load(i+5UL,j) * x1;
1383 xmm7 += A.load(i+6UL,j) * x1;
1384 xmm8 += A.load(i+7UL,j) * x1;
1385 }
1386
   // Horizontal reduction of the per-row accumulators into the target elements.
1387 y[i ] += sum( xmm1 );
1388 y[i+1UL] += sum( xmm2 );
1389 y[i+2UL] += sum( xmm3 );
1390 y[i+3UL] += sum( xmm4 );
1391 y[i+4UL] += sum( xmm5 );
1392 y[i+5UL] += sum( xmm6 );
1393 y[i+6UL] += sum( xmm7 );
1394 y[i+7UL] += sum( xmm8 );
1395
   // Scalar remainder columns [jpos,jend).
1396 for( ; remainder && j<jend; ++j ) {
1397 y[i ] += A(i ,j) * x[j];
1398 y[i+1UL] += A(i+1UL,j) * x[j];
1399 y[i+2UL] += A(i+2UL,j) * x[j];
1400 y[i+3UL] += A(i+3UL,j) * x[j];
1401 y[i+4UL] += A(i+4UL,j) * x[j];
1402 y[i+5UL] += A(i+5UL,j) * x[j];
1403 y[i+6UL] += A(i+6UL,j) * x[j];
1404 y[i+7UL] += A(i+7UL,j) * x[j];
1405 }
1406 }
   // Scalar path: the whole band [jbegin,jend) fits before the first aligned chunk.
1407 else
1408 {
1409 ElementType value1( A(i ,j) * x[j] );
1410 ElementType value2( A(i+1UL,j) * x[j] );
1411 ElementType value3( A(i+2UL,j) * x[j] );
1412 ElementType value4( A(i+3UL,j) * x[j] );
1413 ElementType value5( A(i+4UL,j) * x[j] );
1414 ElementType value6( A(i+5UL,j) * x[j] );
1415 ElementType value7( A(i+6UL,j) * x[j] );
1416 ElementType value8( A(i+7UL,j) * x[j] );
1417
1418 for( ++j; j<jend; ++j ) {
1419 value1 += A(i ,j) * x[j];
1420 value2 += A(i+1UL,j) * x[j];
1421 value3 += A(i+2UL,j) * x[j];
1422 value4 += A(i+3UL,j) * x[j];
1423 value5 += A(i+4UL,j) * x[j];
1424 value6 += A(i+5UL,j) * x[j];
1425 value7 += A(i+6UL,j) * x[j];
1426 value8 += A(i+7UL,j) * x[j];
1427 }
1428
1429 y[i ] += value1;
1430 y[i+1UL] += value2;
1431 y[i+2UL] += value3;
1432 y[i+3UL] += value4;
1433 y[i+4UL] += value5;
1434 y[i+5UL] += value6;
1435 y[i+6UL] += value7;
1436 y[i+7UL] += value8;
1437 }
1438 }
1439
   // Tier 2: four rows at a time (same structure as tier 1).
1440 for( ; (i+4UL) <= M; i+=4UL )
1441 {
1442 const size_t jbegin( ( IsUpper_v<MT1> )
1443 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
1444 :( 0UL ) );
1445 const size_t jend( ( IsLower_v<MT1> )
1446 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
1447 :( N ) );
1448 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
1449
1450 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
1451 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
1452
1453 size_t j( jbegin );
1454
1455 if( j < jpos )
1456 {
1457 SIMDType x1( x.load(j) );
1458 SIMDType xmm1( A.load(i ,j) * x1 );
1459 SIMDType xmm2( A.load(i+1UL,j) * x1 );
1460 SIMDType xmm3( A.load(i+2UL,j) * x1 );
1461 SIMDType xmm4( A.load(i+3UL,j) * x1 );
1462
1463 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
1464 x1 = x.load(j);
1465 xmm1 += A.load(i ,j) * x1;
1466 xmm2 += A.load(i+1UL,j) * x1;
1467 xmm3 += A.load(i+2UL,j) * x1;
1468 xmm4 += A.load(i+3UL,j) * x1;
1469 }
1470
1471 y[i ] += sum( xmm1 );
1472 y[i+1UL] += sum( xmm2 );
1473 y[i+2UL] += sum( xmm3 );
1474 y[i+3UL] += sum( xmm4 );
1475
1476 for( ; remainder && j<jend; ++j ) {
1477 y[i ] += A(i ,j) * x[j];
1478 y[i+1UL] += A(i+1UL,j) * x[j];
1479 y[i+2UL] += A(i+2UL,j) * x[j];
1480 y[i+3UL] += A(i+3UL,j) * x[j];
1481 }
1482 }
1483 else
1484 {
1485 ElementType value1( A(i ,j) * x[j] );
1486 ElementType value2( A(i+1UL,j) * x[j] );
1487 ElementType value3( A(i+2UL,j) * x[j] );
1488 ElementType value4( A(i+3UL,j) * x[j] );
1489
1490 for( ++j; j<jend; ++j ) {
1491 value1 += A(i ,j) * x[j];
1492 value2 += A(i+1UL,j) * x[j];
1493 value3 += A(i+2UL,j) * x[j];
1494 value4 += A(i+3UL,j) * x[j];
1495 }
1496
1497 y[i ] += value1;
1498 y[i+1UL] += value2;
1499 y[i+2UL] += value3;
1500 y[i+3UL] += value4;
1501 }
1502 }
1503
   // Tier 3: three rows at a time.
1504 for( ; (i+3UL) <= M; i+=3UL )
1505 {
1506 const size_t jbegin( ( IsUpper_v<MT1> )
1507 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
1508 :( 0UL ) );
1509 const size_t jend( ( IsLower_v<MT1> )
1510 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
1511 :( N ) );
1512 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
1513
1514 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
1515 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
1516
1517 size_t j( jbegin );
1518
1519 if( j < jpos )
1520 {
1521 SIMDType x1( x.load(j) );
1522 SIMDType xmm1( A.load(i ,j) * x1 );
1523 SIMDType xmm2( A.load(i+1UL,j) * x1 );
1524 SIMDType xmm3( A.load(i+2UL,j) * x1 );
1525
1526 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
1527 x1 = x.load(j);
1528 xmm1 += A.load(i ,j) * x1;
1529 xmm2 += A.load(i+1UL,j) * x1;
1530 xmm3 += A.load(i+2UL,j) * x1;
1531 }
1532
1533 y[i ] += sum( xmm1 );
1534 y[i+1UL] += sum( xmm2 );
1535 y[i+2UL] += sum( xmm3 );
1536
1537 for( ; remainder && j<jend; ++j ) {
1538 y[i ] += A(i ,j) * x[j];
1539 y[i+1UL] += A(i+1UL,j) * x[j];
1540 y[i+2UL] += A(i+2UL,j) * x[j];
1541 }
1542 }
1543 else
1544 {
1545 ElementType value1( A(i ,j) * x[j] );
1546 ElementType value2( A(i+1UL,j) * x[j] );
1547 ElementType value3( A(i+2UL,j) * x[j] );
1548
1549 for( ++j; j<jend; ++j ) {
1550 value1 += A(i ,j) * x[j];
1551 value2 += A(i+1UL,j) * x[j];
1552 value3 += A(i+2UL,j) * x[j];
1553 }
1554
1555 y[i ] += value1;
1556 y[i+1UL] += value2;
1557 y[i+2UL] += value3;
1558 }
1559 }
1560
   // Tier 4: two rows at a time.
1561 for( ; (i+2UL) <= M; i+=2UL )
1562 {
1563 const size_t jbegin( ( IsUpper_v<MT1> )
1564 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
1565 :( 0UL ) );
1566 const size_t jend( ( IsLower_v<MT1> )
1567 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1568 :( N ) );
1569 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
1570
1571 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
1572 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
1573
1574 size_t j( jbegin );
1575
1576 if( j < jpos )
1577 {
1578 SIMDType x1( x.load(j) );
1579 SIMDType xmm1( A.load(i ,j) * x1 );
1580 SIMDType xmm2( A.load(i+1UL,j) * x1 );
1581
1582 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
1583 x1 = x.load(j);
1584 xmm1 += A.load(i ,j) * x1;
1585 xmm2 += A.load(i+1UL,j) * x1;
1586 }
1587
1588 y[i ] += sum( xmm1 );
1589 y[i+1UL] += sum( xmm2 );
1590
1591 for( ; remainder && j<jend; ++j ) {
1592 y[i ] += A(i ,j) * x[j];
1593 y[i+1UL] += A(i+1UL,j) * x[j];
1594 }
1595 }
1596 else
1597 {
1598 ElementType value1( A(i ,j) * x[j] );
1599 ElementType value2( A(i+1UL,j) * x[j] );
1600
1601 for( ++j; j<jend; ++j ) {
1602 value1 += A(i ,j) * x[j];
1603 value2 += A(i+1UL,j) * x[j];
1604 }
1605
1606 y[i ] += value1;
1607 y[i+1UL] += value2;
1608 }
1609 }
1610
   // Tier 5: the single remaining row, if M is not a multiple of the tier widths.
1611 if( i < M )
1612 {
1613 const size_t jbegin( ( IsUpper_v<MT1> )
1614 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
1615 :( 0UL ) );
1616 const size_t jend( ( IsLower_v<MT1> )
1617 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1618 :( N ) );
1619 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
1620
1621 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
1622 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
1623
1624 size_t j( jbegin );
1625
1626 if( j < jpos )
1627 {
1628 SIMDType xmm1( A.load(i,j) * x.load(j) );
1629
1630 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
1631 xmm1 += A.load(i,j) * x.load(j);
1632 }
1633
1634 y[i] += sum( xmm1 );
1635
1636 for( ; remainder && j<jend; ++j ) {
1637 y[i] += A(i,j) * x[j];
1638 }
1639 }
1640 else
1641 {
1642 ElementType value( A(i,j) * x[j] );
1643
1644 for( ++j; j<jend; ++j ) {
1645 value += A(i,j) * x[j];
1646 }
1647
1648 y[i] += value;
1649 }
1650 }
1651 }
1653 //**********************************************************************************************
1654
1655 //**Default addition assignment to dense vectors (large matrices)*******************************
   // Scalar fallback of the large addition assignment kernel, selected when the
   // vectorized default kernel is not applicable for the operand types.
1669 template< typename VT1 // Type of the left-hand side target vector
1670 , typename MT1 // Type of the left-hand side matrix operand
1671 , typename VT2 > // Type of the right-hand side vector operand
1672 static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1673 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1674 {
1675 selectDefaultAddAssignKernel( y, A, x );
1676 }
1678 //**********************************************************************************************
1679
1680 //**Vectorized default addition assignment to dense vectors (large matrices)********************
   // Vectorized large-matrix addition assignment kernel (y += A * x).
   // In contrast to the small kernel (which keeps one SIMD accumulator per row), this
   // variant unrolls across columns: within each row tier (8/4/2/1 rows) the column loop
   // processes 4, then 2, then 1 SIMD chunks per iteration and reduces each chunk
   // immediately via sum(). jbegin/jend restrict triangular matrices to their non-zero
   // band; jpos marks the SIMD-aligned end when a scalar remainder loop is needed.
1694 template< typename VT1 // Type of the left-hand side target vector
1695 , typename MT1 // Type of the left-hand side matrix operand
1696 , typename VT2 > // Type of the right-hand side vector operand
1697 static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1698 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1699 {
   // A scalar remainder loop is only needed when SIMD loads may not cover jend exactly.
1700 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
1701
1702 const size_t M( A.rows() );
1703 const size_t N( A.columns() );
1704
1705 size_t i( 0UL );
1706
   // Tier 1: eight rows at a time.
1707 for( ; (i+8UL) <= M; i+=8UL )
1708 {
1709 const size_t jbegin( ( IsUpper_v<MT1> )
1710 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
1711 :( 0UL ) );
1712 const size_t jend( ( IsLower_v<MT1> )
1713 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
1714 :( N ) );
1715 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
1716
1717 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
1718 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
1719
1720 size_t j( jbegin );
1721
   // Four SIMD chunks of columns per iteration.
1722 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1723 const size_t j1( j+SIMDSIZE );
1724 const size_t j2( j+SIMDSIZE*2UL );
1725 const size_t j3( j+SIMDSIZE*3UL );
1726 const SIMDType x1( x.load(j ) );
1727 const SIMDType x2( x.load(j1) );
1728 const SIMDType x3( x.load(j2) );
1729 const SIMDType x4( x.load(j3) );
1730 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1731 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1732 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1733 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1734 y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1735 y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1736 y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1737 y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
1738 }
1739
   // Two SIMD chunks per iteration.
1740 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1741 const size_t j1( j+SIMDSIZE );
1742 const SIMDType x1( x.load(j ) );
1743 const SIMDType x2( x.load(j1) );
1744 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1745 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1746 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1747 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1748 y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1749 y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1750 y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1751 y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
1752 }
1753
   // One SIMD chunk per iteration.
1754 for( ; j<jpos; j+=SIMDSIZE ) {
1755 const SIMDType x1( x.load(j) );
1756 y[i ] += sum( A.load(i ,j) * x1 );
1757 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1758 y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1759 y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1760 y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
1761 y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
1762 y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
1763 y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
1764 }
1765
   // Scalar remainder columns [jpos,jend).
1766 for( ; remainder && j<jend; ++j ) {
1767 y[i ] += A(i ,j) * x[j];
1768 y[i+1UL] += A(i+1UL,j) * x[j];
1769 y[i+2UL] += A(i+2UL,j) * x[j];
1770 y[i+3UL] += A(i+3UL,j) * x[j];
1771 y[i+4UL] += A(i+4UL,j) * x[j];
1772 y[i+5UL] += A(i+5UL,j) * x[j];
1773 y[i+6UL] += A(i+6UL,j) * x[j];
1774 y[i+7UL] += A(i+7UL,j) * x[j];
1775 }
1776 }
1777
   // Tier 2: four rows at a time (same column unrolling scheme).
1778 for( ; (i+4UL) <= M; i+=4UL )
1779 {
1780 const size_t jbegin( ( IsUpper_v<MT1> )
1781 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
1782 :( 0UL ) );
1783 const size_t jend( ( IsLower_v<MT1> )
1784 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
1785 :( N ) );
1786 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
1787
1788 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
1789 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
1790
1791 size_t j( jbegin );
1792
1793 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1794 const size_t j1( j+SIMDSIZE );
1795 const size_t j2( j+SIMDSIZE*2UL );
1796 const size_t j3( j+SIMDSIZE*3UL );
1797 const SIMDType x1( x.load(j ) );
1798 const SIMDType x2( x.load(j1) );
1799 const SIMDType x3( x.load(j2) );
1800 const SIMDType x4( x.load(j3) );
1801 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1802 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1803 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1804 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1805 }
1806
1807 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1808 const size_t j1( j+SIMDSIZE );
1809 const SIMDType x1( x.load(j ) );
1810 const SIMDType x2( x.load(j1) );
1811 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1812 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1813 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1814 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1815 }
1816
1817 for( ; j<jpos; j+=SIMDSIZE ) {
1818 const SIMDType x1( x.load(j) );
1819 y[i ] += sum( A.load(i ,j) * x1 );
1820 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1821 y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1822 y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1823 }
1824
1825 for( ; remainder && j<jend; ++j ) {
1826 y[i ] += A(i ,j) * x[j];
1827 y[i+1UL] += A(i+1UL,j) * x[j];
1828 y[i+2UL] += A(i+2UL,j) * x[j];
1829 y[i+3UL] += A(i+3UL,j) * x[j];
1830 }
1831 }
1832
   // Tier 3: two rows at a time.
1833 for( ; (i+2UL) <= M; i+=2UL )
1834 {
1835 const size_t jbegin( ( IsUpper_v<MT1> )
1836 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
1837 :( 0UL ) );
1838 const size_t jend( ( IsLower_v<MT1> )
1839 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1840 :( N ) );
1841 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
1842
1843 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
1844 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
1845
1846 size_t j( jbegin );
1847
1848 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1849 const size_t j1( j+SIMDSIZE );
1850 const size_t j2( j+SIMDSIZE*2UL );
1851 const size_t j3( j+SIMDSIZE*3UL );
1852 const SIMDType x1( x.load(j ) );
1853 const SIMDType x2( x.load(j1) );
1854 const SIMDType x3( x.load(j2) );
1855 const SIMDType x4( x.load(j3) );
1856 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1857 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1858 }
1859
1860 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1861 const size_t j1( j+SIMDSIZE );
1862 const SIMDType x1( x.load(j ) );
1863 const SIMDType x2( x.load(j1) );
1864 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1865 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1866 }
1867
1868 for( ; j<jpos; j+=SIMDSIZE ) {
1869 const SIMDType x1( x.load(j) );
1870 y[i ] += sum( A.load(i ,j) * x1 );
1871 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1872 }
1873
1874 for( ; remainder && j<jend; ++j ) {
1875 y[i ] += A(i ,j) * x[j];
1876 y[i+1UL] += A(i+1UL,j) * x[j];
1877 }
1878 }
1879
   // Tier 4: the single remaining row, if any.
1880 if( i < M )
1881 {
1882 const size_t jbegin( ( IsUpper_v<MT1> )
1883 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
1884 :( 0UL ) );
1885 const size_t jend( ( IsLower_v<MT1> )
1886 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1887 :( N ) );
1888 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
1889
1890 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
1891 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
1892
1893 size_t j( jbegin );
1894
1895 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1896 const size_t j1( j+SIMDSIZE );
1897 const size_t j2( j+SIMDSIZE*2UL );
1898 const size_t j3( j+SIMDSIZE*3UL );
1899 const SIMDType x1( x.load(j ) );
1900 const SIMDType x2( x.load(j1) );
1901 const SIMDType x3( x.load(j2) );
1902 const SIMDType x4( x.load(j3) );
1903 y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1904 }
1905
1906 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1907 const size_t j1( j+SIMDSIZE );
1908 const SIMDType x1( x.load(j ) );
1909 const SIMDType x2( x.load(j1) );
1910 y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1911 }
1912
1913 for( ; j<jpos; j+=SIMDSIZE ) {
1914 const SIMDType x1( x.load(j) );
1915 y[i] += sum( A.load(i,j) * x1 );
1916 }
1917
1918 for( ; remainder && j<jend; ++j ) {
1919 y[i] += A(i,j) * x[j];
1920 }
1921 }
1922 }
1924 //**********************************************************************************************
1925
1926 //**BLAS-based addition assignment to dense vectors (default)***********************************
   // Fallback addition assignment kernel, selected when no BLAS kernel is applicable
   // for the given operand types; delegates to the large-matrix default kernel.
1940 template< typename VT1 // Type of the left-hand side target vector
1941 , typename MT1 // Type of the left-hand side matrix operand
1942 , typename VT2 > // Type of the right-hand side vector operand
1943 static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1944 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1945 {
1946 selectLargeAddAssignKernel( y, A, x );
1947 }
1949 //**********************************************************************************************
1950
1951 //**BLAS-based addition assignment to dense vectors*********************************************
1952#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   // BLAS-based addition assignment kernel (y += A * x), enabled only for BLAS
   // compatible operand types.
1966 template< typename VT1 // Type of the left-hand side target vector
1967 , typename MT1 // Type of the left-hand side matrix operand
1968 , typename VT2 > // Type of the right-hand side vector operand
1969 static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1970 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1971 {
1972 using ET = ElementType_t<VT1>;
1973
   // trmv overwrites its vector argument, so the product A * x is computed in a
   // temporary copy of x and only then added onto y. The general case uses gemv
   // with alpha=1 and beta=1, which accumulates directly into y.
1974 if( IsTriangular_v<MT1> ) {
1975 ResultType_t<VT1> tmp( serial( x ) );
1976 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1977 addAssign( y, tmp );
1978 }
1979 else {
1980 gemv( y, A, x, ET(1), ET(1) );
1981 }
1982 }
1984#endif
1985 //**********************************************************************************************
1986
1987 //**Addition assignment to sparse vectors*******************************************************
1988 // No special implementation for the addition assignment to sparse vectors.
1989 //**********************************************************************************************
1990
1991 //**Subtraction assignment to dense vectors*****************************************************
   // Subtraction assignment of the matrix/vector product to a dense vector (y -= A * x).
   // Mirrors addAssign: early exit for products that contribute nothing, evaluation of
   // the operands, then kernel dispatch. NOTE(review): line 2008 is hidden in this
   // listing — presumably a trace macro.
2004 template< typename VT1 > // Type of the target dense vector
2005 friend inline void subAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2006 {
2008
2009 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2010
   // Early exit: nothing to subtract for an empty matrix or a strictly triangular
   // 1x1 matrix (whose single element is structurally zero).
2011 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2012 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2013 return;
2014 }
2015
2016 LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
2017 RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
2018
2019 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2020 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2021 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2022 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
2023
2024 DMatDVecMultExpr::selectSubAssignKernel( *lhs, A, x );
2025 }
2027 //**********************************************************************************************
2028
2029 //**Subtraction assignment to dense vectors (kernel selection)**********************************
2040 template< typename VT1 // Type of the left-hand side target vector
2041 , typename MT1 // Type of the left-hand side matrix operand
2042 , typename VT2 > // Type of the right-hand side vector operand
2043 static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2044 {
2045 if( ( IsDiagonal_v<MT1> ) ||
2046 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2047 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
2048 selectSmallSubAssignKernel( y, A, x );
2049 else
2050 selectBlasSubAssignKernel( y, A, x );
2051 }
2053 //**********************************************************************************************
2054
2055 //**Default subtraction assignment to dense vectors*********************************************
   // Default (non-vectorized) subtraction assignment kernel: delegates y -= A * x to
   // the target vector's own subAssign of the plain product expression.
2069 template< typename VT1 // Type of the left-hand side target vector
2070 , typename MT1 // Type of the left-hand side matrix operand
2071 , typename VT2 > // Type of the right-hand side vector operand
2072 static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2073 {
2074 y.subAssign( A * x );
2075 }
2077 //**********************************************************************************************
2078
2079 //**Default subtraction assignment to dense vectors (small matrices)****************************
   // Scalar fallback of the small subtraction assignment kernel, selected when the
   // vectorized default kernel is not applicable for the operand types.
2093 template< typename VT1 // Type of the left-hand side target vector
2094 , typename MT1 // Type of the left-hand side matrix operand
2095 , typename VT2 > // Type of the right-hand side vector operand
2096 static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2097 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
2098 {
2099 selectDefaultSubAssignKernel( y, A, x );
2100 }
2102 //**********************************************************************************************
2103
2104 //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
2118 template< typename VT1 // Type of the left-hand side target vector
2119 , typename MT1 // Type of the left-hand side matrix operand
2120 , typename VT2 > // Type of the right-hand side vector operand
2121 static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2122 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
2123 {
2124 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
2125
2126 const size_t M( A.rows() );
2127 const size_t N( A.columns() );
2128
2129 size_t i( 0UL );
2130
2131 for( ; (i+8UL) <= M; i+=8UL )
2132 {
2133 const size_t jbegin( ( IsUpper_v<MT1> )
2134 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
2135 :( 0UL ) );
2136 const size_t jend( ( IsLower_v<MT1> )
2137 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
2138 :( N ) );
2139 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
2140
2141 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
2142 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
2143
2144 size_t j( jbegin );
2145
2146 if( j < jpos )
2147 {
2148 SIMDType x1( x.load(j) );
2149 SIMDType xmm1( A.load(i ,j) * x1 );
2150 SIMDType xmm2( A.load(i+1UL,j) * x1 );
2151 SIMDType xmm3( A.load(i+2UL,j) * x1 );
2152 SIMDType xmm4( A.load(i+3UL,j) * x1 );
2153 SIMDType xmm5( A.load(i+4UL,j) * x1 );
2154 SIMDType xmm6( A.load(i+5UL,j) * x1 );
2155 SIMDType xmm7( A.load(i+6UL,j) * x1 );
2156 SIMDType xmm8( A.load(i+7UL,j) * x1 );
2157
2158 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
2159 x1 = x.load(j);
2160 xmm1 += A.load(i ,j) * x1;
2161 xmm2 += A.load(i+1UL,j) * x1;
2162 xmm3 += A.load(i+2UL,j) * x1;
2163 xmm4 += A.load(i+3UL,j) * x1;
2164 xmm5 += A.load(i+4UL,j) * x1;
2165 xmm6 += A.load(i+5UL,j) * x1;
2166 xmm7 += A.load(i+6UL,j) * x1;
2167 xmm8 += A.load(i+7UL,j) * x1;
2168 }
2169
2170 y[i ] -= sum( xmm1 );
2171 y[i+1UL] -= sum( xmm2 );
2172 y[i+2UL] -= sum( xmm3 );
2173 y[i+3UL] -= sum( xmm4 );
2174 y[i+4UL] -= sum( xmm5 );
2175 y[i+5UL] -= sum( xmm6 );
2176 y[i+6UL] -= sum( xmm7 );
2177 y[i+7UL] -= sum( xmm8 );
2178
2179 for( ; remainder && j<jend; ++j ) {
2180 y[i ] -= A(i ,j) * x[j];
2181 y[i+1UL] -= A(i+1UL,j) * x[j];
2182 y[i+2UL] -= A(i+2UL,j) * x[j];
2183 y[i+3UL] -= A(i+3UL,j) * x[j];
2184 y[i+4UL] -= A(i+4UL,j) * x[j];
2185 y[i+5UL] -= A(i+5UL,j) * x[j];
2186 y[i+6UL] -= A(i+6UL,j) * x[j];
2187 y[i+7UL] -= A(i+7UL,j) * x[j];
2188 }
2189 }
2190 else
2191 {
2192 ElementType value1( A(i ,j) * x[j] );
2193 ElementType value2( A(i+1UL,j) * x[j] );
2194 ElementType value3( A(i+2UL,j) * x[j] );
2195 ElementType value4( A(i+3UL,j) * x[j] );
2196 ElementType value5( A(i+4UL,j) * x[j] );
2197 ElementType value6( A(i+5UL,j) * x[j] );
2198 ElementType value7( A(i+6UL,j) * x[j] );
2199 ElementType value8( A(i+7UL,j) * x[j] );
2200
2201 for( ++j; j<jend; ++j ) {
2202 value1 += A(i ,j) * x[j];
2203 value2 += A(i+1UL,j) * x[j];
2204 value3 += A(i+2UL,j) * x[j];
2205 value4 += A(i+3UL,j) * x[j];
2206 value5 += A(i+4UL,j) * x[j];
2207 value6 += A(i+5UL,j) * x[j];
2208 value7 += A(i+6UL,j) * x[j];
2209 value8 += A(i+7UL,j) * x[j];
2210 }
2211
2212 y[i ] -= value1;
2213 y[i+1UL] -= value2;
2214 y[i+2UL] -= value3;
2215 y[i+3UL] -= value4;
2216 y[i+4UL] -= value5;
2217 y[i+5UL] -= value6;
2218 y[i+6UL] -= value7;
2219 y[i+7UL] -= value8;
2220 }
2221 }
2222
2223 for( ; (i+4UL) <= M; i+=4UL )
2224 {
2225 const size_t jbegin( ( IsUpper_v<MT1> )
2226 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
2227 :( 0UL ) );
2228 const size_t jend( ( IsLower_v<MT1> )
2229 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
2230 :( N ) );
2231 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
2232
2233 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
2234 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
2235
2236 size_t j( jbegin );
2237
2238 if( j < jpos )
2239 {
2240 SIMDType x1( x.load(j) );
2241 SIMDType xmm1( A.load(i ,j) * x1 );
2242 SIMDType xmm2( A.load(i+1UL,j) * x1 );
2243 SIMDType xmm3( A.load(i+2UL,j) * x1 );
2244 SIMDType xmm4( A.load(i+3UL,j) * x1 );
2245
2246 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
2247 x1 = x.load(j);
2248 xmm1 += A.load(i ,j) * x1;
2249 xmm2 += A.load(i+1UL,j) * x1;
2250 xmm3 += A.load(i+2UL,j) * x1;
2251 xmm4 += A.load(i+3UL,j) * x1;
2252 }
2253
2254 y[i ] -= sum( xmm1 );
2255 y[i+1UL] -= sum( xmm2 );
2256 y[i+2UL] -= sum( xmm3 );
2257 y[i+3UL] -= sum( xmm4 );
2258
2259 for( ; remainder && j<jend; ++j ) {
2260 y[i ] -= A(i ,j) * x[j];
2261 y[i+1UL] -= A(i+1UL,j) * x[j];
2262 y[i+2UL] -= A(i+2UL,j) * x[j];
2263 y[i+3UL] -= A(i+3UL,j) * x[j];
2264 }
2265 }
2266 else
2267 {
2268 ElementType value1( A(i ,j) * x[j] );
2269 ElementType value2( A(i+1UL,j) * x[j] );
2270 ElementType value3( A(i+2UL,j) * x[j] );
2271 ElementType value4( A(i+3UL,j) * x[j] );
2272
2273 for( ++j; j<jend; ++j ) {
2274 value1 += A(i ,j) * x[j];
2275 value2 += A(i+1UL,j) * x[j];
2276 value3 += A(i+2UL,j) * x[j];
2277 value4 += A(i+3UL,j) * x[j];
2278 }
2279
2280 y[i ] -= value1;
2281 y[i+1UL] -= value2;
2282 y[i+2UL] -= value3;
2283 y[i+3UL] -= value4;
2284 }
2285 }
2286
2287 for( ; (i+3UL) <= M; i+=3UL )
2288 {
2289 const size_t jbegin( ( IsUpper_v<MT1> )
2290 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
2291 :( 0UL ) );
2292 const size_t jend( ( IsLower_v<MT1> )
2293 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
2294 :( N ) );
2295 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
2296
2297 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
2298 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
2299
2300 size_t j( jbegin );
2301
2302 if( j < jpos )
2303 {
2304 SIMDType x1( x.load(j) );
2305 SIMDType xmm1( A.load(i ,j) * x1 );
2306 SIMDType xmm2( A.load(i+1UL,j) * x1 );
2307 SIMDType xmm3( A.load(i+2UL,j) * x1 );
2308
2309 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
2310 x1 = x.load(j);
2311 xmm1 += A.load(i ,j) * x1;
2312 xmm2 += A.load(i+1UL,j) * x1;
2313 xmm3 += A.load(i+2UL,j) * x1;
2314 }
2315
2316 y[i ] -= sum( xmm1 );
2317 y[i+1UL] -= sum( xmm2 );
2318 y[i+2UL] -= sum( xmm3 );
2319
2320 for( ; remainder && j<jend; ++j ) {
2321 y[i ] -= A(i ,j) * x[j];
2322 y[i+1UL] -= A(i+1UL,j) * x[j];
2323 y[i+2UL] -= A(i+2UL,j) * x[j];
2324 }
2325 }
2326 else
2327 {
2328 ElementType value1( A(i ,j) * x[j] );
2329 ElementType value2( A(i+1UL,j) * x[j] );
2330 ElementType value3( A(i+2UL,j) * x[j] );
2331
2332 for( ++j; j<jend; ++j ) {
2333 value1 += A(i ,j) * x[j];
2334 value2 += A(i+1UL,j) * x[j];
2335 value3 += A(i+2UL,j) * x[j];
2336 }
2337
2338 y[i ] -= value1;
2339 y[i+1UL] -= value2;
2340 y[i+2UL] -= value3;
2341 }
2342 }
2343
2344 for( ; (i+2UL) <= M; i+=2UL )
2345 {
2346 const size_t jbegin( ( IsUpper_v<MT1> )
2347 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
2348 :( 0UL ) );
2349 const size_t jend( ( IsLower_v<MT1> )
2350 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
2351 :( N ) );
2352 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
2353
2354 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
2355 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
2356
2357 size_t j( jbegin );
2358
2359 if( j < jpos )
2360 {
2361 SIMDType x1( x.load(j) );
2362 SIMDType xmm1( A.load(i ,j) * x1 );
2363 SIMDType xmm2( A.load(i+1UL,j) * x1 );
2364
2365 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
2366 x1 = x.load(j);
2367 xmm1 += A.load(i ,j) * x1;
2368 xmm2 += A.load(i+1UL,j) * x1;
2369 }
2370
2371 y[i ] -= sum( xmm1 );
2372 y[i+1UL] -= sum( xmm2 );
2373
2374 for( ; remainder && j<jend; ++j ) {
2375 y[i ] -= A(i ,j) * x[j];
2376 y[i+1UL] -= A(i+1UL,j) * x[j];
2377 }
2378 }
2379 else
2380 {
2381 ElementType value1( A(i ,j) * x[j] );
2382 ElementType value2( A(i+1UL,j) * x[j] );
2383
2384 for( ++j; j<jend; ++j ) {
2385 value1 += A(i ,j) * x[j];
2386 value2 += A(i+1UL,j) * x[j];
2387 }
2388
2389 y[i ] -= value1;
2390 y[i+1UL] -= value2;
2391 }
2392 }
2393
2394 if( i < M )
2395 {
2396 const size_t jbegin( ( IsUpper_v<MT1> )
2397 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
2398 :( 0UL ) );
2399 const size_t jend( ( IsLower_v<MT1> )
2400 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
2401 :( N ) );
2402 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
2403
2404 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
2405 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
2406
2407 size_t j( jbegin );
2408
2409 if( j < jpos )
2410 {
2411 SIMDType x1( x.load(j) );
2412 SIMDType xmm1( A.load(i,j) * x1 );
2413
2414 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
2415 xmm1 += A.load(i,j) * x.load(j);
2416 }
2417
2418 y[i] -= sum( xmm1 );
2419
2420 for( ; remainder && j<jend; ++j ) {
2421 y[i] -= A(i,j) * x[j];
2422 }
2423 }
2424 else
2425 {
2426 ElementType value( A(i,j) * x[j] );
2427
2428 for( ++j; j<jend; ++j ) {
2429 value += A(i,j) * x[j];
2430 }
2431
2432 y[i] -= value;
2433 }
2434 }
2435 }
2437 //**********************************************************************************************
2438
2439 //**Default subtraction assignment to dense vectors (large matrices)****************************
2453 template< typename VT1 // Type of the left-hand side target vector
2454 , typename MT1 // Type of the left-hand side matrix operand
2455 , typename VT2 > // Type of the right-hand side vector operand
2456 static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2457 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
2458 {
2459 selectDefaultSubAssignKernel( y, A, x );
2460 }
2462 //**********************************************************************************************
2463
2464 //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
/*!\brief Vectorized default kernel for the subtraction assignment of a large dense
// matrix-dense vector multiplication (\f$ \vec{y}-=A*\vec{x} \f$).
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \return void
//
// Rows of A are processed in batches of eight, four, and two, plus a single-row tail.
// Within each row batch the columns are traversed with SIMD loads in strides of
// 4*SIMDSIZE, 2*SIMDSIZE, and SIMDSIZE; each partial dot product is horizontally
// reduced via sum() and subtracted from the corresponding element of y. For (strictly)
// upper/lower triangular matrices the column range [jbegin,jend) is narrowed to the
// structurally non-zero band of the current row batch.
*/
2478 template< typename VT1 // Type of the left-hand side target vector
2479 , typename MT1 // Type of the left-hand side matrix operand
2480 , typename VT2 > // Type of the right-hand side vector operand
2481 static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2482 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
2483 {
// A scalar remainder loop is only needed when either operand lacks SIMD padding.
2484 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
2485
2486 const size_t M( A.rows() );
2487 const size_t N( A.columns() );
2488
2489 size_t i( 0UL );
2490
// Process the rows of A in batches of eight.
2491 for( ; (i+8UL) <= M; i+=8UL )
2492 {
// Restrict [jbegin,jend) to the non-zero column band of the current eight rows.
2493 const size_t jbegin( ( IsUpper_v<MT1> )
2494 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
2495 :( 0UL ) );
2496 const size_t jend( ( IsLower_v<MT1> )
2497 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
2498 :( N ) );
2499 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
2500
// jpos marks the end of the SIMD-processable column range.
2501 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
2502 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
2503
2504 size_t j( jbegin );
2505
// 4-wide SIMD column stride.
2506 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2507 const size_t j1( j+SIMDSIZE );
2508 const size_t j2( j+SIMDSIZE*2UL );
2509 const size_t j3( j+SIMDSIZE*3UL );
2510 const SIMDType x1( x.load(j ) );
2511 const SIMDType x2( x.load(j1) );
2512 const SIMDType x3( x.load(j2) );
2513 const SIMDType x4( x.load(j3) );
2514 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2515 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2516 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2517 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2518 y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2519 y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2520 y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2521 y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
2522 }
2523
// 2-wide SIMD column stride.
2524 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2525 const size_t j1( j+SIMDSIZE );
2526 const SIMDType x1( x.load(j ) );
2527 const SIMDType x2( x.load(j1) );
2528 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2529 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2530 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2531 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2532 y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2533 y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2534 y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2535 y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
2536 }
2537
// Single SIMD column stride.
2538 for( ; j<jpos; j+=SIMDSIZE ) {
2539 const SIMDType x1( x.load(j) );
2540 y[i ] -= sum( A.load(i ,j) * x1 );
2541 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2542 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2543 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2544 y[i+4UL] -= sum( A.load(i+4UL,j) * x1 );
2545 y[i+5UL] -= sum( A.load(i+5UL,j) * x1 );
2546 y[i+6UL] -= sum( A.load(i+6UL,j) * x1 );
2547 y[i+7UL] -= sum( A.load(i+7UL,j) * x1 );
2548 }
2549
// Scalar tail for unpadded operands.
2550 for( ; remainder && j<jend; ++j ) {
2551 y[i ] -= A(i ,j) * x[j];
2552 y[i+1UL] -= A(i+1UL,j) * x[j];
2553 y[i+2UL] -= A(i+2UL,j) * x[j];
2554 y[i+3UL] -= A(i+3UL,j) * x[j];
2555 y[i+4UL] -= A(i+4UL,j) * x[j];
2556 y[i+5UL] -= A(i+5UL,j) * x[j];
2557 y[i+6UL] -= A(i+6UL,j) * x[j];
2558 y[i+7UL] -= A(i+7UL,j) * x[j];
2559 }
2560 }
2561
// Process remaining rows in batches of four.
2562 for( ; (i+4UL) <= M; i+=4UL )
2563 {
2564 const size_t jbegin( ( IsUpper_v<MT1> )
2565 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
2566 :( 0UL ) );
2567 const size_t jend( ( IsLower_v<MT1> )
2568 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
2569 :( N ) );
2570 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
2571
2572 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
2573 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
2574
2575 size_t j( jbegin );
2576
2577 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2578 const size_t j1( j+SIMDSIZE );
2579 const size_t j2( j+SIMDSIZE*2UL );
2580 const size_t j3( j+SIMDSIZE*3UL );
2581 const SIMDType x1( x.load(j ) );
2582 const SIMDType x2( x.load(j1) );
2583 const SIMDType x3( x.load(j2) );
2584 const SIMDType x4( x.load(j3) );
2585 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2586 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2587 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2588 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2589 }
2590
2591 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2592 const size_t j1( j+SIMDSIZE );
2593 const SIMDType x1( x.load(j ) );
2594 const SIMDType x2( x.load(j1) );
2595 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2596 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2597 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2598 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2599 }
2600
2601 for( ; j<jpos; j+=SIMDSIZE ) {
2602 const SIMDType x1( x.load(j) );
2603 y[i ] -= sum( A.load(i ,j) * x1 );
2604 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2605 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2606 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2607 }
2608
2609 for( ; remainder && j<jend; ++j ) {
2610 y[i ] -= A(i ,j) * x[j];
2611 y[i+1UL] -= A(i+1UL,j) * x[j];
2612 y[i+2UL] -= A(i+2UL,j) * x[j];
2613 y[i+3UL] -= A(i+3UL,j) * x[j];
2614 }
2615 }
2616
// Process remaining rows in batches of two.
2617 for( ; (i+2UL) <= M; i+=2UL )
2618 {
2619 const size_t jbegin( ( IsUpper_v<MT1> )
2620 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
2621 :( 0UL ) );
2622 const size_t jend( ( IsLower_v<MT1> )
2623 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
2624 :( N ) );
2625 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
2626
2627 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
2628 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
2629
2630 size_t j( jbegin );
2631
2632 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2633 const size_t j1( j+SIMDSIZE );
2634 const size_t j2( j+SIMDSIZE*2UL );
2635 const size_t j3( j+SIMDSIZE*3UL );
2636 const SIMDType x1( x.load(j ) );
2637 const SIMDType x2( x.load(j1) );
2638 const SIMDType x3( x.load(j2) );
2639 const SIMDType x4( x.load(j3) );
2640 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2641 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2642 }
2643
2644 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2645 const size_t j1( j+SIMDSIZE );
2646 const SIMDType x1( x.load(j ) );
2647 const SIMDType x2( x.load(j1) );
2648 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2649 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2650 }
2651
2652 for( ; j<jpos; j+=SIMDSIZE ) {
2653 const SIMDType x1( x.load(j) );
2654 y[i ] -= sum( A.load(i ,j) * x1 );
2655 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2656 }
2657
2658 for( ; remainder && j<jend; ++j ) {
2659 y[i ] -= A(i ,j) * x[j];
2660 y[i+1UL] -= A(i+1UL,j) * x[j];
2661 }
2662 }
2663
// Single remaining row, if any.
2664 if( i < M )
2665 {
2666 const size_t jbegin( ( IsUpper_v<MT1> )
2667 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
2668 :( 0UL ) );
2669 const size_t jend( ( IsLower_v<MT1> )
2670 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
2671 :( N ) );
2672 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
2673
2674 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
2675 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
2676
2677 size_t j( jbegin );
2678
2679 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2680 const size_t j1( j+SIMDSIZE );
2681 const size_t j2( j+SIMDSIZE*2UL );
2682 const size_t j3( j+SIMDSIZE*3UL );
2683 const SIMDType x1( x.load(j ) );
2684 const SIMDType x2( x.load(j1) );
2685 const SIMDType x3( x.load(j2) );
2686 const SIMDType x4( x.load(j3) );
2687 y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2688 }
2689
2690 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2691 const size_t j1( j+SIMDSIZE );
2692 const SIMDType x1( x.load(j ) );
2693 const SIMDType x2( x.load(j1) );
2694 y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2695 }
2696
2697 for( ; j<jpos; j+=SIMDSIZE ) {
2698 const SIMDType x1( x.load(j) );
2699 y[i] -= sum( A.load(i,j) * x1 );
2700 }
2701
2702 for( ; remainder && j<jend; ++j ) {
2703 y[i] -= A(i,j) * x[j];
2704 }
2705 }
2706 }
2708 //**********************************************************************************************
2709
2710 //**BLAS-based subtraction assignment to dense vectors (default)********************************
2724 template< typename VT1 // Type of the left-hand side target vector
2725 , typename MT1 // Type of the left-hand side matrix operand
2726 , typename VT2 > // Type of the right-hand side vector operand
2727 static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2728 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
2729 {
2730 selectLargeSubAssignKernel( y, A, x );
2731 }
2733 //**********************************************************************************************
2734
2735 //**BLAS-based subtraction assignment to dense vectors******************************************
2736#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
/*!\brief BLAS-based subtraction assignment kernel (\f$ \vec{y}-=A*\vec{x} \f$).
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \return void
//
// Triangular matrices are handled via trmv on a copy of x (the product is formed in
// place in the temporary), which is then subtracted from y. All other matrices use a
// single gemv call; given the arguments ET(-1) and ET(1), this presumably maps to the
// BLAS convention y = (-1)*A*x + (1)*y — confirm against the gemv wrapper signature.
*/
2750 template< typename VT1 // Type of the left-hand side target vector
2751 , typename MT1 // Type of the left-hand side matrix operand
2752 , typename VT2 > // Type of the right-hand side vector operand
2753 static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2754 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
2755 {
2756 using ET = ElementType_t<VT1>;
2757
2758 if( IsTriangular_v<MT1> ) {
// Copy x, multiply it in place by the triangular matrix, then subtract the result.
2759 ResultType_t<VT1> tmp( serial( x ) );
2760 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2761 subAssign( y, tmp );
2762 }
2763 else {
2764 gemv( y, A, x, ET(-1), ET(1) );
2765 }
2766 }
2768#endif
2769 //**********************************************************************************************
2770
2771 //**Subtraction assignment to sparse vectors****************************************************
2772 // No special implementation for the subtraction assignment to sparse vectors.
2773 //**********************************************************************************************
2774
2775 //**Multiplication assignment to dense vectors**************************************************
2788 template< typename VT1 > // Type of the target dense vector
2789 friend inline void multAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2790 {
2792
2796
2797 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2798
2799 const ResultType tmp( serial( rhs ) );
2800 multAssign( *lhs, tmp );
2801 }
2803 //**********************************************************************************************
2804
2805 //**Multiplication assignment to sparse vectors*************************************************
2806 // No special implementation for the multiplication assignment to sparse vectors.
2807 //**********************************************************************************************
2808
2809 //**Division assignment to dense vectors********************************************************
2822 template< typename VT1 > // Type of the target dense vector
2823 friend inline void divAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2824 {
2826
2830
2831 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2832
2833 const ResultType tmp( serial( rhs ) );
2834 divAssign( *lhs, tmp );
2835 }
2837 //**********************************************************************************************
2838
2839 //**Division assignment to sparse vectors*******************************************************
2840 // No special implementation for the division assignment to sparse vectors.
2841 //**********************************************************************************************
2842
2843 //**SMP assignment to dense vectors*************************************************************
2858 template< typename VT1 > // Type of the target dense vector
2859 friend inline auto smpAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2860 -> EnableIf_t< UseSMPAssign_v<VT1> >
2861 {
2863
2864 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2865
2866 if( rhs.mat_.rows() == 0UL ) {
2867 return;
2868 }
2869 else if( rhs.mat_.columns() == 0UL ||
2870 ( IsStrictlyTriangular_v<MT> && rhs.mat_.columns() == 1UL ) ) {
2871 reset( *lhs );
2872 return;
2873 }
2874
2875 LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2876 RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2877
2878 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2879 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2880 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2881 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
2882
2883 smpAssign( *lhs, A * x );
2884 }
2886 //**********************************************************************************************
2887
2888 //**SMP assignment to sparse vectors************************************************************
2903 template< typename VT1 > // Type of the target sparse vector
2904 friend inline auto smpAssign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2905 -> EnableIf_t< UseSMPAssign_v<VT1> >
2906 {
2908
2912
2913 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2914
2915 const ResultType tmp( rhs );
2916 smpAssign( *lhs, tmp );
2917 }
2919 //**********************************************************************************************
2920
2921 //**SMP addition assignment to dense vectors****************************************************
2936 template< typename VT1 > // Type of the target dense vector
2937 friend inline auto smpAddAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2938 -> EnableIf_t< UseSMPAssign_v<VT1> >
2939 {
2941
2942 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2943
2944 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2945 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2946 return;
2947 }
2948
2949 LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2950 RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2951
2952 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2953 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2954 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2955 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
2956
2957 smpAddAssign( *lhs, A * x );
2958 }
2960 //**********************************************************************************************
2961
2962 //**SMP addition assignment to sparse vectors***************************************************
2963 // No special implementation for the SMP addition assignment to sparse vectors.
2964 //**********************************************************************************************
2965
2966 //**SMP subtraction assignment to dense vectors*************************************************
2981 template< typename VT1 > // Type of the target dense vector
2982 friend inline auto smpSubAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2983 -> EnableIf_t< UseSMPAssign_v<VT1> >
2984 {
2986
2987 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
2988
2989 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2990 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2991 return;
2992 }
2993
2994 LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2995 RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2996
2997 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2998 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2999 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
3000 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
3001
3002 smpSubAssign( *lhs, A * x );
3003 }
3005 //**********************************************************************************************
3006
3007 //**SMP subtraction assignment to sparse vectors************************************************
3008 // No special implementation for the SMP subtraction assignment to sparse vectors.
3009 //**********************************************************************************************
3010
3011 //**SMP multiplication assignment to dense vectors**********************************************
3026 template< typename VT1 > // Type of the target dense vector
3027 friend inline auto smpMultAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
3028 -> EnableIf_t< UseSMPAssign_v<VT1> >
3029 {
3031
3035
3036 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
3037
3038 const ResultType tmp( rhs );
3039 smpMultAssign( *lhs, tmp );
3040 }
3042 //**********************************************************************************************
3043
3044 //**SMP multiplication assignment to sparse vectors*********************************************
3045 // No special implementation for the SMP multiplication assignment to sparse vectors.
3046 //**********************************************************************************************
3047
3048 //**SMP division assignment to dense vectors****************************************************
3063 template< typename VT1 > // Type of the target dense vector
3064 friend inline auto smpDivAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
3065 -> EnableIf_t< UseSMPAssign_v<VT1> >
3066 {
3068
3072
3073 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
3074
3075 const ResultType tmp( rhs );
3076 smpDivAssign( *lhs, tmp );
3077 }
3079 //**********************************************************************************************
3080
3081 //**SMP division assignment to sparse vectors***************************************************
3082 // No special implementation for the SMP division assignment to sparse vectors.
3083 //**********************************************************************************************
3084
3085 //**Compile time checks*************************************************************************
3093 //**********************************************************************************************
3094};
3095//*************************************************************************************************
3096
3097
3098
3099
3100//=================================================================================================
3101//
3102// DVECSCALARMULTEXPR SPECIALIZATION
3103//
3104//=================================================================================================
3105
3106//*************************************************************************************************
3114template< typename MT // Type of the left-hand side dense matrix
3115 , typename VT // Type of the right-hand side dense vector
3116 , typename ST > // Type of the scalar value
3117class DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >
3118 : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false > >
3119 , private Computation
3120{
3121 private:
3122 //**Type definitions****************************************************************************
3123 using MVM = DMatDVecMultExpr<MT,VT>;
3124 using RES = ResultType_t<MVM>;
3125 using MRT = ResultType_t<MT>;
3126 using VRT = ResultType_t<VT>;
3127 using MET = ElementType_t<MRT>;
3128 using VET = ElementType_t<VRT>;
3129 using MCT = CompositeType_t<MT>;
3130 using VCT = CompositeType_t<VT>;
3131 //**********************************************************************************************
3132
3133 //**********************************************************************************************
3135 static constexpr bool evaluateMatrix =
3136 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
3137 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
3138 //**********************************************************************************************
3139
3140 //**********************************************************************************************
3142 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<MT> );
3143 //**********************************************************************************************
3144
3145 //**********************************************************************************************
3147
3150 template< typename T1 >
3151 static constexpr bool UseSMPAssign_v = ( evaluateMatrix || evaluateVector );
3152 //**********************************************************************************************
3153
3154 //**********************************************************************************************
3156
3158 template< typename T1, typename T2, typename T3, typename T4 >
3159 static constexpr bool UseBlasKernel_v =
3161 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
3162 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
3163 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
3164 !IsDiagonal_v<T2> &&
3165 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
3166 IsBLASCompatible_v< ElementType_t<T1> > &&
3167 IsBLASCompatible_v< ElementType_t<T2> > &&
3168 IsBLASCompatible_v< ElementType_t<T3> > &&
3169 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
3170 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
3171 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
3172 //**********************************************************************************************
3173
3174 //**********************************************************************************************
3176
3179 template< typename T1, typename T2, typename T3, typename T4 >
3180 static constexpr bool UseVectorizedDefaultKernel_v =
3181 ( useOptimizedKernels &&
3182 !IsDiagonal_v<T2> &&
3183 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
3184 IsSIMDCombinable_v< ElementType_t<T1>
3185 , ElementType_t<T2>
3186 , ElementType_t<T3>
3187 , T4 > &&
3188 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
3189 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
3190 //**********************************************************************************************
3191
3192 public:
3193 //**Type definitions****************************************************************************
// Concrete type of this expression specialization (matrix/vector product times scalar).
3195 using This = DVecScalarMultExpr<MVM,ST,false>;
3196
// Base class of this expression template.
3198 using BaseType = VecScalarMultExpr< DenseVector<This,false> >;
3199
// Result/element/SIMD types are derived from the product of the inner result type and ST.
3200 using ResultType = MultTrait_t<RES,ST>;
3201 using TransposeType = TransposeType_t<ResultType>;
3202 using ElementType = ElementType_t<ResultType>;
3203 using SIMDType = SIMDTrait_t<ElementType>;
3204 using ReturnType = const ElementType;
3205 using CompositeType = const ResultType;
3206
// Left-hand side operand: the wrapped dense matrix/dense vector multiplication expression.
3208 using LeftOperand = const DMatDVecMultExpr<MT,VT>;
3209
// Right-hand side operand: the scalar factor, stored by value.
3211 using RightOperand = ST;
3212
// Composite type of the matrix operand: evaluated copy (MRT) or lightweight reference (MCT).
3214 using LT = If_t< evaluateMatrix, const MRT, MCT >;
3215
// Composite type of the vector operand: evaluated copy (VRT) or lightweight reference (VCT).
3217 using RT = If_t< evaluateVector, const VRT, VCT >;
3218 //**********************************************************************************************
3219
3220 //**Compilation flags***************************************************************************
// SIMD evaluation is possible only for non-diagonal matrices whose operands all support
// SIMD and whose element types can be combined with the scalar via SIMD add/mult.
3222 static constexpr bool simdEnabled =
3223 ( !IsDiagonal_v<MT> &&
3224 MT::simdEnabled && VT::simdEnabled &&
3225 IsSIMDCombinable_v<MET,VET,ST> &&
3226 HasSIMDAdd_v<MET,VET> &&
3227 HasSIMDMult_v<MET,VET> );
3228
// SMP assignment is only possible when neither operand requires up-front evaluation.
3230 static constexpr bool smpAssignable =
3231 ( !evaluateMatrix && MT::smpAssignable && !evaluateVector && VT::smpAssignable );
3232 //**********************************************************************************************
3233
3234 //**SIMD properties*****************************************************************************
// Number of elements packed into one SIMD register of ElementType.
3236 static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
3237 //**********************************************************************************************
3238
3239 //**Constructor*********************************************************************************
// Constructor: stores the matrix/vector multiplication expression and the scalar factor.
// Both members are cheap handles/values; no evaluation happens here.
3245 inline DVecScalarMultExpr( const MVM& vector, ST scalar )
3246 : vector_( vector ) // Left-hand side dense vector of the multiplication expression
3247 , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
3248 {}
3249 //**********************************************************************************************
3250
3251 //**Subscript operator**************************************************************************
// Unchecked element access (debug-asserted only): evaluates one element of the wrapped
// matrix/vector product on the fly and scales it by the stored scalar.
3257 inline ReturnType operator[]( size_t index ) const {
3258 BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
3259 return vector_[index] * scalar_;
3260 }
3261 //**********************************************************************************************
3262
3263 //**At function*********************************************************************************
// Checked element access: throws std::out_of_range for an invalid index, otherwise
// delegates to the (asserted) subscript operator.
3270 inline ReturnType at( size_t index ) const {
3271 if( index >= vector_.size() ) {
3272 BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
3273 }
3274 return (*this)[index];
3275 }
3276 //**********************************************************************************************
3277
3278 //**Size function*******************************************************************************
// Returns the size of the result vector, i.e. the size of the wrapped product expression.
3283 inline size_t size() const {
3284 return vector_.size();
3285 }
3286 //**********************************************************************************************
3287
3288 //**Left operand access*************************************************************************
// Returns the left-hand side operand: the wrapped matrix/vector multiplication expression.
3293 inline LeftOperand leftOperand() const {
3294 return vector_;
3295 }
3296 //**********************************************************************************************
3297
3298 //**Right operand access************************************************************************
// Returns the right-hand side operand: the scalar factor (by value).
3303 inline RightOperand rightOperand() const {
3304 return scalar_;
3305 }
3306 //**********************************************************************************************
3307
3308 //**********************************************************************************************
// Returns whether this expression can potentially alias with the given address;
// forwarded to the wrapped multiplication expression.
3314 template< typename T >
3315 inline bool canAlias( const T* alias ) const {
3316 return vector_.canAlias( alias );
3317 }
3318 //**********************************************************************************************
3319
3320 //**********************************************************************************************
// Returns whether this expression is currently aliased with the given address;
// forwarded to the wrapped multiplication expression.
3326 template< typename T >
3327 inline bool isAliased( const T* alias ) const {
3328 return vector_.isAliased( alias );
3329 }
3330 //**********************************************************************************************
3331
3332 //**********************************************************************************************
// Returns whether the operands of the wrapped expression are properly aligned in memory.
3337 inline bool isAligned() const {
3338 return vector_.isAligned();
3339 }
3340 //**********************************************************************************************
3341
3342 //**********************************************************************************************
// Returns whether this expression should be evaluated by multiple threads (SMP):
// only when the problem is large enough (> SMP threshold) and a BLAS kernel would
// not be chosen anyway (BLAS disabled, matrix is a computation that is not evaluated,
// or the matrix is below the serial BLAS threshold).
3347 inline bool canSMPAssign() const noexcept {
3348 LeftOperand_t<MVM> A( vector_.leftOperand() );
3349 return ( !BLAZE_BLAS_MODE ||
3352 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3353 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
3354 ( size() > SMP_DMATDVECMULT_THRESHOLD );
3355 }
3356 //**********************************************************************************************
3357
3358 private:
3359 //**Member variables****************************************************************************
3362 //**********************************************************************************************
3363
3364 //**Assignment to dense vectors*****************************************************************
// Assigns the scaled matrix/vector product to a dense vector: y = (A * x) * scalar.
// Handles the degenerate cases (no rows: nothing to write; no columns or a 1x1 strictly
// triangular matrix: result is zero), serially evaluates both operands, and dispatches
// to the kernel-selection routine.
3376 template< typename VT1 > // Type of the target dense vector
3377 friend inline void assign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3378 {
3380
3381 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
3382
3383 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
3384 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
3385
// No rows: the target has size 0, nothing to assign.
3386 if( left.rows() == 0UL ) {
3387 return;
3388 }
// No columns (or strictly triangular 1x1, whose only element is zero): result is all zeros.
3389 else if( left.columns() == 0UL ||
3390 ( IsStrictlyTriangular_v<MT> && left.columns() == 1UL ) ) {
3391 reset( *lhs );
3392 return;
3393 }
3394
3395 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3396 RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3397
3398 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3399 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3400 BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3401 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
3402
3403 DVecScalarMultExpr::selectAssignKernel( *lhs, A, x, rhs.scalar_ );
3404 }
3405 //**********************************************************************************************
3406
3407 //**Assignment to dense vectors (kernel selection)**********************************************
// Kernel dispatch for plain assignment: small/default kernel for diagonal matrices,
// unevaluated computation expressions, or matrices below the BLAS threshold;
// otherwise the (potentially BLAS-backed) large kernel.
3418 template< typename VT1 // Type of the left-hand side target vector
3419 , typename MT1 // Type of the left-hand side matrix operand
3420 , typename VT2 // Type of the right-hand side vector operand
3421 , typename ST2 > // Type of the scalar value
3422 static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3423 {
3424 if( ( IsDiagonal_v<MT1> ) ||
3425 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3426 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3427 selectSmallAssignKernel( y, A, x, scalar );
3428 else
3429 selectBlasAssignKernel( y, A, x, scalar );
3430 }
3431 //**********************************************************************************************
3432
3433 //**Default assignment to dense vectors*********************************************************
// Scalar fallback kernel (selected when the vectorized default kernel is unavailable):
// delegates to the target vector's own assign of the composite expression A * x * scalar.
3447 template< typename VT1 // Type of the left-hand side target vector
3448 , typename MT1 // Type of the left-hand side matrix operand
3449 , typename VT2 // Type of the right-hand side vector operand
3450 , typename ST2 > // Type of the scalar value
3451 static inline auto selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3452 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3453 {
3454 y.assign( A * x * scalar );
3455 }
3456 //**********************************************************************************************
3457
3458 //**Default assignment to dense vectors (small matrices)****************************************
// Small-matrix kernel, non-vectorized overload: simply forwards to the default kernel.
3472 template< typename VT1 // Type of the left-hand side target vector
3473 , typename MT1 // Type of the left-hand side matrix operand
3474 , typename VT2 // Type of the right-hand side vector operand
3475 , typename ST2 > // Type of the scalar value
3476 static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3477 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3478 {
3479 selectDefaultAssignKernel( y, A, x, scalar );
3480 }
3481 //**********************************************************************************************
3482
3483 //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Vectorized small-matrix kernel: y = (A * x) * scalar. Rows are processed in unrolled
// groups of 8, 4, 3, 2 and finally 1; each group accumulates SIMD dot products over the
// columns. For triangular matrices jbegin/jend restrict the column range to the non-zero
// part of each row (jbegin rounded down to a SIMD boundary). When the operands are not
// padded ("remainder" is true), a scalar tail loop handles the columns past the last full
// SIMD chunk. If the SIMD-aligned range is empty (j >= jpos), a purely scalar accumulation
// path is used for the whole row group.
3497 template< typename VT1 // Type of the left-hand side target vector
3498 , typename MT1 // Type of the left-hand side matrix operand
3499 , typename VT2 // Type of the right-hand side vector operand
3500 , typename ST2 > // Type of the scalar value
3501 static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3502 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3503 {
// Scalar remainder handling is needed whenever either operand lacks padding.
3504 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
3505
3506 const size_t M( A.rows() );
3507 const size_t N( A.columns() );
3508
3509 size_t i( 0UL );
3510
// 8-row unrolled tier.
3511 for( ; (i+8UL) <= M; i+=8UL )
3512 {
3513 const size_t jbegin( ( IsUpper_v<MT1> )
3514 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
3515 :( 0UL ) );
3516 const size_t jend( ( IsLower_v<MT1> )
3517 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
3518 :( N ) );
3519 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
3520
3521 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
3522 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
3523
3524 size_t j( jbegin );
3525
// SIMD accumulation path: one accumulator register per row of the group.
3526 if( j < jpos )
3527 {
3528 SIMDType x1( x.load(j) );
3529 SIMDType xmm1( A.load(i ,j) * x1 );
3530 SIMDType xmm2( A.load(i+1UL,j) * x1 );
3531 SIMDType xmm3( A.load(i+2UL,j) * x1 );
3532 SIMDType xmm4( A.load(i+3UL,j) * x1 );
3533 SIMDType xmm5( A.load(i+4UL,j) * x1 );
3534 SIMDType xmm6( A.load(i+5UL,j) * x1 );
3535 SIMDType xmm7( A.load(i+6UL,j) * x1 );
3536 SIMDType xmm8( A.load(i+7UL,j) * x1 );
3537
3538 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
3539 x1 = x.load(j);
3540 xmm1 += A.load(i ,j) * x1;
3541 xmm2 += A.load(i+1UL,j) * x1;
3542 xmm3 += A.load(i+2UL,j) * x1;
3543 xmm4 += A.load(i+3UL,j) * x1;
3544 xmm5 += A.load(i+4UL,j) * x1;
3545 xmm6 += A.load(i+5UL,j) * x1;
3546 xmm7 += A.load(i+6UL,j) * x1;
3547 xmm8 += A.load(i+7UL,j) * x1;
3548 }
3549
// Horizontal reduction and scaling of each accumulator.
3550 y[i ] = sum( xmm1 ) * scalar;
3551 y[i+1UL] = sum( xmm2 ) * scalar;
3552 y[i+2UL] = sum( xmm3 ) * scalar;
3553 y[i+3UL] = sum( xmm4 ) * scalar;
3554 y[i+4UL] = sum( xmm5 ) * scalar;
3555 y[i+5UL] = sum( xmm6 ) * scalar;
3556 y[i+6UL] = sum( xmm7 ) * scalar;
3557 y[i+7UL] = sum( xmm8 ) * scalar;
3558
// Scalar tail for unpadded operands.
3559 for( ; remainder && j<jend; ++j ) {
3560 y[i ] += A(i ,j) * x[j] * scalar;
3561 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3562 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3563 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3564 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3565 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3566 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3567 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3568 }
3569 }
// Fully scalar path when no complete SIMD chunk exists in [jbegin,jend).
3570 else
3571 {
3572 ElementType value1( A(i ,j) * x[j] );
3573 ElementType value2( A(i+1UL,j) * x[j] );
3574 ElementType value3( A(i+2UL,j) * x[j] );
3575 ElementType value4( A(i+3UL,j) * x[j] );
3576 ElementType value5( A(i+4UL,j) * x[j] );
3577 ElementType value6( A(i+5UL,j) * x[j] );
3578 ElementType value7( A(i+6UL,j) * x[j] );
3579 ElementType value8( A(i+7UL,j) * x[j] );
3580
3581 for( ++j; j<jend; ++j ) {
3582 value1 += A(i ,j) * x[j];
3583 value2 += A(i+1UL,j) * x[j];
3584 value3 += A(i+2UL,j) * x[j];
3585 value4 += A(i+3UL,j) * x[j];
3586 value5 += A(i+4UL,j) * x[j];
3587 value6 += A(i+5UL,j) * x[j];
3588 value7 += A(i+6UL,j) * x[j];
3589 value8 += A(i+7UL,j) * x[j];
3590 }
3591
3592 y[i ] = value1 * scalar;
3593 y[i+1UL] = value2 * scalar;
3594 y[i+2UL] = value3 * scalar;
3595 y[i+3UL] = value4 * scalar;
3596 y[i+4UL] = value5 * scalar;
3597 y[i+5UL] = value6 * scalar;
3598 y[i+6UL] = value7 * scalar;
3599 y[i+7UL] = value8 * scalar;
3600 }
3601 }
3602
// 4-row unrolled tier (same structure as above).
3603 for( ; (i+4UL) <= M; i+=4UL )
3604 {
3605 const size_t jbegin( ( IsUpper_v<MT1> )
3606 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
3607 :( 0UL ) );
3608 const size_t jend( ( IsLower_v<MT1> )
3609 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
3610 :( N ) );
3611 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
3612
3613 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
3614 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
3615
3616 size_t j( jbegin );
3617
3618 if( j < jpos )
3619 {
3620 SIMDType x1( x.load(j) );
3621 SIMDType xmm1( A.load(i ,j) * x1 );
3622 SIMDType xmm2( A.load(i+1UL,j) * x1 );
3623 SIMDType xmm3( A.load(i+2UL,j) * x1 );
3624 SIMDType xmm4( A.load(i+3UL,j) * x1 );
3625
3626 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
3627 x1 = x.load(j);
3628 xmm1 += A.load(i ,j) * x1;
3629 xmm2 += A.load(i+1UL,j) * x1;
3630 xmm3 += A.load(i+2UL,j) * x1;
3631 xmm4 += A.load(i+3UL,j) * x1;
3632 }
3633
3634 y[i ] = sum( xmm1 ) * scalar;
3635 y[i+1UL] = sum( xmm2 ) * scalar;
3636 y[i+2UL] = sum( xmm3 ) * scalar;
3637 y[i+3UL] = sum( xmm4 ) * scalar;
3638
3639 for( ; remainder && j<jend; ++j ) {
3640 y[i ] += A(i ,j) * x[j] * scalar;
3641 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3642 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3643 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3644 }
3645 }
3646 else
3647 {
3648 ElementType value1( A(i ,j) * x[j] );
3649 ElementType value2( A(i+1UL,j) * x[j] );
3650 ElementType value3( A(i+2UL,j) * x[j] );
3651 ElementType value4( A(i+3UL,j) * x[j] );
3652
3653 for( ++j; j<jend; ++j ) {
3654 value1 += A(i ,j) * x[j];
3655 value2 += A(i+1UL,j) * x[j];
3656 value3 += A(i+2UL,j) * x[j];
3657 value4 += A(i+3UL,j) * x[j];
3658 }
3659
3660 y[i ] = value1 * scalar;
3661 y[i+1UL] = value2 * scalar;
3662 y[i+2UL] = value3 * scalar;
3663 y[i+3UL] = value4 * scalar;
3664 }
3665 }
3666
// 3-row unrolled tier.
3667 for( ; (i+3UL) <= M; i+=3UL )
3668 {
3669 const size_t jbegin( ( IsUpper_v<MT1> )
3670 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
3671 :( 0UL ) );
3672 const size_t jend( ( IsLower_v<MT1> )
3673 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
3674 :( N ) );
3675 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
3676
3677 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
3678 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
3679
3680 size_t j( jbegin );
3681
3682 if( j < jpos )
3683 {
3684 SIMDType x1( x.load(j) );
3685 SIMDType xmm1( A.load(i ,j) * x1 );
3686 SIMDType xmm2( A.load(i+1UL,j) * x1 );
3687 SIMDType xmm3( A.load(i+2UL,j) * x1 );
3688
3689 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
3690 x1 = x.load(j);
3691 xmm1 += A.load(i ,j) * x1;
3692 xmm2 += A.load(i+1UL,j) * x1;
3693 xmm3 += A.load(i+2UL,j) * x1;
3694 }
3695
3696 y[i ] = sum( xmm1 ) * scalar;
3697 y[i+1UL] = sum( xmm2 ) * scalar;
3698 y[i+2UL] = sum( xmm3 ) * scalar;
3699
3700 for( ; remainder && j<jend; ++j ) {
3701 y[i ] += A(i ,j) * x[j] * scalar;
3702 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3703 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3704 }
3705 }
3706 else
3707 {
3708 ElementType value1( A(i ,j) * x[j] );
3709 ElementType value2( A(i+1UL,j) * x[j] );
3710 ElementType value3( A(i+2UL,j) * x[j] );
3711
3712 for( ++j; j<jend; ++j ) {
3713 value1 += A(i ,j) * x[j];
3714 value2 += A(i+1UL,j) * x[j];
3715 value3 += A(i+2UL,j) * x[j];
3716 }
3717
3718 y[i ] = value1 * scalar;
3719 y[i+1UL] = value2 * scalar;
3720 y[i+2UL] = value3 * scalar;
3721 }
3722 }
3723
// 2-row unrolled tier.
3724 for( ; (i+2UL) <= M; i+=2UL )
3725 {
3726 const size_t jbegin( ( IsUpper_v<MT1> )
3727 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
3728 :( 0UL ) );
3729 const size_t jend( ( IsLower_v<MT1> )
3730 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
3731 :( N ) );
3732 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
3733
3734 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
3735 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
3736
3737 size_t j( jbegin );
3738
3739 if( j < jpos )
3740 {
3741 SIMDType x1( x.load(j) );
3742 SIMDType xmm1( A.load(i ,j) * x1 );
3743 SIMDType xmm2( A.load(i+1UL,j) * x1 );
3744
3745 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
3746 x1 = x.load(j);
3747 xmm1 += A.load(i ,j) * x1;
3748 xmm2 += A.load(i+1UL,j) * x1;
3749 }
3750
3751 y[i ] = sum( xmm1 ) * scalar;
3752 y[i+1UL] = sum( xmm2 ) * scalar;
3753
3754 for( ; remainder && j<jend; ++j ) {
3755 y[i ] += A(i ,j) * x[j] * scalar;
3756 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3757 }
3758 }
3759 else
3760 {
3761 ElementType value1( A(i ,j) * x[j] );
3762 ElementType value2( A(i+1UL,j) * x[j] );
3763
3764 for( ++j; j<jend; ++j ) {
3765 value1 += A(i ,j) * x[j];
3766 value2 += A(i+1UL,j) * x[j];
3767 }
3768
3769 y[i ] = value1 * scalar;
3770 y[i+1UL] = value2 * scalar;
3771 }
3772 }
3773
// Final single remaining row (at most one, since the 2-row tier handled the rest).
3774 if( i < M )
3775 {
3776 const size_t jbegin( ( IsUpper_v<MT1> )
3777 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
3778 :( 0UL ) );
3779 const size_t jend( ( IsLower_v<MT1> )
3780 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
3781 :( N ) );
3782 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
3783
3784 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
3785 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
3786
3787 size_t j( jbegin );
3788
3789 if( j < jpos )
3790 {
3791 SIMDType xmm1( A.load(i,j) * x.load(j) );
3792
3793 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
3794 xmm1 += A.load(i,j) * x.load(j);
3795 }
3796
3797 y[i] = sum( xmm1 ) * scalar;
3798
3799 for( ; remainder && j<jend; ++j ) {
3800 y[i] += A(i,j) * x[j] * scalar;
3801 }
3802 }
3803 else
3804 {
3805 ElementType value( A(i,j) * x[j] );
3806
3807 for( ++j; j<jend; ++j ) {
3808 value += A(i,j) * x[j];
3809 }
3810
3811 y[i] = value * scalar;
3812 }
3813 }
3814 }
3815 //**********************************************************************************************
3816
3817 //**Default assignment to dense vectors (large matrices)****************************************
// Large-matrix kernel, non-vectorized overload: simply forwards to the default kernel.
3831 template< typename VT1 // Type of the left-hand side target vector
3832 , typename MT1 // Type of the left-hand side matrix operand
3833 , typename VT2 // Type of the right-hand side vector operand
3834 , typename ST2 > // Type of the scalar value
3835 static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3836 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3837 {
3838 selectDefaultAssignKernel( y, A, x, scalar );
3839 }
3840 //**********************************************************************************************
3841
3842 //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Vectorized large-matrix kernel: y = (A * x) * scalar. Unlike the small kernel, the
// target vector is reset up front and partial SIMD dot products are reduced (sum) and
// accumulated into y per column chunk (4x/2x/1x SIMD-wide), which is friendlier to the
// cache for large column counts; each row group is scaled by the scalar at the end.
// Row unrolling tiers: 8, 4, 2, then a final single row. jbegin/jend restrict the
// column range for triangular matrices; a scalar tail runs when operands are unpadded.
3856 template< typename VT1 // Type of the left-hand side target vector
3857 , typename MT1 // Type of the left-hand side matrix operand
3858 , typename VT2 // Type of the right-hand side vector operand
3859 , typename ST2 > // Type of the scalar value
3860 static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3861 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3862 {
3863 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
3864
3865 const size_t M( A.rows() );
3866 const size_t N( A.columns() );
3867
// Accumulation happens directly in y, so it must start from zero.
3868 reset( y );
3869
3870 size_t i( 0UL );
3871
// 8-row unrolled tier.
3872 for( ; (i+8UL) <= M; i+=8UL )
3873 {
3874 const size_t jbegin( ( IsUpper_v<MT1> )
3875 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
3876 :( 0UL ) );
3877 const size_t jend( ( IsLower_v<MT1> )
3878 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
3879 :( N ) );
3880 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
3881
3882 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
3883 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
3884
3885 size_t j( jbegin );
3886
// 4-SIMD-wide column chunks.
3887 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3888 const size_t j1( j+SIMDSIZE );
3889 const size_t j2( j+SIMDSIZE*2UL );
3890 const size_t j3( j+SIMDSIZE*3UL );
3891 const SIMDType x1( x.load(j ) );
3892 const SIMDType x2( x.load(j1) );
3893 const SIMDType x3( x.load(j2) );
3894 const SIMDType x4( x.load(j3) );
3895 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3896 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3897 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3898 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3899 y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3900 y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3901 y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3902 y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
3903 }
3904
// 2-SIMD-wide column chunks.
3905 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3906 const size_t j1( j+SIMDSIZE );
3907 const SIMDType x1( x.load(j ) );
3908 const SIMDType x2( x.load(j1) );
3909 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3910 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3911 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3912 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3913 y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3914 y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3915 y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3916 y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
3917 }
3918
// Single SIMD-wide column chunks.
3919 for( ; j<jpos; j+=SIMDSIZE ) {
3920 const SIMDType x1( x.load(j) );
3921 y[i ] += sum( A.load(i ,j) * x1 );
3922 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3923 y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3924 y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3925 y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
3926 y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
3927 y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
3928 y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
3929 }
3930
// Scalar tail for unpadded operands.
3931 for( ; remainder && j<jend; ++j ) {
3932 y[i ] += A(i ,j) * x[j];
3933 y[i+1UL] += A(i+1UL,j) * x[j];
3934 y[i+2UL] += A(i+2UL,j) * x[j];
3935 y[i+3UL] += A(i+3UL,j) * x[j];
3936 y[i+4UL] += A(i+4UL,j) * x[j];
3937 y[i+5UL] += A(i+5UL,j) * x[j];
3938 y[i+6UL] += A(i+6UL,j) * x[j];
3939 y[i+7UL] += A(i+7UL,j) * x[j];
3940 }
3941
// Deferred scaling of the accumulated row results.
3942 y[i ] *= scalar;
3943 y[i+1UL] *= scalar;
3944 y[i+2UL] *= scalar;
3945 y[i+3UL] *= scalar;
3946 y[i+4UL] *= scalar;
3947 y[i+5UL] *= scalar;
3948 y[i+6UL] *= scalar;
3949 y[i+7UL] *= scalar;
3950 }
3951
// 4-row unrolled tier (same structure).
3952 for( ; (i+4UL) <= M; i+=4UL )
3953 {
3954 const size_t jbegin( ( IsUpper_v<MT1> )
3955 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
3956 :( 0UL ) );
3957 const size_t jend( ( IsLower_v<MT1> )
3958 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
3959 :( N ) );
3960 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
3961
3962 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
3963 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
3964
3965 size_t j( jbegin );
3966
3967 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3968 const size_t j1( j+SIMDSIZE );
3969 const size_t j2( j+SIMDSIZE*2UL );
3970 const size_t j3( j+SIMDSIZE*3UL );
3971 const SIMDType x1( x.load(j ) );
3972 const SIMDType x2( x.load(j1) );
3973 const SIMDType x3( x.load(j2) );
3974 const SIMDType x4( x.load(j3) );
3975 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3976 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3977 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3978 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3979 }
3980
3981 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3982 const size_t j1( j+SIMDSIZE );
3983 const SIMDType x1( x.load(j ) );
3984 const SIMDType x2( x.load(j1) );
3985 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3986 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3987 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3988 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3989 }
3990
3991 for( ; j<jpos; j+=SIMDSIZE ) {
3992 const SIMDType x1( x.load(j) );
3993 y[i ] += sum( A.load(i ,j) * x1 );
3994 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3995 y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3996 y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3997 }
3998
3999 for( ; remainder && j<jend; ++j ) {
4000 y[i ] += A(i ,j) * x[j];
4001 y[i+1UL] += A(i+1UL,j) * x[j];
4002 y[i+2UL] += A(i+2UL,j) * x[j];
4003 y[i+3UL] += A(i+3UL,j) * x[j];
4004 }
4005
4006 y[i ] *= scalar;
4007 y[i+1UL] *= scalar;
4008 y[i+2UL] *= scalar;
4009 y[i+3UL] *= scalar;
4010 }
4011
// 2-row unrolled tier.
4012 for( ; (i+2UL) <= M; i+=2UL )
4013 {
4014 const size_t jbegin( ( IsUpper_v<MT1> )
4015 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
4016 :( 0UL ) );
4017 const size_t jend( ( IsLower_v<MT1> )
4018 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
4019 :( N ) );
4020 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
4021
4022 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
4023 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
4024
4025 size_t j( jbegin );
4026
4027 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4028 const size_t j1( j+SIMDSIZE );
4029 const size_t j2( j+SIMDSIZE*2UL );
4030 const size_t j3( j+SIMDSIZE*3UL );
4031 const SIMDType x1( x.load(j ) );
4032 const SIMDType x2( x.load(j1) );
4033 const SIMDType x3( x.load(j2) );
4034 const SIMDType x4( x.load(j3) );
4035 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
4036 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
4037 }
4038
4039 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4040 const size_t j1( j+SIMDSIZE );
4041 const SIMDType x1( x.load(j ) );
4042 const SIMDType x2( x.load(j1) );
4043 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
4044 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
4045 }
4046
4047 for( ; j<jpos; j+=SIMDSIZE ) {
4048 const SIMDType x1( x.load(j) );
4049 y[i ] += sum( A.load(i ,j) * x1 );
4050 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
4051 }
4052
4053 for( ; remainder && j<jend; ++j ) {
4054 y[i ] += A(i ,j) * x[j];
4055 y[i+1UL] += A(i+1UL,j) * x[j];
4056 }
4057
4058 y[i ] *= scalar;
4059 y[i+1UL] *= scalar;
4060 }
4061
// Final single remaining row.
4062 if( i < M )
4063 {
4064 const size_t jbegin( ( IsUpper_v<MT1> )
4065 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
4066 :( 0UL ) );
4067 const size_t jend( ( IsLower_v<MT1> )
4068 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
4069 :( N ) );
4070 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
4071
4072 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
4073 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
4074
4075 size_t j( jbegin );
4076
4077 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4078 const size_t j1( j+SIMDSIZE );
4079 const size_t j2( j+SIMDSIZE*2UL );
4080 const size_t j3( j+SIMDSIZE*3UL );
4081 const SIMDType x1( x.load(j ) );
4082 const SIMDType x2( x.load(j1) );
4083 const SIMDType x3( x.load(j2) );
4084 const SIMDType x4( x.load(j3) );
4085 y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
4086 }
4087
4088 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4089 const size_t j1( j+SIMDSIZE );
4090 const SIMDType x1( x.load(j ) );
4091 const SIMDType x2( x.load(j1) );
4092 y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
4093 }
4094
4095 for( ; j<jpos; j+=SIMDSIZE ) {
4096 const SIMDType x1( x.load(j) );
4097 y[i] += sum( A.load(i,j) * x1 );
4098 }
4099
4100 for( ; remainder && j<jend; ++j ) {
4101 y[i] += A(i,j) * x[j];
4102 }
4103
4104 y[i] *= scalar;
4105 }
4106 }
4107 //**********************************************************************************************
4108
4109 //**BLAS-based assignment to dense vectors (default)********************************************
4123 template< typename VT1 // Type of the left-hand side target vector
4124 , typename MT1 // Type of the left-hand side matrix operand
4125 , typename VT2 // Type of the right-hand side vector operand
4126 , typename ST2 > // Type of the scalar value
4127 static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4128 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4129 {
4130 selectLargeAssignKernel( y, A, x, scalar );
4131 }
4132 //**********************************************************************************************
4133
4134 //**BLAS-based assignment to dense vectors******************************************************
4135#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-backed kernel (compiled only when BLAS support is enabled). Triangular matrices
// use trmv after seeding y with the pre-scaled vector (trmv computes y := A*y in place);
// all other matrices use gemv with alpha = scalar and beta = 0.
4149 template< typename VT1 // Type of the left-hand side target vector
4150 , typename MT1 // Type of the left-hand side matrix operand
4151 , typename VT2 // Type of the right-hand side vector operand
4152 , typename ST2 > // Type of the scalar value
4153 static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4154 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4155 {
4156 using ET = ElementType_t<VT1>;
4157
4158 if( IsTriangular_v<MT1> ) {
4159 assign( y, scalar * x );
4160 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4161 }
4162 else {
4163 gemv( y, A, x, ET(scalar), ET(0) );
4164 }
4165 }
4166#endif
4167 //**********************************************************************************************
4168
4169 //**Assignment to sparse vectors****************************************************************
// Assigns the scaled matrix/vector product to a sparse vector: the dense result is
// materialized into a temporary first, then assigned to the sparse target.
4181 template< typename VT1 > // Type of the target sparse vector
4182 friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4183 {
4185
4189
4190 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4191
4192 const ResultType tmp( serial( rhs ) );
4193 assign( *lhs, tmp );
4194 }
4195 //**********************************************************************************************
4195 //**********************************************************************************************
4196
4197 //**Addition assignment to dense vectors********************************************************
// Addition assignment of the scaled matrix/vector product: y += (A * x) * scalar.
// If the product is structurally zero (empty matrix, or 1x1 strictly triangular —
// such matrices are square, so rows()==1 suffices), the target is left untouched.
4209 template< typename VT1 > // Type of the target dense vector
4210 friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4211 {
4213
4214 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
4215
4216 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4217 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
4218
4219 if( left.rows() == 0UL || left.columns() == 0UL ||
4220 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
4221 return;
4222 }
4223
4224 LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4225 RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
4226
4227 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4228 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4229 BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4230 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
4231
4232 DVecScalarMultExpr::selectAddAssignKernel( *lhs, A, x, rhs.scalar_ );
4233 }
4234 //**********************************************************************************************
4234 //**********************************************************************************************
4235
4236 //**Addition assignment to dense vectors (kernel selection)*************************************
4247 template< typename VT1 // Type of the left-hand side target vector
4248 , typename MT1 // Type of the left-hand side matrix operand
4249 , typename VT2 // Type of the right-hand side vector operand
4250 , typename ST2 > // Type of the scalar value
4251 static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4252 {
4253 if( ( IsDiagonal_v<MT1> ) ||
4254 ( IsComputation_v<MT> && !evaluateMatrix ) ||
4255 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4256 selectSmallAddAssignKernel( y, A, x, scalar );
4257 else
4258 selectBlasAddAssignKernel( y, A, x, scalar );
4259 }
4260 //**********************************************************************************************
4261
4262 //**Default addition assignment to dense vectors************************************************
4276 template< typename VT1 // Type of the left-hand side target vector
4277 , typename MT1 // Type of the left-hand side matrix operand
4278 , typename VT2 // Type of the right-hand side vector operand
4279 , typename ST2 > // Type of the scalar value
4280 static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4281 {
4282 y.addAssign( A * x * scalar );
4283 }
4284 //**********************************************************************************************
4285
4286 //**Default addition assignment to dense vectors (small matrices)*******************************
4300 template< typename VT1 // Type of the left-hand side target vector
4301 , typename MT1 // Type of the left-hand side matrix operand
4302 , typename VT2 // Type of the right-hand side vector operand
4303 , typename ST2 > // Type of the scalar value
4304 static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4305 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4306 {
4307 selectDefaultAddAssignKernel( y, A, x, scalar );
4308 }
4309 //**********************************************************************************************
4310
   //**Vectorized default addition assignment to dense vectors (small matrices)********************
   /*!\brief Vectorized default addition assignment of a small scaled dense matrix-dense vector
   //        multiplication (\f$ \vec{y}+=s*A*\vec{x} \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // This function implements the vectorized default addition assignment kernel for small
   // matrices. Rows are processed in blocks of 8, 4, 3, 2, and finally 1 so that each loaded
   // SIMD chunk of \a x is reused across several rows. For triangular matrices the column
   // range [jbegin,jend) is restricted to the structurally non-zero part of each row block;
   // jbegin is rounded down to a SIMD boundary to keep the loads aligned.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
   {
      // A scalar remainder loop is only required if the operands are not padded to SIMDSIZE
      constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t i( 0UL );

      // Process eight rows per iteration: one SIMD accumulator per row
      for( ; (i+8UL) <= M; i+=8UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         if( j < jpos )
         {
            // SIMD path: accumulate one partial-sum register per row
            SIMDType x1( x.load(j) );
            SIMDType xmm1( A.load(i    ,j) * x1 );
            SIMDType xmm2( A.load(i+1UL,j) * x1 );
            SIMDType xmm3( A.load(i+2UL,j) * x1 );
            SIMDType xmm4( A.load(i+3UL,j) * x1 );
            SIMDType xmm5( A.load(i+4UL,j) * x1 );
            SIMDType xmm6( A.load(i+5UL,j) * x1 );
            SIMDType xmm7( A.load(i+6UL,j) * x1 );
            SIMDType xmm8( A.load(i+7UL,j) * x1 );

            for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
               x1 = x.load(j);
               xmm1 += A.load(i    ,j) * x1;
               xmm2 += A.load(i+1UL,j) * x1;
               xmm3 += A.load(i+2UL,j) * x1;
               xmm4 += A.load(i+3UL,j) * x1;
               xmm5 += A.load(i+4UL,j) * x1;
               xmm6 += A.load(i+5UL,j) * x1;
               xmm7 += A.load(i+6UL,j) * x1;
               xmm8 += A.load(i+7UL,j) * x1;
            }

            // Horizontal reduction; the scalar is applied once per row
            y[i    ] += sum( xmm1 ) * scalar;
            y[i+1UL] += sum( xmm2 ) * scalar;
            y[i+2UL] += sum( xmm3 ) * scalar;
            y[i+3UL] += sum( xmm4 ) * scalar;
            y[i+4UL] += sum( xmm5 ) * scalar;
            y[i+5UL] += sum( xmm6 ) * scalar;
            y[i+6UL] += sum( xmm7 ) * scalar;
            y[i+7UL] += sum( xmm8 ) * scalar;

            // Scalar epilogue for the unpadded tail
            for( ; remainder && j<jend; ++j ) {
               y[i    ] += A(i    ,j) * x[j] * scalar;
               y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
               y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
               y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
               y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
               y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
               y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
               y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
            }
         }
         else
         {
            // Fewer than SIMDSIZE columns remain: pure scalar accumulation
            ElementType value1( A(i    ,j) * x[j] );
            ElementType value2( A(i+1UL,j) * x[j] );
            ElementType value3( A(i+2UL,j) * x[j] );
            ElementType value4( A(i+3UL,j) * x[j] );
            ElementType value5( A(i+4UL,j) * x[j] );
            ElementType value6( A(i+5UL,j) * x[j] );
            ElementType value7( A(i+6UL,j) * x[j] );
            ElementType value8( A(i+7UL,j) * x[j] );

            for( ++j; j<jend; ++j ) {
               value1 += A(i    ,j) * x[j];
               value2 += A(i+1UL,j) * x[j];
               value3 += A(i+2UL,j) * x[j];
               value4 += A(i+3UL,j) * x[j];
               value5 += A(i+4UL,j) * x[j];
               value6 += A(i+5UL,j) * x[j];
               value7 += A(i+6UL,j) * x[j];
               value8 += A(i+7UL,j) * x[j];
            }

            y[i    ] += value1 * scalar;
            y[i+1UL] += value2 * scalar;
            y[i+2UL] += value3 * scalar;
            y[i+3UL] += value4 * scalar;
            y[i+4UL] += value5 * scalar;
            y[i+5UL] += value6 * scalar;
            y[i+6UL] += value7 * scalar;
            y[i+7UL] += value8 * scalar;
         }
      }

      // Process four rows per iteration
      for( ; (i+4UL) <= M; i+=4UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         if( j < jpos )
         {
            SIMDType x1( x.load(j) );
            SIMDType xmm1( A.load(i    ,j) * x1 );
            SIMDType xmm2( A.load(i+1UL,j) * x1 );
            SIMDType xmm3( A.load(i+2UL,j) * x1 );
            SIMDType xmm4( A.load(i+3UL,j) * x1 );

            for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
               x1 = x.load(j);
               xmm1 += A.load(i    ,j) * x1;
               xmm2 += A.load(i+1UL,j) * x1;
               xmm3 += A.load(i+2UL,j) * x1;
               xmm4 += A.load(i+3UL,j) * x1;
            }

            y[i    ] += sum( xmm1 ) * scalar;
            y[i+1UL] += sum( xmm2 ) * scalar;
            y[i+2UL] += sum( xmm3 ) * scalar;
            y[i+3UL] += sum( xmm4 ) * scalar;

            for( ; remainder && j<jend; ++j ) {
               y[i    ] += A(i    ,j) * x[j] * scalar;
               y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
               y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
               y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
            }
         }
         else
         {
            ElementType value1( A(i    ,j) * x[j] );
            ElementType value2( A(i+1UL,j) * x[j] );
            ElementType value3( A(i+2UL,j) * x[j] );
            ElementType value4( A(i+3UL,j) * x[j] );

            for( ++j; j<jend; ++j ) {
               value1 += A(i    ,j) * x[j];
               value2 += A(i+1UL,j) * x[j];
               value3 += A(i+2UL,j) * x[j];
               value4 += A(i+3UL,j) * x[j];
            }

            y[i    ] += value1 * scalar;
            y[i+1UL] += value2 * scalar;
            y[i+2UL] += value3 * scalar;
            y[i+3UL] += value4 * scalar;
         }
      }

      // Process three rows per iteration
      for( ; (i+3UL) <= M; i+=3UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         if( j < jpos )
         {
            SIMDType x1( x.load(j) );
            SIMDType xmm1( A.load(i    ,j) * x1 );
            SIMDType xmm2( A.load(i+1UL,j) * x1 );
            SIMDType xmm3( A.load(i+2UL,j) * x1 );

            for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
               x1 = x.load(j);
               xmm1 += A.load(i    ,j) * x1;
               xmm2 += A.load(i+1UL,j) * x1;
               xmm3 += A.load(i+2UL,j) * x1;
            }

            y[i    ] += sum( xmm1 ) * scalar;
            y[i+1UL] += sum( xmm2 ) * scalar;
            y[i+2UL] += sum( xmm3 ) * scalar;

            for( ; remainder && j<jend; ++j ) {
               y[i    ] += A(i    ,j) * x[j] * scalar;
               y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
               y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
            }
         }
         else
         {
            ElementType value1( A(i    ,j) * x[j] );
            ElementType value2( A(i+1UL,j) * x[j] );
            ElementType value3( A(i+2UL,j) * x[j] );

            for( ++j; j<jend; ++j ) {
               value1 += A(i    ,j) * x[j];
               value2 += A(i+1UL,j) * x[j];
               value3 += A(i+2UL,j) * x[j];
            }

            y[i    ] += value1 * scalar;
            y[i+1UL] += value2 * scalar;
            y[i+2UL] += value3 * scalar;
         }
      }

      // Process two rows per iteration
      for( ; (i+2UL) <= M; i+=2UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         if( j < jpos )
         {
            SIMDType x1( x.load(j) );
            SIMDType xmm1( A.load(i    ,j) * x1 );
            SIMDType xmm2( A.load(i+1UL,j) * x1 );

            for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
               x1 = x.load(j);
               xmm1 += A.load(i    ,j) * x1;
               xmm2 += A.load(i+1UL,j) * x1;
            }

            y[i    ] += sum( xmm1 ) * scalar;
            y[i+1UL] += sum( xmm2 ) * scalar;

            for( ; remainder && j<jend; ++j ) {
               y[i    ] += A(i    ,j) * x[j] * scalar;
               y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
            }
         }
         else
         {
            ElementType value1( A(i    ,j) * x[j] );
            ElementType value2( A(i+1UL,j) * x[j] );

            for( ++j; j<jend; ++j ) {
               value1 += A(i    ,j) * x[j];
               value2 += A(i+1UL,j) * x[j];
            }

            y[i    ] += value1 * scalar;
            y[i+1UL] += value2 * scalar;
         }
      }

      // Process the single remaining row, if any
      if( i < M )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         if( j < jpos )
         {
            SIMDType xmm1( A.load(i,j) * x.load(j) );

            for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
               xmm1 += A.load(i,j) * x.load(j);
            }

            y[i] += sum( xmm1 ) * scalar;

            for( ; remainder && j<jend; ++j ) {
               y[i] += A(i,j) * x[j] * scalar;
            }
         }
         else
         {
            ElementType value( A(i,j) * x[j] );

            for( ++j; j<jend; ++j ) {
               value += A(i,j) * x[j];
            }

            y[i] += value * scalar;
         }
      }
   }
4643 //**********************************************************************************************
4644
4645 //**Default addition assignment to dense vectors (large matrices)*******************************
4659 template< typename VT1 // Type of the left-hand side target vector
4660 , typename MT1 // Type of the left-hand side matrix operand
4661 , typename VT2 // Type of the right-hand side vector operand
4662 , typename ST2 > // Type of the scalar value
4663 static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4664 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4665 {
4666 selectDefaultAddAssignKernel( y, A, x, scalar );
4667 }
4668 //**********************************************************************************************
4669
   //**Vectorized default addition assignment to dense vectors (large matrices)********************
   /*!\brief Vectorized default addition assignment of a large scaled dense matrix-dense vector
   //        multiplication (\f$ \vec{y}+=s*A*\vec{x} \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // This function implements the vectorized default addition assignment kernel for large
   // matrices. In contrast to the small-matrix kernel, each row block traverses its columns
   // in chunks of 4, 2, and 1 SIMD widths and reduces each chunk immediately into \a y,
   // which keeps the register pressure bounded for long rows. Rows are processed in blocks
   // of 8, 4, 2, and 1.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
   {
      // A scalar remainder loop is only required if the operands are not padded to SIMDSIZE
      constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t i( 0UL );

      // Process eight rows per iteration
      for( ; (i+8UL) <= M; i+=8UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         // Four SIMD widths per step, reduced immediately into y
         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE     );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 + A.load(i    ,j2) * x3 + A.load(i    ,j3) * x4 ) * scalar;
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
         }

         // Two SIMD widths per step
         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 ) * scalar;
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
         }

         // One SIMD width per step
         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i    ] += sum( A.load(i    ,j) * x1 ) * scalar;
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
            y[i+4UL] += sum( A.load(i+4UL,j) * x1 ) * scalar;
            y[i+5UL] += sum( A.load(i+5UL,j) * x1 ) * scalar;
            y[i+6UL] += sum( A.load(i+6UL,j) * x1 ) * scalar;
            y[i+7UL] += sum( A.load(i+7UL,j) * x1 ) * scalar;
         }

         // Scalar epilogue for the unpadded tail
         for( ; remainder && j<jend; ++j ) {
            y[i    ] += A(i    ,j) * x[j] * scalar;
            y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
            y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
            y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
            y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
            y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
            y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
            y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
         }
      }

      // Process four rows per iteration
      for( ; (i+4UL) <= M; i+=4UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE     );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 + A.load(i    ,j2) * x3 + A.load(i    ,j3) * x4 ) * scalar;
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
         }

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 ) * scalar;
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
         }

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i    ] += sum( A.load(i    ,j) * x1 ) * scalar;
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
            y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
            y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
         }

         for( ; remainder && j<jend; ++j ) {
            y[i    ] += A(i    ,j) * x[j] * scalar;
            y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
            y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
            y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
         }
      }

      // Process two rows per iteration
      for( ; (i+2UL) <= M; i+=2UL )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE     );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 + A.load(i    ,j2) * x3 + A.load(i    ,j3) * x4 ) * scalar;
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
         }

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i    ] += sum( A.load(i    ,j) * x1 + A.load(i    ,j1) * x2 ) * scalar;
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
         }

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i    ] += sum( A.load(i    ,j) * x1 ) * scalar;
            y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
         }

         for( ; remainder && j<jend; ++j ) {
            y[i    ] += A(i    ,j) * x[j] * scalar;
            y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
         }
      }

      // Process the single remaining row, if any
      if( i < M )
      {
         const size_t jbegin( ( IsUpper_v<MT1> )
                              ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
                              :( 0UL ) );
         const size_t jend( ( IsLower_v<MT1> )
                            ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
                            :( N ) );
         BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );

         const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
         BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );

         size_t j( jbegin );

         for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
            const size_t j1( j+SIMDSIZE     );
            const size_t j2( j+SIMDSIZE*2UL );
            const size_t j3( j+SIMDSIZE*3UL );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            const SIMDType x3( x.load(j2) );
            const SIMDType x4( x.load(j3) );
            y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
         }

         for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
            const size_t j1( j+SIMDSIZE );
            const SIMDType x1( x.load(j ) );
            const SIMDType x2( x.load(j1) );
            y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
         }

         for( ; j<jpos; j+=SIMDSIZE ) {
            const SIMDType x1( x.load(j) );
            y[i] += sum( A.load(i,j) * x1 ) * scalar;
         }

         for( ; remainder && j<jend; ++j ) {
            y[i] += A(i,j) * x[j] * scalar;
         }
      }
   }
4914 //**********************************************************************************************
4915
4916 //**BLAS-based addition assignment to dense vectors (default)***********************************
4930 template< typename VT1 // Type of the left-hand side target vector
4931 , typename MT1 // Type of the left-hand side matrix operand
4932 , typename VT2 // Type of the right-hand side vector operand
4933 , typename ST2 > // Type of the scalar value
4934 static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4935 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4936 {
4937 selectLargeAddAssignKernel( y, A, x, scalar );
4938 }
4939 //**********************************************************************************************
4940
   //**BLAS-based addition assignment to dense vectors*********************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   /*!\brief BLAS-based addition assignment of a scaled dense matrix-dense vector multiplication
   //        (\f$ \vec{y}+=s*A*\vec{x} \f$).
   // \ingroup dense_vector
   //
   // \param y The target left-hand side dense vector.
   // \param A The left-hand side dense matrix operand.
   // \param x The right-hand side dense vector operand.
   // \param scalar The scaling factor.
   // \return void
   //
   // This function performs the scaled dense matrix-dense vector multiplication based on the
   // BLAS trmv() (triangular matrices) respectively gemv() (general matrices) functions.
   */
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
   {
      using ET = ElementType_t<VT1>;

      if( IsTriangular_v<MT1> ) {
         // trmv() multiplies in place, so the scaled product is formed in a temporary and
         // added to y afterwards
         ResultType_t<VT1> tmp( serial( scalar * x ) );
         trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
         addAssign( y, tmp );
      }
      else {
         // gemv() computes y = alpha*A*x + beta*y; beta == 1 realizes the addition assignment
         gemv( y, A, x, ET(scalar), ET(1) );
      }
   }
#endif
4975 //**********************************************************************************************
4976
4977 //**Addition assignment to sparse vectors*******************************************************
4978 // No special implementation for the addition assignment to sparse vectors.
4979 //**********************************************************************************************
4980
   //**Subtraction assignment to dense vectors*****************************************************
   /*!\brief Subtraction assignment of a scaled dense matrix-dense vector multiplication to a
   //        dense vector (\f$ \vec{y}-=s*A*\vec{x} \f$).
   // \ingroup dense_vector
   //
   // \param lhs The target left-hand side dense vector.
   // \param rhs The right-hand side scaled multiplication expression to be subtracted.
   // \return void
   //
   // This function implements the performance optimized subtraction assignment of a scaled
   // dense matrix-dense vector multiplication expression to a dense vector. The operands are
   // evaluated first and the actual work is delegated to the kernel selection.
   */
   template< typename VT1 >  // Type of the target dense vector
   friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
   {

      BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );

      // Decompose the wrapped matrix-vector product into its two operands
      LeftOperand_t<MVM>  left ( rhs.vector_.leftOperand()  );
      RightOperand_t<MVM> right( rhs.vector_.rightOperand() );

      // Early exit: empty product, or a 1x1 strictly triangular matrix (identically zero)
      if( left.rows() == 0UL || left.columns() == 0UL ||
          ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
         return;
      }

      LT A( serial( left ) );  // Evaluation of the left-hand side dense matrix operand
      RT x( serial( right ) );  // Evaluation of the right-hand side dense vector operand

      BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
      BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
      BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
      BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );

      DVecScalarMultExpr::selectSubAssignKernel( *lhs, A, x, rhs.scalar_ );
   }
5018 //**********************************************************************************************
5019
5020 //**Subtraction assignment to dense vectors (kernel selection)**********************************
5031 template< typename VT1 // Type of the left-hand side target vector
5032 , typename MT1 // Type of the left-hand side matrix operand
5033 , typename VT2 // Type of the right-hand side vector operand
5034 , typename ST2 > // Type of the scalar value
5035 static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
5036 {
5037 if( ( IsDiagonal_v<MT1> ) ||
5038 ( IsComputation_v<MT> && !evaluateMatrix ) ||
5039 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
5040 selectSmallSubAssignKernel( y, A, x, scalar );
5041 else
5042 selectBlasSubAssignKernel( y, A, x, scalar );
5043 }
5044 //**********************************************************************************************
5045
5046 //**Default subtraction assignment to dense vectors*********************************************
5060 template< typename VT1 // Type of the left-hand side target vector
5061 , typename MT1 // Type of the left-hand side matrix operand
5062 , typename VT2 // Type of the right-hand side vector operand
5063 , typename ST2 > // Type of the scalar value
5064 static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
5065 {
5066 y.subAssign( A * x * scalar );
5067 }
5068 //**********************************************************************************************
5069
5070 //**Default subtraction assignment to dense vectors (small matrices)****************************
5084 template< typename VT1 // Type of the left-hand side target vector
5085 , typename MT1 // Type of the left-hand side matrix operand
5086 , typename VT2 // Type of the right-hand side vector operand
5087 , typename ST2 > // Type of the scalar value
5088 static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
5089 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
5090 {
5091 selectDefaultSubAssignKernel( y, A, x, scalar );
5092 }
5093 //**********************************************************************************************
5094
5095 //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
5109 template< typename VT1 // Type of the left-hand side target vector
5110 , typename MT1 // Type of the left-hand side matrix operand
5111 , typename VT2 // Type of the right-hand side vector operand
5112 , typename ST2 > // Type of the scalar value
5113 static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
5114 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
5115 {
5116 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
5117
5118 const size_t M( A.rows() );
5119 const size_t N( A.columns() );
5120
5121 size_t i( 0UL );
5122
5123 for( ; (i+8UL) <= M; i+=8UL )
5124 {
5125 const size_t jbegin( ( IsUpper_v<MT1> )
5126 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
5127 :( 0UL ) );
5128 const size_t jend( ( IsLower_v<MT1> )
5129 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
5130 :( N ) );
5131 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
5132
5133 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
5134 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
5135
5136 size_t j( jbegin );
5137
5138 if( j < jpos )
5139 {
5140 SIMDType x1( x.load(j) );
5141 SIMDType xmm1( A.load(i ,j) * x1 );
5142 SIMDType xmm2( A.load(i+1UL,j) * x1 );
5143 SIMDType xmm3( A.load(i+2UL,j) * x1 );
5144 SIMDType xmm4( A.load(i+3UL,j) * x1 );
5145 SIMDType xmm5( A.load(i+4UL,j) * x1 );
5146 SIMDType xmm6( A.load(i+5UL,j) * x1 );
5147 SIMDType xmm7( A.load(i+6UL,j) * x1 );
5148 SIMDType xmm8( A.load(i+7UL,j) * x1 );
5149
5150 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
5151 x1 = x.load(j);
5152 xmm1 += A.load(i ,j) * x1;
5153 xmm2 += A.load(i+1UL,j) * x1;
5154 xmm3 += A.load(i+2UL,j) * x1;
5155 xmm4 += A.load(i+3UL,j) * x1;
5156 xmm5 += A.load(i+4UL,j) * x1;
5157 xmm6 += A.load(i+5UL,j) * x1;
5158 xmm7 += A.load(i+6UL,j) * x1;
5159 xmm8 += A.load(i+7UL,j) * x1;
5160 }
5161
5162 y[i ] -= sum( xmm1 ) * scalar;
5163 y[i+1UL] -= sum( xmm2 ) * scalar;
5164 y[i+2UL] -= sum( xmm3 ) * scalar;
5165 y[i+3UL] -= sum( xmm4 ) * scalar;
5166 y[i+4UL] -= sum( xmm5 ) * scalar;
5167 y[i+5UL] -= sum( xmm6 ) * scalar;
5168 y[i+6UL] -= sum( xmm7 ) * scalar;
5169 y[i+7UL] -= sum( xmm8 ) * scalar;
5170
5171 for( ; remainder && j<jend; ++j ) {
5172 y[i ] -= A(i ,j) * x[j] * scalar;
5173 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5174 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
5175 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
5176 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
5177 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
5178 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
5179 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
5180 }
5181 }
5182 else
5183 {
5184 ElementType value1( A(i ,j) * x[j] );
5185 ElementType value2( A(i+1UL,j) * x[j] );
5186 ElementType value3( A(i+2UL,j) * x[j] );
5187 ElementType value4( A(i+3UL,j) * x[j] );
5188 ElementType value5( A(i+4UL,j) * x[j] );
5189 ElementType value6( A(i+5UL,j) * x[j] );
5190 ElementType value7( A(i+6UL,j) * x[j] );
5191 ElementType value8( A(i+7UL,j) * x[j] );
5192
5193 for( ++j; j<jend; ++j ) {
5194 value1 += A(i ,j) * x[j];
5195 value2 += A(i+1UL,j) * x[j];
5196 value3 += A(i+2UL,j) * x[j];
5197 value4 += A(i+3UL,j) * x[j];
5198 value5 += A(i+4UL,j) * x[j];
5199 value6 += A(i+5UL,j) * x[j];
5200 value7 += A(i+6UL,j) * x[j];
5201 value8 += A(i+7UL,j) * x[j];
5202 }
5203
5204 y[i ] -= value1 * scalar;
5205 y[i+1UL] -= value2 * scalar;
5206 y[i+2UL] -= value3 * scalar;
5207 y[i+3UL] -= value4 * scalar;
5208 y[i+4UL] -= value5 * scalar;
5209 y[i+5UL] -= value6 * scalar;
5210 y[i+6UL] -= value7 * scalar;
5211 y[i+7UL] -= value8 * scalar;
5212 }
5213 }
5214
5215 for( ; (i+4UL) <= M; i+=4UL )
5216 {
5217 const size_t jbegin( ( IsUpper_v<MT1> )
5218 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
5219 :( 0UL ) );
5220 const size_t jend( ( IsLower_v<MT1> )
5221 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
5222 :( N ) );
5223 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
5224
5225 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
5226 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
5227
5228 size_t j( jbegin );
5229
5230 if( j < jpos )
5231 {
5232 SIMDType x1( x.load(j) );
5233 SIMDType xmm1( A.load(i ,j) * x1 );
5234 SIMDType xmm2( A.load(i+1UL,j) * x1 );
5235 SIMDType xmm3( A.load(i+2UL,j) * x1 );
5236 SIMDType xmm4( A.load(i+3UL,j) * x1 );
5237
5238 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
5239 x1 = x.load(j);
5240 xmm1 += A.load(i ,j) * x1;
5241 xmm2 += A.load(i+1UL,j) * x1;
5242 xmm3 += A.load(i+2UL,j) * x1;
5243 xmm4 += A.load(i+3UL,j) * x1;
5244 }
5245
5246 y[i ] -= sum( xmm1 ) * scalar;
5247 y[i+1UL] -= sum( xmm2 ) * scalar;
5248 y[i+2UL] -= sum( xmm3 ) * scalar;
5249 y[i+3UL] -= sum( xmm4 ) * scalar;
5250
5251 for( ; remainder && j<jend; ++j ) {
5252 y[i ] -= A(i ,j) * x[j] * scalar;
5253 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5254 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
5255 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
5256 }
5257 }
5258 else
5259 {
5260 ElementType value1( A(i ,j) * x[j] );
5261 ElementType value2( A(i+1UL,j) * x[j] );
5262 ElementType value3( A(i+2UL,j) * x[j] );
5263 ElementType value4( A(i+3UL,j) * x[j] );
5264
5265 for( ++j; j<jend; ++j ) {
5266 value1 += A(i ,j) * x[j];
5267 value2 += A(i+1UL,j) * x[j];
5268 value3 += A(i+2UL,j) * x[j];
5269 value4 += A(i+3UL,j) * x[j];
5270 }
5271
5272 y[i ] -= value1 * scalar;
5273 y[i+1UL] -= value2 * scalar;
5274 y[i+2UL] -= value3 * scalar;
5275 y[i+3UL] -= value4 * scalar;
5276 }
5277 }
5278
5279 for( ; (i+3UL) <= M; i+=3UL )
5280 {
5281 const size_t jbegin( ( IsUpper_v<MT1> )
5282 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
5283 :( 0UL ) );
5284 const size_t jend( ( IsLower_v<MT1> )
5285 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
5286 :( N ) );
5287 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
5288
5289 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
5290 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
5291
5292 size_t j( jbegin );
5293
5294 if( j < jpos )
5295 {
5296 SIMDType x1( x.load(j) );
5297 SIMDType xmm1( A.load(i ,j) * x1 );
5298 SIMDType xmm2( A.load(i+1UL,j) * x1 );
5299 SIMDType xmm3( A.load(i+2UL,j) * x1 );
5300
5301 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
5302 x1 = x.load(j);
5303 xmm1 += A.load(i ,j) * x1;
5304 xmm2 += A.load(i+1UL,j) * x1;
5305 xmm3 += A.load(i+2UL,j) * x1;
5306 }
5307
5308 y[i ] -= sum( xmm1 ) * scalar;
5309 y[i+1UL] -= sum( xmm2 ) * scalar;
5310 y[i+2UL] -= sum( xmm3 ) * scalar;
5311
5312 for( ; remainder && j<jend; ++j ) {
5313 y[i ] -= A(i ,j) * x[j] * scalar;
5314 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5315 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
5316 }
5317 }
5318 else
5319 {
5320 ElementType value1( A(i ,j) * x[j] );
5321 ElementType value2( A(i+1UL,j) * x[j] );
5322 ElementType value3( A(i+2UL,j) * x[j] );
5323
5324 for( ++j; j<jend; ++j ) {
5325 value1 += A(i ,j) * x[j];
5326 value2 += A(i+1UL,j) * x[j];
5327 value3 += A(i+2UL,j) * x[j];
5328 }
5329
5330 y[i ] -= value1 * scalar;
5331 y[i+1UL] -= value2 * scalar;
5332 y[i+2UL] -= value3 * scalar;
5333 }
5334 }
5335
5336 for( ; (i+2UL) <= M; i+=2UL )
5337 {
5338 const size_t jbegin( ( IsUpper_v<MT1> )
5339 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
5340 :( 0UL ) );
5341 const size_t jend( ( IsLower_v<MT1> )
5342 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
5343 :( N ) );
5344 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
5345
5346 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
5347 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
5348
5349 size_t j( jbegin );
5350
5351 if( j < jpos )
5352 {
5353 SIMDType x1( x.load(j) );
5354 SIMDType xmm1( A.load(i ,j) * x1 );
5355 SIMDType xmm2( A.load(i+1UL,j) * x1 );
5356
5357 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
5358 x1 = x.load(j);
5359 xmm1 += A.load(i ,j) * x1;
5360 xmm2 += A.load(i+1UL,j) * x1;
5361 }
5362
5363 y[i ] -= sum( xmm1 ) * scalar;
5364 y[i+1UL] -= sum( xmm2 ) * scalar;
5365
5366 for( ; remainder && j<jend; ++j ) {
5367 y[i ] -= A(i ,j) * x[j] * scalar;
5368 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5369 }
5370 }
5371 else
5372 {
5373 ElementType value1( A(i ,j) * x[j] );
5374 ElementType value2( A(i+1UL,j) * x[j] );
5375
5376 for( ++j; j<jend; ++j ) {
5377 value1 += A(i ,j) * x[j];
5378 value2 += A(i+1UL,j) * x[j];
5379 }
5380
5381 y[i ] -= value1 * scalar;
5382 y[i+1UL] -= value2 * scalar;
5383 }
5384 }
5385
5386 if( i < M )
5387 {
5388 const size_t jbegin( ( IsUpper_v<MT1> )
5389 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
5390 :( 0UL ) );
5391 const size_t jend( ( IsLower_v<MT1> )
5392 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
5393 :( N ) );
5394 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
5395
5396 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
5397 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
5398
5399 size_t j( jbegin );
5400
5401 if( j < jpos )
5402 {
5403 SIMDType xmm1( A.load(i,j) * x.load(j) );
5404
5405 for( j+=SIMDSIZE; j<jpos; j+=SIMDSIZE ) {
5406 xmm1 += A.load(i,j) * x.load(j);
5407 }
5408
5409 y[i] -= sum( xmm1 ) * scalar;
5410
5411 for( ; remainder && j<jend; ++j ) {
5412 y[i] -= A(i,j) * x[j] * scalar;
5413 }
5414 }
5415 else
5416 {
5417 ElementType value( A(i,j) * x[j] );
5418
5419 for( ++j; j<jend; ++j ) {
5420 value += A(i,j) * x[j];
5421 }
5422
5423 y[i] -= value * scalar;
5424 }
5425 }
5426 }
5427 //**********************************************************************************************
5428
5429 //**Default subtraction assignment to dense vectors (large matrices)****************************
// Fallback overload of the "large matrix" subtraction-assignment kernel.
// Selected (via SFINAE) when the vectorized default kernel cannot be used
// for this operand/scalar combination; it simply forwards to the scalar
// default kernel, computing y -= ( A * x ) * scalar.
5443 template< typename VT1 // Type of the left-hand side target vector
5444 , typename MT1 // Type of the left-hand side matrix operand
5445 , typename VT2 // Type of the right-hand side vector operand
5446 , typename ST2 > // Type of the scalar value
5447 static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
5448 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
5449 {
5450 selectDefaultSubAssignKernel( y, A, x, scalar );
5451 }
5452 //**********************************************************************************************
5453
5454 //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized default kernel for the scaled subtraction assignment
// y -= ( A * x ) * scalar, optimized for large row-major matrices.
// Rows are processed in blocks of 8/4/2/1; within each row block the
// columns are traversed SIMD-wise with 4x/2x/1x unrolling, followed by
// a scalar remainder loop when either operand is unpadded. For (strictly)
// upper/lower triangular A the column range [jbegin,jend) is restricted
// to the structurally nonzero band of the current row block.
5468 template< typename VT1 // Type of the left-hand side target vector
5469 , typename MT1 // Type of the left-hand side matrix operand
5470 , typename VT2 // Type of the right-hand side vector operand
5471 , typename ST2 > // Type of the scalar value
5472 static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
5473 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
5474 {
// A scalar remainder loop is only needed if either operand lacks padding.
5475 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
5476
5477 const size_t M( A.rows() );
5478 const size_t N( A.columns() );
5479
5480 size_t i( 0UL );
5481
// Row block of 8: eight dot products are accumulated simultaneously.
5482 for( ; (i+8UL) <= M; i+=8UL )
5483 {
// jbegin is rounded down to a SIMD boundary so that aligned loads stay valid.
5484 const size_t jbegin( ( IsUpper_v<MT1> )
5485 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
5486 :( 0UL ) );
5487 const size_t jend( ( IsLower_v<MT1> )
5488 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
5489 :( N ) );
5490 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
5491
5492 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
5493 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
5494
5495 size_t j( jbegin );
5496
// 4x SIMD unrolled column loop.
5497 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
5498 const size_t j1( j+SIMDSIZE );
5499 const size_t j2( j+SIMDSIZE*2UL );
5500 const size_t j3( j+SIMDSIZE*3UL );
5501 const SIMDType x1( x.load(j ) );
5502 const SIMDType x2( x.load(j1) );
5503 const SIMDType x3( x.load(j2) );
5504 const SIMDType x4( x.load(j3) );
5505 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
5506 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
5507 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
5508 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
5509 y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
5510 y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
5511 y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
5512 y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
5513 }
5514
// 2x SIMD unrolled column loop.
5515 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
5516 const size_t j1( j+SIMDSIZE );
5517 const SIMDType x1( x.load(j ) );
5518 const SIMDType x2( x.load(j1) );
5519 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
5520 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
5521 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
5522 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
5523 y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
5524 y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
5525 y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
5526 y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
5527 }
5528
// Single-SIMD-width column loop.
5529 for( ; j<jpos; j+=SIMDSIZE ) {
5530 const SIMDType x1( x.load(j) );
5531 y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
5532 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
5533 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
5534 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
5535 y[i+4UL] -= sum( A.load(i+4UL,j) * x1 ) * scalar;
5536 y[i+5UL] -= sum( A.load(i+5UL,j) * x1 ) * scalar;
5537 y[i+6UL] -= sum( A.load(i+6UL,j) * x1 ) * scalar;
5538 y[i+7UL] -= sum( A.load(i+7UL,j) * x1 ) * scalar;
5539 }
5540
// Scalar remainder for the unpadded tail columns.
5541 for( ; remainder && j<jend; ++j ) {
5542 y[i ] -= A(i ,j) * x[j] * scalar;
5543 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5544 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
5545 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
5546 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
5547 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
5548 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
5549 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
5550 }
5551 }
5552
// Row block of 4.
5553 for( ; (i+4UL) <= M; i+=4UL )
5554 {
5555 const size_t jbegin( ( IsUpper_v<MT1> )
5556 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
5557 :( 0UL ) );
5558 const size_t jend( ( IsLower_v<MT1> )
5559 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
5560 :( N ) );
5561 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
5562
5563 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
5564 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
5565
5566 size_t j( jbegin );
5567
5568 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
5569 const size_t j1( j+SIMDSIZE );
5570 const size_t j2( j+SIMDSIZE*2UL );
5571 const size_t j3( j+SIMDSIZE*3UL );
5572 const SIMDType x1( x.load(j ) );
5573 const SIMDType x2( x.load(j1) );
5574 const SIMDType x3( x.load(j2) );
5575 const SIMDType x4( x.load(j3) );
5576 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
5577 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
5578 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
5579 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
5580 }
5581
5582 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
5583 const size_t j1( j+SIMDSIZE );
5584 const SIMDType x1( x.load(j ) );
5585 const SIMDType x2( x.load(j1) );
5586 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
5587 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
5588 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
5589 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
5590 }
5591
5592 for( ; j<jpos; j+=SIMDSIZE ) {
5593 const SIMDType x1( x.load(j) );
5594 y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
5595 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
5596 y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
5597 y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
5598 }
5599
5600 for( ; remainder && j<jend; ++j ) {
5601 y[i ] -= A(i ,j) * x[j] * scalar;
5602 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5603 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
5604 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
5605 }
5606 }
5607
// Row block of 2.
5608 for( ; (i+2UL) <= M; i+=2UL )
5609 {
5610 const size_t jbegin( ( IsUpper_v<MT1> )
5611 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
5612 :( 0UL ) );
5613 const size_t jend( ( IsLower_v<MT1> )
5614 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
5615 :( N ) );
5616 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
5617
5618 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
5619 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
5620
5621 size_t j( jbegin );
5622
5623 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
5624 const size_t j1( j+SIMDSIZE );
5625 const size_t j2( j+SIMDSIZE*2UL );
5626 const size_t j3( j+SIMDSIZE*3UL );
5627 const SIMDType x1( x.load(j ) );
5628 const SIMDType x2( x.load(j1) );
5629 const SIMDType x3( x.load(j2) );
5630 const SIMDType x4( x.load(j3) );
5631 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
5632 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
5633 }
5634
5635 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
5636 const size_t j1( j+SIMDSIZE );
5637 const SIMDType x1( x.load(j ) );
5638 const SIMDType x2( x.load(j1) );
5639 y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
5640 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
5641 }
5642
5643 for( ; j<jpos; j+=SIMDSIZE ) {
5644 const SIMDType x1( x.load(j) );
5645 y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
5646 y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
5647 }
5648
5649 for( ; remainder && j<jend; ++j ) {
5650 y[i ] -= A(i ,j) * x[j] * scalar;
5651 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5652 }
5653 }
5654
// Final single row, if one remains.
5655 if( i < M )
5656 {
5657 const size_t jbegin( ( IsUpper_v<MT1> )
5658 ?( prevMultiple( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ), SIMDSIZE ) )
5659 :( 0UL ) );
5660 const size_t jend( ( IsLower_v<MT1> )
5661 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
5662 :( N ) );
5663 BLAZE_INTERNAL_ASSERT( jbegin < jend, "Invalid loop indices detected" );
5664
5665 const size_t jpos( remainder ? prevMultiple( jend, SIMDSIZE ) : jend );
5666 BLAZE_INTERNAL_ASSERT( jpos <= jend, "Invalid end calculation" );
5667
5668 size_t j( jbegin );
5669
5670 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
5671 const size_t j1( j+SIMDSIZE );
5672 const size_t j2( j+SIMDSIZE*2UL );
5673 const size_t j3( j+SIMDSIZE*3UL );
5674 const SIMDType x1( x.load(j ) );
5675 const SIMDType x2( x.load(j1) );
5676 const SIMDType x3( x.load(j2) );
5677 const SIMDType x4( x.load(j3) );
5678 y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
5679 }
5680
5681 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
5682 const size_t j1( j+SIMDSIZE );
5683 const SIMDType x1( x.load(j ) );
5684 const SIMDType x2( x.load(j1) );
5685 y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
5686 }
5687
5688 for( ; j<jpos; j+=SIMDSIZE ) {
5689 const SIMDType x1( x.load(j) );
5690 y[i] -= sum( A.load(i,j) * x1 ) * scalar;
5691 }
5692
5693 for( ; remainder && j<jend; ++j ) {
5694 y[i] -= A(i,j) * x[j] * scalar;
5695 }
5696 }
5697 }
5698 //**********************************************************************************************
5699
5700 //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback overload of the BLAS subtraction-assignment kernel. Selected
// (via SFINAE) when the operand types are not BLAS-compatible; delegates
// to the large-matrix default kernel instead.
5714 template< typename VT1 // Type of the left-hand side target vector
5715 , typename MT1 // Type of the left-hand side matrix operand
5716 , typename VT2 // Type of the right-hand side vector operand
5717 , typename ST2 > // Type of the scalar value
5718 static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
5719 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
5720 {
5721 selectLargeSubAssignKernel( y, A, x, scalar );
5722 }
5723 //**********************************************************************************************
5724
5725 //**BLAS-based subtraction assignment to dense vectors******************************************
5726#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based kernel for y -= ( A * x ) * scalar, enabled only when both
// BLAS mode and BLAS matrix/vector multiplication are switched on and the
// operand types are BLAS compatible.
5740 template< typename VT1 // Type of the left-hand side target vector
5741 , typename MT1 // Type of the left-hand side matrix operand
5742 , typename VT2 // Type of the right-hand side vector operand
5743 , typename ST2 > // Type of the scalar value
5744 static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
5745 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
5746 {
5747 using ET = ElementType_t<VT1>;
5748
5749 if( IsTriangular_v<MT1> ) {
// Triangular A: trmv overwrites its vector argument, so the scaled vector
// is materialized into a temporary first, multiplied in place, and the
// result is then subtracted from y.
5750 ResultType_t<VT1> tmp( serial( scalar * x ) );
5751 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
5752 subAssign( y, tmp );
5753 }
5754 else {
// General A: gemv with alpha = -scalar and beta = 1 performs the
// subtraction directly in y (y = (-scalar)*A*x + 1*y).
5755 gemv( y, A, x, ET(-scalar), ET(1) );
5756 }
5757 }
5758#endif
5759 //**********************************************************************************************
5760
5761 //**Subtraction assignment to sparse vectors****************************************************
5762 // No special implementation for the subtraction assignment to sparse vectors.
5763 //**********************************************************************************************
5764
5765 //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of the scaled matrix/vector product to a dense
// vector: the product must be fully evaluated before the element-wise
// multiplication can be applied, so rhs is serialized into a temporary.
// NOTE(review): lines 5780-5785 of the original source are missing here
// (only the number residue below remains) — presumably a function-trace
// macro and compile-time constraint checks lost in extraction; confirm
// against the upstream Blaze sources.
5777 template< typename VT1 > // Type of the target dense vector
5778 friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5779 {
5781
5785
5786 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5787
5788 const ResultType tmp( serial( rhs ) );
5789 multAssign( *lhs, tmp );
5790 }
5791 //**********************************************************************************************
5792
5793 //**Multiplication assignment to sparse vectors*************************************************
5794 // No special implementation for the multiplication assignment to sparse vectors.
5795 //**********************************************************************************************
5796
5797 //**Division assignment to dense vectors********************************************************
// Division assignment of the scaled matrix/vector product to a dense
// vector: like multAssign, the product is serialized into a temporary
// before the element-wise division is applied.
// NOTE(review): lines 5812-5817 of the original source are missing here
// (only the number residue below remains) — extraction dropped the
// function-trace/constraint macro lines; confirm against upstream.
5809 template< typename VT1 > // Type of the target dense vector
5810 friend inline void divAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5811 {
5813
5817
5818 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5819
5820 const ResultType tmp( serial( rhs ) );
5821 divAssign( *lhs, tmp );
5822 }
5823 //**********************************************************************************************
5824
5825 //**Division assignment to sparse vectors*******************************************************
5826 // No special implementation for the division assignment to sparse vectors.
5827 //**********************************************************************************************
5828
5829 //**SMP assignment to dense vectors*************************************************************
// SMP (parallel) assignment of the scaled matrix/vector product to a dense
// vector. Only participates in overload resolution when the target supports
// SMP assignment. Handles the degenerate cases up front, then evaluates the
// operands and delegates to the SMP assignment of the evaluated expression.
5843 template< typename VT1 > // Type of the target dense vector
5844 friend inline auto smpAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5845 -> EnableIf_t< UseSMPAssign_v<VT1> >
5846 {
5848
5849 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5850
5851 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5852 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5853
// No rows: the target vector is empty, nothing to assign.
5854 if( left.rows() == 0UL ) {
5855 return;
5856 }
// No columns — or a 1x1 strictly triangular matrix, whose single element is
// structurally zero — means the product is a zero vector: reset the target.
5857 else if( left.columns() == 0UL ||
5858 ( IsStrictlyTriangular_v<MT> && left.columns() == 1UL ) ) {
5859 reset( *lhs );
5860 return;
5861 }
5862
5863 LT A( left ); // Evaluation of the left-hand side dense matrix operand
5864 RT x( right ); // Evaluation of the right-hand side dense vector operand
5865
5866 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5867 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5868 BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5869 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
5870
5871 smpAssign( *lhs, A * x * rhs.scalar_ );
5872 }
5873 //**********************************************************************************************
5874
5875 //**SMP assignment to sparse vectors************************************************************
// SMP assignment to a sparse vector: the dense result is evaluated into a
// temporary (in parallel) and then SMP-assigned to the sparse target.
// NOTE(review): lines 5893-5898 of the original source are missing here
// (only the number residue below remains) — extraction dropped the
// function-trace/constraint macro lines; confirm against upstream.
5889 template< typename VT1 > // Type of the target sparse vector
5890 friend inline auto smpAssign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5891 -> EnableIf_t< UseSMPAssign_v<VT1> >
5892 {
5894
5898
5899 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5900
5901 const ResultType tmp( rhs );
5902 smpAssign( *lhs, tmp );
5903 }
5904 //**********************************************************************************************
5905
5906 //**SMP addition assignment to dense vectors****************************************************
// SMP (parallel) addition assignment of the scaled matrix/vector product.
// When the product is structurally zero (empty matrix, or 1x1 strictly
// triangular), the addition is a no-op and the function returns early.
5920 template< typename VT1 > // Type of the target dense vector
5921 friend inline auto smpAddAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5922 -> EnableIf_t< UseSMPAssign_v<VT1> >
5923 {
5925
5926 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5927
5928 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5929 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5930
5931 if( left.rows() == 0UL || left.columns() == 0UL ||
5932 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
5933 return;
5934 }
5935
5936 LT A( left ); // Evaluation of the left-hand side dense matrix operand
5937 RT x( right ); // Evaluation of the right-hand side dense vector operand
5938
5939 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5940 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5941 BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5942 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
5943
5944 smpAddAssign( *lhs, A * x * rhs.scalar_ );
5945 }
5946 //**********************************************************************************************
5947
5948 //**SMP addition assignment to sparse vectors***************************************************
5949 // No special implementation for the SMP addition assignment to sparse vectors.
5950 //**********************************************************************************************
5951
5952 //**SMP subtraction assignment to dense vectors*************************************************
// SMP (parallel) subtraction assignment of the scaled matrix/vector
// product. Mirrors smpAddAssign: a structurally zero product (empty
// matrix, or 1x1 strictly triangular) makes the subtraction a no-op.
5966 template< typename VT1 > // Type of the target dense vector
5967 friend inline auto smpSubAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5968 -> EnableIf_t< UseSMPAssign_v<VT1> >
5969 {
5971
5972 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
5973
5974 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5975 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5976
5977 if( left.rows() == 0UL || left.columns() == 0UL ||
5978 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
5979 return;
5980 }
5981
5982 LT A( left ); // Evaluation of the left-hand side dense matrix operand
5983 RT x( right ); // Evaluation of the right-hand side dense vector operand
5984
5985 BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5986 BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5987 BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5988 BLAZE_INTERNAL_ASSERT( A.rows() == (*lhs).size() , "Invalid vector size" );
5989
5990 smpSubAssign( *lhs, A * x * rhs.scalar_ );
5991 }
5992 //**********************************************************************************************
5993
5994 //**SMP subtraction assignment to sparse vectors************************************************
5995 // No special implementation for the SMP subtraction assignment to sparse vectors.
5996 //**********************************************************************************************
5997
5998 //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment: the product must be fully evaluated
// before the element-wise multiplication, so rhs is materialized into a
// temporary and then SMP-multiplied into the target.
// NOTE(review): lines 6016-6021 of the original source are missing here
// (only the number residue below remains) — extraction dropped the
// function-trace/constraint macro lines; confirm against upstream.
6012 template< typename VT1 > // Type of the target dense vector
6013 friend inline auto smpMultAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
6014 -> EnableIf_t< UseSMPAssign_v<VT1> >
6015 {
6017
6021
6022 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
6023
6024 const ResultType tmp( rhs );
6025 smpMultAssign( *lhs, tmp );
6026 }
6027 //**********************************************************************************************
6028
6029 //**SMP multiplication assignment to sparse vectors*********************************************
6030 // No special implementation for the SMP multiplication assignment to sparse vectors.
6031 //**********************************************************************************************
6032
6033 //**SMP division assignment to dense vectors****************************************************
// SMP division assignment: like smpMultAssign, the product is materialized
// into a temporary and then SMP-divided into the target element-wise.
// NOTE(review): lines 6051-6056 of the original source are missing here
// (only the number residue below remains) — extraction dropped the
// function-trace/constraint macro lines; confirm against upstream.
6047 template< typename VT1 > // Type of the target dense vector
6048 friend inline auto smpDivAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
6049 -> EnableIf_t< UseSMPAssign_v<VT1> >
6050 {
6052
6056
6057 BLAZE_INTERNAL_ASSERT( (*lhs).size() == rhs.size(), "Invalid vector sizes" );
6058
6059 const ResultType tmp( rhs );
6060 smpDivAssign( *lhs, tmp );
6061 }
6062 //**********************************************************************************************
6063
6064 //**SMP division assignment to sparse vectors***************************************************
6065 // No special implementation for the SMP division assignment to sparse vectors.
6066 //**********************************************************************************************
6067
6068 //**Compile time checks*************************************************************************
6077 //**********************************************************************************************
6078};
6080//*************************************************************************************************
6081
6082
6083
6084
6085//=================================================================================================
6086//
6087// GLOBAL BINARY ARITHMETIC OPERATORS
6088//
6089//=================================================================================================
6090
6091//*************************************************************************************************
// Multiplication operator for a row-major dense matrix and a non-transpose
// dense vector. Validates that the matrix column count matches the vector
// size (throwing std::invalid_argument otherwise) and returns the lazy
// expression object representing the product.
// NOTE(review): lines 6126-6129 of the original source are missing here
// (only the number residue below remains) — presumably a function-trace
// macro and a constraint check lost in extraction; confirm upstream.
6121template< typename MT // Type of the left-hand side dense matrix
6122 , typename VT > // Type of the right-hand side dense vector
6123inline decltype(auto)
6124 operator*( const DenseMatrix<MT,false>& mat, const DenseVector<VT,false>& vec )
6125{
6127
6129
6130 if( (*mat).columns() != (*vec).size() ) {
6131 BLAZE_THROW_INVALID_ARGUMENT( "Matrix and vector sizes do not match" );
6132 }
6133
6134 using ReturnType = const DMatDVecMultExpr<MT,VT>;
6135 return ReturnType( *mat, *vec );
6136}
6137//*************************************************************************************************
6138
6139
6140
6141
6142//=================================================================================================
6143//
6144// ISALIGNED SPECIALIZATIONS
6145//
6146//=================================================================================================
6147
6148//*************************************************************************************************
// IsAligned type-trait specialization: a dense matrix/dense vector
// multiplication expression is considered aligned exactly when both of
// its operand types are aligned.
6150template< typename MT, typename VT >
6151struct IsAligned< DMatDVecMultExpr<MT,VT> >
6152 : public BoolConstant< IsAligned_v<MT> && IsAligned_v<VT> >
6153{};
6155//*************************************************************************************************
6156
6157} // namespace blaze
6158
6159#endif
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for the blaze::checked and blaze::unchecked instances.
Constraint on the transpose flag of vector types.
Header file for the complex data type.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Deactivation of problematic macros.
Header file for the multiplication trait.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Header file for all SIMD functionality.
Data type constraint.
Constraint on the data type.
Expression object for dense matrix-dense vector multiplications.
Definition: DMatDVecMultExpr.h:127
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatDVecMultExpr.h:231
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:217
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatDVecMultExpr.h:244
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:267
If_t< IsExpression_v< VT >, const VT, const VT & > RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:220
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:391
CompositeType_t< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:135
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:140
DMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:253
If_t< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:223
CompositeType_t< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:134
ElementType_t< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:133
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:378
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:390
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:334
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:358
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:214
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:210
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:211
static constexpr bool evaluateVector
Compilation switch for the composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:147
ElementType_t< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:132
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:301
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatDVecMultExpr.h:238
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:213
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:346
ResultType_t< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:130
ResultType_t< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:131
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:368
If_t< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:226
MultTrait_t< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:209
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDVecMultExpr.h:212
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:324
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:314
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:530
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:430
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:461
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:520
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:540
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:584
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:163
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:169
VecScalarMultExpr< DenseVector< This, TF > > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:166
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:440
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:110
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:552
If_t< useAssign, const ResultType, const DVecScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:176
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:179
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:112
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:170
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:435
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:449
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:182
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:574
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:168
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:564
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:173
Base class for dense matrices.
Definition: DenseMatrix.h:82
Base class for N-dimensional dense vectors.
Definition: DenseVector.h:77
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseVector base class.
Header file for the MatMatMultExpr base class.
Header file for the MatVecMultExpr base class.
Header file for the VecScalarMultExpr base class.
Header file for BLAS general matrix/vector multiplication functions (gemv)
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).
Definition: BLAS.h:169
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2156
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.
Definition: MatMatMultExpr.h:83
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatVecMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.
Definition: ColumnVector.h:61
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template.
Definition: MultTrait.h:165
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:221
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector.
Definition: DenseVector.h:192
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
Header file for the exception macros of the math module.
Header file for all forward declarations for expression class templates.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base class for all matrix/vector multiplication expression templates.
Definition: MatVecMultExpr.h:69
System settings for the BLAS mode.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.