DMatDVecMultExpr.h
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/Exception.h>
60 #include <blaze/math/shims/Reset.h>
62 #include <blaze/math/SIMD.h>
82 #include <blaze/math/views/Check.h>
83 #include <blaze/system/BLAS.h>
86 #include <blaze/util/Assert.h>
87 #include <blaze/util/Complex.h>
89 #include <blaze/util/DisableIf.h>
90 #include <blaze/util/EnableIf.h>
93 #include <blaze/util/mpl/If.h>
94 #include <blaze/util/Types.h>
102 
103 
104 namespace blaze {
105 
106 //=================================================================================================
107 //
108 // CLASS DMATDVECMULTEXPR
109 //
110 //=================================================================================================
111 
112 //*************************************************************************************************
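// Expression object representing the multiplication of a row-major dense matrix with a
// non-transpose dense vector. Instances are not created directly but via operator*; a
// minimal usage sketch (assuming blaze::DynamicMatrix and blaze::DynamicVector):
//
//    blaze::DynamicMatrix<double,blaze::rowMajor>     A( 4UL, 3UL );  // initialized elsewhere
//    blaze::DynamicVector<double,blaze::columnVector> x( 3UL ), y;
//    y = A * x;  // creates a DMatDVecMultExpr<MT,VT>, evaluated by the assign() kernels below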
119 template< typename MT // Type of the left-hand side dense matrix
120  , typename VT > // Type of the right-hand side dense vector
121 class DMatDVecMultExpr
122  : public MatVecMultExpr< DenseVector< DMatDVecMultExpr<MT,VT>, false > >
123  , private Computation
124 {
125  private:
126  //**Type definitions****************************************************************************
127  using MRT = ResultType_t<MT>;     //!< Result type of the left-hand side dense matrix expression.
128  using VRT = ResultType_t<VT>;     //!< Result type of the right-hand side dense vector expression.
129  using MET = ElementType_t<MRT>;   //!< Element type of the left-hand side dense matrix expression.
130  using VET = ElementType_t<VRT>;   //!< Element type of the right-hand side dense vector expression.
131  using MCT = CompositeType_t<MT>;  //!< Composite type of the left-hand side dense matrix expression.
132  using VCT = CompositeType_t<VT>;  //!< Composite type of the right-hand side dense vector expression.
133  //**********************************************************************************************
134 
135  //**********************************************************************************************
137  static constexpr bool evaluateMatrix =
138  ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
139  IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
140  //**********************************************************************************************
141 
142  //**********************************************************************************************
144  static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
145  //**********************************************************************************************
146 
147  //**********************************************************************************************
149 
153  template< typename T1 >
154  static constexpr bool UseSMPAssign_v = ( evaluateMatrix || evaluateVector );
156  //**********************************************************************************************
157 
158  //**********************************************************************************************
160 
163  template< typename T1, typename T2, typename T3 >
164  static constexpr bool UseBlasKernel_v =
165  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION &&
166  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
167  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
168  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
169  !IsDiagonal_v<T2> &&
170  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
171  IsBLASCompatible_v< ElementType_t<T1> > &&
172  IsBLASCompatible_v< ElementType_t<T2> > &&
173  IsBLASCompatible_v< ElementType_t<T3> > &&
174  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
175  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
177  //**********************************************************************************************
178 
179  //**********************************************************************************************
181 
185  template< typename T1, typename T2, typename T3 >
186  static constexpr bool UseVectorizedDefaultKernel_v =
187  ( useOptimizedKernels &&
188  !IsDiagonal_v<T2> &&
189  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
190  IsSIMDCombinable_v< ElementType_t<T1>
191  , ElementType_t<T2>
192  , ElementType_t<T3> > &&
193  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
194  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
196  //**********************************************************************************************
197 
198  public:
199  //**Type definitions****************************************************************************
200  using This          = DMatDVecMultExpr<MT,VT>;      //!< Type of this DMatDVecMultExpr instance.
201  using BaseType      = DenseVector<This,false>;      //!< Base type of this DMatDVecMultExpr instance.
202  using ResultType    = MultTrait_t<MRT,VRT>;         //!< Result type for expression template evaluations.
203  using TransposeType = TransposeType_t<ResultType>;  //!< Transpose type for expression template evaluations.
204  using ElementType   = ElementType_t<ResultType>;    //!< Resulting element type.
205  using SIMDType      = SIMDTrait_t<ElementType>;     //!< Resulting SIMD element type.
206  using ReturnType = const ElementType;
207  using CompositeType = const ResultType;
208 
210  using LeftOperand = If_t< IsExpression_v<MT>, const MT, const MT& >;
211 
213  using RightOperand = If_t< IsExpression_v<VT>, const VT, const VT& >;
214 
216  using LT = If_t< evaluateMatrix, const MRT, MCT >;
217 
219  using RT = If_t< evaluateVector, const VRT, VCT >;
220  //**********************************************************************************************
221 
222  //**Compilation flags***************************************************************************
224  static constexpr bool simdEnabled =
225  ( !IsDiagonal_v<MT> &&
226  MT::simdEnabled && VT::simdEnabled &&
227  HasSIMDAdd_v<MET,VET> &&
228  HasSIMDMult_v<MET,VET> );
229 
231  static constexpr bool smpAssignable =
232  ( !evaluateMatrix && MT::smpAssignable && !evaluateVector && VT::smpAssignable );
233  //**********************************************************************************************
234 
235  //**SIMD properties*****************************************************************************
237  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
238  //**********************************************************************************************
239 
240  //**Constructor*********************************************************************************
246  explicit inline DMatDVecMultExpr( const MT& mat, const VT& vec ) noexcept
247  : mat_( mat ) // Left-hand side dense matrix of the multiplication expression
248  , vec_( vec ) // Right-hand side dense vector of the multiplication expression
249  {
250  BLAZE_INTERNAL_ASSERT( mat_.columns() == vec_.size(), "Invalid matrix and vector sizes" );
251  }
252  //**********************************************************************************************
253 
254  //**Subscript operator**************************************************************************
260  inline ReturnType operator[]( size_t index ) const {
261  BLAZE_INTERNAL_ASSERT( index < mat_.rows(), "Invalid vector access index" );
262 
263  if( IsDiagonal_v<MT> )
264  {
265  return mat_(index,index) * vec_[index];
266  }
267  else if( IsLower_v<MT> && ( index + 8UL < mat_.rows() ) )
268  {
269  const size_t n( IsStrictlyLower_v<MT> ? index : index+1UL );
270  return subvector( row( mat_, index, unchecked ), 0UL, n, unchecked ) *
271  subvector( vec_, 0UL, n, unchecked );
272  }
273  else if( IsUpper_v<MT> && ( index > 8UL ) )
274  {
275  const size_t begin( IsStrictlyUpper_v<MT> ? index+1UL : index );
276  const size_t n ( mat_.columns() - begin );
277  return subvector( row( mat_, index, unchecked ), begin, n, unchecked ) *
278  subvector( vec_, begin, n, unchecked );
279  }
280  else
281  {
282  return row( mat_, index, unchecked ) * vec_;
283  }
284  }
285  //**********************************************************************************************
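  // The subscript operator evaluates a single element of the product on demand, i.e.
  // y[i] = sum_j A(i,j) * x[j]. For diagonal matrices this collapses to A(i,i) * x[i];
  // for sufficiently large lower (upper) triangular matrices the dot product is restricted
  // to the structurally non-zero prefix (suffix) of row i via subvector views.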
286 
287  //**At function*********************************************************************************
294  inline ReturnType at( size_t index ) const {
295  if( index >= mat_.rows() ) {
296  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
297  }
298  return (*this)[index];
299  }
300  //**********************************************************************************************
301 
302  //**Size function*******************************************************************************
307  inline size_t size() const noexcept {
308  return mat_.rows();
309  }
310  //**********************************************************************************************
311 
312  //**Left operand access*************************************************************************
317  inline LeftOperand leftOperand() const noexcept {
318  return mat_;
319  }
320  //**********************************************************************************************
321 
322  //**Right operand access************************************************************************
327  inline RightOperand rightOperand() const noexcept {
328  return vec_;
329  }
330  //**********************************************************************************************
331 
332  //**********************************************************************************************
338  template< typename T >
339  inline bool canAlias( const T* alias ) const noexcept {
340  return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
341  }
342  //**********************************************************************************************
343 
344  //**********************************************************************************************
350  template< typename T >
351  inline bool isAliased( const T* alias ) const noexcept {
352  return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
353  }
354  //**********************************************************************************************
355 
356  //**********************************************************************************************
361  inline bool isAligned() const noexcept {
362  return mat_.isAligned() && vec_.isAligned();
363  }
364  //**********************************************************************************************
365 
366  //**********************************************************************************************
371  inline bool canSMPAssign() const noexcept {
372  return ( !BLAZE_BLAS_MODE ||
375  ( IsComputation_v<MT> && !evaluateMatrix ) ||
376  ( mat_.rows() * mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
377  ( size() > SMP_DMATDVECMULT_THRESHOLD );
378  }
379  //**********************************************************************************************
380 
381  private:
382  //**Member variables****************************************************************************
383  LeftOperand  mat_;  //!< Left-hand side dense matrix of the multiplication expression.
384  RightOperand vec_;  //!< Right-hand side dense vector of the multiplication expression.
385  //**********************************************************************************************
386 
387  //**Assignment to dense vectors*****************************************************************
400  template< typename VT1 > // Type of the target dense vector
401  friend inline void assign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
402  {
404 
405  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
406 
407  if( rhs.mat_.rows() == 0UL ) {
408  return;
409  }
410  else if( rhs.mat_.columns() == 0UL ) {
411  reset( ~lhs );
412  return;
413  }
414 
415  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
416  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
417 
418  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
419  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
420  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
421  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
422 
423  DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
424  }
426  //**********************************************************************************************
427 
428  //**Assignment to dense vectors (kernel selection)**********************************************
439  template< typename VT1 // Type of the left-hand side target vector
440  , typename MT1 // Type of the left-hand side matrix operand
441  , typename VT2 > // Type of the right-hand side vector operand
442  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
443  {
444  if( ( IsDiagonal_v<MT1> ) ||
445  ( IsComputation_v<MT> && !evaluateMatrix ) ||
446  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
447  selectSmallAssignKernel( y, A, x );
448  else
449  selectBlasAssignKernel( y, A, x );
450  }
452  //**********************************************************************************************
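  // Dispatch heuristic: the small (vectorized default) kernel is chosen for diagonal
  // matrices, for non-evaluated computation operands, and whenever rows*columns is below
  // DMATDVECMULT_THRESHOLD; everything else is forwarded to the BLAS-based kernel, which
  // itself falls back to the large default kernel if no matching BLAS routine is available.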
453 
454  //**Default assignment to dense vectors*********************************************************
468  template< typename VT1 // Type of the left-hand side target vector
469  , typename MT1 // Type of the left-hand side matrix operand
470  , typename VT2 > // Type of the right-hand side vector operand
471  static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
472  {
473  y.assign( A * x );
474  }
476  //**********************************************************************************************
477 
478  //**Default assignment to dense vectors (small matrices)****************************************
492  template< typename VT1 // Type of the left-hand side target vector
493  , typename MT1 // Type of the left-hand side matrix operand
494  , typename VT2 > // Type of the right-hand side vector operand
495  static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
496  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
497  {
498  selectDefaultAssignKernel( y, A, x );
499  }
501  //**********************************************************************************************
502 
503  //**Vectorized default assignment to dense vectors (small matrices)*****************************
517  template< typename VT1 // Type of the left-hand side target vector
518  , typename MT1 // Type of the left-hand side matrix operand
519  , typename VT2 > // Type of the right-hand side vector operand
520  static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
521  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
522  {
523  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
524 
525  const size_t M( A.rows() );
526  const size_t N( A.columns() );
527 
528  size_t i( 0UL );
529 
530  for( ; (i+8UL) <= M; i+=8UL )
531  {
532  const size_t jbegin( ( IsUpper_v<MT1> )
533  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
534  :( 0UL ) );
535  const size_t jend( ( IsLower_v<MT1> )
536  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
537  :( N ) );
538  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
539 
540  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
541  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
542 
543  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
544  size_t j( jbegin );
545 
546  for( ; j<jpos; j+=SIMDSIZE ) {
547  const SIMDType x1( x.load(j) );
548  xmm1 += A.load(i ,j) * x1;
549  xmm2 += A.load(i+1UL,j) * x1;
550  xmm3 += A.load(i+2UL,j) * x1;
551  xmm4 += A.load(i+3UL,j) * x1;
552  xmm5 += A.load(i+4UL,j) * x1;
553  xmm6 += A.load(i+5UL,j) * x1;
554  xmm7 += A.load(i+6UL,j) * x1;
555  xmm8 += A.load(i+7UL,j) * x1;
556  }
557 
558  y[i ] = sum( xmm1 );
559  y[i+1UL] = sum( xmm2 );
560  y[i+2UL] = sum( xmm3 );
561  y[i+3UL] = sum( xmm4 );
562  y[i+4UL] = sum( xmm5 );
563  y[i+5UL] = sum( xmm6 );
564  y[i+6UL] = sum( xmm7 );
565  y[i+7UL] = sum( xmm8 );
566 
567  for( ; remainder && j<jend; ++j ) {
568  y[i ] += A(i ,j) * x[j];
569  y[i+1UL] += A(i+1UL,j) * x[j];
570  y[i+2UL] += A(i+2UL,j) * x[j];
571  y[i+3UL] += A(i+3UL,j) * x[j];
572  y[i+4UL] += A(i+4UL,j) * x[j];
573  y[i+5UL] += A(i+5UL,j) * x[j];
574  y[i+6UL] += A(i+6UL,j) * x[j];
575  y[i+7UL] += A(i+7UL,j) * x[j];
576  }
577  }
578 
579  for( ; (i+4UL) <= M; i+=4UL )
580  {
581  const size_t jbegin( ( IsUpper_v<MT1> )
582  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
583  :( 0UL ) );
584  const size_t jend( ( IsLower_v<MT1> )
585  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
586  :( N ) );
587  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
588 
589  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
590  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
591 
592  SIMDType xmm1, xmm2, xmm3, xmm4;
593  size_t j( jbegin );
594 
595  for( ; j<jpos; j+=SIMDSIZE ) {
596  const SIMDType x1( x.load(j) );
597  xmm1 += A.load(i ,j) * x1;
598  xmm2 += A.load(i+1UL,j) * x1;
599  xmm3 += A.load(i+2UL,j) * x1;
600  xmm4 += A.load(i+3UL,j) * x1;
601  }
602 
603  y[i ] = sum( xmm1 );
604  y[i+1UL] = sum( xmm2 );
605  y[i+2UL] = sum( xmm3 );
606  y[i+3UL] = sum( xmm4 );
607 
608  for( ; remainder && j<jend; ++j ) {
609  y[i ] += A(i ,j) * x[j];
610  y[i+1UL] += A(i+1UL,j) * x[j];
611  y[i+2UL] += A(i+2UL,j) * x[j];
612  y[i+3UL] += A(i+3UL,j) * x[j];
613  }
614  }
615 
616  for( ; (i+3UL) <= M; i+=3UL )
617  {
618  const size_t jbegin( ( IsUpper_v<MT1> )
619  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
620  :( 0UL ) );
621  const size_t jend( ( IsLower_v<MT1> )
622  ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
623  :( N ) );
624  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
625 
626  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
627  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
628 
629  SIMDType xmm1, xmm2, xmm3;
630  size_t j( jbegin );
631 
632  for( ; j<jpos; j+=SIMDSIZE ) {
633  const SIMDType x1( x.load(j) );
634  xmm1 += A.load(i ,j) * x1;
635  xmm2 += A.load(i+1UL,j) * x1;
636  xmm3 += A.load(i+2UL,j) * x1;
637  }
638 
639  y[i ] = sum( xmm1 );
640  y[i+1UL] = sum( xmm2 );
641  y[i+2UL] = sum( xmm3 );
642 
643  for( ; remainder && j<jend; ++j ) {
644  y[i ] += A(i ,j) * x[j];
645  y[i+1UL] += A(i+1UL,j) * x[j];
646  y[i+2UL] += A(i+2UL,j) * x[j];
647  }
648  }
649 
650  for( ; (i+2UL) <= M; i+=2UL )
651  {
652  const size_t jbegin( ( IsUpper_v<MT1> )
653  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
654  :( 0UL ) );
655  const size_t jend( ( IsLower_v<MT1> )
656  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
657  :( N ) );
658  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
659 
660  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
661  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
662 
663  SIMDType xmm1, xmm2;
664  size_t j( jbegin );
665 
666  for( ; j<jpos; j+=SIMDSIZE ) {
667  const SIMDType x1( x.load(j) );
668  xmm1 += A.load(i ,j) * x1;
669  xmm2 += A.load(i+1UL,j) * x1;
670  }
671 
672  y[i ] = sum( xmm1 );
673  y[i+1UL] = sum( xmm2 );
674 
675  for( ; remainder && j<jend; ++j ) {
676  y[i ] += A(i ,j) * x[j];
677  y[i+1UL] += A(i+1UL,j) * x[j];
678  }
679  }
680 
681  if( i < M )
682  {
683  const size_t jbegin( ( IsUpper_v<MT1> )
684  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
685  :( 0UL ) );
686  const size_t jend( ( IsLower_v<MT1> )
687  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
688  :( N ) );
689  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
690 
691  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
692  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
693 
694  SIMDType xmm1;
695  size_t j( jbegin );
696 
697  for( ; j<jpos; j+=SIMDSIZE ) {
698  xmm1 += A.load(i,j) * x.load(j);
699  }
700 
701  y[i] = sum( xmm1 );
702 
703  for( ; remainder && j<jend; ++j ) {
704  y[i] += A(i,j) * x[j];
705  }
706  }
707  }
709  //**********************************************************************************************
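  // The small-matrix kernel above blocks the result by rows (8, 4, 3, 2, and finally a single
  // row per pass). Each block accumulates SIMD partial sums over the columns, reduces them
  // horizontally via sum(), and finishes any unpadded remainder columns with a scalar loop.
  // For triangular matrices, jbegin/jend restrict the column range to the non-zero part of each block.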
710 
711  //**Default assignment to dense vectors (large matrices)****************************************
725  template< typename VT1 // Type of the left-hand side target vector
726  , typename MT1 // Type of the left-hand side matrix operand
727  , typename VT2 > // Type of the right-hand side vector operand
728  static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
729  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
730  {
731  selectDefaultAssignKernel( y, A, x );
732  }
734  //**********************************************************************************************
735 
736  //**Vectorized default assignment to dense vectors (large matrices)*****************************
750  template< typename VT1 // Type of the left-hand side target vector
751  , typename MT1 // Type of the left-hand side matrix operand
752  , typename VT2 > // Type of the right-hand side vector operand
753  static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
754  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
755  {
756  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
757 
758  const size_t M( A.rows() );
759  const size_t N( A.columns() );
760 
761  reset( y );
762 
763  size_t i( 0UL );
764 
765  for( ; (i+8UL) <= M; i+=8UL )
766  {
767  const size_t jbegin( ( IsUpper_v<MT1> )
768  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
769  :( 0UL ) );
770  const size_t jend( ( IsLower_v<MT1> )
771  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
772  :( N ) );
773  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
774 
775  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
776  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
777 
778  size_t j( jbegin );
779 
780  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
781  const size_t j1( j+SIMDSIZE );
782  const size_t j2( j+SIMDSIZE*2UL );
783  const size_t j3( j+SIMDSIZE*3UL );
784  const SIMDType x1( x.load(j ) );
785  const SIMDType x2( x.load(j1) );
786  const SIMDType x3( x.load(j2) );
787  const SIMDType x4( x.load(j3) );
788  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
789  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
790  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
791  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
792  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
793  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
794  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
795  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
796  }
797 
798  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
799  const size_t j1( j+SIMDSIZE );
800  const SIMDType x1( x.load(j ) );
801  const SIMDType x2( x.load(j1) );
802  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
803  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
804  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
805  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
806  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
807  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
808  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
809  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
810  }
811 
812  for( ; j<jpos; j+=SIMDSIZE ) {
813  const SIMDType x1( x.load(j) );
814  y[i ] += sum( A.load(i ,j) * x1 );
815  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
816  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
817  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
818  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
819  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
820  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
821  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
822  }
823 
824  for( ; remainder && j<jend; ++j ) {
825  y[i ] += A(i ,j) * x[j];
826  y[i+1UL] += A(i+1UL,j) * x[j];
827  y[i+2UL] += A(i+2UL,j) * x[j];
828  y[i+3UL] += A(i+3UL,j) * x[j];
829  y[i+4UL] += A(i+4UL,j) * x[j];
830  y[i+5UL] += A(i+5UL,j) * x[j];
831  y[i+6UL] += A(i+6UL,j) * x[j];
832  y[i+7UL] += A(i+7UL,j) * x[j];
833  }
834  }
835 
836  for( ; (i+4UL) <= M; i+=4UL )
837  {
838  const size_t jbegin( ( IsUpper_v<MT1> )
839  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
840  :( 0UL ) );
841  const size_t jend( ( IsLower_v<MT1> )
842  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
843  :( N ) );
844  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
845 
846  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
847  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
848 
849  size_t j( jbegin );
850 
851  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
852  const size_t j1( j+SIMDSIZE );
853  const size_t j2( j+SIMDSIZE*2UL );
854  const size_t j3( j+SIMDSIZE*3UL );
855  const SIMDType x1( x.load(j ) );
856  const SIMDType x2( x.load(j1) );
857  const SIMDType x3( x.load(j2) );
858  const SIMDType x4( x.load(j3) );
859  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
860  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
861  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
862  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
863  }
864 
865  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
866  const size_t j1( j+SIMDSIZE );
867  const SIMDType x1( x.load(j ) );
868  const SIMDType x2( x.load(j1) );
869  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
870  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
871  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
872  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
873  }
874 
875  for( ; j<jpos; j+=SIMDSIZE ) {
876  const SIMDType x1( x.load(j) );
877  y[i ] += sum( A.load(i ,j) * x1 );
878  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
879  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
880  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
881  }
882 
883  for( ; remainder && j<jend; ++j ) {
884  y[i ] += A(i ,j) * x[j];
885  y[i+1UL] += A(i+1UL,j) * x[j];
886  y[i+2UL] += A(i+2UL,j) * x[j];
887  y[i+3UL] += A(i+3UL,j) * x[j];
888  }
889  }
890 
891  for( ; (i+2UL) <= M; i+=2UL )
892  {
893  const size_t jbegin( ( IsUpper_v<MT1> )
894  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
895  :( 0UL ) );
896  const size_t jend( ( IsLower_v<MT1> )
897  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
898  :( N ) );
899  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
900 
901  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
902  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
903 
904  size_t j( jbegin );
905 
906  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
907  const size_t j1( j+SIMDSIZE );
908  const size_t j2( j+SIMDSIZE*2UL );
909  const size_t j3( j+SIMDSIZE*3UL );
910  const SIMDType x1( x.load(j ) );
911  const SIMDType x2( x.load(j1) );
912  const SIMDType x3( x.load(j2) );
913  const SIMDType x4( x.load(j3) );
914  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
915  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
916  }
917 
918  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
919  const size_t j1( j+SIMDSIZE );
920  const SIMDType x1( x.load(j ) );
921  const SIMDType x2( x.load(j1) );
922  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
923  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
924  }
925 
926  for( ; j<jpos; j+=SIMDSIZE ) {
927  const SIMDType x1( x.load(j) );
928  y[i ] += sum( A.load(i ,j) * x1 );
929  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
930  }
931 
932  for( ; remainder && j<jend; ++j ) {
933  y[i ] += A(i ,j) * x[j];
934  y[i+1UL] += A(i+1UL,j) * x[j];
935  }
936  }
937 
938  if( i < M )
939  {
940  const size_t jbegin( ( IsUpper_v<MT1> )
941  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
942  :( 0UL ) );
943  const size_t jend( ( IsLower_v<MT1> )
944  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
945  :( N ) );
946  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
947 
948  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
949  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
950 
951  size_t j( jbegin );
952 
953  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
954  const size_t j1( j+SIMDSIZE );
955  const size_t j2( j+SIMDSIZE*2UL );
956  const size_t j3( j+SIMDSIZE*3UL );
957  const SIMDType x1( x.load(j ) );
958  const SIMDType x2( x.load(j1) );
959  const SIMDType x3( x.load(j2) );
960  const SIMDType x4( x.load(j3) );
961  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
962  }
963 
964  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
965  const size_t j1( j+SIMDSIZE );
966  const SIMDType x1( x.load(j ) );
967  const SIMDType x2( x.load(j1) );
968  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
969  }
970 
971  for( ; j<jpos; j+=SIMDSIZE ) {
972  const SIMDType x1( x.load(j) );
973  y[i] += sum( A.load(i,j) * x1 );
974  }
975 
976  for( ; remainder && j<jend; ++j ) {
977  y[i] += A(i,j) * x[j];
978  }
979  }
980  }
982  //**********************************************************************************************
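  // The large-matrix kernel above first resets y and then, within each row block, unrolls the
  // column loop over four and two SIMD vectors before the single-vector and scalar remainder
  // loops, adding each horizontally reduced partial product directly to y. This trades more
  // frequent horizontal reductions for fewer live SIMD accumulators on wide rows.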
983 
984  //**BLAS-based assignment to dense vectors (default)********************************************
998  template< typename VT1 // Type of the left-hand side target vector
999  , typename MT1 // Type of the left-hand side matrix operand
1000  , typename VT2 > // Type of the right-hand side vector operand
1001  static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1002  -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1003  {
1004  selectLargeAssignKernel( y, A, x );
1005  }
1007  //**********************************************************************************************
1008 
1009  //**BLAS-based assignment to dense vectors******************************************************
1010 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1011 
1024  template< typename VT1 // Type of the left-hand side target vector
1025  , typename MT1 // Type of the left-hand side matrix operand
1026  , typename VT2 > // Type of the right-hand side vector operand
1027  static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1028  -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1029  {
1030  using ET = ElementType_t<VT1>;
1031 
1032  if( IsTriangular_v<MT1> ) {
1033  assign( y, x );
1034  trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1035  }
1036  else {
1037  gemv( y, A, x, ET(1), ET(0) );
1038  }
1039  }
1041 #endif
1042  //**********************************************************************************************
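  // When a suitable BLAS backend is enabled, triangular matrices are handled by copying x into y
  // and performing an in-place triangular multiply (trmv); the general case maps to gemv with
  // alpha = 1 and beta = 0.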
1043 
1044  //**Assignment to sparse vectors****************************************************************
1057  template< typename VT1 > // Type of the target sparse vector
1058  friend inline void assign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1059  {
1061 
1065 
1066  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1067 
1068  const ResultType tmp( serial( rhs ) );
1069  assign( ~lhs, tmp );
1070  }
1072  //**********************************************************************************************
1073 
1074  //**Addition assignment to dense vectors********************************************************
1087  template< typename VT1 > // Type of the target dense vector
1088  friend inline void addAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1089  {
1091 
1092  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1093 
1094  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1095  return;
1096  }
1097 
1098  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1099  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1100 
1101  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1102  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1103  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1104  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1105 
1106  DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
1107  }
1109  //**********************************************************************************************
1110 
1111  //**Addition assignment to dense vectors (kernel selection)*************************************
1122  template< typename VT1 // Type of the left-hand side target vector
1123  , typename MT1 // Type of the left-hand side matrix operand
1124  , typename VT2 > // Type of the right-hand side vector operand
1125  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1126  {
1127  if( ( IsDiagonal_v<MT1> ) ||
1128  ( IsComputation_v<MT> && !evaluateMatrix ) ||
1129  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1130  selectSmallAddAssignKernel( y, A, x );
1131  else
1132  selectBlasAddAssignKernel( y, A, x );
1133  }
1135  //**********************************************************************************************
1136 
1137  //**Default addition assignment to dense vectors************************************************
1151  template< typename VT1 // Type of the left-hand side target vector
1152  , typename MT1 // Type of the left-hand side matrix operand
1153  , typename VT2 > // Type of the right-hand side vector operand
1154  static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1155  {
1156  y.addAssign( A * x );
1157  }
1159  //**********************************************************************************************
1160 
1161  //**Default addition assignment to dense vectors (small matrices)*******************************
1175  template< typename VT1 // Type of the left-hand side target vector
1176  , typename MT1 // Type of the left-hand side matrix operand
1177  , typename VT2 > // Type of the right-hand side vector operand
1178  static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1179  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1180  {
1181  selectDefaultAddAssignKernel( y, A, x );
1182  }
1184  //**********************************************************************************************
1185 
1186  //**Vectorized default addition assignment to dense vectors (small matrices)********************
1200  template< typename VT1 // Type of the left-hand side target vector
1201  , typename MT1 // Type of the left-hand side matrix operand
1202  , typename VT2 > // Type of the right-hand side vector operand
1203  static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1204  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1205  {
1206  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
1207 
1208  const size_t M( A.rows() );
1209  const size_t N( A.columns() );
1210 
1211  size_t i( 0UL );
1212 
1213  for( ; (i+8UL) <= M; i+=8UL )
1214  {
1215  const size_t jbegin( ( IsUpper_v<MT1> )
1216  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1217  :( 0UL ) );
1218  const size_t jend( ( IsLower_v<MT1> )
1219  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
1220  :( N ) );
1221  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1222 
1223  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1224  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1225 
1226  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1227  size_t j( jbegin );
1228 
1229  for( ; j<jpos; j+=SIMDSIZE ) {
1230  const SIMDType x1( x.load(j) );
1231  xmm1 += A.load(i ,j) * x1;
1232  xmm2 += A.load(i+1UL,j) * x1;
1233  xmm3 += A.load(i+2UL,j) * x1;
1234  xmm4 += A.load(i+3UL,j) * x1;
1235  xmm5 += A.load(i+4UL,j) * x1;
1236  xmm6 += A.load(i+5UL,j) * x1;
1237  xmm7 += A.load(i+6UL,j) * x1;
1238  xmm8 += A.load(i+7UL,j) * x1;
1239  }
1240 
1241  y[i ] += sum( xmm1 );
1242  y[i+1UL] += sum( xmm2 );
1243  y[i+2UL] += sum( xmm3 );
1244  y[i+3UL] += sum( xmm4 );
1245  y[i+4UL] += sum( xmm5 );
1246  y[i+5UL] += sum( xmm6 );
1247  y[i+6UL] += sum( xmm7 );
1248  y[i+7UL] += sum( xmm8 );
1249 
1250  for( ; remainder && j<jend; ++j ) {
1251  y[i ] += A(i ,j) * x[j];
1252  y[i+1UL] += A(i+1UL,j) * x[j];
1253  y[i+2UL] += A(i+2UL,j) * x[j];
1254  y[i+3UL] += A(i+3UL,j) * x[j];
1255  y[i+4UL] += A(i+4UL,j) * x[j];
1256  y[i+5UL] += A(i+5UL,j) * x[j];
1257  y[i+6UL] += A(i+6UL,j) * x[j];
1258  y[i+7UL] += A(i+7UL,j) * x[j];
1259  }
1260  }
1261 
1262  for( ; (i+4UL) <= M; i+=4UL )
1263  {
1264  const size_t jbegin( ( IsUpper_v<MT1> )
1265  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1266  :( 0UL ) );
1267  const size_t jend( ( IsLower_v<MT1> )
1268  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
1269  :( N ) );
1270  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1271 
1272  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1273  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1274 
1275  SIMDType xmm1, xmm2, xmm3, xmm4;
1276  size_t j( jbegin );
1277 
1278  for( ; j<jpos; j+=SIMDSIZE ) {
1279  const SIMDType x1( x.load(j) );
1280  xmm1 += A.load(i ,j) * x1;
1281  xmm2 += A.load(i+1UL,j) * x1;
1282  xmm3 += A.load(i+2UL,j) * x1;
1283  xmm4 += A.load(i+3UL,j) * x1;
1284  }
1285 
1286  y[i ] += sum( xmm1 );
1287  y[i+1UL] += sum( xmm2 );
1288  y[i+2UL] += sum( xmm3 );
1289  y[i+3UL] += sum( xmm4 );
1290 
1291  for( ; remainder && j<jend; ++j ) {
1292  y[i ] += A(i ,j) * x[j];
1293  y[i+1UL] += A(i+1UL,j) * x[j];
1294  y[i+2UL] += A(i+2UL,j) * x[j];
1295  y[i+3UL] += A(i+3UL,j) * x[j];
1296  }
1297  }
1298 
1299  for( ; (i+3UL) <= M; i+=3UL )
1300  {
1301  const size_t jbegin( ( IsUpper_v<MT1> )
1302  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1303  :( 0UL ) );
1304  const size_t jend( ( IsLower_v<MT1> )
1305  ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
1306  :( N ) );
1307  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1308 
1309  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1310  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1311 
1312  SIMDType xmm1, xmm2, xmm3;
1313  size_t j( jbegin );
1314 
1315  for( ; j<jpos; j+=SIMDSIZE ) {
1316  const SIMDType x1( x.load(j) );
1317  xmm1 += A.load(i ,j) * x1;
1318  xmm2 += A.load(i+1UL,j) * x1;
1319  xmm3 += A.load(i+2UL,j) * x1;
1320  }
1321 
1322  y[i ] += sum( xmm1 );
1323  y[i+1UL] += sum( xmm2 );
1324  y[i+2UL] += sum( xmm3 );
1325 
1326  for( ; remainder && j<jend; ++j ) {
1327  y[i ] += A(i ,j) * x[j];
1328  y[i+1UL] += A(i+1UL,j) * x[j];
1329  y[i+2UL] += A(i+2UL,j) * x[j];
1330  }
1331  }
1332 
1333  for( ; (i+2UL) <= M; i+=2UL )
1334  {
1335  const size_t jbegin( ( IsUpper_v<MT1> )
1336  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1337  :( 0UL ) );
1338  const size_t jend( ( IsLower_v<MT1> )
1339  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1340  :( N ) );
1341  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1342 
1343  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1344  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1345 
1346  SIMDType xmm1, xmm2;
1347  size_t j( jbegin );
1348 
1349  for( ; j<jpos; j+=SIMDSIZE ) {
1350  const SIMDType x1( x.load(j) );
1351  xmm1 += A.load(i ,j) * x1;
1352  xmm2 += A.load(i+1UL,j) * x1;
1353  }
1354 
1355  y[i ] += sum( xmm1 );
1356  y[i+1UL] += sum( xmm2 );
1357 
1358  for( ; remainder && j<jend; ++j ) {
1359  y[i ] += A(i ,j) * x[j];
1360  y[i+1UL] += A(i+1UL,j) * x[j];
1361  }
1362  }
1363 
1364  if( i < M )
1365  {
1366  const size_t jbegin( ( IsUpper_v<MT1> )
1367  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1368  :( 0UL ) );
1369  const size_t jend( ( IsLower_v<MT1> )
1370  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1371  :( N ) );
1372  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1373 
1374  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1375  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1376 
1377  SIMDType xmm1;
1378  size_t j( jbegin );
1379 
1380  for( ; j<jpos; j+=SIMDSIZE ) {
1381  xmm1 += A.load(i,j) * x.load(j);
1382  }
1383 
1384  y[i] += sum( xmm1 );
1385 
1386  for( ; remainder && j<jend; ++j ) {
1387  y[i] += A(i,j) * x[j];
1388  }
1389  }
1390  }
1392  //**********************************************************************************************
1393 
1394  //**Default addition assignment to dense vectors (large matrices)*******************************
1408  template< typename VT1 // Type of the left-hand side target vector
1409  , typename MT1 // Type of the left-hand side matrix operand
1410  , typename VT2 > // Type of the right-hand side vector operand
1411  static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1412  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1413  {
1414  selectDefaultAddAssignKernel( y, A, x );
1415  }
1417  //**********************************************************************************************
1418 
1419  //**Vectorized default addition assignment to dense vectors (large matrices)********************
1433  template< typename VT1 // Type of the left-hand side target vector
1434  , typename MT1 // Type of the left-hand side matrix operand
1435  , typename VT2 > // Type of the right-hand side vector operand
1436  static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1437  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1438  {
1439  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
1440 
1441  const size_t M( A.rows() );
1442  const size_t N( A.columns() );
1443 
1444  size_t i( 0UL );
1445 
1446  for( ; (i+8UL) <= M; i+=8UL )
1447  {
1448  const size_t jbegin( ( IsUpper_v<MT1> )
1449  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1450  :( 0UL ) );
1451  const size_t jend( ( IsLower_v<MT1> )
1452  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
1453  :( N ) );
1454  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1455 
1456  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1457  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1458 
1459  size_t j( jbegin );
1460 
1461  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1462  const size_t j1( j+SIMDSIZE );
1463  const size_t j2( j+SIMDSIZE*2UL );
1464  const size_t j3( j+SIMDSIZE*3UL );
1465  const SIMDType x1( x.load(j ) );
1466  const SIMDType x2( x.load(j1) );
1467  const SIMDType x3( x.load(j2) );
1468  const SIMDType x4( x.load(j3) );
1469  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1470  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1471  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1472  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1473  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1474  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1475  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1476  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
1477  }
1478 
1479  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1480  const size_t j1( j+SIMDSIZE );
1481  const SIMDType x1( x.load(j ) );
1482  const SIMDType x2( x.load(j1) );
1483  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1484  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1485  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1486  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1487  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1488  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1489  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1490  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
1491  }
1492 
1493  for( ; j<jpos; j+=SIMDSIZE ) {
1494  const SIMDType x1( x.load(j) );
1495  y[i ] += sum( A.load(i ,j) * x1 );
1496  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1497  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1498  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1499  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
1500  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
1501  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
1502  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
1503  }
1504 
1505  for( ; remainder && j<jend; ++j ) {
1506  y[i ] += A(i ,j) * x[j];
1507  y[i+1UL] += A(i+1UL,j) * x[j];
1508  y[i+2UL] += A(i+2UL,j) * x[j];
1509  y[i+3UL] += A(i+3UL,j) * x[j];
1510  y[i+4UL] += A(i+4UL,j) * x[j];
1511  y[i+5UL] += A(i+5UL,j) * x[j];
1512  y[i+6UL] += A(i+6UL,j) * x[j];
1513  y[i+7UL] += A(i+7UL,j) * x[j];
1514  }
1515  }
1516 
1517  for( ; (i+4UL) <= M; i+=4UL )
1518  {
1519  const size_t jbegin( ( IsUpper_v<MT1> )
1520  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1521  :( 0UL ) );
1522  const size_t jend( ( IsLower_v<MT1> )
1523  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
1524  :( N ) );
1525  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1526 
1527  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1528  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1529 
1530  size_t j( jbegin );
1531 
1532  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1533  const size_t j1( j+SIMDSIZE );
1534  const size_t j2( j+SIMDSIZE*2UL );
1535  const size_t j3( j+SIMDSIZE*3UL );
1536  const SIMDType x1( x.load(j ) );
1537  const SIMDType x2( x.load(j1) );
1538  const SIMDType x3( x.load(j2) );
1539  const SIMDType x4( x.load(j3) );
1540  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1541  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1542  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1543  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1544  }
1545 
1546  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1547  const size_t j1( j+SIMDSIZE );
1548  const SIMDType x1( x.load(j ) );
1549  const SIMDType x2( x.load(j1) );
1550  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1551  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1552  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1553  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1554  }
1555 
1556  for( ; j<jpos; j+=SIMDSIZE ) {
1557  const SIMDType x1( x.load(j) );
1558  y[i ] += sum( A.load(i ,j) * x1 );
1559  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1560  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1561  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1562  }
1563 
1564  for( ; remainder && j<jend; ++j ) {
1565  y[i ] += A(i ,j) * x[j];
1566  y[i+1UL] += A(i+1UL,j) * x[j];
1567  y[i+2UL] += A(i+2UL,j) * x[j];
1568  y[i+3UL] += A(i+3UL,j) * x[j];
1569  }
1570  }
1571 
1572  for( ; (i+2UL) <= M; i+=2UL )
1573  {
1574  const size_t jbegin( ( IsUpper_v<MT1> )
1575  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1576  :( 0UL ) );
1577  const size_t jend( ( IsLower_v<MT1> )
1578  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1579  :( N ) );
1580  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1581 
1582  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1583  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1584 
1585  size_t j( jbegin );
1586 
1587  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1588  const size_t j1( j+SIMDSIZE );
1589  const size_t j2( j+SIMDSIZE*2UL );
1590  const size_t j3( j+SIMDSIZE*3UL );
1591  const SIMDType x1( x.load(j ) );
1592  const SIMDType x2( x.load(j1) );
1593  const SIMDType x3( x.load(j2) );
1594  const SIMDType x4( x.load(j3) );
1595  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1596  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1597  }
1598 
1599  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1600  const size_t j1( j+SIMDSIZE );
1601  const SIMDType x1( x.load(j ) );
1602  const SIMDType x2( x.load(j1) );
1603  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1604  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1605  }
1606 
1607  for( ; j<jpos; j+=SIMDSIZE ) {
1608  const SIMDType x1( x.load(j) );
1609  y[i ] += sum( A.load(i ,j) * x1 );
1610  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1611  }
1612 
1613  for( ; remainder && j<jend; ++j ) {
1614  y[i ] += A(i ,j) * x[j];
1615  y[i+1UL] += A(i+1UL,j) * x[j];
1616  }
1617  }
1618 
1619  if( i < M )
1620  {
1621  const size_t jbegin( ( IsUpper_v<MT1> )
1622  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1623  :( 0UL ) );
1624  const size_t jend( ( IsLower_v<MT1> )
1625  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1626  :( N ) );
1627  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1628 
1629  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1630  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1631 
1632  size_t j( jbegin );
1633 
1634  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1635  const size_t j1( j+SIMDSIZE );
1636  const size_t j2( j+SIMDSIZE*2UL );
1637  const size_t j3( j+SIMDSIZE*3UL );
1638  const SIMDType x1( x.load(j ) );
1639  const SIMDType x2( x.load(j1) );
1640  const SIMDType x3( x.load(j2) );
1641  const SIMDType x4( x.load(j3) );
1642  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1643  }
1644 
1645  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1646  const size_t j1( j+SIMDSIZE );
1647  const SIMDType x1( x.load(j ) );
1648  const SIMDType x2( x.load(j1) );
1649  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1650  }
1651 
1652  for( ; j<jpos; j+=SIMDSIZE ) {
1653  const SIMDType x1( x.load(j) );
1654  y[i] += sum( A.load(i,j) * x1 );
1655  }
1656 
1657  for( ; remainder && j<jend; ++j ) {
1658  y[i] += A(i,j) * x[j];
1659  }
1660  }
1661  }
1663  //**********************************************************************************************
1664 
1665  //**BLAS-based addition assignment to dense vectors (default)***********************************
1679  template< typename VT1 // Type of the left-hand side target vector
1680  , typename MT1 // Type of the left-hand side matrix operand
1681  , typename VT2 > // Type of the right-hand side vector operand
1682  static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1683  -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1684  {
1685  selectLargeAddAssignKernel( y, A, x );
1686  }
1688  //**********************************************************************************************
1689 
1690  //**BLAS-based addition assignment to dense vectors*********************************************
1691 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1692 
1705  template< typename VT1 // Type of the left-hand side target vector
1706  , typename MT1 // Type of the left-hand side matrix operand
1707  , typename VT2 > // Type of the right-hand side vector operand
1708  static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1709  -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1710  {
1711  using ET = ElementType_t<VT1>;
1712 
1713  if( IsTriangular_v<MT1> ) {
1714  ResultType_t<VT1> tmp( serial( x ) );
1715  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1716  addAssign( y, tmp );
1717  }
1718  else {
1719  gemv( y, A, x, ET(1), ET(1) );
1720  }
1721  }
1723 #endif
1724  //**********************************************************************************************
1725 
1726  //**Addition assignment to sparse vectors*******************************************************
1727  // No special implementation for the addition assignment to sparse vectors.
1728  //**********************************************************************************************
1729 
1730  //**Subtraction assignment to dense vectors*****************************************************
1743  template< typename VT1 > // Type of the target dense vector
1744  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1745  {
1747 
1748  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1749 
1750  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1751  return;
1752  }
1753 
1754  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1755  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1756 
1757  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1758  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1759  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1760  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1761 
1762  DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1763  }
1765  //**********************************************************************************************
1766 
1767  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1778  template< typename VT1 // Type of the left-hand side target vector
1779  , typename MT1 // Type of the left-hand side matrix operand
1780  , typename VT2 > // Type of the right-hand side vector operand
1781  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1782  {
1783  if( ( IsDiagonal_v<MT1> ) ||
1784  ( IsComputation_v<MT> && !evaluateMatrix ) ||
1785  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1786  selectSmallSubAssignKernel( y, A, x );
1787  else
1788  selectBlasSubAssignKernel( y, A, x );
1789  }
1791  //**********************************************************************************************
1792 
1793  //**Default subtraction assignment to dense vectors*********************************************
1807  template< typename VT1 // Type of the left-hand side target vector
1808  , typename MT1 // Type of the left-hand side matrix operand
1809  , typename VT2 > // Type of the right-hand side vector operand
1810  static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1811  {
1812  y.subAssign( A * x );
1813  }
1815  //**********************************************************************************************
1816 
1817  //**Default subtraction assignment to dense vectors (small matrices)****************************
1831  template< typename VT1 // Type of the left-hand side target vector
1832  , typename MT1 // Type of the left-hand side matrix operand
1833  , typename VT2 > // Type of the right-hand side vector operand
1834  static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1835  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1836  {
1837  selectDefaultSubAssignKernel( y, A, x );
1838  }
1840  //**********************************************************************************************
1841 
1842  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
1856  template< typename VT1 // Type of the left-hand side target vector
1857  , typename MT1 // Type of the left-hand side matrix operand
1858  , typename VT2 > // Type of the right-hand side vector operand
1859  static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1860  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1861  {
1862  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
1863 
1864  const size_t M( A.rows() );
1865  const size_t N( A.columns() );
1866 
1867  size_t i( 0UL );
1868 
1869  for( ; (i+8UL) <= M; i+=8UL )
1870  {
1871  const size_t jbegin( ( IsUpper_v<MT1> )
1872  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1873  :( 0UL ) );
1874  const size_t jend( ( IsLower_v<MT1> )
1875  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
1876  :( N ) );
1877  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1878 
1879  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1880  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1881 
1882  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1883  size_t j( jbegin );
1884 
1885  for( ; j<jpos; j+=SIMDSIZE ) {
1886  const SIMDType x1( x.load(j) );
1887  xmm1 += A.load(i ,j) * x1;
1888  xmm2 += A.load(i+1UL,j) * x1;
1889  xmm3 += A.load(i+2UL,j) * x1;
1890  xmm4 += A.load(i+3UL,j) * x1;
1891  xmm5 += A.load(i+4UL,j) * x1;
1892  xmm6 += A.load(i+5UL,j) * x1;
1893  xmm7 += A.load(i+6UL,j) * x1;
1894  xmm8 += A.load(i+7UL,j) * x1;
1895  }
1896 
1897  y[i ] -= sum( xmm1 );
1898  y[i+1UL] -= sum( xmm2 );
1899  y[i+2UL] -= sum( xmm3 );
1900  y[i+3UL] -= sum( xmm4 );
1901  y[i+4UL] -= sum( xmm5 );
1902  y[i+5UL] -= sum( xmm6 );
1903  y[i+6UL] -= sum( xmm7 );
1904  y[i+7UL] -= sum( xmm8 );
1905 
1906  for( ; remainder && j<jend; ++j ) {
1907  y[i ] -= A(i ,j) * x[j];
1908  y[i+1UL] -= A(i+1UL,j) * x[j];
1909  y[i+2UL] -= A(i+2UL,j) * x[j];
1910  y[i+3UL] -= A(i+3UL,j) * x[j];
1911  y[i+4UL] -= A(i+4UL,j) * x[j];
1912  y[i+5UL] -= A(i+5UL,j) * x[j];
1913  y[i+6UL] -= A(i+6UL,j) * x[j];
1914  y[i+7UL] -= A(i+7UL,j) * x[j];
1915  }
1916  }
1917 
1918  for( ; (i+4UL) <= M; i+=4UL )
1919  {
1920  const size_t jbegin( ( IsUpper_v<MT1> )
1921  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1922  :( 0UL ) );
1923  const size_t jend( ( IsLower_v<MT1> )
1924  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
1925  :( N ) );
1926  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1927 
1928  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1929  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1930 
1931  SIMDType xmm1, xmm2, xmm3, xmm4;
1932  size_t j( jbegin );
1933 
1934  for( ; j<jpos; j+=SIMDSIZE ) {
1935  const SIMDType x1( x.load(j) );
1936  xmm1 += A.load(i ,j) * x1;
1937  xmm2 += A.load(i+1UL,j) * x1;
1938  xmm3 += A.load(i+2UL,j) * x1;
1939  xmm4 += A.load(i+3UL,j) * x1;
1940  }
1941 
1942  y[i ] -= sum( xmm1 );
1943  y[i+1UL] -= sum( xmm2 );
1944  y[i+2UL] -= sum( xmm3 );
1945  y[i+3UL] -= sum( xmm4 );
1946 
1947  for( ; remainder && j<jend; ++j ) {
1948  y[i ] -= A(i ,j) * x[j];
1949  y[i+1UL] -= A(i+1UL,j) * x[j];
1950  y[i+2UL] -= A(i+2UL,j) * x[j];
1951  y[i+3UL] -= A(i+3UL,j) * x[j];
1952  }
1953  }
1954 
1955  for( ; (i+3UL) <= M; i+=3UL )
1956  {
1957  const size_t jbegin( ( IsUpper_v<MT1> )
1958  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1959  :( 0UL ) );
1960  const size_t jend( ( IsLower_v<MT1> )
1961  ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
1962  :( N ) );
1963  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1964 
1965  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1966  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1967 
1968  SIMDType xmm1, xmm2, xmm3;
1969  size_t j( jbegin );
1970 
1971  for( ; j<jpos; j+=SIMDSIZE ) {
1972  const SIMDType x1( x.load(j) );
1973  xmm1 += A.load(i ,j) * x1;
1974  xmm2 += A.load(i+1UL,j) * x1;
1975  xmm3 += A.load(i+2UL,j) * x1;
1976  }
1977 
1978  y[i ] -= sum( xmm1 );
1979  y[i+1UL] -= sum( xmm2 );
1980  y[i+2UL] -= sum( xmm3 );
1981 
1982  for( ; remainder && j<jend; ++j ) {
1983  y[i ] -= A(i ,j) * x[j];
1984  y[i+1UL] -= A(i+1UL,j) * x[j];
1985  y[i+2UL] -= A(i+2UL,j) * x[j];
1986  }
1987  }
1988 
1989  for( ; (i+2UL) <= M; i+=2UL )
1990  {
1991  const size_t jbegin( ( IsUpper_v<MT1> )
1992  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
1993  :( 0UL ) );
1994  const size_t jend( ( IsLower_v<MT1> )
1995  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1996  :( N ) );
1997  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1998 
1999  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2000  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2001 
2002  SIMDType xmm1, xmm2;
2003  size_t j( jbegin );
2004 
2005  for( ; j<jpos; j+=SIMDSIZE ) {
2006  const SIMDType x1( x.load(j) );
2007  xmm1 += A.load(i ,j) * x1;
2008  xmm2 += A.load(i+1UL,j) * x1;
2009  }
2010 
2011  y[i ] -= sum( xmm1 );
2012  y[i+1UL] -= sum( xmm2 );
2013 
2014  for( ; remainder && j<jend; ++j ) {
2015  y[i ] -= A(i ,j) * x[j];
2016  y[i+1UL] -= A(i+1UL,j) * x[j];
2017  }
2018  }
2019 
2020  if( i < M )
2021  {
2022  const size_t jbegin( ( IsUpper_v<MT1> )
2023  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
2024  :( 0UL ) );
2025  const size_t jend( ( IsLower_v<MT1> )
2026  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
2027  :( N ) );
2028  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2029 
2030  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2031  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2032 
2033  SIMDType xmm1;
2034  size_t j( jbegin );
2035 
2036  for( ; j<jpos; j+=SIMDSIZE ) {
2037  xmm1 += A.load(i,j) * x.load(j);
2038  }
2039 
2040  y[i] -= sum( xmm1 );
2041 
2042  for( ; remainder && j<jend; ++j ) {
2043  y[i] -= A(i,j) * x[j];
2044  }
2045  }
2046  }
2048  //**********************************************************************************************
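   // The small-matrix kernel above blocks the rows in groups of 8, 4, 3, 2 and finally 1,
   // accumulating one SIMD register of partial products per row across the columns and reducing
   // it with sum() before subtracting from y. A scalar tail loop handles the trailing columns
   // whenever either operand is unpadded (remainder == true). Per row this realizes
   // y[i] -= sum_j( A(i,j) * x[j] ), restricted to the column band dictated by the upper/lower
   // property of the matrix.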
2049 
2050  //**Default subtraction assignment to dense vectors (large matrices)****************************
2064  template< typename VT1 // Type of the left-hand side target vector
2065  , typename MT1 // Type of the left-hand side matrix operand
2066  , typename VT2 > // Type of the right-hand side vector operand
2067  static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2068  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
2069  {
2070  selectDefaultSubAssignKernel( y, A, x );
2071  }
2073  //**********************************************************************************************
2074 
2075  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
2089  template< typename VT1 // Type of the left-hand side target vector
2090  , typename MT1 // Type of the left-hand side matrix operand
2091  , typename VT2 > // Type of the right-hand side vector operand
2092  static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2093  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
2094  {
2095  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
2096 
2097  const size_t M( A.rows() );
2098  const size_t N( A.columns() );
2099 
2100  size_t i( 0UL );
2101 
2102  for( ; (i+8UL) <= M; i+=8UL )
2103  {
2104  const size_t jbegin( ( IsUpper_v<MT1> )
2105  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
2106  :( 0UL ) );
2107  const size_t jend( ( IsLower_v<MT1> )
2108  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
2109  :( N ) );
2110  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2111 
2112  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2113  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2114 
2115  size_t j( jbegin );
2116 
2117  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2118  const size_t j1( j+SIMDSIZE );
2119  const size_t j2( j+SIMDSIZE*2UL );
2120  const size_t j3( j+SIMDSIZE*3UL );
2121  const SIMDType x1( x.load(j ) );
2122  const SIMDType x2( x.load(j1) );
2123  const SIMDType x3( x.load(j2) );
2124  const SIMDType x4( x.load(j3) );
2125  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2126  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2127  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2128  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2129  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2130  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2131  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2132  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
2133  }
2134 
2135  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2136  const size_t j1( j+SIMDSIZE );
2137  const SIMDType x1( x.load(j ) );
2138  const SIMDType x2( x.load(j1) );
2139  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2140  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2141  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2142  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2143  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2144  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2145  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2146  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
2147  }
2148 
2149  for( ; j<jpos; j+=SIMDSIZE ) {
2150  const SIMDType x1( x.load(j) );
2151  y[i ] -= sum( A.load(i ,j) * x1 );
2152  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2153  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2154  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2155  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 );
2156  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 );
2157  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 );
2158  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 );
2159  }
2160 
2161  for( ; remainder && j<jend; ++j ) {
2162  y[i ] -= A(i ,j) * x[j];
2163  y[i+1UL] -= A(i+1UL,j) * x[j];
2164  y[i+2UL] -= A(i+2UL,j) * x[j];
2165  y[i+3UL] -= A(i+3UL,j) * x[j];
2166  y[i+4UL] -= A(i+4UL,j) * x[j];
2167  y[i+5UL] -= A(i+5UL,j) * x[j];
2168  y[i+6UL] -= A(i+6UL,j) * x[j];
2169  y[i+7UL] -= A(i+7UL,j) * x[j];
2170  }
2171  }
2172 
2173  for( ; (i+4UL) <= M; i+=4UL )
2174  {
2175  const size_t jbegin( ( IsUpper_v<MT1> )
2176  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
2177  :( 0UL ) );
2178  const size_t jend( ( IsLower_v<MT1> )
2179  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
2180  :( N ) );
2181  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2182 
2183  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2184  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2185 
2186  size_t j( jbegin );
2187 
2188  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2189  const size_t j1( j+SIMDSIZE );
2190  const size_t j2( j+SIMDSIZE*2UL );
2191  const size_t j3( j+SIMDSIZE*3UL );
2192  const SIMDType x1( x.load(j ) );
2193  const SIMDType x2( x.load(j1) );
2194  const SIMDType x3( x.load(j2) );
2195  const SIMDType x4( x.load(j3) );
2196  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2197  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2198  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2199  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2200  }
2201 
2202  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2203  const size_t j1( j+SIMDSIZE );
2204  const SIMDType x1( x.load(j ) );
2205  const SIMDType x2( x.load(j1) );
2206  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2207  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2208  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2209  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2210  }
2211 
2212  for( ; j<jpos; j+=SIMDSIZE ) {
2213  const SIMDType x1( x.load(j) );
2214  y[i ] -= sum( A.load(i ,j) * x1 );
2215  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2216  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2217  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2218  }
2219 
2220  for( ; remainder && j<jend; ++j ) {
2221  y[i ] -= A(i ,j) * x[j];
2222  y[i+1UL] -= A(i+1UL,j) * x[j];
2223  y[i+2UL] -= A(i+2UL,j) * x[j];
2224  y[i+3UL] -= A(i+3UL,j) * x[j];
2225  }
2226  }
2227 
2228  for( ; (i+2UL) <= M; i+=2UL )
2229  {
2230  const size_t jbegin( ( IsUpper_v<MT1> )
2231  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
2232  :( 0UL ) );
2233  const size_t jend( ( IsLower_v<MT1> )
2234  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
2235  :( N ) );
2236  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2237 
2238  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2239  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2240 
2241  size_t j( jbegin );
2242 
2243  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2244  const size_t j1( j+SIMDSIZE );
2245  const size_t j2( j+SIMDSIZE*2UL );
2246  const size_t j3( j+SIMDSIZE*3UL );
2247  const SIMDType x1( x.load(j ) );
2248  const SIMDType x2( x.load(j1) );
2249  const SIMDType x3( x.load(j2) );
2250  const SIMDType x4( x.load(j3) );
2251  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2252  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2253  }
2254 
2255  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2256  const size_t j1( j+SIMDSIZE );
2257  const SIMDType x1( x.load(j ) );
2258  const SIMDType x2( x.load(j1) );
2259  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2260  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2261  }
2262 
2263  for( ; j<jpos; j+=SIMDSIZE ) {
2264  const SIMDType x1( x.load(j) );
2265  y[i ] -= sum( A.load(i ,j) * x1 );
2266  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2267  }
2268 
2269  for( ; remainder && j<jend; ++j ) {
2270  y[i ] -= A(i ,j) * x[j];
2271  y[i+1UL] -= A(i+1UL,j) * x[j];
2272  }
2273  }
2274 
2275  if( i < M )
2276  {
2277  const size_t jbegin( ( IsUpper_v<MT1> )
2278  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
2279  :( 0UL ) );
2280  const size_t jend( ( IsLower_v<MT1> )
2281  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
2282  :( N ) );
2283  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2284 
2285  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2286  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2287 
2288  size_t j( jbegin );
2289 
2290  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2291  const size_t j1( j+SIMDSIZE );
2292  const size_t j2( j+SIMDSIZE*2UL );
2293  const size_t j3( j+SIMDSIZE*3UL );
2294  const SIMDType x1( x.load(j ) );
2295  const SIMDType x2( x.load(j1) );
2296  const SIMDType x3( x.load(j2) );
2297  const SIMDType x4( x.load(j3) );
2298  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2299  }
2300 
2301  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2302  const size_t j1( j+SIMDSIZE );
2303  const SIMDType x1( x.load(j ) );
2304  const SIMDType x2( x.load(j1) );
2305  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2306  }
2307 
2308  for( ; j<jpos; j+=SIMDSIZE ) {
2309  const SIMDType x1( x.load(j) );
2310  y[i] -= sum( A.load(i,j) * x1 );
2311  }
2312 
2313  for( ; remainder && j<jend; ++j ) {
2314  y[i] -= A(i,j) * x[j];
2315  }
2316  }
2317  }
2319  //**********************************************************************************************
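   // The large-matrix variant trades per-row register accumulators for direct updates of y: the
   // column range is unrolled in chunks of 4*SIMDSIZE, 2*SIMDSIZE and SIMDSIZE, and each chunk
   // is reduced with sum() and immediately subtracted from the corresponding y[i]. This keeps
   // the per-iteration working set small, which matters once a full row of A no longer fits
   // comfortably into cache.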
2320 
2321  //**BLAS-based subtraction assignment to dense vectors (default)********************************
2335  template< typename VT1 // Type of the left-hand side target vector
2336  , typename MT1 // Type of the left-hand side matrix operand
2337  , typename VT2 > // Type of the right-hand side vector operand
2338  static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2339  -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
2340  {
2341  selectLargeSubAssignKernel( y, A, x );
2342  }
2344  //**********************************************************************************************
2345 
2346  //**BLAS-based subtraction assignment to dense vectors******************************************
2347 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2348 
2361  template< typename VT1 // Type of the left-hand side target vector
2362  , typename MT1 // Type of the left-hand side matrix operand
2363  , typename VT2 > // Type of the right-hand side vector operand
2364  static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2365  -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
2366  {
2367  using ET = ElementType_t<VT1>;
2368 
2369  if( IsTriangular_v<MT1> ) {
2370  ResultType_t<VT1> tmp( serial( x ) );
2371  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2372  subAssign( y, tmp );
2373  }
2374  else {
2375  gemv( y, A, x, ET(-1), ET(1) );
2376  }
2377  }
2379 #endif
2380  //**********************************************************************************************
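   // A minimal sketch of what the BLAS path computes (assuming BLAS-compatible element types
   // such as float or double): gemv( y, A, x, ET(-1), ET(1) ) evaluates y := (-1)*A*x + 1*y,
   // i.e. y -= A*x, while for triangular matrices tmp := A*tmp is formed in place via trmv()
   // and subsequently subtracted from y via subAssign().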
2381 
2382  //**Subtraction assignment to sparse vectors****************************************************
2383  // No special implementation for the subtraction assignment to sparse vectors.
2384  //**********************************************************************************************
2385 
2386  //**Multiplication assignment to dense vectors**************************************************
2399  template< typename VT1 > // Type of the target dense vector
2400  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2401  {
2403 
2407 
2408  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2409 
2410  const ResultType tmp( serial( rhs ) );
2411  multAssign( ~lhs, tmp );
2412  }
2414  //**********************************************************************************************
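   // There is no fused kernel for an element-wise multiplication by a matrix/vector product, so
   // the product A*x is first evaluated serially into a temporary of type ResultType and the
   // temporary is then multiplied into the target vector; the division assignment below follows
   // the same pattern.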
2415 
2416  //**Multiplication assignment to sparse vectors*************************************************
2417  // No special implementation for the multiplication assignment to sparse vectors.
2418  //**********************************************************************************************
2419 
2420  //**Division assignment to dense vectors********************************************************
2433  template< typename VT1 > // Type of the target dense vector
2434  friend inline void divAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2435  {
2437 
2441 
2442  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2443 
2444  const ResultType tmp( serial( rhs ) );
2445  divAssign( ~lhs, tmp );
2446  }
2448  //**********************************************************************************************
2449 
2450  //**Division assignment to sparse vectors*******************************************************
2451  // No special implementation for the division assignment to sparse vectors.
2452  //**********************************************************************************************
2453 
2454  //**SMP assignment to dense vectors*************************************************************
2469  template< typename VT1 > // Type of the target dense vector
2470  friend inline auto smpAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2471  -> EnableIf_t< UseSMPAssign_v<VT1> >
2472  {
2474 
2475  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2476 
2477  if( rhs.mat_.rows() == 0UL ) {
2478  return;
2479  }
2480  else if( rhs.mat_.columns() == 0UL ) {
2481  reset( ~lhs );
2482  return;
2483  }
2484 
2485  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2486  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2487 
2488  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2489  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2490  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2491  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2492 
2493  smpAssign( ~lhs, A * x );
2494  }
2496  //**********************************************************************************************
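   // The SMP assignment is only selected when at least one operand requires an intermediate
   // evaluation (see UseSMPAssign_v); the operands are evaluated as needed up front and the
   // actual parallelization is delegated to smpAssign( ~lhs, A * x ). From a user's perspective
   // this is triggered by an ordinary assignment, for instance (a sketch assuming DynamicMatrix
   // and DynamicVector operands):
   //
   //    blaze::DynamicMatrix<double> A( 1000UL, 1000UL );
   //    blaze::DynamicVector<double> x( 1000UL ), y;
   //    y = A * x;  // may be executed in parallel when SMP support is enabled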
2497 
2498  //**SMP assignment to sparse vectors************************************************************
2513  template< typename VT1 > // Type of the target sparse vector
2514  friend inline auto smpAssign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2515  -> EnableIf_t< UseSMPAssign_v<VT1> >
2516  {
2518 
2522 
2523  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2524 
2525  const ResultType tmp( rhs );
2526  smpAssign( ~lhs, tmp );
2527  }
2529  //**********************************************************************************************
2530 
2531  //**SMP addition assignment to dense vectors****************************************************
2546  template< typename VT1 > // Type of the target dense vector
2547  friend inline auto smpAddAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2548  -> EnableIf_t< UseSMPAssign_v<VT1> >
2549  {
2551 
2552  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2553 
2554  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2555  return;
2556  }
2557 
2558  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2559  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2560 
2561  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2562  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2563  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2564  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2565 
2566  smpAddAssign( ~lhs, A * x );
2567  }
2569  //**********************************************************************************************
2570 
2571  //**SMP addition assignment to sparse vectors***************************************************
2572  // No special implementation for the SMP addition assignment to sparse vectors.
2573  //**********************************************************************************************
2574 
2575  //**SMP subtraction assignment to dense vectors*************************************************
2590  template< typename VT1 > // Type of the target dense vector
2591  friend inline auto smpSubAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2592  -> EnableIf_t< UseSMPAssign_v<VT1> >
2593  {
2595 
2596  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2597 
2598  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2599  return;
2600  }
2601 
2602  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2603  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2604 
2605  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2606  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2607  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2608  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2609 
2610  smpSubAssign( ~lhs, A * x );
2611  }
2613  //**********************************************************************************************
2614 
2615  //**SMP subtraction assignment to sparse vectors************************************************
2616  // No special implementation for the SMP subtraction assignment to sparse vectors.
2617  //**********************************************************************************************
2618 
2619  //**SMP multiplication assignment to dense vectors**********************************************
2634  template< typename VT1 > // Type of the target dense vector
2635  friend inline auto smpMultAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2636  -> EnableIf_t< UseSMPAssign_v<VT1> >
2637  {
2639 
2643 
2644  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2645 
2646  const ResultType tmp( rhs );
2647  smpMultAssign( ~lhs, tmp );
2648  }
2650  //**********************************************************************************************
2651 
2652  //**SMP multiplication assignment to sparse vectors*********************************************
2653  // No special implementation for the SMP multiplication assignment to sparse vectors.
2654  //**********************************************************************************************
2655 
2656  //**SMP division assignment to dense vectors****************************************************
2671  template< typename VT1 > // Type of the target dense vector
2672  friend inline auto smpDivAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2673  -> EnableIf_t< UseSMPAssign_v<VT1> >
2674  {
2676 
2680 
2681  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2682 
2683  const ResultType tmp( rhs );
2684  smpDivAssign( ~lhs, tmp );
2685  }
2687  //**********************************************************************************************
2688 
2689  //**SMP division assignment to sparse vectors***************************************************
2690  // No special implementation for the SMP division assignment to sparse vectors.
2691  //**********************************************************************************************
2692 
2693  //**Compile time checks*************************************************************************
2701  //**********************************************************************************************
2702 };
2703 //*************************************************************************************************
2704 
2705 
2706 
2707 
2708 //=================================================================================================
2709 //
2710 // DVECSCALARMULTEXPR SPECIALIZATION
2711 //
2712 //=================================================================================================
2713 
2714 //*************************************************************************************************
2722 template< typename MT // Type of the left-hand side dense matrix
2723  , typename VT // Type of the right-hand side dense vector
2724  , typename ST > // Type of the scalar value
2725 class DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >
2726  : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false > >
2727  , private Computation
2728 {
2729  private:
2730  //**Type definitions****************************************************************************
2731  using MVM = DMatDVecMultExpr<MT,VT>;
2732  using RES = ResultType_t<MVM>;
2733  using MRT = ResultType_t<MT>;
2734  using VRT = ResultType_t<VT>;
2735  using MET = ElementType_t<MRT>;
2736  using VET = ElementType_t<VRT>;
2737  using MCT = CompositeType_t<MT>;
2738  using VCT = CompositeType_t<VT>;
2739  //**********************************************************************************************
2740 
2741  //**********************************************************************************************
2743  static constexpr bool evaluateMatrix =
2744  ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2745  IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
2746  //**********************************************************************************************
2747 
2748  //**********************************************************************************************
2750  static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
2751  //**********************************************************************************************
2752 
2753  //**********************************************************************************************
2755 
2758  template< typename T1 >
2759  static constexpr bool UseSMPAssign_v = ( evaluateMatrix || evaluateVector );
2760  //**********************************************************************************************
2761 
2762  //**********************************************************************************************
2764 
2766  template< typename T1, typename T2, typename T3, typename T4 >
2767  static constexpr bool UseBlasKernel_v =
2769  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2770  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2771  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2772  !IsDiagonal_v<T2> &&
2773  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2774  IsBLASCompatible_v< ElementType_t<T1> > &&
2775  IsBLASCompatible_v< ElementType_t<T2> > &&
2776  IsBLASCompatible_v< ElementType_t<T3> > &&
2777  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2778  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2779  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
2780  //**********************************************************************************************
2781 
2782  //**********************************************************************************************
2784 
2787  template< typename T1, typename T2, typename T3, typename T4 >
2788  static constexpr bool UseVectorizedDefaultKernel_v =
2789  ( useOptimizedKernels &&
2790  !IsDiagonal_v<T2> &&
2791  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2792  IsSIMDCombinable_v< ElementType_t<T1>
2793  , ElementType_t<T2>
2794  , ElementType_t<T3>
2795  , T4 > &&
2796  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2797  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
2798  //**********************************************************************************************
2799 
2800  public:
2801  //**Type definitions****************************************************************************
2802  using This = DVecScalarMultExpr<MVM,ST,false>;
2803  using BaseType = DenseVector<This,false>;
2804  using ResultType = MultTrait_t<RES,ST>;
2805  using TransposeType = TransposeType_t<ResultType>;
2806  using ElementType = ElementType_t<ResultType>;
2807  using SIMDType = SIMDTrait_t<ElementType>;
2808  using ReturnType = const ElementType;
2809  using CompositeType = const ResultType;
2810 
2812  using LeftOperand = const DMatDVecMultExpr<MT,VT>;
2813 
2815  using RightOperand = ST;
2816 
2818  using LT = If_t< evaluateMatrix, const MRT, MCT >;
2819 
2821  using RT = If_t< evaluateVector, const VRT, VCT >;
2822  //**********************************************************************************************
2823 
2824  //**Compilation flags***************************************************************************
2826  static constexpr bool simdEnabled =
2827  ( !IsDiagonal_v<MT> &&
2828  MT::simdEnabled && VT::simdEnabled &&
2829  IsSIMDCombinable_v<MET,VET,ST> &&
2830  HasSIMDAdd_v<MET,VET> &&
2831  HasSIMDMult_v<MET,VET> );
2832 
2834  static constexpr bool smpAssignable =
2835  ( !evaluateMatrix && MT::smpAssignable && !evaluateVector && VT::smpAssignable );
2836  //**********************************************************************************************
2837 
2838  //**SIMD properties*****************************************************************************
2840  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
2841  //**********************************************************************************************
2842 
2843  //**Constructor*********************************************************************************
2849  explicit inline DVecScalarMultExpr( const MVM& vector, ST scalar )
2850  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2851  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2852  {}
2853  //**********************************************************************************************
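   // This specialization captures scaled matrix/vector products of the form ( A * x ) * s as
   // well as s * ( A * x ), so that the scalar can be folded directly into the evaluation
   // kernels instead of requiring a second pass over the result. A sketch of an expression that
   // ends up here (assuming DynamicMatrix/DynamicVector operands):
   //
   //    blaze::DynamicMatrix<double> A( 100UL, 100UL );
   //    blaze::DynamicVector<double> x( 100UL ), y;
   //    y = 2.0 * ( A * x );  // evaluated via DVecScalarMultExpr< DMatDVecMultExpr<...>, double, false >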
2854 
2855  //**Subscript operator**************************************************************************
2861  inline ReturnType operator[]( size_t index ) const {
2862  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2863  return vector_[index] * scalar_;
2864  }
2865  //**********************************************************************************************
2866 
2867  //**At function*********************************************************************************
2874  inline ReturnType at( size_t index ) const {
2875  if( index >= vector_.size() ) {
2876  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2877  }
2878  return (*this)[index];
2879  }
2880  //**********************************************************************************************
2881 
2882  //**Size function*******************************************************************************
2887  inline size_t size() const {
2888  return vector_.size();
2889  }
2890  //**********************************************************************************************
2891 
2892  //**Left operand access*************************************************************************
2897  inline LeftOperand leftOperand() const {
2898  return vector_;
2899  }
2900  //**********************************************************************************************
2901 
2902  //**Right operand access************************************************************************
2907  inline RightOperand rightOperand() const {
2908  return scalar_;
2909  }
2910  //**********************************************************************************************
2911 
2912  //**********************************************************************************************
2918  template< typename T >
2919  inline bool canAlias( const T* alias ) const {
2920  return vector_.canAlias( alias );
2921  }
2922  //**********************************************************************************************
2923 
2924  //**********************************************************************************************
2930  template< typename T >
2931  inline bool isAliased( const T* alias ) const {
2932  return vector_.isAliased( alias );
2933  }
2934  //**********************************************************************************************
2935 
2936  //**********************************************************************************************
2941  inline bool isAligned() const {
2942  return vector_.isAligned();
2943  }
2944  //**********************************************************************************************
2945 
2946  //**********************************************************************************************
2951  inline bool canSMPAssign() const noexcept {
2952  LeftOperand_t<MVM> A( vector_.leftOperand() );
2953  return ( !BLAZE_BLAS_MODE ||
2956  ( IsComputation_v<MT> && !evaluateMatrix ) ||
2957  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2958  ( size() > SMP_DMATDVECMULT_THRESHOLD );
2959  }
2960  //**********************************************************************************************
2961 
2962  private:
2963  //**Member variables****************************************************************************
2964  LeftOperand  vector_;  //!< Left-hand side dense vector of the multiplication expression.
2965  RightOperand scalar_;  //!< Right-hand side scalar of the multiplication expression.
2966  //**********************************************************************************************
2967 
2968  //**Assignment to dense vectors*****************************************************************
2980  template< typename VT1 > // Type of the target dense vector
2981  friend inline void assign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
2982  {
2984 
2985  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2986 
2987  LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
2988  RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
2989 
2990  if( left.rows() == 0UL ) {
2991  return;
2992  }
2993  else if( left.columns() == 0UL ) {
2994  reset( ~lhs );
2995  return;
2996  }
2997 
2998  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
2999  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3000 
3001  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3002  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3003  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3004  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3005 
3006  DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
3007  }
3008  //**********************************************************************************************
3009 
3010  //**Assignment to dense vectors (kernel selection)**********************************************
3021  template< typename VT1 // Type of the left-hand side target vector
3022  , typename MT1 // Type of the left-hand side matrix operand
3023  , typename VT2 // Type of the right-hand side vector operand
3024  , typename ST2 > // Type of the scalar value
3025  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3026  {
3027  if( ( IsDiagonal_v<MT1> ) ||
3028  ( IsComputation_v<MT> && !evaluateMatrix ) ||
3029  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3030  selectSmallAssignKernel( y, A, x, scalar );
3031  else
3032  selectBlasAssignKernel( y, A, x, scalar );
3033  }
3034  //**********************************************************************************************
3035 
3036  //**Default assignment to dense vectors*********************************************************
3050  template< typename VT1 // Type of the left-hand side target vector
3051  , typename MT1 // Type of the left-hand side matrix operand
3052  , typename VT2 // Type of the right-hand side vector operand
3053  , typename ST2 > // Type of the scalar value
3054  static inline auto selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3055  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3056  {
3057  y.assign( A * x * scalar );
3058  }
3059  //**********************************************************************************************
3060 
3061  //**Default assignment to dense vectors (small matrices)****************************************
3075  template< typename VT1 // Type of the left-hand side target vector
3076  , typename MT1 // Type of the left-hand side matrix operand
3077  , typename VT2 // Type of the right-hand side vector operand
3078  , typename ST2 > // Type of the scalar value
3079  static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3080  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3081  {
3082  selectDefaultAssignKernel( y, A, x, scalar );
3083  }
3084  //**********************************************************************************************
3085 
3086  //**Vectorized default assignment to dense vectors (small matrices)*****************************
3100  template< typename VT1 // Type of the left-hand side target vector
3101  , typename MT1 // Type of the left-hand side matrix operand
3102  , typename VT2 // Type of the right-hand side vector operand
3103  , typename ST2 > // Type of the scalar value
3104  static inline auto selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3105  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3106  {
3107  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
3108 
3109  const size_t M( A.rows() );
3110  const size_t N( A.columns() );
3111 
3112  size_t i( 0UL );
3113 
3114  for( ; (i+8UL) <= M; i+=8UL )
3115  {
3116  const size_t jbegin( ( IsUpper_v<MT1> )
3117  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3118  :( 0UL ) );
3119  const size_t jend( ( IsLower_v<MT1> )
3120  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
3121  :( N ) );
3122  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3123 
3124  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3125  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3126 
3127  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3128  size_t j( jbegin );
3129 
3130  for( ; j<jpos; j+=SIMDSIZE ) {
3131  const SIMDType x1( x.load(j) );
3132  xmm1 += A.load(i ,j) * x1;
3133  xmm2 += A.load(i+1UL,j) * x1;
3134  xmm3 += A.load(i+2UL,j) * x1;
3135  xmm4 += A.load(i+3UL,j) * x1;
3136  xmm5 += A.load(i+4UL,j) * x1;
3137  xmm6 += A.load(i+5UL,j) * x1;
3138  xmm7 += A.load(i+6UL,j) * x1;
3139  xmm8 += A.load(i+7UL,j) * x1;
3140  }
3141 
3142  y[i ] = sum( xmm1 ) * scalar;
3143  y[i+1UL] = sum( xmm2 ) * scalar;
3144  y[i+2UL] = sum( xmm3 ) * scalar;
3145  y[i+3UL] = sum( xmm4 ) * scalar;
3146  y[i+4UL] = sum( xmm5 ) * scalar;
3147  y[i+5UL] = sum( xmm6 ) * scalar;
3148  y[i+6UL] = sum( xmm7 ) * scalar;
3149  y[i+7UL] = sum( xmm8 ) * scalar;
3150 
3151  for( ; remainder && j<jend; ++j ) {
3152  y[i ] += A(i ,j) * x[j] * scalar;
3153  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3154  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3155  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3156  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3157  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3158  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3159  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3160  }
3161  }
3162 
3163  for( ; (i+4UL) <= M; i+=4UL )
3164  {
3165  const size_t jbegin( ( IsUpper_v<MT1> )
3166  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3167  :( 0UL ) );
3168  const size_t jend( ( IsLower_v<MT1> )
3169  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
3170  :( N ) );
3171  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3172 
3173  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3174  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3175 
3176  SIMDType xmm1, xmm2, xmm3, xmm4;
3177  size_t j( jbegin );
3178 
3179  for( ; j<jpos; j+=SIMDSIZE ) {
3180  const SIMDType x1( x.load(j) );
3181  xmm1 += A.load(i ,j) * x1;
3182  xmm2 += A.load(i+1UL,j) * x1;
3183  xmm3 += A.load(i+2UL,j) * x1;
3184  xmm4 += A.load(i+3UL,j) * x1;
3185  }
3186 
3187  y[i ] = sum( xmm1 ) * scalar;
3188  y[i+1UL] = sum( xmm2 ) * scalar;
3189  y[i+2UL] = sum( xmm3 ) * scalar;
3190  y[i+3UL] = sum( xmm4 ) * scalar;
3191 
3192  for( ; remainder && j<jend; ++j ) {
3193  y[i ] += A(i ,j) * x[j] * scalar;
3194  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3195  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3196  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3197  }
3198  }
3199 
3200  for( ; (i+3UL) <= M; i+=3UL )
3201  {
3202  const size_t jbegin( ( IsUpper_v<MT1> )
3203  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3204  :( 0UL ) );
3205  const size_t jend( ( IsLower_v<MT1> )
3206  ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
3207  :( N ) );
3208  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3209 
3210  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3211  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3212 
3213  SIMDType xmm1, xmm2, xmm3;
3214  size_t j( jbegin );
3215 
3216  for( ; j<jpos; j+=SIMDSIZE ) {
3217  const SIMDType x1( x.load(j) );
3218  xmm1 += A.load(i ,j) * x1;
3219  xmm2 += A.load(i+1UL,j) * x1;
3220  xmm3 += A.load(i+2UL,j) * x1;
3221  }
3222 
3223  y[i ] = sum( xmm1 ) * scalar;
3224  y[i+1UL] = sum( xmm2 ) * scalar;
3225  y[i+2UL] = sum( xmm3 ) * scalar;
3226 
3227  for( ; remainder && j<jend; ++j ) {
3228  y[i ] += A(i ,j) * x[j] * scalar;
3229  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3230  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3231  }
3232  }
3233 
3234  for( ; (i+2UL) <= M; i+=2UL )
3235  {
3236  const size_t jbegin( ( IsUpper_v<MT1> )
3237  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3238  :( 0UL ) );
3239  const size_t jend( ( IsLower_v<MT1> )
3240  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
3241  :( N ) );
3242  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3243 
3244  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3245  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3246 
3247  SIMDType xmm1, xmm2;
3248  size_t j( jbegin );
3249 
3250  for( ; j<jpos; j+=SIMDSIZE ) {
3251  const SIMDType x1( x.load(j) );
3252  xmm1 += A.load(i ,j) * x1;
3253  xmm2 += A.load(i+1UL,j) * x1;
3254  }
3255 
3256  y[i ] = sum( xmm1 ) * scalar;
3257  y[i+1UL] = sum( xmm2 ) * scalar;
3258 
3259  for( ; remainder && j<jend; ++j ) {
3260  y[i ] += A(i ,j) * x[j] * scalar;
3261  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3262  }
3263  }
3264 
3265  if( i < M )
3266  {
3267  const size_t jbegin( ( IsUpper_v<MT1> )
3268  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3269  :( 0UL ) );
3270  const size_t jend( ( IsLower_v<MT1> )
3271  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
3272  :( N ) );
3273  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3274 
3275  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3276  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3277 
3278  SIMDType xmm1;
3279  size_t j( jbegin );
3280 
3281  for( ; j<jpos; j+=SIMDSIZE ) {
3282  xmm1 += A.load(i,j) * x.load(j);
3283  }
3284 
3285  y[i] = sum( xmm1 ) * scalar;
3286 
3287  for( ; remainder && j<jend; ++j ) {
3288  y[i] += A(i,j) * x[j] * scalar;
3289  }
3290  }
3291  }
3292  //**********************************************************************************************
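   // In the small-matrix kernel above the scalar is applied once per horizontal reduction,
   // i.e. y[i] = sum( xmm ) * scalar, rather than to every SIMD lane inside the inner loop;
   // the scalar remainder loop then adds the already scaled terms A(i,j)*x[j]*scalar.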
3293 
3294  //**Default assignment to dense vectors (large matrices)****************************************
3308  template< typename VT1 // Type of the left-hand side target vector
3309  , typename MT1 // Type of the left-hand side matrix operand
3310  , typename VT2 // Type of the right-hand side vector operand
3311  , typename ST2 > // Type of the scalar value
3312  static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3313  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3314  {
3315  selectDefaultAssignKernel( y, A, x, scalar );
3316  }
3317  //**********************************************************************************************
3318 
3319  //**Vectorized default assignment to dense vectors (large matrices)*****************************
3333  template< typename VT1 // Type of the left-hand side target vector
3334  , typename MT1 // Type of the left-hand side matrix operand
3335  , typename VT2 // Type of the right-hand side vector operand
3336  , typename ST2 > // Type of the scalar value
3337  static inline auto selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3338  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3339  {
3340  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
3341 
3342  const size_t M( A.rows() );
3343  const size_t N( A.columns() );
3344 
3345  reset( y );
3346 
3347  size_t i( 0UL );
3348 
3349  for( ; (i+8UL) <= M; i+=8UL )
3350  {
3351  const size_t jbegin( ( IsUpper_v<MT1> )
3352  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3353  :( 0UL ) );
3354  const size_t jend( ( IsLower_v<MT1> )
3355  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
3356  :( N ) );
3357  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3358 
3359  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3360  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3361 
3362  size_t j( jbegin );
3363 
3364  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3365  const size_t j1( j+SIMDSIZE );
3366  const size_t j2( j+SIMDSIZE*2UL );
3367  const size_t j3( j+SIMDSIZE*3UL );
3368  const SIMDType x1( x.load(j ) );
3369  const SIMDType x2( x.load(j1) );
3370  const SIMDType x3( x.load(j2) );
3371  const SIMDType x4( x.load(j3) );
3372  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3373  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3374  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3375  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3376  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3377  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3378  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3379  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
3380  }
3381 
3382  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3383  const size_t j1( j+SIMDSIZE );
3384  const SIMDType x1( x.load(j ) );
3385  const SIMDType x2( x.load(j1) );
3386  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3387  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3388  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3389  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3390  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3391  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3392  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3393  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
3394  }
3395 
3396  for( ; j<jpos; j+=SIMDSIZE ) {
3397  const SIMDType x1( x.load(j) );
3398  y[i ] += sum( A.load(i ,j) * x1 );
3399  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3400  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3401  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3402  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
3403  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
3404  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
3405  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
3406  }
3407 
3408  for( ; remainder && j<jend; ++j ) {
3409  y[i ] += A(i ,j) * x[j];
3410  y[i+1UL] += A(i+1UL,j) * x[j];
3411  y[i+2UL] += A(i+2UL,j) * x[j];
3412  y[i+3UL] += A(i+3UL,j) * x[j];
3413  y[i+4UL] += A(i+4UL,j) * x[j];
3414  y[i+5UL] += A(i+5UL,j) * x[j];
3415  y[i+6UL] += A(i+6UL,j) * x[j];
3416  y[i+7UL] += A(i+7UL,j) * x[j];
3417  }
3418 
3419  y[i ] *= scalar;
3420  y[i+1UL] *= scalar;
3421  y[i+2UL] *= scalar;
3422  y[i+3UL] *= scalar;
3423  y[i+4UL] *= scalar;
3424  y[i+5UL] *= scalar;
3425  y[i+6UL] *= scalar;
3426  y[i+7UL] *= scalar;
3427  }
3428 
3429  for( ; (i+4UL) <= M; i+=4UL )
3430  {
3431  const size_t jbegin( ( IsUpper_v<MT1> )
3432  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3433  :( 0UL ) );
3434  const size_t jend( ( IsLower_v<MT1> )
3435  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
3436  :( N ) );
3437  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3438 
3439  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3440  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3441 
3442  size_t j( jbegin );
3443 
3444  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3445  const size_t j1( j+SIMDSIZE );
3446  const size_t j2( j+SIMDSIZE*2UL );
3447  const size_t j3( j+SIMDSIZE*3UL );
3448  const SIMDType x1( x.load(j ) );
3449  const SIMDType x2( x.load(j1) );
3450  const SIMDType x3( x.load(j2) );
3451  const SIMDType x4( x.load(j3) );
3452  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3453  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3454  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3455  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3456  }
3457 
3458  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3459  const size_t j1( j+SIMDSIZE );
3460  const SIMDType x1( x.load(j ) );
3461  const SIMDType x2( x.load(j1) );
3462  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3463  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3464  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3465  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3466  }
3467 
3468  for( ; j<jpos; j+=SIMDSIZE ) {
3469  const SIMDType x1( x.load(j) );
3470  y[i ] += sum( A.load(i ,j) * x1 );
3471  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3472  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3473  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3474  }
3475 
3476  for( ; remainder && j<jend; ++j ) {
3477  y[i ] += A(i ,j) * x[j];
3478  y[i+1UL] += A(i+1UL,j) * x[j];
3479  y[i+2UL] += A(i+2UL,j) * x[j];
3480  y[i+3UL] += A(i+3UL,j) * x[j];
3481  }
3482 
3483  y[i ] *= scalar;
3484  y[i+1UL] *= scalar;
3485  y[i+2UL] *= scalar;
3486  y[i+3UL] *= scalar;
3487  }
3488 
3489  for( ; (i+2UL) <= M; i+=2UL )
3490  {
3491  const size_t jbegin( ( IsUpper_v<MT1> )
3492  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3493  :( 0UL ) );
3494  const size_t jend( ( IsLower_v<MT1> )
3495  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
3496  :( N ) );
3497  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3498 
3499  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3500  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3501 
3502  size_t j( jbegin );
3503 
3504  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3505  const size_t j1( j+SIMDSIZE );
3506  const size_t j2( j+SIMDSIZE*2UL );
3507  const size_t j3( j+SIMDSIZE*3UL );
3508  const SIMDType x1( x.load(j ) );
3509  const SIMDType x2( x.load(j1) );
3510  const SIMDType x3( x.load(j2) );
3511  const SIMDType x4( x.load(j3) );
3512  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3513  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3514  }
3515 
3516  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3517  const size_t j1( j+SIMDSIZE );
3518  const SIMDType x1( x.load(j ) );
3519  const SIMDType x2( x.load(j1) );
3520  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3521  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3522  }
3523 
3524  for( ; j<jpos; j+=SIMDSIZE ) {
3525  const SIMDType x1( x.load(j) );
3526  y[i ] += sum( A.load(i ,j) * x1 );
3527  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3528  }
3529 
3530  for( ; remainder && j<jend; ++j ) {
3531  y[i ] += A(i ,j) * x[j];
3532  y[i+1UL] += A(i+1UL,j) * x[j];
3533  }
3534 
3535  y[i ] *= scalar;
3536  y[i+1UL] *= scalar;
3537  }
3538 
3539  if( i < M )
3540  {
3541  const size_t jbegin( ( IsUpper_v<MT1> )
3542  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3543  :( 0UL ) );
3544  const size_t jend( ( IsLower_v<MT1> )
3545  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
3546  :( N ) );
3547  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3548 
3549  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3550  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3551 
3552  size_t j( jbegin );
3553 
3554  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3555  const size_t j1( j+SIMDSIZE );
3556  const size_t j2( j+SIMDSIZE*2UL );
3557  const size_t j3( j+SIMDSIZE*3UL );
3558  const SIMDType x1( x.load(j ) );
3559  const SIMDType x2( x.load(j1) );
3560  const SIMDType x3( x.load(j2) );
3561  const SIMDType x4( x.load(j3) );
3562  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3563  }
3564 
3565  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3566  const size_t j1( j+SIMDSIZE );
3567  const SIMDType x1( x.load(j ) );
3568  const SIMDType x2( x.load(j1) );
3569  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3570  }
3571 
3572  for( ; j<jpos; j+=SIMDSIZE ) {
3573  const SIMDType x1( x.load(j) );
3574  y[i] += sum( A.load(i,j) * x1 );
3575  }
3576 
3577  for( ; remainder && j<jend; ++j ) {
3578  y[i] += A(i,j) * x[j];
3579  }
3580 
3581  y[i] *= scalar;
3582  }
3583  }
3584  //**********************************************************************************************
3585 
3586  //**BLAS-based assignment to dense vectors (default)********************************************
3600  template< typename VT1 // Type of the left-hand side target vector
3601  , typename MT1 // Type of the left-hand side matrix operand
3602  , typename VT2 // Type of the right-hand side vector operand
3603  , typename ST2 > // Type of the scalar value
3604  static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3605  -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3606  {
3607  selectLargeAssignKernel( y, A, x, scalar );
3608  }
3609  //**********************************************************************************************
3610 
3611  //**BLAS-based assignment to dense vectors******************************************************
3612 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3613 
3626  template< typename VT1 // Type of the left-hand side target vector
3627  , typename MT1 // Type of the left-hand side matrix operand
3628  , typename VT2 // Type of the right-hand side vector operand
3629  , typename ST2 > // Type of the scalar value
3630  static inline auto selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3631  -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3632  {
3633  using ET = ElementType_t<VT1>;
3634 
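      // trmv() provides no separate scaling factor, so for triangular matrices y is first
      // set to scalar*x and then multiplied in place by A; for general matrices gemv()
      // computes y = scalar*A*x directly (beta = 0 discards the previous contents of y).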
3635  if( IsTriangular_v<MT1> ) {
3636  assign( y, scalar * x );
3637  trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3638  }
3639  else {
3640  gemv( y, A, x, ET(scalar), ET(0) );
3641  }
3642  }
3643 #endif
3644  //**********************************************************************************************
3645 
3646  //**Assignment to sparse vectors****************************************************************
3658  template< typename VT1 > // Type of the target sparse vector
3659  friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3660  {
3662 
3666 
3667  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3668 
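      // Sparse targets are handled by evaluating the scaled multiplication into a dense
      // temporary of type ResultType, which is then assigned to the sparse vector.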
3669  const ResultType tmp( serial( rhs ) );
3670  assign( ~lhs, tmp );
3671  }
3672  //**********************************************************************************************
3673 
3674  //**Addition assignment to dense vectors********************************************************
3686  template< typename VT1 > // Type of the target dense vector
3687  friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3688  {
3690 
3691  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3692 
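      // The matrix and vector operands of the inner multiplication are evaluated serially
      // (if necessary); an empty matrix contributes nothing, and the actual work is
      // forwarded to selectAddAssignKernel().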
3693  LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
3694  RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
3695 
3696  if( left.rows() == 0UL || left.columns() == 0UL ) {
3697  return;
3698  }
3699 
3700  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3701  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3702 
3703  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3704  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3705  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3706  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3707 
3708  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
3709  }
3710  //**********************************************************************************************
3711 
3712  //**Addition assignment to dense vectors (kernel selection)*************************************
3723  template< typename VT1 // Type of the left-hand side target vector
3724  , typename MT1 // Type of the left-hand side matrix operand
3725  , typename VT2 // Type of the right-hand side vector operand
3726  , typename ST2 > // Type of the scalar value
3727  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3728  {
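      // Kernel dispatch: diagonal matrices, non-evaluated expression operands, and small
      // products are handled by the unrolled SIMD kernel; everything else is forwarded to
      // the BLAS kernel, which itself falls back to the large SIMD kernel if no suitable
      // BLAS routine is available.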
3729  if( ( IsDiagonal_v<MT1> ) ||
3730  ( IsComputation_v<MT> && !evaluateMatrix ) ||
3731  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3732  selectSmallAddAssignKernel( y, A, x, scalar );
3733  else
3734  selectBlasAddAssignKernel( y, A, x, scalar );
3735  }
3736  //**********************************************************************************************
3737 
3738  //**Default addition assignment to dense vectors************************************************
3752  template< typename VT1 // Type of the left-hand side target vector
3753  , typename MT1 // Type of the left-hand side matrix operand
3754  , typename VT2 // Type of the right-hand side vector operand
3755  , typename ST2 > // Type of the scalar value
3756  static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3757  {
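      // The default kernel delegates to the expression-based addition assignment.
      // Conceptually this is equivalent to the following element-wise sketch (not the
      // actual implementation):
      //    for( size_t i=0UL; i<A.rows(); ++i )
      //       for( size_t j=0UL; j<A.columns(); ++j )
      //          y[i] += A(i,j) * x[j] * scalar;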
3758  y.addAssign( A * x * scalar );
3759  }
3760  //**********************************************************************************************
3761 
3762  //**Default addition assignment to dense vectors (small matrices)*******************************
3776  template< typename VT1 // Type of the left-hand side target vector
3777  , typename MT1 // Type of the left-hand side matrix operand
3778  , typename VT2 // Type of the right-hand side vector operand
3779  , typename ST2 > // Type of the scalar value
3780  static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3781  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3782  {
3783  selectDefaultAddAssignKernel( y, A, x, scalar );
3784  }
3785  //**********************************************************************************************
3786 
3787  //**Vectorized default addition assignment to dense vectors (small matrices)********************
3801  template< typename VT1 // Type of the left-hand side target vector
3802  , typename MT1 // Type of the left-hand side matrix operand
3803  , typename VT2 // Type of the right-hand side vector operand
3804  , typename ST2 > // Type of the scalar value
3805  static inline auto selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3806  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3807  {
3808  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
3809 
3810  const size_t M( A.rows() );
3811  const size_t N( A.columns() );
3812 
3813  size_t i( 0UL );
3814 
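      // The rows of A are traversed in blocks of 8, 4, 3, 2, and finally 1. Within a block
      // one SIMD accumulator per row collects the partial products A(i,j)*x(j); these are
      // reduced via sum(), scaled, and added to y once the row block is complete. jbegin
      // and jend restrict the column range for (strictly) upper/lower triangular matrices,
      // and the trailing scalar loop handles unpadded remainder columns.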
3815  for( ; (i+8UL) <= M; i+=8UL )
3816  {
3817  const size_t jbegin( ( IsUpper_v<MT1> )
3818  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3819  :( 0UL ) );
3820  const size_t jend( ( IsLower_v<MT1> )
3821  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
3822  :( N ) );
3823  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3824 
3825  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3826  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3827 
3828  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3829  size_t j( jbegin );
3830 
3831  for( ; j<jpos; j+=SIMDSIZE ) {
3832  const SIMDType x1( x.load(j) );
3833  xmm1 += A.load(i ,j) * x1;
3834  xmm2 += A.load(i+1UL,j) * x1;
3835  xmm3 += A.load(i+2UL,j) * x1;
3836  xmm4 += A.load(i+3UL,j) * x1;
3837  xmm5 += A.load(i+4UL,j) * x1;
3838  xmm6 += A.load(i+5UL,j) * x1;
3839  xmm7 += A.load(i+6UL,j) * x1;
3840  xmm8 += A.load(i+7UL,j) * x1;
3841  }
3842 
3843  y[i ] += sum( xmm1 ) * scalar;
3844  y[i+1UL] += sum( xmm2 ) * scalar;
3845  y[i+2UL] += sum( xmm3 ) * scalar;
3846  y[i+3UL] += sum( xmm4 ) * scalar;
3847  y[i+4UL] += sum( xmm5 ) * scalar;
3848  y[i+5UL] += sum( xmm6 ) * scalar;
3849  y[i+6UL] += sum( xmm7 ) * scalar;
3850  y[i+7UL] += sum( xmm8 ) * scalar;
3851 
3852  for( ; remainder && j<jend; ++j ) {
3853  y[i ] += A(i ,j) * x[j] * scalar;
3854  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3855  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3856  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3857  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3858  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3859  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3860  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3861  }
3862  }
3863 
3864  for( ; (i+4UL) <= M; i+=4UL )
3865  {
3866  const size_t jbegin( ( IsUpper_v<MT1> )
3867  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3868  :( 0UL ) );
3869  const size_t jend( ( IsLower_v<MT1> )
3870  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
3871  :( N ) );
3872  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3873 
3874  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3875  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3876 
3877  SIMDType xmm1, xmm2, xmm3, xmm4;
3878  size_t j( jbegin );
3879 
3880  for( ; j<jpos; j+=SIMDSIZE ) {
3881  const SIMDType x1( x.load(j) );
3882  xmm1 += A.load(i ,j) * x1;
3883  xmm2 += A.load(i+1UL,j) * x1;
3884  xmm3 += A.load(i+2UL,j) * x1;
3885  xmm4 += A.load(i+3UL,j) * x1;
3886  }
3887 
3888  y[i ] += sum( xmm1 ) * scalar;
3889  y[i+1UL] += sum( xmm2 ) * scalar;
3890  y[i+2UL] += sum( xmm3 ) * scalar;
3891  y[i+3UL] += sum( xmm4 ) * scalar;
3892 
3893  for( ; remainder && j<jend; ++j ) {
3894  y[i ] += A(i ,j) * x[j] * scalar;
3895  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3896  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3897  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3898  }
3899  }
3900 
3901  for( ; (i+3UL) <= M; i+=3UL )
3902  {
3903  const size_t jbegin( ( IsUpper_v<MT1> )
3904  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3905  :( 0UL ) );
3906  const size_t jend( ( IsLower_v<MT1> )
3907  ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
3908  :( N ) );
3909  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3910 
3911  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3912  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3913 
3914  SIMDType xmm1, xmm2, xmm3;
3915  size_t j( jbegin );
3916 
3917  for( ; j<jpos; j+=SIMDSIZE ) {
3918  const SIMDType x1( x.load(j) );
3919  xmm1 += A.load(i ,j) * x1;
3920  xmm2 += A.load(i+1UL,j) * x1;
3921  xmm3 += A.load(i+2UL,j) * x1;
3922  }
3923 
3924  y[i ] += sum( xmm1 ) * scalar;
3925  y[i+1UL] += sum( xmm2 ) * scalar;
3926  y[i+2UL] += sum( xmm3 ) * scalar;
3927 
3928  for( ; remainder && j<jend; ++j ) {
3929  y[i ] += A(i ,j) * x[j] * scalar;
3930  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3931  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3932  }
3933  }
3934 
3935  for( ; (i+2UL) <= M; i+=2UL )
3936  {
3937  const size_t jbegin( ( IsUpper_v<MT1> )
3938  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3939  :( 0UL ) );
3940  const size_t jend( ( IsLower_v<MT1> )
3941  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
3942  :( N ) );
3943  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3944 
3945  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3946  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3947 
3948  SIMDType xmm1, xmm2;
3949  size_t j( jbegin );
3950 
3951  for( ; j<jpos; j+=SIMDSIZE ) {
3952  const SIMDType x1( x.load(j) );
3953  xmm1 += A.load(i ,j) * x1;
3954  xmm2 += A.load(i+1UL,j) * x1;
3955  }
3956 
3957  y[i ] += sum( xmm1 ) * scalar;
3958  y[i+1UL] += sum( xmm2 ) * scalar;
3959 
3960  for( ; remainder && j<jend; ++j ) {
3961  y[i ] += A(i ,j) * x[j] * scalar;
3962  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3963  }
3964  }
3965 
3966  if( i < M )
3967  {
3968  const size_t jbegin( ( IsUpper_v<MT1> )
3969  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
3970  :( 0UL ) );
3971  const size_t jend( ( IsLower_v<MT1> )
3972  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
3973  :( N ) );
3974  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3975 
3976  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3977  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3978 
3979  SIMDType xmm1;
3980  size_t j( jbegin );
3981 
3982  for( ; j<jpos; j+=SIMDSIZE ) {
3983  xmm1 += A.load(i,j) * x.load(j);
3984  }
3985 
3986  y[i] += sum( xmm1 ) * scalar;
3987 
3988  for( ; remainder && j<jend; ++j ) {
3989  y[i] += A(i,j) * x[j] * scalar;
3990  }
3991  }
3992  }
3993  //**********************************************************************************************
3994 
3995  //**Default addition assignment to dense vectors (large matrices)*******************************
4009  template< typename VT1 // Type of the left-hand side target vector
4010  , typename MT1 // Type of the left-hand side matrix operand
4011  , typename VT2 // Type of the right-hand side vector operand
4012  , typename ST2 > // Type of the scalar value
4013  static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4014  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4015  {
4016  selectDefaultAddAssignKernel( y, A, x, scalar );
4017  }
4018  //**********************************************************************************************
4019 
4020  //**Vectorized default addition assignment to dense vectors (large matrices)********************
4034  template< typename VT1 // Type of the left-hand side target vector
4035  , typename MT1 // Type of the left-hand side matrix operand
4036  , typename VT2 // Type of the right-hand side vector operand
4037  , typename ST2 > // Type of the scalar value
4038  static inline auto selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4039  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4040  {
4041  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
4042 
4043  const size_t M( A.rows() );
4044  const size_t N( A.columns() );
4045 
4046  size_t i( 0UL );
4047 
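      // In contrast to the small-matrix kernel, the column loop is additionally unrolled by
      // 4x/2x/1x SIMD widths and each unrolled step is reduced immediately via sum() and
      // accumulated into y[i]; this keeps the number of live SIMD registers small for long
      // rows at the price of more horizontal reductions.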
4048  for( ; (i+8UL) <= M; i+=8UL )
4049  {
4050  const size_t jbegin( ( IsUpper_v<MT1> )
4051  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4052  :( 0UL ) );
4053  const size_t jend( ( IsLower_v<MT1> )
4054  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
4055  :( N ) );
4056  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4057 
4058  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4059  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4060 
4061  size_t j( jbegin );
4062 
4063  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4064  const size_t j1( j+SIMDSIZE );
4065  const size_t j2( j+SIMDSIZE*2UL );
4066  const size_t j3( j+SIMDSIZE*3UL );
4067  const SIMDType x1( x.load(j ) );
4068  const SIMDType x2( x.load(j1) );
4069  const SIMDType x3( x.load(j2) );
4070  const SIMDType x4( x.load(j3) );
4071  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4072  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4073  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4074  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4075  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4076  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4077  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4078  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4079  }
4080 
4081  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4082  const size_t j1( j+SIMDSIZE );
4083  const SIMDType x1( x.load(j ) );
4084  const SIMDType x2( x.load(j1) );
4085  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4086  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4087  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4088  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4089  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4090  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4091  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4092  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4093  }
4094 
4095  for( ; j<jpos; j+=SIMDSIZE ) {
4096  const SIMDType x1( x.load(j) );
4097  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4098  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4099  y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
4100  y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
4101  y[i+4UL] += sum( A.load(i+4UL,j) * x1 ) * scalar;
4102  y[i+5UL] += sum( A.load(i+5UL,j) * x1 ) * scalar;
4103  y[i+6UL] += sum( A.load(i+6UL,j) * x1 ) * scalar;
4104  y[i+7UL] += sum( A.load(i+7UL,j) * x1 ) * scalar;
4105  }
4106 
4107  for( ; remainder && j<jend; ++j ) {
4108  y[i ] += A(i ,j) * x[j] * scalar;
4109  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4110  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4111  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4112  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4113  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4114  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4115  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
4116  }
4117  }
4118 
4119  for( ; (i+4UL) <= M; i+=4UL )
4120  {
4121  const size_t jbegin( ( IsUpper_v<MT1> )
4122  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4123  :( 0UL ) );
4124  const size_t jend( ( IsLower_v<MT1> )
4125  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
4126  :( N ) );
4127  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4128 
4129  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4130  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4131 
4132  size_t j( jbegin );
4133 
4134  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4135  const size_t j1( j+SIMDSIZE );
4136  const size_t j2( j+SIMDSIZE*2UL );
4137  const size_t j3( j+SIMDSIZE*3UL );
4138  const SIMDType x1( x.load(j ) );
4139  const SIMDType x2( x.load(j1) );
4140  const SIMDType x3( x.load(j2) );
4141  const SIMDType x4( x.load(j3) );
4142  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4143  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4144  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4145  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4146  }
4147 
4148  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4149  const size_t j1( j+SIMDSIZE );
4150  const SIMDType x1( x.load(j ) );
4151  const SIMDType x2( x.load(j1) );
4152  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4153  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4154  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4155  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4156  }
4157 
4158  for( ; j<jpos; j+=SIMDSIZE ) {
4159  const SIMDType x1( x.load(j) );
4160  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4161  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4162  y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
4163  y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
4164  }
4165 
4166  for( ; remainder && j<jend; ++j ) {
4167  y[i ] += A(i ,j) * x[j] * scalar;
4168  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4169  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4170  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4171  }
4172  }
4173 
4174  for( ; (i+2UL) <= M; i+=2UL )
4175  {
4176  const size_t jbegin( ( IsUpper_v<MT1> )
4177  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4178  :( 0UL ) );
4179  const size_t jend( ( IsLower_v<MT1> )
4180  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
4181  :( N ) );
4182  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4183 
4184  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4185  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4186 
4187  size_t j( jbegin );
4188 
4189  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4190  const size_t j1( j+SIMDSIZE );
4191  const size_t j2( j+SIMDSIZE*2UL );
4192  const size_t j3( j+SIMDSIZE*3UL );
4193  const SIMDType x1( x.load(j ) );
4194  const SIMDType x2( x.load(j1) );
4195  const SIMDType x3( x.load(j2) );
4196  const SIMDType x4( x.load(j3) );
4197  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4198  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4199  }
4200 
4201  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4202  const size_t j1( j+SIMDSIZE );
4203  const SIMDType x1( x.load(j ) );
4204  const SIMDType x2( x.load(j1) );
4205  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4206  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4207  }
4208 
4209  for( ; j<jpos; j+=SIMDSIZE ) {
4210  const SIMDType x1( x.load(j) );
4211  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4212  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4213  }
4214 
4215  for( ; remainder && j<jend; ++j ) {
4216  y[i ] += A(i ,j) * x[j] * scalar;
4217  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4218  }
4219  }
4220 
4221  if( i < M )
4222  {
4223  const size_t jbegin( ( IsUpper_v<MT1> )
4224  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4225  :( 0UL ) );
4226  const size_t jend( ( IsLower_v<MT1> )
4227  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
4228  :( N ) );
4229  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4230 
4231  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4232  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4233 
4234  size_t j( jbegin );
4235 
4236  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4237  const size_t j1( j+SIMDSIZE );
4238  const size_t j2( j+SIMDSIZE*2UL );
4239  const size_t j3( j+SIMDSIZE*3UL );
4240  const SIMDType x1( x.load(j ) );
4241  const SIMDType x2( x.load(j1) );
4242  const SIMDType x3( x.load(j2) );
4243  const SIMDType x4( x.load(j3) );
4244  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4245  }
4246 
4247  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4248  const size_t j1( j+SIMDSIZE );
4249  const SIMDType x1( x.load(j ) );
4250  const SIMDType x2( x.load(j1) );
4251  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4252  }
4253 
4254  for( ; j<jpos; j+=SIMDSIZE ) {
4255  const SIMDType x1( x.load(j) );
4256  y[i] += sum( A.load(i,j) * x1 ) * scalar;
4257  }
4258 
4259  for( ; remainder && j<jend; ++j ) {
4260  y[i] += A(i,j) * x[j] * scalar;
4261  }
4262  }
4263  }
4264  //**********************************************************************************************
4265 
4266  //**BLAS-based addition assignment to dense vectors (default)***********************************
4280  template< typename VT1 // Type of the left-hand side target vector
4281  , typename MT1 // Type of the left-hand side matrix operand
4282  , typename VT2 // Type of the right-hand side vector operand
4283  , typename ST2 > // Type of the scalar value
4284  static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4285  -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4286  {
4287  selectLargeAddAssignKernel( y, A, x, scalar );
4288  }
4289  //**********************************************************************************************
4290 
4291  //**BLAS-based addition assignment to dense vectors*********************************************
4292 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4293 
4306  template< typename VT1 // Type of the left-hand side target vector
4307  , typename MT1 // Type of the left-hand side matrix operand
4308  , typename VT2 // Type of the right-hand side vector operand
4309  , typename ST2 > // Type of the scalar value
4310  static inline auto selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4311  -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4312  {
4313  using ET = ElementType_t<VT1>;
4314 
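      // For triangular matrices the pre-scaled vector is multiplied in place by A via
      // trmv() in a temporary, which is then added to y; for general matrices gemv() with
      // beta = 1 accumulates scalar*A*x directly onto y.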
4315  if( IsTriangular_v<MT1> ) {
4316  ResultType_t<VT1> tmp( serial( scalar * x ) );
4317  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4318  addAssign( y, tmp );
4319  }
4320  else {
4321  gemv( y, A, x, ET(scalar), ET(1) );
4322  }
4323  }
4324 #endif
4325  //**********************************************************************************************
4326 
4327  //**Addition assignment to sparse vectors*******************************************************
4328  // No special implementation for the addition assignment to sparse vectors.
4329  //**********************************************************************************************
4330 
4331  //**Subtraction assignment to dense vectors*****************************************************
4343  template< typename VT1 > // Type of the target dense vector
4344  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4345  {
4347 
4348  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4349 
4350  LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4351  RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
4352 
4353  if( left.rows() == 0UL || left.columns() == 0UL ) {
4354  return;
4355  }
4356 
4357  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4358  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
4359 
4360  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4361  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4362  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4363  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
4364 
4365  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
4366  }
4367  //**********************************************************************************************
4368 
4369  //**Subtraction assignment to dense vectors (kernel selection)**********************************
4380  template< typename VT1 // Type of the left-hand side target vector
4381  , typename MT1 // Type of the left-hand side matrix operand
4382  , typename VT2 // Type of the right-hand side vector operand
4383  , typename ST2 > // Type of the scalar value
4384  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4385  {
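      // Same size- and type-based dispatch as for the addition assignment: diagonal, small,
      // and non-evaluated cases use the SIMD kernel, large cases the BLAS kernel.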
4386  if( ( IsDiagonal_v<MT1> ) ||
4387  ( IsComputation_v<MT> && !evaluateMatrix ) ||
4388  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4389  selectSmallSubAssignKernel( y, A, x, scalar );
4390  else
4391  selectBlasSubAssignKernel( y, A, x, scalar );
4392  }
4393  //**********************************************************************************************
4394 
4395  //**Default subtraction assignment to dense vectors*********************************************
4409  template< typename VT1 // Type of the left-hand side target vector
4410  , typename MT1 // Type of the left-hand side matrix operand
4411  , typename VT2 // Type of the right-hand side vector operand
4412  , typename ST2 > // Type of the scalar value
4413  static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4414  {
4415  y.subAssign( A * x * scalar );
4416  }
4417  //**********************************************************************************************
4418 
4419  //**Default subtraction assignment to dense vectors (small matrices)****************************
4433  template< typename VT1 // Type of the left-hand side target vector
4434  , typename MT1 // Type of the left-hand side matrix operand
4435  , typename VT2 // Type of the right-hand side vector operand
4436  , typename ST2 > // Type of the scalar value
4437  static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4438  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4439  {
4440  selectDefaultSubAssignKernel( y, A, x, scalar );
4441  }
4442  //**********************************************************************************************
4443 
4444  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
4458  template< typename VT1 // Type of the left-hand side target vector
4459  , typename MT1 // Type of the left-hand side matrix operand
4460  , typename VT2 // Type of the right-hand side vector operand
4461  , typename ST2 > // Type of the scalar value
4462  static inline auto selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4463  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4464  {
4465  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
4466 
4467  const size_t M( A.rows() );
4468  const size_t N( A.columns() );
4469 
4470  size_t i( 0UL );
4471 
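      // Identical traversal scheme as the small-matrix addition assignment kernel; the
      // reduced partial sums are subtracted from y instead of added.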
4472  for( ; (i+8UL) <= M; i+=8UL )
4473  {
4474  const size_t jbegin( ( IsUpper_v<MT1> )
4475  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4476  :( 0UL ) );
4477  const size_t jend( ( IsLower_v<MT1> )
4478  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
4479  :( N ) );
4480  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4481 
4482  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4483  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4484 
4485  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4486  size_t j( jbegin );
4487 
4488  for( ; j<jpos; j+=SIMDSIZE ) {
4489  const SIMDType x1( x.load(j) );
4490  xmm1 += A.load(i ,j) * x1;
4491  xmm2 += A.load(i+1UL,j) * x1;
4492  xmm3 += A.load(i+2UL,j) * x1;
4493  xmm4 += A.load(i+3UL,j) * x1;
4494  xmm5 += A.load(i+4UL,j) * x1;
4495  xmm6 += A.load(i+5UL,j) * x1;
4496  xmm7 += A.load(i+6UL,j) * x1;
4497  xmm8 += A.load(i+7UL,j) * x1;
4498  }
4499 
4500  y[i ] -= sum( xmm1 ) * scalar;
4501  y[i+1UL] -= sum( xmm2 ) * scalar;
4502  y[i+2UL] -= sum( xmm3 ) * scalar;
4503  y[i+3UL] -= sum( xmm4 ) * scalar;
4504  y[i+4UL] -= sum( xmm5 ) * scalar;
4505  y[i+5UL] -= sum( xmm6 ) * scalar;
4506  y[i+6UL] -= sum( xmm7 ) * scalar;
4507  y[i+7UL] -= sum( xmm8 ) * scalar;
4508 
4509  for( ; remainder && j<jend; ++j ) {
4510  y[i ] -= A(i ,j) * x[j] * scalar;
4511  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4512  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4513  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4514  y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4515  y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4516  y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4517  y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4518  }
4519  }
4520 
4521  for( ; (i+4UL) <= M; i+=4UL )
4522  {
4523  const size_t jbegin( ( IsUpper_v<MT1> )
4524  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4525  :( 0UL ) );
4526  const size_t jend( ( IsLower_v<MT1> )
4527  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
4528  :( N ) );
4529  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4530 
4531  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4532  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4533 
4534  SIMDType xmm1, xmm2, xmm3, xmm4;
4535  size_t j( jbegin );
4536 
4537  for( ; j<jpos; j+=SIMDSIZE ) {
4538  const SIMDType x1( x.load(j) );
4539  xmm1 += A.load(i ,j) * x1;
4540  xmm2 += A.load(i+1UL,j) * x1;
4541  xmm3 += A.load(i+2UL,j) * x1;
4542  xmm4 += A.load(i+3UL,j) * x1;
4543  }
4544 
4545  y[i ] -= sum( xmm1 ) * scalar;
4546  y[i+1UL] -= sum( xmm2 ) * scalar;
4547  y[i+2UL] -= sum( xmm3 ) * scalar;
4548  y[i+3UL] -= sum( xmm4 ) * scalar;
4549 
4550  for( ; remainder && j<jend; ++j ) {
4551  y[i ] -= A(i ,j) * x[j] * scalar;
4552  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4553  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4554  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4555  }
4556  }
4557 
4558  for( ; (i+3UL) <= M; i+=3UL )
4559  {
4560  const size_t jbegin( ( IsUpper_v<MT1> )
4561  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4562  :( 0UL ) );
4563  const size_t jend( ( IsLower_v<MT1> )
4564  ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
4565  :( N ) );
4566  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4567 
4568  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4569  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4570 
4571  SIMDType xmm1, xmm2, xmm3;
4572  size_t j( jbegin );
4573 
4574  for( ; j<jpos; j+=SIMDSIZE ) {
4575  const SIMDType x1( x.load(j) );
4576  xmm1 += A.load(i ,j) * x1;
4577  xmm2 += A.load(i+1UL,j) * x1;
4578  xmm3 += A.load(i+2UL,j) * x1;
4579  }
4580 
4581  y[i ] -= sum( xmm1 ) * scalar;
4582  y[i+1UL] -= sum( xmm2 ) * scalar;
4583  y[i+2UL] -= sum( xmm3 ) * scalar;
4584 
4585  for( ; remainder && j<jend; ++j ) {
4586  y[i ] -= A(i ,j) * x[j] * scalar;
4587  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4588  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4589  }
4590  }
4591 
4592  for( ; (i+2UL) <= M; i+=2UL )
4593  {
4594  const size_t jbegin( ( IsUpper_v<MT1> )
4595  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4596  :( 0UL ) );
4597  const size_t jend( ( IsLower_v<MT1> )
4598  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
4599  :( N ) );
4600  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4601 
4602  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4603  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4604 
4605  SIMDType xmm1, xmm2;
4606  size_t j( jbegin );
4607 
4608  for( ; j<jpos; j+=SIMDSIZE ) {
4609  const SIMDType x1( x.load(j) );
4610  xmm1 += A.load(i ,j) * x1;
4611  xmm2 += A.load(i+1UL,j) * x1;
4612  }
4613 
4614  y[i ] -= sum( xmm1 ) * scalar;
4615  y[i+1UL] -= sum( xmm2 ) * scalar;
4616 
4617  for( ; remainder && j<jend; ++j ) {
4618  y[i ] -= A(i ,j) * x[j] * scalar;
4619  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4620  }
4621  }
4622 
4623  if( i < M )
4624  {
4625  const size_t jbegin( ( IsUpper_v<MT1> )
4626  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4627  :( 0UL ) );
4628  const size_t jend( ( IsLower_v<MT1> )
4629  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
4630  :( N ) );
4631  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4632 
4633  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4634  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4635 
4636  SIMDType xmm1;
4637  size_t j( jbegin );
4638 
4639  for( ; j<jpos; j+=SIMDSIZE ) {
4640  xmm1 += A.load(i,j) * x.load(j);
4641  }
4642 
4643  y[i] -= sum( xmm1 ) * scalar;
4644 
4645  for( ; remainder && j<jend; ++j ) {
4646  y[i] -= A(i,j) * x[j] * scalar;
4647  }
4648  }
4649  }
4650  //**********************************************************************************************
4651 
4652  //**Default subtraction assignment to dense vectors (large matrices)****************************
4666  template< typename VT1 // Type of the left-hand side target vector
4667  , typename MT1 // Type of the left-hand side matrix operand
4668  , typename VT2 // Type of the right-hand side vector operand
4669  , typename ST2 > // Type of the scalar value
4670  static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4671  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4672  {
4673  selectDefaultSubAssignKernel( y, A, x, scalar );
4674  }
4675  //**********************************************************************************************
4676 
4677  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
4691  template< typename VT1 // Type of the left-hand side target vector
4692  , typename MT1 // Type of the left-hand side matrix operand
4693  , typename VT2 // Type of the right-hand side vector operand
4694  , typename ST2 > // Type of the scalar value
4695  static inline auto selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4696  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4697  {
4698  constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
4699 
4700  const size_t M( A.rows() );
4701  const size_t N( A.columns() );
4702 
4703  size_t i( 0UL );
4704 
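      // Same unrolling scheme as the large-matrix addition assignment kernel, with the
      // reduced and scaled sums subtracted from y.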
4705  for( ; (i+8UL) <= M; i+=8UL )
4706  {
4707  const size_t jbegin( ( IsUpper_v<MT1> )
4708  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4709  :( 0UL ) );
4710  const size_t jend( ( IsLower_v<MT1> )
4711  ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
4712  :( N ) );
4713  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4714 
4715  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4716  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4717 
4718  size_t j( jbegin );
4719 
4720  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4721  const size_t j1( j+SIMDSIZE );
4722  const size_t j2( j+SIMDSIZE*2UL );
4723  const size_t j3( j+SIMDSIZE*3UL );
4724  const SIMDType x1( x.load(j ) );
4725  const SIMDType x2( x.load(j1) );
4726  const SIMDType x3( x.load(j2) );
4727  const SIMDType x4( x.load(j3) );
4728  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4729  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4730  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4731  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4732  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4733  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4734  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4735  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4736  }
4737 
4738  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4739  const size_t j1( j+SIMDSIZE );
4740  const SIMDType x1( x.load(j ) );
4741  const SIMDType x2( x.load(j1) );
4742  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4743  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4744  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4745  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4746  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4747  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4748  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4749  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4750  }
4751 
4752  for( ; j<jpos; j+=SIMDSIZE ) {
4753  const SIMDType x1( x.load(j) );
4754  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4755  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4756  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
4757  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
4758  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 ) * scalar;
4759  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 ) * scalar;
4760  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 ) * scalar;
4761  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 ) * scalar;
4762  }
4763 
4764  for( ; remainder && j<jend; ++j ) {
4765  y[i ] -= A(i ,j) * x[j] * scalar;
4766  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4767  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4768  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4769  y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4770  y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4771  y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4772  y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4773  }
4774  }
4775 
4776  for( ; (i+4UL) <= M; i+=4UL )
4777  {
4778  const size_t jbegin( ( IsUpper_v<MT1> )
4779  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4780  :( 0UL ) );
4781  const size_t jend( ( IsLower_v<MT1> )
4782  ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
4783  :( N ) );
4784  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4785 
4786  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4787  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4788 
4789  size_t j( jbegin );
4790 
4791  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4792  const size_t j1( j+SIMDSIZE );
4793  const size_t j2( j+SIMDSIZE*2UL );
4794  const size_t j3( j+SIMDSIZE*3UL );
4795  const SIMDType x1( x.load(j ) );
4796  const SIMDType x2( x.load(j1) );
4797  const SIMDType x3( x.load(j2) );
4798  const SIMDType x4( x.load(j3) );
4799  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4800  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4801  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4802  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4803  }
4804 
4805  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4806  const size_t j1( j+SIMDSIZE );
4807  const SIMDType x1( x.load(j ) );
4808  const SIMDType x2( x.load(j1) );
4809  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4810  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4811  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4812  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4813  }
4814 
4815  for( ; j<jpos; j+=SIMDSIZE ) {
4816  const SIMDType x1( x.load(j) );
4817  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4818  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4819  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
4820  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
4821  }
4822 
4823  for( ; remainder && j<jend; ++j ) {
4824  y[i ] -= A(i ,j) * x[j] * scalar;
4825  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4826  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4827  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4828  }
4829  }
4830 
4831  for( ; (i+2UL) <= M; i+=2UL )
4832  {
4833  const size_t jbegin( ( IsUpper_v<MT1> )
4834  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4835  :( 0UL ) );
4836  const size_t jend( ( IsLower_v<MT1> )
4837  ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
4838  :( N ) );
4839  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4840 
4841  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4842  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4843 
4844  size_t j( jbegin );
4845 
4846  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4847  const size_t j1( j+SIMDSIZE );
4848  const size_t j2( j+SIMDSIZE*2UL );
4849  const size_t j3( j+SIMDSIZE*3UL );
4850  const SIMDType x1( x.load(j ) );
4851  const SIMDType x2( x.load(j1) );
4852  const SIMDType x3( x.load(j2) );
4853  const SIMDType x4( x.load(j3) );
4854  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4855  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4856  }
4857 
4858  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4859  const size_t j1( j+SIMDSIZE );
4860  const SIMDType x1( x.load(j ) );
4861  const SIMDType x2( x.load(j1) );
4862  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4863  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4864  }
4865 
4866  for( ; j<jpos; j+=SIMDSIZE ) {
4867  const SIMDType x1( x.load(j) );
4868  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4869  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4870  }
4871 
4872  for( ; remainder && j<jend; ++j ) {
4873  y[i ] -= A(i ,j) * x[j] * scalar;
4874  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4875  }
4876  }
4877 
4878  if( i < M )
4879  {
4880  const size_t jbegin( ( IsUpper_v<MT1> )
4881  ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) & size_t(-SIMDSIZE) )
4882  :( 0UL ) );
4883  const size_t jend( ( IsLower_v<MT1> )
4884  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
4885  :( N ) );
4886  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4887 
4888  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4889  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4890 
4891  size_t j( jbegin );
4892 
4893  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4894  const size_t j1( j+SIMDSIZE );
4895  const size_t j2( j+SIMDSIZE*2UL );
4896  const size_t j3( j+SIMDSIZE*3UL );
4897  const SIMDType x1( x.load(j ) );
4898  const SIMDType x2( x.load(j1) );
4899  const SIMDType x3( x.load(j2) );
4900  const SIMDType x4( x.load(j3) );
4901  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4902  }
4903 
4904  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4905  const size_t j1( j+SIMDSIZE );
4906  const SIMDType x1( x.load(j ) );
4907  const SIMDType x2( x.load(j1) );
4908  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4909  }
4910 
4911  for( ; j<jpos; j+=SIMDSIZE ) {
4912  const SIMDType x1( x.load(j) );
4913  y[i] -= sum( A.load(i,j) * x1 ) * scalar;
4914  }
4915 
4916  for( ; remainder && j<jend; ++j ) {
4917  y[i] -= A(i,j) * x[j] * scalar;
4918  }
4919  }
4920  }
4921  //**********************************************************************************************
4922 
4923  //**BLAS-based subtraction assignment to dense vectors (default)********************************
4937  template< typename VT1 // Type of the left-hand side target vector
4938  , typename MT1 // Type of the left-hand side matrix operand
4939  , typename VT2 // Type of the right-hand side vector operand
4940  , typename ST2 > // Type of the scalar value
4941  static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4942  -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4943  {
4944  selectLargeSubAssignKernel( y, A, x, scalar );
4945  }
4946  //**********************************************************************************************
4947 
4948  //**BLAS-based subtraction assignment to dense vectors******************************************
4949 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4950 
4963  template< typename VT1 // Type of the left-hand side target vector
4964  , typename MT1 // Type of the left-hand side matrix operand
4965  , typename VT2 // Type of the right-hand side vector operand
4966  , typename ST2 > // Type of the scalar value
4967  static inline auto selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4968  -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4969  {
4970  using ET = ElementType_t<VT1>;
4971 
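      // For triangular matrices a trmv() temporary is subtracted via subAssign(); for
      // general matrices the sign is folded into alpha (-scalar) so that a single gemv()
      // call with beta = 1 performs the subtraction.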
4972  if( IsTriangular_v<MT1> ) {
4973  ResultType_t<VT1> tmp( serial( scalar * x ) );
4974  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4975  subAssign( y, tmp );
4976  }
4977  else {
4978  gemv( y, A, x, ET(-scalar), ET(1) );
4979  }
4980  }
4981 #endif
4982  //**********************************************************************************************
4983 
4984  //**Subtraction assignment to sparse vectors****************************************************
4985  // No special implementation for the subtraction assignment to sparse vectors.
4986  //**********************************************************************************************
4987 
4988  //**Multiplication assignment to dense vectors**************************************************
5000  template< typename VT1 > // Type of the target dense vector
5001  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5002  {
5004 
5008 
5009  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5010 
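      // Element-wise multiplication cannot be mapped onto the matrix-vector kernels, hence
      // the expression is evaluated into a temporary result vector first and then
      // multiplied into the target (the division assignment below proceeds analogously).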
5011  const ResultType tmp( serial( rhs ) );
5012  multAssign( ~lhs, tmp );
5013  }
5014  //**********************************************************************************************
5015 
5016  //**Multiplication assignment to sparse vectors*************************************************
5017  // No special implementation for the multiplication assignment to sparse vectors.
5018  //**********************************************************************************************
5019 
5020  //**Division assignment to dense vectors********************************************************
5032  template< typename VT1 > // Type of the target dense vector
5033  friend inline void divAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5034  {
5036 
5040 
5041  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5042 
5043  const ResultType tmp( serial( rhs ) );
5044  divAssign( ~lhs, tmp );
5045  }
5046  //**********************************************************************************************
5047 
5048  //**Division assignment to sparse vectors*******************************************************
5049  // No special implementation for the division assignment to sparse vectors.
5050  //**********************************************************************************************
5051 
5052  //**SMP assignment to dense vectors*************************************************************
5066  template< typename VT1 > // Type of the target dense vector
5067  friend inline auto smpAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5068  -> EnableIf_t< UseSMPAssign_v<VT1> >
5069  {
5071 
5072  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5073 
5074  LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5075  RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5076 
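      // A matrix with zero rows leaves the (empty) target untouched, zero columns yield the
      // zero vector, hence the reset(). Otherwise the operands are evaluated and the
      // reconstructed expression A * x * scalar is assigned via the SMP-capable overloads.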
5077  if( left.rows() == 0UL ) {
5078  return;
5079  }
5080  else if( left.columns() == 0UL ) {
5081  reset( ~lhs );
5082  return;
5083  }
5084 
5085  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5086  RT x( right ); // Evaluation of the right-hand side dense vector operand
5087 
5088  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5089  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5090  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5091  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5092 
5093  smpAssign( ~lhs, A * x * rhs.scalar_ );
5094  }
5095  //**********************************************************************************************
5096 
5097  //**SMP assignment to sparse vectors************************************************************
5111  template< typename VT1 > // Type of the target sparse vector
5112  friend inline auto smpAssign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5113  -> EnableIf_t< UseSMPAssign_v<VT1> >
5114  {
5116 
5120 
5121  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5122 
5123  const ResultType tmp( rhs );
5124  smpAssign( ~lhs, tmp );
5125  }
5126  //**********************************************************************************************
5127 
5128  //**SMP addition assignment to dense vectors****************************************************
5142  template< typename VT1 > // Type of the target dense vector
5143  friend inline auto smpAddAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5144  -> EnableIf_t< UseSMPAssign_v<VT1> >
5145  {
5147 
5148  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5149 
5150  LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5151  RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5152 
5153  if( left.rows() == 0UL || left.columns() == 0UL ) {
5154  return;
5155  }
5156 
5157  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5158  RT x( right ); // Evaluation of the right-hand side dense vector operand
5159 
5160  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5161  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5162  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5163  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5164 
5165  smpAddAssign( ~lhs, A * x * rhs.scalar_ );
5166  }
5167  //**********************************************************************************************
5168 
5169  //**SMP addition assignment to sparse vectors***************************************************
5170  // No special implementation for the SMP addition assignment to sparse vectors.
5171  //**********************************************************************************************
5172 
5173  //**SMP subtraction assignment to dense vectors*************************************************
5187  template< typename VT1 > // Type of the target dense vector
5188  friend inline auto smpSubAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5189  -> EnableIf_t< UseSMPAssign_v<VT1> >
5190  {
5192 
5193  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5194 
5195  LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5196  RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5197 
5198  if( left.rows() == 0UL || left.columns() == 0UL ) {
5199  return;
5200  }
5201 
5202  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5203  RT x( right ); // Evaluation of the right-hand side dense vector operand
5204 
5205  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5206  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5207  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5208  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5209 
5210  smpSubAssign( ~lhs, A * x * rhs.scalar_ );
5211  }
5212  //**********************************************************************************************
5213 
5214  //**SMP subtraction assignment to sparse vectors************************************************
5215  // No special implementation for the SMP subtraction assignment to sparse vectors.
5216  //**********************************************************************************************
5217 
5218  //**SMP multiplication assignment to dense vectors**********************************************
5232  template< typename VT1 > // Type of the target dense vector
5233  friend inline auto smpMultAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5234  -> EnableIf_t< UseSMPAssign_v<VT1> >
5235  {
5237 
5241 
5242  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5243 
5244  const ResultType tmp( rhs );
5245  smpMultAssign( ~lhs, tmp );
5246  }
5247  //**********************************************************************************************
5248 
5249  //**SMP multiplication assignment to sparse vectors*********************************************
5250  // No special implementation for the SMP multiplication assignment to sparse vectors.
5251  //**********************************************************************************************
5252 
5253  //**SMP division assignment to dense vectors****************************************************
5267  template< typename VT1 > // Type of the target dense vector
5268  friend inline auto smpDivAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5269  -> EnableIf_t< UseSMPAssign_v<VT1> >
5270  {
5272 
5276 
5277  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5278 
5279  const ResultType tmp( rhs );
5280  smpDivAssign( ~lhs, tmp );
5281  }
5282  //**********************************************************************************************
5283 
5284  //**SMP division assignment to sparse vectors***************************************************
5285  // No special implementation for the SMP division assignment to sparse vectors.
5286  //**********************************************************************************************
5287 
5288  //**Compile time checks*************************************************************************
5297  //**********************************************************************************************
5298 };
5300 //*************************************************************************************************
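// Usage sketch (illustrative, not part of the original header): scaling a dense matrix/dense
// vector product yields the scaled expression class handled above, and its SMP-enabled
// assignments decompose the operation into A * x * scalar as in the smpAssign() overloads.
// The concrete container types below are assumptions for the example only.
//
//    #include <blaze/Math.h>
//
//    blaze::DynamicMatrix<double> A( 3UL, 3UL );
//    blaze::DynamicVector<double> x( 3UL ), y;
//    // ... initialize A and x ...
//    y = 2.0 * ( A * x );  // scaled matrix/vector multiplication expression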
5301 
5302 
5303 
5304 
5305 //=================================================================================================
5306 //
5307 // GLOBAL BINARY ARITHMETIC OPERATORS
5308 //
5309 //=================================================================================================
5310 
5311 //*************************************************************************************************
5341 template< typename MT // Type of the left-hand side dense matrix
5342  , typename VT > // Type of the right-hand side dense vector
5343 inline decltype(auto)
5344  operator*( const DenseMatrix<MT,false>& mat, const DenseVector<VT,false>& vec )
5345 {
5347 
5349 
5350  if( (~mat).columns() != (~vec).size() ) {
5351  BLAZE_THROW_INVALID_ARGUMENT( "Matrix and vector sizes do not match" );
5352  }
5353 
5354  using ReturnType = const DMatDVecMultExpr<MT,VT>;
5355  return ReturnType( ~mat, ~vec );
5356 }
5357 //*************************************************************************************************
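// Usage sketch (illustrative, not part of the original header): the operator above is the
// entry point for row-major dense matrix/dense column vector multiplications. The container
// types below are assumptions for the example only.
//
//    #include <blaze/Math.h>
//
//    blaze::DynamicMatrix<double,blaze::rowMajor>     A( 5UL, 3UL );
//    blaze::DynamicVector<double,blaze::columnVector> x( 3UL ), y;
//    // ... initialize A and x ...
//    y = A * x;  // returns a DMatDVecMultExpr, evaluated upon assignment to y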
5358 
5359 
5360 
5361 
5362 //=================================================================================================
5363 //
5364 // GLOBAL RESTRUCTURING BINARY ARITHMETIC OPERATORS
5365 //
5366 //=================================================================================================
5367 
5368 //*************************************************************************************************
5382 template< typename MT // Matrix base type of the left-hand side expression
5383  , typename VT > // Type of the right-hand side dense vector
5384 inline decltype(auto)
5385  operator*( const MatMatMultExpr<MT>& mat, const DenseVector<VT,false>& vec )
5386 {
5388 
5389  return (~mat).leftOperand() * ( (~mat).rightOperand() * vec );
5390 }
5392 //*************************************************************************************************
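// Restructuring sketch (illustrative, not part of the original header): for a nested product
// ( A * B ) * x, the operator above re-associates the expression to A * ( B * x ), replacing
// an expensive matrix/matrix multiplication by two matrix/vector multiplications. The container
// types below are assumptions for the example only.
//
//    blaze::DynamicMatrix<double> A( 4UL, 4UL ), B( 4UL, 4UL );
//    blaze::DynamicVector<double> x( 4UL ), y;
//    // ... initialize A, B, and x ...
//    y = ( A * B ) * x;  // internally evaluated as A * ( B * x )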
5393 
5394 
5395 
5396 
5397 //=================================================================================================
5398 //
5399 // ISALIGNED SPECIALIZATIONS
5400 //
5401 //=================================================================================================
5402 
5403 //*************************************************************************************************
5405 template< typename MT, typename VT >
5406 struct IsAligned< DMatDVecMultExpr<MT,VT> >
5407  : public BoolConstant< IsAligned_v<MT> && IsAligned_v<VT> >
5408 {};
5410 //*************************************************************************************************
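// Trait sketch (illustrative, not part of the original header): the specialization above
// propagates the alignment property of both operands, so a compile-time query such as the
// following holds for any matrix type MT and vector type VT.
//
//    static_assert( blaze::IsAligned_v< blaze::DMatDVecMultExpr<MT,VT> > ==
//                   ( blaze::IsAligned_v<MT> && blaze::IsAligned_v<VT> ), "Alignment mismatch" );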
5411 
5412 } // namespace blaze
5413 
5414 #endif