TDVecTDMatMultExpr.h
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/Exception.h>
59 #include <blaze/math/shims/Reset.h>
61 #include <blaze/math/SIMD.h>
81 #include <blaze/math/views/Check.h>
82 #include <blaze/system/BLAS.h>
85 #include <blaze/util/Assert.h>
86 #include <blaze/util/Complex.h>
88 #include <blaze/util/DisableIf.h>
89 #include <blaze/util/EnableIf.h>
92 #include <blaze/util/mpl/If.h>
93 #include <blaze/util/Types.h>
101 
102 
103 namespace blaze {
104 
105 //=================================================================================================
106 //
107 // CLASS TDVECTDMATMULTEXPR
108 //
109 //=================================================================================================
110 
111 //*************************************************************************************************
118 template< typename VT // Type of the left-hand side dense vector
119  , typename MT > // Type of the right-hand side dense matrix
120 class TDVecTDMatMultExpr
121  : public TVecMatMultExpr< DenseVector< TDVecTDMatMultExpr<VT,MT>, true > >
122  , private Computation
123 {
124  private:
125  //**Type definitions****************************************************************************
132  //**********************************************************************************************
133 
134  //**********************************************************************************************
136  static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
137  //**********************************************************************************************
138 
139  //**********************************************************************************************
141  static constexpr bool evaluateMatrix =
142  ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
143  IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
144  //**********************************************************************************************
145 
146  //**********************************************************************************************
148 
152  template< typename T1 >
153  static constexpr bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
155  //**********************************************************************************************
156 
157  //**********************************************************************************************
159 
162  template< typename T1, typename T2, typename T3 >
163  static constexpr bool UseBlasKernel_v =
164  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION &&
165  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
166  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
167  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
168  !IsDiagonal_v<T3> &&
169  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
170  IsBLASCompatible_v< ElementType_t<T1> > &&
171  IsBLASCompatible_v< ElementType_t<T2> > &&
172  IsBLASCompatible_v< ElementType_t<T3> > &&
173  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
174  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
176  //**********************************************************************************************
177 
178  //**********************************************************************************************
180 
184  template< typename T1, typename T2, typename T3 >
185  static constexpr bool UseVectorizedDefaultKernel_v =
186  ( useOptimizedKernels &&
187  !IsDiagonal_v<T3> &&
188  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
189  IsSIMDCombinable_v< ElementType_t<T1>
190  , ElementType_t<T2>
191  , ElementType_t<T3> > &&
192  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
193  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
195  //**********************************************************************************************
196 
197  public:
198  //**Type definitions****************************************************************************
205  using ReturnType = const ElementType;
206  using CompositeType = const ResultType;
207 
209  using LeftOperand = If_t< IsExpression_v<VT>, const VT, const VT& >;
210 
212  using RightOperand = If_t< IsExpression_v<MT>, const MT, const MT& >;
213 
216 
219  //**********************************************************************************************
220 
221  //**Compilation flags***************************************************************************
223  static constexpr bool simdEnabled =
224  ( !IsDiagonal_v<MT> &&
225  VT::simdEnabled && MT::simdEnabled &&
226  HasSIMDAdd_v<VET,MET> &&
227  HasSIMDMult_v<VET,MET> );
228 
230  static constexpr bool smpAssignable =
231  ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
232  //**********************************************************************************************
233 
234  //**SIMD properties*****************************************************************************
236  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
237  //**********************************************************************************************
238 
239  //**Constructor*********************************************************************************
245  explicit inline TDVecTDMatMultExpr( const VT& vec, const MT& mat ) noexcept
246  : vec_( vec ) // Left-hand side dense vector of the multiplication expression
247  , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
248  {
249  BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
250  }
251  //**********************************************************************************************
252 
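   // [Editorial note, not part of the original header] A minimal usage sketch: the
   // expression is created by the free operator* between a transpose (row) dense vector
   // and a column-major dense matrix. The container types below are illustrative only:
   //
   //    blaze::DynamicVector<double,blaze::rowVector>   x( 100UL );        // 1x100 row vector
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 100UL, 50UL );  // 100x50 matrix
   //    blaze::DynamicVector<double,blaze::rowVector>   y;
   //
   //    y = x * A;   // creates and evaluates a TDVecTDMatMultExpr of size 50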
253  //**Subscript operator**************************************************************************
259  inline ReturnType operator[]( size_t index ) const {
260  BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
261 
262  if( IsDiagonal_v<MT> )
263  {
264  return vec_[index] * mat_(index,index);
265  }
266  else if( IsLower_v<MT> && ( index > 8UL ) )
267  {
268  const size_t begin( IsStrictlyLower_v<MT> ? index+1UL : index );
269  const size_t n ( mat_.rows() - begin );
270  return subvector( vec_, begin, n, unchecked ) *
271  subvector( column( mat_, index, unchecked ), begin, n, unchecked );
272  }
273  else if( IsUpper_v<MT> && ( index + 8UL < mat_.rows() ) )
274  {
275  const size_t n( IsStrictlyUpper_v<MT> ? index : index+1UL );
276  return subvector( vec_, 0UL, n, unchecked ) *
277  subvector( column( mat_, index, unchecked ), 0UL, n, unchecked );
278  }
279  else
280  {
281  return vec_ * column( mat_, index, unchecked );
282  }
283  }
284  //**********************************************************************************************
285 
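   // [Editorial note, not part of the original header] For triangular matrices the
   // subscript operator restricts the dot product to the structurally non-zero part of
   // column 'index': rows [index..rows) for a lower matrix, rows [0..index] for an upper
   // matrix. For instance, for a 1000x1000 lower matrix, (*this)[999] reads a single
   // element instead of the whole column. The 8UL cutoffs merely skip the subvector
   // setup when the potential savings are small.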
286  //**At function*********************************************************************************
293  inline ReturnType at( size_t index ) const {
294  if( index >= mat_.columns() ) {
295  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
296  }
297  return (*this)[index];
298  }
299  //**********************************************************************************************
300 
301  //**Size function*******************************************************************************
306  inline size_t size() const noexcept {
307  return mat_.columns();
308  }
309  //**********************************************************************************************
310 
311  //**Left operand access*************************************************************************
316  inline LeftOperand leftOperand() const noexcept {
317  return vec_;
318  }
319  //**********************************************************************************************
320 
321  //**Right operand access************************************************************************
326  inline RightOperand rightOperand() const noexcept {
327  return mat_;
328  }
329  //**********************************************************************************************
330 
331  //**********************************************************************************************
337  template< typename T >
338  inline bool canAlias( const T* alias ) const noexcept {
339  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
340  }
341  //**********************************************************************************************
342 
343  //**********************************************************************************************
349  template< typename T >
350  inline bool isAliased( const T* alias ) const noexcept {
351  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
352  }
353  //**********************************************************************************************
354 
355  //**********************************************************************************************
360  inline bool isAligned() const noexcept {
361  return vec_.isAligned() && mat_.isAligned();
362  }
363  //**********************************************************************************************
364 
365  //**********************************************************************************************
370  inline bool canSMPAssign() const noexcept {
371  return ( !BLAZE_BLAS_MODE ||
374  ( IsComputation_v<MT> && !evaluateMatrix ) ||
375  ( mat_.rows() * mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
376  ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
377  }
378  //**********************************************************************************************
379 
380  private:
381  //**Member variables****************************************************************************
382  LeftOperand  vec_; //!< Left-hand side dense vector of the multiplication expression.
383  RightOperand mat_; //!< Right-hand side dense matrix of the multiplication expression.
384  //**********************************************************************************************
385 
386  //**Assignment to dense vectors*****************************************************************
399  template< typename VT1 > // Type of the target dense vector
400  friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
401  {
403 
404  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
405 
406  if( rhs.mat_.rows() == 0UL ) {
407  reset( ~lhs );
408  return;
409  }
410  else if( rhs.mat_.columns() == 0UL ) {
411  return;
412  }
413 
414  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
415  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
416 
417  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
418  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
419  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
420  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
421 
422  TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
423  }
425  //**********************************************************************************************
426 
427  //**Assignment to dense vectors (kernel selection)**********************************************
438  template< typename VT1 // Type of the left-hand side target vector
439  , typename VT2 // Type of the left-hand side vector operand
440  , typename MT1 > // Type of the right-hand side matrix operand
441  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
442  {
443  if( ( IsDiagonal_v<MT1> ) ||
444  ( IsComputation_v<MT> && !evaluateMatrix ) ||
445  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
446  selectSmallAssignKernel( y, x, A );
447  else
448  selectBlasAssignKernel( y, x, A );
449  }
451  //**********************************************************************************************
452 
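   // [Editorial note, not part of the original header] The kernel selection above can be
   // summarized as follows (TDVECTDMATMULT_THRESHOLD is the size threshold from the
   // Blaze configuration headers):
   //
   //    if( A is diagonal || A is an unevaluated computation || A.rows()*A.columns() < threshold )
   //       selectSmallAssignKernel( y, x, A );   // register-blocked SIMD kernel
   //    else
   //       selectBlasAssignKernel( y, x, A );    // BLAS kernel, or the large kernel if BLAS is disabled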
453  //**Default assignment to dense vectors*********************************************************
467  template< typename VT1 // Type of the left-hand side target vector
468  , typename VT2 // Type of the left-hand side vector operand
469  , typename MT1 > // Type of the right-hand side matrix operand
470  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
471  {
472  y.assign( x * A );
473  }
475  //**********************************************************************************************
476 
477  //**Default assignment to dense vectors (small matrices)****************************************
491  template< typename VT1 // Type of the left-hand side target vector
492  , typename VT2 // Type of the left-hand side vector operand
493  , typename MT1 > // Type of the right-hand side matrix operand
494  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
495  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
496  {
497  selectDefaultAssignKernel( y, x, A );
498  }
500  //**********************************************************************************************
501 
502  //**Vectorized default assignment to dense vectors (small matrices)*****************************
516  template< typename VT1 // Type of the left-hand side target vector
517  , typename VT2 // Type of the left-hand side vector operand
518  , typename MT1 > // Type of the right-hand side matrix operand
519  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
520  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
521  {
522  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
523 
524  const size_t M( A.rows() );
525  const size_t N( A.columns() );
526 
527  size_t j( 0UL );
528 
529  for( ; (j+8UL) <= N; j+=8UL )
530  {
531  const size_t ibegin( ( IsLower_v<MT1> )
532  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
533  :( 0UL ) );
534  const size_t iend( ( IsUpper_v<MT1> )
535  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
536  :( M ) );
537  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
538 
539  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
540  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
541 
542  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
543  size_t i( ibegin );
544 
545  for( ; i<ipos; i+=SIMDSIZE ) {
546  const SIMDType x1( x.load(i) );
547  xmm1 += x1 * A.load(i,j );
548  xmm2 += x1 * A.load(i,j+1UL);
549  xmm3 += x1 * A.load(i,j+2UL);
550  xmm4 += x1 * A.load(i,j+3UL);
551  xmm5 += x1 * A.load(i,j+4UL);
552  xmm6 += x1 * A.load(i,j+5UL);
553  xmm7 += x1 * A.load(i,j+6UL);
554  xmm8 += x1 * A.load(i,j+7UL);
555  }
556 
557  y[j ] = sum( xmm1 );
558  y[j+1UL] = sum( xmm2 );
559  y[j+2UL] = sum( xmm3 );
560  y[j+3UL] = sum( xmm4 );
561  y[j+4UL] = sum( xmm5 );
562  y[j+5UL] = sum( xmm6 );
563  y[j+6UL] = sum( xmm7 );
564  y[j+7UL] = sum( xmm8 );
565 
566  for( ; remainder && i<iend; ++i ) {
567  y[j ] += x[i] * A(i,j );
568  y[j+1UL] += x[i] * A(i,j+1UL);
569  y[j+2UL] += x[i] * A(i,j+2UL);
570  y[j+3UL] += x[i] * A(i,j+3UL);
571  y[j+4UL] += x[i] * A(i,j+4UL);
572  y[j+5UL] += x[i] * A(i,j+5UL);
573  y[j+6UL] += x[i] * A(i,j+6UL);
574  y[j+7UL] += x[i] * A(i,j+7UL);
575  }
576  }
577 
578  for( ; (j+4UL) <= N; j+=4UL )
579  {
580  const size_t ibegin( ( IsLower_v<MT1> )
581  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
582  :( 0UL ) );
583  const size_t iend( ( IsUpper_v<MT1> )
584  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
585  :( M ) );
586  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
587 
588  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
589  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
590 
591  SIMDType xmm1, xmm2, xmm3, xmm4;
592  size_t i( ibegin );
593 
594  for( ; i<ipos; i+=SIMDSIZE ) {
595  const SIMDType x1( x.load(i) );
596  xmm1 += x1 * A.load(i,j );
597  xmm2 += x1 * A.load(i,j+1UL);
598  xmm3 += x1 * A.load(i,j+2UL);
599  xmm4 += x1 * A.load(i,j+3UL);
600  }
601 
602  y[j ] = sum( xmm1 );
603  y[j+1UL] = sum( xmm2 );
604  y[j+2UL] = sum( xmm3 );
605  y[j+3UL] = sum( xmm4 );
606 
607  for( ; remainder && i<iend; ++i ) {
608  y[j ] += x[i] * A(i,j );
609  y[j+1UL] += x[i] * A(i,j+1UL);
610  y[j+2UL] += x[i] * A(i,j+2UL);
611  y[j+3UL] += x[i] * A(i,j+3UL);
612  }
613  }
614 
615  for( ; (j+3UL) <= N; j+=3UL )
616  {
617  const size_t ibegin( ( IsLower_v<MT1> )
618  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
619  :( 0UL ) );
620  const size_t iend( ( IsUpper_v<MT1> )
621  ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
622  :( M ) );
623  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
624 
625  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
626  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
627 
628  SIMDType xmm1, xmm2, xmm3;
629  size_t i( ibegin );
630 
631  for( ; i<ipos; i+=SIMDSIZE ) {
632  const SIMDType x1( x.load(i) );
633  xmm1 += x1 * A.load(i,j );
634  xmm2 += x1 * A.load(i,j+1UL);
635  xmm3 += x1 * A.load(i,j+2UL);
636  }
637 
638  y[j ] = sum( xmm1 );
639  y[j+1UL] = sum( xmm2 );
640  y[j+2UL] = sum( xmm3 );
641 
642  for( ; remainder && i<iend; ++i ) {
643  y[j ] += x[i] * A(i,j );
644  y[j+1UL] += x[i] * A(i,j+1UL);
645  y[j+2UL] += x[i] * A(i,j+2UL);
646  }
647  }
648 
649  for( ; (j+2UL) <= N; j+=2UL )
650  {
651  const size_t ibegin( ( IsLower_v<MT1> )
652  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
653  :( 0UL ) );
654  const size_t iend( ( IsUpper_v<MT1> )
655  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
656  :( M ) );
657  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
658 
659  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
660  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
661 
662  SIMDType xmm1, xmm2;
663  size_t i( ibegin );
664 
665  for( ; i<ipos; i+=SIMDSIZE ) {
666  const SIMDType x1( x.load(i) );
667  xmm1 += x1 * A.load(i,j );
668  xmm2 += x1 * A.load(i,j+1UL);
669  }
670 
671  y[j ] = sum( xmm1 );
672  y[j+1UL] = sum( xmm2 );
673 
674  for( ; remainder && i<iend; ++i ) {
675  y[j ] += x[i] * A(i,j );
676  y[j+1UL] += x[i] * A(i,j+1UL);
677  }
678  }
679 
680  if( j < N )
681  {
682  const size_t ibegin( ( IsLower_v<MT1> )
683  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
684  :( 0UL ) );
685  const size_t iend( ( IsUpper_v<MT1> )
686  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
687  :( M ) );
688  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
689 
690  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
691  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
692 
693  SIMDType xmm1;
694  size_t i( ibegin );
695 
696  for( ; i<ipos; i+=SIMDSIZE ) {
697  xmm1 += x.load(i) * A.load(i,j);
698  }
699 
700  y[j] = sum( xmm1 );
701 
702  for( ; remainder && i<iend; ++i ) {
703  y[j] += x[i] * A(i,j);
704  }
705  }
706  }
708  //**********************************************************************************************
709 
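   // [Editorial note, not part of the original header] The vectorized small kernel above
   // processes the result in column blocks of 8, 4, 3, 2 and 1 and keeps one SIMD
   // accumulator per column. Ignoring the triangular bounds, the 8-column block is the
   // SIMD equivalent of the following scalar loop:
   //
   //    for( size_t jj=0UL; jj<8UL; ++jj ) {
   //       ElementType s{};
   //       for( size_t i=ibegin; i<iend; ++i )
   //          s += x[i] * A(i,j+jj);
   //       y[j+jj] = s;
   //    }
   //
   // The inner loop advances SIMDSIZE elements at a time and each accumulator is reduced
   // once per block via sum(); the scalar tail loop handles unpadded remainders.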
710  //**Default assignment to dense vectors (large matrices)****************************************
724  template< typename VT1 // Type of the left-hand side target vector
725  , typename VT2 // Type of the left-hand side vector operand
726  , typename MT1 > // Type of the right-hand side matrix operand
727  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
728  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
729  {
730  selectDefaultAssignKernel( y, x, A );
731  }
733  //**********************************************************************************************
734 
735  //**Vectorized default assignment to dense vectors (large matrices)*****************************
749  template< typename VT1 // Type of the left-hand side target vector
750  , typename VT2 // Type of the left-hand side vector operand
751  , typename MT1 > // Type of the right-hand side matrix operand
752  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
753  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
754  {
755  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
756 
757  const size_t M( A.rows() );
758  const size_t N( A.columns() );
759 
760  reset( y );
761 
762  size_t j( 0UL );
763 
764  for( ; (j+8UL) <= N; j+=8UL )
765  {
766  const size_t ibegin( ( IsLower_v<MT1> )
767  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
768  :( 0UL ) );
769  const size_t iend( ( IsUpper_v<MT1> )
770  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
771  :( M ) );
772  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
773 
774  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
775  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
776 
777  size_t i( ibegin );
778 
779  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
780  const size_t i1( i+SIMDSIZE );
781  const size_t i2( i+SIMDSIZE*2UL );
782  const size_t i3( i+SIMDSIZE*3UL );
783  const SIMDType x1( x.load(i ) );
784  const SIMDType x2( x.load(i1) );
785  const SIMDType x3( x.load(i2) );
786  const SIMDType x4( x.load(i3) );
787  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
788  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
789  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
790  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
791  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
792  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
793  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
794  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
795  }
796 
797  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
798  const size_t i1( i+SIMDSIZE );
799  const SIMDType x1( x.load(i ) );
800  const SIMDType x2( x.load(i1) );
801  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
802  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
803  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
804  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
805  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
806  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
807  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
808  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
809  }
810 
811  for( ; i<ipos; i+=SIMDSIZE ) {
812  const SIMDType x1( x.load(i) );
813  y[j ] += sum( x1 * A.load(i,j ) );
814  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
815  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
816  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
817  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
818  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
819  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
820  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
821  }
822 
823  for( ; remainder && i<iend; ++i ) {
824  y[j ] += x[i] * A(i,j );
825  y[j+1UL] += x[i] * A(i,j+1UL);
826  y[j+2UL] += x[i] * A(i,j+2UL);
827  y[j+3UL] += x[i] * A(i,j+3UL);
828  y[j+4UL] += x[i] * A(i,j+4UL);
829  y[j+5UL] += x[i] * A(i,j+5UL);
830  y[j+6UL] += x[i] * A(i,j+6UL);
831  y[j+7UL] += x[i] * A(i,j+7UL);
832  }
833  }
834 
835  for( ; (j+4UL) <= N; j+=4UL )
836  {
837  const size_t ibegin( ( IsLower_v<MT1> )
838  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
839  :( 0UL ) );
840  const size_t iend( ( IsUpper_v<MT1> )
841  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
842  :( M ) );
843  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
844 
845  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
846  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
847 
848  size_t i( ibegin );
849 
850  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
851  const size_t i1( i+SIMDSIZE );
852  const size_t i2( i+SIMDSIZE*2UL );
853  const size_t i3( i+SIMDSIZE*3UL );
854  const SIMDType x1( x.load(i ) );
855  const SIMDType x2( x.load(i1) );
856  const SIMDType x3( x.load(i2) );
857  const SIMDType x4( x.load(i3) );
858  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
859  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
860  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
861  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
862  }
863 
864  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
865  const size_t i1( i+SIMDSIZE );
866  const SIMDType x1( x.load(i ) );
867  const SIMDType x2( x.load(i1) );
868  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
869  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
870  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
871  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
872  }
873 
874  for( ; i<ipos; i+=SIMDSIZE ) {
875  const SIMDType x1( x.load(i) );
876  y[j ] += sum( x1 * A.load(i,j ) );
877  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
878  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
879  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
880  }
881 
882  for( ; remainder && i<iend; ++i ) {
883  y[j ] += x[i] * A(i,j );
884  y[j+1UL] += x[i] * A(i,j+1UL);
885  y[j+2UL] += x[i] * A(i,j+2UL);
886  y[j+3UL] += x[i] * A(i,j+3UL);
887  }
888  }
889 
890  for( ; (j+2UL) <= N; j+=2UL )
891  {
892  const size_t ibegin( ( IsLower_v<MT1> )
893  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
894  :( 0UL ) );
895  const size_t iend( ( IsUpper_v<MT1> )
896  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
897  :( M ) );
898  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
899 
900  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
901  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
902 
903  size_t i( ibegin );
904 
905  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
906  const size_t i1( i+SIMDSIZE );
907  const size_t i2( i+SIMDSIZE*2UL );
908  const size_t i3( i+SIMDSIZE*3UL );
909  const SIMDType x1( x.load(i ) );
910  const SIMDType x2( x.load(i1) );
911  const SIMDType x3( x.load(i2) );
912  const SIMDType x4( x.load(i3) );
913  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
914  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
915  }
916 
917  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
918  const size_t i1( i+SIMDSIZE );
919  const SIMDType x1( x.load(i ) );
920  const SIMDType x2( x.load(i1) );
921  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
922  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
923  }
924 
925  for( ; i<ipos; i+=SIMDSIZE ) {
926  const SIMDType x1( x.load(i) );
927  y[j ] += sum( x1 * A.load(i,j ) );
928  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
929  }
930 
931  for( ; remainder && i<iend; ++i ) {
932  y[j ] += x[i] * A(i,j );
933  y[j+1UL] += x[i] * A(i,j+1UL);
934  }
935  }
936 
937  if( j < N )
938  {
939  const size_t ibegin( ( IsLower_v<MT1> )
940  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
941  :( 0UL ) );
942  const size_t iend( ( IsUpper_v<MT1> )
943  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
944  :( M ) );
945  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
946 
947  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
948  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
949 
950  size_t i( ibegin );
951 
952  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
953  const size_t i1( i+SIMDSIZE );
954  const size_t i2( i+SIMDSIZE*2UL );
955  const size_t i3( i+SIMDSIZE*3UL );
956  const SIMDType x1( x.load(i ) );
957  const SIMDType x2( x.load(i1) );
958  const SIMDType x3( x.load(i2) );
959  const SIMDType x4( x.load(i3) );
960  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
961  }
962 
963  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
964  const size_t i1( i+SIMDSIZE );
965  const SIMDType x1( x.load(i ) );
966  const SIMDType x2( x.load(i1) );
967  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
968  }
969 
970  for( ; i<ipos; i+=SIMDSIZE ) {
971  const SIMDType x1( x.load(i) );
972  y[j] += sum( x1 * A.load(i,j) );
973  }
974 
975  for( ; remainder && i<iend; ++i ) {
976  y[j] += x[i] * A(i,j);
977  }
978  }
979  }
981  //**********************************************************************************************
982 
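   // [Editorial note, not part of the original header] Unlike the small kernel, the
   // large-matrix kernel first resets y and then accumulates partial results directly
   // into y[j..j+7], unrolling the row loop by 4*SIMDSIZE and reducing with sum() per
   // unrolled step. This keeps only a few SIMD registers live at a time, which is
   // presumably the better trade-off once a column block no longer fits into the caches.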
983  //**BLAS-based assignment to dense vectors (default)********************************************
997  template< typename VT1 // Type of the left-hand side target vector
998  , typename VT2 // Type of the left-hand side vector operand
999  , typename MT1 > // Type of the right-hand side matrix operand
1000  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1001  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1002  {
1003  selectLargeAssignKernel( y, x, A );
1004  }
1006  //**********************************************************************************************
1007 
1008  //**BLAS-based assignment to dense vectors******************************************************
1009 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1010 
1023  template< typename VT1 // Type of the left-hand side target vector
1024  , typename VT2 // Type of the left-hand side vector operand
1025  , typename MT1 > // Type of the right-hand side matrix operand
1026  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1027  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1028  {
1029  using ET = ElementType_t<VT1>;
1030 
1031  if( IsTriangular_v<MT1> ) {
1032  assign( y, x );
1033  trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1034  }
1035  else {
1036  gemv( y, x, A, ET(1), ET(0) );
1037  }
1038  }
1040 #endif
1041  //**********************************************************************************************
1042 
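   // [Editorial note, not part of the original header] Conceptually, the general branch
   // maps the row vector/column-major matrix product onto a BLAS level-2 call: y = x * A
   // is the transpose of A^T * x^T, which gemv( y, x, A, ET(1), ET(0) ) forwards to the
   // appropriate cblas_?gemv routine. The triangular branch first copies x into y and
   // then multiplies in place, since the cblas_?trmv routines update their vector
   // argument in place and take no separate result vector.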
1043  //**Assignment to sparse vectors****************************************************************
1056  template< typename VT1 > // Type of the target sparse vector
1057  friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1058  {
1060 
1064 
1065  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1066 
1067  const ResultType tmp( serial( rhs ) );
1068  assign( ~lhs, tmp );
1069  }
1071  //**********************************************************************************************
1072 
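   // [Editorial note, not part of the original header] Assignment to a sparse vector
   // reuses the dense kernels: the expression is first evaluated serially into a dense
   // ResultType temporary, which is then assigned to the sparse target. No sparse-specific
   // multiplication kernel is required.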
1073  //**Addition assignment to dense vectors********************************************************
1086  template< typename VT1 > // Type of the target dense vector
1087  friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1088  {
1090 
1091  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1092 
1093  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1094  return;
1095  }
1096 
1097  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1098  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1099 
1100  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1101  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1102  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1103  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1104 
1105  TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1106  }
1108  //**********************************************************************************************
1109 
1110  //**Addition assignment to dense vectors (kernel selection)*************************************
1121  template< typename VT1 // Type of the left-hand side target vector
1122  , typename VT2 // Type of the left-hand side vector operand
1123  , typename MT1 > // Type of the right-hand side matrix operand
1124  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1125  {
1126  if( ( IsDiagonal_v<MT1> ) ||
1127  ( IsComputation_v<MT> && !evaluateMatrix ) ||
1128  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1129  selectSmallAddAssignKernel( y, x, A );
1130  else
1131  selectBlasAddAssignKernel( y, x, A );
1132  }
1134  //**********************************************************************************************
1135 
1136  //**Default addition assignment to dense vectors************************************************
1150  template< typename VT1 // Type of the left-hand side target vector
1151  , typename VT2 // Type of the left-hand side vector operand
1152  , typename MT1 > // Type of the right-hand side matrix operand
1153  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1154  {
1155  y.addAssign( x * A );
1156  }
1158  //**********************************************************************************************
1159 
1160  //**Default addition assignment to dense vectors (small matrices)*******************************
1174  template< typename VT1 // Type of the left-hand side target vector
1175  , typename VT2 // Type of the left-hand side vector operand
1176  , typename MT1 > // Type of the right-hand side matrix operand
1177  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1178  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1179  {
1180  selectDefaultAddAssignKernel( y, x, A );
1181  }
1183  //**********************************************************************************************
1184 
1185  //**Vectorized default addition assignment to dense vectors (small matrices)********************
1200  template< typename VT1 // Type of the left-hand side target vector
1201  , typename VT2 // Type of the left-hand side vector operand
1202  , typename MT1 > // Type of the right-hand side matrix operand
1203  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1204  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1205  {
1206  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
1207 
1208  const size_t M( A.rows() );
1209  const size_t N( A.columns() );
1210 
1211  size_t j( 0UL );
1212 
1213  for( ; (j+8UL) <= N; j+=8UL )
1214  {
1215  const size_t ibegin( ( IsLower_v<MT1> )
1216  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1217  :( 0UL ) );
1218  const size_t iend( ( IsUpper_v<MT1> )
1219  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
1220  :( M ) );
1221  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1222 
1223  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1224  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1225 
1226  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1227  size_t i( ibegin );
1228 
1229  for( ; i<ipos; i+=SIMDSIZE ) {
1230  const SIMDType x1( x.load(i) );
1231  xmm1 += x1 * A.load(i,j );
1232  xmm2 += x1 * A.load(i,j+1UL);
1233  xmm3 += x1 * A.load(i,j+2UL);
1234  xmm4 += x1 * A.load(i,j+3UL);
1235  xmm5 += x1 * A.load(i,j+4UL);
1236  xmm6 += x1 * A.load(i,j+5UL);
1237  xmm7 += x1 * A.load(i,j+6UL);
1238  xmm8 += x1 * A.load(i,j+7UL);
1239  }
1240 
1241  y[j ] += sum( xmm1 );
1242  y[j+1UL] += sum( xmm2 );
1243  y[j+2UL] += sum( xmm3 );
1244  y[j+3UL] += sum( xmm4 );
1245  y[j+4UL] += sum( xmm5 );
1246  y[j+5UL] += sum( xmm6 );
1247  y[j+6UL] += sum( xmm7 );
1248  y[j+7UL] += sum( xmm8 );
1249 
1250  for( ; remainder && i<iend; ++i ) {
1251  y[j ] += x[i] * A(i,j );
1252  y[j+1UL] += x[i] * A(i,j+1UL);
1253  y[j+2UL] += x[i] * A(i,j+2UL);
1254  y[j+3UL] += x[i] * A(i,j+3UL);
1255  y[j+4UL] += x[i] * A(i,j+4UL);
1256  y[j+5UL] += x[i] * A(i,j+5UL);
1257  y[j+6UL] += x[i] * A(i,j+6UL);
1258  y[j+7UL] += x[i] * A(i,j+7UL);
1259  }
1260  }
1261 
1262  for( ; (j+4UL) <= N; j+=4UL )
1263  {
1264  const size_t ibegin( ( IsLower_v<MT1> )
1265  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1266  :( 0UL ) );
1267  const size_t iend( ( IsUpper_v<MT1> )
1268  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
1269  :( M ) );
1270  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1271 
1272  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1273  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1274 
1275  SIMDType xmm1, xmm2, xmm3, xmm4;
1276  size_t i( ibegin );
1277 
1278  for( ; i<ipos; i+=SIMDSIZE ) {
1279  const SIMDType x1( x.load(i) );
1280  xmm1 += x1 * A.load(i,j );
1281  xmm2 += x1 * A.load(i,j+1UL);
1282  xmm3 += x1 * A.load(i,j+2UL);
1283  xmm4 += x1 * A.load(i,j+3UL);
1284  }
1285 
1286  y[j ] += sum( xmm1 );
1287  y[j+1UL] += sum( xmm2 );
1288  y[j+2UL] += sum( xmm3 );
1289  y[j+3UL] += sum( xmm4 );
1290 
1291  for( ; remainder && i<iend; ++i ) {
1292  y[j ] += x[i] * A(i,j );
1293  y[j+1UL] += x[i] * A(i,j+1UL);
1294  y[j+2UL] += x[i] * A(i,j+2UL);
1295  y[j+3UL] += x[i] * A(i,j+3UL);
1296  }
1297  }
1298 
1299  for( ; (j+3UL) <= N; j+=3UL )
1300  {
1301  const size_t ibegin( ( IsLower_v<MT1> )
1302  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1303  :( 0UL ) );
1304  const size_t iend( ( IsUpper_v<MT1> )
1305  ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
1306  :( M ) );
1307  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1308 
1309  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1310  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1311 
1312  SIMDType xmm1, xmm2, xmm3;
1313  size_t i( ibegin );
1314 
1315  for( ; i<ipos; i+=SIMDSIZE ) {
1316  const SIMDType x1( x.load(i) );
1317  xmm1 += x1 * A.load(i,j );
1318  xmm2 += x1 * A.load(i,j+1UL);
1319  xmm3 += x1 * A.load(i,j+2UL);
1320  }
1321 
1322  y[j ] += sum( xmm1 );
1323  y[j+1UL] += sum( xmm2 );
1324  y[j+2UL] += sum( xmm3 );
1325 
1326  for( ; remainder && i<iend; ++i ) {
1327  y[j ] += x[i] * A(i,j );
1328  y[j+1UL] += x[i] * A(i,j+1UL);
1329  y[j+2UL] += x[i] * A(i,j+2UL);
1330  }
1331  }
1332 
1333  for( ; (j+2UL) <= N; j+=2UL )
1334  {
1335  const size_t ibegin( ( IsLower_v<MT1> )
1336  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1337  :( 0UL ) );
1338  const size_t iend( ( IsUpper_v<MT1> )
1339  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
1340  :( M ) );
1341  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1342 
1343  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1344  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1345 
1346  SIMDType xmm1, xmm2;
1347  size_t i( ibegin );
1348 
1349  for( ; i<ipos; i+=SIMDSIZE ) {
1350  const SIMDType x1( x.load(i) );
1351  xmm1 += x1 * A.load(i,j );
1352  xmm2 += x1 * A.load(i,j+1UL);
1353  }
1354 
1355  y[j ] += sum( xmm1 );
1356  y[j+1UL] += sum( xmm2 );
1357 
1358  for( ; remainder && i<iend; ++i ) {
1359  y[j ] += x[i] * A(i,j );
1360  y[j+1UL] += x[i] * A(i,j+1UL);
1361  }
1362  }
1363 
1364  if( j < N )
1365  {
1366  const size_t ibegin( ( IsLower_v<MT1> )
1367  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1368  :( 0UL ) );
1369  const size_t iend( ( IsUpper_v<MT1> )
1370  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
1371  :( M ) );
1372  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1373 
1374  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1375  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1376 
1377  SIMDType xmm1;
1378  size_t i( ibegin );
1379 
1380  for( ; i<ipos; i+=SIMDSIZE ) {
1381  xmm1 += x.load(i) * A.load(i,j);
1382  }
1383 
1384  y[j] += sum( xmm1 );
1385 
1386  for( ; remainder && i<iend; ++i ) {
1387  y[j] += x[i] * A(i,j);
1388  }
1389  }
1390  }
1392  //**********************************************************************************************
1393 
1394  //**Default addition assignment to dense vectors (large matrices)*******************************
1408  template< typename VT1 // Type of the left-hand side target vector
1409  , typename VT2 // Type of the left-hand side vector operand
1410  , typename MT1 > // Type of the right-hand side matrix operand
1411  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1412  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1413  {
1414  selectDefaultAddAssignKernel( y, x, A );
1415  }
1417  //**********************************************************************************************
1418 
1419  //**Vectorized default addition assignment to dense vectors (large matrices)********************
1434  template< typename VT1 // Type of the left-hand side target vector
1435  , typename VT2 // Type of the left-hand side vector operand
1436  , typename MT1 > // Type of the right-hand side matrix operand
1437  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1438  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1439  {
1440  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
1441 
1442  const size_t M( A.rows() );
1443  const size_t N( A.columns() );
1444 
1445  size_t j( 0UL );
1446 
1447  for( ; (j+8UL) <= N; j+=8UL )
1448  {
1449  const size_t ibegin( ( IsLower_v<MT1> )
1450  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1451  :( 0UL ) );
1452  const size_t iend( ( IsUpper_v<MT1> )
1453  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
1454  :( M ) );
1455  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1456 
1457  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1458  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1459 
1460  size_t i( ibegin );
1461 
1462  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1463  const size_t i1( i+SIMDSIZE );
1464  const size_t i2( i+SIMDSIZE*2UL );
1465  const size_t i3( i+SIMDSIZE*3UL );
1466  const SIMDType x1( x.load(i ) );
1467  const SIMDType x2( x.load(i1) );
1468  const SIMDType x3( x.load(i2) );
1469  const SIMDType x4( x.load(i3) );
1470  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1471  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1472  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1473  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1474  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1475  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1476  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1477  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
1478  }
1479 
1480  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1481  const size_t i1( i+SIMDSIZE );
1482  const SIMDType x1( x.load(i ) );
1483  const SIMDType x2( x.load(i1) );
1484  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1485  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1486  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1487  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1488  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1489  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1490  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1491  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
1492  }
1493 
1494  for( ; i<ipos; i+=SIMDSIZE ) {
1495  const SIMDType x1( x.load(i) );
1496  y[j ] += sum( x1 * A.load(i,j ) );
1497  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1498  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1499  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1500  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
1501  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
1502  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
1503  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
1504  }
1505 
1506  for( ; remainder && i<iend; ++i ) {
1507  y[j ] += x[i] * A(i,j );
1508  y[j+1UL] += x[i] * A(i,j+1UL);
1509  y[j+2UL] += x[i] * A(i,j+2UL);
1510  y[j+3UL] += x[i] * A(i,j+3UL);
1511  y[j+4UL] += x[i] * A(i,j+4UL);
1512  y[j+5UL] += x[i] * A(i,j+5UL);
1513  y[j+6UL] += x[i] * A(i,j+6UL);
1514  y[j+7UL] += x[i] * A(i,j+7UL);
1515  }
1516  }
1517 
1518  for( ; (j+4UL) <= N; j+=4UL )
1519  {
1520  const size_t ibegin( ( IsLower_v<MT1> )
1521  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1522  :( 0UL ) );
1523  const size_t iend( ( IsUpper_v<MT1> )
1524  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
1525  :( M ) );
1526  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1527 
1528  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1529  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1530 
1531  size_t i( ibegin );
1532 
1533  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1534  const size_t i1( i+SIMDSIZE );
1535  const size_t i2( i+SIMDSIZE*2UL );
1536  const size_t i3( i+SIMDSIZE*3UL );
1537  const SIMDType x1( x.load(i ) );
1538  const SIMDType x2( x.load(i1) );
1539  const SIMDType x3( x.load(i2) );
1540  const SIMDType x4( x.load(i3) );
1541  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1542  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1543  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1544  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1545  }
1546 
1547  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1548  const size_t i1( i+SIMDSIZE );
1549  const SIMDType x1( x.load(i ) );
1550  const SIMDType x2( x.load(i1) );
1551  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1552  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1553  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1554  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1555  }
1556 
1557  for( ; i<ipos; i+=SIMDSIZE ) {
1558  const SIMDType x1( x.load(i) );
1559  y[j ] += sum( x1 * A.load(i,j ) );
1560  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1561  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1562  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1563  }
1564 
1565  for( ; remainder && i<iend; ++i ) {
1566  y[j ] += x[i] * A(i,j );
1567  y[j+1UL] += x[i] * A(i,j+1UL);
1568  y[j+2UL] += x[i] * A(i,j+2UL);
1569  y[j+3UL] += x[i] * A(i,j+3UL);
1570  }
1571  }
1572 
1573  for( ; (j+2UL) <= N; j+=2UL )
1574  {
1575  const size_t ibegin( ( IsLower_v<MT1> )
1576  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1577  :( 0UL ) );
1578  const size_t iend( ( IsUpper_v<MT1> )
1579  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
1580  :( M ) );
1581  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1582 
1583  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1584  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1585 
1586  size_t i( ibegin );
1587 
1588  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1589  const size_t i1( i+SIMDSIZE );
1590  const size_t i2( i+SIMDSIZE*2UL );
1591  const size_t i3( i+SIMDSIZE*3UL );
1592  const SIMDType x1( x.load(i ) );
1593  const SIMDType x2( x.load(i1) );
1594  const SIMDType x3( x.load(i2) );
1595  const SIMDType x4( x.load(i3) );
1596  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1597  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1598  }
1599 
1600  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1601  const size_t i1( i+SIMDSIZE );
1602  const SIMDType x1( x.load(i ) );
1603  const SIMDType x2( x.load(i1) );
1604  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1605  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1606  }
1607 
1608  for( ; i<ipos; i+=SIMDSIZE ) {
1609  const SIMDType x1( x.load(i) );
1610  y[j ] += sum( x1 * A.load(i,j ) );
1611  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1612  }
1613 
1614  for( ; remainder && i<iend; ++i ) {
1615  y[j ] += x[i] * A(i,j );
1616  y[j+1UL] += x[i] * A(i,j+1UL);
1617  }
1618  }
1619 
1620  if( j < N )
1621  {
1622  const size_t ibegin( ( IsLower_v<MT1> )
1623  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1624  :( 0UL ) );
1625  const size_t iend( ( IsUpper_v<MT1> )
1626  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
1627  :( M ) );
1628  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1629 
1630  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1631  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1632 
1633  size_t i( ibegin );
1634 
1635  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1636  const size_t i1( i+SIMDSIZE );
1637  const size_t i2( i+SIMDSIZE*2UL );
1638  const size_t i3( i+SIMDSIZE*3UL );
1639  const SIMDType x1( x.load(i ) );
1640  const SIMDType x2( x.load(i1) );
1641  const SIMDType x3( x.load(i2) );
1642  const SIMDType x4( x.load(i3) );
1643  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1644  }
1645 
1646  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1647  const size_t i1( i+SIMDSIZE );
1648  const SIMDType x1( x.load(i ) );
1649  const SIMDType x2( x.load(i1) );
1650  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1651  }
1652 
1653  for( ; i<ipos; i+=SIMDSIZE ) {
1654  const SIMDType x1( x.load(i) );
1655  y[j] += sum( x1 * A.load(i,j) );
1656  }
1657 
1658  for( ; remainder && i<iend; ++i ) {
1659  y[j] += x[i] * A(i,j);
1660  }
1661  }
1662  }
1664  //**********************************************************************************************
1665 
1666  //**BLAS-based addition assignment to dense vectors (default)***********************************
1680  template< typename VT1 // Type of the left-hand side target vector
1681  , typename VT2 // Type of the left-hand side vector operand
1682  , typename MT1 > // Type of the right-hand side matrix operand
1683  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1684  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1685  {
1686  selectLargeAddAssignKernel( y, x, A );
1687  }
1689  //**********************************************************************************************
1690 
1691  //**BLAS-based addition assignment to dense vectors*********************************************
1692 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1693 
1706  template< typename VT1 // Type of the left-hand side target vector
1707  , typename VT2 // Type of the left-hand side vector operand
1708  , typename MT1 > // Type of the right-hand side matrix operand
1709  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1710  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1711  {
1712  using ET = ElementType_t<VT1>;
1713 
1714  if( IsTriangular_v<MT1> ) {
1715  ResultType_t<VT1> tmp( serial( x ) );
1716  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1717  addAssign( y, tmp );
1718  }
1719  else {
1720  gemv( y, x, A, ET(1), ET(1) );
1721  }
1722  }
1724 #endif
1725  //**********************************************************************************************
1726 
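   // [Editorial note, not part of the original header] For the addition assignment the
   // triangular branch cannot work in place on y, because ?trmv overwrites its vector
   // argument. The kernel therefore evaluates x into a temporary, multiplies the temporary
   // in place, and adds it to y via addAssign(). The general branch simply calls gemv()
   // with beta = ET(1) to accumulate into y.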
1727  //**Addition assignment to sparse vectors*******************************************************
1728  // No special implementation for the addition assignment to sparse vectors.
1729  //**********************************************************************************************
1730 
1731  //**Subtraction assignment to dense vectors*****************************************************
1744  template< typename VT1 > // Type of the target dense vector
1745  friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1746  {
1748 
1749  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1750 
1751  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1752  return;
1753  }
1754 
1755  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1756  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1757 
1758  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1759  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1760  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1761  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1762 
1763  TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1764  }
1766  //**********************************************************************************************
1767 
1768  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1779  template< typename VT1 // Type of the left-hand side target vector
1780  , typename VT2 // Type of the left-hand side vector operand
1781  , typename MT1 > // Type of the right-hand side matrix operand
1782  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1783  {
1784  if( ( IsDiagonal_v<MT1> ) ||
1785  ( IsComputation_v<MT> && !evaluateMatrix ) ||
1786  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1787  selectSmallSubAssignKernel( y, x, A );
1788  else
1789  selectBlasSubAssignKernel( y, x, A );
1790  }
1792  //**********************************************************************************************
1793 
1794  //**Default subtraction assignment to dense vectors*********************************************
1808  template< typename VT1 // Type of the left-hand side target vector
1809  , typename VT2 // Type of the left-hand side vector operand
1810  , typename MT1 > // Type of the right-hand side matrix operand
1811  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1812  {
1813  y.subAssign( x * A );
1814  }
1816  //**********************************************************************************************
1817 
1818  //**Default subtraction assignment to dense vectors (small matrices)****************************
1832  template< typename VT1 // Type of the left-hand side target vector
1833  , typename VT2 // Type of the left-hand side vector operand
1834  , typename MT1 > // Type of the right-hand side matrix operand
1835  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1836  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1837  {
1838  selectDefaultSubAssignKernel( y, x, A );
1839  }
1841  //**********************************************************************************************
1842 
1843  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
1858  template< typename VT1 // Type of the left-hand side target vector
1859  , typename VT2 // Type of the left-hand side vector operand
1860  , typename MT1 > // Type of the right-hand side matrix operand
1861  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1862  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1863  {
1864  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
1865 
1866  const size_t M( A.rows() );
1867  const size_t N( A.columns() );
1868 
1869  size_t j( 0UL );
1870 
1871  for( ; (j+8UL) <= N; j+=8UL )
1872  {
1873  const size_t ibegin( ( IsLower_v<MT1> )
1874  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1875  :( 0UL ) );
1876  const size_t iend( ( IsUpper_v<MT1> )
1877  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
1878  :( M ) );
1879  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1880 
1881  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1882  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1883 
1884  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1885  size_t i( ibegin );
1886 
1887  for( ; i<ipos; i+=SIMDSIZE ) {
1888  const SIMDType x1( x.load(i) );
1889  xmm1 += x1 * A.load(i,j );
1890  xmm2 += x1 * A.load(i,j+1UL);
1891  xmm3 += x1 * A.load(i,j+2UL);
1892  xmm4 += x1 * A.load(i,j+3UL);
1893  xmm5 += x1 * A.load(i,j+4UL);
1894  xmm6 += x1 * A.load(i,j+5UL);
1895  xmm7 += x1 * A.load(i,j+6UL);
1896  xmm8 += x1 * A.load(i,j+7UL);
1897  }
1898 
1899  y[j ] -= sum( xmm1 );
1900  y[j+1UL] -= sum( xmm2 );
1901  y[j+2UL] -= sum( xmm3 );
1902  y[j+3UL] -= sum( xmm4 );
1903  y[j+4UL] -= sum( xmm5 );
1904  y[j+5UL] -= sum( xmm6 );
1905  y[j+6UL] -= sum( xmm7 );
1906  y[j+7UL] -= sum( xmm8 );
1907 
1908  for( ; remainder && i<iend; ++i ) {
1909  y[j ] -= x[i] * A(i,j );
1910  y[j+1UL] -= x[i] * A(i,j+1UL);
1911  y[j+2UL] -= x[i] * A(i,j+2UL);
1912  y[j+3UL] -= x[i] * A(i,j+3UL);
1913  y[j+4UL] -= x[i] * A(i,j+4UL);
1914  y[j+5UL] -= x[i] * A(i,j+5UL);
1915  y[j+6UL] -= x[i] * A(i,j+6UL);
1916  y[j+7UL] -= x[i] * A(i,j+7UL);
1917  }
1918  }
1919 
1920  for( ; (j+4UL) <= N; j+=4UL )
1921  {
1922  const size_t ibegin( ( IsLower_v<MT1> )
1923  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1924  :( 0UL ) );
1925  const size_t iend( ( IsUpper_v<MT1> )
1926  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
1927  :( M ) );
1928  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1929 
1930  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1931  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1932 
1933  SIMDType xmm1, xmm2, xmm3, xmm4;
1934  size_t i( ibegin );
1935 
1936  for( ; i<ipos; i+=SIMDSIZE ) {
1937  const SIMDType x1( x.load(i) );
1938  xmm1 += x1 * A.load(i,j );
1939  xmm2 += x1 * A.load(i,j+1UL);
1940  xmm3 += x1 * A.load(i,j+2UL);
1941  xmm4 += x1 * A.load(i,j+3UL);
1942  }
1943 
1944  y[j ] -= sum( xmm1 );
1945  y[j+1UL] -= sum( xmm2 );
1946  y[j+2UL] -= sum( xmm3 );
1947  y[j+3UL] -= sum( xmm4 );
1948 
1949  for( ; remainder && i<iend; ++i ) {
1950  y[j ] -= x[i] * A(i,j );
1951  y[j+1UL] -= x[i] * A(i,j+1UL);
1952  y[j+2UL] -= x[i] * A(i,j+2UL);
1953  y[j+3UL] -= x[i] * A(i,j+3UL);
1954  }
1955  }
1956 
1957  for( ; (j+3UL) <= N; j+=3UL )
1958  {
1959  const size_t ibegin( ( IsLower_v<MT1> )
1960  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1961  :( 0UL ) );
1962  const size_t iend( ( IsUpper_v<MT1> )
1963  ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
1964  :( M ) );
1965  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1966 
1967  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1968  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1969 
1970  SIMDType xmm1, xmm2, xmm3;
1971  size_t i( ibegin );
1972 
1973  for( ; i<ipos; i+=SIMDSIZE ) {
1974  const SIMDType x1( x.load(i) );
1975  xmm1 += x1 * A.load(i,j );
1976  xmm2 += x1 * A.load(i,j+1UL);
1977  xmm3 += x1 * A.load(i,j+2UL);
1978  }
1979 
1980  y[j ] -= sum( xmm1 );
1981  y[j+1UL] -= sum( xmm2 );
1982  y[j+2UL] -= sum( xmm3 );
1983 
1984  for( ; remainder && i<iend; ++i ) {
1985  y[j ] -= x[i] * A(i,j );
1986  y[j+1UL] -= x[i] * A(i,j+1UL);
1987  y[j+2UL] -= x[i] * A(i,j+2UL);
1988  }
1989  }
1990 
1991  for( ; (j+2UL) <= N; j+=2UL )
1992  {
1993  const size_t ibegin( ( IsLower_v<MT1> )
1994  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
1995  :( 0UL ) );
1996  const size_t iend( ( IsUpper_v<MT1> )
1997  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
1998  :( M ) );
1999  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2000 
2001  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2002  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2003 
2004  SIMDType xmm1, xmm2;
2005  size_t i( ibegin );
2006 
2007  for( ; i<ipos; i+=SIMDSIZE ) {
2008  const SIMDType x1( x.load(i) );
2009  xmm1 += x1 * A.load(i,j );
2010  xmm2 += x1 * A.load(i,j+1UL);
2011  }
2012 
2013  y[j ] -= sum( xmm1 );
2014  y[j+1UL] -= sum( xmm2 );
2015 
2016  for( ; remainder && i<iend; ++i ) {
2017  y[j ] -= x[i] * A(i,j );
2018  y[j+1UL] -= x[i] * A(i,j+1UL);
2019  }
2020  }
2021 
2022  if( j < N )
2023  {
2024  const size_t ibegin( ( IsLower_v<MT1> )
2025  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
2026  :( 0UL ) );
2027  const size_t iend( ( IsUpper_v<MT1> )
2028  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
2029  :( M ) );
2030  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2031 
2032  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2033  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2034 
2035  SIMDType xmm1;
2036  size_t i( ibegin );
2037 
2038  for( ; i<ipos; i+=SIMDSIZE ) {
2039  xmm1 += A.load(i,j) * x.load(i);
2040  }
2041 
2042  y[j] -= sum( xmm1 );
2043 
2044  for( ; remainder && i<iend; ++i ) {
2045  y[j] -= x[i] * A(i,j);
2046  }
2047  }
2048  }
2050  //**********************************************************************************************
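// Reference note (editorial sketch): the kernel above walks the columns of A in blocks of
// 8/4/3/2/1 and keeps one SIMD accumulator per column; after the SIMD loop each accumulator
// is reduced with sum() and subtracted from the corresponding element of y. For a block of
// 'block' columns starting at j the computation is equivalent to the scalar loop
//
//    for( size_t jj=j; jj<j+block; ++jj )
//       for( size_t i=ibegin; i<iend; ++i )
//          y[jj] -= x[i] * A(i,jj);
//
// where ibegin/iend restrict the row range to the non-zero band of lower/upper triangular
// matrices. The trailing scalar loop only runs when 'remainder' is true, i.e. when one of the
// operands is not padded to a multiple of SIMDSIZE.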
2051 
2052  //**Default subtraction assignment to dense vectors (large matrices)****************************
2066  template< typename VT1 // Type of the left-hand side target vector
2067  , typename VT2 // Type of the left-hand side vector operand
2068  , typename MT1 > // Type of the right-hand side matrix operand
2069  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2070  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
2071  {
2072  selectDefaultSubAssignKernel( y, x, A );
2073  }
2075  //**********************************************************************************************
2076 
2077  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
2092  template< typename VT1 // Type of the left-hand side target vector
2093  , typename VT2 // Type of the left-hand side vector operand
2094  , typename MT1 > // Type of the right-hand side matrix operand
2095  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2096  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
2097  {
2098  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
2099 
2100  const size_t M( A.rows() );
2101  const size_t N( A.columns() );
2102 
2103  size_t j( 0UL );
2104 
2105  for( ; (j+8UL) <= N; j+=8UL )
2106  {
2107  const size_t ibegin( ( IsLower_v<MT1> )
2108  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
2109  :( 0UL ) );
2110  const size_t iend( ( IsUpper_v<MT1> )
2111  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
2112  :( M ) );
2113  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2114 
2115  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2116  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2117 
2118  size_t i( ibegin );
2119 
2120  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2121  const size_t i1( i+SIMDSIZE );
2122  const size_t i2( i+SIMDSIZE*2UL );
2123  const size_t i3( i+SIMDSIZE*3UL );
2124  const SIMDType x1( x.load(i ) );
2125  const SIMDType x2( x.load(i1) );
2126  const SIMDType x3( x.load(i2) );
2127  const SIMDType x4( x.load(i3) );
2128  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2129  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2130  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2131  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2132  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2133  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2134  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2135  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
2136  }
2137 
2138  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2139  const size_t i1( i+SIMDSIZE );
2140  const SIMDType x1( x.load(i ) );
2141  const SIMDType x2( x.load(i1) );
2142  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2143  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2144  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2145  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2146  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2147  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2148  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2149  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
2150  }
2151 
2152  for( ; i<ipos; i+=SIMDSIZE ) {
2153  const SIMDType x1( x.load(i) );
2154  y[j ] -= sum( x1 * A.load(i,j ) );
2155  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2156  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2157  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2158  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) );
2159  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) );
2160  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) );
2161  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) );
2162  }
2163 
2164  for( ; remainder && i<iend; ++i ) {
2165  y[j ] -= x[i] * A(i,j );
2166  y[j+1UL] -= x[i] * A(i,j+1UL);
2167  y[j+2UL] -= x[i] * A(i,j+2UL);
2168  y[j+3UL] -= x[i] * A(i,j+3UL);
2169  y[j+4UL] -= x[i] * A(i,j+4UL);
2170  y[j+5UL] -= x[i] * A(i,j+5UL);
2171  y[j+6UL] -= x[i] * A(i,j+6UL);
2172  y[j+7UL] -= x[i] * A(i,j+7UL);
2173  }
2174  }
2175 
2176  for( ; (j+4UL) <= N; j+=4UL )
2177  {
2178  const size_t ibegin( ( IsLower_v<MT1> )
2179  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
2180  :( 0UL ) );
2181  const size_t iend( ( IsUpper_v<MT1> )
2182  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
2183  :( M ) );
2184  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2185 
2186  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2187  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2188 
2189  size_t i( ibegin );
2190 
2191  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2192  const size_t i1( i+SIMDSIZE );
2193  const size_t i2( i+SIMDSIZE*2UL );
2194  const size_t i3( i+SIMDSIZE*3UL );
2195  const SIMDType x1( x.load(i ) );
2196  const SIMDType x2( x.load(i1) );
2197  const SIMDType x3( x.load(i2) );
2198  const SIMDType x4( x.load(i3) );
2199  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2200  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2201  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2202  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2203  }
2204 
2205  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2206  const size_t i1( i+SIMDSIZE );
2207  const SIMDType x1( x.load(i ) );
2208  const SIMDType x2( x.load(i1) );
2209  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2210  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2211  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2212  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2213  }
2214 
2215  for( ; i<ipos; i+=SIMDSIZE ) {
2216  const SIMDType x1( x.load(i) );
2217  y[j ] -= sum( x1 * A.load(i,j ) );
2218  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2219  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2220  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2221  }
2222 
2223  for( ; remainder && i<iend; ++i ) {
2224  y[j ] -= x[i] * A(i,j );
2225  y[j+1UL] -= x[i] * A(i,j+1UL);
2226  y[j+2UL] -= x[i] * A(i,j+2UL);
2227  y[j+3UL] -= x[i] * A(i,j+3UL);
2228  }
2229  }
2230 
2231  for( ; (j+2UL) <= N; j+=2UL )
2232  {
2233  const size_t ibegin( ( IsLower_v<MT1> )
2234  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
2235  :( 0UL ) );
2236  const size_t iend( ( IsUpper_v<MT1> )
2237  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
2238  :( M ) );
2239  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2240 
2241  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2242  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2243 
2244  size_t i( ibegin );
2245 
2246  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2247  const size_t i1( i+SIMDSIZE );
2248  const size_t i2( i+SIMDSIZE*2UL );
2249  const size_t i3( i+SIMDSIZE*3UL );
2250  const SIMDType x1( x.load(i ) );
2251  const SIMDType x2( x.load(i1) );
2252  const SIMDType x3( x.load(i2) );
2253  const SIMDType x4( x.load(i3) );
2254  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2255  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2256  }
2257 
2258  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2259  const size_t i1( i+SIMDSIZE );
2260  const SIMDType x1( x.load(i ) );
2261  const SIMDType x2( x.load(i1) );
2262  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2263  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2264  }
2265 
2266  for( ; i<ipos; i+=SIMDSIZE ) {
2267  const SIMDType x1( x.load(i) );
2268  y[j ] -= sum( x1 * A.load(i,j ) );
2269  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2270  }
2271 
2272  for( ; remainder && i<iend; ++i ) {
2273  y[j ] -= x[i] * A(i,j );
2274  y[j+1UL] -= x[i] * A(i,j+1UL);
2275  }
2276  }
2277 
2278  if( j < N )
2279  {
2280  const size_t ibegin( ( IsLower_v<MT1> )
2281  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
2282  :( 0UL ) );
2283  const size_t iend( ( IsUpper_v<MT1> )
2284  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
2285  :( M ) );
2286  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2287 
2288  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2289  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2290 
2291  size_t i( ibegin );
2292 
2293  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2294  const size_t i1( i+SIMDSIZE );
2295  const size_t i2( i+SIMDSIZE*2UL );
2296  const size_t i3( i+SIMDSIZE*3UL );
2297  const SIMDType x1( x.load(i ) );
2298  const SIMDType x2( x.load(i1) );
2299  const SIMDType x3( x.load(i2) );
2300  const SIMDType x4( x.load(i3) );
2301  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
2302  }
2303 
2304  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2305  const size_t i1( i+SIMDSIZE );
2306  const SIMDType x1( x.load(i ) );
2307  const SIMDType x2( x.load(i1) );
2308  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
2309  }
2310 
2311  for( ; i<ipos; i+=SIMDSIZE ) {
2312  const SIMDType x1( x.load(i) );
2313  y[j] -= sum( x1 * A.load(i,j) );
2314  }
2315 
2316  for( ; remainder && i<iend; ++i ) {
2317  y[j] -= x[i] * A(i,j);
2318  }
2319  }
2320  }
2322  //**********************************************************************************************
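// Reference note (editorial sketch): the large-matrix kernel computes the same result as the
// small kernel above, but additionally unrolls the row loop by 4*SIMDSIZE and 2*SIMDSIZE so
// that each loaded chunk of x (x1..x4) is reused across all columns of the current block, and
// the horizontally reduced partial sums are subtracted from y directly instead of being kept
// in one accumulator register per column. This variant is reached via
// selectBlasSubAssignKernel() for products at or above TDVECTDMATMULT_THRESHOLD whenever the
// BLAS kernel is not applicable.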
2323 
2324  //**BLAS-based subtraction assignment to dense vectors (default)********************************
2338  template< typename VT1 // Type of the left-hand side target vector
2339  , typename VT2 // Type of the left-hand side vector operand
2340  , typename MT1 > // Type of the right-hand side matrix operand
2341  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2342  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2343  {
2344  selectLargeSubAssignKernel( y, x, A );
2345  }
2347  //**********************************************************************************************
2348 
2349  //**BLAS-based subtraction assignment to dense vectors******************************************
2350 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2351 
2364  template< typename VT1 // Type of the left-hand side target vector
2365  , typename VT2 // Type of the left-hand side vector operand
2366  , typename MT1 > // Type of the right-hand side matrix operand
2367  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2368  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2369  {
2370  using ET = ElementType_t<VT1>;
2371 
2372  if( IsTriangular_v<MT1> ) {
2373  ResultType_t<VT1> tmp( serial( x ) );
2374  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2375  subAssign( y, tmp );
2376  }
2377  else {
2378  gemv( y, x, A, ET(-1), ET(1) );
2379  }
2380  }
2382 #endif
2383  //**********************************************************************************************
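// Reference note (editorial sketch): here the alpha/beta pair passed to the gemv wrapper is
// ET(-1)/ET(1), so the call is assumed to evaluate y = y - x*A in a single step. As in the
// addition assignment, triangular matrices take the trmv detour through a temporary:
//
//    ResultType_t<VT1> tmp( serial( x ) );  // tmp = x
//    trmv( tmp, A, ... );                   // tmp = tmp * A, computed in place
//    subAssign( y, tmp );                   // y -= tmp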
2384 
2385  //**Subtraction assignment to sparse vectors****************************************************
2386  // No special implementation for the subtraction assignment to sparse vectors.
2387  //**********************************************************************************************
2388 
2389  //**Multiplication assignment to dense vectors**************************************************
2402  template< typename VT1 > // Type of the target dense vector
2403  friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2404  {
2406 
2410 
2411  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2412 
2413  const ResultType tmp( serial( rhs ) );
2414  multAssign( ~lhs, tmp );
2415  }
2417  //**********************************************************************************************
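// Reference note (editorial): a componentwise multiplication assignment cannot be expressed in
// terms of the product kernels, since every element of x*A depends on the complete vector x.
// The implementation above therefore evaluates the full product into a temporary first and
// then multiplies elementwise, conceptually
//
//    const ResultType tmp( x * A );
//    for( size_t j=0UL; j<tmp.size(); ++j )
//       (~lhs)[j] *= tmp[j];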
2418 
2419  //**Multiplication assignment to sparse vectors*************************************************
2420  // No special implementation for the multiplication assignment to sparse vectors.
2421  //**********************************************************************************************
2422 
2423  //**Division assignment to dense vectors********************************************************
2436  template< typename VT1 > // Type of the target dense vector
2437  friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2438  {
2440 
2444 
2445  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2446 
2447  const ResultType tmp( serial( rhs ) );
2448  divAssign( ~lhs, tmp );
2449  }
2451  //**********************************************************************************************
2452 
2453  //**Division assignment to sparse vectors*******************************************************
2454  // No special implementation for the division assignment to sparse vectors.
2455  //**********************************************************************************************
2456 
2457  //**SMP assignment to dense vectors*************************************************************
2472  template< typename VT1 > // Type of the target dense vector
2473  friend inline auto smpAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2474  -> EnableIf_t< UseSMPAssign_v<VT1> >
2475  {
2477 
2478  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2479 
2480  if( rhs.mat_.rows() == 0UL ) {
2481  reset( ~lhs );
2482  return;
2483  }
2484  else if( rhs.mat_.columns() == 0UL ) {
2485  return;
2486  }
2487 
2488  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2489  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2490 
2491  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2492  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2493  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2494  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2495 
2496  smpAssign( ~lhs, x * A );
2497  }
2499  //**********************************************************************************************
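// Reference note (editorial): the SMP overload only participates in overload resolution when
// UseSMPAssign_v<VT1> is satisfied. After evaluating x and A it simply forwards to
// smpAssign( ~lhs, x * A ), so splitting the remaining plain product across threads is left to
// the generic SMP assignment machinery rather than being handled by this class.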
2500 
2501  //**SMP assignment to sparse vectors************************************************************
2516  template< typename VT1 > // Type of the target sparse vector
2517  friend inline auto smpAssign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2518  -> EnableIf_t< UseSMPAssign_v<VT1> >
2519  {
2521 
2525 
2526  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2527 
2528  const ResultType tmp( rhs );
2529  smpAssign( ~lhs, tmp );
2530  }
2532  //**********************************************************************************************
2533 
2534  //**SMP addition assignment to dense vectors****************************************************
2549  template< typename VT1 > // Type of the target dense vector
2550  friend inline auto smpAddAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2551  -> EnableIf_t< UseSMPAssign_v<VT1> >
2552  {
2554 
2555  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2556 
2557  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2558  return;
2559  }
2560 
2561  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2562  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2563 
2564  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2565  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2566  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2567  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2568 
2569  smpAddAssign( ~lhs, x * A );
2570  }
2572  //**********************************************************************************************
2573 
2574  //**SMP addition assignment to sparse vectors***************************************************
2575  // No special implementation for the SMP addition assignment to sparse vectors.
2576  //**********************************************************************************************
2577 
2578  //**SMP subtraction assignment to dense vectors*************************************************
2593  template< typename VT1 > // Type of the target dense vector
2594  friend inline auto smpSubAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2595  -> EnableIf_t< UseSMPAssign_v<VT1> >
2596  {
2598 
2599  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2600 
2601  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2602  return;
2603  }
2604 
2605  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2606  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2607 
2608  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2609  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2610  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2611  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2612 
2613  smpSubAssign( ~lhs, x * A );
2614  }
2616  //**********************************************************************************************
2617 
2618  //**SMP subtraction assignment to sparse vectors************************************************
2619  // No special implementation for the SMP subtraction assignment to sparse vectors.
2620  //**********************************************************************************************
2621 
2622  //**SMP multiplication assignment to dense vectors**********************************************
2637  template< typename VT1 > // Type of the target dense vector
2638  friend inline auto smpMultAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2639  -> EnableIf_t< UseSMPAssign_v<VT1> >
2640  {
2642 
2646 
2647  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2648 
2649  const ResultType tmp( rhs );
2650  smpMultAssign( ~lhs, tmp );
2651  }
2653  //**********************************************************************************************
2654 
2655  //**SMP multiplication assignment to sparse vectors*********************************************
2656  // No special implementation for the SMP multiplication assignment to sparse vectors.
2657  //**********************************************************************************************
2658 
2659  //**SMP division assignment to dense vectors****************************************************
2674  template< typename VT1 > // Type of the target dense vector
2675  friend inline auto smpDivAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2676  -> EnableIf_t< UseSMPAssign_v<VT1> >
2677  {
2679 
2683 
2684  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2685 
2686  const ResultType tmp( rhs );
2687  smpDivAssign( ~lhs, tmp );
2688  }
2690  //**********************************************************************************************
2691 
2692  //**SMP division assignment to sparse vectors***************************************************
2693  // No special implementation for the SMP division assignment to sparse vectors.
2694  //**********************************************************************************************
2695 
2696  //**Compile time checks*************************************************************************
2704  //**********************************************************************************************
2705 };
2706 //*************************************************************************************************
2707 
2708 
2709 
2710 
2711 //=================================================================================================
2712 //
2713 // DVECSCALARMULTEXPR SPECIALIZATION
2714 //
2715 //=================================================================================================
2716 
2717 //*************************************************************************************************
2725 template< typename VT // Type of the left-hand side dense vector
2726  , typename MT // Type of the right-hand side dense matrix
 2727  , typename ST > // Type of the scalar value
2728 class DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >
2729  : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true > >
2730  , private Computation
2731 {
2732  private:
2733  //**Type definitions****************************************************************************
2734  using VMM = TDVecTDMatMultExpr<VT,MT>;
2735  using RES = ResultType_t<VMM>;
2736  using VRT = ResultType_t<VT>;
2737  using MRT = ResultType_t<MT>;
2738  using VET = ElementType_t<VRT>;
2739  using MET = ElementType_t<MRT>;
2740  using VCT = CompositeType_t<VT>;
2741  using MCT = CompositeType_t<MT>;
2742  //**********************************************************************************************
2743 
2744  //**********************************************************************************************
2746  static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
2747  //**********************************************************************************************
2748 
2749  //**********************************************************************************************
2751  static constexpr bool evaluateMatrix =
2752  ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2753  IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
2754  //**********************************************************************************************
2755 
2756  //**********************************************************************************************
2758 
2760  template< typename T1 >
2761  static constexpr bool UseSMPAssign_v =
2762  ( T1::smpAssignable && ( evaluateVector || evaluateMatrix ) );
2763  //**********************************************************************************************
2764 
2765  //**********************************************************************************************
2767 
2769  template< typename T1, typename T2, typename T3, typename T4 >
2770  static constexpr bool UseBlasKernel_v =
2772  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2773  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2774  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2775  !IsDiagonal_v<T3> &&
2776  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2777  IsBLASCompatible_v< ElementType_t<T1> > &&
2778  IsBLASCompatible_v< ElementType_t<T2> > &&
2779  IsBLASCompatible_v< ElementType_t<T3> > &&
2780  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2781  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2782  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
2783  //**********************************************************************************************
2784 
2785  //**********************************************************************************************
2787 
2790  template< typename T1, typename T2, typename T3, typename T4 >
2791  static constexpr bool UseVectorizedDefaultKernel_v =
2792  ( useOptimizedKernels &&
2793  !IsDiagonal_v<T3> &&
2794  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2795  IsSIMDCombinable_v< ElementType_t<T1>
2796  , ElementType_t<T2>
2797  , ElementType_t<T3>
2798  , T4 > &&
2799  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2800  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
2801  //**********************************************************************************************
2802 
2803  public:
2804  //**Type definitions****************************************************************************
2805  using This = DVecScalarMultExpr<VMM,ST,true>;
2806  using BaseType = DenseVector<This,true>;
2807  using ResultType = MultTrait_t<RES,ST>;
2808  using TransposeType = TransposeType_t<ResultType>;
2809  using ElementType = ElementType_t<ResultType>;
2810  using SIMDType = SIMDTrait_t<ElementType>;
2811  using ReturnType = const ElementType;
2812  using CompositeType = const ResultType;
2813 
2815  using LeftOperand = const TDVecTDMatMultExpr<VT,MT>;
2816 
2818  using RightOperand = ST;
2819 
2821  using LT = If_t< evaluateVector, const VRT, VCT >;
2822 
2824  using RT = If_t< evaluateMatrix, const MRT, MCT >;
2825  //**********************************************************************************************
2826 
2827  //**Compilation flags***************************************************************************
2829  static constexpr bool simdEnabled =
2830  ( !IsDiagonal_v<MT> &&
2831  VT::simdEnabled && MT::simdEnabled &&
2832  IsSIMDCombinable_v<VET,MET,ST> &&
2833  HasSIMDAdd_v<VET,MET> &&
2834  HasSIMDMult_v<VET,MET> );
2835 
2837  static constexpr bool smpAssignable =
2838  ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
2839  //**********************************************************************************************
2840 
2841  //**SIMD properties*****************************************************************************
2843  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
2844  //**********************************************************************************************
2845 
2846  //**Constructor*********************************************************************************
2852  explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )
2853  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2854  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2855  {}
2856  //**********************************************************************************************
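// Usage sketch (editorial example): this specialization is instantiated when a transpose dense
// vector/column-major dense matrix product is scaled, so that the scalar can be folded into
// the product kernels instead of requiring a separate scaling pass, e.g.
//
//    blaze::DynamicVector<double,blaze::rowVector>  x( 50UL );
//    blaze::DynamicVector<double,blaze::rowVector>  y( 80UL );
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 50UL, 80UL );
//    // ... initialize x and A ...
//    y = 2.0 * ( x * A );  // DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, double, true >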
2857 
2858  //**Subscript operator**************************************************************************
2864  inline ReturnType operator[]( size_t index ) const {
2865  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2866  return vector_[index] * scalar_;
2867  }
2868  //**********************************************************************************************
2869 
2870  //**At function*********************************************************************************
2877  inline ReturnType at( size_t index ) const {
2878  if( index >= vector_.size() ) {
2879  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2880  }
2881  return (*this)[index];
2882  }
2883  //**********************************************************************************************
2884 
2885  //**Size function*******************************************************************************
2890  inline size_t size() const {
2891  return vector_.size();
2892  }
2893  //**********************************************************************************************
2894 
2895  //**Left operand access*************************************************************************
2900  inline LeftOperand leftOperand() const {
2901  return vector_;
2902  }
2903  //**********************************************************************************************
2904 
2905  //**Right operand access************************************************************************
2910  inline RightOperand rightOperand() const {
2911  return scalar_;
2912  }
2913  //**********************************************************************************************
2914 
2915  //**********************************************************************************************
2921  template< typename T >
2922  inline bool canAlias( const T* alias ) const {
2923  return vector_.canAlias( alias );
2924  }
2925  //**********************************************************************************************
2926 
2927  //**********************************************************************************************
2933  template< typename T >
2934  inline bool isAliased( const T* alias ) const {
2935  return vector_.isAliased( alias );
2936  }
2937  //**********************************************************************************************
2938 
2939  //**********************************************************************************************
2944  inline bool isAligned() const {
2945  return vector_.isAligned();
2946  }
2947  //**********************************************************************************************
2948 
2949  //**********************************************************************************************
2954  inline bool canSMPAssign() const noexcept {
2955  RightOperand_t<VMM> A( vector_.rightOperand() );
2956  return ( !BLAZE_BLAS_MODE ||
2959  ( IsComputation_v<MT> && !evaluateMatrix ) ||
2960  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
2961  ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
2962  }
2963  //**********************************************************************************************
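// Note (editorial): canSMPAssign() combines two heuristics: SMP execution is avoided whenever
// the operands are expected to be handed to a BLAS kernel (presumably since the BLAS call may
// already run in parallel), and it is only enabled once the result size exceeds
// SMP_TDVECTDMATMULT_THRESHOLD.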
2964 
2965  private:
2966  //**Member variables****************************************************************************
2969  //**********************************************************************************************
2970 
2971  //**Assignment to dense vectors*****************************************************************
2983  template< typename VT1 // Type of the target dense vector
2984  , bool TF > // Transpose flag of the target dense vector
2985  friend inline void assign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
2986  {
2988 
2989  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2990 
2991  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
2992  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
2993 
2994  if( right.rows() == 0UL ) {
2995  reset( ~lhs );
2996  return;
2997  }
2998  else if( right.columns() == 0UL ) {
2999  return;
3000  }
3001 
3002  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3003  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3004 
3005  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3006  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3007  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3008  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3009 
3010  DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
3011  }
3012  //**********************************************************************************************
3013 
3014  //**Assignment to dense vectors (kernel selection)**********************************************
3025  template< typename VT1 // Type of the left-hand side target vector
3026  , typename VT2 // Type of the left-hand side vector operand
3027  , typename MT1 // Type of the right-hand side matrix operand
3028  , typename ST2 > // Type of the scalar value
3029  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3030  {
3031  if( ( IsDiagonal_v<MT1> ) ||
3032  ( IsComputation_v<MT> && !evaluateMatrix ) ||
3033  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3034  selectSmallAssignKernel( y, x, A, scalar );
3035  else
3036  selectBlasAssignKernel( y, x, A, scalar );
3037  }
3038  //**********************************************************************************************
3039 
3040  //**Default assignment to dense vectors*********************************************************
3054  template< typename VT1 // Type of the left-hand side target vector
3055  , typename VT2 // Type of the left-hand side vector operand
3056  , typename MT1 // Type of the right-hand side matrix operand
3057  , typename ST2 > // Type of the scalar value
3058  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3059  {
3060  y.assign( x * A * scalar );
3061  }
3062  //**********************************************************************************************
3063 
3064  //**Default assignment to dense vectors (small matrices)****************************************
3078  template< typename VT1 // Type of the left-hand side target vector
3079  , typename VT2 // Type of the left-hand side vector operand
3080  , typename MT1 // Type of the right-hand side matrix operand
3081  , typename ST2 > // Type of the scalar value
3082  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3083  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3084  {
3085  selectDefaultAssignKernel( y, x, A, scalar );
3086  }
3087  //**********************************************************************************************
3088 
3089  //**Vectorized default assignment to dense vectors (small matrices)*****************************
3104  template< typename VT1 // Type of the left-hand side target vector
3105  , typename VT2 // Type of the left-hand side vector operand
3106  , typename MT1 // Type of the right-hand side matrix operand
3107  , typename ST2 > // Type of the scalar value
3108  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3109  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3110  {
3111  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
3112 
3113  const size_t M( A.rows() );
3114  const size_t N( A.columns() );
3115 
3116  size_t j( 0UL );
3117 
3118  for( ; (j+8UL) <= N; j+=8UL )
3119  {
3120  const size_t ibegin( ( IsLower_v<MT1> )
3121  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3122  :( 0UL ) );
3123  const size_t iend( ( IsUpper_v<MT1> )
3124  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
3125  :( M ) );
3126  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3127 
3128  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3129  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3130 
3131  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3132  size_t i( ibegin );
3133 
3134  for( ; i<ipos; i+=SIMDSIZE ) {
3135  const SIMDType x1( x.load(i) );
3136  xmm1 += x1 * A.load(i,j );
3137  xmm2 += x1 * A.load(i,j+1UL);
3138  xmm3 += x1 * A.load(i,j+2UL);
3139  xmm4 += x1 * A.load(i,j+3UL);
3140  xmm5 += x1 * A.load(i,j+4UL);
3141  xmm6 += x1 * A.load(i,j+5UL);
3142  xmm7 += x1 * A.load(i,j+6UL);
3143  xmm8 += x1 * A.load(i,j+7UL);
3144  }
3145 
3146  y[j ] = sum( xmm1 ) * scalar;
3147  y[j+1UL] = sum( xmm2 ) * scalar;
3148  y[j+2UL] = sum( xmm3 ) * scalar;
3149  y[j+3UL] = sum( xmm4 ) * scalar;
3150  y[j+4UL] = sum( xmm5 ) * scalar;
3151  y[j+5UL] = sum( xmm6 ) * scalar;
3152  y[j+6UL] = sum( xmm7 ) * scalar;
3153  y[j+7UL] = sum( xmm8 ) * scalar;
3154 
3155  for( ; remainder && i<iend; ++i ) {
3156  y[j ] += x[i] * A(i,j ) * scalar;
3157  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3158  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3159  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3160  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3161  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3162  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3163  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3164  }
3165  }
3166 
3167  for( ; (j+4UL) <= N; j+=4UL )
3168  {
3169  const size_t ibegin( ( IsLower_v<MT1> )
3170  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3171  :( 0UL ) );
3172  const size_t iend( ( IsUpper_v<MT1> )
3173  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
3174  :( M ) );
3175  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3176 
3177  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3178  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3179 
3180  SIMDType xmm1, xmm2, xmm3, xmm4;
3181  size_t i( ibegin );
3182 
3183  for( ; i<ipos; i+=SIMDSIZE ) {
3184  const SIMDType x1( x.load(i) );
3185  xmm1 += x1 * A.load(i,j );
3186  xmm2 += x1 * A.load(i,j+1UL);
3187  xmm3 += x1 * A.load(i,j+2UL);
3188  xmm4 += x1 * A.load(i,j+3UL);
3189  }
3190 
3191  y[j ] = sum( xmm1 ) * scalar;
3192  y[j+1UL] = sum( xmm2 ) * scalar;
3193  y[j+2UL] = sum( xmm3 ) * scalar;
3194  y[j+3UL] = sum( xmm4 ) * scalar;
3195 
3196  for( ; remainder && i<iend; ++i ) {
3197  y[j ] += x[i] * A(i,j ) * scalar;
3198  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3199  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3200  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3201  }
3202  }
3203 
3204  for( ; (j+3UL) <= N; j+=3UL )
3205  {
3206  const size_t ibegin( ( IsLower_v<MT1> )
3207  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3208  :( 0UL ) );
3209  const size_t iend( ( IsUpper_v<MT1> )
3210  ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
3211  :( M ) );
3212  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3213 
3214  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3215  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3216 
3217  SIMDType xmm1, xmm2, xmm3;
3218  size_t i( ibegin );
3219 
3220  for( ; i<ipos; i+=SIMDSIZE ) {
3221  const SIMDType x1( x.load(i) );
3222  xmm1 += x1 * A.load(i,j );
3223  xmm2 += x1 * A.load(i,j+1UL);
3224  xmm3 += x1 * A.load(i,j+2UL);
3225  }
3226 
3227  y[j ] = sum( xmm1 ) * scalar;
3228  y[j+1UL] = sum( xmm2 ) * scalar;
3229  y[j+2UL] = sum( xmm3 ) * scalar;
3230 
3231  for( ; remainder && i<iend; ++i ) {
3232  y[j ] += x[i] * A(i,j ) * scalar;
3233  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3234  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3235  }
3236  }
3237 
3238  for( ; (j+2UL) <= N; j+=2UL )
3239  {
3240  const size_t ibegin( ( IsLower_v<MT1> )
3241  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3242  :( 0UL ) );
3243  const size_t iend( ( IsUpper_v<MT1> )
3244  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
3245  :( M ) );
3246  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3247 
3248  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3249  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3250 
3251  SIMDType xmm1, xmm2;
3252  size_t i( ibegin );
3253 
3254  for( ; i<ipos; i+=SIMDSIZE ) {
3255  const SIMDType x1( x.load(i) );
3256  xmm1 += x1 * A.load(i,j );
3257  xmm2 += x1 * A.load(i,j+1UL);
3258  }
3259 
3260  y[j ] = sum( xmm1 ) * scalar;
3261  y[j+1UL] = sum( xmm2 ) * scalar;
3262 
3263  for( ; remainder && i<iend; ++i ) {
3264  y[j ] += x[i] * A(i,j ) * scalar;
3265  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3266  }
3267  }
3268 
3269  if( j < N )
3270  {
3271  const size_t ibegin( ( IsLower_v<MT1> )
3272  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3273  :( 0UL ) );
3274  const size_t iend( ( IsUpper_v<MT1> )
3275  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
3276  :( M ) );
3277  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3278 
3279  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3280  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3281 
3282  SIMDType xmm1;
3283  size_t i( ibegin );
3284 
3285  for( ; i<ipos; i+=SIMDSIZE ) {
3286  xmm1 += A.load(i,j) * x.load(i);
3287  }
3288 
3289  y[j] = sum( xmm1 ) * scalar;
3290 
3291  for( ; remainder && i<iend; ++i ) {
3292  y[j] += x[i] * A(i,j) * scalar;
3293  }
3294  }
3295  }
3296  //**********************************************************************************************
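// Reference note (editorial sketch): this kernel mirrors the unscaled small kernel, except that
// the scalar is applied once per column after the horizontal reduction and the non-padded
// remainder elements are accumulated already scaled. Per column the result is equivalent to
//
//    y[j] = ( sum over i of x[i] * A(i,j) ) * scalar;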
3297 
3298  //**Default assignment to dense vectors (large matrices)****************************************
3312  template< typename VT1 // Type of the left-hand side target vector
3313  , typename VT2 // Type of the left-hand side vector operand
3314  , typename MT1 // Type of the right-hand side matrix operand
3315  , typename ST2 > // Type of the scalar value
3316  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3317  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3318  {
3319  selectDefaultAssignKernel( y, x, A, scalar );
3320  }
3321  //**********************************************************************************************
3322 
3323  //**Vectorized default assignment to dense vectors (large matrices)*****************************
3338  template< typename VT1 // Type of the left-hand side target vector
3339  , typename VT2 // Type of the left-hand side vector operand
3340  , typename MT1 // Type of the right-hand side matrix operand
3341  , typename ST2 > // Type of the scalar value
3342  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3343  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3344  {
3345  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
3346 
3347  const size_t M( A.rows() );
3348  const size_t N( A.columns() );
3349 
3350  reset( y );
3351 
3352  size_t j( 0UL );
3353 
3354  for( ; (j+8UL) <= N; j+=8UL )
3355  {
3356  const size_t ibegin( ( IsLower_v<MT1> )
3357  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3358  :( 0UL ) );
3359  const size_t iend( ( IsUpper_v<MT1> )
3360  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
3361  :( M ) );
3362  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3363 
3364  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3365  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3366 
3367  size_t i( ibegin );
3368 
3369  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3370  const size_t i1( i+SIMDSIZE );
3371  const size_t i2( i+SIMDSIZE*2UL );
3372  const size_t i3( i+SIMDSIZE*3UL );
3373  const SIMDType x1( x.load(i ) );
3374  const SIMDType x2( x.load(i1) );
3375  const SIMDType x3( x.load(i2) );
3376  const SIMDType x4( x.load(i3) );
3377  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3378  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3379  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3380  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3381  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3382  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3383  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3384  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
3385  }
3386 
3387  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3388  const size_t i1( i+SIMDSIZE );
3389  const SIMDType x1( x.load(i ) );
3390  const SIMDType x2( x.load(i1) );
3391  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3392  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3393  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3394  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3395  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3396  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3397  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3398  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
3399  }
3400 
3401  for( ; i<ipos; i+=SIMDSIZE ) {
3402  const SIMDType x1( x.load(i) );
3403  y[j ] += sum( x1 * A.load(i,j ) );
3404  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3405  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3406  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3407  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
3408  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
3409  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
3410  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
3411  }
3412 
3413  for( ; remainder && i<iend; ++i ) {
3414  y[j ] += x[i] * A(i,j );
3415  y[j+1UL] += x[i] * A(i,j+1UL);
3416  y[j+2UL] += x[i] * A(i,j+2UL);
3417  y[j+3UL] += x[i] * A(i,j+3UL);
3418  y[j+4UL] += x[i] * A(i,j+4UL);
3419  y[j+5UL] += x[i] * A(i,j+5UL);
3420  y[j+6UL] += x[i] * A(i,j+6UL);
3421  y[j+7UL] += x[i] * A(i,j+7UL);
3422  }
3423 
3424  y[j ] *= scalar;
3425  y[j+1UL] *= scalar;
3426  y[j+2UL] *= scalar;
3427  y[j+3UL] *= scalar;
3428  y[j+4UL] *= scalar;
3429  y[j+5UL] *= scalar;
3430  y[j+6UL] *= scalar;
3431  y[j+7UL] *= scalar;
3432  }
3433 
3434  for( ; (j+4UL) <= N; j+=4UL )
3435  {
3436  const size_t ibegin( ( IsLower_v<MT1> )
3437  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3438  :( 0UL ) );
3439  const size_t iend( ( IsUpper_v<MT1> )
3440  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
3441  :( M ) );
3442  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3443 
3444  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3445  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3446 
3447  size_t i( ibegin );
3448 
3449  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3450  const size_t i1( i+SIMDSIZE );
3451  const size_t i2( i+SIMDSIZE*2UL );
3452  const size_t i3( i+SIMDSIZE*3UL );
3453  const SIMDType x1( x.load(i ) );
3454  const SIMDType x2( x.load(i1) );
3455  const SIMDType x3( x.load(i2) );
3456  const SIMDType x4( x.load(i3) );
3457  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3458  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3459  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3460  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3461  }
3462 
3463  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3464  const size_t i1( i+SIMDSIZE );
3465  const SIMDType x1( x.load(i ) );
3466  const SIMDType x2( x.load(i1) );
3467  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3468  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3469  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3470  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3471  }
3472 
3473  for( ; i<ipos; i+=SIMDSIZE ) {
3474  const SIMDType x1( x.load(i) );
3475  y[j ] += sum( x1 * A.load(i,j ) );
3476  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3477  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3478  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3479  }
3480 
3481  for( ; remainder && i<iend; ++i ) {
3482  y[j ] += x[i] * A(i,j );
3483  y[j+1UL] += x[i] * A(i,j+1UL);
3484  y[j+2UL] += x[i] * A(i,j+2UL);
3485  y[j+3UL] += x[i] * A(i,j+3UL);
3486  }
3487 
3488  y[j ] *= scalar;
3489  y[j+1UL] *= scalar;
3490  y[j+2UL] *= scalar;
3491  y[j+3UL] *= scalar;
3492  }
3493 
3494  for( ; (j+2UL) <= N; j+=2UL )
3495  {
3496  const size_t ibegin( ( IsLower_v<MT1> )
3497  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3498  :( 0UL ) );
3499  const size_t iend( ( IsUpper_v<MT1> )
3500  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
3501  :( M ) );
3502  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3503 
3504  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3505  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3506 
3507  size_t i( ibegin );
3508 
3509  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3510  const size_t i1( i+SIMDSIZE );
3511  const size_t i2( i+SIMDSIZE*2UL );
3512  const size_t i3( i+SIMDSIZE*3UL );
3513  const SIMDType x1( x.load(i ) );
3514  const SIMDType x2( x.load(i1) );
3515  const SIMDType x3( x.load(i2) );
3516  const SIMDType x4( x.load(i3) );
3517  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3518  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3519  }
3520 
3521  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3522  const size_t i1( i+SIMDSIZE );
3523  const SIMDType x1( x.load(i ) );
3524  const SIMDType x2( x.load(i1) );
3525  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3526  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3527  }
3528 
3529  for( ; i<ipos; i+=SIMDSIZE ) {
3530  const SIMDType x1( x.load(i) );
3531  y[j ] += sum( x1 * A.load(i,j ) );
3532  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3533  }
3534 
3535  for( ; remainder && i<iend; ++i ) {
3536  y[j ] += x[i] * A(i,j );
3537  y[j+1UL] += x[i] * A(i,j+1UL);
3538  }
3539 
3540  y[j ] *= scalar;
3541  y[j+1UL] *= scalar;
3542  }
3543 
3544  if( j < N )
3545  {
3546  const size_t ibegin( ( IsLower_v<MT1> )
3547  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3548  :( 0UL ) );
3549  const size_t iend( ( IsUpper_v<MT1> )
3550  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
3551  :( M ) );
3552  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3553 
3554  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3555  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3556 
3557  size_t i( ibegin );
3558 
3559  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3560  const size_t i1( i+SIMDSIZE );
3561  const size_t i2( i+SIMDSIZE*2UL );
3562  const size_t i3( i+SIMDSIZE*3UL );
3563  const SIMDType x1( x.load(i ) );
3564  const SIMDType x2( x.load(i1) );
3565  const SIMDType x3( x.load(i2) );
3566  const SIMDType x4( x.load(i3) );
3567  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
3568  }
3569 
3570  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3571  const size_t i1( i+SIMDSIZE );
3572  const SIMDType x1( x.load(i ) );
3573  const SIMDType x2( x.load(i1) );
3574  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
3575  }
3576 
3577  for( ; i<ipos; i+=SIMDSIZE ) {
3578  const SIMDType x1( x.load(i) );
3579  y[j] += sum( x1 * A.load(i,j) );
3580  }
3581 
3582  for( ; remainder && i<iend; ++i ) {
3583  y[j] += x[i] * A(i,j);
3584  }
3585 
3586  y[j] *= scalar;
3587  }
3588  }
3589  //**********************************************************************************************
3590 
3591  //**BLAS-based assignment to dense vectors (default)********************************************
3604  template< typename VT1 // Type of the left-hand side target vector
3605  , typename VT2 // Type of the left-hand side vector operand
3606  , typename MT1 // Type of the right-hand side matrix operand
3607  , typename ST2 > // Type of the scalar value
3608  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3609  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3610  {
3611  selectLargeAssignKernel( y, x, A, scalar );
3612  }
3613  //**********************************************************************************************
3614 
3615  //**BLAS-based assignment to dense vectors******************************************************
3616 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3617 
3630  template< typename VT1 // Type of the left-hand side target vector
3631  , typename VT2 // Type of the left-hand side vector operand
3632  , typename MT1 // Type of the right-hand side matrix operand
3633  , typename ST2 > // Type of the scalar value
3634  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3635  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3636  {
3637  using ET = ElementType_t<VT1>;
3638 
3639  if( IsTriangular_v<MT1> ) {
3640  assign( y, scalar * x );
3641  trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3642  }
3643  else {
3644  gemv( y, x, A, ET(scalar), ET(0) );
3645  }
3646  }
3647 #endif
3648  //**********************************************************************************************
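 // Illustrative usage sketch (not taken from this file; the operand types and sizes are
 // assumptions chosen for exposition only). It shows an expression that is represented by
 // this DVecScalarMultExpr specialization and handed to the assignment kernels above:
 //
 //    #include <blaze/Math.h>
 //
 //    blaze::DynamicVector<double,blaze::rowVector>   a( 100UL, 1.0 );        // dense row vector
 //    blaze::DynamicMatrix<double,blaze::columnMajor> B( 100UL, 50UL, 2.0 );  // column-major dense matrix
 //    blaze::DynamicVector<double,blaze::rowVector>   y( 50UL );
 //
 //    y = ( a * B ) * 3.0;  // scaled transpose dense vector/dense matrix multiplication;
 //                          // depending on problem size and element types this is dispatched
 //                          // to the small, large, or BLAS-based assign kernel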
3649 
3650  //**Assignment to sparse vectors****************************************************************
3662  template< typename VT1 // Type of the target sparse vector
3663  , bool TF > // Transpose flag of the target sparse vector
3664  friend inline void assign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3665  {
3667 
3671 
3672  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3673 
3674  const ResultType tmp( serial( rhs ) );
3675  assign( ~lhs, tmp );
3676  }
3677  //**********************************************************************************************
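 // Note: sparse targets are not filled element by element. The product is first evaluated
 // serially into a dense temporary of type ResultType, and the temporary is then assigned to
 // the sparse vector, which avoids scattered writes into the sparse storage.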
3678 
3679  //**Addition assignment to dense vectors********************************************************
3691  template< typename VT1 // Type of the target dense vector
3692  , bool TF > // Transpose flag of the target dense vector
3693  friend inline void addAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3694  {
3696 
3697  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3698 
3699  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3700  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3701 
3702  if( right.rows() == 0UL || right.columns() == 0UL ) {
3703  return;
3704  }
3705 
3706  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3707  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3708 
3709  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3710  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3711  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3712  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3713 
3714  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3715  }
3716  //**********************************************************************************************
3717 
3718  //**Addition assignment to dense vectors (kernel selection)*************************************
3729  template< typename VT1 // Type of the left-hand side target vector
3730  , typename VT2 // Type of the left-hand side vector operand
3731  , typename MT1 // Type of the right-hand side matrix operand
3732  , typename ST2 > // Type of the scalar value
3733  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3734  {
3735  if( ( IsDiagonal_v<MT1> ) ||
3736  ( IsComputation_v<MT> && !evaluateMatrix ) ||
3737  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3738  selectSmallAddAssignKernel( y, x, A, scalar );
3739  else
3740  selectBlasAddAssignKernel( y, x, A, scalar );
3741  }
3742  //**********************************************************************************************
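 // Note: the kernel selection above prefers the small (SIMD) kernel whenever the matrix is
 // diagonal, the matrix operand is an unevaluated computation, or the total number of elements
 // lies below TDVECDMATMULT_THRESHOLD; only sufficiently large, fully evaluated operands are
 // forwarded to the BLAS-based kernel, which itself falls back to the large kernel if BLAS
 // cannot be used for the given types.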
3743 
3744  //**Default addition assignment to dense vectors************************************************
3758  template< typename VT1 // Type of the left-hand side target vector
3759  , typename VT2 // Type of the left-hand side vector operand
3760  , typename MT1 // Type of the right-hand side matrix operand
3761  , typename ST2 > // Type of the scalar value
3762  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3763  {
3764  y.addAssign( x * A * scalar );
3765  }
3766  //**********************************************************************************************
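 // Note: the default addition assignment kernel above is mathematically equivalent to the
 // scalar loop
 //
 //    for( size_t j=0UL; j<A.columns(); ++j )
 //       for( size_t i=0UL; i<A.rows(); ++i )
 //          y[j] += x[i] * A(i,j) * scalar;
 //
 // i.e. every element y[j] is incremented by scalar times the j-th element of the product x*A.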
3767 
3768  //**Default addition assignment to dense vectors (small matrices)*******************************
3782  template< typename VT1 // Type of the left-hand side target vector
3783  , typename VT2 // Type of the left-hand side vector operand
3784  , typename MT1 // Type of the right-hand side matrix operand
3785  , typename ST2 > // Type of the scalar value
3786  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3787  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3788  {
3789  selectDefaultAddAssignKernel( y, x, A, scalar );
3790  }
3791  //**********************************************************************************************
3792 
3793  //**Vectorized default addition assignment to dense vectors (small matrices)********************
3808  template< typename VT1 // Type of the left-hand side target vector
3809  , typename VT2 // Type of the left-hand side vector operand
3810  , typename MT1 // Type of the right-hand side matrix operand
3811  , typename ST2 > // Type of the scalar value
3812  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3813  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3814  {
3815  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
3816 
3817  const size_t M( A.rows() );
3818  const size_t N( A.columns() );
3819 
3820  size_t j( 0UL );
3821 
3822  for( ; (j+8UL) <= N; j+=8UL )
3823  {
3824  const size_t ibegin( ( IsLower_v<MT1> )
3825  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3826  :( 0UL ) );
3827  const size_t iend( ( IsUpper_v<MT1> )
3828  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
3829  :( M ) );
3830  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3831 
3832  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3833  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3834 
3835  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3836  size_t i( ibegin );
3837 
3838  for( ; i<ipos; i+=SIMDSIZE ) {
3839  const SIMDType x1( x.load(i) );
3840  xmm1 += x1 * A.load(i,j );
3841  xmm2 += x1 * A.load(i,j+1UL);
3842  xmm3 += x1 * A.load(i,j+2UL);
3843  xmm4 += x1 * A.load(i,j+3UL);
3844  xmm5 += x1 * A.load(i,j+4UL);
3845  xmm6 += x1 * A.load(i,j+5UL);
3846  xmm7 += x1 * A.load(i,j+6UL);
3847  xmm8 += x1 * A.load(i,j+7UL);
3848  }
3849 
3850  y[j ] += sum( xmm1 ) * scalar;
3851  y[j+1UL] += sum( xmm2 ) * scalar;
3852  y[j+2UL] += sum( xmm3 ) * scalar;
3853  y[j+3UL] += sum( xmm4 ) * scalar;
3854  y[j+4UL] += sum( xmm5 ) * scalar;
3855  y[j+5UL] += sum( xmm6 ) * scalar;
3856  y[j+6UL] += sum( xmm7 ) * scalar;
3857  y[j+7UL] += sum( xmm8 ) * scalar;
3858 
3859  for( ; remainder && i<iend; ++i ) {
3860  y[j ] += x[i] * A(i,j ) * scalar;
3861  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3862  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3863  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3864  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3865  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3866  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3867  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3868  }
3869  }
3870 
3871  for( ; (j+4UL) <= N; j+=4UL )
3872  {
3873  const size_t ibegin( ( IsLower_v<MT1> )
3874  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3875  :( 0UL ) );
3876  const size_t iend( ( IsUpper_v<MT1> )
3877  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
3878  :( M ) );
3879  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3880 
3881  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3882  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3883 
3884  SIMDType xmm1, xmm2, xmm3, xmm4;
3885  size_t i( ibegin );
3886 
3887  for( ; i<ipos; i+=SIMDSIZE ) {
3888  const SIMDType x1( x.load(i) );
3889  xmm1 += x1 * A.load(i,j );
3890  xmm2 += x1 * A.load(i,j+1UL);
3891  xmm3 += x1 * A.load(i,j+2UL);
3892  xmm4 += x1 * A.load(i,j+3UL);
3893  }
3894 
3895  y[j ] += sum( xmm1 ) * scalar;
3896  y[j+1UL] += sum( xmm2 ) * scalar;
3897  y[j+2UL] += sum( xmm3 ) * scalar;
3898  y[j+3UL] += sum( xmm4 ) * scalar;
3899 
3900  for( ; remainder && i<iend; ++i ) {
3901  y[j ] += x[i] * A(i,j ) * scalar;
3902  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3903  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3904  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3905  }
3906  }
3907 
3908  for( ; (j+3UL) <= N; j+=3UL )
3909  {
3910  const size_t ibegin( ( IsLower_v<MT1> )
3911  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3912  :( 0UL ) );
3913  const size_t iend( ( IsUpper_v<MT1> )
3914  ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
3915  :( M ) );
3916  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3917 
3918  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3919  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3920 
3921  SIMDType xmm1, xmm2, xmm3;
3922  size_t i( ibegin );
3923 
3924  for( ; i<ipos; i+=SIMDSIZE ) {
3925  const SIMDType x1( x.load(i) );
3926  xmm1 += x1 * A.load(i,j );
3927  xmm2 += x1 * A.load(i,j+1UL);
3928  xmm3 += x1 * A.load(i,j+2UL);
3929  }
3930 
3931  y[j ] += sum( xmm1 ) * scalar;
3932  y[j+1UL] += sum( xmm2 ) * scalar;
3933  y[j+2UL] += sum( xmm3 ) * scalar;
3934 
3935  for( ; remainder && i<iend; ++i ) {
3936  y[j ] += x[i] * A(i,j ) * scalar;
3937  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3938  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3939  }
3940  }
3941 
3942  for( ; (j+2UL) <= N; j+=2UL )
3943  {
3944  const size_t ibegin( ( IsLower_v<MT1> )
3945  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3946  :( 0UL ) );
3947  const size_t iend( ( IsUpper_v<MT1> )
3948  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
3949  :( M ) );
3950  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3951 
3952  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3953  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3954 
3955  SIMDType xmm1, xmm2;
3956  size_t i( ibegin );
3957 
3958  for( ; i<ipos; i+=SIMDSIZE ) {
3959  const SIMDType x1( x.load(i) );
3960  xmm1 += x1 * A.load(i,j );
3961  xmm2 += x1 * A.load(i,j+1UL);
3962  }
3963 
3964  y[j ] += sum( xmm1 ) * scalar;
3965  y[j+1UL] += sum( xmm2 ) * scalar;
3966 
3967  for( ; remainder && i<iend; ++i ) {
3968  y[j ] += x[i] * A(i,j ) * scalar;
3969  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3970  }
3971  }
3972 
3973  if( j < N )
3974  {
3975  const size_t ibegin( ( IsLower_v<MT1> )
3976  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
3977  :( 0UL ) );
3978  const size_t iend( ( IsUpper_v<MT1> )
3979  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
3980  :( M ) );
3981  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3982 
3983  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3984  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3985 
3986  SIMDType xmm1;
3987  size_t i( ibegin );
3988 
3989  for( ; i<ipos; i+=SIMDSIZE ) {
3990  xmm1 += A.load(i,j) * x.load(i);
3991  }
3992 
3993  y[j] += sum( xmm1 ) * scalar;
3994 
3995  for( ; remainder && i<iend; ++i ) {
3996  y[j] += x[i] * A(i,j) * scalar;
3997  }
3998  }
3999  }
4000  //**********************************************************************************************
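 // Note: the vectorized small-matrix kernel above traverses the columns of A in blocks of
 // 8, 4, 3, 2, and finally 1 column. For each block it accumulates SIMD partial sums
 // (xmm1, xmm2, ...) over the rows, reduces them once via sum(), scales the reduced values,
 // and treats any unpadded tail rows in a scalar remainder loop.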
4001 
4002  //**Default addition assignment to dense vectors (large matrices)*******************************
4016  template< typename VT1 // Type of the left-hand side target vector
4017  , typename VT2 // Type of the left-hand side vector operand
4018  , typename MT1 // Type of the right-hand side matrix operand
4019  , typename ST2 > // Type of the scalar value
4020  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4021  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4022  {
4023  selectDefaultAddAssignKernel( y, x, A, scalar );
4024  }
4025  //**********************************************************************************************
4026 
4027  //**Vectorized default addition assignment to dense vectors (large matrices)********************
4042  template< typename VT1 // Type of the left-hand side target vector
4043  , typename VT2 // Type of the left-hand side vector operand
4044  , typename MT1 // Type of the right-hand side matrix operand
4045  , typename ST2 > // Type of the scalar value
4046  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4047  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4048  {
4049  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
4050 
4051  const size_t M( A.rows() );
4052  const size_t N( A.columns() );
4053 
4054  size_t j( 0UL );
4055 
4056  for( ; (j+8UL) <= N; j+=8UL )
4057  {
4058  const size_t ibegin( ( IsLower_v<MT1> )
4059  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4060  :( 0UL ) );
4061  const size_t iend( ( IsUpper_v<MT1> )
4062  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
4063  :( M ) );
4064  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4065 
4066  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4067  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4068 
4069  size_t i( ibegin );
4070 
4071  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4072  const size_t i1( i+SIMDSIZE );
4073  const size_t i2( i+SIMDSIZE*2UL );
4074  const size_t i3( i+SIMDSIZE*3UL );
4075  const SIMDType x1( x.load(i ) );
4076  const SIMDType x2( x.load(i1) );
4077  const SIMDType x3( x.load(i2) );
4078  const SIMDType x4( x.load(i3) );
4079  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4080  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4081  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4082  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4083  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4084  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4085  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4086  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4087  }
4088 
4089  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4090  const size_t i1( i+SIMDSIZE );
4091  const SIMDType x1( x.load(i ) );
4092  const SIMDType x2( x.load(i1) );
4093  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4094  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4095  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4096  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4097  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4098  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4099  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4100  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4101  }
4102 
4103  for( ; i<ipos; i+=SIMDSIZE ) {
4104  const SIMDType x1( x.load(i) );
4105  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4106  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4107  y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4108  y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4109  y[j+4UL] += sum( x1 * A.load(i,j+4UL) ) * scalar;
4110  y[j+5UL] += sum( x1 * A.load(i,j+5UL) ) * scalar;
4111  y[j+6UL] += sum( x1 * A.load(i,j+6UL) ) * scalar;
4112  y[j+7UL] += sum( x1 * A.load(i,j+7UL) ) * scalar;
4113  }
4114 
4115  for( ; remainder && i<iend; ++i ) {
4116  y[j ] += x[i] * A(i,j ) * scalar;
4117  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4118  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4119  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4120  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4121  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4122  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4123  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4124  }
4125  }
4126 
4127  for( ; (j+4UL) <= N; j+=4UL )
4128  {
4129  const size_t ibegin( ( IsLower_v<MT1> )
4130  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4131  :( 0UL ) );
4132  const size_t iend( ( IsUpper_v<MT1> )
4133  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
4134  :( M ) );
4135  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4136 
4137  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4138  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4139 
4140  size_t i( ibegin );
4141 
4142  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4143  const size_t i1( i+SIMDSIZE );
4144  const size_t i2( i+SIMDSIZE*2UL );
4145  const size_t i3( i+SIMDSIZE*3UL );
4146  const SIMDType x1( x.load(i ) );
4147  const SIMDType x2( x.load(i1) );
4148  const SIMDType x3( x.load(i2) );
4149  const SIMDType x4( x.load(i3) );
4150  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4151  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4152  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4153  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4154  }
4155 
4156  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4157  const size_t i1( i+SIMDSIZE );
4158  const SIMDType x1( x.load(i ) );
4159  const SIMDType x2( x.load(i1) );
4160  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4161  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4162  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4163  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4164  }
4165 
4166  for( ; i<ipos; i+=SIMDSIZE ) {
4167  const SIMDType x1( x.load(i) );
4168  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4169  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4170  y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4171  y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4172  }
4173 
4174  for( ; remainder && i<iend; ++i ) {
4175  y[j ] += x[i] * A(i,j ) * scalar;
4176  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4177  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4178  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4179  }
4180  }
4181 
4182  for( ; (j+2UL) <= N; j+=2UL )
4183  {
4184  const size_t ibegin( ( IsLower_v<MT1> )
4185  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4186  :( 0UL ) );
4187  const size_t iend( ( IsUpper_v<MT1> )
4188  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
4189  :( M ) );
4190  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4191 
4192  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4193  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4194 
4195  size_t i( ibegin );
4196 
4197  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4198  const size_t i1( i+SIMDSIZE );
4199  const size_t i2( i+SIMDSIZE*2UL );
4200  const size_t i3( i+SIMDSIZE*3UL );
4201  const SIMDType x1( x.load(i ) );
4202  const SIMDType x2( x.load(i1) );
4203  const SIMDType x3( x.load(i2) );
4204  const SIMDType x4( x.load(i3) );
4205  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4206  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4207  }
4208 
4209  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4210  const size_t i1( i+SIMDSIZE );
4211  const SIMDType x1( x.load(i ) );
4212  const SIMDType x2( x.load(i1) );
4213  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4214  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4215  }
4216 
4217  for( ; i<ipos; i+=SIMDSIZE ) {
4218  const SIMDType x1( x.load(i) );
4219  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4220  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4221  }
4222 
4223  for( ; remainder && i<iend; ++i ) {
4224  y[j ] += x[i] * A(i,j ) * scalar;
4225  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4226  }
4227  }
4228 
4229  if( j < N )
4230  {
4231  const size_t ibegin( ( IsLower_v<MT1> )
4232  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4233  :( 0UL ) );
4234  const size_t iend( ( IsUpper_v<MT1> )
4235  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
4236  :( M ) );
4237  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4238 
4239  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4240  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4241 
4242  size_t i( ibegin );
4243 
4244  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4245  const size_t i1( i+SIMDSIZE );
4246  const size_t i2( i+SIMDSIZE*2UL );
4247  const size_t i3( i+SIMDSIZE*3UL );
4248  const SIMDType x1( x.load(i ) );
4249  const SIMDType x2( x.load(i1) );
4250  const SIMDType x3( x.load(i2) );
4251  const SIMDType x4( x.load(i3) );
4252  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4253  }
4254 
4255  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4256  const size_t i1( i+SIMDSIZE );
4257  const SIMDType x1( x.load(i ) );
4258  const SIMDType x2( x.load(i1) );
4259  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4260  }
4261 
4262  for( ; i<ipos; i+=SIMDSIZE ) {
4263  const SIMDType x1( x.load(i) );
4264  y[j] += sum( x1 * A.load(i,j) ) * scalar;
4265  }
4266 
4267  for( ; remainder && i<iend; ++i ) {
4268  y[j] += x[i] * A(i,j) * scalar;
4269  }
4270  }
4271  }
4272  //**********************************************************************************************
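 // Note: in contrast to the small-matrix kernel, the large-matrix kernel above additionally
 // unrolls the row loop by four, two, and one SIMD vector per step and reduces each partial
 // product directly into y[j]. This keeps the number of live SIMD registers small for wide
 // matrices; unpadded tail rows are again handled in a scalar remainder loop.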
4273 
4274  //**BLAS-based addition assignment to dense vectors (default)***********************************
4289  template< typename VT1 // Type of the left-hand side target vector
4290  , typename VT2 // Type of the left-hand side vector operand
4291  , typename MT1 // Type of the right-hand side matrix operand
4292  , typename ST2 > // Type of the scalar value
4293  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4294  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4295  {
4296  selectLargeAddAssignKernel( y, x, A, scalar );
4297  }
4298  //**********************************************************************************************
4299 
4300  //**BLAS-based addition assignment to dense vectors*********************************************
4301 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4302 
4315  template< typename VT1 // Type of the left-hand side target vector
4316  , typename VT2 // Type of the left-hand side vector operand
4317  , typename MT1 // Type of the right-hand side matrix operand
4318  , typename ST2 > // Type of the scalar value
4319  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4320  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4321  {
4322  using ET = ElementType_t<VT1>;
4323 
4324  if( IsTriangular_v<MT1> ) {
4325  ResultType_t<VT1> tmp( serial( scalar * x ) );
4326  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4327  addAssign( y, tmp );
4328  }
4329  else {
4330  gemv( y, x, A, ET(scalar), ET(1) );
4331  }
4332  }
4333 #endif
4334  //**********************************************************************************************
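 // Note: for non-triangular matrices the BLAS kernel above maps the operation onto a general
 // matrix/vector multiplication of the form
 //
 //    y := alpha*( x * A ) + beta*y   with alpha = scalar and beta = 1,
 //
 // while triangular matrices are handled via a temporary that is scaled, updated in place by
 // trmv(), and added to y afterwards.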
4335 
4336  //**Addition assignment to sparse vectors*******************************************************
4337  // No special implementation for the addition assignment to sparse vectors.
4338  //**********************************************************************************************
4339 
4340  //**Subtraction assignment to dense vectors*****************************************************
4352  template< typename VT1 // Type of the target dense vector
4353  , bool TF > // Transpose flag of the target dense vector
4354  friend inline void subAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
4355  {
4357 
4358  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4359 
4360  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4361  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4362 
4363  if( right.rows() == 0UL || right.columns() == 0UL ) {
4364  return;
4365  }
4366 
4367  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
4368  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4369 
4370  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4371  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4372  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4373  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4374 
4375  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
4376  }
4377  //**********************************************************************************************
4378 
4379  //**Subtraction assignment to dense vectors (kernel selection)**********************************
4390  template< typename VT1 // Type of the left-hand side target vector
4391  , typename VT2 // Type of the left-hand side vector operand
4392  , typename MT1 // Type of the right-hand side matrix operand
4393  , typename ST2 > // Type of the scalar value
4394  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4395  {
4396  if( ( IsDiagonal_v<MT1> ) ||
4397  ( IsComputation_v<MT> && !evaluateMatrix ) ||
4398  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
4399  selectSmallSubAssignKernel( y, x, A, scalar );
4400  else
4401  selectBlasSubAssignKernel( y, x, A, scalar );
4402  }
4403  //**********************************************************************************************
4404 
4405  //**Default subtraction assignment to dense vectors*********************************************
4419  template< typename VT1 // Type of the left-hand side target vector
4420  , typename VT2 // Type of the left-hand side vector operand
4421  , typename MT1 // Type of the right-hand side matrix operand
4422  , typename ST2 > // Type of the scalar value
4423  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4424  {
4425  y.subAssign( x * A * scalar );
4426  }
4427  //**********************************************************************************************
4428 
4429  //**Default subtraction assignment to dense vectors (small matrices)****************************
4443  template< typename VT1 // Type of the left-hand side target vector
4444  , typename VT2 // Type of the left-hand side vector operand
4445  , typename MT1 // Type of the right-hand side matrix operand
4446  , typename ST2 > // Type of the scalar value
4447  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4448  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4449  {
4450  selectDefaultSubAssignKernel( y, x, A, scalar );
4451  }
4452  //**********************************************************************************************
4453 
4454  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
4469  template< typename VT1 // Type of the left-hand side target vector
4470  , typename VT2 // Type of the left-hand side vector operand
4471  , typename MT1 // Type of the right-hand side matrix operand
4472  , typename ST2 > // Type of the scalar value
4473  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4474  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4475  {
4476  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
4477 
4478  const size_t M( A.rows() );
4479  const size_t N( A.columns() );
4480 
4481  size_t j( 0UL );
4482 
4483  for( ; (j+8UL) <= N; j+=8UL )
4484  {
4485  const size_t ibegin( ( IsLower_v<MT1> )
4486  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4487  :( 0UL ) );
4488  const size_t iend( ( IsUpper_v<MT1> )
4489  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
4490  :( M ) );
4491  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4492 
4493  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4494  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4495 
4496  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4497  size_t i( ibegin );
4498 
4499  for( ; i<ipos; i+=SIMDSIZE ) {
4500  const SIMDType x1( x.load(i) );
4501  xmm1 += x1 * A.load(i,j );
4502  xmm2 += x1 * A.load(i,j+1UL);
4503  xmm3 += x1 * A.load(i,j+2UL);
4504  xmm4 += x1 * A.load(i,j+3UL);
4505  xmm5 += x1 * A.load(i,j+4UL);
4506  xmm6 += x1 * A.load(i,j+5UL);
4507  xmm7 += x1 * A.load(i,j+6UL);
4508  xmm8 += x1 * A.load(i,j+7UL);
4509  }
4510 
4511  y[j ] -= sum( xmm1 ) * scalar;
4512  y[j+1UL] -= sum( xmm2 ) * scalar;
4513  y[j+2UL] -= sum( xmm3 ) * scalar;
4514  y[j+3UL] -= sum( xmm4 ) * scalar;
4515  y[j+4UL] -= sum( xmm5 ) * scalar;
4516  y[j+5UL] -= sum( xmm6 ) * scalar;
4517  y[j+6UL] -= sum( xmm7 ) * scalar;
4518  y[j+7UL] -= sum( xmm8 ) * scalar;
4519 
4520  for( ; remainder && i<iend; ++i ) {
4521  y[j ] -= x[i] * A(i,j ) * scalar;
4522  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4523  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4524  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4525  y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4526  y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4527  y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4528  y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4529  }
4530  }
4531 
4532  for( ; (j+4UL) <= N; j+=4UL )
4533  {
4534  const size_t ibegin( ( IsLower_v<MT1> )
4535  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4536  :( 0UL ) );
4537  const size_t iend( ( IsUpper_v<MT1> )
4538  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
4539  :( M ) );
4540  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4541 
4542  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4543  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4544 
4545  SIMDType xmm1, xmm2, xmm3, xmm4;
4546  size_t i( ibegin );
4547 
4548  for( ; i<ipos; i+=SIMDSIZE ) {
4549  const SIMDType x1( x.load(i) );
4550  xmm1 += x1 * A.load(i,j );
4551  xmm2 += x1 * A.load(i,j+1UL);
4552  xmm3 += x1 * A.load(i,j+2UL);
4553  xmm4 += x1 * A.load(i,j+3UL);
4554  }
4555 
4556  y[j ] -= sum( xmm1 ) * scalar;
4557  y[j+1UL] -= sum( xmm2 ) * scalar;
4558  y[j+2UL] -= sum( xmm3 ) * scalar;
4559  y[j+3UL] -= sum( xmm4 ) * scalar;
4560 
4561  for( ; remainder && i<iend; ++i ) {
4562  y[j ] -= x[i] * A(i,j ) * scalar;
4563  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4564  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4565  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4566  }
4567  }
4568 
4569  for( ; (j+3UL) <= N; j+=3UL )
4570  {
4571  const size_t ibegin( ( IsLower_v<MT1> )
4572  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4573  :( 0UL ) );
4574  const size_t iend( ( IsUpper_v<MT1> )
4575  ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
4576  :( M ) );
4577  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4578 
4579  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4580  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4581 
4582  SIMDType xmm1, xmm2, xmm3;
4583  size_t i( ibegin );
4584 
4585  for( ; i<ipos; i+=SIMDSIZE ) {
4586  const SIMDType x1( x.load(i) );
4587  xmm1 += x1 * A.load(i,j );
4588  xmm2 += x1 * A.load(i,j+1UL);
4589  xmm3 += x1 * A.load(i,j+2UL);
4590  }
4591 
4592  y[j ] -= sum( xmm1 ) * scalar;
4593  y[j+1UL] -= sum( xmm2 ) * scalar;
4594  y[j+2UL] -= sum( xmm3 ) * scalar;
4595 
4596  for( ; remainder && i<iend; ++i ) {
4597  y[j ] -= x[i] * A(i,j ) * scalar;
4598  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4599  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4600  }
4601  }
4602 
4603  for( ; (j+2UL) <= N; j+=2UL )
4604  {
4605  const size_t ibegin( ( IsLower_v<MT1> )
4606  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4607  :( 0UL ) );
4608  const size_t iend( ( IsUpper_v<MT1> )
4609  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
4610  :( M ) );
4611  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4612 
4613  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4614  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4615 
4616  SIMDType xmm1, xmm2;
4617  size_t i( ibegin );
4618 
4619  for( ; i<ipos; i+=SIMDSIZE ) {
4620  const SIMDType x1( x.load(i) );
4621  xmm1 += x1 * A.load(i,j );
4622  xmm2 += x1 * A.load(i,j+1UL);
4623  }
4624 
4625  y[j ] -= sum( xmm1 ) * scalar;
4626  y[j+1UL] -= sum( xmm2 ) * scalar;
4627 
4628  for( ; remainder && i<iend; ++i ) {
4629  y[j ] -= x[i] * A(i,j ) * scalar;
4630  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4631  }
4632  }
4633 
4634  if( j < N )
4635  {
4636  const size_t ibegin( ( IsLower_v<MT1> )
4637  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4638  :( 0UL ) );
4639  const size_t iend( ( IsUpper_v<MT1> )
4640  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
4641  :( M ) );
4642  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4643 
4644  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4645  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4646 
4647  SIMDType xmm1;
4648  size_t i( ibegin );
4649 
4650  for( ; i<ipos; i+=SIMDSIZE ) {
4651  xmm1 += A.load(i,j) * x.load(i);
4652  }
4653 
4654  y[j] -= sum( xmm1 ) * scalar;
4655 
4656  for( ; remainder && i<iend; ++i ) {
4657  y[j] -= x[i] * A(i,j) * scalar;
4658  }
4659  }
4660  }
4661  //**********************************************************************************************
4662 
4663  //**Default subtraction assignment to dense vectors (large matrices)****************************
4677  template< typename VT1 // Type of the left-hand side target vector
4678  , typename VT2 // Type of the left-hand side vector operand
4679  , typename MT1 // Type of the right-hand side matrix operand
4680  , typename ST2 > // Type of the scalar value
4681  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4682  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4683  {
4684  selectDefaultSubAssignKernel( y, x, A, scalar );
4685  }
4686  //**********************************************************************************************
4687 
4688  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
4703  template< typename VT1 // Type of the left-hand side target vector
4704  , typename VT2 // Type of the left-hand side vector operand
4705  , typename MT1 // Type of the right-hand side matrix operand
4706  , typename ST2 > // Type of the scalar value
4707  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4708  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4709  {
4710  constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
4711 
4712  const size_t M( A.rows() );
4713  const size_t N( A.columns() );
4714 
4715  size_t j( 0UL );
4716 
4717  for( ; (j+8UL) <= N; j+=8UL )
4718  {
4719  const size_t ibegin( ( IsLower_v<MT1> )
4720  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4721  :( 0UL ) );
4722  const size_t iend( ( IsUpper_v<MT1> )
4723  ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
4724  :( M ) );
4725  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4726 
4727  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4728  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4729 
4730  size_t i( ibegin );
4731 
4732  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4733  const size_t i1( i+SIMDSIZE );
4734  const size_t i2( i+SIMDSIZE*2UL );
4735  const size_t i3( i+SIMDSIZE*3UL );
4736  const SIMDType x1( x.load(i ) );
4737  const SIMDType x2( x.load(i1) );
4738  const SIMDType x3( x.load(i2) );
4739  const SIMDType x4( x.load(i3) );
4740  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4741  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4742  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4743  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4744  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4745  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4746  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4747  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4748  }
4749 
4750  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4751  const size_t i1( i+SIMDSIZE );
4752  const SIMDType x1( x.load(i ) );
4753  const SIMDType x2( x.load(i1) );
4754  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4755  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4756  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4757  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4758  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4759  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4760  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4761  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4762  }
4763 
4764  for( ; i<ipos; i+=SIMDSIZE ) {
4765  const SIMDType x1( x.load(i) );
4766  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4767  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4768  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
4769  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
4770  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) ) * scalar;
4771  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) ) * scalar;
4772  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) ) * scalar;
4773  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) ) * scalar;
4774  }
4775 
4776  for( ; remainder && i<iend; ++i ) {
4777  y[j ] -= x[i] * A(i,j ) * scalar;
4778  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4779  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4780  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4781  y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4782  y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4783  y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4784  y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4785  }
4786  }
4787 
4788  for( ; (j+4UL) <= N; j+=4UL )
4789  {
4790  const size_t ibegin( ( IsLower_v<MT1> )
4791  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4792  :( 0UL ) );
4793  const size_t iend( ( IsUpper_v<MT1> )
4794  ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
4795  :( M ) );
4796  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4797 
4798  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4799  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4800 
4801  size_t i( ibegin );
4802 
4803  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4804  const size_t i1( i+SIMDSIZE );
4805  const size_t i2( i+SIMDSIZE*2UL );
4806  const size_t i3( i+SIMDSIZE*3UL );
4807  const SIMDType x1( x.load(i ) );
4808  const SIMDType x2( x.load(i1) );
4809  const SIMDType x3( x.load(i2) );
4810  const SIMDType x4( x.load(i3) );
4811  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4812  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4813  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4814  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4815  }
4816 
4817  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4818  const size_t i1( i+SIMDSIZE );
4819  const SIMDType x1( x.load(i ) );
4820  const SIMDType x2( x.load(i1) );
4821  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4822  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4823  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4824  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4825  }
4826 
4827  for( ; i<ipos; i+=SIMDSIZE ) {
4828  const SIMDType x1( x.load(i) );
4829  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4830  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4831  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
4832  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
4833  }
4834 
4835  for( ; remainder && i<iend; ++i ) {
4836  y[j ] -= x[i] * A(i,j ) * scalar;
4837  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4838  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4839  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4840  }
4841  }
4842 
4843  for( ; (j+2UL) <= N; j+=2UL )
4844  {
4845  const size_t ibegin( ( IsLower_v<MT1> )
4846  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4847  :( 0UL ) );
4848  const size_t iend( ( IsUpper_v<MT1> )
4849  ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
4850  :( M ) );
4851  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4852 
4853  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4854  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4855 
4856  size_t i( ibegin );
4857 
4858  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4859  const size_t i1( i+SIMDSIZE );
4860  const size_t i2( i+SIMDSIZE*2UL );
4861  const size_t i3( i+SIMDSIZE*3UL );
4862  const SIMDType x1( x.load(i ) );
4863  const SIMDType x2( x.load(i1) );
4864  const SIMDType x3( x.load(i2) );
4865  const SIMDType x4( x.load(i3) );
4866  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4867  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4868  }
4869 
4870  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4871  const size_t i1( i+SIMDSIZE );
4872  const SIMDType x1( x.load(i ) );
4873  const SIMDType x2( x.load(i1) );
4874  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4875  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4876  }
4877 
4878  for( ; i<ipos; i+=SIMDSIZE ) {
4879  const SIMDType x1( x.load(i) );
4880  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4881  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4882  }
4883 
4884  for( ; remainder && i<iend; ++i ) {
4885  y[j ] -= x[i] * A(i,j ) * scalar;
4886  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4887  }
4888  }
4889 
4890  if( j < N )
4891  {
4892  const size_t ibegin( ( IsLower_v<MT1> )
4893  ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
4894  :( 0UL ) );
4895  const size_t iend( ( IsUpper_v<MT1> )
4896  ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
4897  :( M ) );
4898  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4899 
4900  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4901  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4902 
4903  size_t i( ibegin );
4904 
4905  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4906  const size_t i1( i+SIMDSIZE );
4907  const size_t i2( i+SIMDSIZE*2UL );
4908  const size_t i3( i+SIMDSIZE*3UL );
4909  const SIMDType x1( x.load(i ) );
4910  const SIMDType x2( x.load(i1) );
4911  const SIMDType x3( x.load(i2) );
4912  const SIMDType x4( x.load(i3) );
4913  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4914  }
4915 
4916  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4917  const size_t i1( i+SIMDSIZE );
4918  const SIMDType x1( x.load(i ) );
4919  const SIMDType x2( x.load(i1) );
4920  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4921  }
4922 
4923  for( ; i<ipos; i+=SIMDSIZE ) {
4924  const SIMDType x1( x.load(i) );
4925  y[j] -= sum( x1 * A.load(i,j) ) * scalar;
4926  }
4927 
4928  for( ; remainder && i<iend; ++i ) {
4929  y[j] -= x[i] * A(i,j) * scalar;
4930  }
4931  }
4932  }
4933  //**********************************************************************************************
4934 
4935  //**BLAS-based subtraction assignment to dense vectors (default)********************************
4950  template< typename VT1 // Type of the left-hand side target vector
4951  , typename VT2 // Type of the left-hand side vector operand
4952  , typename MT1 // Type of the right-hand side matrix operand
4953  , typename ST2 > // Type of the scalar value
4954  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4955  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4956  {
4957  selectLargeSubAssignKernel( y, x, A, scalar );
4958  }
4959  //**********************************************************************************************
4960 
4961  //**BLAS-based subtraction assignment to dense vectors******************************************
4962 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4963 
4976  template< typename VT1 // Type of the left-hand side target vector
4977  , typename VT2 // Type of the left-hand side vector operand
4978  , typename MT1 // Type of the right-hand side matrix operand
4979  , typename ST2 > // Type of the scalar value
4980  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4981  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4982  {
4983  using ET = ElementType_t<VT1>;
4984 
4985  if( IsTriangular_v<MT1> ) {
4986  ResultType_t<VT1> tmp( serial( scalar * x ) );
4987  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4988  subAssign( y, tmp );
4989  }
4990  else {
4991  gemv( y, x, A, ET(-scalar), ET(1) );
4992  }
4993  }
4994 #endif
4995  //**********************************************************************************************
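 // Note: the subtraction is realized by the same gemv() call as the addition kernel, but with
 // a negated scaling factor, i.e. y := (-scalar)*( x * A ) + y; in the triangular case the
 // trmv() temporary is subtracted from y instead of being added.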
4996 
4997  //**Subtraction assignment to sparse vectors****************************************************
4998  // No special implementation for the subtraction assignment to sparse vectors.
4999  //**********************************************************************************************
5000 
5001  //**Multiplication assignment to dense vectors**************************************************
5013  template< typename VT1 // Type of the target dense vector
5014  , bool TF > // Transpose flag of the target dense vector
5015  friend inline void multAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5016  {
5018 
5022 
5023  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5024 
5025  const ResultType tmp( serial( rhs ) );
5026  multAssign( ~lhs, tmp );
5027  }
5028  //**********************************************************************************************
5029 
5030  //**Multiplication assignment to sparse vectors*************************************************
5031  // No special implementation for the multiplication assignment to sparse vectors.
5032  //**********************************************************************************************
5033 
5034  //**Division assignment to dense vectors********************************************************
5046  template< typename VT1 // Type of the target dense vector
5047  , bool TF > // Transpose flag of the target dense vector
5048  friend inline void divAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5049  {
5051 
5055 
5056  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5057 
5058  const ResultType tmp( serial( rhs ) );
5059  divAssign( ~lhs, tmp );
5060  }
5061  //**********************************************************************************************
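 // Note: both the multiplication and the division assignment are performed on a serially
 // evaluated dense temporary of the product; the temporary ensures that the element-wise
 // update of the target never reads partially updated values of the target itself.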
5062 
5063  //**Division assignment to sparse vectors*******************************************************
5064  // No special implementation for the division assignment to sparse vectors.
5065  //**********************************************************************************************
5066 
5067  //**SMP assignment to dense vectors*************************************************************
5081  template< typename VT1 // Type of the target dense vector
5082  , bool TF > // Transpose flag of the target dense vector
5083  friend inline auto smpAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5084  -> EnableIf_t< UseSMPAssign_v<VT1> >
5085  {
5087 
5088  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5089 
5090  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5091  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5092 
5093  if( right.rows() == 0UL ) {
5094  reset( ~lhs );
5095  return;
5096  }
5097  else if( right.columns() == 0UL ) {
5098  return;
5099  }
5100 
5101  LT x( left ); // Evaluation of the left-hand side dense vector operand
5102  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5103 
5104  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5105  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5106  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5107  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5108 
5109  smpAssign( ~lhs, x * A * rhs.scalar_ );
5110  }
5111  //**********************************************************************************************
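 // Note: the SMP assignment above does not call the serial kernels directly; it evaluates the
 // operands and re-emits the expression as x * A * rhs.scalar_, so that the resulting
 // assignment is picked up by Blaze's shared-memory-parallel assignment machinery and split
 // across the available threads.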
5112 
5113  //**SMP assignment to sparse vectors************************************************************
5127  template< typename VT1 // Type of the target sparse vector
5128  , bool TF > // Transpose flag of the target sparse vector
5129  friend inline auto smpAssign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5130  -> EnableIf_t< UseSMPAssign_v<VT1> >
5131  {
5133 
5137 
5138  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5139 
5140  const ResultType tmp( rhs );
5141  smpAssign( ~lhs, tmp );
5142  }
5143  //**********************************************************************************************
5144 
5145  //**SMP addition assignment to dense vectors****************************************************
5159  template< typename VT1 // Type of the target dense vector
5160  , bool TF > // Transpose flag of the target dense vector
5161  friend inline auto smpAddAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5162  -> EnableIf_t< UseSMPAssign_v<VT1> >
5163  {
5165 
5166  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5167 
5168  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5169  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5170 
5171  if( right.rows() == 0UL || right.columns() == 0UL ) {
5172  return;
5173  }
5174 
5175  LT x( left ); // Evaluation of the left-hand side dense vector operand
5176  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5177 
5178  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5179  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5180  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5181  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5182 
5183  smpAddAssign( ~lhs, x * A * rhs.scalar_ );
5184  }
5185  //**********************************************************************************************
5186 
5187  //**SMP addition assignment to sparse vectors***************************************************
5188  // No special implementation for the SMP addition assignment to sparse vectors.
5189  //**********************************************************************************************
5190 
5191  //**SMP subtraction assignment to dense vectors*************************************************
5205  template< typename VT1 // Type of the target dense vector
5206  , bool TF > // Transpose flag of the target dense vector
5207  friend inline auto smpSubAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5208  -> EnableIf_t< UseSMPAssign_v<VT1> >
5209  {
5210  BLAZE_FUNCTION_TRACE;
5211 
5212  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5213 
5214  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5215  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5216 
5217  if( right.rows() == 0UL || right.columns() == 0UL ) {
5218  return;
5219  }
5220 
5221  LT x( left ); // Evaluation of the left-hand side dense vector operand
5222  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5223 
5224  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5225  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5226  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5227  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5228 
5229  smpSubAssign( ~lhs, x * A * rhs.scalar_ );
5230  }
5231  //**********************************************************************************************
5232 
5233  //**SMP subtraction assignment to sparse vectors************************************************
5234  // No special implementation for the SMP subtraction assignment to sparse vectors.
5235  //**********************************************************************************************
5236 
5237  //**SMP multiplication assignment to dense vectors**********************************************
5251  template< typename VT1 // Type of the target dense vector
5252  , bool TF > // Transpose flag of the target dense vector
5253  friend inline auto smpMultAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5254  -> EnableIf_t< UseSMPAssign_v<VT1> >
5255  {
5256  BLAZE_FUNCTION_TRACE;
5257 
5258  BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE( ResultType );
5259  BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE( ResultType );
5260  BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION( ResultType );
5261 
5262  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5263 
5264  const ResultType tmp( rhs );
5265  smpMultAssign( ~lhs, tmp );
5266  }
5267  //**********************************************************************************************
5268 
5269  //**SMP multiplication assignment to sparse vectors*********************************************
5270  // No special implementation for the SMP multiplication assignment to sparse vectors.
5271  //**********************************************************************************************
5272 
5273  //**SMP division assignment to dense vectors****************************************************
5287  template< typename VT1 // Type of the target dense vector
5288  , bool TF > // Transpose flag of the target dense vector
5289  friend inline auto smpDivAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5290  -> EnableIf_t< UseSMPAssign_v<VT1> >
5291  {
5292  BLAZE_FUNCTION_TRACE;
5293 
5294  BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE( ResultType );
5295  BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE( ResultType );
5296  BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION( ResultType );
5297 
5298  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5299 
5300  const ResultType tmp( rhs );
5301  smpDivAssign( ~lhs, tmp );
5302  }
5303  //**********************************************************************************************
5304 
5305  //**SMP division assignment to sparse vectors***************************************************
5306  // No special implementation for the SMP division assignment to sparse vectors.
5307  //**********************************************************************************************
5308 
5309  //**Compile time checks*************************************************************************
5318  //**********************************************************************************************
5319 };
5321 //*************************************************************************************************
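//
// The SMP assignment overloads above all follow the same pattern: check the target vector size,
// return early for empty operands, evaluate the vector and matrix operands serially, and finally
// hand the scaled multiplication expression (e.g. x * A * rhs.scalar_) to the SMP-capable
// assignment kernels. The following is a minimal usage sketch that may exercise these kernels;
// it assumes Blaze was configured with shared-memory parallelization (e.g. OpenMP or C++11
// threads) and uses the public blaze::setNumThreads() API:
//
//    blaze::setNumThreads( 4UL );  // number of threads available for SMP assignments
//
//    blaze::DynamicVector<double,blaze::rowVector>   x( 2000UL, 1.0 );          // transpose dense vector
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 2000UL, 2000UL, 2.0 );  // column-major dense matrix
//    blaze::DynamicVector<double,blaze::rowVector>   y;
//
//    y = 3.0 * ( x * A );  // scalar-scaled vector/matrix multiplication; for sufficiently large
//                          // operands the assignment may be dispatched to the SMP kernels above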
5322 
5323 
5324 
5325 
5326 //=================================================================================================
5327 //
5328 // GLOBAL BINARY ARITHMETIC OPERATORS
5329 //
5330 //=================================================================================================
5331 
5332 //*************************************************************************************************
5363 template< typename VT // Type of the left-hand side dense vector
5364  , typename MT > // Type of the right-hand side dense matrix
5365 inline decltype(auto)
5366  operator*( const DenseVector<VT,true>& vec, const DenseMatrix<MT,true>& mat )
5367 {
5368  BLAZE_FUNCTION_TRACE;
5369 
5370  BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE( MT );
5371 
5372  if( (~vec).size() != (~mat).rows() ) {
5373  BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
5374  }
5375 
5376  using ReturnType = const TDVecTDMatMultExpr<VT,MT>;
5377  return ReturnType( ~vec, ~mat );
5378 }
5379 //*************************************************************************************************
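//
// Usage sketch for the operator above (illustrative; assumes Blaze's DynamicVector and
// DynamicMatrix class templates): multiplying a transpose (row) dense vector with a column-major
// dense matrix yields a TDVecTDMatMultExpr, which is evaluated only when assigned to a target:
//
//    blaze::DynamicVector<double,blaze::rowVector>   x{ 1.0, 2.0, 3.0 };
//    blaze::DynamicMatrix<double,blaze::columnMajor> A{ { 1.0, 0.0 },
//                                                       { 0.0, 1.0 },
//                                                       { 2.0, 2.0 } };
//
//    blaze::DynamicVector<double,blaze::rowVector> y( x * A );  // y = ( 7, 8 )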
5380 
5381 
5382 
5383 
5384 //=================================================================================================
5385 //
5386 // ISALIGNED SPECIALIZATIONS
5387 //
5388 //=================================================================================================
5389 
5390 //*************************************************************************************************
5392 template< typename VT, typename MT >
5393 struct IsAligned< TDVecTDMatMultExpr<VT,MT> >
5394  : public BoolConstant< IsAligned_v<VT> && IsAligned_v<MT> >
5395 {};
5397 //*************************************************************************************************
5398 
5399 } // namespace blaze
5400 
5401 #endif