Blaze  3.6
TDVecDMatMultExpr.h
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/Exception.h>
60 #include <blaze/math/shims/Reset.h>
62 #include <blaze/math/SIMD.h>
82 #include <blaze/math/views/Check.h>
83 #include <blaze/system/BLAS.h>
88 #include <blaze/util/Assert.h>
89 #include <blaze/util/Complex.h>
91 #include <blaze/util/DisableIf.h>
92 #include <blaze/util/EnableIf.h>
95 #include <blaze/util/mpl/If.h>
96 #include <blaze/util/Types.h>
104 
105 
106 namespace blaze {
107 
108 //=================================================================================================
109 //
110 // CLASS TDVECDMATMULTEXPR
111 //
112 //=================================================================================================
113 
114 //*************************************************************************************************
121 template< typename VT // Type of the left-hand side dense vector
122  , typename MT > // Type of the right-hand side dense matrix
123 class TDVecDMatMultExpr
124  : public TVecMatMultExpr< DenseVector< TDVecDMatMultExpr<VT,MT>, true > >
125  , private Computation
126 {
127  private:
128  //**Type definitions****************************************************************************
135  //**********************************************************************************************
136 
137  //**********************************************************************************************
139  static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
140  //**********************************************************************************************
141 
142  //**********************************************************************************************
144  static constexpr bool evaluateMatrix =
145  ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
146  IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
147  //**********************************************************************************************
148 
149  //**********************************************************************************************
151 
155  template< typename T1 >
156  static constexpr bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
158  //**********************************************************************************************
159 
160  //**********************************************************************************************
162 
165  template< typename T1, typename T2, typename T3 >
166  static constexpr bool UseBlasKernel_v =
167  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION &&
168  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
169  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
170  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
171  !IsDiagonal_v<T3> &&
172  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
173  IsBLASCompatible_v< ElementType_t<T1> > &&
174  IsBLASCompatible_v< ElementType_t<T2> > &&
175  IsBLASCompatible_v< ElementType_t<T3> > &&
176  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
177  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
179  //**********************************************************************************************
180 
181  //**********************************************************************************************
183 
187  template< typename T1, typename T2, typename T3 >
188  static constexpr bool UseVectorizedDefaultKernel_v =
189  ( useOptimizedKernels &&
190  !IsDiagonal_v<T3> &&
191  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
192  IsSIMDCombinable_v< ElementType_t<T1>
193  , ElementType_t<T2>
194  , ElementType_t<T3> > &&
195  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
196  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
198  //**********************************************************************************************
199 
200  public:
201  //**Type definitions****************************************************************************
208  using ReturnType = const ElementType;
209  using CompositeType = const ResultType;
210 
212  using LeftOperand = If_t< IsExpression_v<VT>, const VT, const VT& >;
213 
215  using RightOperand = If_t< IsExpression_v<MT>, const MT, const MT& >;
216 
219 
222  //**********************************************************************************************
223 
224  //**Compilation flags***************************************************************************
226  static constexpr bool simdEnabled =
227  ( !IsDiagonal_v<MT> &&
228  VT::simdEnabled && MT::simdEnabled &&
229  HasSIMDAdd_v<VET,MET> &&
230  HasSIMDMult_v<VET,MET> );
231 
233  static constexpr bool smpAssignable =
234  ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
235  //**********************************************************************************************
236 
237  //**SIMD properties*****************************************************************************
239  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
240  //**********************************************************************************************
241 
242  //**Constructor*********************************************************************************
248  explicit inline TDVecDMatMultExpr( const VT& vec, const MT& mat ) noexcept
249  : vec_( vec ) // Left-hand side dense vector of the multiplication expression
250  , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
251  {
252  BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
253  }
254  //**********************************************************************************************
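 // A minimal usage sketch (assuming the standard Blaze dynamic containers; the concrete types
 // are illustrative only): a TDVecDMatMultExpr is created whenever a transpose (row) dense
 // vector is multiplied with a row-major dense matrix and is evaluated on assignment.
 //
 //    #include <blaze/Math.h>
 //
 //    blaze::DynamicVector<double,blaze::rowVector> x{ 1.0, 2.0, 3.0 };
 //    blaze::DynamicMatrix<double,blaze::rowMajor>  A( 3UL, 2UL, 1.0 );
 //    blaze::DynamicVector<double,blaze::rowVector> y;
 //
 //    y = x * A;  // y[j] == sum_i x[i] * A(i,j)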
255 
256  //**Subscript operator**************************************************************************
262  inline ReturnType operator[]( size_t index ) const {
263  BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
264 
265  if( IsDiagonal_v<MT> )
266  {
267  return vec_[index] * mat_(index,index);
268  }
269  else if( IsLower_v<MT> && ( index > 8UL ) )
270  {
271  const size_t begin( IsStrictlyLower_v<MT> ? index+1UL : index );
272  const size_t n ( mat_.rows() - begin );
273  return subvector( vec_, begin, n, unchecked ) *
274  subvector( column( mat_, index, unchecked ), begin, n, unchecked );
275  }
276  else if( IsUpper_v<MT> && ( index + 8UL < mat_.rows() ) )
277  {
278  const size_t n( IsStrictlyUpper_v<MT> ? index : index+1UL );
279  return subvector( vec_, 0UL, n, unchecked ) *
280  subvector( column( mat_, index, unchecked ), 0UL, n, unchecked );
281  }
282  else
283  {
284  return vec_ * column( mat_, index, unchecked );
285  }
286  }
287  //**********************************************************************************************
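 // The subscript operator hence evaluates a single element of the product as the dot product
 // of the vector operand with the index-th column of the matrix operand:
 //
 //    y[index] = sum_{i=ibegin}^{iend-1} vec_[i] * mat_(i,index)
 //
 // For lower/upper triangular matrices the summation range is restricted to the structurally
 // non-zero part of the column; the shortcut is only taken if more than roughly eight
 // elements can be skipped (as encoded in the branches above).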
288 
289  //**At function*********************************************************************************
296  inline ReturnType at( size_t index ) const {
297  if( index >= mat_.columns() ) {
298  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
299  }
300  return (*this)[index];
301  }
302  //**********************************************************************************************
303 
304  //**Size function*******************************************************************************
309  inline size_t size() const noexcept {
310  return mat_.columns();
311  }
312  //**********************************************************************************************
313 
314  //**Left operand access*************************************************************************
319  inline LeftOperand leftOperand() const noexcept {
320  return vec_;
321  }
322  //**********************************************************************************************
323 
324  //**Right operand access************************************************************************
329  inline RightOperand rightOperand() const noexcept {
330  return mat_;
331  }
332  //**********************************************************************************************
333 
334  //**********************************************************************************************
340  template< typename T >
341  inline bool canAlias( const T* alias ) const noexcept {
342  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
343  }
344  //**********************************************************************************************
345 
346  //**********************************************************************************************
352  template< typename T >
353  inline bool isAliased( const T* alias ) const noexcept {
354  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
355  }
356  //**********************************************************************************************
357 
358  //**********************************************************************************************
363  inline bool isAligned() const noexcept {
364  return vec_.isAligned() && mat_.isAligned();
365  }
366  //**********************************************************************************************
367 
368  //**********************************************************************************************
373  inline bool canSMPAssign() const noexcept {
374  return ( !BLAZE_BLAS_MODE ||
377  ( IsComputation_v<MT> && !evaluateMatrix ) ||
378  ( mat_.rows() * mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
379  ( size() > SMP_TDVECDMATMULT_THRESHOLD );
380  }
381  //**********************************************************************************************
382 
383  private:
384  //**Member variables****************************************************************************
387  //**********************************************************************************************
388 
389  //**Assignment to dense vectors*****************************************************************
402  template< typename VT1 > // Type of the target dense vector
403  friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
404  {
406 
407  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
408 
409  if( rhs.mat_.rows() == 0UL ) {
410  reset( ~lhs );
411  return;
412  }
413  else if( rhs.mat_.columns() == 0UL ) {
414  return;
415  }
416 
417  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
418  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
419 
420  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
421  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
422  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
423  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
424 
425  TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
426  }
428  //**********************************************************************************************
429 
430  //**Assignment to dense vectors (kernel selection)**********************************************
441  template< typename VT1 // Type of the left-hand side target vector
442  , typename VT2 // Type of the left-hand side vector operand
443  , typename MT1 > // Type of the right-hand side matrix operand
444  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
445  {
446  if( ( IsDiagonal_v<MT1> ) ||
447  ( IsComputation_v<MT> && !evaluateMatrix ) ||
448  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
449  selectSmallAssignKernel( y, x, A );
450  else
451  selectBlasAssignKernel( y, x, A );
452  }
454  //**********************************************************************************************
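 // Kernel dispatch: the small (vectorized) kernel is used for diagonal matrices, for matrix
 // operands that are unevaluated computation expressions, and whenever A.rows()*A.columns()
 // falls below TDVECDMATMULT_THRESHOLD; all other cases are forwarded to the BLAS-based
 // kernel, which in turn falls back to the large default kernel if no BLAS routine applies.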
455 
456  //**Default assignment to dense vectors*********************************************************
470  template< typename VT1 // Type of the left-hand side target vector
471  , typename VT2 // Type of the left-hand side vector operand
472  , typename MT1 > // Type of the right-hand side matrix operand
473  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
474  {
475  const size_t M( A.rows() );
476  const size_t N( A.columns() );
477 
478  if( IsStrictlyUpper_v<MT1> ) {
479  reset( y[0] );
480  }
481 
482  if( !IsLower_v<MT1> )
483  {
484  const size_t jbegin( IsStrictlyUpper_v<MT1> ? 1UL : 0UL );
485  for( size_t j=jbegin; j<N; ++j ) {
486  y[j] = x[0UL] * A(0UL,j);
487  }
488  }
489 
490  for( size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
491  {
492  if( IsDiagonal_v<MT1> )
493  {
494  y[i] = x[i] * A(i,i);
495  }
496  else
497  {
498  const size_t jbegin( ( IsUpper_v<MT1> )
499  ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
500  :( 0UL ) );
501  const size_t jend( ( IsLower_v<MT1> )
502  ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
503  :( N ) );
504  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
505 
506  const size_t jnum( jend - jbegin );
507  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
508 
509  for( size_t j=jbegin; j<jpos; j+=2UL ) {
510  y[j ] += x[i] * A(i,j );
511  y[j+1UL] += x[i] * A(i,j+1UL);
512  }
513  if( jpos < jend ) {
514  y[jpos] += x[i] * A(i,jpos);
515  }
516  if( IsLower_v<MT1> ) {
517  y[jend] = x[i] * A(i,jend);
518  }
519  }
520  }
521 
522  if( IsStrictlyLower_v<MT1> ) {
523  reset( y[N-1UL] );
524  }
525  }
527  //**********************************************************************************************
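 // Reference semantics of the default kernel, as a simplified sketch that ignores the
 // triangular bounds and the 2-way unrolling (y, x and A denote the parameters above):
 //
 //    for( size_t j=0UL; j<A.columns(); ++j )
 //       y[j] = x[0UL] * A(0UL,j);
 //    for( size_t i=1UL; i<A.rows(); ++i )
 //       for( size_t j=0UL; j<A.columns(); ++j )
 //          y[j] += x[i] * A(i,j);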
528 
529  //**Default assignment to dense vectors (small matrices)****************************************
543  template< typename VT1 // Type of the left-hand side target vector
544  , typename VT2 // Type of the left-hand side vector operand
545  , typename MT1 > // Type of the right-hand side matrix operand
546  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
547  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
548  {
549  selectDefaultAssignKernel( y, x, A );
550  }
552  //**********************************************************************************************
553 
554  //**Vectorized default assignment to dense vectors (small matrices)*****************************
568  template< typename VT1 // Type of the left-hand side target vector
569  , typename VT2 // Type of the left-hand side vector operand
570  , typename MT1 > // Type of the right-hand side matrix operand
571  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
572  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
573  {
574  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
575 
576  const size_t M( A.rows() );
577  const size_t N( A.columns() );
578 
579  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
580  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
581 
582  size_t j( 0UL );
583 
584  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
585  {
586  const size_t ibegin( ( IsLower_v<MT1> )
587  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
588  :( 0UL ) );
589  const size_t iend( ( IsUpper_v<MT1> )
590  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
591  :( M ) );
592  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
593 
594  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
595 
596  for( size_t i=ibegin; i<iend; ++i ) {
597  const SIMDType x1( set( x[i] ) );
598  xmm1 += x1 * A.load(i,j );
599  xmm2 += x1 * A.load(i,j+SIMDSIZE );
600  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
601  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
602  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
603  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
604  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
605  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
606  }
607 
608  y.store( j , xmm1 );
609  y.store( j+SIMDSIZE , xmm2 );
610  y.store( j+SIMDSIZE*2UL, xmm3 );
611  y.store( j+SIMDSIZE*3UL, xmm4 );
612  y.store( j+SIMDSIZE*4UL, xmm5 );
613  y.store( j+SIMDSIZE*5UL, xmm6 );
614  y.store( j+SIMDSIZE*6UL, xmm7 );
615  y.store( j+SIMDSIZE*7UL, xmm8 );
616  }
617 
618  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
619  {
620  const size_t ibegin( ( IsLower_v<MT1> )
621  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
622  :( 0UL ) );
623  const size_t iend( ( IsUpper_v<MT1> )
624  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
625  :( M ) );
626  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
627 
628  SIMDType xmm1, xmm2, xmm3, xmm4;
629 
630  for( size_t i=ibegin; i<iend; ++i ) {
631  const SIMDType x1( set( x[i] ) );
632  xmm1 += x1 * A.load(i,j );
633  xmm2 += x1 * A.load(i,j+SIMDSIZE );
634  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
635  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
636  }
637 
638  y.store( j , xmm1 );
639  y.store( j+SIMDSIZE , xmm2 );
640  y.store( j+SIMDSIZE*2UL, xmm3 );
641  y.store( j+SIMDSIZE*3UL, xmm4 );
642  }
643 
644  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
645  {
646  const size_t ibegin( ( IsLower_v<MT1> )
647  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
648  :( 0UL ) );
649  const size_t iend( ( IsUpper_v<MT1> )
650  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
651  :( M ) );
652  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
653 
654  SIMDType xmm1, xmm2, xmm3;
655 
656  for( size_t i=ibegin; i<iend; ++i ) {
657  const SIMDType x1( set( x[i] ) );
658  xmm1 += x1 * A.load(i,j );
659  xmm2 += x1 * A.load(i,j+SIMDSIZE );
660  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
661  }
662 
663  y.store( j , xmm1 );
664  y.store( j+SIMDSIZE , xmm2 );
665  y.store( j+SIMDSIZE*2UL, xmm3 );
666  }
667 
668  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
669  {
670  const size_t ibegin( ( IsLower_v<MT1> )
671  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
672  :( 0UL ) );
673  const size_t iend( ( IsUpper_v<MT1> )
674  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
675  :( M ) );
676  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
677 
678  SIMDType xmm1, xmm2;
679 
680  for( size_t i=ibegin; i<iend; ++i ) {
681  const SIMDType x1( set( x[i] ) );
682  xmm1 += x1 * A.load(i,j );
683  xmm2 += x1 * A.load(i,j+SIMDSIZE);
684  }
685 
686  y.store( j , xmm1 );
687  y.store( j+SIMDSIZE, xmm2 );
688  }
689 
690  for( ; j<jpos; j+=SIMDSIZE )
691  {
692  const size_t ibegin( ( IsLower_v<MT1> )
693  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
694  :( 0UL ) );
695  const size_t iend( ( IsUpper_v<MT1> )
696  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
697  :( M ) );
698  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
699 
700  SIMDType xmm1;
701 
702  for( size_t i=ibegin; i<iend; ++i ) {
703  xmm1 += set( x[i] ) * A.load(i,j);
704  }
705 
706  y.store( j, xmm1 );
707  }
708 
709  for( ; remainder && j<N; ++j )
710  {
711  const size_t ibegin( ( IsLower_v<MT1> )
712  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
713  :( 0UL ) );
714  const size_t iend( ( IsUpper_v<MT1> )
715  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
716  :( M ) );
717  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
718 
719  ElementType value{};
720 
721  for( size_t i=ibegin; i<iend; ++i ) {
722  value += x[i] * A(i,j);
723  }
724 
725  y[j] = value;
726  }
727  }
729  //**********************************************************************************************
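 // The vectorized small-matrix kernel traverses the columns of y in panels of 8, 4, 3, 2 and
 // finally 1 SIMD vectors: for every row i the scalar x[i] is broadcast via set() and
 // multiplied with consecutive SIMD loads from row i of A, accumulating into the per-panel
 // registers, e.g. for the first register of a panel starting at column j:
 //
 //    xmm1 = sum_i set( x[i] ) * A.load(i,j)   // covers y[j .. j+SIMDSIZE)
 //
 // The trailing scalar loop is only entered for unpadded operands (remainder == true).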
730 
731  //**Default assignment to dense vectors (large matrices)****************************************
745  template< typename VT1 // Type of the left-hand side target vector
746  , typename VT2 // Type of the left-hand side vector operand
747  , typename MT1 > // Type of the right-hand side matrix operand
748  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
749  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
750  {
751  selectDefaultAssignKernel( y, x, A );
752  }
754  //**********************************************************************************************
755 
756  //**Vectorized default assignment to dense vectors (large matrices)*****************************
770  template< typename VT1 // Type of the left-hand side target vector
771  , typename VT2 // Type of the left-hand side vector operand
772  , typename MT1 > // Type of the right-hand side matrix operand
773  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
774  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
775  {
776  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
777 
778  const size_t M( A.rows() );
779  const size_t N( A.columns() );
780 
781  const size_t jblock( 32768UL / sizeof( ElementType ) );
782  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
783 
784  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
785 
786  reset( y );
787 
788  for( size_t jj=0U; jj<N; jj+=jblock ) {
789  for( size_t ii=0UL; ii<M; ii+=iblock )
790  {
791  const size_t iend( min( ii+iblock, M ) );
792  const size_t jtmp( min( jj+jblock, N ) );
793  const size_t jend( ( IsLower_v<MT1> )
794  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
795  :( jtmp ) );
796 
797  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
798  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
799 
800  size_t j( ( IsUpper_v<MT1> )
801  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
802  :( jj ) );
803 
804  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
805  {
806  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
807 
808  for( size_t i=ii; i<iend; ++i ) {
809  const SIMDType x1( set( x[i] ) );
810  xmm1 += x1 * A.load(i,j );
811  xmm2 += x1 * A.load(i,j+SIMDSIZE );
812  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
813  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
814  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
815  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
816  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
817  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
818  }
819 
820  y.store( j , y.load(j ) + xmm1 );
821  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
822  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
823  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
824  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
825  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
826  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
827  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
828  }
829 
830  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
831  {
832  SIMDType xmm1, xmm2, xmm3, xmm4;
833 
834  for( size_t i=ii; i<iend; ++i ) {
835  const SIMDType x1( set( x[i] ) );
836  xmm1 += x1 * A.load(i,j );
837  xmm2 += x1 * A.load(i,j+SIMDSIZE );
838  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
839  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
840  }
841 
842  y.store( j , y.load(j ) + xmm1 );
843  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
844  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
845  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
846  }
847 
848  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
849  {
850  SIMDType xmm1, xmm2, xmm3;
851 
852  for( size_t i=ii; i<iend; ++i ) {
853  const SIMDType x1( set( x[i] ) );
854  xmm1 += x1 * A.load(i,j );
855  xmm2 += x1 * A.load(i,j+SIMDSIZE );
856  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
857  }
858 
859  y.store( j , y.load(j ) + xmm1 );
860  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
861  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
862  }
863 
864  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
865  {
866  SIMDType xmm1, xmm2;
867 
868  for( size_t i=ii; i<iend; ++i ) {
869  const SIMDType x1( set( x[i] ) );
870  xmm1 += x1 * A.load(i,j );
871  xmm2 += x1 * A.load(i,j+SIMDSIZE);
872  }
873 
874  y.store( j , y.load(j ) + xmm1 );
875  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
876  }
877 
878  for( ; j<jpos; j+=SIMDSIZE )
879  {
880  SIMDType xmm1;
881 
882  for( size_t i=ii; i<iend; ++i ) {
883  xmm1 += set( x[i] ) * A.load(i,j);
884  }
885 
886  y.store( j, y.load(j) + xmm1 );
887  }
888 
889  for( ; remainder && j<jend; ++j )
890  {
891  ElementType value{};
892 
893  for( size_t i=ii; i<iend; ++i ) {
894  value += x[i] * A(i,j);
895  }
896 
897  y[j] += value;
898  }
899  }
900  }
901  }
903  //**********************************************************************************************
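 // The large-matrix kernel adds cache blocking on top of the same SIMD panels: y is reset
 // once, the columns are processed in blocks of jblock = 32768/sizeof(ElementType) elements
 // (a 32 KiB strip of a matrix row) and the rows in blocks of 8 or 4, and every block adds
 // its partial sums into y via load/add/store. For double precision this amounts to
 // jblock == 4096 columns per block (an illustrative consequence, not a tuning guarantee).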
904 
905  //**BLAS-based assignment to dense vectors (default)********************************************
919  template< typename VT1 // Type of the left-hand side target vector
920  , typename VT2 // Type of the left-hand side vector operand
921  , typename MT1 > // Type of the right-hand side matrix operand
922  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
923  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
924  {
925  selectLargeAssignKernel( y, x, A );
926  }
928  //**********************************************************************************************
929 
930  //**BLAS-based assignment to dense vectors******************************************************
931 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
932 
945  template< typename VT1 // Type of the left-hand side target vector
946  , typename VT2 // Type of the left-hand side vector operand
947  , typename MT1 > // Type of the right-hand side matrix operand
948  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
949  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
950  {
951  using ET = ElementType_t<VT1>;
952 
953  if( IsTriangular_v<MT1> ) {
954  assign( y, x );
955  trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
956  }
957  else {
958  gemv( y, x, A, ET(1), ET(0) );
959  }
960  }
962 #endif
963  //**********************************************************************************************
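 // In the BLAS path a triangular matrix operand is handled by copying x into y followed by an
 // in-place triangular multiplication (trmv); the general case maps to a single gemv call with
 // alpha = ET(1) and beta = ET(0). For double/single precision operands this is typically
 // lowered to cblas_dgemv/cblas_sgemv by the wrappers (an assumption about the BLAS backend
 // in use, not something this header enforces).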
964 
965  //**Assignment to sparse vectors****************************************************************
978  template< typename VT1 > // Type of the target sparse vector
979  friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
980  {
982 
986 
987  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
988 
989  const ResultType tmp( serial( rhs ) );
990  assign( ~lhs, tmp );
991  }
993  //**********************************************************************************************
994 
995  //**Addition assignment to dense vectors********************************************************
1008  template< typename VT1 > // Type of the target dense vector
1009  friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
1010  {
1012 
1013  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1014 
1015  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1016  return;
1017  }
1018 
1019  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1020  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1021 
1022  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1023  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1024  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1025  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1026 
1027  TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1028  }
1030  //**********************************************************************************************
1031 
1032  //**Addition assignment to dense vectors (kernel selection)*************************************
1043  template< typename VT1 // Type of the left-hand side target vector
1044  , typename VT2 // Type of the left-hand side vector operand
1045  , typename MT1 > // Type of the right-hand side matrix operand
1046  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1047  {
1048  if( ( IsDiagonal_v<MT1> ) ||
1049  ( IsComputation_v<MT> && !evaluateMatrix ) ||
1050  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1051  selectSmallAddAssignKernel( y, x, A );
1052  else
1053  selectBlasAddAssignKernel( y, x, A );
1054  }
1056  //**********************************************************************************************
1057 
1058  //**Default addition assignment to dense vectors************************************************
1072  template< typename VT1 // Type of the left-hand side target vector
1073  , typename VT2 // Type of the left-hand side vector operand
1074  , typename MT1 > // Type of the right-hand side matrix operand
1075  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1076  {
1077  const size_t M( A.rows() );
1078  const size_t N( A.columns() );
1079 
1080  for( size_t i=0UL; i<M; ++i )
1081  {
1082  if( IsDiagonal_v<MT1> )
1083  {
1084  y[i] += x[i] * A(i,i);
1085  }
1086  else
1087  {
1088  const size_t jbegin( ( IsUpper_v<MT1> )
1089  ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1090  :( 0UL ) );
1091  const size_t jend( ( IsLower_v<MT1> )
1092  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1093  :( N ) );
1094  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1095 
1096  const size_t jnum( jend - jbegin );
1097  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1098 
1099  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1100  y[j ] += x[i] * A(i,j );
1101  y[j+1UL] += x[i] * A(i,j+1UL);
1102  }
1103  if( jpos < jend ) {
1104  y[jpos] += x[i] * A(i,jpos);
1105  }
1106  }
1107  }
1108  }
1110  //**********************************************************************************************
1111 
1112  //**Default addition assignment to dense vectors (small matrices)*******************************
1126  template< typename VT1 // Type of the left-hand side target vector
1127  , typename VT2 // Type of the left-hand side vector operand
1128  , typename MT1 > // Type of the right-hand side matrix operand
1129  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1130  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1131  {
1132  selectDefaultAddAssignKernel( y, x, A );
1133  }
1135  //**********************************************************************************************
1136 
1137  //**Vectorized default addition assignment to dense vectors (small matrices)********************
1151  template< typename VT1 // Type of the left-hand side target vector
1152  , typename VT2 // Type of the left-hand side vector operand
1153  , typename MT1 > // Type of the right-hand side matrix operand
1154  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1155  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1156  {
1157  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1158 
1159  const size_t M( A.rows() );
1160  const size_t N( A.columns() );
1161 
1162  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1163  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1164 
1165  size_t j( 0UL );
1166 
1167  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1168  {
1169  const size_t ibegin( ( IsLower_v<MT1> )
1170  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1171  :( 0UL ) );
1172  const size_t iend( ( IsUpper_v<MT1> )
1173  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1174  :( M ) );
1175  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1176 
1177  SIMDType xmm1( y.load(j ) );
1178  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1179  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1180  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1181  SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1182  SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1183  SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1184  SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1185 
1186  for( size_t i=ibegin; i<iend; ++i ) {
1187  const SIMDType x1( set( x[i] ) );
1188  xmm1 += x1 * A.load(i,j );
1189  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1190  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1191  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1192  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1193  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1194  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1195  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1196  }
1197 
1198  y.store( j , xmm1 );
1199  y.store( j+SIMDSIZE , xmm2 );
1200  y.store( j+SIMDSIZE*2UL, xmm3 );
1201  y.store( j+SIMDSIZE*3UL, xmm4 );
1202  y.store( j+SIMDSIZE*4UL, xmm5 );
1203  y.store( j+SIMDSIZE*5UL, xmm6 );
1204  y.store( j+SIMDSIZE*6UL, xmm7 );
1205  y.store( j+SIMDSIZE*7UL, xmm8 );
1206  }
1207 
1208  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1209  {
1210  const size_t ibegin( ( IsLower_v<MT1> )
1211  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1212  :( 0UL ) );
1213  const size_t iend( ( IsUpper_v<MT1> )
1214  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1215  :( M ) );
1216  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1217 
1218  SIMDType xmm1( y.load(j ) );
1219  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1220  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1221  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1222 
1223  for( size_t i=ibegin; i<iend; ++i ) {
1224  const SIMDType x1( set( x[i] ) );
1225  xmm1 += x1 * A.load(i,j );
1226  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1227  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1228  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1229  }
1230 
1231  y.store( j , xmm1 );
1232  y.store( j+SIMDSIZE , xmm2 );
1233  y.store( j+SIMDSIZE*2UL, xmm3 );
1234  y.store( j+SIMDSIZE*3UL, xmm4 );
1235  }
1236 
1237  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1238  {
1239  const size_t ibegin( ( IsLower_v<MT1> )
1240  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1241  :( 0UL ) );
1242  const size_t iend( ( IsUpper_v<MT1> )
1243  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1244  :( M ) );
1245  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1246 
1247  SIMDType xmm1( y.load(j ) );
1248  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1249  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1250 
1251  for( size_t i=ibegin; i<iend; ++i ) {
1252  const SIMDType x1( set( x[i] ) );
1253  xmm1 += x1 * A.load(i,j );
1254  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1255  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1256  }
1257 
1258  y.store( j , xmm1 );
1259  y.store( j+SIMDSIZE , xmm2 );
1260  y.store( j+SIMDSIZE*2UL, xmm3 );
1261  }
1262 
1263  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1264  {
1265  const size_t ibegin( ( IsLower_v<MT1> )
1266  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1267  :( 0UL ) );
1268  const size_t iend( ( IsUpper_v<MT1> )
1269  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1270  :( M ) );
1271  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1272 
1273  SIMDType xmm1( y.load(j ) );
1274  SIMDType xmm2( y.load(j+SIMDSIZE) );
1275 
1276  for( size_t i=ibegin; i<iend; ++i ) {
1277  const SIMDType x1( set( x[i] ) );
1278  xmm1 += x1 * A.load(i,j );
1279  xmm2 += x1 * A.load(i,j+SIMDSIZE);
1280  }
1281 
1282  y.store( j , xmm1 );
1283  y.store( j+SIMDSIZE, xmm2 );
1284  }
1285 
1286  for( ; j<jpos; j+=SIMDSIZE )
1287  {
1288  const size_t ibegin( ( IsLower_v<MT1> )
1289  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1290  :( 0UL ) );
1291  const size_t iend( ( IsUpper_v<MT1> )
1292  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1293  :( M ) );
1294  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1295 
1296  SIMDType xmm1( y.load(j) );
1297 
1298  for( size_t i=ibegin; i<iend; ++i ) {
1299  xmm1 += set( x[i] ) * A.load(i,j);
1300  }
1301 
1302  y.store( j, xmm1 );
1303  }
1304 
1305  for( ; remainder && j<N; ++j )
1306  {
1307  const size_t ibegin( ( IsLower_v<MT1> )
1308  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1309  :( 0UL ) );
1310  const size_t iend( ( IsUpper_v<MT1> )
1311  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1312  :( M ) );
1313  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1314 
1315  ElementType value{};
1316 
1317  for( size_t i=ibegin; i<iend; ++i ) {
1318  value += x[i] * A(i,j);
1319  }
1320 
1321  y[j] += value;
1322  }
1323  }
1325  //**********************************************************************************************
1326 
1327  //**Default addition assignment to dense vectors (large matrices)*******************************
1341  template< typename VT1 // Type of the left-hand side target vector
1342  , typename VT2 // Type of the left-hand side vector operand
1343  , typename MT1 > // Type of the right-hand side matrix operand
1344  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1345  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1346  {
1347  selectDefaultAddAssignKernel( y, x, A );
1348  }
1350  //**********************************************************************************************
1351 
1352  //**Vectorized default addition assignment to dense vectors (large matrices)********************
1366  template< typename VT1 // Type of the left-hand side target vector
1367  , typename VT2 // Type of the left-hand side vector operand
1368  , typename MT1 > // Type of the right-hand side matrix operand
1369  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1370  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1371  {
1372  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1373 
1374  const size_t M( A.rows() );
1375  const size_t N( A.columns() );
1376 
1377  const size_t jblock( 32768UL / sizeof( ElementType ) );
1378  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1379 
1380  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1381 
1382  for( size_t jj=0U; jj<N; jj+=jblock ) {
1383  for( size_t ii=0UL; ii<M; ii+=iblock )
1384  {
1385  const size_t iend( min( ii+iblock, M ) );
1386  const size_t jtmp( min( jj+jblock, N ) );
1387  const size_t jend( ( IsLower_v<MT1> )
1388  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
1389  :( jtmp ) );
1390 
1391  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1392  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1393 
1394  size_t j( ( IsUpper_v<MT1> )
1395  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
1396  :( jj ) );
1397 
1398  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1399  {
1400  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1401 
1402  for( size_t i=ii; i<iend; ++i ) {
1403  const SIMDType x1( set( x[i] ) );
1404  xmm1 += x1 * A.load(i,j );
1405  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1406  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1407  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1408  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1409  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1410  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1411  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1412  }
1413 
1414  y.store( j , y.load(j ) + xmm1 );
1415  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1416  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1417  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1418  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
1419  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
1420  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
1421  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
1422  }
1423 
1424  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1425  {
1426  SIMDType xmm1, xmm2, xmm3, xmm4;
1427 
1428  for( size_t i=ii; i<iend; ++i ) {
1429  const SIMDType x1( set( x[i] ) );
1430  xmm1 += x1 * A.load(i,j );
1431  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1432  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1433  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1434  }
1435 
1436  y.store( j , y.load(j ) + xmm1 );
1437  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1438  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1439  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1440  }
1441 
1442  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1443  {
1444  SIMDType xmm1, xmm2, xmm3;
1445 
1446  for( size_t i=ii; i<iend; ++i ) {
1447  const SIMDType x1( set( x[i] ) );
1448  xmm1 += x1 * A.load(i,j );
1449  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1450  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1451  }
1452 
1453  y.store( j , y.load(j ) + xmm1 );
1454  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1455  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1456  }
1457 
1458  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1459  {
1460  SIMDType xmm1, xmm2;
1461 
1462  for( size_t i=ii; i<iend; ++i ) {
1463  const SIMDType x1( set( x[i] ) );
1464  xmm1 += x1 * A.load(i,j );
1465  xmm2 += x1 * A.load(i,j+SIMDSIZE);
1466  }
1467 
1468  y.store( j , y.load(j ) + xmm1 );
1469  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
1470  }
1471 
1472  for( ; j<jpos; j+=SIMDSIZE )
1473  {
1474  SIMDType xmm1;
1475 
1476  for( size_t i=ii; i<iend; ++i ) {
1477  xmm1 += set( x[i] ) * A.load(i,j);
1478  }
1479 
1480  y.store( j, y.load(j) + xmm1 );
1481  }
1482 
1483  for( ; remainder && j<jend; ++j )
1484  {
1485  ElementType value{};
1486 
1487  for( size_t i=ii; i<iend; ++i ) {
1488  value += x[i] * A(i,j);
1489  }
1490 
1491  y[j] += value;
1492  }
1493  }
1494  }
1495  }
1497  //**********************************************************************************************
1498 
1499  //**BLAS-based addition assignment to dense vectors (default)***********************************
1513  template< typename VT1 // Type of the left-hand side target vector
1514  , typename VT2 // Type of the left-hand side vector operand
1515  , typename MT1 > // Type of the right-hand side matrix operand
1516  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1517  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1518  {
1519  selectLargeAddAssignKernel( y, x, A );
1520  }
1522  //**********************************************************************************************
1523 
1524  //**BLAS-based addition assignment to dense vectors*********************************************
1525 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1526 
1539  template< typename VT1 // Type of the left-hand side target vector
1540  , typename VT2 // Type of the left-hand side vector operand
1541  , typename MT1 > // Type of the right-hand side matrix operand
1542  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1543  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1544  {
1545  using ET = ElementType_t<VT1>;
1546 
1547  if( IsTriangular_v<MT1> ) {
1548  ResultType_t<VT1> tmp( serial( x ) );
1549  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1550  addAssign( y, tmp );
1551  }
1552  else {
1553  gemv( y, x, A, ET(1), ET(1) );
1554  }
1555  }
1557 #endif
1558  //**********************************************************************************************
1559 
1560  //**Addition assignment to sparse vectors*******************************************************
1561  // No special implementation for the addition assignment to sparse vectors.
1562  //**********************************************************************************************
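 // The addition assignment kernels mirror the plain assignment kernels above; the difference
 // is that the target elements are loaded first (or left untouched) and the products are
 // accumulated with += instead of overwriting y, e.g. as a scalar sketch:
 //
 //    for( size_t i=0UL; i<A.rows(); ++i )
 //       for( size_t j=0UL; j<A.columns(); ++j )
 //          y[j] += x[i] * A(i,j);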
1563 
1564  //**Subtraction assignment to dense vectors*****************************************************
1577  template< typename VT1 > // Type of the target dense vector
1578  friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
1579  {
1581 
1582  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1583 
1584  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1585  return;
1586  }
1587 
1588  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1589  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1590 
1591  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1592  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1593  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1594  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1595 
1596  TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1597  }
1599  //**********************************************************************************************
1600 
1601  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1612  template< typename VT1 // Type of the left-hand side target vector
1613  , typename VT2 // Type of the left-hand side vector operand
1614  , typename MT1 > // Type of the right-hand side matrix operand
1615  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1616  {
1617  if( ( IsDiagonal_v<MT1> ) ||
1618  ( IsComputation_v<MT> && !evaluateMatrix ) ||
1619  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1620  selectSmallSubAssignKernel( y, x, A );
1621  else
1622  selectBlasSubAssignKernel( y, x, A );
1623  }
1625  //**********************************************************************************************
1626 
1627  //**Default subtraction assignment to dense vectors*********************************************
1641  template< typename VT1 // Type of the left-hand side target vector
1642  , typename VT2 // Type of the left-hand side vector operand
1643  , typename MT1 > // Type of the right-hand side matrix operand
1644  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1645  {
1646  const size_t M( A.rows() );
1647  const size_t N( A.columns() );
1648 
1649  for( size_t i=0UL; i<M; ++i )
1650  {
1651  if( IsDiagonal_v<MT1> )
1652  {
1653  y[i] -= x[i] * A(i,i);
1654  }
1655  else
1656  {
1657  const size_t jbegin( ( IsUpper_v<MT1> )
1658  ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1659  :( 0UL ) );
1660  const size_t jend( ( IsLower_v<MT1> )
1661  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1662  :( N ) );
1663  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1664 
1665  const size_t jnum( jend - jbegin );
1666  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1667 
1668  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1669  y[j ] -= x[i] * A(i,j );
1670  y[j+1UL] -= x[i] * A(i,j+1UL);
1671  }
1672  if( jpos < jend ) {
1673  y[jpos] -= x[i] * A(i,jpos);
1674  }
1675  }
1676  }
1677  }
1679  //**********************************************************************************************
1680 
1681  //**Default subtraction assignment to dense vectors (small matrices)****************************
1695  template< typename VT1 // Type of the left-hand side target vector
1696  , typename VT2 // Type of the left-hand side vector operand
1697  , typename MT1 > // Type of the right-hand side matrix operand
1698  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1699  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1700  {
1701  selectDefaultSubAssignKernel( y, x, A );
1702  }
1704  //**********************************************************************************************
1705 
1706  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
1721  template< typename VT1 // Type of the left-hand side target vector
1722  , typename VT2 // Type of the left-hand side vector operand
1723  , typename MT1 > // Type of the right-hand side matrix operand
1724  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1725  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1726  {
1727  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1728 
1729  const size_t M( A.rows() );
1730  const size_t N( A.columns() );
1731 
1732  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1733  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1734 
1735  size_t j( 0UL );
1736 
1737  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1738  {
1739  const size_t ibegin( ( IsLower_v<MT1> )
1740  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1741  :( 0UL ) );
1742  const size_t iend( ( IsUpper_v<MT1> )
1743  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1744  :( M ) );
1745  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1746 
1747  SIMDType xmm1( y.load(j ) );
1748  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1749  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1750  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1751  SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1752  SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1753  SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1754  SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1755 
1756  for( size_t i=ibegin; i<iend; ++i ) {
1757  const SIMDType x1( set( x[i] ) );
1758  xmm1 -= x1 * A.load(i,j );
1759  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1760  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1761  xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1762  xmm5 -= x1 * A.load(i,j+SIMDSIZE*4UL);
1763  xmm6 -= x1 * A.load(i,j+SIMDSIZE*5UL);
1764  xmm7 -= x1 * A.load(i,j+SIMDSIZE*6UL);
1765  xmm8 -= x1 * A.load(i,j+SIMDSIZE*7UL);
1766  }
1767 
1768  y.store( j , xmm1 );
1769  y.store( j+SIMDSIZE , xmm2 );
1770  y.store( j+SIMDSIZE*2UL, xmm3 );
1771  y.store( j+SIMDSIZE*3UL, xmm4 );
1772  y.store( j+SIMDSIZE*4UL, xmm5 );
1773  y.store( j+SIMDSIZE*5UL, xmm6 );
1774  y.store( j+SIMDSIZE*6UL, xmm7 );
1775  y.store( j+SIMDSIZE*7UL, xmm8 );
1776  }
1777 
1778  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1779  {
1780  const size_t ibegin( ( IsLower_v<MT1> )
1781  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1782  :( 0UL ) );
1783  const size_t iend( ( IsUpper_v<MT1> )
1784  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1785  :( M ) );
1786  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1787 
1788  SIMDType xmm1( y.load(j ) );
1789  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1790  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1791  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1792 
1793  for( size_t i=ibegin; i<iend; ++i ) {
1794  const SIMDType x1( set( x[i] ) );
1795  xmm1 -= x1 * A.load(i,j );
1796  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1797  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1798  xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1799  }
1800 
1801  y.store( j , xmm1 );
1802  y.store( j+SIMDSIZE , xmm2 );
1803  y.store( j+SIMDSIZE*2UL, xmm3 );
1804  y.store( j+SIMDSIZE*3UL, xmm4 );
1805  }
1806 
1807  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1808  {
1809  const size_t ibegin( ( IsLower_v<MT1> )
1810  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1811  :( 0UL ) );
1812  const size_t iend( ( IsUpper_v<MT1> )
1813  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1814  :( M ) );
1815  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1816 
1817  SIMDType xmm1( y.load(j ) );
1818  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1819  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1820 
1821  for( size_t i=ibegin; i<iend; ++i ) {
1822  const SIMDType x1( set( x[i] ) );
1823  xmm1 -= x1 * A.load(i,j );
1824  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1825  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1826  }
1827 
1828  y.store( j , xmm1 );
1829  y.store( j+SIMDSIZE , xmm2 );
1830  y.store( j+SIMDSIZE*2UL, xmm3 );
1831  }
1832 
1833  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1834  {
1835  const size_t ibegin( ( IsLower_v<MT1> )
1836  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1837  :( 0UL ) );
1838  const size_t iend( ( IsUpper_v<MT1> )
1839  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1840  :( M ) );
1841  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1842 
1843  SIMDType xmm1( y.load(j ) );
1844  SIMDType xmm2( y.load(j+SIMDSIZE) );
1845 
1846  for( size_t i=ibegin; i<iend; ++i ) {
1847  const SIMDType x1( set( x[i] ) );
1848  xmm1 -= x1 * A.load(i,j );
1849  xmm2 -= x1 * A.load(i,j+SIMDSIZE);
1850  }
1851 
1852  y.store( j , xmm1 );
1853  y.store( j+SIMDSIZE, xmm2 );
1854  }
1855 
1856  for( ; j<jpos; j+=SIMDSIZE )
1857  {
1858  const size_t ibegin( ( IsLower_v<MT1> )
1859  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1860  :( 0UL ) );
1861  const size_t iend( ( IsUpper_v<MT1> )
1862  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1863  :( M ) );
1864  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1865 
1866  SIMDType xmm1( y.load(j) );
1867 
1868  for( size_t i=ibegin; i<iend; ++i ) {
1869  xmm1 -= set( x[i] ) * A.load(i,j);
1870  }
1871 
1872  y.store( j, xmm1 );
1873  }
1874 
1875  for( ; remainder && j<N; ++j )
1876  {
1877  const size_t ibegin( ( IsLower_v<MT1> )
1878  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1879  :( 0UL ) );
1880  const size_t iend( ( IsUpper_v<MT1> )
1881  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1882  :( M ) );
1883  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1884 
1885  ElementType value{};
1886 
1887  for( size_t i=ibegin; i<iend; ++i ) {
1888  value += x[i] * A(i,j);
1889  }
1890 
1891  y[j] -= value;
1892  }
1893  }
1895  //**********************************************************************************************
1896 
1897  //**Default subtraction assignment to dense vectors (large matrices)****************************
1911  template< typename VT1 // Type of the left-hand side target vector
1912  , typename VT2 // Type of the left-hand side vector operand
1913  , typename MT1 > // Type of the right-hand side matrix operand
1914  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1915  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1916  {
1917  selectDefaultSubAssignKernel( y, x, A );
1918  }
1920  //**********************************************************************************************
1921 
1922  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
1937  template< typename VT1 // Type of the left-hand side target vector
1938  , typename VT2 // Type of the left-hand side vector operand
1939  , typename MT1 > // Type of the right-hand side matrix operand
1940  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1941  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1942  {
1943  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1944 
1945  const size_t M( A.rows() );
1946  const size_t N( A.columns() );
1947 
1948  const size_t jblock( 32768UL / sizeof( ElementType ) );
1949  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1950 
1951  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1952 
1953  for( size_t jj=0U; jj<N; jj+=jblock ) {
1954  for( size_t ii=0UL; ii<M; ii+=iblock )
1955  {
1956  const size_t iend( min( ii+iblock, M ) );
1957  const size_t jtmp( min( jj+jblock, N ) );
1958  const size_t jend( ( IsLower_v<MT1> )
1959  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
1960  :( jtmp ) );
1961 
1962  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1963  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1964 
1965  size_t j( ( IsUpper_v<MT1> )
1966  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
1967  :( jj ) );
1968 
1969  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1970  {
1971  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1972 
1973  for( size_t i=ii; i<iend; ++i ) {
1974  const SIMDType x1( set( x[i] ) );
1975  xmm1 += x1 * A.load(i,j );
1976  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1977  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1978  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1979  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1980  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1981  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1982  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1983  }
1984 
1985  y.store( j , y.load(j ) - xmm1 );
1986  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
1987  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
1988  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
1989  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5 );
1990  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6 );
1991  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7 );
1992  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8 );
1993  }
1994 
1995  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1996  {
1997  SIMDType xmm1, xmm2, xmm3, xmm4;
1998 
1999  for( size_t i=ii; i<iend; ++i ) {
2000  const SIMDType x1( set( x[i] ) );
2001  xmm1 += x1 * A.load(i,j );
2002  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2003  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2004  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2005  }
2006 
2007  y.store( j , y.load(j ) - xmm1 );
2008  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2009  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2010  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
2011  }
2012 
2013  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2014  {
2015  SIMDType xmm1, xmm2, xmm3;
2016 
2017  for( size_t i=ii; i<iend; ++i ) {
2018  const SIMDType x1( set( x[i] ) );
2019  xmm1 += x1 * A.load(i,j );
2020  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2021  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2022  }
2023 
2024  y.store( j , y.load(j ) - xmm1 );
2025  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2026  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2027  }
2028 
2029  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2030  {
2031  SIMDType xmm1, xmm2;
2032 
2033  for( size_t i=ii; i<iend; ++i ) {
2034  const SIMDType x1( set( x[i] ) );
2035  xmm1 += x1 * A.load(i,j );
2036  xmm2 += x1 * A.load(i,j+SIMDSIZE);
2037  }
2038 
2039  y.store( j , y.load(j ) - xmm1 );
2040  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2 );
2041  }
2042 
2043  for( ; j<jpos; j+=SIMDSIZE )
2044  {
2045  SIMDType xmm1;
2046 
2047  for( size_t i=ii; i<iend; ++i ) {
2048  xmm1 += set( x[i] ) * A.load(i,j);
2049  }
2050 
2051  y.store( j, y.load(j) - xmm1 );
2052  }
2053 
2054  for( ; remainder && j<jend; ++j )
2055  {
2056  ElementType value{};
2057 
2058  for( size_t i=ii; i<iend; ++i ) {
2059  value += x[i] * A(i,j);
2060  }
2061 
2062  y[j] -= value;
2063  }
2064  }
2065  }
2066  }
2068  //**********************************************************************************************
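   // Note (illustrative, not part of the original source): the blocked kernel above keeps the
   // traversed stripe of A and y cache resident. With jblock = 32768UL / sizeof( ElementType ) a
   // double-precision element type yields panels of 4096 columns, and iblock selects 8 rows per
   // panel for narrow matrices ( N < jblock ) and 4 rows otherwise. Ignoring the SIMD unrolling
   // and the triangular-matrix index adjustments, the traversal is equivalent to:
   //
   //    for( size_t jj=0UL; jj<N; jj+=jblock )
   //       for( size_t ii=0UL; ii<M; ii+=iblock )
   //          for( size_t j=jj; j<min(jj+jblock,N); ++j )
   //             for( size_t i=ii; i<min(ii+iblock,M); ++i )
   //                y[j] -= x[i] * A(i,j);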
2069 
2070  //**BLAS-based subtraction assignment to dense vectors (default)********************************
2084  template< typename VT1 // Type of the left-hand side target vector
2085  , typename VT2 // Type of the left-hand side vector operand
2086  , typename MT1 > // Type of the right-hand side matrix operand
2087  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2088  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2089  {
2090  selectLargeSubAssignKernel( y, x, A );
2091  }
2093  //**********************************************************************************************
2094 
2095  //**BLAS-based subtraction assignment to dense vectors******************************************
2096 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2097 
2110  template< typename VT1 // Type of the left-hand side target vector
2111  , typename VT2 // Type of the left-hand side vector operand
2112  , typename MT1 > // Type of the right-hand side matrix operand
2113  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2114  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2115  {
2116  using ET = ElementType_t<VT1>;
2117 
2118  if( IsTriangular_v<MT1> ) {
2119  ResultType_t<VT1> tmp( serial( x ) );
2120  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2121  subAssign( y, tmp );
2122  }
2123  else {
2124  gemv( y, x, A, ET(-1), ET(1) );
2125  }
2126  }
2128 #endif
2129  //**********************************************************************************************
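   // Note (illustrative): in the general branch the call gemv( y, x, A, ET(-1), ET(1) ) performs
   // the BLAS update y = (-1)*( x * A ) + (1)*y, i.e. the complete subtraction assignment in a
   // single xGEMV call. Since trmv() overwrites its vector argument with the product, the
   // triangular branch first copies x into the temporary tmp and subtracts the result from y
   // afterwards.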
2130 
2131  //**Subtraction assignment to sparse vectors****************************************************
2132  // No special implementation for the subtraction assignment to sparse vectors.
2133  //**********************************************************************************************
2134 
2135  //**Multiplication assignment to dense vectors**************************************************
2148  template< typename VT1 > // Type of the target dense vector
2149  friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2150  {
2152 
2156 
2157  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2158 
2159  const ResultType tmp( serial( rhs ) );
2160  multAssign( ~lhs, tmp );
2161  }
2163  //**********************************************************************************************
2164 
2165  //**Multiplication assignment to sparse vectors*************************************************
2166  // No special implementation for the multiplication assignment to sparse vectors.
2167  //**********************************************************************************************
2168 
2169  //**Division assignment to dense vectors********************************************************
2182  template< typename VT1 > // Type of the target dense vector
2183  friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2184  {
2186 
2190 
2191  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2192 
2193  const ResultType tmp( serial( rhs ) );
2194  divAssign( ~lhs, tmp );
2195  }
2197  //**********************************************************************************************
2198 
2199  //**Division assignment to sparse vectors*******************************************************
2200  // No special implementation for the division assignment to sparse vectors.
2201  //**********************************************************************************************
2202 
2203  //**SMP assignment to dense vectors*************************************************************
2218  template< typename VT1 > // Type of the target dense vector
2219  friend inline auto smpAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2220  -> EnableIf_t< UseSMPAssign_v<VT1> >
2221  {
2223 
2224  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2225 
2226  if( rhs.mat_.rows() == 0UL ) {
2227  reset( ~lhs );
2228  return;
2229  }
2230  else if( rhs.mat_.columns() == 0UL ) {
2231  return;
2232  }
2233 
2234  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2235  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2236 
2237  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2238  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2239  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2240  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2241 
2242  smpAssign( ~lhs, x * A );
2243  }
2245  //**********************************************************************************************
2246 
2247  //**SMP assignment to sparse vectors************************************************************
2262  template< typename VT1 > // Type of the target sparse vector
2263  friend inline auto smpAssign( SparseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2264  -> EnableIf_t< UseSMPAssign_v<VT1> >
2265  {
2267 
2271 
2272  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2273 
2274  const ResultType tmp( rhs );
2275  smpAssign( ~lhs, tmp );
2276  }
2278  //**********************************************************************************************
2279 
2280  //**SMP addition assignment to dense vectors****************************************************
2295  template< typename VT1 > // Type of the target dense vector
2296  friend inline auto smpAddAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2297  -> EnableIf_t< UseSMPAssign_v<VT1> >
2298  {
2300 
2301  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2302 
2303  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2304  return;
2305  }
2306 
2307  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2308  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2309 
2310  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2311  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2312  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2313  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2314 
2315  smpAddAssign( ~lhs, x * A );
2316  }
2318  //**********************************************************************************************
2319 
2320  //**SMP addition assignment to sparse vectors***************************************************
2321  // No special implementation for the SMP addition assignment to sparse vectors.
2322  //**********************************************************************************************
2323 
2324  //**SMP subtraction assignment to dense vectors*************************************************
2339  template< typename VT1 > // Type of the target dense vector
2340  friend inline auto smpSubAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2341  -> EnableIf_t< UseSMPAssign_v<VT1> >
2342  {
2344 
2345  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2346 
2347  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2348  return;
2349  }
2350 
2351  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2352  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2353 
2354  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2355  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2356  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2357  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2358 
2359  smpSubAssign( ~lhs, x * A );
2360  }
2362  //**********************************************************************************************
2363 
2364  //**SMP subtraction assignment to sparse vectors************************************************
2365  // No special implementation for the SMP subtraction assignment to sparse vectors.
2366  //**********************************************************************************************
2367 
2368  //**SMP multiplication assignment to dense vectors**********************************************
2383  template< typename VT1 > // Type of the target dense vector
2384  friend inline auto smpMultAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2385  -> EnableIf_t< UseSMPAssign_v<VT1> >
2386  {
2388 
2392 
2393  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2394 
2395  const ResultType tmp( rhs );
2396  smpMultAssign( ~lhs, tmp );
2397  }
2399  //**********************************************************************************************
2400 
2401  //**SMP multiplication assignment to sparse vectors*********************************************
2402  // No special implementation for the SMP multiplication assignment to sparse vectors.
2403  //**********************************************************************************************
2404 
2405  //**SMP division assignment to dense vectors****************************************************
2420  template< typename VT1 > // Type of the target dense vector
2421  friend inline auto smpDivAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2422  -> EnableIf_t< UseSMPAssign_v<VT1> >
2423  {
2425 
2429 
2430  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2431 
2432  const ResultType tmp( rhs );
2433  smpDivAssign( ~lhs, tmp );
2434  }
2436  //**********************************************************************************************
2437 
2438  //**SMP division assignment to sparse vectors***************************************************
2439  // No special implementation for the SMP division assignment to sparse vectors.
2440  //**********************************************************************************************
2441 
2442  //**Compile time checks*************************************************************************
2450  //**********************************************************************************************
2451 };
2452 //*************************************************************************************************
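//*************************************************************************************************
// Illustrative usage sketch (hypothetical variables, not part of the original source): the
// expression class above is not instantiated directly but results from multiplying a dense row
// vector with a row-major dense matrix, e.g.
//
//    #include <blaze/Blaze.h>
//
//    blaze::DynamicVector<double,blaze::rowVector> x( 5UL, 1.0 );
//    blaze::DynamicMatrix<double,blaze::rowMajor>  A( 5UL, 7UL, 2.0 );
//    blaze::DynamicVector<double,blaze::rowVector> y;
//
//    y = x * A;  // creates a TDVecDMatMultExpr<...> and dispatches to one of the kernels above
//*************************************************************************************************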
2453 
2454 
2455 
2456 
2457 //=================================================================================================
2458 //
2459 // DVECSCALARMULTEXPR SPECIALIZATION
2460 //
2461 //=================================================================================================
2462 
2463 //*************************************************************************************************
2471 template< typename VT // Type of the left-hand side dense vector
2472  , typename MT // Type of the right-hand side dense matrix
2473  , typename ST > // Type of the right-hand side scalar value
2474 class DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >
2475  : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true > >
2476  , private Computation
2477 {
2478  private:
2479  //**Type definitions****************************************************************************
2480  using VMM = TDVecDMatMultExpr<VT,MT>;
2481  using RES = ResultType_t<VMM>;
2482  using VRT = ResultType_t<VT>;
2483  using MRT = ResultType_t<MT>;
2484  using VET = ElementType_t<VRT>;
2485  using MET = ElementType_t<MRT>;
2486  using VCT = CompositeType_t<VT>;
2487  using MCT = CompositeType_t<MT>;
2488  //**********************************************************************************************
2489 
2490  //**********************************************************************************************
2492  static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
2493  //**********************************************************************************************
2494 
2495  //**********************************************************************************************
2497  static constexpr bool evaluateMatrix =
2498  ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2499  IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
2500  //**********************************************************************************************
2501 
2502  //**********************************************************************************************
2504 
2507  template< typename T1 >
2508  static constexpr bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
2509  //**********************************************************************************************
2510 
2511  //**********************************************************************************************
2513 
2515  template< typename T1, typename T2, typename T3, typename T4 >
2516  static constexpr bool UseBlasKernel_v =
2518  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2519  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2520  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2521  !IsDiagonal_v<T3> &&
2522  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2523  IsBLASCompatible_v< ElementType_t<T1> > &&
2524  IsBLASCompatible_v< ElementType_t<T2> > &&
2525  IsBLASCompatible_v< ElementType_t<T3> > &&
2526  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2527  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2528  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
2529  //**********************************************************************************************
2530 
2531  //**********************************************************************************************
2533 
2536  template< typename T1, typename T2, typename T3, typename T4 >
2537  static constexpr bool UseVectorizedDefaultKernel_v =
2538  ( useOptimizedKernels &&
2539  !IsDiagonal_v<T3> &&
2540  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2541  IsSIMDCombinable_v< ElementType_t<T1>
2542  , ElementType_t<T2>
2543  , ElementType_t<T3>
2544  , T4 > &&
2545  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2546  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
2547  //**********************************************************************************************
2548 
2549  public:
2550  //**Type definitions****************************************************************************
2551  using This = DVecScalarMultExpr<VMM,ST,true>;
2552  using BaseType = DenseVector<This,true>;
2553  using ResultType = MultTrait_t<RES,ST>;
2554  using TransposeType = TransposeType_t<ResultType>;
2555  using ElementType = ElementType_t<ResultType>;
2556  using SIMDType = SIMDTrait_t<ElementType>;
2557  using ReturnType = const ElementType;
2558  using CompositeType = const ResultType;
2559 
2561  using LeftOperand = const TDVecDMatMultExpr<VT,MT>;
2562 
2564  using RightOperand = ST;
2565 
2567  using LT = If_t< evaluateVector, const VRT, VCT >;
2568 
2570  using RT = If_t< evaluateMatrix, const MRT, MCT >;
2571  //**********************************************************************************************
2572 
2573  //**Compilation flags***************************************************************************
2575  static constexpr bool simdEnabled =
2576  ( !IsDiagonal_v<MT> &&
2577  VT::simdEnabled && MT::simdEnabled &&
2578  IsSIMDCombinable_v<VET,MET,ST> &&
2579  HasSIMDAdd_v<VET,MET> &&
2580  HasSIMDMult_v<VET,MET> );
2581 
2583  static constexpr bool smpAssignable =
2584  ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
2585  //**********************************************************************************************
2586 
2587  //**SIMD properties*****************************************************************************
2589  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
2590  //**********************************************************************************************
2591 
2592  //**Constructor*********************************************************************************
2598  explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )
2599  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2600  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2601  {}
2602  //**********************************************************************************************
2603 
2604  //**Subscript operator**************************************************************************
2610  inline ReturnType operator[]( size_t index ) const {
2611  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2612  return vector_[index] * scalar_;
2613  }
2614  //**********************************************************************************************
2615 
2616  //**At function*********************************************************************************
2623  inline ReturnType at( size_t index ) const {
2624  if( index >= vector_.size() ) {
2625  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2626  }
2627  return (*this)[index];
2628  }
2629  //**********************************************************************************************
2630 
2631  //**Size function*******************************************************************************
2636  inline size_t size() const {
2637  return vector_.size();
2638  }
2639  //**********************************************************************************************
2640 
2641  //**Left operand access*************************************************************************
2646  inline LeftOperand leftOperand() const {
2647  return vector_;
2648  }
2649  //**********************************************************************************************
2650 
2651  //**Right operand access************************************************************************
2656  inline RightOperand rightOperand() const {
2657  return scalar_;
2658  }
2659  //**********************************************************************************************
2660 
2661  //**********************************************************************************************
2667  template< typename T >
2668  inline bool canAlias( const T* alias ) const {
2669  return vector_.canAlias( alias );
2670  }
2671  //**********************************************************************************************
2672 
2673  //**********************************************************************************************
2679  template< typename T >
2680  inline bool isAliased( const T* alias ) const {
2681  return vector_.isAliased( alias );
2682  }
2683  //**********************************************************************************************
2684 
2685  //**********************************************************************************************
2690  inline bool isAligned() const {
2691  return vector_.isAligned();
2692  }
2693  //**********************************************************************************************
2694 
2695  //**********************************************************************************************
2700  inline bool canSMPAssign() const noexcept {
2701  RightOperand_t<VMM> A( vector_.rightOperand() );
2702  return ( !BLAZE_BLAS_MODE ||
2705  ( IsComputation_v<MT> && !evaluateMatrix ) ||
2706  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2707  ( size() > SMP_TDVECDMATMULT_THRESHOLD );
2708  }
2709  //**********************************************************************************************
2710 
2711  private:
2712  //**Member variables****************************************************************************
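2713  LeftOperand  vector_;  //!< Left-hand side dense vector of the multiplication expression.
2714  RightOperand scalar_;  //!< Right-hand side scalar of the multiplication expression.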
2715  //**********************************************************************************************
2716 
2717  //**Assignment to dense vectors*****************************************************************
2729  template< typename VT1 > // Type of the target dense vector
2730  friend inline void assign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
2731  {
2733 
2734  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2735 
2736  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
2737  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
2738 
2739  if( right.rows() == 0UL ) {
2740  reset( ~lhs );
2741  return;
2742  }
2743  else if( right.columns() == 0UL ) {
2744  return;
2745  }
2746 
2747  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
2748  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
2749 
2750  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
2751  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
2752  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
2753  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2754 
2755  DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
2756  }
2757  //**********************************************************************************************
2758 
2759  //**Assignment to dense vectors (kernel selection)**********************************************
2770  template< typename VT1 // Type of the left-hand side target vector
2771  , typename VT2 // Type of the left-hand side vector operand
2772  , typename MT1 // Type of the right-hand side matrix operand
2773  , typename ST2 > // Type of the scalar value
2774  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2775  {
2776  if( ( IsDiagonal_v<MT1> ) ||
2777  ( IsComputation_v<MT> && !evaluateMatrix ) ||
2778  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2779  selectSmallAssignKernel( y, x, A, scalar );
2780  else
2781  selectBlasAssignKernel( y, x, A, scalar );
2782  }
2783  //**********************************************************************************************
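   // Note (illustrative): the kernel selection mirrors the unscaled expression class: diagonal
   // matrices, unevaluated matrix computations and products below TDVECDMATMULT_THRESHOLD are
   // routed to the small-matrix kernels, everything else to the BLAS kernel, which in turn falls
   // back to the blocked large-matrix kernel if no suitable BLAS routine is available.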
2784 
2785  //**Default assignment to dense vectors*********************************************************
2799  template< typename VT1 // Type of the left-hand side target vector
2800  , typename VT2 // Type of the left-hand side vector operand
2801  , typename MT1 // Type of the right-hand side matrix operand
2802  , typename ST2 > // Type of the scalar value
2803  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2804  {
2805  const size_t M( A.rows() );
2806  const size_t N( A.columns() );
2807 
2808  if( IsStrictlyUpper_v<MT1> ) {
2809  reset( y[0] );
2810  }
2811 
2812  if( !IsLower_v<MT1> )
2813  {
2814  for( size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<N; ++j ) {
2815  y[j] = x[0UL] * A(0UL,j);
2816  }
2817  }
2818 
2819  for( size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
2820  {
2821  if( IsDiagonal_v<MT1> )
2822  {
2823  y[i] = x[i] * A(i,i) * scalar;
2824  }
2825  else
2826  {
2827  const size_t jbegin( ( IsUpper_v<MT1> )
2828  ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
2829  :( 0UL ) );
2830  const size_t jend( ( IsLower_v<MT1> )
2831  ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
2832  :( N ) );
2833  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2834 
2835  const size_t jnum( jend - jbegin );
2836  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2837 
2838  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2839  y[j ] += x[i] * A(i,j );
2840  y[j+1UL] += x[i] * A(i,j+1UL);
2841  }
2842  if( jpos < jend ) {
2843  y[jpos] += x[i] * A(i,jpos);
2844  }
2845  if( IsLower_v<MT1> ) {
2846  y[jend] = x[i] * A(i,jend);
2847  }
2848  }
2849  }
2850 
2851  if( IsStrictlyLower_v<MT1> ) {
2852  reset( y[N-1UL] );
2853  }
2854 
2855  if( !IsDiagonal_v<MT1> )
2856  {
2857  const size_t iend( IsStrictlyLower_v<MT1> ? N-1UL : N );
2858  for( size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<iend; ++j ) {
2859  y[j] *= scalar;
2860  }
2861  }
2862  }
2863  //**********************************************************************************************
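   // Note (illustrative): for non-diagonal matrices this default kernel first accumulates the
   // unscaled product y = x * A and applies the scalar in the separate loop at the end; only the
   // diagonal case multiplies by the scalar immediately. The observable result equals computing
   // y = ( x * A ) * scalar element by element.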
2864 
2865  //**Default assignment to dense vectors (small matrices)****************************************
2879  template< typename VT1 // Type of the left-hand side target vector
2880  , typename VT2 // Type of the left-hand side vector operand
2881  , typename MT1 // Type of the right-hand side matrix operand
2882  , typename ST2 > // Type of the scalar value
2883  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2884  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2885  {
2886  selectDefaultAssignKernel( y, x, A, scalar );
2887  }
2888  //**********************************************************************************************
2889 
2890  //**Vectorized default assignment to dense vectors (small matrices)*****************************
2904  template< typename VT1 // Type of the left-hand side target vector
2905  , typename VT2 // Type of the left-hand side vector operand
2906  , typename MT1 // Type of the right-hand side matrix operand
2907  , typename ST2 > // Type of the scalar value
2908  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2909  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2910  {
2911  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
2912 
2913  const size_t M( A.rows() );
2914  const size_t N( A.columns() );
2915 
2916  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2917  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2918 
2919  const SIMDType factor( set( scalar ) );
2920 
2921  size_t j( 0UL );
2922 
2923  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
2924  {
2925  const size_t ibegin( ( IsLower_v<MT1> )
2926  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2927  :( 0UL ) );
2928  const size_t iend( ( IsUpper_v<MT1> )
2929  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2930  :( M ) );
2931  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2932 
2933  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2934 
2935  for( size_t i=ibegin; i<iend; ++i ) {
2936  const SIMDType x1( set( x[i] ) );
2937  xmm1 += x1 * A.load(i,j );
2938  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2939  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2940  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2941  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
2942  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
2943  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
2944  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
2945  }
2946 
2947  y.store( j , xmm1*factor );
2948  y.store( j+SIMDSIZE , xmm2*factor );
2949  y.store( j+SIMDSIZE*2UL, xmm3*factor );
2950  y.store( j+SIMDSIZE*3UL, xmm4*factor );
2951  y.store( j+SIMDSIZE*4UL, xmm5*factor );
2952  y.store( j+SIMDSIZE*5UL, xmm6*factor );
2953  y.store( j+SIMDSIZE*6UL, xmm7*factor );
2954  y.store( j+SIMDSIZE*7UL, xmm8*factor );
2955  }
2956 
2957  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2958  {
2959  const size_t ibegin( ( IsLower_v<MT1> )
2960  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2961  :( 0UL ) );
2962  const size_t iend( ( IsUpper_v<MT1> )
2963  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2964  :( M ) );
2965  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2966 
2967  SIMDType xmm1, xmm2, xmm3, xmm4;
2968 
2969  for( size_t i=ibegin; i<iend; ++i ) {
2970  const SIMDType x1( set( x[i] ) );
2971  xmm1 += x1 * A.load(i,j );
2972  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2973  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2974  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2975  }
2976 
2977  y.store( j , xmm1*factor );
2978  y.store( j+SIMDSIZE , xmm2*factor );
2979  y.store( j+SIMDSIZE*2UL, xmm3*factor );
2980  y.store( j+SIMDSIZE*3UL, xmm4*factor );
2981  }
2982 
2983  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2984  {
2985  const size_t ibegin( ( IsLower_v<MT1> )
2986  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2987  :( 0UL ) );
2988  const size_t iend( ( IsUpper_v<MT1> )
2989  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2990  :( M ) );
2991  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2992 
2993  SIMDType xmm1, xmm2, xmm3;
2994 
2995  for( size_t i=ibegin; i<iend; ++i ) {
2996  const SIMDType x1( set( x[i] ) );
2997  xmm1 += x1 * A.load(i,j );
2998  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2999  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3000  }
3001 
3002  y.store( j , xmm1*factor );
3003  y.store( j+SIMDSIZE , xmm2*factor );
3004  y.store( j+SIMDSIZE*2UL, xmm3*factor );
3005  }
3006 
3007  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3008  {
3009  const size_t ibegin( ( IsLower_v<MT1> )
3010  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3011  :( 0UL ) );
3012  const size_t iend( ( IsUpper_v<MT1> )
3013  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3014  :( M ) );
3015  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3016 
3017  SIMDType xmm1, xmm2;
3018 
3019  for( size_t i=ibegin; i<iend; ++i ) {
3020  const SIMDType x1( set( x[i] ) );
3021  xmm1 += x1 * A.load(i,j );
3022  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3023  }
3024 
3025  y.store( j , xmm1*factor );
3026  y.store( j+SIMDSIZE, xmm2*factor );
3027  }
3028 
3029  for( ; j<jpos; j+=SIMDSIZE )
3030  {
3031  const size_t ibegin( ( IsLower_v<MT1> )
3032  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3033  :( 0UL ) );
3034  const size_t iend( ( IsUpper_v<MT1> )
3035  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3036  :( M ) );
3037  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3038 
3039  SIMDType xmm1;
3040 
3041  for( size_t i=ibegin; i<iend; ++i ) {
3042  xmm1 += set( x[i] ) * A.load(i,j);
3043  }
3044 
3045  y.store( j, xmm1*factor );
3046  }
3047 
3048  for( ; remainder && j<N; ++j )
3049  {
3050  const size_t ibegin( ( IsLower_v<MT1> )
3051  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3052  :( 0UL ) );
3053  const size_t iend( ( IsUpper_v<MT1> )
3054  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3055  :( M ) );
3056  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3057 
3058  ElementType value{};
3059 
3060  for( size_t i=ibegin; i<iend; ++i ) {
3061  value += x[i] * A(i,j);
3062  }
3063 
3064  y[j] = value * scalar;
3065  }
3066  }
3067  //**********************************************************************************************
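   // Note (illustrative): jpos = N & size_t(-SIMDSIZE) rounds N down to the next lower multiple
   // of SIMDSIZE, so the SIMD loops above stop at the last complete register and the trailing
   // scalar loop handles the remainder. For instance, N = 100 with SIMDSIZE = 8 gives jpos = 96,
   // leaving four columns for the element-wise tail.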
3068 
3069  //**Default assignment to dense vectors (large matrices)****************************************
3083  template< typename VT1 // Type of the left-hand side target vector
3084  , typename VT2 // Type of the left-hand side vector operand
3085  , typename MT1 // Type of the right-hand side matrix operand
3086  , typename ST2 > // Type of the scalar value
3087  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3088  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3089  {
3090  selectDefaultAssignKernel( y, x, A, scalar );
3091  }
3092  //**********************************************************************************************
3093 
3094  //**Vectorized default assignment to dense vectors (large matrices)*****************************
3108  template< typename VT1 // Type of the left-hand side target vector
3109  , typename VT2 // Type of the left-hand side vector operand
3110  , typename MT1 // Type of the right-hand side matrix operand
3111  , typename ST2 > // Type of the scalar value
3112  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3113  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3114  {
3115  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3116 
3117  const size_t M( A.rows() );
3118  const size_t N( A.columns() );
3119 
3120  const size_t jblock( 32768UL / sizeof( ElementType ) );
3121  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3122 
3123  const SIMDType factor( set( scalar ) );
3124 
3125  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3126 
3127  reset( y );
3128 
3129  for( size_t jj=0UL; jj<N; jj+=jblock ) {
3130  for( size_t ii=0UL; ii<M; ii+=iblock )
3131  {
3132  const size_t iend( min( ii+iblock, M ) );
3133  const size_t jtmp( min( jj+jblock, N ) );
3134  const size_t jend( ( IsLower_v<MT1> )
3135  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3136  :( jtmp ) );
3137 
3138  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3139  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3140 
3141  size_t j( ( IsUpper_v<MT1> )
3142  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
3143  :( jj ) );
3144 
3145  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3146  {
3147  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3148 
3149  for( size_t i=ii; i<iend; ++i ) {
3150  const SIMDType x1( set( x[i] ) );
3151  xmm1 += x1 * A.load(i,j );
3152  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3153  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3154  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3155  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3156  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3157  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3158  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3159  }
3160 
3161  y.store( j , y.load(j ) + xmm1*factor );
3162  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3163  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3164  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3165  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3166  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3167  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3168  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3169  }
3170 
3171  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3172  {
3173  SIMDType xmm1, xmm2, xmm3, xmm4;
3174 
3175  for( size_t i=ii; i<iend; ++i ) {
3176  const SIMDType x1( set( x[i] ) );
3177  xmm1 += x1 * A.load(i,j );
3178  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3179  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3180  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3181  }
3182 
3183  y.store( j , y.load(j ) + xmm1*factor );
3184  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3185  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3186  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3187  }
3188 
3189  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3190  {
3191  SIMDType xmm1, xmm2, xmm3;
3192 
3193  for( size_t i=ii; i<iend; ++i ) {
3194  const SIMDType x1( set( x[i] ) );
3195  xmm1 += x1 * A.load(i,j );
3196  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3197  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3198  }
3199 
3200  y.store( j , y.load(j ) + xmm1*factor );
3201  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3202  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3203  }
3204 
3205  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3206  {
3207  SIMDType xmm1, xmm2;
3208 
3209  for( size_t i=ii; i<iend; ++i ) {
3210  const SIMDType x1( set( x[i] ) );
3211  xmm1 += x1 * A.load(i,j );
3212  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3213  }
3214 
3215  y.store( j , y.load(j ) + xmm1*factor );
3216  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3217  }
3218 
3219  for( ; j<jpos; j+=SIMDSIZE )
3220  {
3221  SIMDType xmm1;
3222 
3223  for( size_t i=ii; i<iend; ++i ) {
3224  xmm1 += set( x[i] ) * A.load(i,j);
3225  }
3226 
3227  y.store( j, y.load(j) + xmm1*factor );
3228  }
3229 
3230  for( ; remainder && j<jend; ++j )
3231  {
3232  ElementType value{};
3233 
3234  for( size_t i=ii; i<iend; ++i ) {
3235  value += x[i] * A(i,j);
3236  }
3237 
3238  y[j] += value * scalar;
3239  }
3240  }
3241  }
3242  }
3243  //**********************************************************************************************
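   // Note (illustrative): in contrast to the small-matrix kernel, this blocked kernel resets y
   // up front and accumulates via y.load(j) + xmm*factor, because every block of columns is
   // revisited once per row panel. The scalar factor is applied when the partial sums are merged
   // into y rather than inside the accumulation loops.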
3244 
3245  //**BLAS-based assignment to dense vectors (default)********************************************
3258  template< typename VT1 // Type of the left-hand side target vector
3259  , typename VT2 // Type of the left-hand side vector operand
3260  , typename MT1 // Type of the right-hand side matrix operand
3261  , typename ST2 > // Type of the scalar value
3262  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3263  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3264  {
3265  selectLargeAssignKernel( y, x, A, scalar );
3266  }
3267  //**********************************************************************************************
3268 
3269  //**BLAS-based assignment to dense vectors******************************************************
3270 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3271 
3284  template< typename VT1 // Type of the left-hand side target vector
3285  , typename VT2 // Type of the left-hand side vector operand
3286  , typename MT1 // Type of the right-hand side matrix operand
3287  , typename ST2 > // Type of the scalar value
3288  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3289  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3290  {
3291  using ET = ElementType_t<VT1>;
3292 
3293  if( IsTriangular_v<MT1> ) {
3294  assign( y, scalar * x );
3295  trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3296  }
3297  else {
3298  gemv( y, x, A, ET(scalar), ET(0) );
3299  }
3300  }
3301 #endif
3302  //**********************************************************************************************
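   // Note (illustrative): in the general branch gemv( y, x, A, ET(scalar), ET(0) ) computes
   // y = scalar*( x * A ) directly. In the triangular branch the scaled vector scalar*x is first
   // assigned to y and trmv() then multiplies y in place by the triangular matrix A.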
3303 
3304  //**Assignment to sparse vectors****************************************************************
3316  template< typename VT1 > // Type of the target sparse vector
3317  friend inline void assign( SparseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3318  {
3320 
3324 
3325  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3326 
3327  const ResultType tmp( serial( rhs ) );
3328  assign( ~lhs, tmp );
3329  }
3330  //**********************************************************************************************
3331 
3332  //**Addition assignment to dense vectors********************************************************
3344  template< typename VT1 > // Type of the target dense vector
3345  friend inline void addAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3346  {
3348 
3349  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3350 
3351  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3352  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3353 
3354  if( right.rows() == 0UL || right.columns() == 0UL ) {
3355  return;
3356  }
3357 
3358  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3359  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3360 
3361  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3362  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3363  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3364  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3365 
3366  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3367  }
3368  //**********************************************************************************************
3369 
3370  //**Addition assignment to dense vectors (kernel selection)*************************************
3381  template< typename VT1 // Type of the left-hand side target vector
3382  , typename VT2 // Type of the left-hand side vector operand
3383  , typename MT1 // Type of the right-hand side matrix operand
3384  , typename ST2 > // Type of the scalar value
3385  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3386  {
3387  if( ( IsDiagonal_v<MT1> ) ||
3388  ( IsComputation_v<MT> && !evaluateMatrix ) ||
3389  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3390  selectSmallAddAssignKernel( y, x, A, scalar );
3391  else
3392  selectBlasAddAssignKernel( y, x, A, scalar );
3393  }
3394  //**********************************************************************************************
3395 
3396  //**Default addition assignment to dense vectors************************************************
3410  template< typename VT1 // Type of the left-hand side target vector
3411  , typename VT2 // Type of the left-hand side vector operand
3412  , typename MT1 // Type of the right-hand side matrix operand
3413  , typename ST2 > // Type of the scalar value
3414  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3415  {
3416  y.addAssign( x * A * scalar );
3417  }
3418  //**********************************************************************************************
3419 
3420  //**Default addition assignment to dense vectors (small matrices)*******************************
3434  template< typename VT1 // Type of the left-hand side target vector
3435  , typename VT2 // Type of the left-hand side vector operand
3436  , typename MT1 // Type of the right-hand side matrix operand
3437  , typename ST2 > // Type of the scalar value
3438  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3439  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3440  {
3441  selectDefaultAddAssignKernel( y, x, A, scalar );
3442  }
3443  //**********************************************************************************************
3444 
3445  //**Vectorized default addition assignment to dense vectors (small matrices)********************
3460  template< typename VT1 // Type of the left-hand side target vector
3461  , typename VT2 // Type of the left-hand side vector operand
3462  , typename MT1 // Type of the right-hand side matrix operand
3463  , typename ST2 > // Type of the scalar value
3464  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3465  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3466  {
3467  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3468 
3469  const size_t M( A.rows() );
3470  const size_t N( A.columns() );
3471 
3472  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3473  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3474 
3475  const SIMDType factor( set( scalar ) );
3476 
3477  size_t j( 0UL );
3478 
3479  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3480  {
3481  const size_t ibegin( ( IsLower_v<MT1> )
3482  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3483  :( 0UL ) );
3484  const size_t iend( ( IsUpper_v<MT1> )
3485  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3486  :( M ) );
3487  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3488 
3489  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3490 
3491  for( size_t i=ibegin; i<iend; ++i ) {
3492  const SIMDType x1( set( x[i] ) );
3493  xmm1 += x1 * A.load(i,j );
3494  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3495  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3496  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3497  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3498  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3499  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3500  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3501  }
3502 
3503  y.store( j , y.load(j ) + xmm1*factor );
3504  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3505  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3506  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3507  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3508  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3509  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3510  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3511  }
3512 
3513  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3514  {
3515  const size_t ibegin( ( IsLower_v<MT1> )
3516  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3517  :( 0UL ) );
3518  const size_t iend( ( IsUpper_v<MT1> )
3519  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3520  :( M ) );
3521  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3522 
3523  SIMDType xmm1, xmm2, xmm3, xmm4;
3524 
3525  for( size_t i=ibegin; i<iend; ++i ) {
3526  const SIMDType x1( set( x[i] ) );
3527  xmm1 += x1 * A.load(i,j );
3528  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3529  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3530  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3531  }
3532 
3533  y.store( j , y.load(j ) + xmm1*factor );
3534  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3535  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3536  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3537  }
3538 
3539  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3540  {
3541  const size_t ibegin( ( IsLower_v<MT1> )
3542  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3543  :( 0UL ) );
3544  const size_t iend( ( IsUpper_v<MT1> )
3545  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3546  :( M ) );
3547  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3548 
3549  SIMDType xmm1, xmm2, xmm3;
3550 
3551  for( size_t i=ibegin; i<iend; ++i ) {
3552  const SIMDType x1( set( x[i] ) );
3553  xmm1 += x1 * A.load(i,j );
3554  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3555  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3556  }
3557 
3558  y.store( j , y.load(j ) + xmm1*factor );
3559  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3560  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3561  }
3562 
3563  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3564  {
3565  const size_t ibegin( ( IsLower_v<MT1> )
3566  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3567  :( 0UL ) );
3568  const size_t iend( ( IsUpper_v<MT1> )
3569  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3570  :( M ) );
3571  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3572 
3573  SIMDType xmm1, xmm2;
3574 
3575  for( size_t i=ibegin; i<iend; ++i ) {
3576  const SIMDType x1( set( x[i] ) );
3577  xmm1 += x1 * A.load(i,j );
3578  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3579  }
3580 
3581  y.store( j , y.load(j ) + xmm1*factor );
3582  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3583  }
3584 
3585  for( ; j<jpos; j+=SIMDSIZE )
3586  {
3587  const size_t ibegin( ( IsLower_v<MT1> )
3588  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3589  :( 0UL ) );
3590  const size_t iend( ( IsUpper_v<MT1> )
3591  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3592  :( M ) );
3593  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3594 
3595  SIMDType xmm1;
3596 
3597  for( size_t i=ibegin; i<iend; ++i ) {
3598  xmm1 += set( x[i] ) * A.load(i,j);
3599  }
3600 
3601  y.store( j, y.load(j) + xmm1*factor );
3602  }
3603 
3604  for( ; remainder && j<N; ++j )
3605  {
3606  const size_t ibegin( ( IsLower_v<MT1> )
3607  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3608  :( 0UL ) );
3609  const size_t iend( ( IsUpper_v<MT1> )
3610  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3611  :( M ) );
3612  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3613 
3614  ElementType value{};
3615 
3616  for( size_t i=ibegin; i<iend; ++i ) {
3617  value += x[i] * A(i,j);
3618  }
3619 
3620  y[j] += value * scalar;
3621  }
3622  }
3623  //**********************************************************************************************
3624 
3625  //**Default addition assignment to dense vectors (large matrices)*******************************
3639  template< typename VT1 // Type of the left-hand side target vector
3640  , typename VT2 // Type of the left-hand side vector operand
3641  , typename MT1 // Type of the right-hand side matrix operand
3642  , typename ST2 > // Type of the scalar value
3643  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3644  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3645  {
3646  selectDefaultAddAssignKernel( y, x, A, scalar );
3647  }
3648  //**********************************************************************************************
3649 
3650  //**Vectorized default addition assignment to dense vectors (large matrices)********************
3665  template< typename VT1 // Type of the left-hand side target vector
3666  , typename VT2 // Type of the left-hand side vector operand
3667  , typename MT1 // Type of the right-hand side matrix operand
3668  , typename ST2 > // Type of the scalar value
3669  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3670  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3671  {
3672  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3673 
3674  const size_t M( A.rows() );
3675  const size_t N( A.columns() );
3676 
3677  const size_t jblock( 32768UL / sizeof( ElementType ) );
3678  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3679 
3680  const SIMDType factor( set( scalar ) );
3681 
3682  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3683 
3684  for( size_t jj=0UL; jj<N; jj+=jblock ) {
3685  for( size_t ii=0UL; ii<M; ii+=iblock )
3686  {
3687  const size_t iend( min( ii+iblock, M ) );
3688  const size_t jtmp( min( jj+jblock, N ) );
3689  const size_t jend( ( IsLower_v<MT1> )
3690  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3691  :( jtmp ) );
3692 
3693  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3694  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3695 
3696  size_t j( ( IsUpper_v<MT1> )
3697  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
3698  :( jj ) );
3699 
3700  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3701  {
3702  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3703 
3704  for( size_t i=ii; i<iend; ++i ) {
3705  const SIMDType x1( set( x[i] ) );
3706  xmm1 += x1 * A.load(i,j );
3707  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3708  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3709  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3710  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3711  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3712  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3713  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3714  }
3715 
3716  y.store( j , y.load(j ) + xmm1*factor );
3717  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3718  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3719  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3720  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3721  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3722  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3723  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3724  }
3725 
3726  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3727  {
3728  SIMDType xmm1, xmm2, xmm3, xmm4;
3729 
3730  for( size_t i=ii; i<iend; ++i ) {
3731  const SIMDType x1( set( x[i] ) );
3732  xmm1 += x1 * A.load(i,j );
3733  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3734  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3735  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3736  }
3737 
3738  y.store( j , y.load(j ) + xmm1*factor );
3739  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3740  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3741  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3742  }
3743 
3744  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3745  {
3746  SIMDType xmm1, xmm2, xmm3;
3747 
3748  for( size_t i=ii; i<iend; ++i ) {
3749  const SIMDType x1( set( x[i] ) );
3750  xmm1 += x1 * A.load(i,j );
3751  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3752  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3753  }
3754 
3755  y.store( j , y.load(j ) + xmm1*factor );
3756  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3757  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3758  }
3759 
3760  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3761  {
3762  SIMDType xmm1, xmm2;
3763 
3764  for( size_t i=ii; i<iend; ++i ) {
3765  const SIMDType x1( set( x[i] ) );
3766  xmm1 += x1 * A.load(i,j );
3767  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3768  }
3769 
3770  y.store( j , y.load(j ) + xmm1*factor );
3771  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3772  }
3773 
3774  for( ; j<jpos; j+=SIMDSIZE )
3775  {
3776  SIMDType xmm1;
3777 
3778  for( size_t i=ii; i<iend; ++i ) {
3779  xmm1 += set( x[i] ) * A.load(i,j);
3780  }
3781 
3782  y.store( j, y.load(j) + xmm1*factor );
3783  }
3784 
3785  for( ; remainder && j<jend; ++j )
3786  {
3787  ElementType value{};
3788 
3789  for( size_t i=ii; i<iend; ++i ) {
3790  value += x[i] * A(i,j);
3791  }
3792 
3793  y[j] += value * scalar;
3794  }
3795  }
3796  }
3797  }
3798  //**********************************************************************************************
3799 
3800  //**BLAS-based addition assignment to dense vectors (default)***********************************
3814  template< typename VT1 // Type of the left-hand side target vector
3815  , typename VT2 // Type of the left-hand side vector operand
3816  , typename MT1 // Type of the right-hand side matrix operand
3817  , typename ST2 > // Type of the scalar value
3818  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3819  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3820  {
3821  selectLargeAddAssignKernel( y, x, A, scalar );
3822  }
3823  //**********************************************************************************************
3824 
3825  //**BLAS-based addition assignment to dense vectors*********************************************
3826 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3827 
3840  template< typename VT1 // Type of the left-hand side target vector
3841  , typename VT2 // Type of the left-hand side vector operand
3842  , typename MT1 // Type of the right-hand side matrix operand
3843  , typename ST2 > // Type of the scalar value
3844  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3845  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3846  {
3847  using ET = ElementType_t<VT1>;
3848 
3849  if( IsTriangular_v<MT1> ) {
3850  ResultType_t<VT1> tmp( serial( scalar * x ) );
3851  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3852  addAssign( y, tmp );
3853  }
3854  else {
3855  gemv( y, x, A, ET(scalar), ET(1) );
3856  }
3857  }
3858 #endif
3859  //**********************************************************************************************
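 //**Example: BLAS mapping of the addition assignment*******************************************
 // For non-triangular matrices the BLAS kernel above delegates to gemv with alpha = scalar and
 // beta = 1, i.e. y += scalar * ( x * A ) with A a row-major M-by-N matrix. A rough CBLAS
 // equivalent for double-precision operands with contiguous data access (an assumption made
 // purely for illustration) would be:
 //
 // \code
 //    cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
 //                 A.data(), A.spacing(), x.data(), 1, 1.0, y.data(), 1 );
 // \endcode
 //**********************************************************************************************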
3860 
3861  //**Addition assignment to sparse vectors*******************************************************
3862  // No special implementation for the addition assignment to sparse vectors.
3863  //**********************************************************************************************
3864 
3865  //**Subtraction assignment to dense vectors*****************************************************
3877  template< typename VT1 > // Type of the target dense vector
3878  friend inline void subAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3879  {
3881 
3882  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3883 
3884  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3885  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3886 
3887  if( right.rows() == 0UL || right.columns() == 0UL ) {
3888  return;
3889  }
3890 
3891  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3892  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3893 
3894  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3895  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3896  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3897  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3898 
3899  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
3900  }
3901  //**********************************************************************************************
3902 
3903  //**Subtraction assignment to dense vectors (kernel selection)**********************************
3914  template< typename VT1 // Type of the left-hand side target vector
3915  , typename VT2 // Type of the left-hand side vector operand
3916  , typename MT1 // Type of the right-hand side matrix operand
3917  , typename ST2 > // Type of the scalar value
3918  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3919  {
3920  if( ( IsDiagonal_v<MT1> ) ||
3921  ( IsComputation_v<MT> && !evaluateMatrix ) ||
3922  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3923  selectSmallSubAssignKernel( y, x, A, scalar );
3924  else
3925  selectBlasSubAssignKernel( y, x, A, scalar );
3926  }
3927  //**********************************************************************************************
3928 
3929  //**Default subtraction assignment to dense vectors*********************************************
3943  template< typename VT1 // Type of the left-hand side target vector
3944  , typename VT2 // Type of the left-hand side vector operand
3945  , typename MT1 // Type of the right-hand side matrix operand
3946  , typename ST2 > // Type of the scalar value
3947  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3948  {
3949  y.subAssign( x * A * scalar );
3950  }
3951  //**********************************************************************************************
3952 
3953  //**Default subtraction assignment to dense vectors (small matrices)****************************
3967  template< typename VT1 // Type of the left-hand side target vector
3968  , typename VT2 // Type of the left-hand side vector operand
3969  , typename MT1 // Type of the right-hand side matrix operand
3970  , typename ST2 > // Type of the scalar value
3971  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3972  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3973  {
3974  selectDefaultSubAssignKernel( y, x, A, scalar );
3975  }
3976  //**********************************************************************************************
3977 
3978  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
3993  template< typename VT1 // Type of the left-hand side target vector
3994  , typename VT2 // Type of the left-hand side vector operand
3995  , typename MT1 // Type of the right-hand side matrix operand
3996  , typename ST2 > // Type of the scalar value
3997  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3998  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3999  {
4000  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4001 
4002  const size_t M( A.rows() );
4003  const size_t N( A.columns() );
4004 
4005  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
4006  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4007 
4008  const SIMDType factor( set( scalar ) );
4009 
4010  size_t j( 0UL );
4011 
4012  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4013  {
4014  const size_t ibegin( ( IsLower_v<MT1> )
4015  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4016  :( 0UL ) );
4017  const size_t iend( ( IsUpper_v<MT1> )
4018  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4019  :( M ) );
4020  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4021 
4022  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4023 
4024  for( size_t i=ibegin; i<iend; ++i ) {
4025  const SIMDType x1( set( x[i] ) );
4026  xmm1 += x1 * A.load(i,j );
4027  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4028  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4029  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4030  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4031  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4032  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4033  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4034  }
4035 
4036  y.store( j , y.load(j ) - xmm1*factor );
4037  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4038  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4039  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4040  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4041  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4042  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4043  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4044  }
4045 
4046  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4047  {
4048  const size_t ibegin( ( IsLower_v<MT1> )
4049  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4050  :( 0UL ) );
4051  const size_t iend( ( IsUpper_v<MT1> )
4052  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4053  :( M ) );
4054  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4055 
4056  SIMDType xmm1, xmm2, xmm3, xmm4;
4057 
4058  for( size_t i=ibegin; i<iend; ++i ) {
4059  const SIMDType x1( set( x[i] ) );
4060  xmm1 += x1 * A.load(i,j );
4061  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4062  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4063  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4064  }
4065 
4066  y.store( j , y.load(j ) - xmm1*factor );
4067  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4068  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4069  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4070  }
4071 
4072  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4073  {
4074  const size_t ibegin( ( IsLower_v<MT1> )
4075  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4076  :( 0UL ) );
4077  const size_t iend( ( IsUpper_v<MT1> )
4078  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4079  :( M ) );
4080  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4081 
4082  SIMDType xmm1, xmm2, xmm3;
4083 
4084  for( size_t i=ibegin; i<iend; ++i ) {
4085  const SIMDType x1( set( x[i] ) );
4086  xmm1 += x1 * A.load(i,j );
4087  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4088  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4089  }
4090 
4091  y.store( j , y.load(j ) - xmm1*factor );
4092  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4093  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4094  }
4095 
4096  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4097  {
4098  const size_t ibegin( ( IsLower_v<MT1> )
4099  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4100  :( 0UL ) );
4101  const size_t iend( ( IsUpper_v<MT1> )
4102  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4103  :( M ) );
4104  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4105 
4106  SIMDType xmm1, xmm2;
4107 
4108  for( size_t i=ibegin; i<iend; ++i ) {
4109  const SIMDType x1( set( x[i] ) );
4110  xmm1 += x1 * A.load(i,j );
4111  xmm2 += x1 * A.load(i,j+SIMDSIZE);
4112  }
4113 
4114  y.store( j , y.load(j ) - xmm1*factor );
4115  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4116  }
4117 
4118  for( ; j<jpos; j+=SIMDSIZE )
4119  {
4120  const size_t ibegin( ( IsLower_v<MT1> )
4121  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4122  :( 0UL ) );
4123  const size_t iend( ( IsUpper_v<MT1> )
4124  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4125  :( M ) );
4126  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4127 
4128  SIMDType xmm1;
4129 
4130  for( size_t i=ibegin; i<iend; ++i ) {
4131  xmm1 += set( x[i] ) * A.load(i,j);
4132  }
4133 
4134  y.store( j, y.load(j) - xmm1*factor );
4135  }
4136 
4137  for( ; remainder && j<N; ++j )
4138  {
4139  const size_t ibegin( ( IsLower_v<MT1> )
4140  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4141  :( 0UL ) );
4142  const size_t iend( ( IsUpper_v<MT1> )
4143  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4144  :( M ) );
4145  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4146 
4147  ElementType value{};
4148 
4149  for( size_t i=ibegin; i<iend; ++i ) {
4150  value += x[i] * A(i,j);
4151  }
4152 
4153  y[j] -= value * scalar;
4154  }
4155  }
4156  //**********************************************************************************************
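 //**Example: scalar reference for the small subtraction assignment kernel**********************
 // A minimal sketch of the computation performed by the kernel above, showing how the row
 // bounds skip the zero triangle of lower/upper matrices (the SIMD column panels are omitted):
 //
 // \code
 //    for( size_t j=0UL; j<N; ++j ) {
 //       const size_t ibegin( IsLower_v<MT1> ? ( IsStrictlyLower_v<MT1> ? j+1UL : j ) : 0UL );
 //       const size_t iend  ( IsUpper_v<MT1> ? min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) : M );
 //
 //       ElementType value{};
 //       for( size_t i=ibegin; i<iend; ++i ) {
 //          value += x[i] * A(i,j);
 //       }
 //       y[j] -= value * scalar;
 //    }
 // \endcode
 //**********************************************************************************************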
4157 
4158  //**Default subtraction assignment to dense vectors (large matrices)****************************
4172  template< typename VT1 // Type of the left-hand side target vector
4173  , typename VT2 // Type of the left-hand side vector operand
4174  , typename MT1 // Type of the right-hand side matrix operand
4175  , typename ST2 > // Type of the scalar value
4176  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4177  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4178  {
4179  selectDefaultSubAssignKernel( y, x, A, scalar );
4180  }
4181  //**********************************************************************************************
4182 
4183  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
4198  template< typename VT1 // Type of the left-hand side target vector
4199  , typename VT2 // Type of the left-hand side vector operand
4200  , typename MT1 // Type of the right-hand side matrix operand
4201  , typename ST2 > // Type of the scalar value
4202  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4203  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4204  {
4205  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4206 
4207  const size_t M( A.rows() );
4208  const size_t N( A.columns() );
4209 
4210  const size_t jblock( 32768UL / sizeof( ElementType ) );
4211  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4212 
4213  const SIMDType factor( set( scalar ) );
4214 
4215  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
4216 
 4217  for( size_t jj=0UL; jj<N; jj+=jblock ) {
4218  for( size_t ii=0UL; ii<M; ii+=iblock )
4219  {
4220  const size_t iend( min( ii+iblock, M ) );
4221  const size_t jtmp( min( jj+jblock, N ) );
4222  const size_t jend( ( IsLower_v<MT1> )
4223  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
4224  :( jtmp ) );
4225 
4226  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4227  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4228 
4229  size_t j( ( IsUpper_v<MT1> )
4230  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
4231  :( jj ) );
4232 
4233  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4234  {
4235  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4236 
4237  for( size_t i=ii; i<iend; ++i ) {
4238  const SIMDType x1( set( x[i] ) );
4239  xmm1 += x1 * A.load(i,j );
4240  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4241  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4242  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4243  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4244  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4245  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4246  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4247  }
4248 
4249  y.store( j , y.load(j ) - xmm1*factor );
4250  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4251  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4252  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4253  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4254  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4255  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4256  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4257  }
4258 
4259  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4260  {
4261  SIMDType xmm1, xmm2, xmm3, xmm4;
4262 
4263  for( size_t i=ii; i<iend; ++i ) {
4264  const SIMDType x1( set( x[i] ) );
4265  xmm1 += x1 * A.load(i,j );
4266  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4267  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4268  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4269  }
4270 
4271  y.store( j , y.load(j ) - xmm1*factor );
4272  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4273  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4274  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4275  }
4276 
4277  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4278  {
4279  SIMDType xmm1, xmm2, xmm3;
4280 
4281  for( size_t i=ii; i<iend; ++i ) {
4282  const SIMDType x1( set( x[i] ) );
4283  xmm1 += x1 * A.load(i,j );
4284  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4285  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4286  }
4287 
4288  y.store( j , y.load(j ) - xmm1*factor );
4289  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4290  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4291  }
4292 
4293  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4294  {
4295  SIMDType xmm1, xmm2;
4296 
4297  for( size_t i=ii; i<iend; ++i ) {
4298  const SIMDType x1( set( x[i] ) );
4299  xmm1 += x1 * A.load(i,j );
4300  xmm2 += x1 * A.load(i,j+SIMDSIZE);
4301  }
4302 
4303  y.store( j , y.load(j ) - xmm1*factor );
4304  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4305  }
4306 
4307  for( ; j<jpos; j+=SIMDSIZE )
4308  {
4309  SIMDType xmm1;
4310 
4311  for( size_t i=ii; i<iend; ++i ) {
4312  xmm1 += set( x[i] ) * A.load(i,j);
4313  }
4314 
4315  y.store( j, y.load(j) - xmm1*factor );
4316  }
4317 
4318  for( ; remainder && j<jend; ++j )
4319  {
4320  ElementType value{};
4321 
4322  for( size_t i=ii; i<iend; ++i ) {
4323  value += x[i] * A(i,j);
4324  }
4325 
4326  y[j] -= value * scalar;
4327  }
4328  }
4329  }
4330  }
4331  //**********************************************************************************************
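 //**Example: cache blocking of the large kernels************************************************
 // The large kernels above traverse the matrix in column panels of roughly 32 KiB combined
 // with short row strips, so that the touched parts of y and A stay cache resident. For
 // double-precision elements this amounts to jblock = 32768/8 = 4096 columns per panel. A
 // sketch of the resulting loop structure (SIMD unrolling and triangular bounds omitted):
 //
 // \code
 //    for( size_t jj=0UL; jj<N; jj+=jblock ) {      // ~32 KiB wide column panel of y and A
 //       for( size_t ii=0UL; ii<M; ii+=iblock ) {   // strip of 4 or 8 rows of A
 //          // accumulate the contribution of rows [ii,ii+iblock) to y[jj,jj+jblock)
 //       }
 //    }
 // \endcode
 //**********************************************************************************************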
4332 
4333  //**BLAS-based subtraction assignment to dense vectors (default)********************************
4347  template< typename VT1 // Type of the left-hand side target vector
4348  , typename VT2 // Type of the left-hand side vector operand
4349  , typename MT1 // Type of the right-hand side matrix operand
4350  , typename ST2 > // Type of the scalar value
4351  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4352  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4353  {
4354  selectLargeSubAssignKernel( y, x, A, scalar );
4355  }
4356  //**********************************************************************************************
4357 
4358  //**BLAS-based subtraction assignment to dense vectors******************************************
4359 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4360 
4373  template< typename VT1 // Type of the left-hand side target vector
4374  , typename VT2 // Type of the left-hand side vector operand
4375  , typename MT1 // Type of the right-hand side matrix operand
4376  , typename ST2 > // Type of the scalar value
4377  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4378  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4379  {
4380  using ET = ElementType_t<VT1>;
4381 
4382  if( IsTriangular_v<MT1> ) {
4383  ResultType_t<VT1> tmp( serial( scalar * x ) );
4384  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4385  subAssign( y, tmp );
4386  }
4387  else {
4388  gemv( y, x, A, ET(-scalar), ET(1) );
4389  }
4390  }
4391 #endif
4392  //**********************************************************************************************
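 //**Example: BLAS mapping of the subtraction assignment****************************************
 // In terms of Blaze expressions, the two branches of the BLAS kernel above amount to the
 // following (a minimal sketch; the triangular branch is what trmv realizes in place):
 //
 // \code
 //    // General case: gemv with alpha = -scalar and beta = 1
 //    y -= scalar * ( x * A );
 //
 //    // Triangular case: scale the vector first, multiply in place, then subtract
 //    ResultType_t<VT1> tmp( serial( scalar * x ) );
 //    trmv( tmp, A, IsLower_v<MT1> ? CblasLower : CblasUpper );
 //    y -= tmp;
 // \endcode
 //**********************************************************************************************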
4393 
4394  //**Subtraction assignment to sparse vectors****************************************************
4395  // No special implementation for the subtraction assignment to sparse vectors.
4396  //**********************************************************************************************
4397 
4398  //**Multiplication assignment to dense vectors**************************************************
4410  template< typename VT1 > // Type of the target dense vector
4411  friend inline void multAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4412  {
4414 
4418 
4419  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4420 
4421  const ResultType tmp( serial( rhs ) );
4422  multAssign( ~lhs, tmp );
4423  }
4424  //**********************************************************************************************
4425 
4426  //**Multiplication assignment to sparse vectors*************************************************
4427  // No special implementation for the multiplication assignment to sparse vectors.
4428  //**********************************************************************************************
4429 
4430  //**Division assignment to dense vectors********************************************************
4442  template< typename VT1 > // Type of the target dense vector
4443  friend inline void divAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4444  {
4446 
4450 
4451  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4452 
4453  const ResultType tmp( serial( rhs ) );
4454  divAssign( ~lhs, tmp );
4455  }
4456  //**********************************************************************************************
4457 
4458  //**Division assignment to sparse vectors*******************************************************
4459  // No special implementation for the division assignment to sparse vectors.
4460  //**********************************************************************************************
4461 
4462  //**SMP assignment to dense vectors*************************************************************
4476  template< typename VT1 > // Type of the target dense vector
4477  friend inline auto smpAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4478  -> EnableIf_t< UseSMPAssign_v<VT1> >
4479  {
4481 
4482  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4483 
4484  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4485  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4486 
4487  if( right.rows() == 0UL ) {
4488  reset( ~lhs );
4489  return;
4490  }
4491  else if( right.columns() == 0UL ) {
4492  return;
4493  }
4494 
4495  LT x( left ); // Evaluation of the left-hand side dense vector operand
4496  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4497 
4498  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4499  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4500  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4501  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4502 
4503  smpAssign( ~lhs, x * A * rhs.scalar_ );
4504  }
4505  //**********************************************************************************************
4506 
4507  //**SMP assignment to sparse vectors************************************************************
4521  template< typename VT1 > // Type of the target sparse vector
4522  friend inline auto smpAssign( SparseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4523  -> EnableIf_t< UseSMPAssign_v<VT1> >
4524  {
4526 
4530 
4531  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4532 
4533  const ResultType tmp( rhs );
4534  smpAssign( ~lhs, tmp );
4535  }
4536  //**********************************************************************************************
4537 
4538  //**SMP addition assignment to dense vectors****************************************************
4552  template< typename VT1 > // Type of the target dense vector
4553  friend inline auto smpAddAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4554  -> EnableIf_t< UseSMPAssign_v<VT1> >
4555  {
4557 
4558  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4559 
4560  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4561  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4562 
4563  if( right.rows() == 0UL || right.columns() == 0UL ) {
4564  return;
4565  }
4566 
4567  LT x( left ); // Evaluation of the left-hand side dense vector operand
4568  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4569 
4570  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4571  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4572  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4573  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4574 
4575  smpAddAssign( ~lhs, x * A * rhs.scalar_ );
4576  }
4577  //**********************************************************************************************
4578 
4579  //**SMP addition assignment to sparse vectors***************************************************
4580  // No special implementation for the SMP addition assignment to sparse vectors.
4581  //**********************************************************************************************
4582 
4583  //**SMP subtraction assignment to dense vectors*************************************************
4597  template< typename VT1 > // Type of the target dense vector
4598  friend inline auto smpSubAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4599  -> EnableIf_t< UseSMPAssign_v<VT1> >
4600  {
4602 
4603  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4604 
4605  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4606  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4607 
4608  if( right.rows() == 0UL || right.columns() == 0UL ) {
4609  return;
4610  }
4611 
4612  LT x( left ); // Evaluation of the left-hand side dense vector operand
4613  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4614 
4615  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4616  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4617  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4618  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4619 
4620  smpSubAssign( ~lhs, x * A * rhs.scalar_ );
4621  }
4622  //**********************************************************************************************
4623 
4624  //**SMP subtraction assignment to sparse vectors************************************************
4625  // No special implementation for the SMP subtraction assignment to sparse vectors.
4626  //**********************************************************************************************
4627 
4628  //**SMP multiplication assignment to dense vectors**********************************************
4643  template< typename VT1 > // Type of the target dense vector
4644  friend inline auto smpMultAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4645  -> EnableIf_t< UseSMPAssign_v<VT1> >
4646  {
4648 
4652 
4653  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4654 
4655  const ResultType tmp( rhs );
4656  smpMultAssign( ~lhs, tmp );
4657  }
4658  //**********************************************************************************************
4659 
4660  //**SMP multiplication assignment to sparse vectors*********************************************
4661  // No special implementation for the SMP multiplication assignment to sparse vectors.
4662  //**********************************************************************************************
4663 
4664  //**SMP division assignment to dense vectors****************************************************
4678  template< typename VT1 > // Type of the target dense vector
4679  friend inline auto smpDivAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4680  -> EnableIf_t< UseSMPAssign_v<VT1> >
4681  {
4683 
4687 
4688  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4689 
4690  const ResultType tmp( rhs );
4691  smpDivAssign( ~lhs, tmp );
4692  }
4693  //**********************************************************************************************
4694 
4695  //**SMP division assignment to sparse vectors***************************************************
4696  // No special implementation for the SMP division assignment to sparse vectors.
4697  //**********************************************************************************************
4698 
4699  //**Compile time checks*************************************************************************
4708  //**********************************************************************************************
4709 };
4711 //*************************************************************************************************
4712 
4713 
4714 
4715 
4716 //=================================================================================================
4717 //
4718 // GLOBAL BINARY ARITHMETIC OPERATORS
4719 //
4720 //=================================================================================================
4721 
4722 //*************************************************************************************************
4753 template< typename VT // Type of the left-hand side dense vector
4754  , typename MT > // Type of the right-hand side dense matrix
4755 inline decltype(auto)
4756  operator*( const DenseVector<VT,true>& vec, const DenseMatrix<MT,false>& mat )
4757 {
4759 
4761 
4762  if( (~vec).size() != (~mat).rows() ) {
4763  BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
4764  }
4765 
4766  using ReturnType = const TDVecDMatMultExpr<VT,MT>;
4767  return ReturnType( ~vec, ~mat );
4768 }
4769 //*************************************************************************************************
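//*************************************************************************************************
// Example: a minimal usage sketch of the operator above. The concrete container types are
// illustrative; any transpose dense vector and row-major dense matrix with compatible element
// types can be combined in the same way:
//
// \code
//    blaze::DynamicVector<double,blaze::rowVector> x{ 1.0, 2.0, 3.0 };
//    blaze::DynamicMatrix<double,blaze::rowMajor>  A( 3UL, 2UL, 1.0 );
//
//    // y[j] = sum_i x[i] * A(i,j); evaluation is deferred until the assignment
//    blaze::DynamicVector<double,blaze::rowVector> y( x * A );
// \endcode
//*************************************************************************************************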
4770 
4771 
4772 
4773 
4774 //=================================================================================================
4775 //
4776 // GLOBAL RESTRUCTURING BINARY ARITHMETIC OPERATORS
4777 //
4778 //=================================================================================================
4779 
4780 //*************************************************************************************************
4794 template< typename VT // Type of the left-hand side dense vector
4795  , typename MT > // Matrix base type of the right-hand side expression
4796 inline decltype(auto)
4797  operator*( const DenseVector<VT,true>& vec, const MatMatMultExpr<MT>& mat )
4798 {
4800 
4801  return ( vec * (~mat).leftOperand() ) * (~mat).rightOperand();
4802 }
4804 //*************************************************************************************************
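//*************************************************************************************************
// Example: effect of the restructuring operator above. Multiplying a transpose dense vector
// with a matrix/matrix product is rewritten so that only vector/matrix products are evaluated,
// avoiding the more expensive matrix/matrix multiplication (sizes are illustrative):
//
// \code
//    blaze::DynamicVector<double,blaze::rowVector> x( 100UL );
//    blaze::DynamicMatrix<double> A( 100UL, 50UL ), B( 50UL, 20UL );
//
//    // Internally evaluated as ( x * A ) * B
//    blaze::DynamicVector<double,blaze::rowVector> y( x * ( A * B ) );
// \endcode
//*************************************************************************************************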
4805 
4806 
4807 
4808 
4809 //=================================================================================================
4810 //
4811 // ISALIGNED SPECIALIZATIONS
4812 //
4813 //=================================================================================================
4814 
4815 //*************************************************************************************************
4817 template< typename VT, typename MT >
4818 struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4819  : public BoolConstant< IsAligned_v<VT> && IsAligned_v<MT> >
4820 {};
4822 //*************************************************************************************************
4823 
4824 } // namespace blaze
4825 
4826 #endif