TDVecDMatMultExpr.h
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/Exception.h>
60 #include <blaze/math/shims/Reset.h>
62 #include <blaze/math/SIMD.h>
82 #include <blaze/math/views/Check.h>
83 #include <blaze/system/BLAS.h>
88 #include <blaze/util/Assert.h>
89 #include <blaze/util/Complex.h>
91 #include <blaze/util/DisableIf.h>
92 #include <blaze/util/EnableIf.h>
95 #include <blaze/util/mpl/If.h>
96 #include <blaze/util/Types.h>
104 
105 
106 namespace blaze {
107 
108 //=================================================================================================
109 //
110 // CLASS TDVECDMATMULTEXPR
111 //
112 //=================================================================================================
113 
 114 //*************************************************************************************************
 115 /*!\brief Expression object for transpose dense vector-dense matrix multiplications.
 116 // \ingroup dense_vector_expression
 117 //
 118 // The TDVecDMatMultExpr class represents the compile time expression for multiplications
 119 // between transpose dense vectors and row-major dense matrices.
 120 */
 121 template< typename VT // Type of the left-hand side dense vector
122  , typename MT > // Type of the right-hand side dense matrix
123 class TDVecDMatMultExpr
124  : public TVecMatMultExpr< DenseVector< TDVecDMatMultExpr<VT,MT>, true > >
125  , private Computation
126 {
127  private:
 128  //**Type definitions****************************************************************************
 129  using VRT = ResultType_t<VT>;     //!< Result type of the left-hand side dense vector expression.
 130  using MRT = ResultType_t<MT>;     //!< Result type of the right-hand side dense matrix expression.
 131  using VET = ElementType_t<VRT>;   //!< Element type of the left-hand side dense vector expression.
 132  using MET = ElementType_t<MRT>;   //!< Element type of the right-hand side dense matrix expression.
 133  using VCT = CompositeType_t<VT>;  //!< Composite type of the left-hand side dense vector expression.
 134  using MCT = CompositeType_t<MT>;  //!< Composite type of the right-hand side dense matrix expression.
 135  //**********************************************************************************************
136 
137  //**********************************************************************************************
139  static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
140  //**********************************************************************************************
141 
142  //**********************************************************************************************
144  static constexpr bool evaluateMatrix =
145  ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
146  IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
147  //**********************************************************************************************
148 
149  //**********************************************************************************************
151 
155  template< typename T1 >
156  static constexpr bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
158  //**********************************************************************************************
159 
160  //**********************************************************************************************
162 
165  template< typename T1, typename T2, typename T3 >
 166  static constexpr bool UseBlasKernel_v =
 167  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION &&
 168  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
169  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
170  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
171  !IsDiagonal_v<T3> &&
172  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
173  IsBLASCompatible_v< ElementType_t<T1> > &&
174  IsBLASCompatible_v< ElementType_t<T2> > &&
175  IsBLASCompatible_v< ElementType_t<T3> > &&
176  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
177  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
179  //**********************************************************************************************
180 
181  //**********************************************************************************************
183 
187  template< typename T1, typename T2, typename T3 >
188  static constexpr bool UseVectorizedDefaultKernel_v =
189  ( useOptimizedKernels &&
190  !IsDiagonal_v<T3> &&
191  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
 192  IsSIMDCombinable_v< ElementType_t<T1>
 193  , ElementType_t<T2>
 194  , ElementType_t<T3> > &&
195  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
196  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
198  //**********************************************************************************************
199 
200  public:
 201  //**Type definitions****************************************************************************
 202  using This          = TDVecDMatMultExpr<VT,MT>;     //!< Type of this TDVecDMatMultExpr instance.
 203  using BaseType      = DenseVector<This,true>;       //!< Base type of this TDVecDMatMultExpr instance.
 204  using ResultType    = MultTrait_t<VRT,MRT>;         //!< Result type for expression template evaluations.
 205  using TransposeType = TransposeType_t<ResultType>;  //!< Transpose type for expression template evaluations.
 206  using ElementType   = ElementType_t<ResultType>;    //!< Resulting element type.
 207  using SIMDType      = SIMDTrait_t<ElementType>;     //!< Resulting SIMD element type.
 208  using ReturnType    = const ElementType;            //!< Return type for expression template evaluations.
209  using CompositeType = const ResultType;
210 
212  using LeftOperand = If_t< IsExpression_v<VT>, const VT, const VT& >;
213 
215  using RightOperand = If_t< IsExpression_v<MT>, const MT, const MT& >;
216 
219 
222  //**********************************************************************************************
223 
224  //**Compilation flags***************************************************************************
226  static constexpr bool simdEnabled =
227  ( !IsDiagonal_v<MT> &&
228  VT::simdEnabled && MT::simdEnabled &&
229  HasSIMDAdd_v<VET,MET> &&
230  HasSIMDMult_v<VET,MET> );
231 
 233  static constexpr bool smpAssignable =
 234  ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
 235  //**********************************************************************************************
236 
237  //**SIMD properties*****************************************************************************
239  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
240  //**********************************************************************************************
241 
242  //**Constructor*********************************************************************************
248  explicit inline TDVecDMatMultExpr( const VT& vec, const MT& mat ) noexcept
249  : vec_( vec ) // Left-hand side dense vector of the multiplication expression
250  , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
251  {
252  BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
253  }
254  //**********************************************************************************************
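   //**Usage sketch********************************************************************************
   // Illustrative example of how this expression type typically arises; the concrete operand
   // types below (DynamicVector/DynamicMatrix) are just one possible choice:
   //
   //    blaze::DynamicVector<double,blaze::rowVector> x( 3UL, 1.0 );
   //    blaze::DynamicMatrix<double,blaze::rowMajor>  A( 3UL, 5UL, 2.0 );
   //    blaze::DynamicVector<double,blaze::rowVector> y;
   //
   //    y = x * A;  // Creates a TDVecDMatMultExpr<VT,MT>, which is evaluated into y via the
   //                // assign()/selectAssignKernel() machinery below.
   //**********************************************************************************************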
255 
256  //**Subscript operator**************************************************************************
262  inline ReturnType operator[]( size_t index ) const {
263  BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
264 
265  if( IsDiagonal_v<MT> )
266  {
267  return vec_[index] * mat_(index,index);
268  }
269  else if( IsLower_v<MT> && ( index > 8UL ) )
270  {
271  const size_t begin( IsStrictlyLower_v<MT> ? index+1UL : index );
272  const size_t n ( mat_.rows() - begin );
273  return subvector( vec_, begin, n, unchecked ) *
274  subvector( column( mat_, index, unchecked ), begin, n, unchecked );
275  }
276  else if( IsUpper_v<MT> && ( index + 8UL < mat_.rows() ) )
277  {
278  const size_t n( IsStrictlyUpper_v<MT> ? index : index+1UL );
279  return subvector( vec_, 0UL, n, unchecked ) *
280  subvector( column( mat_, index, unchecked ), 0UL, n, unchecked );
281  }
282  else
283  {
284  return vec_ * column( mat_, index, unchecked );
285  }
286  }
287  //**********************************************************************************************
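   // Implementation note on operator[]: for triangular matrices only the structurally non-zero
   // part of the accessed column contributes, so the element is computed as a shortened dot
   // product over a subvector instead of the full vector. For a lower matrix the product starts
   // at 'index' (or 'index+1' in the strictly lower case); for an upper matrix it covers only
   // the leading 'index+1' (or 'index') elements. Indices close to the boundary fall back to
   // the full column product.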
288 
289  //**At function*********************************************************************************
296  inline ReturnType at( size_t index ) const {
297  if( index >= mat_.columns() ) {
298  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
299  }
300  return (*this)[index];
301  }
302  //**********************************************************************************************
303 
304  //**Size function*******************************************************************************
309  inline size_t size() const noexcept {
310  return mat_.columns();
311  }
312  //**********************************************************************************************
313 
314  //**Left operand access*************************************************************************
319  inline LeftOperand leftOperand() const noexcept {
320  return vec_;
321  }
322  //**********************************************************************************************
323 
324  //**Right operand access************************************************************************
329  inline RightOperand rightOperand() const noexcept {
330  return mat_;
331  }
332  //**********************************************************************************************
333 
334  //**********************************************************************************************
340  template< typename T >
341  inline bool canAlias( const T* alias ) const noexcept {
342  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
343  }
344  //**********************************************************************************************
345 
346  //**********************************************************************************************
352  template< typename T >
353  inline bool isAliased( const T* alias ) const noexcept {
354  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
355  }
356  //**********************************************************************************************
357 
358  //**********************************************************************************************
363  inline bool isAligned() const noexcept {
364  return vec_.isAligned() && mat_.isAligned();
365  }
366  //**********************************************************************************************
367 
368  //**********************************************************************************************
373  inline bool canSMPAssign() const noexcept {
374  return ( !BLAZE_BLAS_MODE ||
377  ( IsComputation_v<MT> && !evaluateMatrix ) ||
378  ( mat_.rows() * mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
379  ( size() > SMP_TDVECDMATMULT_THRESHOLD );
380  }
381  //**********************************************************************************************
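   // Note: canSMPAssign() only reports true for result vectors larger than
   // SMP_TDVECDMATMULT_THRESHOLD, and additionally suppresses a parallel (SMP) evaluation for
   // operations that are expected to be handed off to a BLAS backend anyway.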
382 
383  private:
 384  //**Member variables****************************************************************************
 385  LeftOperand  vec_;  //!< Left-hand side dense vector of the multiplication expression.
 386  RightOperand mat_;  //!< Right-hand side dense matrix of the multiplication expression.
 387  //**********************************************************************************************
388 
389  //**Assignment to dense vectors*****************************************************************
402  template< typename VT1 > // Type of the target dense vector
403  friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
404  {
406 
407  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
408 
409  if( rhs.mat_.rows() == 0UL ) {
410  reset( ~lhs );
411  return;
412  }
413  else if( rhs.mat_.columns() == 0UL ) {
414  return;
415  }
416 
417  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
418  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
419 
420  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
421  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
422  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
423  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
424 
425  TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
426  }
428  //**********************************************************************************************
429 
430  //**Assignment to dense vectors (kernel selection)**********************************************
441  template< typename VT1 // Type of the left-hand side target vector
442  , typename VT2 // Type of the left-hand side vector operand
443  , typename MT1 > // Type of the right-hand side matrix operand
444  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
445  {
446  if( ( IsDiagonal_v<MT1> ) ||
447  ( IsComputation_v<MT> && !evaluateMatrix ) ||
448  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
449  selectSmallAssignKernel( y, x, A );
450  else
451  selectBlasAssignKernel( y, x, A );
452  }
454  //**********************************************************************************************
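   // Kernel selection policy: diagonal matrices, non-evaluated matrix computations, and
   // products with fewer than TDVECDMATMULT_THRESHOLD matrix elements are routed to the small
   // kernel; all other cases go to the BLAS kernel, which itself falls back to the large
   // default kernel whenever no suitable BLAS routine is available for the element types.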
455 
456  //**Default assignment to dense vectors*********************************************************
470  template< typename VT1 // Type of the left-hand side target vector
471  , typename VT2 // Type of the left-hand side vector operand
472  , typename MT1 > // Type of the right-hand side matrix operand
473  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
474  {
475  const size_t M( A.rows() );
476  const size_t N( A.columns() );
477 
478  if( IsStrictlyUpper_v<MT1> ) {
479  reset( y[0] );
480  }
481 
482  if( !IsLower_v<MT1> )
483  {
484  const size_t jbegin( IsStrictlyUpper_v<MT1> ? 1UL : 0UL );
485  for( size_t j=jbegin; j<N; ++j ) {
486  y[j] = x[0UL] * A(0UL,j);
487  }
488  }
489 
490  for( size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
491  {
492  if( IsDiagonal_v<MT1> )
493  {
494  y[i] = x[i] * A(i,i);
495  }
496  else
497  {
498  const size_t jbegin( ( IsUpper_v<MT1> )
499  ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
500  :( 0UL ) );
501  const size_t jend( ( IsLower_v<MT1> )
502  ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
503  :( N ) );
504  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
505 
506  const size_t jnum( jend - jbegin );
507  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
508 
509  for( size_t j=jbegin; j<jpos; j+=2UL ) {
510  y[j ] += x[i] * A(i,j );
511  y[j+1UL] += x[i] * A(i,j+1UL);
512  }
513  if( jpos < jend ) {
514  y[jpos] += x[i] * A(i,jpos);
515  }
516  if( IsLower_v<MT1> ) {
517  y[jend] = x[i] * A(i,jend);
518  }
519  }
520  }
521 
522  if( IsStrictlyLower_v<MT1> ) {
523  reset( y[N-1UL] );
524  }
525  }
527  //**********************************************************************************************
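   // The default (scalar) kernel streams through A row by row: each x[i] is multiplied with
   // row i and accumulated into y using a 2-way unrolled inner loop. The jbegin/jend bounds
   // restrict the traversal to the structurally non-zero part of triangular matrices, and the
   // strictly upper/lower special cases explicitly reset the untouched elements y[0] and y[N-1].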
528 
529  //**Default assignment to dense vectors (small matrices)****************************************
543  template< typename VT1 // Type of the left-hand side target vector
544  , typename VT2 // Type of the left-hand side vector operand
545  , typename MT1 > // Type of the right-hand side matrix operand
546  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
547  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
548  {
549  selectDefaultAssignKernel( y, x, A );
550  }
552  //**********************************************************************************************
553 
554  //**Vectorized default assignment to dense vectors (small matrices)*****************************
568  template< typename VT1 // Type of the left-hand side target vector
569  , typename VT2 // Type of the left-hand side vector operand
570  , typename MT1 > // Type of the right-hand side matrix operand
571  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
572  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
573  {
574  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
575 
576  const size_t M( A.rows() );
577  const size_t N( A.columns() );
578 
579  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
580  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
581 
582  size_t j( 0UL );
583 
584  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
585  {
586  const size_t ibegin( ( IsLower_v<MT1> )
587  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
588  :( 0UL ) );
589  const size_t iend( ( IsUpper_v<MT1> )
590  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
591  :( M ) );
592  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
593 
594  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
595 
596  for( size_t i=ibegin; i<iend; ++i ) {
597  const SIMDType x1( set( x[i] ) );
598  xmm1 += x1 * A.load(i,j );
599  xmm2 += x1 * A.load(i,j+SIMDSIZE );
600  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
601  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
602  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
603  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
604  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
605  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
606  }
607 
608  y.store( j , xmm1 );
609  y.store( j+SIMDSIZE , xmm2 );
610  y.store( j+SIMDSIZE*2UL, xmm3 );
611  y.store( j+SIMDSIZE*3UL, xmm4 );
612  y.store( j+SIMDSIZE*4UL, xmm5 );
613  y.store( j+SIMDSIZE*5UL, xmm6 );
614  y.store( j+SIMDSIZE*6UL, xmm7 );
615  y.store( j+SIMDSIZE*7UL, xmm8 );
616  }
617 
618  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
619  {
620  const size_t ibegin( ( IsLower_v<MT1> )
621  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
622  :( 0UL ) );
623  const size_t iend( ( IsUpper_v<MT1> )
624  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
625  :( M ) );
626  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
627 
628  SIMDType xmm1, xmm2, xmm3, xmm4;
629 
630  for( size_t i=ibegin; i<iend; ++i ) {
631  const SIMDType x1( set( x[i] ) );
632  xmm1 += x1 * A.load(i,j );
633  xmm2 += x1 * A.load(i,j+SIMDSIZE );
634  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
635  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
636  }
637 
638  y.store( j , xmm1 );
639  y.store( j+SIMDSIZE , xmm2 );
640  y.store( j+SIMDSIZE*2UL, xmm3 );
641  y.store( j+SIMDSIZE*3UL, xmm4 );
642  }
643 
644  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
645  {
646  const size_t ibegin( ( IsLower_v<MT1> )
647  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
648  :( 0UL ) );
649  const size_t iend( ( IsUpper_v<MT1> )
650  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
651  :( M ) );
652  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
653 
654  SIMDType xmm1, xmm2, xmm3;
655 
656  for( size_t i=ibegin; i<iend; ++i ) {
657  const SIMDType x1( set( x[i] ) );
658  xmm1 += x1 * A.load(i,j );
659  xmm2 += x1 * A.load(i,j+SIMDSIZE );
660  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
661  }
662 
663  y.store( j , xmm1 );
664  y.store( j+SIMDSIZE , xmm2 );
665  y.store( j+SIMDSIZE*2UL, xmm3 );
666  }
667 
668  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
669  {
670  const size_t ibegin( ( IsLower_v<MT1> )
671  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
672  :( 0UL ) );
673  const size_t iend( ( IsUpper_v<MT1> )
674  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
675  :( M ) );
676  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
677 
678  SIMDType xmm1, xmm2;
679 
680  for( size_t i=ibegin; i<iend; ++i ) {
681  const SIMDType x1( set( x[i] ) );
682  xmm1 += x1 * A.load(i,j );
683  xmm2 += x1 * A.load(i,j+SIMDSIZE);
684  }
685 
686  y.store( j , xmm1 );
687  y.store( j+SIMDSIZE, xmm2 );
688  }
689 
690  for( ; j<jpos; j+=SIMDSIZE )
691  {
692  const size_t ibegin( ( IsLower_v<MT1> )
693  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
694  :( 0UL ) );
695  const size_t iend( ( IsUpper_v<MT1> )
696  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
697  :( M ) );
698  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
699 
700  SIMDType xmm1;
701 
702  for( size_t i=ibegin; i<iend; ++i ) {
703  xmm1 += set( x[i] ) * A.load(i,j);
704  }
705 
706  y.store( j, xmm1 );
707  }
708 
709  for( ; remainder && j<N; ++j )
710  {
711  const size_t ibegin( ( IsLower_v<MT1> )
712  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
713  :( 0UL ) );
714  const size_t iend( ( IsUpper_v<MT1> )
715  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
716  :( M ) );
717  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
718 
719  ElementType value{};
720 
721  for( size_t i=ibegin; i<iend; ++i ) {
722  value += x[i] * A(i,j);
723  }
724 
725  y[j] = value;
726  }
727  }
729  //**********************************************************************************************
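   // The small-matrix SIMD kernel processes the columns of A in panels of 8, 4, 3, 2, and
   // finally 1 SIMD vector(s), keeping one accumulator register per column block while the
   // rows are traversed once. The trailing scalar loop handles the remainder columns whenever
   // the operands are not padded to a multiple of SIMDSIZE.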
730 
731  //**Default assignment to dense vectors (large matrices)****************************************
745  template< typename VT1 // Type of the left-hand side target vector
746  , typename VT2 // Type of the left-hand side vector operand
747  , typename MT1 > // Type of the right-hand side matrix operand
748  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
749  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
750  {
751  selectDefaultAssignKernel( y, x, A );
752  }
754  //**********************************************************************************************
755 
756  //**Vectorized default assignment to dense vectors (large matrices)*****************************
770  template< typename VT1 // Type of the left-hand side target vector
771  , typename VT2 // Type of the left-hand side vector operand
772  , typename MT1 > // Type of the right-hand side matrix operand
773  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
774  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
775  {
776  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
777 
778  const size_t M( A.rows() );
779  const size_t N( A.columns() );
780 
781  const size_t jblock( 32768UL / sizeof( ElementType ) );
782  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
783 
784  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
785 
786  reset( y );
787 
 788  for( size_t jj=0UL; jj<N; jj+=jblock ) {
789  for( size_t ii=0UL; ii<M; ii+=iblock )
790  {
791  const size_t iend( min( ii+iblock, M ) );
792  const size_t jtmp( min( jj+jblock, N ) );
793  const size_t jend( ( IsLower_v<MT1> )
794  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
795  :( jtmp ) );
796 
797  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
798  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
799 
800  size_t j( ( IsUpper_v<MT1> )
801  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
802  :( jj ) );
803 
804  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
805  {
806  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
807 
808  for( size_t i=ii; i<iend; ++i ) {
809  const SIMDType x1( set( x[i] ) );
810  xmm1 += x1 * A.load(i,j );
811  xmm2 += x1 * A.load(i,j+SIMDSIZE );
812  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
813  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
814  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
815  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
816  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
817  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
818  }
819 
820  y.store( j , y.load(j ) + xmm1 );
821  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
822  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
823  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
824  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
825  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
826  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
827  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
828  }
829 
830  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
831  {
832  SIMDType xmm1, xmm2, xmm3, xmm4;
833 
834  for( size_t i=ii; i<iend; ++i ) {
835  const SIMDType x1( set( x[i] ) );
836  xmm1 += x1 * A.load(i,j );
837  xmm2 += x1 * A.load(i,j+SIMDSIZE );
838  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
839  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
840  }
841 
842  y.store( j , y.load(j ) + xmm1 );
843  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
844  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
845  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
846  }
847 
848  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
849  {
850  SIMDType xmm1, xmm2, xmm3;
851 
852  for( size_t i=ii; i<iend; ++i ) {
853  const SIMDType x1( set( x[i] ) );
854  xmm1 += x1 * A.load(i,j );
855  xmm2 += x1 * A.load(i,j+SIMDSIZE );
856  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
857  }
858 
859  y.store( j , y.load(j ) + xmm1 );
860  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
861  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
862  }
863 
864  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
865  {
866  SIMDType xmm1, xmm2;
867 
868  for( size_t i=ii; i<iend; ++i ) {
869  const SIMDType x1( set( x[i] ) );
870  xmm1 += x1 * A.load(i,j );
871  xmm2 += x1 * A.load(i,j+SIMDSIZE);
872  }
873 
874  y.store( j , y.load(j ) + xmm1 );
875  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
876  }
877 
878  for( ; j<jpos; j+=SIMDSIZE )
879  {
880  SIMDType xmm1;
881 
882  for( size_t i=ii; i<iend; ++i ) {
883  xmm1 += set( x[i] ) * A.load(i,j);
884  }
885 
886  y.store( j, y.load(j) + xmm1 );
887  }
888 
889  for( ; remainder && j<jend; ++j )
890  {
891  ElementType value{};
892 
893  for( size_t i=ii; i<iend; ++i ) {
894  value += x[i] * A(i,j);
895  }
896 
897  y[j] += value;
898  }
899  }
900  }
901  }
903  //**********************************************************************************************
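   // The large-matrix kernel adds cache blocking on top of the SIMD panels: the columns are
   // processed in blocks of jblock elements (roughly 32 KiB of data per block) and the rows in
   // blocks of iblock rows, so that the corresponding part of y remains cache-resident while
   // the partial results are accumulated via load/store round trips.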
904 
905  //**BLAS-based assignment to dense vectors (default)********************************************
919  template< typename VT1 // Type of the left-hand side target vector
920  , typename VT2 // Type of the left-hand side vector operand
921  , typename MT1 > // Type of the right-hand side matrix operand
922  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
923  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
924  {
925  selectLargeAssignKernel( y, x, A );
926  }
928  //**********************************************************************************************
929 
930  //**BLAS-based assignment to dense vectors******************************************************
931 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
932 
945  template< typename VT1 // Type of the left-hand side target vector
946  , typename VT2 // Type of the left-hand side vector operand
947  , typename MT1 > // Type of the right-hand side matrix operand
948  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
949  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
950  {
951  using ET = ElementType_t<VT1>;
952 
953  if( IsTriangular_v<MT1> ) {
954  assign( y, x );
955  trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
956  }
957  else {
958  gemv( y, x, A, ET(1), ET(0) );
959  }
960  }
962 #endif
963  //**********************************************************************************************
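   // With an active BLAS backend, triangular matrices are handled by copying x into y and
   // applying a triangular matrix/vector multiplication (trmv), while the general case uses
   // gemv() with alpha = 1 and beta = 0. For double-precision operands this roughly maps to
   // the following CBLAS call (an illustrative equivalent, not the exact call issued by
   // blaze::gemv):
   //
   //    cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0, A, lda, x, 1, 0.0, y, 1 );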
964 
965  //**Assignment to sparse vectors****************************************************************
978  template< typename VT1 > // Type of the target sparse vector
979  friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
980  {
982 
986 
987  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
988 
989  const ResultType tmp( serial( rhs ) );
990  assign( ~lhs, tmp );
991  }
993  //**********************************************************************************************
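   // Assignment to a sparse vector is implemented by evaluating the product into a dense
   // temporary of type ResultType first and then assigning that temporary to the sparse target.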
994 
995  //**Addition assignment to dense vectors********************************************************
1008  template< typename VT1 > // Type of the target dense vector
1009  friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
1010  {
1012 
1013  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1014 
1015  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1016  return;
1017  }
1018 
1019  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1020  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1021 
1022  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1023  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1024  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1025  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1026 
1027  TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1028  }
1030  //**********************************************************************************************
1031 
1032  //**Addition assignment to dense vectors (kernel selection)*************************************
1043  template< typename VT1 // Type of the left-hand side target vector
1044  , typename VT2 // Type of the left-hand side vector operand
1045  , typename MT1 > // Type of the right-hand side matrix operand
1046  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1047  {
1048  if( ( IsDiagonal_v<MT1> ) ||
1049  ( IsComputation_v<MT> && !evaluateMatrix ) ||
1050  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1051  selectSmallAddAssignKernel( y, x, A );
1052  else
1053  selectBlasAddAssignKernel( y, x, A );
1054  }
1056  //**********************************************************************************************
1057 
1058  //**Default addition assignment to dense vectors************************************************
1072  template< typename VT1 // Type of the left-hand side target vector
1073  , typename VT2 // Type of the left-hand side vector operand
1074  , typename MT1 > // Type of the right-hand side matrix operand
1075  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1076  {
1077  const size_t M( A.rows() );
1078  const size_t N( A.columns() );
1079 
1080  for( size_t i=0UL; i<M; ++i )
1081  {
1082  if( IsDiagonal_v<MT1> )
1083  {
1084  y[i] += x[i] * A(i,i);
1085  }
1086  else
1087  {
1088  const size_t jbegin( ( IsUpper_v<MT1> )
1089  ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1090  :( 0UL ) );
1091  const size_t jend( ( IsLower_v<MT1> )
1092  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1093  :( N ) );
1094  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1095 
1096  const size_t jnum( jend - jbegin );
1097  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1098 
1099  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1100  y[j ] += x[i] * A(i,j );
1101  y[j+1UL] += x[i] * A(i,j+1UL);
1102  }
1103  if( jpos < jend ) {
1104  y[jpos] += x[i] * A(i,jpos);
1105  }
1106  }
1107  }
1108  }
1110  //**********************************************************************************************
1111 
1112  //**Default addition assignment to dense vectors (small matrices)*******************************
1126  template< typename VT1 // Type of the left-hand side target vector
1127  , typename VT2 // Type of the left-hand side vector operand
1128  , typename MT1 > // Type of the right-hand side matrix operand
1129  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1130  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1131  {
1132  selectDefaultAddAssignKernel( y, x, A );
1133  }
1135  //**********************************************************************************************
1136 
1137  //**Vectorized default addition assignment to dense vectors (small matrices)********************
1151  template< typename VT1 // Type of the left-hand side target vector
1152  , typename VT2 // Type of the left-hand side vector operand
1153  , typename MT1 > // Type of the right-hand side matrix operand
1154  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1155  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1156  {
1157  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1158 
1159  const size_t M( A.rows() );
1160  const size_t N( A.columns() );
1161 
1162  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1163  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1164 
1165  size_t j( 0UL );
1166 
1167  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1168  {
1169  const size_t ibegin( ( IsLower_v<MT1> )
1170  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1171  :( 0UL ) );
1172  const size_t iend( ( IsUpper_v<MT1> )
1173  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1174  :( M ) );
1175  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1176 
1177  SIMDType xmm1( y.load(j ) );
1178  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1179  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1180  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1181  SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1182  SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1183  SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1184  SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1185 
1186  for( size_t i=ibegin; i<iend; ++i ) {
1187  const SIMDType x1( set( x[i] ) );
1188  xmm1 += x1 * A.load(i,j );
1189  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1190  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1191  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1192  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1193  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1194  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1195  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1196  }
1197 
1198  y.store( j , xmm1 );
1199  y.store( j+SIMDSIZE , xmm2 );
1200  y.store( j+SIMDSIZE*2UL, xmm3 );
1201  y.store( j+SIMDSIZE*3UL, xmm4 );
1202  y.store( j+SIMDSIZE*4UL, xmm5 );
1203  y.store( j+SIMDSIZE*5UL, xmm6 );
1204  y.store( j+SIMDSIZE*6UL, xmm7 );
1205  y.store( j+SIMDSIZE*7UL, xmm8 );
1206  }
1207 
1208  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1209  {
1210  const size_t ibegin( ( IsLower_v<MT1> )
1211  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1212  :( 0UL ) );
1213  const size_t iend( ( IsUpper_v<MT1> )
1214  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1215  :( M ) );
1216  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1217 
1218  SIMDType xmm1( y.load(j ) );
1219  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1220  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1221  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1222 
1223  for( size_t i=ibegin; i<iend; ++i ) {
1224  const SIMDType x1( set( x[i] ) );
1225  xmm1 += x1 * A.load(i,j );
1226  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1227  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1228  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1229  }
1230 
1231  y.store( j , xmm1 );
1232  y.store( j+SIMDSIZE , xmm2 );
1233  y.store( j+SIMDSIZE*2UL, xmm3 );
1234  y.store( j+SIMDSIZE*3UL, xmm4 );
1235  }
1236 
1237  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1238  {
1239  const size_t ibegin( ( IsLower_v<MT1> )
1240  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1241  :( 0UL ) );
1242  const size_t iend( ( IsUpper_v<MT1> )
1243  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1244  :( M ) );
1245  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1246 
1247  SIMDType xmm1( y.load(j ) );
1248  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1249  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1250 
1251  for( size_t i=ibegin; i<iend; ++i ) {
1252  const SIMDType x1( set( x[i] ) );
1253  xmm1 += x1 * A.load(i,j );
1254  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1255  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1256  }
1257 
1258  y.store( j , xmm1 );
1259  y.store( j+SIMDSIZE , xmm2 );
1260  y.store( j+SIMDSIZE*2UL, xmm3 );
1261  }
1262 
1263  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1264  {
1265  const size_t ibegin( ( IsLower_v<MT1> )
1266  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1267  :( 0UL ) );
1268  const size_t iend( ( IsUpper_v<MT1> )
1269  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1270  :( M ) );
1271  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1272 
1273  SIMDType xmm1( y.load(j ) );
1274  SIMDType xmm2( y.load(j+SIMDSIZE) );
1275 
1276  for( size_t i=ibegin; i<iend; ++i ) {
1277  const SIMDType x1( set( x[i] ) );
1278  xmm1 += x1 * A.load(i,j );
1279  xmm2 += x1 * A.load(i,j+SIMDSIZE);
1280  }
1281 
1282  y.store( j , xmm1 );
1283  y.store( j+SIMDSIZE, xmm2 );
1284  }
1285 
1286  for( ; j<jpos; j+=SIMDSIZE )
1287  {
1288  const size_t ibegin( ( IsLower_v<MT1> )
1289  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1290  :( 0UL ) );
1291  const size_t iend( ( IsUpper_v<MT1> )
1292  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1293  :( M ) );
1294  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1295 
1296  SIMDType xmm1( y.load(j) );
1297 
1298  for( size_t i=ibegin; i<iend; ++i ) {
1299  xmm1 += set( x[i] ) * A.load(i,j);
1300  }
1301 
1302  y.store( j, xmm1 );
1303  }
1304 
1305  for( ; remainder && j<N; ++j )
1306  {
1307  const size_t ibegin( ( IsLower_v<MT1> )
1308  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1309  :( 0UL ) );
1310  const size_t iend( ( IsUpper_v<MT1> )
1311  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1312  :( M ) );
1313  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1314 
1315  ElementType value{};
1316 
1317  for( size_t i=ibegin; i<iend; ++i ) {
1318  value += x[i] * A(i,j);
1319  }
1320 
1321  y[j] += value;
1322  }
1323  }
1325  //**********************************************************************************************
1326 
1327  //**Default addition assignment to dense vectors (large matrices)*******************************
1341  template< typename VT1 // Type of the left-hand side target vector
1342  , typename VT2 // Type of the left-hand side vector operand
1343  , typename MT1 > // Type of the right-hand side matrix operand
1344  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1345  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1346  {
1347  selectDefaultAddAssignKernel( y, x, A );
1348  }
1350  //**********************************************************************************************
1351 
1352  //**Vectorized default addition assignment to dense vectors (large matrices)********************
1366  template< typename VT1 // Type of the left-hand side target vector
1367  , typename VT2 // Type of the left-hand side vector operand
1368  , typename MT1 > // Type of the right-hand side matrix operand
1369  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1370  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1371  {
1372  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1373 
1374  const size_t M( A.rows() );
1375  const size_t N( A.columns() );
1376 
1377  const size_t jblock( 32768UL / sizeof( ElementType ) );
1378  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1379 
1380  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1381 
 1382  for( size_t jj=0UL; jj<N; jj+=jblock ) {
1383  for( size_t ii=0UL; ii<M; ii+=iblock )
1384  {
1385  const size_t iend( min( ii+iblock, M ) );
1386  const size_t jtmp( min( jj+jblock, N ) );
1387  const size_t jend( ( IsLower_v<MT1> )
1388  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
1389  :( jtmp ) );
1390 
1391  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1392  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1393 
1394  size_t j( ( IsUpper_v<MT1> )
1395  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
1396  :( jj ) );
1397 
1398  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1399  {
1400  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1401 
1402  for( size_t i=ii; i<iend; ++i ) {
1403  const SIMDType x1( set( x[i] ) );
1404  xmm1 += x1 * A.load(i,j );
1405  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1406  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1407  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1408  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1409  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1410  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1411  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1412  }
1413 
1414  y.store( j , y.load(j ) + xmm1 );
1415  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1416  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1417  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1418  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
1419  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
1420  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
1421  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
1422  }
1423 
1424  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1425  {
1426  SIMDType xmm1, xmm2, xmm3, xmm4;
1427 
1428  for( size_t i=ii; i<iend; ++i ) {
1429  const SIMDType x1( set( x[i] ) );
1430  xmm1 += x1 * A.load(i,j );
1431  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1432  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1433  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1434  }
1435 
1436  y.store( j , y.load(j ) + xmm1 );
1437  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1438  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1439  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1440  }
1441 
1442  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1443  {
1444  SIMDType xmm1, xmm2, xmm3;
1445 
1446  for( size_t i=ii; i<iend; ++i ) {
1447  const SIMDType x1( set( x[i] ) );
1448  xmm1 += x1 * A.load(i,j );
1449  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1450  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1451  }
1452 
1453  y.store( j , y.load(j ) + xmm1 );
1454  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1455  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1456  }
1457 
1458  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1459  {
1460  SIMDType xmm1, xmm2;
1461 
1462  for( size_t i=ii; i<iend; ++i ) {
1463  const SIMDType x1( set( x[i] ) );
1464  xmm1 += x1 * A.load(i,j );
1465  xmm2 += x1 * A.load(i,j+SIMDSIZE);
1466  }
1467 
1468  y.store( j , y.load(j ) + xmm1 );
1469  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
1470  }
1471 
1472  for( ; j<jpos; j+=SIMDSIZE )
1473  {
1474  SIMDType xmm1;
1475 
1476  for( size_t i=ii; i<iend; ++i ) {
1477  xmm1 += set( x[i] ) * A.load(i,j);
1478  }
1479 
1480  y.store( j, y.load(j) + xmm1 );
1481  }
1482 
1483  for( ; remainder && j<jend; ++j )
1484  {
1485  ElementType value{};
1486 
1487  for( size_t i=ii; i<iend; ++i ) {
1488  value += x[i] * A(i,j);
1489  }
1490 
1491  y[j] += value;
1492  }
1493  }
1494  }
1495  }
1497  //**********************************************************************************************
1498 
1499  //**BLAS-based addition assignment to dense vectors (default)***********************************
1513  template< typename VT1 // Type of the left-hand side target vector
1514  , typename VT2 // Type of the left-hand side vector operand
1515  , typename MT1 > // Type of the right-hand side matrix operand
1516  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1517  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1518  {
1519  selectLargeAddAssignKernel( y, x, A );
1520  }
1522  //**********************************************************************************************
1523 
1524  //**BLAS-based addition assignment to dense vectors*********************************************
1525 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1526 
1539  template< typename VT1 // Type of the left-hand side target vector
1540  , typename VT2 // Type of the left-hand side vector operand
1541  , typename MT1 > // Type of the right-hand side matrix operand
1542  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1543  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1544  {
1545  using ET = ElementType_t<VT1>;
1546 
1547  if( IsTriangular_v<MT1> ) {
1548  ResultType_t<VT1> tmp( serial( x ) );
1549  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1550  addAssign( y, tmp );
1551  }
1552  else {
1553  gemv( y, x, A, ET(1), ET(1) );
1554  }
1555  }
1557 #endif
1558  //**********************************************************************************************
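   // For the addition assignment the triangular case cannot update y in place, because trmv()
   // overwrites its vector argument. The product is therefore computed into a temporary copy
   // of x and added to y afterwards. The general case simply calls gemv() with beta = 1 so
   // that the product is accumulated onto the existing contents of y.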
1559 
1560  //**Addition assignment to sparse vectors*******************************************************
1561  // No special implementation for the addition assignment to sparse vectors.
1562  //**********************************************************************************************
1563 
1564  //**Subtraction assignment to dense vectors*****************************************************
1577  template< typename VT1 > // Type of the target dense vector
1578  friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
1579  {
1581 
1582  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1583 
1584  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1585  return;
1586  }
1587 
1588  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1589  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1590 
1591  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1592  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1593  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1594  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1595 
1596  TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1597  }
1599  //**********************************************************************************************
1600 
1601  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1612  template< typename VT1 // Type of the left-hand side target vector
1613  , typename VT2 // Type of the left-hand side vector operand
1614  , typename MT1 > // Type of the right-hand side matrix operand
1615  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1616  {
1617  if( ( IsDiagonal_v<MT1> ) ||
1618  ( IsComputation_v<MT> && !evaluateMatrix ) ||
1619  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1620  selectSmallSubAssignKernel( y, x, A );
1621  else
1622  selectBlasSubAssignKernel( y, x, A );
1623  }
1625  //**********************************************************************************************
1626 
1627  //**Default subtraction assignment to dense vectors*********************************************
1641  template< typename VT1 // Type of the left-hand side target vector
1642  , typename VT2 // Type of the left-hand side vector operand
1643  , typename MT1 > // Type of the right-hand side matrix operand
1644  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1645  {
1646  const size_t M( A.rows() );
1647  const size_t N( A.columns() );
1648 
1649  for( size_t i=0UL; i<M; ++i )
1650  {
1651  if( IsDiagonal_v<MT1> )
1652  {
1653  y[i] -= x[i] * A(i,i);
1654  }
1655  else
1656  {
1657  const size_t jbegin( ( IsUpper_v<MT1> )
1658  ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1659  :( 0UL ) );
1660  const size_t jend( ( IsLower_v<MT1> )
1661  ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1662  :( N ) );
1663  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1664 
1665  const size_t jnum( jend - jbegin );
1666  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1667 
1668  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1669  y[j ] -= x[i] * A(i,j );
1670  y[j+1UL] -= x[i] * A(i,j+1UL);
1671  }
1672  if( jpos < jend ) {
1673  y[jpos] -= x[i] * A(i,jpos);
1674  }
1675  }
1676  }
1677  }
1679  //**********************************************************************************************
1680 
1681  //**Default subtraction assignment to dense vectors (small matrices)****************************
1695  template< typename VT1 // Type of the left-hand side target vector
1696  , typename VT2 // Type of the left-hand side vector operand
1697  , typename MT1 > // Type of the right-hand side matrix operand
1698  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1699  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1700  {
1701  selectDefaultSubAssignKernel( y, x, A );
1702  }
1704  //**********************************************************************************************
1705 
1706  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
1721  template< typename VT1 // Type of the left-hand side target vector
1722  , typename VT2 // Type of the left-hand side vector operand
1723  , typename MT1 > // Type of the right-hand side matrix operand
1724  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1725  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1726  {
1727  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1728 
1729  const size_t M( A.rows() );
1730  const size_t N( A.columns() );
1731 
1732  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1733  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1734 
1735  size_t j( 0UL );
1736 
1737  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1738  {
1739  const size_t ibegin( ( IsLower_v<MT1> )
1740  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1741  :( 0UL ) );
1742  const size_t iend( ( IsUpper_v<MT1> )
1743  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1744  :( M ) );
1745  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1746 
1747  SIMDType xmm1( y.load(j ) );
1748  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1749  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1750  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1751  SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1752  SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1753  SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1754  SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1755 
1756  for( size_t i=ibegin; i<iend; ++i ) {
1757  const SIMDType x1( set( x[i] ) );
1758  xmm1 -= x1 * A.load(i,j );
1759  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1760  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1761  xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1762  xmm5 -= x1 * A.load(i,j+SIMDSIZE*4UL);
1763  xmm6 -= x1 * A.load(i,j+SIMDSIZE*5UL);
1764  xmm7 -= x1 * A.load(i,j+SIMDSIZE*6UL);
1765  xmm8 -= x1 * A.load(i,j+SIMDSIZE*7UL);
1766  }
1767 
1768  y.store( j , xmm1 );
1769  y.store( j+SIMDSIZE , xmm2 );
1770  y.store( j+SIMDSIZE*2UL, xmm3 );
1771  y.store( j+SIMDSIZE*3UL, xmm4 );
1772  y.store( j+SIMDSIZE*4UL, xmm5 );
1773  y.store( j+SIMDSIZE*5UL, xmm6 );
1774  y.store( j+SIMDSIZE*6UL, xmm7 );
1775  y.store( j+SIMDSIZE*7UL, xmm8 );
1776  }
1777 
1778  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1779  {
1780  const size_t ibegin( ( IsLower_v<MT1> )
1781  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1782  :( 0UL ) );
1783  const size_t iend( ( IsUpper_v<MT1> )
1784  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1785  :( M ) );
1786  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1787 
1788  SIMDType xmm1( y.load(j ) );
1789  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1790  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1791  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1792 
1793  for( size_t i=ibegin; i<iend; ++i ) {
1794  const SIMDType x1( set( x[i] ) );
1795  xmm1 -= x1 * A.load(i,j );
1796  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1797  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1798  xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1799  }
1800 
1801  y.store( j , xmm1 );
1802  y.store( j+SIMDSIZE , xmm2 );
1803  y.store( j+SIMDSIZE*2UL, xmm3 );
1804  y.store( j+SIMDSIZE*3UL, xmm4 );
1805  }
1806 
1807  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1808  {
1809  const size_t ibegin( ( IsLower_v<MT1> )
1810  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1811  :( 0UL ) );
1812  const size_t iend( ( IsUpper_v<MT1> )
1813  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1814  :( M ) );
1815  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1816 
1817  SIMDType xmm1( y.load(j ) );
1818  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1819  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1820 
1821  for( size_t i=ibegin; i<iend; ++i ) {
1822  const SIMDType x1( set( x[i] ) );
1823  xmm1 -= x1 * A.load(i,j );
1824  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1825  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1826  }
1827 
1828  y.store( j , xmm1 );
1829  y.store( j+SIMDSIZE , xmm2 );
1830  y.store( j+SIMDSIZE*2UL, xmm3 );
1831  }
1832 
1833  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1834  {
1835  const size_t ibegin( ( IsLower_v<MT1> )
1836  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1837  :( 0UL ) );
1838  const size_t iend( ( IsUpper_v<MT1> )
1839  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1840  :( M ) );
1841  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1842 
1843  SIMDType xmm1( y.load(j ) );
1844  SIMDType xmm2( y.load(j+SIMDSIZE) );
1845 
1846  for( size_t i=ibegin; i<iend; ++i ) {
1847  const SIMDType x1( set( x[i] ) );
1848  xmm1 -= x1 * A.load(i,j );
1849  xmm2 -= x1 * A.load(i,j+SIMDSIZE);
1850  }
1851 
1852  y.store( j , xmm1 );
1853  y.store( j+SIMDSIZE, xmm2 );
1854  }
1855 
1856  for( ; j<jpos; j+=SIMDSIZE )
1857  {
1858  const size_t ibegin( ( IsLower_v<MT1> )
1859  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1860  :( 0UL ) );
1861  const size_t iend( ( IsUpper_v<MT1> )
1862  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1863  :( M ) );
1864  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1865 
1866  SIMDType xmm1( y.load(j) );
1867 
1868  for( size_t i=ibegin; i<iend; ++i ) {
1869  xmm1 -= set( x[i] ) * A.load(i,j);
1870  }
1871 
1872  y.store( j, xmm1 );
1873  }
1874 
1875  for( ; remainder && j<N; ++j )
1876  {
1877  const size_t ibegin( ( IsLower_v<MT1> )
1878  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1879  :( 0UL ) );
1880  const size_t iend( ( IsUpper_v<MT1> )
1881  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1882  :( M ) );
1883  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1884 
1885  ElementType value{};
1886 
1887  for( size_t i=ibegin; i<iend; ++i ) {
1888  value += x[i] * A(i,j);
1889  }
1890 
1891  y[j] -= value;
1892  }
1893  }
1895  //**********************************************************************************************
1896 
1897  //**Default subtraction assignment to dense vectors (large matrices)****************************
1911  template< typename VT1 // Type of the left-hand side target vector
1912  , typename VT2 // Type of the left-hand side vector operand
1913  , typename MT1 > // Type of the right-hand side matrix operand
1914  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1915  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1916  {
1917  selectDefaultSubAssignKernel( y, x, A );
1918  }
1920  //**********************************************************************************************
1921 
1922  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
1937  template< typename VT1 // Type of the left-hand side target vector
1938  , typename VT2 // Type of the left-hand side vector operand
1939  , typename MT1 > // Type of the right-hand side matrix operand
1940  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1941  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1942  {
1943  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1944 
1945  const size_t M( A.rows() );
1946  const size_t N( A.columns() );
1947 
1948  const size_t jblock( 32768UL / sizeof( ElementType ) );
1949  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1950 
1951  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1952 
 1953  for( size_t jj=0UL; jj<N; jj+=jblock ) {
1954  for( size_t ii=0UL; ii<M; ii+=iblock )
1955  {
1956  const size_t iend( min( ii+iblock, M ) );
1957  const size_t jtmp( min( jj+jblock, N ) );
1958  const size_t jend( ( IsLower_v<MT1> )
1959  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
1960  :( jtmp ) );
1961 
1962  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1963  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1964 
1965  size_t j( ( IsUpper_v<MT1> )
1966  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
1967  :( jj ) );
1968 
1969  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1970  {
1971  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1972 
1973  for( size_t i=ii; i<iend; ++i ) {
1974  const SIMDType x1( set( x[i] ) );
1975  xmm1 += x1 * A.load(i,j );
1976  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1977  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1978  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1979  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1980  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1981  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1982  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1983  }
1984 
1985  y.store( j , y.load(j ) - xmm1 );
1986  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
1987  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
1988  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
1989  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5 );
1990  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6 );
1991  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7 );
1992  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8 );
1993  }
1994 
1995  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1996  {
1997  SIMDType xmm1, xmm2, xmm3, xmm4;
1998 
1999  for( size_t i=ii; i<iend; ++i ) {
2000  const SIMDType x1( set( x[i] ) );
2001  xmm1 += x1 * A.load(i,j );
2002  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2003  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2004  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2005  }
2006 
2007  y.store( j , y.load(j ) - xmm1 );
2008  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2009  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2010  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
2011  }
2012 
2013  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2014  {
2015  SIMDType xmm1, xmm2, xmm3;
2016 
2017  for( size_t i=ii; i<iend; ++i ) {
2018  const SIMDType x1( set( x[i] ) );
2019  xmm1 += x1 * A.load(i,j );
2020  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2021  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2022  }
2023 
2024  y.store( j , y.load(j ) - xmm1 );
2025  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2026  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2027  }
2028 
2029  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2030  {
2031  SIMDType xmm1, xmm2;
2032 
2033  for( size_t i=ii; i<iend; ++i ) {
2034  const SIMDType x1( set( x[i] ) );
2035  xmm1 += x1 * A.load(i,j );
2036  xmm2 += x1 * A.load(i,j+SIMDSIZE);
2037  }
2038 
2039  y.store( j , y.load(j ) - xmm1 );
2040  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2 );
2041  }
2042 
2043  for( ; j<jpos; j+=SIMDSIZE )
2044  {
2045  SIMDType xmm1;
2046 
2047  for( size_t i=ii; i<iend; ++i ) {
2048  xmm1 += set( x[i] ) * A.load(i,j);
2049  }
2050 
2051  y.store( j, y.load(j) - xmm1 );
2052  }
2053 
2054  for( ; remainder && j<jend; ++j )
2055  {
2056  ElementType value{};
2057 
2058  for( size_t i=ii; i<iend; ++i ) {
2059  value += x[i] * A(i,j);
2060  }
2061 
2062  y[j] -= value;
2063  }
2064  }
2065  }
2066  }
2068  //**********************************************************************************************
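  //**Note on the blocking scheme*****************************************************************
  // The large-matrix kernel above walks A in panels of jblock columns by iblock rows so that
  // the touched stripe of y and A can stay cache resident. As a rough worked example, assuming
  // ElementType is double (8 bytes): jblock = 32768/8 = 4096 columns, i.e. about 32 KiB of y per
  // panel, while iblock (4 or 8 rows) limits how many rows of x and A are combined per pass.
  //**********************************************************************************************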
2069 
2070  //**BLAS-based subtraction assignment to dense vectors (default)********************************
2084  template< typename VT1 // Type of the left-hand side target vector
2085  , typename VT2 // Type of the left-hand side vector operand
2086  , typename MT1 > // Type of the right-hand side matrix operand
2087  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2088  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2089  {
2090  selectLargeSubAssignKernel( y, x, A );
2091  }
2093  //**********************************************************************************************
2094 
2095  //**BLAS-based subtraction assignment to dense vectors******************************************
2096 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2097 
2110  template< typename VT1 // Type of the left-hand side target vector
2111  , typename VT2 // Type of the left-hand side vector operand
2112  , typename MT1 > // Type of the right-hand side matrix operand
2113  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2114  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2115  {
2116  using ET = ElementType_t<VT1>;
2117 
2118  if( IsTriangular_v<MT1> ) {
2119  ResultType_t<VT1> tmp( serial( x ) );
2120  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2121  subAssign( y, tmp );
2122  }
2123  else {
2124  gemv( y, x, A, ET(-1), ET(1) );
2125  }
2126  }
2128 #endif
2129  //**********************************************************************************************
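  //**Usage sketch********************************************************************************
  // A minimal sketch of an expression that reaches the subtraction assignment kernels above,
  // assuming blaze::DynamicVector/DynamicMatrix operands of matching sizes:
  //
  //    blaze::DynamicVector<double,blaze::rowVector> x( 64UL, 1.0 ), y( 128UL, 0.0 );
  //    blaze::DynamicMatrix<double,blaze::rowMajor>  A( 64UL, 128UL, 2.0 );
  //    y -= x * A;  // general A: single gemv() with alpha = -1 and beta = 1;
  //                 // triangular A: trmv() on a temporary, then subAssign()
  //**********************************************************************************************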
2130 
2131  //**Subtraction assignment to sparse vectors****************************************************
2132  // No special implementation for the subtraction assignment to sparse vectors.
2133  //**********************************************************************************************
2134 
2135  //**Multiplication assignment to dense vectors**************************************************
2148  template< typename VT1 > // Type of the target dense vector
2149  friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2150  {
2152 
2156 
2157  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2158 
2159  const ResultType tmp( serial( rhs ) );
2160  multAssign( ~lhs, tmp );
2161  }
2163  //**********************************************************************************************
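  //**Usage sketch********************************************************************************
  // Componentwise multiplication assignment has no specialized kernel: as shown above, the
  // product is evaluated serially into a ResultType temporary first and then multiplied into
  // the target, e.g. for operands of matching sizes:
  //
  //    y *= x * A;  // tmp = serial( x * A );  multAssign( y, tmp );
  //**********************************************************************************************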
2164 
2165  //**Multiplication assignment to sparse vectors*************************************************
2166  // No special implementation for the multiplication assignment to sparse vectors.
2167  //**********************************************************************************************
2168 
2169  //**Division assignment to dense vectors********************************************************
2182  template< typename VT1 > // Type of the target dense vector
2183  friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2184  {
2186 
2190 
2191  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2192 
2193  const ResultType tmp( serial( rhs ) );
2194  divAssign( ~lhs, tmp );
2195  }
2197  //**********************************************************************************************
2198 
2199  //**Division assignment to sparse vectors*******************************************************
2200  // No special implementation for the division assignment to sparse vectors.
2201  //**********************************************************************************************
2202 
2203  //**SMP assignment to dense vectors*************************************************************
2218  template< typename VT1 > // Type of the target dense vector
2219  friend inline auto smpAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2220  -> EnableIf_t< UseSMPAssign_v<VT1> >
2221  {
2223 
2224  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2225 
2226  if( rhs.mat_.rows() == 0UL ) {
2227  reset( ~lhs );
2228  return;
2229  }
2230  else if( rhs.mat_.columns() == 0UL ) {
2231  return;
2232  }
2233 
2234  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2235  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2236 
2237  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2238  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2239  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2240  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2241 
2242  smpAssign( ~lhs, x * A );
2243  }
2245  //**********************************************************************************************
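  //**Note on the SMP assignment******************************************************************
  // The overload above is selected only when the vector or matrix operand has to be evaluated
  // up front (UseSMPAssign_v). It evaluates x and A and re-dispatches via smpAssign( ~lhs, x * A ),
  // so the remaining expression with plain operands is handled by the general shared-memory-
  // parallel assignment machinery, e.g.:
  //
  //    y = x * A;  // typically run in parallel only if the operands are smpAssignable and the
  //                // problem size exceeds SMP_TDVECDMATMULT_THRESHOLD
  //**********************************************************************************************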
2246 
2247  //**SMP assignment to sparse vectors************************************************************
2262  template< typename VT1 > // Type of the target sparse vector
2263  friend inline auto smpAssign( SparseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2264  -> EnableIf_t< UseSMPAssign_v<VT1> >
2265  {
2267 
2271 
2272  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2273 
2274  const ResultType tmp( rhs );
2275  smpAssign( ~lhs, tmp );
2276  }
2278  //**********************************************************************************************
2279 
2280  //**SMP addition assignment to dense vectors****************************************************
2295  template< typename VT1 > // Type of the target dense vector
2296  friend inline auto smpAddAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2297  -> EnableIf_t< UseSMPAssign_v<VT1> >
2298  {
2300 
2301  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2302 
2303  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2304  return;
2305  }
2306 
2307  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2308  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2309 
2310  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2311  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2312  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2313  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2314 
2315  smpAddAssign( ~lhs, x * A );
2316  }
2318  //**********************************************************************************************
2319 
2320  //**SMP addition assignment to sparse vectors***************************************************
2321  // No special implementation for the SMP addition assignment to sparse vectors.
2322  //**********************************************************************************************
2323 
2324  //**SMP subtraction assignment to dense vectors*************************************************
2339  template< typename VT1 > // Type of the target dense vector
2340  friend inline auto smpSubAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2341  -> EnableIf_t< UseSMPAssign_v<VT1> >
2342  {
2344 
2345  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2346 
2347  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2348  return;
2349  }
2350 
2351  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2352  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2353 
2354  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2355  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2356  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2357  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2358 
2359  smpSubAssign( ~lhs, x * A );
2360  }
2362  //**********************************************************************************************
2363 
2364  //**SMP subtraction assignment to sparse vectors************************************************
2365  // No special implementation for the SMP subtraction assignment to sparse vectors.
2366  //**********************************************************************************************
2367 
2368  //**SMP multiplication assignment to dense vectors**********************************************
2383  template< typename VT1 > // Type of the target dense vector
2384  friend inline auto smpMultAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2385  -> EnableIf_t< UseSMPAssign_v<VT1> >
2386  {
2388 
2392 
2393  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2394 
2395  const ResultType tmp( rhs );
2396  smpMultAssign( ~lhs, tmp );
2397  }
2399  //**********************************************************************************************
2400 
2401  //**SMP multiplication assignment to sparse vectors*********************************************
2402  // No special implementation for the SMP multiplication assignment to sparse vectors.
2403  //**********************************************************************************************
2404 
2405  //**SMP division assignment to dense vectors****************************************************
2420  template< typename VT1 > // Type of the target dense vector
2421  friend inline auto smpDivAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2422  -> EnableIf_t< UseSMPAssign_v<VT1> >
2423  {
2425 
2429 
2430  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2431 
2432  const ResultType tmp( rhs );
2433  smpDivAssign( ~lhs, tmp );
2434  }
2436  //**********************************************************************************************
2437 
2438  //**SMP division assignment to sparse vectors***************************************************
2439  // No special implementation for the SMP division assignment to sparse vectors.
2440  //**********************************************************************************************
2441 
2442  //**Compile time checks*************************************************************************
2450  //**********************************************************************************************
2451 };
2452 //*************************************************************************************************
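//*************************************************************************************************
// The specialization below covers the case where the transpose vector/matrix product is
// additionally scaled by a scalar. Instead of evaluating the product and scaling the result in a
// second pass, the scalar is folded directly into the assignment kernels. A minimal sketch,
// assuming blaze::DynamicVector/DynamicMatrix operands:
//
//    blaze::DynamicVector<double,blaze::rowVector> x( 64UL, 1.0 ), y( 128UL );
//    blaze::DynamicMatrix<double,blaze::rowMajor>  A( 64UL, 128UL, 2.0 );
//    y = ( x * A ) * 0.5;  // handled by DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >
//*************************************************************************************************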
2453 
2454 
2455 
2456 
2457 //=================================================================================================
2458 //
2459 // DVECSCALARMULTEXPR SPECIALIZATION
2460 //
2461 //=================================================================================================
2462 
2463 //*************************************************************************************************
2471 template< typename VT // Type of the left-hand side dense vector
2472  , typename MT // Type of the right-hand side dense matrix
 2473  , typename ST > // Type of the right-hand side scalar value
2474 class DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >
2475  : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true > >
2476  , private Computation
2477 {
2478  private:
2479  //**Type definitions****************************************************************************
2480  using VMM = TDVecDMatMultExpr<VT,MT>;
2481  using RES = ResultType_t<VMM>;
2482  using VRT = ResultType_t<VT>;
2483  using MRT = ResultType_t<MT>;
2484  using VET = ElementType_t<VRT>;
2485  using MET = ElementType_t<MRT>;
2486  using VCT = CompositeType_t<VT>;
2487  using MCT = CompositeType_t<MT>;
2488  //**********************************************************************************************
2489 
2490  //**********************************************************************************************
2492  static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
2493  //**********************************************************************************************
2494 
2495  //**********************************************************************************************
2497  static constexpr bool evaluateMatrix =
2498  ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2499  IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
2500  //**********************************************************************************************
2501 
2502  //**********************************************************************************************
2504 
2507  template< typename T1 >
2508  static constexpr bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
2509  //**********************************************************************************************
2510 
2511  //**********************************************************************************************
2513 
2515  template< typename T1, typename T2, typename T3, typename T4 >
2516  static constexpr bool UseBlasKernel_v =
2518  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2519  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2520  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2521  !IsDiagonal_v<T3> &&
2522  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2523  IsBLASCompatible_v< ElementType_t<T1> > &&
2524  IsBLASCompatible_v< ElementType_t<T2> > &&
2525  IsBLASCompatible_v< ElementType_t<T3> > &&
2526  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2527  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2528  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
2529  //**********************************************************************************************
2530 
2531  //**********************************************************************************************
2533 
2536  template< typename T1, typename T2, typename T3, typename T4 >
2537  static constexpr bool UseVectorizedDefaultKernel_v =
2538  ( useOptimizedKernels &&
2539  !IsDiagonal_v<T3> &&
2540  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2541  IsSIMDCombinable_v< ElementType_t<T1>
2542  , ElementType_t<T2>
2543  , ElementType_t<T3>
2544  , T4 > &&
2545  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2546  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
2547  //**********************************************************************************************
2548 
2549  public:
2550  //**Type definitions****************************************************************************
2551  using This = DVecScalarMultExpr<VMM,ST,true>;
2552  using BaseType = DenseVector<This,true>;
2553  using ResultType = MultTrait_t<RES,ST>;
2554  using TransposeType = TransposeType_t<ResultType>;
2555  using ElementType = ElementType_t<ResultType>;
2556  using SIMDType = SIMDTrait_t<ElementType>;
2557  using ReturnType = const ElementType;
2558  using CompositeType = const ResultType;
2559 
2561  using LeftOperand = const TDVecDMatMultExpr<VT,MT>;
2562 
2564  using RightOperand = ST;
2565 
2567  using LT = If_t< evaluateVector, const VRT, VCT >;
2568 
2570  using RT = If_t< evaluateMatrix, const MRT, MCT >;
2571  //**********************************************************************************************
2572 
2573  //**Compilation flags***************************************************************************
2575  static constexpr bool simdEnabled =
2576  ( !IsDiagonal_v<MT> &&
2577  VT::simdEnabled && MT::simdEnabled &&
2578  IsSIMDCombinable_v<VET,MET,ST> &&
2579  HasSIMDAdd_v<VET,MET> &&
2580  HasSIMDMult_v<VET,MET> );
2581 
2583  static constexpr bool smpAssignable =
2584  ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
2585  //**********************************************************************************************
2586 
2587  //**SIMD properties*****************************************************************************
2589  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
2590  //**********************************************************************************************
2591 
2592  //**Constructor*********************************************************************************
2598  explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )
2599  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2600  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2601  {}
2602  //**********************************************************************************************
2603 
2604  //**Subscript operator**************************************************************************
2610  inline ReturnType operator[]( size_t index ) const {
2611  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2612  return vector_[index] * scalar_;
2613  }
2614  //**********************************************************************************************
2615 
2616  //**At function*********************************************************************************
2623  inline ReturnType at( size_t index ) const {
2624  if( index >= vector_.size() ) {
2625  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2626  }
2627  return (*this)[index];
2628  }
2629  //**********************************************************************************************
2630 
2631  //**Size function*******************************************************************************
2636  inline size_t size() const {
2637  return vector_.size();
2638  }
2639  //**********************************************************************************************
2640 
2641  //**Left operand access*************************************************************************
2646  inline LeftOperand leftOperand() const {
2647  return vector_;
2648  }
2649  //**********************************************************************************************
2650 
2651  //**Right operand access************************************************************************
2656  inline RightOperand rightOperand() const {
2657  return scalar_;
2658  }
2659  //**********************************************************************************************
2660 
2661  //**********************************************************************************************
2667  template< typename T >
2668  inline bool canAlias( const T* alias ) const {
2669  return vector_.canAlias( alias );
2670  }
2671  //**********************************************************************************************
2672 
2673  //**********************************************************************************************
2679  template< typename T >
2680  inline bool isAliased( const T* alias ) const {
2681  return vector_.isAliased( alias );
2682  }
2683  //**********************************************************************************************
2684 
2685  //**********************************************************************************************
2690  inline bool isAligned() const {
2691  return vector_.isAligned();
2692  }
2693  //**********************************************************************************************
2694 
2695  //**********************************************************************************************
2700  inline bool canSMPAssign() const noexcept {
2701  RightOperand_t<VMM> A( vector_.rightOperand() );
2702  return ( !BLAZE_BLAS_MODE ||
2705  ( IsComputation_v<MT> && !evaluateMatrix ) ||
2706  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2707  ( size() > SMP_TDVECDMATMULT_THRESHOLD );
2708  }
2709  //**********************************************************************************************
2710 
2711  private:
2712  //**Member variables****************************************************************************
2715  //**********************************************************************************************
2716 
2717  //**Assignment to dense vectors*****************************************************************
2729  template< typename VT1 > // Type of the target dense vector
2730  friend inline void assign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
2731  {
2733 
2734  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2735 
2736  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
2737  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
2738 
2739  if( right.rows() == 0UL ) {
2740  reset( ~lhs );
2741  return;
2742  }
2743  else if( right.columns() == 0UL ) {
2744  return;
2745  }
2746 
2747  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
2748  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
2749 
2750  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
2751  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
2752  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
2753  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2754 
2755  DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
2756  }
2757  //**********************************************************************************************
2758 
2759  //**Assignment to dense vectors (kernel selection)**********************************************
2770  template< typename VT1 // Type of the left-hand side target vector
2771  , typename VT2 // Type of the left-hand side vector operand
2772  , typename MT1 // Type of the right-hand side matrix operand
2773  , typename ST2 > // Type of the scalar value
2774  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2775  {
2776  if( ( IsDiagonal_v<MT1> ) ||
2777  ( IsComputation_v<MT> && !evaluateMatrix ) ||
2778  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2779  selectSmallAssignKernel( y, x, A, scalar );
2780  else
2781  selectBlasAssignKernel( y, x, A, scalar );
2782  }
2783  //**********************************************************************************************
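  //**Kernel contract*****************************************************************************
  // All assignment kernels selected above compute the scaled transpose product
  //
  //    y[j] = scalar * sum_i x[i] * A(i,j)   for j = 0, ..., A.columns()-1,
  //
  // with i restricted to the nonzero band for (strictly) lower/upper triangular A. The small
  // kernel is chosen for diagonal A, for unevaluated computation operands, or when
  // A.rows()*A.columns() is below TDVECDMATMULT_THRESHOLD; otherwise the BLAS kernel (or its
  // large-matrix fallback) takes over.
  //**********************************************************************************************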
2784 
2785  //**Default assignment to dense vectors*********************************************************
2799  template< typename VT1 // Type of the left-hand side target vector
2800  , typename VT2 // Type of the left-hand side vector operand
2801  , typename MT1 // Type of the right-hand side matrix operand
2802  , typename ST2 > // Type of the scalar value
2803  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2804  {
2805  const size_t M( A.rows() );
2806  const size_t N( A.columns() );
2807 
2808  if( IsStrictlyUpper_v<MT1> ) {
2809  reset( y[0] );
2810  }
2811 
2812  if( !IsLower_v<MT1> )
2813  {
2814  for( size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<N; ++j ) {
2815  y[j] = x[0UL] * A(0UL,j);
2816  }
2817  }
2818 
2819  for( size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
2820  {
2821  if( IsDiagonal_v<MT1> )
2822  {
2823  y[i] = x[i] * A(i,i) * scalar;
2824  }
2825  else
2826  {
2827  const size_t jbegin( ( IsUpper_v<MT1> )
2828  ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
2829  :( 0UL ) );
2830  const size_t jend( ( IsLower_v<MT1> )
2831  ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
2832  :( N ) );
2833  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2834 
2835  const size_t jnum( jend - jbegin );
2836  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2837 
2838  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2839  y[j ] += x[i] * A(i,j );
2840  y[j+1UL] += x[i] * A(i,j+1UL);
2841  }
2842  if( jpos < jend ) {
2843  y[jpos] += x[i] * A(i,jpos);
2844  }
2845  if( IsLower_v<MT1> ) {
2846  y[jend] = x[i] * A(i,jend);
2847  }
2848  }
2849  }
2850 
2851  if( IsStrictlyLower_v<MT1> ) {
2852  reset( y[N-1UL] );
2853  }
2854 
2855  if( !IsDiagonal_v<MT1> )
2856  {
 2857  const size_t jend( IsStrictlyLower_v<MT1> ? N-1UL : N );
 2858  for( size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<jend; ++j ) {
2859  y[j] *= scalar;
2860  }
2861  }
2862  }
2863  //**********************************************************************************************
2864 
2865  //**Default assignment to dense vectors (small matrices)****************************************
2879  template< typename VT1 // Type of the left-hand side target vector
2880  , typename VT2 // Type of the left-hand side vector operand
2881  , typename MT1 // Type of the right-hand side matrix operand
2882  , typename ST2 > // Type of the scalar value
2883  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2884  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2885  {
2886  selectDefaultAssignKernel( y, x, A, scalar );
2887  }
2888  //**********************************************************************************************
2889 
 2890  //**Vectorized default assignment to dense vectors (small matrices)*****************************
2904  template< typename VT1 // Type of the left-hand side target vector
2905  , typename VT2 // Type of the left-hand side vector operand
2906  , typename MT1 // Type of the right-hand side matrix operand
2907  , typename ST2 > // Type of the scalar value
2908  static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2909  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2910  {
2911  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
2912 
2913  const size_t M( A.rows() );
2914  const size_t N( A.columns() );
2915 
2916  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2917  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2918 
2919  const SIMDType factor( set( scalar ) );
2920 
2921  size_t j( 0UL );
2922 
2923  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
2924  {
2925  const size_t ibegin( ( IsLower_v<MT1> )
2926  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2927  :( 0UL ) );
2928  const size_t iend( ( IsUpper_v<MT1> )
2929  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2930  :( M ) );
2931  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2932 
2933  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2934 
2935  for( size_t i=ibegin; i<iend; ++i ) {
2936  const SIMDType x1( set( x[i] ) );
2937  xmm1 += x1 * A.load(i,j );
2938  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2939  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2940  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2941  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
2942  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
2943  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
2944  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
2945  }
2946 
2947  y.store( j , xmm1*factor );
2948  y.store( j+SIMDSIZE , xmm2*factor );
2949  y.store( j+SIMDSIZE*2UL, xmm3*factor );
2950  y.store( j+SIMDSIZE*3UL, xmm4*factor );
2951  y.store( j+SIMDSIZE*4UL, xmm5*factor );
2952  y.store( j+SIMDSIZE*5UL, xmm6*factor );
2953  y.store( j+SIMDSIZE*6UL, xmm7*factor );
2954  y.store( j+SIMDSIZE*7UL, xmm8*factor );
2955  }
2956 
2957  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2958  {
2959  const size_t ibegin( ( IsLower_v<MT1> )
2960  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2961  :( 0UL ) );
2962  const size_t iend( ( IsUpper_v<MT1> )
2963  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2964  :( M ) );
2965  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2966 
2967  SIMDType xmm1, xmm2, xmm3, xmm4;
2968 
2969  for( size_t i=ibegin; i<iend; ++i ) {
2970  const SIMDType x1( set( x[i] ) );
2971  xmm1 += x1 * A.load(i,j );
2972  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2973  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2974  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2975  }
2976 
2977  y.store( j , xmm1*factor );
2978  y.store( j+SIMDSIZE , xmm2*factor );
2979  y.store( j+SIMDSIZE*2UL, xmm3*factor );
2980  y.store( j+SIMDSIZE*3UL, xmm4*factor );
2981  }
2982 
2983  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2984  {
2985  const size_t ibegin( ( IsLower_v<MT1> )
2986  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2987  :( 0UL ) );
2988  const size_t iend( ( IsUpper_v<MT1> )
2989  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2990  :( M ) );
2991  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2992 
2993  SIMDType xmm1, xmm2, xmm3;
2994 
2995  for( size_t i=ibegin; i<iend; ++i ) {
2996  const SIMDType x1( set( x[i] ) );
2997  xmm1 += x1 * A.load(i,j );
2998  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2999  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3000  }
3001 
3002  y.store( j , xmm1*factor );
3003  y.store( j+SIMDSIZE , xmm2*factor );
3004  y.store( j+SIMDSIZE*2UL, xmm3*factor );
3005  }
3006 
3007  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3008  {
3009  const size_t ibegin( ( IsLower_v<MT1> )
3010  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3011  :( 0UL ) );
3012  const size_t iend( ( IsUpper_v<MT1> )
3013  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3014  :( M ) );
3015  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3016 
3017  SIMDType xmm1, xmm2;
3018 
3019  for( size_t i=ibegin; i<iend; ++i ) {
3020  const SIMDType x1( set( x[i] ) );
3021  xmm1 += x1 * A.load(i,j );
3022  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3023  }
3024 
3025  y.store( j , xmm1*factor );
3026  y.store( j+SIMDSIZE, xmm2*factor );
3027  }
3028 
3029  for( ; j<jpos; j+=SIMDSIZE )
3030  {
3031  const size_t ibegin( ( IsLower_v<MT1> )
3032  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3033  :( 0UL ) );
3034  const size_t iend( ( IsUpper_v<MT1> )
3035  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3036  :( M ) );
3037  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3038 
3039  SIMDType xmm1;
3040 
3041  for( size_t i=ibegin; i<iend; ++i ) {
3042  xmm1 += set( x[i] ) * A.load(i,j);
3043  }
3044 
3045  y.store( j, xmm1*factor );
3046  }
3047 
3048  for( ; remainder && j<N; ++j )
3049  {
3050  const size_t ibegin( ( IsLower_v<MT1> )
3051  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3052  :( 0UL ) );
3053  const size_t iend( ( IsUpper_v<MT1> )
3054  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3055  :( M ) );
3056  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3057 
3058  ElementType value{};
3059 
3060  for( size_t i=ibegin; i<iend; ++i ) {
3061  value += x[i] * A(i,j);
3062  }
3063 
3064  y[j] = value * scalar;
3065  }
3066  }
3067  //**********************************************************************************************
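  //**Note on remainder handling******************************************************************
  // With unpadded operands the peel loops run up to jpos = N & size_t(-SIMDSIZE), i.e. N rounded
  // down to a multiple of SIMDSIZE. For example, assuming SIMDSIZE == 4 and N == 10: jpos == 8,
  // so columns 0..7 use SIMD loads/stores and columns 8..9 are handled by the scalar tail loop.
  //**********************************************************************************************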
3068 
3069  //**Default assignment to dense vectors (large matrices)****************************************
3083  template< typename VT1 // Type of the left-hand side target vector
3084  , typename VT2 // Type of the left-hand side vector operand
3085  , typename MT1 // Type of the right-hand side matrix operand
3086  , typename ST2 > // Type of the scalar value
3087  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3088  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3089  {
3090  selectDefaultAssignKernel( y, x, A, scalar );
3091  }
3092  //**********************************************************************************************
3093 
 3094  //**Vectorized default assignment to dense vectors (large matrices)*****************************
3108  template< typename VT1 // Type of the left-hand side target vector
3109  , typename VT2 // Type of the left-hand side vector operand
3110  , typename MT1 // Type of the right-hand side matrix operand
3111  , typename ST2 > // Type of the scalar value
3112  static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3113  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3114  {
3115  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3116 
3117  const size_t M( A.rows() );
3118  const size_t N( A.columns() );
3119 
3120  const size_t jblock( 32768UL / sizeof( ElementType ) );
3121  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3122 
3123  const SIMDType factor( set( scalar ) );
3124 
3125  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3126 
3127  reset( y );
3128 
 3129  for( size_t jj=0UL; jj<N; jj+=jblock ) {
3130  for( size_t ii=0UL; ii<M; ii+=iblock )
3131  {
3132  const size_t iend( min( ii+iblock, M ) );
3133  const size_t jtmp( min( jj+jblock, N ) );
3134  const size_t jend( ( IsLower_v<MT1> )
3135  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3136  :( jtmp ) );
3137 
3138  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3139  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3140 
3141  size_t j( ( IsUpper_v<MT1> )
3142  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
3143  :( jj ) );
3144 
3145  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3146  {
3147  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3148 
3149  for( size_t i=ii; i<iend; ++i ) {
3150  const SIMDType x1( set( x[i] ) );
3151  xmm1 += x1 * A.load(i,j );
3152  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3153  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3154  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3155  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3156  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3157  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3158  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3159  }
3160 
3161  y.store( j , y.load(j ) + xmm1*factor );
3162  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3163  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3164  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3165  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3166  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3167  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3168  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3169  }
3170 
3171  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3172  {
3173  SIMDType xmm1, xmm2, xmm3, xmm4;
3174 
3175  for( size_t i=ii; i<iend; ++i ) {
3176  const SIMDType x1( set( x[i] ) );
3177  xmm1 += x1 * A.load(i,j );
3178  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3179  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3180  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3181  }
3182 
3183  y.store( j , y.load(j ) + xmm1*factor );
3184  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3185  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3186  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3187  }
3188 
3189  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3190  {
3191  SIMDType xmm1, xmm2, xmm3;
3192 
3193  for( size_t i=ii; i<iend; ++i ) {
3194  const SIMDType x1( set( x[i] ) );
3195  xmm1 += x1 * A.load(i,j );
3196  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3197  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3198  }
3199 
3200  y.store( j , y.load(j ) + xmm1*factor );
3201  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3202  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3203  }
3204 
3205  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3206  {
3207  SIMDType xmm1, xmm2;
3208 
3209  for( size_t i=ii; i<iend; ++i ) {
3210  const SIMDType x1( set( x[i] ) );
3211  xmm1 += x1 * A.load(i,j );
3212  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3213  }
3214 
3215  y.store( j , y.load(j ) + xmm1*factor );
3216  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3217  }
3218 
3219  for( ; j<jpos; j+=SIMDSIZE )
3220  {
3221  SIMDType xmm1;
3222 
3223  for( size_t i=ii; i<iend; ++i ) {
3224  xmm1 += set( x[i] ) * A.load(i,j);
3225  }
3226 
3227  y.store( j, y.load(j) + xmm1*factor );
3228  }
3229 
3230  for( ; remainder && j<jend; ++j )
3231  {
3232  ElementType value{};
3233 
3234  for( size_t i=ii; i<iend; ++i ) {
3235  value += x[i] * A(i,j);
3236  }
3237 
3238  y[j] += value * scalar;
3239  }
3240  }
3241  }
3242  }
3243  //**********************************************************************************************
3244 
3245  //**BLAS-based assignment to dense vectors (default)********************************************
3258  template< typename VT1 // Type of the left-hand side target vector
3259  , typename VT2 // Type of the left-hand side vector operand
3260  , typename MT1 // Type of the right-hand side matrix operand
3261  , typename ST2 > // Type of the scalar value
3262  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3263  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3264  {
3265  selectLargeAssignKernel( y, x, A, scalar );
3266  }
3267  //**********************************************************************************************
3268 
3269  //**BLAS-based assignment to dense vectors******************************************************
3270 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3271 
3284  template< typename VT1 // Type of the left-hand side target vector
3285  , typename VT2 // Type of the left-hand side vector operand
3286  , typename MT1 // Type of the right-hand side matrix operand
3287  , typename ST2 > // Type of the scalar value
3288  static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3289  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3290  {
3291  using ET = ElementType_t<VT1>;
3292 
3293  if( IsTriangular_v<MT1> ) {
3294  assign( y, scalar * x );
3295  trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3296  }
3297  else {
3298  gemv( y, x, A, ET(scalar), ET(0) );
3299  }
3300  }
3301 #endif
3302  //**********************************************************************************************
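  //**Usage sketch********************************************************************************
  // When the BLAS kernel above is selected, the scaled product is computed by a single gemv()
  // call with alpha = scalar and beta = 0; for triangular A, y is first assigned scalar*x and
  // then multiplied in place via trmv(). For instance:
  //
  //    y = ( x * A ) * 2.0;  // general A: gemv( y, x, A, ET(2.0), ET(0) )
  //**********************************************************************************************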
3303 
3304  //**Assignment to sparse vectors****************************************************************
3316  template< typename VT1 > // Type of the target sparse vector
3317  friend inline void assign( SparseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3318  {
3320 
3324 
3325  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3326 
3327  const ResultType tmp( serial( rhs ) );
3328  assign( ~lhs, tmp );
3329  }
3330  //**********************************************************************************************
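  //**Usage sketch********************************************************************************
  // Assignment to a sparse target has no dedicated kernel: the dense ResultType temporary is
  // evaluated serially and then assigned, e.g. assuming a compressed vector of matching size:
  //
  //    blaze::CompressedVector<double,blaze::rowVector> z( 128UL );
  //    z = ( x * A ) * 2.0;  // tmp = serial( rhs );  assign( z, tmp );
  //**********************************************************************************************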
3331 
3332  //**Addition assignment to dense vectors********************************************************
3344  template< typename VT1 > // Type of the target dense vector
3345  friend inline void addAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3346  {
3348 
3349  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3350 
3351  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3352  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3353 
3354  if( right.rows() == 0UL || right.columns() == 0UL ) {
3355  return;
3356  }
3357 
3358  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3359  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3360 
3361  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3362  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3363  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3364  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3365 
3366  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3367  }
3368  //**********************************************************************************************
3369 
3370  //**Addition assignment to dense vectors (kernel selection)*************************************
3381  template< typename VT1 // Type of the left-hand side target vector
3382  , typename VT2 // Type of the left-hand side vector operand
3383  , typename MT1 // Type of the right-hand side matrix operand
3384  , typename ST2 > // Type of the scalar value
3385  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3386  {
3387  if( ( IsDiagonal_v<MT1> ) ||
3388  ( IsComputation_v<MT> && !evaluateMatrix ) ||
3389  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3390  selectSmallAddAssignKernel( y, x, A, scalar );
3391  else
3392  selectBlasAddAssignKernel( y, x, A, scalar );
3393  }
3394  //**********************************************************************************************
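  //**Usage sketch********************************************************************************
  // The addition assignment kernels selected above accumulate the scaled product into the
  // target without zeroing it first, e.g.:
  //
  //    y += ( x * A ) * 2.0;  // BLAS path: gemv( y, x, A, ET(2.0), ET(1) ), i.e. beta = 1
  //**********************************************************************************************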
3395 
3396  //**Default addition assignment to dense vectors************************************************
3410  template< typename VT1 // Type of the left-hand side target vector
3411  , typename VT2 // Type of the left-hand side vector operand
3412  , typename MT1 // Type of the right-hand side matrix operand
3413  , typename ST2 > // Type of the scalar value
3414  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3415  {
3416  y.addAssign( x * A * scalar );
3417  }
3418  //**********************************************************************************************
3419 
3420  //**Default addition assignment to dense vectors (small matrices)*******************************
3434  template< typename VT1 // Type of the left-hand side target vector
3435  , typename VT2 // Type of the left-hand side vector operand
3436  , typename MT1 // Type of the right-hand side matrix operand
3437  , typename ST2 > // Type of the scalar value
3438  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3439  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3440  {
3441  selectDefaultAddAssignKernel( y, x, A, scalar );
3442  }
3443  //**********************************************************************************************
3444 
3445  //**Vectorized default addition assignment to dense vectors (small matrices)********************
3460  template< typename VT1 // Type of the left-hand side target vector
3461  , typename VT2 // Type of the left-hand side vector operand
3462  , typename MT1 // Type of the right-hand side matrix operand
3463  , typename ST2 > // Type of the scalar value
3464  static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3465  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3466  {
3467  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3468 
3469  const size_t M( A.rows() );
3470  const size_t N( A.columns() );
3471 
3472  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3473  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3474 
3475  const SIMDType factor( set( scalar ) );
3476 
3477  size_t j( 0UL );
3478 
3479  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3480  {
3481  const size_t ibegin( ( IsLower_v<MT1> )
3482  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3483  :( 0UL ) );
3484  const size_t iend( ( IsUpper_v<MT1> )
3485  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3486  :( M ) );
3487  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3488 
3489  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3490 
3491  for( size_t i=ibegin; i<iend; ++i ) {
3492  const SIMDType x1( set( x[i] ) );
3493  xmm1 += x1 * A.load(i,j );
3494  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3495  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3496  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3497  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3498  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3499  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3500  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3501  }
3502 
3503  y.store( j , y.load(j ) + xmm1*factor );
3504  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3505  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3506  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3507  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3508  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3509  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3510  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3511  }
3512 
3513  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3514  {
3515  const size_t ibegin( ( IsLower_v<MT1> )
3516  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3517  :( 0UL ) );
3518  const size_t iend( ( IsUpper_v<MT1> )
3519  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3520  :( M ) );
3521  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3522 
3523  SIMDType xmm1, xmm2, xmm3, xmm4;
3524 
3525  for( size_t i=ibegin; i<iend; ++i ) {
3526  const SIMDType x1( set( x[i] ) );
3527  xmm1 += x1 * A.load(i,j );
3528  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3529  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3530  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3531  }
3532 
3533  y.store( j , y.load(j ) + xmm1*factor );
3534  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3535  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3536  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3537  }
3538 
3539  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3540  {
3541  const size_t ibegin( ( IsLower_v<MT1> )
3542  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3543  :( 0UL ) );
3544  const size_t iend( ( IsUpper_v<MT1> )
3545  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3546  :( M ) );
3547  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3548 
3549  SIMDType xmm1, xmm2, xmm3;
3550 
3551  for( size_t i=ibegin; i<iend; ++i ) {
3552  const SIMDType x1( set( x[i] ) );
3553  xmm1 += x1 * A.load(i,j );
3554  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3555  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3556  }
3557 
3558  y.store( j , y.load(j ) + xmm1*factor );
3559  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3560  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3561  }
3562 
3563  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3564  {
3565  const size_t ibegin( ( IsLower_v<MT1> )
3566  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3567  :( 0UL ) );
3568  const size_t iend( ( IsUpper_v<MT1> )
3569  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3570  :( M ) );
3571  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3572 
3573  SIMDType xmm1, xmm2;
3574 
3575  for( size_t i=ibegin; i<iend; ++i ) {
3576  const SIMDType x1( set( x[i] ) );
3577  xmm1 += x1 * A.load(i,j );
3578  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3579  }
3580 
3581  y.store( j , y.load(j ) + xmm1*factor );
3582  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3583  }
3584 
3585  for( ; j<jpos; j+=SIMDSIZE )
3586  {
3587  const size_t ibegin( ( IsLower_v<MT1> )
3588  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3589  :( 0UL ) );
3590  const size_t iend( ( IsUpper_v<MT1> )
3591  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3592  :( M ) );
3593  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3594 
3595  SIMDType xmm1;
3596 
3597  for( size_t i=ibegin; i<iend; ++i ) {
3598  xmm1 += set( x[i] ) * A.load(i,j);
3599  }
3600 
3601  y.store( j, y.load(j) + xmm1*factor );
3602  }
3603 
3604  for( ; remainder && j<N; ++j )
3605  {
3606  const size_t ibegin( ( IsLower_v<MT1> )
3607  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3608  :( 0UL ) );
3609  const size_t iend( ( IsUpper_v<MT1> )
3610  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3611  :( M ) );
3612  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3613 
3614  ElementType value{};
3615 
3616  for( size_t i=ibegin; i<iend; ++i ) {
3617  value += x[i] * A(i,j);
3618  }
3619 
3620  y[j] += value * scalar;
3621  }
3622  }
3623  //**********************************************************************************************
3624 
3625  //**Default addition assignment to dense vectors (large matrices)*******************************
3639  template< typename VT1 // Type of the left-hand side target vector
3640  , typename VT2 // Type of the left-hand side vector operand
3641  , typename MT1 // Type of the right-hand side matrix operand
3642  , typename ST2 > // Type of the scalar value
3643  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3644  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3645  {
3646  selectDefaultAddAssignKernel( y, x, A, scalar );
3647  }
3648  //**********************************************************************************************
3649 
3650  //**Vectorized default addition assignment to dense vectors (large matrices)********************
3665  template< typename VT1 // Type of the left-hand side target vector
3666  , typename VT2 // Type of the left-hand side vector operand
3667  , typename MT1 // Type of the right-hand side matrix operand
3668  , typename ST2 > // Type of the scalar value
3669  static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3670  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3671  {
3672  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3673 
3674  const size_t M( A.rows() );
3675  const size_t N( A.columns() );
3676 
3677  const size_t jblock( 32768UL / sizeof( ElementType ) );
3678  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3679 
3680  const SIMDType factor( set( scalar ) );
3681 
3682  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3683 
 3684  for( size_t jj=0UL; jj<N; jj+=jblock ) {
3685  for( size_t ii=0UL; ii<M; ii+=iblock )
3686  {
3687  const size_t iend( min( ii+iblock, M ) );
3688  const size_t jtmp( min( jj+jblock, N ) );
3689  const size_t jend( ( IsLower_v<MT1> )
3690  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3691  :( jtmp ) );
3692 
3693  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3694  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3695 
3696  size_t j( ( IsUpper_v<MT1> )
3697  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
3698  :( jj ) );
3699 
3700  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3701  {
3702  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3703 
3704  for( size_t i=ii; i<iend; ++i ) {
3705  const SIMDType x1( set( x[i] ) );
3706  xmm1 += x1 * A.load(i,j );
3707  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3708  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3709  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3710  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3711  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3712  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3713  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3714  }
3715 
3716  y.store( j , y.load(j ) + xmm1*factor );
3717  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3718  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3719  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3720  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3721  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3722  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3723  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3724  }
3725 
3726  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3727  {
3728  SIMDType xmm1, xmm2, xmm3, xmm4;
3729 
3730  for( size_t i=ii; i<iend; ++i ) {
3731  const SIMDType x1( set( x[i] ) );
3732  xmm1 += x1 * A.load(i,j );
3733  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3734  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3735  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3736  }
3737 
3738  y.store( j , y.load(j ) + xmm1*factor );
3739  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3740  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3741  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3742  }
3743 
3744  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3745  {
3746  SIMDType xmm1, xmm2, xmm3;
3747 
3748  for( size_t i=ii; i<iend; ++i ) {
3749  const SIMDType x1( set( x[i] ) );
3750  xmm1 += x1 * A.load(i,j );
3751  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3752  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3753  }
3754 
3755  y.store( j , y.load(j ) + xmm1*factor );
3756  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3757  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3758  }
3759 
3760  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3761  {
3762  SIMDType xmm1, xmm2;
3763 
3764  for( size_t i=ii; i<iend; ++i ) {
3765  const SIMDType x1( set( x[i] ) );
3766  xmm1 += x1 * A.load(i,j );
3767  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3768  }
3769 
3770  y.store( j , y.load(j ) + xmm1*factor );
3771  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3772  }
3773 
3774  for( ; j<jpos; j+=SIMDSIZE )
3775  {
3776  SIMDType xmm1;
3777 
3778  for( size_t i=ii; i<iend; ++i ) {
3779  xmm1 += set( x[i] ) * A.load(i,j);
3780  }
3781 
3782  y.store( j, y.load(j) + xmm1*factor );
3783  }
3784 
3785  for( ; remainder && j<jend; ++j )
3786  {
3787  ElementType value{};
3788 
3789  for( size_t i=ii; i<iend; ++i ) {
3790  value += x[i] * A(i,j);
3791  }
3792 
3793  y[j] += value * scalar;
3794  }
3795  }
3796  }
3797  }
3798  //**********************************************************************************************
3799 
3800  //**BLAS-based addition assignment to dense vectors (default)***********************************
3814  template< typename VT1 // Type of the left-hand side target vector
3815  , typename VT2 // Type of the left-hand side vector operand
3816  , typename MT1 // Type of the right-hand side matrix operand
3817  , typename ST2 > // Type of the scalar value
3818  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3819  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3820  {
3821  selectLargeAddAssignKernel( y, x, A, scalar );
3822  }
3823  //**********************************************************************************************
3824 
3825  //**BLAS-based addition assignment to dense vectors*********************************************
3826 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3827 
3840  template< typename VT1 // Type of the left-hand side target vector
3841  , typename VT2 // Type of the left-hand side vector operand
3842  , typename MT1 // Type of the right-hand side matrix operand
3843  , typename ST2 > // Type of the scalar value
3844  static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3845  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3846  {
3847  using ET = ElementType_t<VT1>;
3848 
3849  if( IsTriangular_v<MT1> ) {
3850  ResultType_t<VT1> tmp( serial( scalar * x ) );
3851  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3852  addAssign( y, tmp );
3853  }
3854  else {
3855  gemv( y, x, A, ET(scalar), ET(1) );
3856  }
3857  }
3858 #endif
3859  //**********************************************************************************************
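   // In the BLAS-based kernel above, the scaling and the accumulation are carried entirely by
   // the gemv() alpha/beta parameters: alpha = scalar and beta = 1 yield y = scalar*(x*A) + y.
   // A plain scalar reference loop with the same semantics (purely an illustration of the
   // kernel arguments y, x, A, and scalar, not an additional code path):
   //
   //    for( std::size_t j=0UL; j<A.columns(); ++j ) {
   //       ElementType_t<VT1> sum{};
   //       for( std::size_t i=0UL; i<A.rows(); ++i ) {
   //          sum += x[i] * A(i,j);
   //       }
   //       y[j] += scalar * sum;   // beta == 1 preserves the previous contents of y
   //    }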
3860 
3861  //**Addition assignment to sparse vectors*******************************************************
3862  // No special implementation for the addition assignment to sparse vectors.
3863  //**********************************************************************************************
3864 
3865  //**Subtraction assignment to dense vectors*****************************************************
3877  template< typename VT1 > // Type of the target dense vector
3878  friend inline void subAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3879  {
3881 
3882  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3883 
3884  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3885  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3886 
3887  if( right.rows() == 0UL || right.columns() == 0UL ) {
3888  return;
3889  }
3890 
3891  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3892  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3893 
3894  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3895  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3896  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3897  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3898 
3899  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
3900  }
3901  //**********************************************************************************************
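   // A usage sketch for this subtraction assignment path (the container types and sizes are
   // chosen for illustration only): subtracting a scaled transpose vector/matrix product from a
   // dense row vector is intended to end up in the kernels selected below.
   //
   //    blaze::DynamicVector<double,blaze::rowVector> x( 50UL, 1.0 );
   //    blaze::DynamicMatrix<double,blaze::rowMajor>  A( 50UL, 80UL, 2.0 );
   //    blaze::DynamicVector<double,blaze::rowVector> y( 80UL, 5.0 );
   //
   //    y -= 3.0 * ( x * A );   // subtraction assignment of a scaled vector/matrix product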
3902 
3903  //**Subtraction assignment to dense vectors (kernel selection)**********************************
3914  template< typename VT1 // Type of the left-hand side target vector
3915  , typename VT2 // Type of the left-hand side vector operand
3916  , typename MT1 // Type of the right-hand side matrix operand
3917  , typename ST2 > // Type of the scalar value
3918  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3919  {
3920  if( ( IsDiagonal_v<MT1> ) ||
3921  ( IsComputation_v<MT> && !evaluateMatrix ) ||
3922  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3923  selectSmallSubAssignKernel( y, x, A, scalar );
3924  else
3925  selectBlasSubAssignKernel( y, x, A, scalar );
3926  }
3927  //**********************************************************************************************
3928 
3929  //**Default subtraction assignment to dense vectors*********************************************
3943  template< typename VT1 // Type of the left-hand side target vector
3944  , typename VT2 // Type of the left-hand side vector operand
3945  , typename MT1 // Type of the right-hand side matrix operand
3946  , typename ST2 > // Type of the scalar value
3947  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3948  {
3949  y.subAssign( x * A * scalar );
3950  }
3951  //**********************************************************************************************
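   // Semantics of this default kernel, written out element-wise: for every column index j the
   // update is y[j] -= scalar * sum_i( x[i] * A(i,j) ), realized here by simply forwarding the
   // row vector expression 'x * A * scalar' to the subAssign() member of the target vector.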
3952 
3953  //**Default subtraction assignment to dense vectors (small matrices)****************************
3967  template< typename VT1 // Type of the left-hand side target vector
3968  , typename VT2 // Type of the left-hand side vector operand
3969  , typename MT1 // Type of the right-hand side matrix operand
3970  , typename ST2 > // Type of the scalar value
3971  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3972  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3973  {
3974  selectDefaultSubAssignKernel( y, x, A, scalar );
3975  }
3976  //**********************************************************************************************
3977 
3978  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
3993  template< typename VT1 // Type of the left-hand side target vector
3994  , typename VT2 // Type of the left-hand side vector operand
3995  , typename MT1 // Type of the right-hand side matrix operand
3996  , typename ST2 > // Type of the scalar value
3997  static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3998  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3999  {
4000  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4001 
4002  const size_t M( A.rows() );
4003  const size_t N( A.columns() );
4004 
4005  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
4006  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4007 
4008  const SIMDType factor( set( scalar ) );
4009 
4010  size_t j( 0UL );
4011 
4012  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4013  {
4014  const size_t ibegin( ( IsLower_v<MT1> )
4015  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4016  :( 0UL ) );
4017  const size_t iend( ( IsUpper_v<MT1> )
4018  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4019  :( M ) );
4020  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4021 
4022  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4023 
4024  for( size_t i=ibegin; i<iend; ++i ) {
4025  const SIMDType x1( set( x[i] ) );
4026  xmm1 += x1 * A.load(i,j );
4027  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4028  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4029  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4030  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4031  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4032  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4033  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4034  }
4035 
4036  y.store( j , y.load(j ) - xmm1*factor );
4037  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4038  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4039  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4040  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4041  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4042  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4043  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4044  }
4045 
4046  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4047  {
4048  const size_t ibegin( ( IsLower_v<MT1> )
4049  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4050  :( 0UL ) );
4051  const size_t iend( ( IsUpper_v<MT1> )
4052  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4053  :( M ) );
4054  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4055 
4056  SIMDType xmm1, xmm2, xmm3, xmm4;
4057 
4058  for( size_t i=ibegin; i<iend; ++i ) {
4059  const SIMDType x1( set( x[i] ) );
4060  xmm1 += x1 * A.load(i,j );
4061  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4062  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4063  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4064  }
4065 
4066  y.store( j , y.load(j ) - xmm1*factor );
4067  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4068  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4069  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4070  }
4071 
4072  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4073  {
4074  const size_t ibegin( ( IsLower_v<MT1> )
4075  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4076  :( 0UL ) );
4077  const size_t iend( ( IsUpper_v<MT1> )
4078  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4079  :( M ) );
4080  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4081 
4082  SIMDType xmm1, xmm2, xmm3;
4083 
4084  for( size_t i=ibegin; i<iend; ++i ) {
4085  const SIMDType x1( set( x[i] ) );
4086  xmm1 += x1 * A.load(i,j );
4087  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4088  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4089  }
4090 
4091  y.store( j , y.load(j ) - xmm1*factor );
4092  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4093  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4094  }
4095 
4096  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4097  {
4098  const size_t ibegin( ( IsLower_v<MT1> )
4099  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4100  :( 0UL ) );
4101  const size_t iend( ( IsUpper_v<MT1> )
4102  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4103  :( M ) );
4104  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4105 
4106  SIMDType xmm1, xmm2;
4107 
4108  for( size_t i=ibegin; i<iend; ++i ) {
4109  const SIMDType x1( set( x[i] ) );
4110  xmm1 += x1 * A.load(i,j );
4111  xmm2 += x1 * A.load(i,j+SIMDSIZE);
4112  }
4113 
4114  y.store( j , y.load(j ) - xmm1*factor );
4115  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4116  }
4117 
4118  for( ; j<jpos; j+=SIMDSIZE )
4119  {
4120  const size_t ibegin( ( IsLower_v<MT1> )
4121  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4122  :( 0UL ) );
4123  const size_t iend( ( IsUpper_v<MT1> )
4124  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4125  :( M ) );
4126  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4127 
4128  SIMDType xmm1;
4129 
4130  for( size_t i=ibegin; i<iend; ++i ) {
4131  xmm1 += set( x[i] ) * A.load(i,j);
4132  }
4133 
4134  y.store( j, y.load(j) - xmm1*factor );
4135  }
4136 
4137  for( ; remainder && j<N; ++j )
4138  {
4139  const size_t ibegin( ( IsLower_v<MT1> )
4140  ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4141  :( 0UL ) );
4142  const size_t iend( ( IsUpper_v<MT1> )
4143  ?( min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4144  :( M ) );
4145  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4146 
4147  ElementType value{};
4148 
4149  for( size_t i=ibegin; i<iend; ++i ) {
4150  value += x[i] * A(i,j);
4151  }
4152 
4153  y[j] -= value * scalar;
4154  }
4155  }
4156  //**********************************************************************************************
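   // The ibegin/iend bounds above skip the rows of A that are known to be zero for triangular
   // matrix types. A minimal scalar sketch of the same reasoning for a single column j of a
   // strictly lower matrix L (all names in this sketch are illustrative):
   //
   //    const std::size_t ibegin( j+1UL );      // L(i,j) == 0 for all i <= j
   //    ElementType value{};
   //    for( std::size_t i=ibegin; i<M; ++i ) {
   //       value += x[i] * L(i,j);              // only the structurally non-zero rows contribute
   //    }
   //    y[j] -= value * scalar;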
4157 
4158  //**Default subtraction assignment to dense vectors (large matrices)****************************
4172  template< typename VT1 // Type of the left-hand side target vector
4173  , typename VT2 // Type of the left-hand side vector operand
4174  , typename MT1 // Type of the right-hand side matrix operand
4175  , typename ST2 > // Type of the scalar value
4176  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4177  -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4178  {
4179  selectDefaultSubAssignKernel( y, x, A, scalar );
4180  }
4181  //**********************************************************************************************
4182 
4183  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
4198  template< typename VT1 // Type of the left-hand side target vector
4199  , typename VT2 // Type of the left-hand side vector operand
4200  , typename MT1 // Type of the right-hand side matrix operand
4201  , typename ST2 > // Type of the scalar value
4202  static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4203  -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4204  {
4205  constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4206 
4207  const size_t M( A.rows() );
4208  const size_t N( A.columns() );
4209 
4210  const size_t jblock( 32768UL / sizeof( ElementType ) );
4211  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4212 
4213  const SIMDType factor( set( scalar ) );
4214 
4215  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
4216 
4217  for( size_t jj=0UL; jj<N; jj+=jblock ) {
4218  for( size_t ii=0UL; ii<M; ii+=iblock )
4219  {
4220  const size_t iend( min( ii+iblock, M ) );
4221  const size_t jtmp( min( jj+jblock, N ) );
4222  const size_t jend( ( IsLower_v<MT1> )
4223  ?( min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
4224  :( jtmp ) );
4225 
4226  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4227  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4228 
4229  size_t j( ( IsUpper_v<MT1> )
4230  ?( max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
4231  :( jj ) );
4232 
4233  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4234  {
4235  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4236 
4237  for( size_t i=ii; i<iend; ++i ) {
4238  const SIMDType x1( set( x[i] ) );
4239  xmm1 += x1 * A.load(i,j );
4240  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4241  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4242  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4243  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4244  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4245  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4246  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4247  }
4248 
4249  y.store( j , y.load(j ) - xmm1*factor );
4250  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4251  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4252  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4253  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4254  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4255  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4256  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4257  }
4258 
4259  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4260  {
4261  SIMDType xmm1, xmm2, xmm3, xmm4;
4262 
4263  for( size_t i=ii; i<iend; ++i ) {
4264  const SIMDType x1( set( x[i] ) );
4265  xmm1 += x1 * A.load(i,j );
4266  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4267  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4268  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4269  }
4270 
4271  y.store( j , y.load(j ) - xmm1*factor );
4272  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4273  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4274  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4275  }
4276 
4277  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4278  {
4279  SIMDType xmm1, xmm2, xmm3;
4280 
4281  for( size_t i=ii; i<iend; ++i ) {
4282  const SIMDType x1( set( x[i] ) );
4283  xmm1 += x1 * A.load(i,j );
4284  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4285  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4286  }
4287 
4288  y.store( j , y.load(j ) - xmm1*factor );
4289  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4290  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4291  }
4292 
4293  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4294  {
4295  SIMDType xmm1, xmm2;
4296 
4297  for( size_t i=ii; i<iend; ++i ) {
4298  const SIMDType x1( set( x[i] ) );
4299  xmm1 += x1 * A.load(i,j );
4300  xmm2 += x1 * A.load(i,j+SIMDSIZE);
4301  }
4302 
4303  y.store( j , y.load(j ) - xmm1*factor );
4304  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4305  }
4306 
4307  for( ; j<jpos; j+=SIMDSIZE )
4308  {
4309  SIMDType xmm1;
4310 
4311  for( size_t i=ii; i<iend; ++i ) {
4312  xmm1 += set( x[i] ) * A.load(i,j);
4313  }
4314 
4315  y.store( j, y.load(j) - xmm1*factor );
4316  }
4317 
4318  for( ; remainder && j<jend; ++j )
4319  {
4320  ElementType value{};
4321 
4322  for( size_t i=ii; i<iend; ++i ) {
4323  value += x[i] * A(i,j);
4324  }
4325 
4326  y[j] -= value * scalar;
4327  }
4328  }
4329  }
4330  }
4331  //**********************************************************************************************
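   // The block sizes above are chosen so that the stripe of y updated by the inner loops stays
   // cache resident. A quick worked example, assuming ElementType == double (8 bytes):
   //
   //    jblock = 32768 / sizeof(double) = 4096 columns
   //    => one stripe of y covers 4096 * 8 bytes = 32 KiB, comparable to a typical L1 data cache;
   //    iblock = 8 rows per tile of A if N < jblock, otherwise 4 rows per tile.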
4332 
4333  //**BLAS-based subtraction assignment to dense vectors (default)********************************
4347  template< typename VT1 // Type of the left-hand side target vector
4348  , typename VT2 // Type of the left-hand side vector operand
4349  , typename MT1 // Type of the right-hand side matrix operand
4350  , typename ST2 > // Type of the scalar value
4351  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4352  -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4353  {
4354  selectLargeSubAssignKernel( y, x, A, scalar );
4355  }
4356  //**********************************************************************************************
4357 
4358  //**BLAS-based subtraction assignment to dense vectors******************************************
4359 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4360 
4373  template< typename VT1 // Type of the left-hand side target vector
4374  , typename VT2 // Type of the left-hand side vector operand
4375  , typename MT1 // Type of the right-hand side matrix operand
4376  , typename ST2 > // Type of the scalar value
4377  static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4378  -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4379  {
4380  using ET = ElementType_t<VT1>;
4381 
4382  if( IsTriangular_v<MT1> ) {
4383  ResultType_t<VT1> tmp( serial( scalar * x ) );
4384  trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4385  subAssign( y, tmp );
4386  }
4387  else {
4388  gemv( y, x, A, ET(-scalar), ET(1) );
4389  }
4390  }
4391 #endif
4392  //**********************************************************************************************
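   // Summary of the two BLAS branches above (illustrative notation, not additional code paths):
   //
   //    general A    :  y = (-scalar) * ( x * A ) + 1 * y                       // one gemv() call
   //    triangular A :  tmp = scalar * x;  tmp = tmp * A via trmv();  y -= tmp  // trmv() + subAssign()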
4393 
4394  //**Subtraction assignment to sparse vectors****************************************************
4395  // No special implementation for the subtraction assignment to sparse vectors.
4396  //**********************************************************************************************
4397 
4398  //**Multiplication assignment to dense vectors**************************************************
4410  template< typename VT1 > // Type of the target dense vector
4411  friend inline void multAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4412  {
4414 
4418 
4419  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4420 
4421  const ResultType tmp( serial( rhs ) );
4422  multAssign( ~lhs, tmp );
4423  }
4424  //**********************************************************************************************
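   // A usage sketch for the multiplication assignment (names and sizes assumed for illustration):
   // the right-hand side is first evaluated serially into a temporary and the target is then
   // scaled componentwise by that temporary.
   //
   //    blaze::DynamicVector<double,blaze::rowVector> x( 10UL, 1.0 );
   //    blaze::DynamicMatrix<double,blaze::rowMajor>  A( 10UL, 10UL, 1.0 );
   //    blaze::DynamicVector<double,blaze::rowVector> y( 10UL, 2.0 );
   //
   //    y *= 0.5 * ( x * A );   // componentwise: y[j] *= 0.5 * sum_i( x[i] * A(i,j) )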
4425 
4426  //**Multiplication assignment to sparse vectors*************************************************
4427  // No special implementation for the multiplication assignment to sparse vectors.
4428  //**********************************************************************************************
4429 
4430  //**Division assignment to dense vectors********************************************************
4442  template< typename VT1 > // Type of the target dense vector
4443  friend inline void divAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4444  {
4446 
4450 
4451  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4452 
4453  const ResultType tmp( serial( rhs ) );
4454  divAssign( ~lhs, tmp );
4455  }
4456  //**********************************************************************************************
4457 
4458  //**Division assignment to sparse vectors*******************************************************
4459  // No special implementation for the division assignment to sparse vectors.
4460  //**********************************************************************************************
4461 
4462  //**SMP assignment to dense vectors*************************************************************
4476  template< typename VT1 > // Type of the target dense vector
4477  friend inline auto smpAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4478  -> EnableIf_t< UseSMPAssign_v<VT1> >
4479  {
4481 
4482  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4483 
4484  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4485  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4486 
4487  if( right.rows() == 0UL ) {
4488  reset( ~lhs );
4489  return;
4490  }
4491  else if( right.columns() == 0UL ) {
4492  return;
4493  }
4494 
4495  LT x( left ); // Evaluation of the left-hand side dense vector operand
4496  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4497 
4498  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4499  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4500  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4501  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4502 
4503  smpAssign( ~lhs, x * A * rhs.scalar_ );
4504  }
4505  //**********************************************************************************************
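   // The SMP variant above does not provide its own kernels: after the operands have been
   // evaluated it re-forms the expression 'x * A * rhs.scalar_' and forwards it to smpAssign(),
   // so the parallelization strategy of the underlying vector/matrix multiplication applies.
   // A usage sketch, assuming a thread-based SMP backend was enabled when configuring Blaze:
   //
   //    blaze::setNumThreads( 4UL );   // no effect unless an SMP backend is active
   //    y = 2.0 * ( x * A );           // sufficiently large targets may be assigned in parallel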
4506 
4507  //**SMP assignment to sparse vectors************************************************************
4521  template< typename VT1 > // Type of the target sparse vector
4522  friend inline auto smpAssign( SparseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4523  -> EnableIf_t< UseSMPAssign_v<VT1> >
4524  {
4526 
4530 
4531  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4532 
4533  const ResultType tmp( rhs );
4534  smpAssign( ~lhs, tmp );
4535  }
4536  //**********************************************************************************************
4537 
4538  //**SMP addition assignment to dense vectors****************************************************
4552  template< typename VT1 > // Type of the target dense vector
4553  friend inline auto smpAddAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4554  -> EnableIf_t< UseSMPAssign_v<VT1> >
4555  {
4557 
4558  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4559 
4560  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4561  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4562 
4563  if( right.rows() == 0UL || right.columns() == 0UL ) {
4564  return;
4565  }
4566 
4567  LT x( left ); // Evaluation of the left-hand side dense vector operand
4568  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4569 
4570  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4571  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4572  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4573  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4574 
4575  smpAddAssign( ~lhs, x * A * rhs.scalar_ );
4576  }
4577  //**********************************************************************************************
4578 
4579  //**SMP addition assignment to sparse vectors***************************************************
4580  // No special implementation for the SMP addition assignment to sparse vectors.
4581  //**********************************************************************************************
4582 
4583  //**SMP subtraction assignment to dense vectors*************************************************
4597  template< typename VT1 > // Type of the target dense vector
4598  friend inline auto smpSubAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4599  -> EnableIf_t< UseSMPAssign_v<VT1> >
4600  {
4602 
4603  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4604 
4605  LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4606  RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4607 
4608  if( right.rows() == 0UL || right.columns() == 0UL ) {
4609  return;
4610  }
4611 
4612  LT x( left ); // Evaluation of the left-hand side dense vector operand
4613  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4614 
4615  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4616  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4617  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4618  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4619 
4620  smpSubAssign( ~lhs, x * A * rhs.scalar_ );
4621  }
4622  //**********************************************************************************************
4623 
4624  //**SMP subtraction assignment to sparse vectors************************************************
4625  // No special implementation for the SMP subtraction assignment to sparse vectors.
4626  //**********************************************************************************************
4627 
4628  //**SMP multiplication assignment to dense vectors**********************************************
4643  template< typename VT1 > // Type of the target dense vector
4644  friend inline auto smpMultAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4645  -> EnableIf_t< UseSMPAssign_v<VT1> >
4646  {
4648 
4652 
4653  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4654 
4655  const ResultType tmp( rhs );
4656  smpMultAssign( ~lhs, tmp );
4657  }
4658  //**********************************************************************************************
4659 
4660  //**SMP multiplication assignment to sparse vectors*********************************************
4661  // No special implementation for the SMP multiplication assignment to sparse vectors.
4662  //**********************************************************************************************
4663 
4664  //**SMP division assignment to dense vectors****************************************************
4678  template< typename VT1 > // Type of the target dense vector
4679  friend inline auto smpDivAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4680  -> EnableIf_t< UseSMPAssign_v<VT1> >
4681  {
4683 
4687 
4688  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4689 
4690  const ResultType tmp( rhs );
4691  smpDivAssign( ~lhs, tmp );
4692  }
4693  //**********************************************************************************************
4694 
4695  //**SMP division assignment to sparse vectors***************************************************
4696  // No special implementation for the SMP division assignment to sparse vectors.
4697  //**********************************************************************************************
4698 
4699  //**Compile time checks*************************************************************************
4708  //**********************************************************************************************
4709 };
4711 //*************************************************************************************************
4712 
4713 
4714 
4715 
4716 //=================================================================================================
4717 //
4718 // GLOBAL BINARY ARITHMETIC OPERATORS
4719 //
4720 //=================================================================================================
4721 
4722 //*************************************************************************************************
4753 template< typename VT // Type of the left-hand side dense vector
4754  , typename MT > // Type of the right-hand side dense matrix
4755 inline decltype(auto)
4756  operator*( const DenseVector<VT,true>& vec, const DenseMatrix<MT,false>& mat )
4757 {
4759 
4761 
4762  if( (~vec).size() != (~mat).rows() ) {
4763  BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
4764  }
4765 
4766  using ReturnType = const TDVecDMatMultExpr<VT,MT>;
4767  return ReturnType( ~vec, ~mat );
4768 }
4769 //*************************************************************************************************
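// A usage sketch for this operator (the concrete container types are chosen for illustration):
// multiplying a dense row vector with a row-major dense matrix yields a lazily evaluated
// TDVecDMatMultExpr, which collapses into a plain row vector upon assignment.
//
//    blaze::DynamicVector<double,blaze::rowVector> x{ 1.0, 2.0, 3.0 };
//    blaze::DynamicMatrix<double,blaze::rowMajor>  A{ { 1.0, 0.0 },
//                                                     { 0.0, 1.0 },
//                                                     { 2.0, 2.0 } };
//
//    blaze::DynamicVector<double,blaze::rowVector> y( x * A );   // y == ( 7.0, 8.0 )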
4770 
4771 
4772 
4773 
4774 //=================================================================================================
4775 //
4776 // GLOBAL RESTRUCTURING BINARY ARITHMETIC OPERATORS
4777 //
4778 //=================================================================================================
4779 
4780 //*************************************************************************************************
4794 template< typename VT // Type of the left-hand side dense vector
4795  , typename MT > // Matrix base type of the right-hand side expression
4796 inline decltype(auto)
4797  operator*( const DenseVector<VT,true>& vec, const MatMatMultExpr<MT>& mat )
4798 {
4800 
4801  return ( vec * (~mat).leftOperand() ) * (~mat).rightOperand();
4802 }
4804 //*************************************************************************************************
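// The restructuring above avoids the expensive matrix/matrix multiplication entirely. A quick
// cost sketch for square n-by-n operands (sizes assumed for illustration): evaluating A * B
// first costs on the order of n^3 multiplications, whereas ( x * A ) * B needs only about 2*n^2.
//
//    blaze::DynamicVector<double,blaze::rowVector> x( 1000UL, 1.0 );
//    blaze::DynamicMatrix<double,blaze::rowMajor>  A( 1000UL, 1000UL, 1.0 );
//    blaze::DynamicMatrix<double,blaze::rowMajor>  B( 1000UL, 1000UL, 1.0 );
//
//    blaze::DynamicVector<double,blaze::rowVector> y( x * ( A * B ) );   // evaluated as ( x * A ) * B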
4805 
4806 
4807 
4808 
4809 //=================================================================================================
4810 //
4811 // ISALIGNED SPECIALIZATIONS
4812 //
4813 //=================================================================================================
4814 
4815 //*************************************************************************************************
4817 template< typename VT, typename MT >
4818 struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4819  : public BoolConstant< IsAligned_v<VT> && IsAligned_v<MT> >
4820 {};
4822 //*************************************************************************************************
4823 
4824 } // namespace blaze
4825 
4826 #endif