DMatDVecMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/Exception.h>
60 #include <blaze/math/shims/Reset.h>
62 #include <blaze/math/SIMD.h>
83 #include <blaze/math/views/Check.h>
84 #include <blaze/system/BLAS.h>
87 #include <blaze/util/Assert.h>
88 #include <blaze/util/Complex.h>
90 #include <blaze/util/DisableIf.h>
91 #include <blaze/util/EnableIf.h>
93 #include <blaze/util/mpl/And.h>
94 #include <blaze/util/mpl/If.h>
95 #include <blaze/util/Types.h>
103 
104 
105 namespace blaze {
106 
107 //=================================================================================================
108 //
109 // CLASS DMATDVECMULTEXPR
110 //
111 //=================================================================================================
112 
113 //*************************************************************************************************
120 template< typename MT // Type of the left-hand side dense matrix
121  , typename VT > // Type of the right-hand side dense vector
123  : public MatVecMultExpr< DenseVector< DMatDVecMultExpr<MT,VT>, false > >
124  , private Computation
125 {
126  private:
127  //**Type definitions****************************************************************************
134  //**********************************************************************************************
135 
136  //**********************************************************************************************
// Compile-time flag `evaluateMatrix`.
// NOTE(review): the initializer is cut off in this listing (its continuation line is missing).
// Only the first two conditions are visible: MT is a computation, and MET and VET are the same
// type. Confirm the remaining condition(s) against the real header.
138 enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
140  //**********************************************************************************************
141 
142  //**********************************************************************************************
// Compile-time flag `evaluateVector`: true when the vector operand type VT is a computation
// (IsComputation) or reports RequiresEvaluation.
144 enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
145  //**********************************************************************************************
146 
147  //**********************************************************************************************
149 
// Helper trait whose `value` is `evaluateMatrix || evaluateVector`.
// The template parameter T1 does not appear in the visible definition of `value`.
153 template< typename T1 >
154 struct UseSMPAssign {
155 enum : bool { value = ( evaluateMatrix || evaluateVector ) };
156 };
158  //**********************************************************************************************
159 
160  //**********************************************************************************************
162 
// Helper trait over three types (T1, T2, T3).
// NOTE(review): several lines of this struct are missing from the listing (the start of the
// enum and some conditions). The visible conditions require `simdEnabled` on all three types
// and identical element types for T1 and T3. Confirm the full condition against the header.
165 template< typename T1, typename T2, typename T3 >
166 struct UseBlasKernel {
172 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
177 IsSame< ElementType_<T1>, ElementType_<T3> >::value };
178 };
180  //**********************************************************************************************
181 
182  //**********************************************************************************************
184 
// Helper trait over three types (T1, T2, T3); its `value` starts with `useOptimizedKernels`
// and requires `simdEnabled` on all three types.
// NOTE(review): several condition lines and the end of the enum are missing from the listing,
// so the complete condition cannot be read here. Confirm against the header.
188 template< typename T1, typename T2, typename T3 >
189 struct UseVectorizedDefaultKernel {
190 enum : bool { value = useOptimizedKernels &&
192 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
195 , ElementType_<T3> >::value &&
198 };
200  //**********************************************************************************************
201 
202  public:
203  //**Type definitions****************************************************************************
// Type returned by operator[] and at() below: a const ElementType.
209 using ReturnType = const ElementType;
// Const ResultType. (ElementType and ResultType are declared on lines not shown in this listing.)
210 using CompositeType = const ResultType;
211 
// Type used to hold the matrix operand: `const MT` or `const MT&`, selected on IsExpression<MT>.
// Presumably If_ picks the first alternative when the condition is true - confirm.
213 using LeftOperand = If_< IsExpression<MT>, const MT, const MT& >;
214 
// Type used to hold the vector operand; same selection scheme, applied to VT.
216 using RightOperand = If_< IsExpression<VT>, const VT, const VT& >;
217 
220 
223  //**********************************************************************************************
224 
225  //**Compilation flags***************************************************************************
// Compilation flag `simdEnabled`.
// NOTE(review): the initializer is cut off in this listing. The visible part requires that MT is
// not diagonal and that both MT and VT report `simdEnabled`; further conditions are missing.
227 enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
228 MT::simdEnabled && VT::simdEnabled &&
231 
// Compilation flag `smpAssignable`: true only when neither operand needs evaluation
// (evaluateMatrix / evaluateVector are both false) and both MT and VT report `smpAssignable`.
233 enum : bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
234 !evaluateVector && VT::smpAssignable };
235  //**********************************************************************************************
236 
237  //**SIMD properties*****************************************************************************
// Taken from SIMDTrait<ElementType>::size. The kernels below use it as the column step of the
// SIMD loops.
239 enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
240  //**********************************************************************************************
241 
242  //**Constructor*********************************************************************************
// Constructor: stores the two operands of the matrix/vector product.
//   mat - left-hand side dense matrix operand
//   vec - right-hand side dense vector operand
// An internal assertion checks that the number of matrix columns equals the vector size.
248 explicit inline DMatDVecMultExpr( const MT& mat, const VT& vec ) noexcept
249 : mat_( mat ) // Left-hand side dense matrix of the multiplication expression
250 , vec_( vec ) // Right-hand side dense vector of the multiplication expression
251 {
252 BLAZE_INTERNAL_ASSERT( mat_.columns() == vec_.size(), "Invalid matrix and vector sizes" );
253 }
254  //**********************************************************************************************
255 
256  //**Subscript operator**************************************************************************
// Subscript operator: computes element `index` of the product, i.e. row `index` of mat_
// multiplied with vec_. `index` must be smaller than mat_.rows() (internal assertion only).
// NOTE(review): the head of the first `if` is missing from this listing. Its body multiplies
// only the diagonal element with vec_[index], so it is presumably the diagonal-matrix case.
262 inline ReturnType operator[]( size_t index ) const {
263 BLAZE_INTERNAL_ASSERT( index < mat_.rows(), "Invalid vector access index" );
264 
266 {
267 return mat_(index,index) * vec_[index];
268 }
// Lower matrices: restrict the product to the first n columns. This branch is only taken
// when index + 8 < rows; otherwise the full-row product in the final else is used.
269 else if( IsLower<MT>::value && ( index + 8UL < mat_.rows() ) )
270 {
// n excludes the diagonal element for strictly lower matrices.
271 const size_t n( IsStrictlyLower<MT>::value ? index : index+1UL );
272 return subvector( row( mat_, index, unchecked ), 0UL, n, unchecked ) *
273 subvector( vec_, 0UL, n, unchecked );
274 }
// Upper matrices: restrict the product to columns [begin, columns). Only taken when index > 8.
275 else if( IsUpper<MT>::value && ( index > 8UL ) )
276 {
// begin skips the diagonal element for strictly upper matrices.
277 const size_t begin( IsStrictlyUpper<MT>::value ? index+1UL : index );
278 const size_t n ( mat_.columns() - begin );
279 return subvector( row( mat_, index, unchecked ), begin, n, unchecked ) *
280 subvector( vec_, begin, n, unchecked );
281 }
// General case: full row times full vector.
282 else
283 {
284 return row( mat_, index, unchecked ) * vec_;
285 }
286 }
287  //**********************************************************************************************
288 
289  //**At function*********************************************************************************
// Checked element access: invokes BLAZE_THROW_OUT_OF_RANGE when `index` is not smaller than
// mat_.rows(), otherwise returns the same value as operator[].
296 inline ReturnType at( size_t index ) const {
297 if( index >= mat_.rows() ) {
298 BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
299 }
300 return (*this)[index];
301 }
302  //**********************************************************************************************
303 
304  //**Size function*******************************************************************************
// Size of the resulting vector, which is the number of rows of the matrix operand.
309 inline size_t size() const noexcept {
310 return mat_.rows();
311 }
312  //**********************************************************************************************
313 
314  //**Left operand access*************************************************************************
// Returns the stored left-hand side (matrix) operand.
319 inline LeftOperand leftOperand() const noexcept{
320 return mat_;
321 }
322  //**********************************************************************************************
323 
324  //**Right operand access************************************************************************
// Returns the stored right-hand side (vector) operand.
329 inline RightOperand rightOperand() const noexcept {
330 return vec_;
331 }
332  //**********************************************************************************************
333 
334  //**********************************************************************************************
// Returns true when either operand reports isAliased() for the given address `alias`.
// The body is identical to isAliased() below.
340 template< typename T >
341 inline bool canAlias( const T* alias ) const noexcept {
342 return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
343 }
344  //**********************************************************************************************
345 
346  //**********************************************************************************************
// Returns true when the matrix operand or the vector operand reports isAliased() for the
// given address `alias`.
352 template< typename T >
353 inline bool isAliased( const T* alias ) const noexcept {
354 return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
355 }
356  //**********************************************************************************************
357 
358  //**********************************************************************************************
// Returns true only when both operands report isAligned().
363 inline bool isAligned() const noexcept {
364 return mat_.isAligned() && vec_.isAligned();
365 }
366  //**********************************************************************************************
367 
368  //**********************************************************************************************
// Returns whether the expression can be used in an SMP assignment.
// The result requires size() > SMP_DMATDVECMULT_THRESHOLD and at least one condition of the
// first group to hold.
// NOTE(review): two conditions of that first group are missing from this listing. The visible
// ones are: BLAS mode is off, MT is a computation that is not evaluated, or rows*columns is
// below DMATDVECMULT_THRESHOLD. Confirm the missing conditions against the header.
373 inline bool canSMPAssign() const noexcept {
374 return ( !BLAZE_BLAS_MODE ||
377 ( IsComputation<MT>::value && !evaluateMatrix ) ||
378 ( mat_.rows() * mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
379 ( size() > SMP_DMATDVECMULT_THRESHOLD );
380 }
381  //**********************************************************************************************
382 
383  private:
384  //**Member variables****************************************************************************
387  //**********************************************************************************************
388 
389  //**Assignment to dense vectors*****************************************************************
// Assignment of the matrix/vector product `rhs` to the dense vector `lhs`.
//   lhs - target dense vector; its size must equal rhs.size() (internal assertion only)
//   rhs - the multiplication expression to evaluate
// Both operands are passed through serial() and bound to LT / RT objects, then the work is
// handed to selectAssignKernel(). LT and RT are declared on lines not shown in this listing.
402 template< typename VT1 > // Type of the target dense vector
403 friend inline void assign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
404 {
406 
407 BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
408 
// No rows: the result is empty, nothing to write.
409 if( rhs.mat_.rows() == 0UL ) {
410 return;
411 }
// No columns: there are no terms to sum, so the target is only reset.
412 else if( rhs.mat_.columns() == 0UL ) {
413 reset( ~lhs );
414 return;
415 }
416 
417 LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
418 RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
419 
420 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
421 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
422 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
423 BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
424 
425 DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
426 }
428  //**********************************************************************************************
429 
430  //**Assignment to dense vectors (kernel selection)**********************************************
// Kernel selection for y = A * x.
// Calls selectSmallAssignKernel() when A is diagonal, when MT is a computation that is not
// evaluated, or when rows*columns is below DMATDVECMULT_THRESHOLD.
// Calls selectBlasAssignKernel() otherwise.
441 template< typename VT1 // Type of the left-hand side target vector
442 , typename MT1 // Type of the left-hand side matrix operand
443 , typename VT2 > // Type of the right-hand side vector operand
444 static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
445 {
446 if( ( IsDiagonal<MT1>::value ) ||
447 ( IsComputation<MT>::value && !evaluateMatrix ) ||
448 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
449 selectSmallAssignKernel( y, A, x );
450 else
451 selectBlasAssignKernel( y, A, x );
452 }
454  //**********************************************************************************************
455 
456  //**Default assignment to dense vectors*********************************************************
// Default kernel for y = A * x: forms the product expression of the two operands and hands it
// to y.assign().
470 template< typename VT1 // Type of the left-hand side target vector
471 , typename MT1 // Type of the left-hand side matrix operand
472 , typename VT2 > // Type of the right-hand side vector operand
473 static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
474 {
475 y.assign( A * x );
476 }
478  //**********************************************************************************************
479 
480  //**Default assignment to dense vectors (small matrices)****************************************
// Small-matrix kernel, non-vectorized variant: forwards to selectDefaultAssignKernel().
// NOTE(review): the line holding the return type is missing from this listing. It presumably
// carries the condition that separates this overload from the vectorized overload of the same
// name - confirm against the header.
494 template< typename VT1 // Type of the left-hand side target vector
495 , typename MT1 // Type of the left-hand side matrix operand
496 , typename VT2 > // Type of the right-hand side vector operand
498 selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
499 {
500 selectDefaultAssignKernel( y, A, x );
501 }
503  //**********************************************************************************************
504 
505  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Small-matrix kernel, vectorized variant, for y = A * x.
// NOTE(review): the line holding the return type is missing from this listing.
//
// Structure: the rows of A are processed in blocks of 8, then 4, then 3, then 2, then a single
// remaining row. For each block the columns [jbegin, jpos) are processed SIMDSIZE elements at a
// time into one SIMD accumulator per row. Each accumulator is reduced with sum() and stored
// into y, and a scalar loop adds the columns [jpos, jend) when `remainder` is true.
// For upper matrices jbegin starts at the first row of the block (one further for strictly
// upper) masked with size_t(-SIMDSIZE); for lower matrices jend stops at the end of the block
// (one less for strictly lower). Otherwise all columns [0, N) are used.
// The SIMD accumulators are default-constructed - presumably that initializes them to zero;
// confirm against SIMDType.
519 template< typename VT1 // Type of the left-hand side target vector
520 , typename MT1 // Type of the left-hand side matrix operand
521 , typename VT2 > // Type of the right-hand side vector operand
523 selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
524 {
// A scalar tail loop is needed unless both operand types are padded.
525 constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
526 
527 const size_t M( A.rows() );
528 const size_t N( A.columns() );
529 
530 size_t i( 0UL );
531 
// --- Blocks of 8 rows ---
532 for( ; (i+8UL) <= M; i+=8UL )
533 {
534 const size_t jbegin( ( IsUpper<MT1>::value )
535 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
536 :( 0UL ) );
537 const size_t jend( ( IsLower<MT1>::value )
538 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
539 :( N ) );
540 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
541 
// jpos is the end of the SIMD part. With a remainder it is jend masked with size_t(-SIMDSIZE);
// the assertion below states the intent: jend rounded down to a multiple of SIMDSIZE.
542 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
543 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
544 
545 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
546 size_t j( jbegin );
547 
// SIMD part: one load of x is reused for all eight rows.
548 for( ; j<jpos; j+=SIMDSIZE ) {
549 const SIMDType x1( x.load(j) );
550 xmm1 += A.load(i ,j) * x1;
551 xmm2 += A.load(i+1UL,j) * x1;
552 xmm3 += A.load(i+2UL,j) * x1;
553 xmm4 += A.load(i+3UL,j) * x1;
554 xmm5 += A.load(i+4UL,j) * x1;
555 xmm6 += A.load(i+5UL,j) * x1;
556 xmm7 += A.load(i+6UL,j) * x1;
557 xmm8 += A.load(i+7UL,j) * x1;
558 }
559 
// Horizontal reduction; this overwrites y, so y does not need to be reset beforehand.
560 y[i ] = sum( xmm1 );
561 y[i+1UL] = sum( xmm2 );
562 y[i+2UL] = sum( xmm3 );
563 y[i+3UL] = sum( xmm4 );
564 y[i+4UL] = sum( xmm5 );
565 y[i+5UL] = sum( xmm6 );
566 y[i+6UL] = sum( xmm7 );
567 y[i+7UL] = sum( xmm8 );
568 
// Scalar tail for the columns not covered by the SIMD loop.
569 for( ; remainder && j<jend; ++j ) {
570 y[i ] += A(i ,j) * x[j];
571 y[i+1UL] += A(i+1UL,j) * x[j];
572 y[i+2UL] += A(i+2UL,j) * x[j];
573 y[i+3UL] += A(i+3UL,j) * x[j];
574 y[i+4UL] += A(i+4UL,j) * x[j];
575 y[i+5UL] += A(i+5UL,j) * x[j];
576 y[i+6UL] += A(i+6UL,j) * x[j];
577 y[i+7UL] += A(i+7UL,j) * x[j];
578 }
579 }
580 
// --- Blocks of 4 rows (same scheme) ---
581 for( ; (i+4UL) <= M; i+=4UL )
582 {
583 const size_t jbegin( ( IsUpper<MT1>::value )
584 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
585 :( 0UL ) );
586 const size_t jend( ( IsLower<MT1>::value )
587 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
588 :( N ) );
589 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
590 
591 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
592 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
593 
594 SIMDType xmm1, xmm2, xmm3, xmm4;
595 size_t j( jbegin );
596 
597 for( ; j<jpos; j+=SIMDSIZE ) {
598 const SIMDType x1( x.load(j) );
599 xmm1 += A.load(i ,j) * x1;
600 xmm2 += A.load(i+1UL,j) * x1;
601 xmm3 += A.load(i+2UL,j) * x1;
602 xmm4 += A.load(i+3UL,j) * x1;
603 }
604 
605 y[i ] = sum( xmm1 );
606 y[i+1UL] = sum( xmm2 );
607 y[i+2UL] = sum( xmm3 );
608 y[i+3UL] = sum( xmm4 );
609 
610 for( ; remainder && j<jend; ++j ) {
611 y[i ] += A(i ,j) * x[j];
612 y[i+1UL] += A(i+1UL,j) * x[j];
613 y[i+2UL] += A(i+2UL,j) * x[j];
614 y[i+3UL] += A(i+3UL,j) * x[j];
615 }
616 }
617 
// --- Blocks of 3 rows (same scheme) ---
618 for( ; (i+3UL) <= M; i+=3UL )
619 {
620 const size_t jbegin( ( IsUpper<MT1>::value )
621 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
622 :( 0UL ) );
623 const size_t jend( ( IsLower<MT1>::value )
624 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
625 :( N ) );
626 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
627 
628 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
629 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
630 
631 SIMDType xmm1, xmm2, xmm3;
632 size_t j( jbegin );
633 
634 for( ; j<jpos; j+=SIMDSIZE ) {
635 const SIMDType x1( x.load(j) );
636 xmm1 += A.load(i ,j) * x1;
637 xmm2 += A.load(i+1UL,j) * x1;
638 xmm3 += A.load(i+2UL,j) * x1;
639 }
640 
641 y[i ] = sum( xmm1 );
642 y[i+1UL] = sum( xmm2 );
643 y[i+2UL] = sum( xmm3 );
644 
645 for( ; remainder && j<jend; ++j ) {
646 y[i ] += A(i ,j) * x[j];
647 y[i+1UL] += A(i+1UL,j) * x[j];
648 y[i+2UL] += A(i+2UL,j) * x[j];
649 }
650 }
651 
// --- Blocks of 2 rows (same scheme) ---
652 for( ; (i+2UL) <= M; i+=2UL )
653 {
654 const size_t jbegin( ( IsUpper<MT1>::value )
655 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
656 :( 0UL ) );
657 const size_t jend( ( IsLower<MT1>::value )
658 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
659 :( N ) );
660 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
661 
662 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
663 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
664 
665 SIMDType xmm1, xmm2;
666 size_t j( jbegin );
667 
668 for( ; j<jpos; j+=SIMDSIZE ) {
669 const SIMDType x1( x.load(j) );
670 xmm1 += A.load(i ,j) * x1;
671 xmm2 += A.load(i+1UL,j) * x1;
672 }
673 
674 y[i ] = sum( xmm1 );
675 y[i+1UL] = sum( xmm2 );
676 
677 for( ; remainder && j<jend; ++j ) {
678 y[i ] += A(i ,j) * x[j];
679 y[i+1UL] += A(i+1UL,j) * x[j];
680 }
681 }
682 
// --- Single remaining row, if any ---
683 if( i < M )
684 {
685 const size_t jbegin( ( IsUpper<MT1>::value )
686 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
687 :( 0UL ) );
688 const size_t jend( ( IsLower<MT1>::value )
689 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
690 :( N ) );
691 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
692 
693 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
694 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
695 
696 SIMDType xmm1;
697 size_t j( jbegin );
698 
699 for( ; j<jpos; j+=SIMDSIZE ) {
700 xmm1 += A.load(i,j) * x.load(j);
701 }
702 
703 y[i] = sum( xmm1 );
704 
705 for( ; remainder && j<jend; ++j ) {
706 y[i] += A(i,j) * x[j];
707 }
708 }
709 }
711  //**********************************************************************************************
712 
713  //**Default assignment to dense vectors (large matrices)****************************************
// Large-matrix kernel, non-vectorized variant: forwards to selectDefaultAssignKernel().
// NOTE(review): the line holding the return type is missing from this listing. It presumably
// carries the condition that separates this overload from the vectorized overload of the same
// name - confirm against the header.
727 template< typename VT1 // Type of the left-hand side target vector
728 , typename MT1 // Type of the left-hand side matrix operand
729 , typename VT2 > // Type of the right-hand side vector operand
731 selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
732 {
733 selectDefaultAssignKernel( y, A, x );
734 }
736  //**********************************************************************************************
737 
738  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Large-matrix kernel, vectorized variant, for y = A * x.
// NOTE(review): the line holding the return type is missing from this listing.
//
// Structure: y is reset first and all results are then accumulated into y with +=.
// The rows of A are processed in blocks of 8, then 4, then 2, then a single remaining row.
// Within each block the columns [jbegin, jpos) are walked by three successive SIMD loops that
// consume 4*SIMDSIZE, 2*SIMDSIZE and SIMDSIZE columns per iteration; each iteration reduces its
// partial products with sum() and adds them to y. A scalar loop then handles the columns
// [jpos, jend) when `remainder` is true.
// jbegin / jend / jpos are computed exactly as in the vectorized small-matrix kernel.
752 template< typename VT1 // Type of the left-hand side target vector
753 , typename MT1 // Type of the left-hand side matrix operand
754 , typename VT2 > // Type of the right-hand side vector operand
756 selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
757 {
// A scalar tail loop is needed unless both operand types are padded.
758 constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
759 
760 const size_t M( A.rows() );
761 const size_t N( A.columns() );
762 
// Required because every store below is an accumulation (+=).
763 reset( y );
764 
765 size_t i( 0UL );
766 
// --- Blocks of 8 rows ---
767 for( ; (i+8UL) <= M; i+=8UL )
768 {
769 const size_t jbegin( ( IsUpper<MT1>::value )
770 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
771 :( 0UL ) );
772 const size_t jend( ( IsLower<MT1>::value )
773 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
774 :( N ) );
775 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
776 
777 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
778 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
779 
780 size_t j( jbegin );
781 
// Four SIMD vectors of x per iteration.
782 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
783 const size_t j1( j+SIMDSIZE );
784 const size_t j2( j+SIMDSIZE*2UL );
785 const size_t j3( j+SIMDSIZE*3UL );
786 const SIMDType x1( x.load(j ) );
787 const SIMDType x2( x.load(j1) );
788 const SIMDType x3( x.load(j2) );
789 const SIMDType x4( x.load(j3) );
790 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
791 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
792 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
793 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
794 y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
795 y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
796 y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
797 y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
798 }
799 
// Two SIMD vectors of x per iteration.
800 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
801 const size_t j1( j+SIMDSIZE );
802 const SIMDType x1( x.load(j ) );
803 const SIMDType x2( x.load(j1) );
804 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
805 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
806 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
807 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
808 y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
809 y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
810 y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
811 y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
812 }
813 
// One SIMD vector of x per iteration.
814 for( ; j<jpos; j+=SIMDSIZE ) {
815 const SIMDType x1( x.load(j) );
816 y[i ] += sum( A.load(i ,j) * x1 );
817 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
818 y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
819 y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
820 y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
821 y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
822 y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
823 y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
824 }
825 
// Scalar tail for the columns not covered by the SIMD loops.
826 for( ; remainder && j<jend; ++j ) {
827 y[i ] += A(i ,j) * x[j];
828 y[i+1UL] += A(i+1UL,j) * x[j];
829 y[i+2UL] += A(i+2UL,j) * x[j];
830 y[i+3UL] += A(i+3UL,j) * x[j];
831 y[i+4UL] += A(i+4UL,j) * x[j];
832 y[i+5UL] += A(i+5UL,j) * x[j];
833 y[i+6UL] += A(i+6UL,j) * x[j];
834 y[i+7UL] += A(i+7UL,j) * x[j];
835 }
836 }
837 
// --- Blocks of 4 rows (same scheme) ---
838 for( ; (i+4UL) <= M; i+=4UL )
839 {
840 const size_t jbegin( ( IsUpper<MT1>::value )
841 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
842 :( 0UL ) );
843 const size_t jend( ( IsLower<MT1>::value )
844 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
845 :( N ) );
846 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
847 
848 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
849 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
850 
851 size_t j( jbegin );
852 
853 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
854 const size_t j1( j+SIMDSIZE );
855 const size_t j2( j+SIMDSIZE*2UL );
856 const size_t j3( j+SIMDSIZE*3UL );
857 const SIMDType x1( x.load(j ) );
858 const SIMDType x2( x.load(j1) );
859 const SIMDType x3( x.load(j2) );
860 const SIMDType x4( x.load(j3) );
861 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
862 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
863 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
864 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
865 }
866 
867 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
868 const size_t j1( j+SIMDSIZE );
869 const SIMDType x1( x.load(j ) );
870 const SIMDType x2( x.load(j1) );
871 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
872 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
873 y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
874 y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
875 }
876 
877 for( ; j<jpos; j+=SIMDSIZE ) {
878 const SIMDType x1( x.load(j) );
879 y[i ] += sum( A.load(i ,j) * x1 );
880 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
881 y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
882 y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
883 }
884 
885 for( ; remainder && j<jend; ++j ) {
886 y[i ] += A(i ,j) * x[j];
887 y[i+1UL] += A(i+1UL,j) * x[j];
888 y[i+2UL] += A(i+2UL,j) * x[j];
889 y[i+3UL] += A(i+3UL,j) * x[j];
890 }
891 }
892 
// --- Blocks of 2 rows (same scheme) ---
893 for( ; (i+2UL) <= M; i+=2UL )
894 {
895 const size_t jbegin( ( IsUpper<MT1>::value )
896 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
897 :( 0UL ) );
898 const size_t jend( ( IsLower<MT1>::value )
899 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
900 :( N ) );
901 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
902 
903 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
904 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
905 
906 size_t j( jbegin );
907 
908 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
909 const size_t j1( j+SIMDSIZE );
910 const size_t j2( j+SIMDSIZE*2UL );
911 const size_t j3( j+SIMDSIZE*3UL );
912 const SIMDType x1( x.load(j ) );
913 const SIMDType x2( x.load(j1) );
914 const SIMDType x3( x.load(j2) );
915 const SIMDType x4( x.load(j3) );
916 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
917 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
918 }
919 
920 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
921 const size_t j1( j+SIMDSIZE );
922 const SIMDType x1( x.load(j ) );
923 const SIMDType x2( x.load(j1) );
924 y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
925 y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
926 }
927 
928 for( ; j<jpos; j+=SIMDSIZE ) {
929 const SIMDType x1( x.load(j) );
930 y[i ] += sum( A.load(i ,j) * x1 );
931 y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
932 }
933 
934 for( ; remainder && j<jend; ++j ) {
935 y[i ] += A(i ,j) * x[j];
936 y[i+1UL] += A(i+1UL,j) * x[j];
937 }
938 }
939 
// --- Single remaining row, if any ---
940 if( i < M )
941 {
942 const size_t jbegin( ( IsUpper<MT1>::value )
943 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
944 :( 0UL ) );
945 const size_t jend( ( IsLower<MT1>::value )
946 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
947 :( N ) );
948 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
949 
950 const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
951 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
952 
953 size_t j( jbegin );
954 
955 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
956 const size_t j1( j+SIMDSIZE );
957 const size_t j2( j+SIMDSIZE*2UL );
958 const size_t j3( j+SIMDSIZE*3UL );
959 const SIMDType x1( x.load(j ) );
960 const SIMDType x2( x.load(j1) );
961 const SIMDType x3( x.load(j2) );
962 const SIMDType x4( x.load(j3) );
963 y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
964 }
965 
966 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
967 const size_t j1( j+SIMDSIZE );
968 const SIMDType x1( x.load(j ) );
969 const SIMDType x2( x.load(j1) );
970 y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
971 }
972 
973 for( ; j<jpos; j+=SIMDSIZE ) {
974 const SIMDType x1( x.load(j) );
975 y[i] += sum( A.load(i,j) * x1 );
976 }
977 
978 for( ; remainder && j<jend; ++j ) {
979 y[i] += A(i,j) * x[j];
980 }
981 }
982 }
984  //**********************************************************************************************
985 
986  //**BLAS-based assignment to dense vectors (default)********************************************
// BLAS kernel, fallback variant: forwards to selectLargeAssignKernel().
// NOTE(review): the line holding the return type is missing from this listing. It presumably
// carries the condition that separates this overload from the BLAS-backed overload of the same
// name - confirm against the header.
1000 template< typename VT1 // Type of the left-hand side target vector
1001 , typename MT1 // Type of the left-hand side matrix operand
1002 , typename VT2 > // Type of the right-hand side vector operand
1004 selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1005 {
1006 selectLargeAssignKernel( y, A, x );
1007 }
1009  //**********************************************************************************************
1010 
1011  //**BLAS-based assignment to dense vectors******************************************************
1012 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1013 
// BLAS kernel for y = A * x; compiled only inside the surrounding BLAS preprocessor guard.
// NOTE(review): the line holding the return type is missing from this listing.
// Triangular A: x is first assigned to y, then trmv() is called on y with A, using
// CblasLower or CblasUpper depending on IsLower<MT1>.
// Otherwise: gemv() is called with the scalar arguments ET(1) and ET(0).
1026 template< typename VT1 // Type of the left-hand side target vector
1027 , typename MT1 // Type of the left-hand side matrix operand
1028 , typename VT2 > // Type of the right-hand side vector operand
1030 selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1031 {
1032 using ET = ElementType_<VT1>;
1033 
1034 if( IsTriangular<MT1>::value ) {
// trmv() receives only y and A, so y must hold a copy of x before the call.
1035 assign( y, x );
1036 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1037 }
1038 else {
1039 gemv( y, A, x, ET(1), ET(0) );
1040 }
1041 }
1043 #endif
1044  //**********************************************************************************************
1045 
1046  //**Assignment to sparse vectors****************************************************************
// Assignment of the matrix/vector product `rhs` to the sparse vector `lhs`.
// The expression is passed through serial() into a temporary ResultType, and that temporary is
// then assigned to the target. The sizes must match (internal assertion only).
// NOTE(review): some lines at the top of the body are missing from this listing.
1059 template< typename VT1 > // Type of the target sparse vector
1060 friend inline void assign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1061 {
1063 
1067 
1068 BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1069 
1070 const ResultType tmp( serial( rhs ) );
1071 assign( ~lhs, tmp );
1072 }
1074  //**********************************************************************************************
1075 
1076  //**Addition assignment to dense vectors********************************************************
// Addition assignment of the matrix/vector product `rhs` to the dense vector `lhs`.
//   lhs - target dense vector; its size must equal rhs.size() (internal assertion only)
//   rhs - the multiplication expression to evaluate
// Both operands are passed through serial() and bound to LT / RT objects, then the work is
// handed to selectAddAssignKernel(). LT and RT are declared on lines not shown in this listing.
1089 template< typename VT1 > // Type of the target dense vector
1090 friend inline void addAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1091 {
1093 
1094 BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1095 
// With no rows or no columns there is nothing to add, so the target is left untouched.
1096 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1097 return;
1098 }
1099 
1100 LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1101 RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1102 
1103 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1104 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1105 BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1106 BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1107 
1108 DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
1109 }
1111  //**********************************************************************************************
1112 
1113  //**Addition assignment to dense vectors (kernel selection)*************************************
// Kernel selection for the addition assignment y += A * x.
//
// The small kernel is chosen when any of the following holds:
//  - MT1 is a diagonal matrix type,
//  - the class-level matrix type MT is a computation and 'evaluateMatrix' is false
//    (both are declared outside this function),
//  - rows*columns of A is below DMATDVECMULT_THRESHOLD.
// In all other cases the BLAS kernel overload is called.
1124  template< typename VT1 // Type of the left-hand side target vector
1125  , typename MT1 // Type of the left-hand side matrix operand
1126  , typename VT2 > // Type of the right-hand side vector operand
1127  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1128  {
1129  if( ( IsDiagonal<MT1>::value ) ||
1130  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1131  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1132  selectSmallAddAssignKernel( y, A, x );
1133  else
1134  selectBlasAddAssignKernel( y, A, x );
1135  }
1137  //**********************************************************************************************
1138 
1139  //**Default addition assignment to dense vectors************************************************
// Default (non-vectorized) kernel for the addition assignment y += A * x.
//
// Forms the product expression A * x and passes it to the target's addAssign() member.
1153  template< typename VT1 // Type of the left-hand side target vector
1154  , typename MT1 // Type of the left-hand side matrix operand
1155  , typename VT2 > // Type of the right-hand side vector operand
1156  static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1157  {
1158  y.addAssign( A * x );
1159  }
1161  //**********************************************************************************************
1162 
1163  //**Default addition assignment to dense vectors (small matrices)*******************************
// Small-matrix addition assignment kernel, default overload.
//
// Simply forwards to selectDefaultAddAssignKernel().
// NOTE(review): the declaration line carrying the return type / enabling condition
// (listing line 1180) is not part of this listing; presumably it restricts this overload
// to the non-vectorizable case -- confirm in the original header.
1177  template< typename VT1 // Type of the left-hand side target vector
1178  , typename MT1 // Type of the left-hand side matrix operand
1179  , typename VT2 > // Type of the right-hand side vector operand
1181  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1182  {
1183  selectDefaultAddAssignKernel( y, A, x );
1184  }
1186  //**********************************************************************************************
1187 
1188  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Small-matrix addition assignment kernel, SIMD overload (y += A * x).
//
// Rows are processed in blocks of 8, 4, 3 and 2, followed by at most one leftover row.
// For every row block the column range [jbegin,jend) is narrowed when MT1 is an upper or
// lower (strictly) triangular type; otherwise it covers all N columns. Each row of a block
// accumulates A.load(row,j) * x.load(j) in its own SIMD accumulator, which is reduced with
// sum() and added to y once. When 'remainder' is true the columns [jpos,jend) are finished
// with scalar element access.
// NOTE(review): the declaration line carrying the return type / enabling condition
// (listing line 1205) is not part of this listing.
// NOTE(review): the xmm accumulators are declared without an initializer; this presumably
// relies on SIMDType's default constructor producing zero -- confirm.
1202  template< typename VT1 // Type of the left-hand side target vector
1203  , typename MT1 // Type of the left-hand side matrix operand
1204  , typename VT2 > // Type of the right-hand side vector operand
1206  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1207  {
      // A scalar tail loop is required as soon as one of the two operands is not padded.
1208  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1209 
1210  const size_t M( A.rows() );
1211  const size_t N( A.columns() );
1212 
1213  size_t i( 0UL );
1214 
      // Blocks of 8 rows.
1215  for( ; (i+8UL) <= M; i+=8UL )
1216  {
      // Upper matrices: start at the first row's diagonal (one past it if strictly upper);
      // the mask '& size_t(-SIMDSIZE)' clears the low bits of that index
      // (presumably SIMDSIZE is a power of two, making this a round-down -- TODO confirm).
1217  const size_t jbegin( ( IsUpper<MT1>::value )
1218  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1219  :( 0UL ) );
      // Lower matrices: stop one past the diagonal of the block's last row
      // (at the diagonal itself if strictly lower).
1220  const size_t jend( ( IsLower<MT1>::value )
1221  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1222  :( N ) );
1223  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1224 
      // End of the SIMD loop: jend rounded down to a multiple of SIMDSIZE (see the assert
      // below) if a scalar tail is needed, jend itself otherwise.
1225  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1226  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1227 
1228  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1229  size_t j( jbegin );
1230 
      // One load of x per step is shared by all eight rows.
1231  for( ; j<jpos; j+=SIMDSIZE ) {
1232  const SIMDType x1( x.load(j) );
1233  xmm1 += A.load(i ,j) * x1;
1234  xmm2 += A.load(i+1UL,j) * x1;
1235  xmm3 += A.load(i+2UL,j) * x1;
1236  xmm4 += A.load(i+3UL,j) * x1;
1237  xmm5 += A.load(i+4UL,j) * x1;
1238  xmm6 += A.load(i+5UL,j) * x1;
1239  xmm7 += A.load(i+6UL,j) * x1;
1240  xmm8 += A.load(i+7UL,j) * x1;
1241  }
1242 
      // Horizontal reduction of each accumulator, added to the target element.
1243  y[i ] += sum( xmm1 );
1244  y[i+1UL] += sum( xmm2 );
1245  y[i+2UL] += sum( xmm3 );
1246  y[i+3UL] += sum( xmm4 );
1247  y[i+4UL] += sum( xmm5 );
1248  y[i+5UL] += sum( xmm6 );
1249  y[i+6UL] += sum( xmm7 );
1250  y[i+7UL] += sum( xmm8 );
1251 
      // Scalar tail for the columns [jpos,jend); skipped entirely when 'remainder' is false.
1252  for( ; remainder && j<jend; ++j ) {
1253  y[i ] += A(i ,j) * x[j];
1254  y[i+1UL] += A(i+1UL,j) * x[j];
1255  y[i+2UL] += A(i+2UL,j) * x[j];
1256  y[i+3UL] += A(i+3UL,j) * x[j];
1257  y[i+4UL] += A(i+4UL,j) * x[j];
1258  y[i+5UL] += A(i+5UL,j) * x[j];
1259  y[i+6UL] += A(i+6UL,j) * x[j];
1260  y[i+7UL] += A(i+7UL,j) * x[j];
1261  }
1262  }
1263 
      // Blocks of 4 rows (same scheme as above).
1264  for( ; (i+4UL) <= M; i+=4UL )
1265  {
1266  const size_t jbegin( ( IsUpper<MT1>::value )
1267  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1268  :( 0UL ) );
1269  const size_t jend( ( IsLower<MT1>::value )
1270  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1271  :( N ) );
1272  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1273 
1274  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1275  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1276 
1277  SIMDType xmm1, xmm2, xmm3, xmm4;
1278  size_t j( jbegin );
1279 
1280  for( ; j<jpos; j+=SIMDSIZE ) {
1281  const SIMDType x1( x.load(j) );
1282  xmm1 += A.load(i ,j) * x1;
1283  xmm2 += A.load(i+1UL,j) * x1;
1284  xmm3 += A.load(i+2UL,j) * x1;
1285  xmm4 += A.load(i+3UL,j) * x1;
1286  }
1287 
1288  y[i ] += sum( xmm1 );
1289  y[i+1UL] += sum( xmm2 );
1290  y[i+2UL] += sum( xmm3 );
1291  y[i+3UL] += sum( xmm4 );
1292 
1293  for( ; remainder && j<jend; ++j ) {
1294  y[i ] += A(i ,j) * x[j];
1295  y[i+1UL] += A(i+1UL,j) * x[j];
1296  y[i+2UL] += A(i+2UL,j) * x[j];
1297  y[i+3UL] += A(i+3UL,j) * x[j];
1298  }
1299  }
1300 
      // Blocks of 3 rows.
1301  for( ; (i+3UL) <= M; i+=3UL )
1302  {
1303  const size_t jbegin( ( IsUpper<MT1>::value )
1304  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1305  :( 0UL ) );
1306  const size_t jend( ( IsLower<MT1>::value )
1307  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1308  :( N ) );
1309  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1310 
1311  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1312  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1313 
1314  SIMDType xmm1, xmm2, xmm3;
1315  size_t j( jbegin );
1316 
1317  for( ; j<jpos; j+=SIMDSIZE ) {
1318  const SIMDType x1( x.load(j) );
1319  xmm1 += A.load(i ,j) * x1;
1320  xmm2 += A.load(i+1UL,j) * x1;
1321  xmm3 += A.load(i+2UL,j) * x1;
1322  }
1323 
1324  y[i ] += sum( xmm1 );
1325  y[i+1UL] += sum( xmm2 );
1326  y[i+2UL] += sum( xmm3 );
1327 
1328  for( ; remainder && j<jend; ++j ) {
1329  y[i ] += A(i ,j) * x[j];
1330  y[i+1UL] += A(i+1UL,j) * x[j];
1331  y[i+2UL] += A(i+2UL,j) * x[j];
1332  }
1333  }
1334 
      // Blocks of 2 rows.
1335  for( ; (i+2UL) <= M; i+=2UL )
1336  {
1337  const size_t jbegin( ( IsUpper<MT1>::value )
1338  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1339  :( 0UL ) );
1340  const size_t jend( ( IsLower<MT1>::value )
1341  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1342  :( N ) );
1343  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1344 
1345  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1346  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1347 
1348  SIMDType xmm1, xmm2;
1349  size_t j( jbegin );
1350 
1351  for( ; j<jpos; j+=SIMDSIZE ) {
1352  const SIMDType x1( x.load(j) );
1353  xmm1 += A.load(i ,j) * x1;
1354  xmm2 += A.load(i+1UL,j) * x1;
1355  }
1356 
1357  y[i ] += sum( xmm1 );
1358  y[i+1UL] += sum( xmm2 );
1359 
1360  for( ; remainder && j<jend; ++j ) {
1361  y[i ] += A(i ,j) * x[j];
1362  y[i+1UL] += A(i+1UL,j) * x[j];
1363  }
1364  }
1365 
      // At most one row is left at this point.
1366  if( i < M )
1367  {
1368  const size_t jbegin( ( IsUpper<MT1>::value )
1369  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1370  :( 0UL ) );
1371  const size_t jend( ( IsLower<MT1>::value )
1372  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1373  :( N ) );
1374  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1375 
1376  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1377  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1378 
1379  SIMDType xmm1;
1380  size_t j( jbegin );
1381 
1382  for( ; j<jpos; j+=SIMDSIZE ) {
1383  xmm1 += A.load(i,j) * x.load(j);
1384  }
1385 
1386  y[i] += sum( xmm1 );
1387 
1388  for( ; remainder && j<jend; ++j ) {
1389  y[i] += A(i,j) * x[j];
1390  }
1391  }
1392  }
1394  //**********************************************************************************************
1395 
1396  //**Default addition assignment to dense vectors (large matrices)*******************************
// Large-matrix addition assignment kernel, default overload.
//
// Simply forwards to selectDefaultAddAssignKernel().
// NOTE(review): the declaration line carrying the return type / enabling condition
// (listing line 1413) is not part of this listing.
1410  template< typename VT1 // Type of the left-hand side target vector
1411  , typename MT1 // Type of the left-hand side matrix operand
1412  , typename VT2 > // Type of the right-hand side vector operand
1414  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1415  {
1416  selectDefaultAddAssignKernel( y, A, x );
1417  }
1419  //**********************************************************************************************
1420 
1421  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Large-matrix addition assignment kernel, SIMD overload (y += A * x).
//
// Rows are processed in blocks of 8, 4 and 2, followed by at most one leftover row.
// For every row block the column range [jbegin,jend) is narrowed when MT1 is an upper or
// lower (strictly) triangular type; otherwise it covers all N columns. The column loop is
// unrolled: first four SIMD vectors per step, then two, then one. Every step reduces its
// partial products with sum() and adds the result directly to y. When 'remainder' is true
// the columns [jpos,jend) are finished with scalar element access.
// NOTE(review): the declaration line carrying the return type / enabling condition
// (listing line 1438) is not part of this listing.
1435  template< typename VT1 // Type of the left-hand side target vector
1436  , typename MT1 // Type of the left-hand side matrix operand
1437  , typename VT2 > // Type of the right-hand side vector operand
1439  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1440  {
      // A scalar tail loop is required as soon as one of the two operands is not padded.
1441  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1442 
1443  const size_t M( A.rows() );
1444  const size_t N( A.columns() );
1445 
1446  size_t i( 0UL );
1447 
      // Blocks of 8 rows.
1448  for( ; (i+8UL) <= M; i+=8UL )
1449  {
      // Upper matrices: start at the first row's diagonal (one past it if strictly upper);
      // the mask '& size_t(-SIMDSIZE)' clears the low bits of that index
      // (presumably SIMDSIZE is a power of two, making this a round-down -- TODO confirm).
1450  const size_t jbegin( ( IsUpper<MT1>::value )
1451  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1452  :( 0UL ) );
      // Lower matrices: stop one past the diagonal of the block's last row
      // (at the diagonal itself if strictly lower).
1453  const size_t jend( ( IsLower<MT1>::value )
1454  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1455  :( N ) );
1456  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1457 
      // End of the SIMD loops: jend rounded down to a multiple of SIMDSIZE (see the assert
      // below) if a scalar tail is needed, jend itself otherwise.
1458  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1459  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1460 
1461  size_t j( jbegin );
1462 
      // Four SIMD vectors of x per step, shared by all rows of the block.
1463  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1464  const size_t j1( j+SIMDSIZE );
1465  const size_t j2( j+SIMDSIZE*2UL );
1466  const size_t j3( j+SIMDSIZE*3UL );
1467  const SIMDType x1( x.load(j ) );
1468  const SIMDType x2( x.load(j1) );
1469  const SIMDType x3( x.load(j2) );
1470  const SIMDType x4( x.load(j3) );
1471  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1472  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1473  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1474  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1475  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1476  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1477  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1478  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
1479  }
1480 
      // Two SIMD vectors per step.
1481  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1482  const size_t j1( j+SIMDSIZE );
1483  const SIMDType x1( x.load(j ) );
1484  const SIMDType x2( x.load(j1) );
1485  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1486  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1487  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1488  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1489  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1490  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1491  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1492  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
1493  }
1494 
      // One SIMD vector per step.
1495  for( ; j<jpos; j+=SIMDSIZE ) {
1496  const SIMDType x1( x.load(j) );
1497  y[i ] += sum( A.load(i ,j) * x1 );
1498  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1499  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1500  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1501  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
1502  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
1503  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
1504  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
1505  }
1506 
      // Scalar tail for the columns [jpos,jend); skipped entirely when 'remainder' is false.
1507  for( ; remainder && j<jend; ++j ) {
1508  y[i ] += A(i ,j) * x[j];
1509  y[i+1UL] += A(i+1UL,j) * x[j];
1510  y[i+2UL] += A(i+2UL,j) * x[j];
1511  y[i+3UL] += A(i+3UL,j) * x[j];
1512  y[i+4UL] += A(i+4UL,j) * x[j];
1513  y[i+5UL] += A(i+5UL,j) * x[j];
1514  y[i+6UL] += A(i+6UL,j) * x[j];
1515  y[i+7UL] += A(i+7UL,j) * x[j];
1516  }
1517  }
1518 
      // Blocks of 4 rows (same scheme as above).
1519  for( ; (i+4UL) <= M; i+=4UL )
1520  {
1521  const size_t jbegin( ( IsUpper<MT1>::value )
1522  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1523  :( 0UL ) );
1524  const size_t jend( ( IsLower<MT1>::value )
1525  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1526  :( N ) );
1527  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1528 
1529  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1530  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1531 
1532  size_t j( jbegin );
1533 
1534  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1535  const size_t j1( j+SIMDSIZE );
1536  const size_t j2( j+SIMDSIZE*2UL );
1537  const size_t j3( j+SIMDSIZE*3UL );
1538  const SIMDType x1( x.load(j ) );
1539  const SIMDType x2( x.load(j1) );
1540  const SIMDType x3( x.load(j2) );
1541  const SIMDType x4( x.load(j3) );
1542  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1543  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1544  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1545  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1546  }
1547 
1548  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1549  const size_t j1( j+SIMDSIZE );
1550  const SIMDType x1( x.load(j ) );
1551  const SIMDType x2( x.load(j1) );
1552  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1553  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1554  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1555  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1556  }
1557 
1558  for( ; j<jpos; j+=SIMDSIZE ) {
1559  const SIMDType x1( x.load(j) );
1560  y[i ] += sum( A.load(i ,j) * x1 );
1561  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1562  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1563  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1564  }
1565 
1566  for( ; remainder && j<jend; ++j ) {
1567  y[i ] += A(i ,j) * x[j];
1568  y[i+1UL] += A(i+1UL,j) * x[j];
1569  y[i+2UL] += A(i+2UL,j) * x[j];
1570  y[i+3UL] += A(i+3UL,j) * x[j];
1571  }
1572  }
1573 
      // Blocks of 2 rows.
1574  for( ; (i+2UL) <= M; i+=2UL )
1575  {
1576  const size_t jbegin( ( IsUpper<MT1>::value )
1577  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1578  :( 0UL ) );
1579  const size_t jend( ( IsLower<MT1>::value )
1580  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1581  :( N ) );
1582  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1583 
1584  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1585  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1586 
1587  size_t j( jbegin );
1588 
1589  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1590  const size_t j1( j+SIMDSIZE );
1591  const size_t j2( j+SIMDSIZE*2UL );
1592  const size_t j3( j+SIMDSIZE*3UL );
1593  const SIMDType x1( x.load(j ) );
1594  const SIMDType x2( x.load(j1) );
1595  const SIMDType x3( x.load(j2) );
1596  const SIMDType x4( x.load(j3) );
1597  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1598  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1599  }
1600 
1601  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1602  const size_t j1( j+SIMDSIZE );
1603  const SIMDType x1( x.load(j ) );
1604  const SIMDType x2( x.load(j1) );
1605  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1606  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1607  }
1608 
1609  for( ; j<jpos; j+=SIMDSIZE ) {
1610  const SIMDType x1( x.load(j) );
1611  y[i ] += sum( A.load(i ,j) * x1 );
1612  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1613  }
1614 
1615  for( ; remainder && j<jend; ++j ) {
1616  y[i ] += A(i ,j) * x[j];
1617  y[i+1UL] += A(i+1UL,j) * x[j];
1618  }
1619  }
1620 
      // At most one row is left at this point.
1621  if( i < M )
1622  {
1623  const size_t jbegin( ( IsUpper<MT1>::value )
1624  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1625  :( 0UL ) );
1626  const size_t jend( ( IsLower<MT1>::value )
1627  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1628  :( N ) );
1629  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1630 
1631  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1632  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1633 
1634  size_t j( jbegin );
1635 
1636  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1637  const size_t j1( j+SIMDSIZE );
1638  const size_t j2( j+SIMDSIZE*2UL );
1639  const size_t j3( j+SIMDSIZE*3UL );
1640  const SIMDType x1( x.load(j ) );
1641  const SIMDType x2( x.load(j1) );
1642  const SIMDType x3( x.load(j2) );
1643  const SIMDType x4( x.load(j3) );
1644  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1645  }
1646 
1647  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1648  const size_t j1( j+SIMDSIZE );
1649  const SIMDType x1( x.load(j ) );
1650  const SIMDType x2( x.load(j1) );
1651  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1652  }
1653 
1654  for( ; j<jpos; j+=SIMDSIZE ) {
1655  const SIMDType x1( x.load(j) );
1656  y[i] += sum( A.load(i,j) * x1 );
1657  }
1658 
1659  for( ; remainder && j<jend; ++j ) {
1660  y[i] += A(i,j) * x[j];
1661  }
1662  }
1663  }
1665  //**********************************************************************************************
1666 
1667  //**BLAS-based addition assignment to dense vectors (default)***********************************
// BLAS addition assignment kernel, default overload.
//
// Does not call BLAS; it forwards to selectLargeAddAssignKernel().
// NOTE(review): the declaration line carrying the return type / enabling condition
// (listing line 1684) is not part of this listing; presumably this overload is the
// fallback when no BLAS kernel applies -- confirm in the original header.
1681  template< typename VT1 // Type of the left-hand side target vector
1682  , typename MT1 // Type of the left-hand side matrix operand
1683  , typename VT2 > // Type of the right-hand side vector operand
1685  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1686  {
1687  selectLargeAddAssignKernel( y, A, x );
1688  }
1690  //**********************************************************************************************
1691 
1692  //**BLAS-based addition assignment to dense vectors*********************************************
1693 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1694 
// BLAS-based kernel for the addition assignment y += A * x.
//
// Triangular matrices: x is copied into a temporary, trmv() is applied to that temporary
// and the temporary is then added to y. All other matrices go through gemv().
// NOTE(review): the declaration line carrying the return type / enabling condition
// (listing line 1710) is not part of this listing.
1707  template< typename VT1 // Type of the left-hand side target vector
1708  , typename MT1 // Type of the left-hand side matrix operand
1709  , typename VT2 > // Type of the right-hand side vector operand
1711  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1712  {
1713  using ET = ElementType_<VT1>;
1714 
1715  if( IsTriangular<MT1>::value ) {
      // trmv() receives only tmp and A; a temporary keeps y intact until the final add.
1716  ResultType_<VT1> tmp( serial( x ) );
1717  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1718  addAssign( y, tmp );
1719  }
1720  else {
      // Scalars ET(1) and ET(1): presumably alpha=1 and beta=1 in the usual BLAS gemv
      // sense (result accumulated into y) -- confirm against blaze/math/blas/gemv.h.
1721  gemv( y, A, x, ET(1), ET(1) );
1722  }
1723  }
1725 #endif
1726  //**********************************************************************************************
1727 
1728  //**Addition assignment to sparse vectors*******************************************************
1729  // No special implementation for the addition assignment to sparse vectors.
1730  //**********************************************************************************************
1731 
1732  //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of a dense matrix-dense vector multiplication to a dense vector
// (y -= A * x).
//
// lhs: target dense vector, rhs: multiplication expression to be subtracted.
// Returns without touching lhs when the matrix operand has zero rows or zero columns.
// Otherwise both operands are evaluated (via serial()) into the operand types LT and RT
// and the work is handed to selectSubAssignKernel().
1745  template< typename VT1 > // Type of the target dense vector
1746  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1747  {
1749 
1750  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1751 
      // Nothing to subtract for an empty matrix operand.
1752  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1753  return;
1754  }
1755 
1756  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1757  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1758 
1759  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1760  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1761  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1762  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1763 
1764  DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1765  }
1767  //**********************************************************************************************
1768 
1769  //**Subtraction assignment to dense vectors (kernel selection)**********************************
// Kernel selection for the subtraction assignment y -= A * x.
//
// The small kernel is chosen when any of the following holds:
//  - MT1 is a diagonal matrix type,
//  - the class-level matrix type MT is a computation and 'evaluateMatrix' is false
//    (both are declared outside this function),
//  - rows*columns of A is below DMATDVECMULT_THRESHOLD.
// In all other cases the BLAS kernel overload is called.
1780  template< typename VT1 // Type of the left-hand side target vector
1781  , typename MT1 // Type of the left-hand side matrix operand
1782  , typename VT2 > // Type of the right-hand side vector operand
1783  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1784  {
1785  if( ( IsDiagonal<MT1>::value ) ||
1786  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1787  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1788  selectSmallSubAssignKernel( y, A, x );
1789  else
1790  selectBlasSubAssignKernel( y, A, x );
1791  }
1793  //**********************************************************************************************
1794 
1795  //**Default subtraction assignment to dense vectors*********************************************
// Default (non-vectorized) kernel for the subtraction assignment y -= A * x.
//
// Forms the product expression A * x and passes it to the target's subAssign() member.
1809  template< typename VT1 // Type of the left-hand side target vector
1810  , typename MT1 // Type of the left-hand side matrix operand
1811  , typename VT2 > // Type of the right-hand side vector operand
1812  static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1813  {
1814  y.subAssign( A * x );
1815  }
1817  //**********************************************************************************************
1818 
1819  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Small-matrix subtraction assignment kernel, default overload.
//
// Simply forwards to selectDefaultSubAssignKernel().
// NOTE(review): the declaration line carrying the return type / enabling condition
// (listing line 1836) is not part of this listing; presumably it restricts this overload
// to the non-vectorizable case -- confirm in the original header.
1833  template< typename VT1 // Type of the left-hand side target vector
1834  , typename MT1 // Type of the left-hand side matrix operand
1835  , typename VT2 > // Type of the right-hand side vector operand
1837  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1838  {
1839  selectDefaultSubAssignKernel( y, A, x );
1840  }
1842  //**********************************************************************************************
1843 
1844  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Small-matrix subtraction assignment kernel, SIMD overload (y -= A * x).
//
// Rows are processed in blocks of 8, 4, 3 and 2, followed by at most one leftover row.
// For every row block the column range [jbegin,jend) is narrowed when MT1 is an upper or
// lower (strictly) triangular type; otherwise it covers all N columns. Each row of a block
// accumulates A.load(row,j) * x.load(j) in its own SIMD accumulator, which is reduced with
// sum() and subtracted from y once. When 'remainder' is true the columns [jpos,jend) are
// finished with scalar element access.
// NOTE(review): the declaration line carrying the return type / enabling condition
// (listing line 1861) is not part of this listing.
// NOTE(review): the xmm accumulators are declared without an initializer; this presumably
// relies on SIMDType's default constructor producing zero -- confirm.
1858  template< typename VT1 // Type of the left-hand side target vector
1859  , typename MT1 // Type of the left-hand side matrix operand
1860  , typename VT2 > // Type of the right-hand side vector operand
1862  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1863  {
      // A scalar tail loop is required as soon as one of the two operands is not padded.
1864  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1865 
1866  const size_t M( A.rows() );
1867  const size_t N( A.columns() );
1868 
1869  size_t i( 0UL );
1870 
      // Blocks of 8 rows.
1871  for( ; (i+8UL) <= M; i+=8UL )
1872  {
      // Upper matrices: start at the first row's diagonal (one past it if strictly upper);
      // the mask '& size_t(-SIMDSIZE)' clears the low bits of that index
      // (presumably SIMDSIZE is a power of two, making this a round-down -- TODO confirm).
1873  const size_t jbegin( ( IsUpper<MT1>::value )
1874  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1875  :( 0UL ) );
      // Lower matrices: stop one past the diagonal of the block's last row
      // (at the diagonal itself if strictly lower).
1876  const size_t jend( ( IsLower<MT1>::value )
1877  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1878  :( N ) );
1879  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1880 
      // End of the SIMD loop: jend rounded down to a multiple of SIMDSIZE (see the assert
      // below) if a scalar tail is needed, jend itself otherwise.
1881  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1882  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1883 
1884  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1885  size_t j( jbegin );
1886 
      // One load of x per step is shared by all eight rows.
1887  for( ; j<jpos; j+=SIMDSIZE ) {
1888  const SIMDType x1( x.load(j) );
1889  xmm1 += A.load(i ,j) * x1;
1890  xmm2 += A.load(i+1UL,j) * x1;
1891  xmm3 += A.load(i+2UL,j) * x1;
1892  xmm4 += A.load(i+3UL,j) * x1;
1893  xmm5 += A.load(i+4UL,j) * x1;
1894  xmm6 += A.load(i+5UL,j) * x1;
1895  xmm7 += A.load(i+6UL,j) * x1;
1896  xmm8 += A.load(i+7UL,j) * x1;
1897  }
1898 
      // Horizontal reduction of each accumulator, subtracted from the target element.
1899  y[i ] -= sum( xmm1 );
1900  y[i+1UL] -= sum( xmm2 );
1901  y[i+2UL] -= sum( xmm3 );
1902  y[i+3UL] -= sum( xmm4 );
1903  y[i+4UL] -= sum( xmm5 );
1904  y[i+5UL] -= sum( xmm6 );
1905  y[i+6UL] -= sum( xmm7 );
1906  y[i+7UL] -= sum( xmm8 );
1907 
      // Scalar tail for the columns [jpos,jend); skipped entirely when 'remainder' is false.
1908  for( ; remainder && j<jend; ++j ) {
1909  y[i ] -= A(i ,j) * x[j];
1910  y[i+1UL] -= A(i+1UL,j) * x[j];
1911  y[i+2UL] -= A(i+2UL,j) * x[j];
1912  y[i+3UL] -= A(i+3UL,j) * x[j];
1913  y[i+4UL] -= A(i+4UL,j) * x[j];
1914  y[i+5UL] -= A(i+5UL,j) * x[j];
1915  y[i+6UL] -= A(i+6UL,j) * x[j];
1916  y[i+7UL] -= A(i+7UL,j) * x[j];
1917  }
1918  }
1919 
      // Blocks of 4 rows (same scheme as above).
1920  for( ; (i+4UL) <= M; i+=4UL )
1921  {
1922  const size_t jbegin( ( IsUpper<MT1>::value )
1923  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1924  :( 0UL ) );
1925  const size_t jend( ( IsLower<MT1>::value )
1926  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1927  :( N ) );
1928  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1929 
1930  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1931  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1932 
1933  SIMDType xmm1, xmm2, xmm3, xmm4;
1934  size_t j( jbegin );
1935 
1936  for( ; j<jpos; j+=SIMDSIZE ) {
1937  const SIMDType x1( x.load(j) );
1938  xmm1 += A.load(i ,j) * x1;
1939  xmm2 += A.load(i+1UL,j) * x1;
1940  xmm3 += A.load(i+2UL,j) * x1;
1941  xmm4 += A.load(i+3UL,j) * x1;
1942  }
1943 
1944  y[i ] -= sum( xmm1 );
1945  y[i+1UL] -= sum( xmm2 );
1946  y[i+2UL] -= sum( xmm3 );
1947  y[i+3UL] -= sum( xmm4 );
1948 
1949  for( ; remainder && j<jend; ++j ) {
1950  y[i ] -= A(i ,j) * x[j];
1951  y[i+1UL] -= A(i+1UL,j) * x[j];
1952  y[i+2UL] -= A(i+2UL,j) * x[j];
1953  y[i+3UL] -= A(i+3UL,j) * x[j];
1954  }
1955  }
1956 
      // Blocks of 3 rows.
1957  for( ; (i+3UL) <= M; i+=3UL )
1958  {
1959  const size_t jbegin( ( IsUpper<MT1>::value )
1960  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1961  :( 0UL ) );
1962  const size_t jend( ( IsLower<MT1>::value )
1963  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1964  :( N ) );
1965  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1966 
1967  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1968  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1969 
1970  SIMDType xmm1, xmm2, xmm3;
1971  size_t j( jbegin );
1972 
1973  for( ; j<jpos; j+=SIMDSIZE ) {
1974  const SIMDType x1( x.load(j) );
1975  xmm1 += A.load(i ,j) * x1;
1976  xmm2 += A.load(i+1UL,j) * x1;
1977  xmm3 += A.load(i+2UL,j) * x1;
1978  }
1979 
1980  y[i ] -= sum( xmm1 );
1981  y[i+1UL] -= sum( xmm2 );
1982  y[i+2UL] -= sum( xmm3 );
1983 
1984  for( ; remainder && j<jend; ++j ) {
1985  y[i ] -= A(i ,j) * x[j];
1986  y[i+1UL] -= A(i+1UL,j) * x[j];
1987  y[i+2UL] -= A(i+2UL,j) * x[j];
1988  }
1989  }
1990 
      // Blocks of 2 rows.
1991  for( ; (i+2UL) <= M; i+=2UL )
1992  {
1993  const size_t jbegin( ( IsUpper<MT1>::value )
1994  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1995  :( 0UL ) );
1996  const size_t jend( ( IsLower<MT1>::value )
1997  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1998  :( N ) );
1999  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2000 
2001  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2002  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2003 
2004  SIMDType xmm1, xmm2;
2005  size_t j( jbegin );
2006 
2007  for( ; j<jpos; j+=SIMDSIZE ) {
2008  const SIMDType x1( x.load(j) );
2009  xmm1 += A.load(i ,j) * x1;
2010  xmm2 += A.load(i+1UL,j) * x1;
2011  }
2012 
2013  y[i ] -= sum( xmm1 );
2014  y[i+1UL] -= sum( xmm2 );
2015 
2016  for( ; remainder && j<jend; ++j ) {
2017  y[i ] -= A(i ,j) * x[j];
2018  y[i+1UL] -= A(i+1UL,j) * x[j];
2019  }
2020  }
2021 
      // At most one row is left at this point.
2022  if( i < M )
2023  {
2024  const size_t jbegin( ( IsUpper<MT1>::value )
2025  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2026  :( 0UL ) );
2027  const size_t jend( ( IsLower<MT1>::value )
2028  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2029  :( N ) );
2030  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2031 
2032  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2033  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2034 
2035  SIMDType xmm1;
2036  size_t j( jbegin );
2037 
2038  for( ; j<jpos; j+=SIMDSIZE ) {
2039  xmm1 += A.load(i,j) * x.load(j);
2040  }
2041 
2042  y[i] -= sum( xmm1 );
2043 
2044  for( ; remainder && j<jend; ++j ) {
2045  y[i] -= A(i,j) * x[j];
2046  }
2047  }
2048  }
2050  //**********************************************************************************************
2051 
2052  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Large-matrix subtraction assignment kernel, default overload.
//
// Simply forwards to selectDefaultSubAssignKernel().
// NOTE(review): the declaration line carrying the return type / enabling condition
// (listing line 2069) is not part of this listing.
2066  template< typename VT1 // Type of the left-hand side target vector
2067  , typename MT1 // Type of the left-hand side matrix operand
2068  , typename VT2 > // Type of the right-hand side vector operand
2070  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2071  {
2072  selectDefaultSubAssignKernel( y, A, x );
2073  }
2075  //**********************************************************************************************
2076 
2077  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized subtraction-assignment kernel for large matrices: for every row i of A the SIMD
// dot product of row i with x is subtracted from y[i].
//
// Rows are processed in blocks of 8, then 4, then 2, and finally one single leftover row.
// Within each row block the columns are traversed with 4-fold, 2-fold and 1-fold unrolled SIMD
// loops (each step covering SIMDSIZE columns), followed by a scalar loop for the columns that the
// SIMD loops did not cover. Every SIMD step reduces its partial products with sum() and subtracts
// the result from y immediately.
//
// The column range [jbegin,jend) is narrowed for matrices flagged as upper/lower (and their strict
// variants) via IsUpper/IsLower/IsStrictlyUpper/IsStrictlyLower.
//
// NOTE(review): the declaration-specifier/return-type line (listing line 2094) is absent from this
// listing, so the condition under which this overload is selected cannot be confirmed from here.
2091  template< typename VT1 // Type of the left-hand side target vector
2092  , typename MT1 // Type of the left-hand side matrix operand
2093  , typename VT2 > // Type of the right-hand side vector operand
2095  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2096  {
2097  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value ); // scalar tail loop needed unless both operands are padded
2098 
2099  const size_t M( A.rows() );
2100  const size_t N( A.columns() );
2101 
2102  size_t i( 0UL );
2103 
2104  for( ; (i+8UL) <= M; i+=8UL ) // blocks of eight rows
2105  {
2106  const size_t jbegin( ( IsUpper<MT1>::value ) // first column: masked start for upper matrices, otherwise 0
2107  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2108  :( 0UL ) );
2109  const size_t jend( ( IsLower<MT1>::value ) // one past the last column: bounded by the row block for lower matrices, otherwise N
2110  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
2111  :( N ) );
2112  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2113 
2114  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend ); // end of the SIMD-processed range; equals jend when no tail loop is needed
2115  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2116 
2117  size_t j( jbegin );
2118 
2119  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) { // four SIMD vectors of x per step
2120  const size_t j1( j+SIMDSIZE );
2121  const size_t j2( j+SIMDSIZE*2UL );
2122  const size_t j3( j+SIMDSIZE*3UL );
2123  const SIMDType x1( x.load(j ) );
2124  const SIMDType x2( x.load(j1) );
2125  const SIMDType x3( x.load(j2) );
2126  const SIMDType x4( x.load(j3) );
2127  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2128  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2129  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2130  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2131  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2132  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2133  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2134  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
2135  }
2136 
2137  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) { // two SIMD vectors of x per step
2138  const size_t j1( j+SIMDSIZE );
2139  const SIMDType x1( x.load(j ) );
2140  const SIMDType x2( x.load(j1) );
2141  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2142  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2143  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2144  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2145  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2146  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2147  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2148  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
2149  }
2150 
2151  for( ; j<jpos; j+=SIMDSIZE ) { // one SIMD vector of x per step
2152  const SIMDType x1( x.load(j) );
2153  y[i ] -= sum( A.load(i ,j) * x1 );
2154  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2155  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2156  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2157  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 );
2158  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 );
2159  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 );
2160  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 );
2161  }
2162 
2163  for( ; remainder && j<jend; ++j ) { // scalar tail for columns in [jpos,jend)
2164  y[i ] -= A(i ,j) * x[j];
2165  y[i+1UL] -= A(i+1UL,j) * x[j];
2166  y[i+2UL] -= A(i+2UL,j) * x[j];
2167  y[i+3UL] -= A(i+3UL,j) * x[j];
2168  y[i+4UL] -= A(i+4UL,j) * x[j];
2169  y[i+5UL] -= A(i+5UL,j) * x[j];
2170  y[i+6UL] -= A(i+6UL,j) * x[j];
2171  y[i+7UL] -= A(i+7UL,j) * x[j];
2172  }
2173  }
2174 
2175  for( ; (i+4UL) <= M; i+=4UL ) // blocks of four rows (same column scheme as above)
2176  {
2177  const size_t jbegin( ( IsUpper<MT1>::value )
2178  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2179  :( 0UL ) );
2180  const size_t jend( ( IsLower<MT1>::value )
2181  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
2182  :( N ) );
2183  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2184 
2185  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2186  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2187 
2188  size_t j( jbegin );
2189 
2190  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2191  const size_t j1( j+SIMDSIZE );
2192  const size_t j2( j+SIMDSIZE*2UL );
2193  const size_t j3( j+SIMDSIZE*3UL );
2194  const SIMDType x1( x.load(j ) );
2195  const SIMDType x2( x.load(j1) );
2196  const SIMDType x3( x.load(j2) );
2197  const SIMDType x4( x.load(j3) );
2198  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2199  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2200  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2201  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2202  }
2203 
2204  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2205  const size_t j1( j+SIMDSIZE );
2206  const SIMDType x1( x.load(j ) );
2207  const SIMDType x2( x.load(j1) );
2208  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2209  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2210  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2211  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2212  }
2213 
2214  for( ; j<jpos; j+=SIMDSIZE ) {
2215  const SIMDType x1( x.load(j) );
2216  y[i ] -= sum( A.load(i ,j) * x1 );
2217  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2218  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2219  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2220  }
2221 
2222  for( ; remainder && j<jend; ++j ) {
2223  y[i ] -= A(i ,j) * x[j];
2224  y[i+1UL] -= A(i+1UL,j) * x[j];
2225  y[i+2UL] -= A(i+2UL,j) * x[j];
2226  y[i+3UL] -= A(i+3UL,j) * x[j];
2227  }
2228  }
2229 
2230  for( ; (i+2UL) <= M; i+=2UL ) // blocks of two rows
2231  {
2232  const size_t jbegin( ( IsUpper<MT1>::value )
2233  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2234  :( 0UL ) );
2235  const size_t jend( ( IsLower<MT1>::value )
2236  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
2237  :( N ) );
2238  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2239 
2240  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2241  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2242 
2243  size_t j( jbegin );
2244 
2245  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2246  const size_t j1( j+SIMDSIZE );
2247  const size_t j2( j+SIMDSIZE*2UL );
2248  const size_t j3( j+SIMDSIZE*3UL );
2249  const SIMDType x1( x.load(j ) );
2250  const SIMDType x2( x.load(j1) );
2251  const SIMDType x3( x.load(j2) );
2252  const SIMDType x4( x.load(j3) );
2253  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2254  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2255  }
2256 
2257  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2258  const size_t j1( j+SIMDSIZE );
2259  const SIMDType x1( x.load(j ) );
2260  const SIMDType x2( x.load(j1) );
2261  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2262  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2263  }
2264 
2265  for( ; j<jpos; j+=SIMDSIZE ) {
2266  const SIMDType x1( x.load(j) );
2267  y[i ] -= sum( A.load(i ,j) * x1 );
2268  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2269  }
2270 
2271  for( ; remainder && j<jend; ++j ) {
2272  y[i ] -= A(i ,j) * x[j];
2273  y[i+1UL] -= A(i+1UL,j) * x[j];
2274  }
2275  }
2276 
2277  if( i < M ) // at most one row is left after the two-row loop
2278  {
2279  const size_t jbegin( ( IsUpper<MT1>::value )
2280  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2281  :( 0UL ) );
2282  const size_t jend( ( IsLower<MT1>::value )
2283  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2284  :( N ) );
2285  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2286 
2287  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2288  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2289 
2290  size_t j( jbegin );
2291 
2292  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2293  const size_t j1( j+SIMDSIZE );
2294  const size_t j2( j+SIMDSIZE*2UL );
2295  const size_t j3( j+SIMDSIZE*3UL );
2296  const SIMDType x1( x.load(j ) );
2297  const SIMDType x2( x.load(j1) );
2298  const SIMDType x3( x.load(j2) );
2299  const SIMDType x4( x.load(j3) );
2300  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2301  }
2302 
2303  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2304  const size_t j1( j+SIMDSIZE );
2305  const SIMDType x1( x.load(j ) );
2306  const SIMDType x2( x.load(j1) );
2307  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2308  }
2309 
2310  for( ; j<jpos; j+=SIMDSIZE ) {
2311  const SIMDType x1( x.load(j) );
2312  y[i] -= sum( A.load(i,j) * x1 );
2313  }
2314 
2315  for( ; remainder && j<jend; ++j ) {
2316  y[i] -= A(i,j) * x[j];
2317  }
2318  }
2319  }
2321  //**********************************************************************************************
2322 
2323  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Default implementation of the BLAS-based subtraction-assignment kernel.
// Does not call BLAS itself; y, A and x are forwarded to selectLargeSubAssignKernel().
// NOTE(review): the declaration-specifier/return-type line (listing line 2340) is absent from this
// listing, so the condition under which this overload is selected cannot be confirmed from here.
2337  template< typename VT1 // Type of the left-hand side target vector
2338  , typename MT1 // Type of the left-hand side matrix operand
2339  , typename VT2 > // Type of the right-hand side vector operand
2341  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2342  {
2343  selectLargeSubAssignKernel( y, A, x );
2344  }
2346  //**********************************************************************************************
2347 
2348  //**BLAS-based subtraction assignment to dense vectors******************************************
2349 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2350 
// BLAS-based subtraction-assignment kernel (only compiled inside the surrounding
// BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION guard).
//  - Triangular A: x is evaluated serially into a temporary, trmv() is applied to that temporary
//    with the lower/upper flag chosen from IsLower<MT1>, and the temporary is subtracted from y.
//  - Otherwise: a single gemv() call with the scalars ET(-1) and ET(1).
// NOTE(review): the declaration-specifier/return-type line (listing line 2366) is absent from this
// listing, so the condition under which this overload is selected cannot be confirmed from here.
2363  template< typename VT1 // Type of the left-hand side target vector
2364  , typename MT1 // Type of the left-hand side matrix operand
2365  , typename VT2 > // Type of the right-hand side vector operand
2367  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2368  {
2369  using ET = ElementType_<VT1>;
2370 
2371  if( IsTriangular<MT1>::value ) {
2372  ResultType_<VT1> tmp( serial( x ) ); // working copy of x that trmv() operates on
2373  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2374  subAssign( y, tmp );
2375  }
2376  else {
2377  gemv( y, A, x, ET(-1), ET(1) ); // presumably y = (-1)*A*x + 1*y -- confirm against the gemv() wrapper
2378  }
2379  }
2381 #endif
2382  //**********************************************************************************************
2383 
2384  //**Subtraction assignment to sparse vectors****************************************************
2385  // No special implementation for the subtraction assignment to sparse vectors.
2386  //**********************************************************************************************
2387 
2388  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of a dense matrix/dense vector product to a dense vector.
// The product expression rhs is first evaluated serially into a temporary of type ResultType;
// the temporary is then passed to multAssign() together with the target vector.
// lhs: target dense vector (must already have the size of rhs, checked by an internal assert).
// rhs: multiplication expression to be applied.
2401  template< typename VT1 > // Type of the target dense vector
2402  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2403  {
2405 
2409 
2410  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2411 
2412  const ResultType tmp( serial( rhs ) );
2413  multAssign( ~lhs, tmp );
2414  }
2416  //**********************************************************************************************
2417 
2418  //**Multiplication assignment to sparse vectors*************************************************
2419  // No special implementation for the multiplication assignment to sparse vectors.
2420  //**********************************************************************************************
2421 
2422  //**Division assignment to dense vectors********************************************************
// Division assignment of a dense matrix/dense vector product to a dense vector.
// The product expression rhs is first evaluated serially into a temporary of type ResultType;
// the temporary is then passed to divAssign() together with the target vector.
// lhs: target dense vector (must already have the size of rhs, checked by an internal assert).
// rhs: multiplication expression acting as divisor.
2435  template< typename VT1 > // Type of the target dense vector
2436  friend inline void divAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2437  {
2439 
2443 
2444  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2445 
2446  const ResultType tmp( serial( rhs ) );
2447  divAssign( ~lhs, tmp );
2448  }
2450  //**********************************************************************************************
2451 
2452  //**Division assignment to sparse vectors*******************************************************
2453  // No special implementation for the division assignment to sparse vectors.
2454  //**********************************************************************************************
2455 
2456  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of the product to a dense vector; only enabled when UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (listing line 2473) is
// absent from this listing; the body uses the parameters lhs and rhs and ends in smpAssign().
// Steps: nothing is done for a matrix without rows; for a matrix without columns the target is
// reset; otherwise both operands are bound/evaluated as LT and RT and the product A * x is
// handed to smpAssign().
2471  template< typename VT1 > // Type of the target dense vector
2472  friend inline EnableIf_< UseSMPAssign<VT1> >
2474  {
2476 
2477  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2478 
2479  if( rhs.mat_.rows() == 0UL ) {
2480  return;
2481  }
2482  else if( rhs.mat_.columns() == 0UL ) {
2483  reset( ~lhs );
2484  return;
2485  }
2486 
2487  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2488  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2489 
2490  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2491  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2492  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2493  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2494 
2495  smpAssign( ~lhs, A * x );
2496  }
2498  //**********************************************************************************************
2499 
2500  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of the product to a sparse vector; only enabled when UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (listing line 2517) is
// absent from this listing; the body uses the parameters lhs and rhs.
// The expression is evaluated into a temporary of type ResultType, which is then passed to
// smpAssign() together with the target vector.
2515  template< typename VT1 > // Type of the target sparse vector
2516  friend inline EnableIf_< UseSMPAssign<VT1> >
2518  {
2520 
2524 
2525  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2526 
2527  const ResultType tmp( rhs );
2528  smpAssign( ~lhs, tmp );
2529  }
2531  //**********************************************************************************************
2532 
2533  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of the product to a dense vector; only enabled when UseSMPAssign<VT1>
// holds.
// NOTE(review): the line carrying the function name and parameter list (listing line 2550) is
// absent from this listing; the body uses the parameters lhs and rhs and ends in smpAddAssign().
// Nothing is done when the matrix has no rows or no columns; otherwise both operands are
// bound/evaluated as LT and RT and the product A * x is handed to smpAddAssign().
2548  template< typename VT1 > // Type of the target dense vector
2549  friend inline EnableIf_< UseSMPAssign<VT1> >
2551  {
2553 
2554  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2555 
2556  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2557  return;
2558  }
2559 
2560  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2561  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2562 
2563  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2564  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2565  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2566  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2567 
2568  smpAddAssign( ~lhs, A * x );
2569  }
2571  //**********************************************************************************************
2572 
2573  //**SMP addition assignment to sparse vectors***************************************************
2574  // No special implementation for the SMP addition assignment to sparse vectors.
2575  //**********************************************************************************************
2576 
2577  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of the product to a dense vector; only enabled when
// UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (listing line 2594) is
// absent from this listing; the body uses the parameters lhs and rhs and ends in smpSubAssign().
// Nothing is done when the matrix has no rows or no columns; otherwise both operands are
// bound/evaluated as LT and RT and the product A * x is handed to smpSubAssign().
2592  template< typename VT1 > // Type of the target dense vector
2593  friend inline EnableIf_< UseSMPAssign<VT1> >
2595  {
2597 
2598  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2599 
2600  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2601  return;
2602  }
2603 
2604  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2605  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2606 
2607  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2608  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2609  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2610  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2611 
2612  smpSubAssign( ~lhs, A * x );
2613  }
2615  //**********************************************************************************************
2616 
2617  //**SMP subtraction assignment to sparse vectors************************************************
2618  // No special implementation for the SMP subtraction assignment to sparse vectors.
2619  //**********************************************************************************************
2620 
2621  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of the product to a dense vector; only enabled when
// UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (listing line 2638) is
// absent from this listing; the body uses the parameters lhs and rhs.
// The expression is evaluated into a temporary of type ResultType, which is then passed to
// smpMultAssign() together with the target vector.
2636  template< typename VT1 > // Type of the target dense vector
2637  friend inline EnableIf_< UseSMPAssign<VT1> >
2639  {
2641 
2645 
2646  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2647 
2648  const ResultType tmp( rhs );
2649  smpMultAssign( ~lhs, tmp );
2650  }
2652  //**********************************************************************************************
2653 
2654  //**SMP multiplication assignment to sparse vectors*********************************************
2655  // No special implementation for the SMP multiplication assignment to sparse vectors.
2656  //**********************************************************************************************
2657 
2658  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of the product to a dense vector; only enabled when
// UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (listing line 2675) is
// absent from this listing; the body uses the parameters lhs and rhs.
// The expression is evaluated into a temporary of type ResultType, which is then passed to
// smpDivAssign() together with the target vector.
2673  template< typename VT1 > // Type of the target dense vector
2674  friend inline EnableIf_< UseSMPAssign<VT1> >
2676  {
2678 
2682 
2683  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2684 
2685  const ResultType tmp( rhs );
2686  smpDivAssign( ~lhs, tmp );
2687  }
2689  //**********************************************************************************************
2690 
2691  //**SMP division assignment to sparse vectors***************************************************
2692  // No special implementation for the SMP division assignment to sparse vectors.
2693  //**********************************************************************************************
2694 
2695  //**Compile time checks*************************************************************************
2703  //**********************************************************************************************
2704 };
2705 //*************************************************************************************************
2706 
2707 
2708 
2709 
2710 //=================================================================================================
2711 //
2712 // DVECSCALARMULTEXPR SPECIALIZATION
2713 //
2714 //=================================================================================================
2715 
2716 //*************************************************************************************************
2724 template< typename MT // Type of the left-hand side dense matrix
2725  , typename VT // Type of the right-hand side dense vector
2726  , typename ST > // Type of the scalar value
2727 class DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >
2728  : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false > >
2729  , private Computation
2730 {
2731  private:
2732  //**Type definitions****************************************************************************
2733  using MVM = DMatDVecMultExpr<MT,VT>; // Type of the dense matrix/dense vector multiplication expression
2734  using RES = ResultType_<MVM>; // Result type of the multiplication expression
2735  using MRT = ResultType_<MT>; // Result type of the matrix operand
2736  using VRT = ResultType_<VT>; // Result type of the vector operand
2737  using MET = ElementType_<MRT>; // Element type of the matrix result type
2738  using VET = ElementType_<VRT>; // Element type of the vector result type
2739  using MCT = CompositeType_<MT>; // Composite type of the matrix operand
2740  using VCT = CompositeType_<VT>; // Composite type of the vector operand
2741  //**********************************************************************************************
2742 
2743  //**********************************************************************************************
2745  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2747  //**********************************************************************************************
2748 
2749  //**********************************************************************************************
// Compilation switch for the right-hand side dense vector operand. It must depend on the vector
// type VT only: the vector has to be evaluated when VT is a computation or when VT itself requires
// an intermediate evaluation. (The matrix type MT is handled by evaluateMatrix.)
2751  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2752  //**********************************************************************************************
2753 
2754  //**********************************************************************************************
2756 
// Helper trait used to enable the SMP assignment overloads: value is true when at least one of
// the two operands has to be evaluated (evaluateMatrix || evaluateVector). The template
// parameter T1 does not take part in the visible computation.
2759  template< typename T1 >
2760  struct UseSMPAssign {
2761  enum : bool { value = ( evaluateMatrix || evaluateVector ) };
2762  };
2763  //**********************************************************************************************
2764 
2765  //**********************************************************************************************
2767 
2769  template< typename T1, typename T2, typename T3, typename T4 >
2770  struct UseBlasKernel {
2776  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2781  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2783  };
2784  //**********************************************************************************************
2785 
2786  //**********************************************************************************************
2788 
2791  template< typename T1, typename T2, typename T3, typename T4 >
2792  struct UseVectorizedDefaultKernel {
2793  enum : bool { value = useOptimizedKernels &&
2795  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2799  , T4 >::value &&
2800  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2801  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
2802  };
2803  //**********************************************************************************************
2804 
2805  public:
2806  //**Type definitions****************************************************************************
// NOTE(review): several alias lines of this section (e.g. the definition of ElementType) are
// absent from this listing.
2808  using ResultType = MultTrait_<RES,ST>; // Result type: MultTrait of the product's result type and the scalar type
2812  using ReturnType = const ElementType; // Type returned by the element access functions
2813  using CompositeType = const ResultType; // Type used when this expression appears as an operand
2814 
2816  using LeftOperand = const DMatDVecMultExpr<MT,VT>; // Type of the left-hand side operand (the matrix/vector product)
2817 
2819  using RightOperand = ST; // Type of the right-hand side operand (the scalar)
2820 
2823 
2826  //**********************************************************************************************
2827 
2828  //**Compilation flags***************************************************************************
2830  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
2831  MT::simdEnabled && VT::simdEnabled &&
2835 
// Compilation flag for SMP assignments: true only if neither operand has to be evaluated and
// both operand types report themselves as smpAssignable.
2837  enum : bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2838  !evaluateVector && VT::smpAssignable };
2839  //**********************************************************************************************
2840 
2841  //**SIMD properties*****************************************************************************
// Number of elements per SIMD vector, taken from SIMDTrait<ElementType>::size.
2843  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
2844  //**********************************************************************************************
2845 
2846  //**Constructor*********************************************************************************
// Constructor: stores the matrix/vector multiplication expression and the scalar factor.
// vector: the left-hand side multiplication expression.
// scalar: the right-hand side scalar.
2852  explicit inline DVecScalarMultExpr( const MVM& vector, ST scalar )
2853  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2854  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2855  {}
2856  //**********************************************************************************************
2857 
2858  //**Subscript operator**************************************************************************
// Unchecked element access: returns element `index` of the wrapped product multiplied by the
// scalar. The index is only verified by an internal assertion.
2864  inline ReturnType operator[]( size_t index ) const {
2865  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2866  return vector_[index] * scalar_;
2867  }
2868  //**********************************************************************************************
2869 
2870  //**At function*********************************************************************************
// Checked element access: raises an out-of-range error through BLAZE_THROW_OUT_OF_RANGE when
// index is not smaller than the size of the wrapped expression, otherwise delegates to
// operator[].
2877  inline ReturnType at( size_t index ) const {
2878  if( index >= vector_.size() ) {
2879  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2880  }
2881  return (*this)[index];
2882  }
2883  //**********************************************************************************************
2884 
2885  //**Size function*******************************************************************************
// Returns the size of the expression, which is the size of the wrapped product expression.
2890  inline size_t size() const {
2891  return vector_.size();
2892  }
2893  //**********************************************************************************************
2894 
2895  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the stored matrix/vector multiplication expression.
2900  inline LeftOperand leftOperand() const {
2901  return vector_;
2902  }
2903  //**********************************************************************************************
2904 
2905  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the stored scalar.
2910  inline RightOperand rightOperand() const {
2911  return scalar_;
2912  }
2913  //**********************************************************************************************
2914 
2915  //**********************************************************************************************
// Alias query: forwards the given address to canAlias() of the wrapped product expression and
// returns its answer.
2921  template< typename T >
2922  inline bool canAlias( const T* alias ) const {
2923  return vector_.canAlias( alias );
2924  }
2925  //**********************************************************************************************
2926 
2927  //**********************************************************************************************
// Alias query: forwards the given address to isAliased() of the wrapped product expression and
// returns its answer.
2933  template< typename T >
2934  inline bool isAliased( const T* alias ) const {
2935  return vector_.isAliased( alias );
2936  }
2937  //**********************************************************************************************
2938 
2939  //**********************************************************************************************
// Alignment query: returns the result of isAligned() of the wrapped product expression.
2944  inline bool isAligned() const {
2945  return vector_.isAligned();
2946  }
2947  //**********************************************************************************************
2948 
2949  //**********************************************************************************************
// Returns whether the expression may be used in an SMP assignment.
// The visible condition requires size() > SMP_DMATDVECMULT_THRESHOLD and, in addition, at least
// one of: BLAS mode switched off, MT being a computation that is not evaluated, or
// rows * columns of the matrix operand below DMATDVECMULT_THRESHOLD.
// NOTE(review): listing lines 2957-2958 are absent here, so the disjunction may contain further
// alternatives that cannot be seen in this listing.
2954  inline bool canSMPAssign() const noexcept {
2955  LeftOperand_<MVM> A( vector_.leftOperand() );
2956  return ( !BLAZE_BLAS_MODE ||
2959  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2960  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2961  ( size() > SMP_DMATDVECMULT_THRESHOLD );
2962  }
2963  //**********************************************************************************************
2964 
2965  private:
2966  //**Member variables****************************************************************************
2967  LeftOperand vector_; // Left-hand side operand: the matrix/vector multiplication expression
2968  RightOperand scalar_; // Right-hand side operand: the scalar factor
2969  //**********************************************************************************************
2970 
2971  //**Assignment to dense vectors*****************************************************************
// Assignment of a scaled dense matrix/dense vector product to a dense vector.
// lhs: target dense vector (size must match rhs, checked by an internal assert).
// rhs: scaled multiplication expression to be assigned.
// Steps: the matrix and vector operands are taken from the wrapped product; nothing is done for
// a matrix without rows; for a matrix without columns the target is reset; otherwise both
// operands are serially bound/evaluated as LT and RT and the work is delegated to
// selectAssignKernel() together with the scalar.
2983  template< typename VT1 > // Type of the target dense vector
2984  friend inline void assign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
2985  {
2987 
2988  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2989 
2990  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
2991  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
2992 
2993  if( left.rows() == 0UL ) {
2994  return;
2995  }
2996  else if( left.columns() == 0UL ) {
2997  reset( ~lhs );
2998  return;
2999  }
3000 
3001  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3002  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3003 
3004  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3005  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3006  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3007  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3008 
3009  DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
3010  }
3011  //**********************************************************************************************
3012 
3013  //**Assignment to dense vectors (kernel selection)**********************************************
// Kernel selection for the assignment of a scaled product to a dense vector.
// The small-matrix kernel is chosen when MT1 is diagonal, when MT is a computation that is not
// evaluated, or when rows * columns of A is below DMATDVECMULT_THRESHOLD; in all other cases
// the BLAS kernel entry point is used.
// y: target vector, A: matrix operand, x: vector operand, scalar: scaling factor.
3024  template< typename VT1 // Type of the left-hand side target vector
3025  , typename MT1 // Type of the left-hand side matrix operand
3026  , typename VT2 // Type of the right-hand side vector operand
3027  , typename ST2 > // Type of the scalar value
3028  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3029  {
3030  if( ( IsDiagonal<MT1>::value ) ||
3031  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3032  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3033  selectSmallAssignKernel( y, A, x, scalar );
3034  else
3035  selectBlasAssignKernel( y, A, x, scalar );
3036  }
3037  //**********************************************************************************************
3038 
3039  //**Default assignment to dense vectors*********************************************************
// Default assignment kernel for the scaled product: assigns the expression A * x * scalar to y
// through y.assign().
// NOTE(review): the declaration-specifier/return-type line (listing line 3057) is absent from this
// listing, so the condition under which this overload is selected cannot be confirmed from here.
3053  template< typename VT1 // Type of the left-hand side target vector
3054  , typename MT1 // Type of the left-hand side matrix operand
3055  , typename VT2 // Type of the right-hand side vector operand
3056  , typename ST2 > // Type of the scalar value
3058  selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3059  {
3060  y.assign( A * x * scalar );
3061  }
3062  //**********************************************************************************************
3063 
3064  //**Default assignment to dense vectors (small matrices)****************************************
// Default assignment kernel for small matrices (scaled product).
// Does no work of its own: all arguments are forwarded to selectDefaultAssignKernel().
// NOTE(review): the declaration-specifier/return-type line (listing line 3082) is absent from this
// listing, so the condition under which this overload is selected cannot be confirmed from here.
3078  template< typename VT1 // Type of the left-hand side target vector
3079  , typename MT1 // Type of the left-hand side matrix operand
3080  , typename VT2 // Type of the right-hand side vector operand
3081  , typename ST2 > // Type of the scalar value
3083  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3084  {
3085  selectDefaultAssignKernel( y, A, x, scalar );
3086  }
3087  //**********************************************************************************************
3088 
3089  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Vectorized overload of the small-matrix assignment kernel. For every row i it stores in y[i]
// the sum of A(i,j) * x[j] over the column range [jbegin,jend) chosen for the row block,
// multiplied by scalar.
// Rows are handled in blocks of 8, then 4, 3 and 2 rows, and finally one leftover row. Per
// block, one SIMD accumulator per row collects A.load(i,j) * x.load(j) over SIMDSIZE-wide
// column chunks. The accumulators are reduced with sum(), scaled and written to y; a scalar
// loop then adds the columns the SIMD loop did not cover.
//   y      - left-hand side target vector (each element is overwritten, then possibly updated)
//   A      - left-hand side matrix operand
//   x      - right-hand side vector operand
//   scalar - scalar value of the scaled expression
// NOTE(review): the return-type line (listing line 3107) is not visible in this listing;
// presumably it holds the condition that selects this overload - confirm against the header.
3103  template< typename VT1 // Type of the left-hand side target vector
3104  , typename MT1 // Type of the left-hand side matrix operand
3105  , typename VT2 // Type of the right-hand side vector operand
3106  , typename ST2 > // Type of the scalar value
3108  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3109  {
      // True unless both MT1 and VT2 are padded; it enables the scalar clean-up loops below.
3110  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3111 
3112  const size_t M( A.rows() );
3113  const size_t N( A.columns() );
3114 
3115  size_t i( 0UL );
3116 
      // ---- Blocks of 8 rows ----
3117  for( ; (i+8UL) <= M; i+=8UL )
3118  {
      // Column range of this row block. Upper matrices start at the block's first row index
      // (i+1 if strictly upper), masked with size_t(-SIMDSIZE); other matrices start at 0.
3119  const size_t jbegin( ( IsUpper<MT1>::value )
3120  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3121  :( 0UL ) );
      // Lower matrices end one past the block's last row index (one less if strictly lower);
      // other matrices use all N columns.
3122  const size_t jend( ( IsLower<MT1>::value )
3123  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3124  :( N ) );
3125  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3126 
      // End of the SIMD loop: with a remainder this is jend masked with size_t(-SIMDSIZE)
      // (asserted below to equal jend rounded down to a multiple of SIMDSIZE), else jend.
3127  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3128  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3129 
      // One accumulator per row, only ever updated with +=.
      // NOTE(review): this relies on a default-constructed SIMDType being zero - confirm.
3130  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3131  size_t j( jbegin );
3132 
      // SIMD part: the chunk of x is loaded once and reused for all eight rows.
3133  for( ; j<jpos; j+=SIMDSIZE ) {
3134  const SIMDType x1( x.load(j) );
3135  xmm1 += A.load(i ,j) * x1;
3136  xmm2 += A.load(i+1UL,j) * x1;
3137  xmm3 += A.load(i+2UL,j) * x1;
3138  xmm4 += A.load(i+3UL,j) * x1;
3139  xmm5 += A.load(i+4UL,j) * x1;
3140  xmm6 += A.load(i+5UL,j) * x1;
3141  xmm7 += A.load(i+6UL,j) * x1;
3142  xmm8 += A.load(i+7UL,j) * x1;
3143  }
3144 
      // Reduce each accumulator and overwrite the target elements with the scaled sums.
3145  y[i ] = sum( xmm1 ) * scalar;
3146  y[i+1UL] = sum( xmm2 ) * scalar;
3147  y[i+2UL] = sum( xmm3 ) * scalar;
3148  y[i+3UL] = sum( xmm4 ) * scalar;
3149  y[i+4UL] = sum( xmm5 ) * scalar;
3150  y[i+5UL] = sum( xmm6 ) * scalar;
3151  y[i+6UL] = sum( xmm7 ) * scalar;
3152  y[i+7UL] = sum( xmm8 ) * scalar;
3153 
      // Scalar clean-up for columns [jpos,jend); this must follow the stores above because
      // it accumulates into y.
3154  for( ; remainder && j<jend; ++j ) {
3155  y[i ] += A(i ,j) * x[j] * scalar;
3156  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3157  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3158  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3159  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3160  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3161  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3162  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3163  }
3164  }
3165 
      // ---- Blocks of 4 rows (same scheme; jend offsets adapted to the block height) ----
3166  for( ; (i+4UL) <= M; i+=4UL )
3167  {
3168  const size_t jbegin( ( IsUpper<MT1>::value )
3169  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3170  :( 0UL ) );
3171  const size_t jend( ( IsLower<MT1>::value )
3172  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3173  :( N ) );
3174  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3175 
3176  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3177  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3178 
3179  SIMDType xmm1, xmm2, xmm3, xmm4;
3180  size_t j( jbegin );
3181 
3182  for( ; j<jpos; j+=SIMDSIZE ) {
3183  const SIMDType x1( x.load(j) );
3184  xmm1 += A.load(i ,j) * x1;
3185  xmm2 += A.load(i+1UL,j) * x1;
3186  xmm3 += A.load(i+2UL,j) * x1;
3187  xmm4 += A.load(i+3UL,j) * x1;
3188  }
3189 
3190  y[i ] = sum( xmm1 ) * scalar;
3191  y[i+1UL] = sum( xmm2 ) * scalar;
3192  y[i+2UL] = sum( xmm3 ) * scalar;
3193  y[i+3UL] = sum( xmm4 ) * scalar;
3194 
3195  for( ; remainder && j<jend; ++j ) {
3196  y[i ] += A(i ,j) * x[j] * scalar;
3197  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3198  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3199  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3200  }
3201  }
3202 
      // ---- Blocks of 3 rows ----
3203  for( ; (i+3UL) <= M; i+=3UL )
3204  {
3205  const size_t jbegin( ( IsUpper<MT1>::value )
3206  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3207  :( 0UL ) );
3208  const size_t jend( ( IsLower<MT1>::value )
3209  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3210  :( N ) );
3211  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3212 
3213  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3214  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3215 
3216  SIMDType xmm1, xmm2, xmm3;
3217  size_t j( jbegin );
3218 
3219  for( ; j<jpos; j+=SIMDSIZE ) {
3220  const SIMDType x1( x.load(j) );
3221  xmm1 += A.load(i ,j) * x1;
3222  xmm2 += A.load(i+1UL,j) * x1;
3223  xmm3 += A.load(i+2UL,j) * x1;
3224  }
3225 
3226  y[i ] = sum( xmm1 ) * scalar;
3227  y[i+1UL] = sum( xmm2 ) * scalar;
3228  y[i+2UL] = sum( xmm3 ) * scalar;
3229 
3230  for( ; remainder && j<jend; ++j ) {
3231  y[i ] += A(i ,j) * x[j] * scalar;
3232  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3233  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3234  }
3235  }
3236 
      // ---- Blocks of 2 rows ----
3237  for( ; (i+2UL) <= M; i+=2UL )
3238  {
3239  const size_t jbegin( ( IsUpper<MT1>::value )
3240  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3241  :( 0UL ) );
3242  const size_t jend( ( IsLower<MT1>::value )
3243  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3244  :( N ) );
3245  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3246 
3247  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3248  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3249 
3250  SIMDType xmm1, xmm2;
3251  size_t j( jbegin );
3252 
3253  for( ; j<jpos; j+=SIMDSIZE ) {
3254  const SIMDType x1( x.load(j) );
3255  xmm1 += A.load(i ,j) * x1;
3256  xmm2 += A.load(i+1UL,j) * x1;
3257  }
3258 
3259  y[i ] = sum( xmm1 ) * scalar;
3260  y[i+1UL] = sum( xmm2 ) * scalar;
3261 
3262  for( ; remainder && j<jend; ++j ) {
3263  y[i ] += A(i ,j) * x[j] * scalar;
3264  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3265  }
3266  }
3267 
      // ---- Single leftover row (at most one remains after the 2-row loop) ----
3268  if( i < M )
3269  {
3270  const size_t jbegin( ( IsUpper<MT1>::value )
3271  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3272  :( 0UL ) );
3273  const size_t jend( ( IsLower<MT1>::value )
3274  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3275  :( N ) );
3276  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3277 
3278  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3279  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3280 
3281  SIMDType xmm1;
3282  size_t j( jbegin );
3283 
3284  for( ; j<jpos; j+=SIMDSIZE ) {
3285  xmm1 += A.load(i,j) * x.load(j);
3286  }
3287 
3288  y[i] = sum( xmm1 ) * scalar;
3289 
3290  for( ; remainder && j<jend; ++j ) {
3291  y[i] += A(i,j) * x[j] * scalar;
3292  }
3293  }
3294  }
3295  //**********************************************************************************************
3296 
3297  //**Default assignment to dense vectors (large matrices)****************************************
// Fallback overload of the large-matrix assignment kernel. It does no computation of its own
// and forwards all four arguments unchanged to selectDefaultAssignKernel().
//   y      - left-hand side target vector
//   A      - left-hand side matrix operand
//   x      - right-hand side vector operand
//   scalar - scalar value of the scaled expression
// NOTE(review): the return-type line (listing line 3315) is not visible in this listing;
// presumably it holds the condition that separates this overload from the vectorized one
// that follows - confirm against the original header.
3311  template< typename VT1 // Type of the left-hand side target vector
3312  , typename MT1 // Type of the left-hand side matrix operand
3313  , typename VT2 // Type of the right-hand side vector operand
3314  , typename ST2 > // Type of the scalar value
3316  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3317  {
3318  selectDefaultAssignKernel( y, A, x, scalar );
3319  }
3320  //**********************************************************************************************
3321 
3322  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Vectorized overload of the large-matrix assignment kernel. y is first cleared with reset();
// rows are then processed in blocks of 8, 4 and 2 rows plus one leftover row (there is no
// 3-row block here). Within a block the column loop is unrolled over four, then two, then one
// SIMDSIZE-wide chunk; each step adds the sum() of the chunk products directly to y. A scalar
// loop covers the columns [jpos,jend) when `remainder` is set, and the finished rows of the
// block are multiplied by scalar once at the end.
//   y      - left-hand side target vector (reset, then accumulated into)
//   A      - left-hand side matrix operand
//   x      - right-hand side vector operand
//   scalar - scalar value of the scaled expression
// NOTE(review): the return-type line (listing line 3340) is not visible in this listing;
// presumably it holds the condition that selects this overload - confirm against the header.
3336  template< typename VT1 // Type of the left-hand side target vector
3337  , typename MT1 // Type of the left-hand side matrix operand
3338  , typename VT2 // Type of the right-hand side vector operand
3339  , typename ST2 > // Type of the scalar value
3341  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3342  {
      // True unless both MT1 and VT2 are padded; it enables the scalar clean-up loops below.
3343  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3344 
3345  const size_t M( A.rows() );
3346  const size_t N( A.columns() );
3347 
      // All later updates of y are +=, so the target has to be cleared first.
3348  reset( y );
3349 
3350  size_t i( 0UL );
3351 
      // ---- Blocks of 8 rows ----
3352  for( ; (i+8UL) <= M; i+=8UL )
3353  {
      // Column range of this row block. Upper matrices start at the block's first row index
      // (i+1 if strictly upper), masked with size_t(-SIMDSIZE); lower matrices end one past
      // the block's last row index (one less if strictly lower). Otherwise the range is [0,N).
3354  const size_t jbegin( ( IsUpper<MT1>::value )
3355  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3356  :( 0UL ) );
3357  const size_t jend( ( IsLower<MT1>::value )
3358  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3359  :( N ) );
3360  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3361 
      // End of the SIMD loops: with a remainder this is jend masked with size_t(-SIMDSIZE)
      // (asserted below to equal jend rounded down to a multiple of SIMDSIZE), else jend.
3362  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3363  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3364 
3365  size_t j( jbegin );
3366 
      // Four SIMD chunks per iteration; runs while the fourth chunk still starts before jpos.
3367  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3368  const size_t j1( j+SIMDSIZE );
3369  const size_t j2( j+SIMDSIZE*2UL );
3370  const size_t j3( j+SIMDSIZE*3UL );
3371  const SIMDType x1( x.load(j ) );
3372  const SIMDType x2( x.load(j1) );
3373  const SIMDType x3( x.load(j2) );
3374  const SIMDType x4( x.load(j3) );
3375  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3376  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3377  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3378  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3379  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3380  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3381  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3382  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
3383  }
3384 
      // Two SIMD chunks per iteration.
3385  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3386  const size_t j1( j+SIMDSIZE );
3387  const SIMDType x1( x.load(j ) );
3388  const SIMDType x2( x.load(j1) );
3389  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3390  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3391  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3392  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3393  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3394  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3395  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3396  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
3397  }
3398 
      // One SIMD chunk per iteration.
3399  for( ; j<jpos; j+=SIMDSIZE ) {
3400  const SIMDType x1( x.load(j) );
3401  y[i ] += sum( A.load(i ,j) * x1 );
3402  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3403  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3404  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3405  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
3406  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
3407  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
3408  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
3409  }
3410 
      // Scalar clean-up for columns [jpos,jend); unscaled, since scaling happens below.
3411  for( ; remainder && j<jend; ++j ) {
3412  y[i ] += A(i ,j) * x[j];
3413  y[i+1UL] += A(i+1UL,j) * x[j];
3414  y[i+2UL] += A(i+2UL,j) * x[j];
3415  y[i+3UL] += A(i+3UL,j) * x[j];
3416  y[i+4UL] += A(i+4UL,j) * x[j];
3417  y[i+5UL] += A(i+5UL,j) * x[j];
3418  y[i+6UL] += A(i+6UL,j) * x[j];
3419  y[i+7UL] += A(i+7UL,j) * x[j];
3420  }
3421 
      // Apply the scalar once per finished row.
3422  y[i ] *= scalar;
3423  y[i+1UL] *= scalar;
3424  y[i+2UL] *= scalar;
3425  y[i+3UL] *= scalar;
3426  y[i+4UL] *= scalar;
3427  y[i+5UL] *= scalar;
3428  y[i+6UL] *= scalar;
3429  y[i+7UL] *= scalar;
3430  }
3431 
      // ---- Blocks of 4 rows (same scheme; jend offsets adapted to the block height) ----
3432  for( ; (i+4UL) <= M; i+=4UL )
3433  {
3434  const size_t jbegin( ( IsUpper<MT1>::value )
3435  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3436  :( 0UL ) );
3437  const size_t jend( ( IsLower<MT1>::value )
3438  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3439  :( N ) );
3440  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3441 
3442  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3443  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3444 
3445  size_t j( jbegin );
3446 
3447  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3448  const size_t j1( j+SIMDSIZE );
3449  const size_t j2( j+SIMDSIZE*2UL );
3450  const size_t j3( j+SIMDSIZE*3UL );
3451  const SIMDType x1( x.load(j ) );
3452  const SIMDType x2( x.load(j1) );
3453  const SIMDType x3( x.load(j2) );
3454  const SIMDType x4( x.load(j3) );
3455  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3456  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3457  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3458  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3459  }
3460 
3461  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3462  const size_t j1( j+SIMDSIZE );
3463  const SIMDType x1( x.load(j ) );
3464  const SIMDType x2( x.load(j1) );
3465  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3466  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3467  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3468  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3469  }
3470 
3471  for( ; j<jpos; j+=SIMDSIZE ) {
3472  const SIMDType x1( x.load(j) );
3473  y[i ] += sum( A.load(i ,j) * x1 );
3474  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3475  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3476  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3477  }
3478 
3479  for( ; remainder && j<jend; ++j ) {
3480  y[i ] += A(i ,j) * x[j];
3481  y[i+1UL] += A(i+1UL,j) * x[j];
3482  y[i+2UL] += A(i+2UL,j) * x[j];
3483  y[i+3UL] += A(i+3UL,j) * x[j];
3484  }
3485 
3486  y[i ] *= scalar;
3487  y[i+1UL] *= scalar;
3488  y[i+2UL] *= scalar;
3489  y[i+3UL] *= scalar;
3490  }
3491 
      // ---- Blocks of 2 rows ----
3492  for( ; (i+2UL) <= M; i+=2UL )
3493  {
3494  const size_t jbegin( ( IsUpper<MT1>::value )
3495  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3496  :( 0UL ) );
3497  const size_t jend( ( IsLower<MT1>::value )
3498  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3499  :( N ) );
3500  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3501 
3502  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3503  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3504 
3505  size_t j( jbegin );
3506 
3507  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3508  const size_t j1( j+SIMDSIZE );
3509  const size_t j2( j+SIMDSIZE*2UL );
3510  const size_t j3( j+SIMDSIZE*3UL );
3511  const SIMDType x1( x.load(j ) );
3512  const SIMDType x2( x.load(j1) );
3513  const SIMDType x3( x.load(j2) );
3514  const SIMDType x4( x.load(j3) );
3515  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3516  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3517  }
3518 
3519  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3520  const size_t j1( j+SIMDSIZE );
3521  const SIMDType x1( x.load(j ) );
3522  const SIMDType x2( x.load(j1) );
3523  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3524  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3525  }
3526 
3527  for( ; j<jpos; j+=SIMDSIZE ) {
3528  const SIMDType x1( x.load(j) );
3529  y[i ] += sum( A.load(i ,j) * x1 );
3530  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3531  }
3532 
3533  for( ; remainder && j<jend; ++j ) {
3534  y[i ] += A(i ,j) * x[j];
3535  y[i+1UL] += A(i+1UL,j) * x[j];
3536  }
3537 
3538  y[i ] *= scalar;
3539  y[i+1UL] *= scalar;
3540  }
3541 
      // ---- Single leftover row (at most one remains after the 2-row loop) ----
3542  if( i < M )
3543  {
3544  const size_t jbegin( ( IsUpper<MT1>::value )
3545  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3546  :( 0UL ) );
3547  const size_t jend( ( IsLower<MT1>::value )
3548  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3549  :( N ) );
3550  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3551 
3552  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3553  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3554 
3555  size_t j( jbegin );
3556 
3557  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3558  const size_t j1( j+SIMDSIZE );
3559  const size_t j2( j+SIMDSIZE*2UL );
3560  const size_t j3( j+SIMDSIZE*3UL );
3561  const SIMDType x1( x.load(j ) );
3562  const SIMDType x2( x.load(j1) );
3563  const SIMDType x3( x.load(j2) );
3564  const SIMDType x4( x.load(j3) );
3565  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3566  }
3567 
3568  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3569  const size_t j1( j+SIMDSIZE );
3570  const SIMDType x1( x.load(j ) );
3571  const SIMDType x2( x.load(j1) );
3572  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3573  }
3574 
3575  for( ; j<jpos; j+=SIMDSIZE ) {
3576  const SIMDType x1( x.load(j) );
3577  y[i] += sum( A.load(i,j) * x1 );
3578  }
3579 
3580  for( ; remainder && j<jend; ++j ) {
3581  y[i] += A(i,j) * x[j];
3582  }
3583 
3584  y[i] *= scalar;
3585  }
3586  }
3587  //**********************************************************************************************
3588 
3589  //**BLAS-based assignment to dense vectors (default)********************************************
// Fallback overload of the BLAS assignment kernel. It makes no BLAS call and forwards all four
// arguments unchanged to selectLargeAssignKernel().
//   y      - left-hand side target vector
//   A      - left-hand side matrix operand
//   x      - right-hand side vector operand
//   scalar - scalar value of the scaled expression
// NOTE(review): the return-type line (listing line 3607) is not visible in this listing;
// presumably it holds the condition that separates this overload from the BLAS-backed one
// inside the #if block that follows - confirm against the original header.
3603  template< typename VT1 // Type of the left-hand side target vector
3604  , typename MT1 // Type of the left-hand side matrix operand
3605  , typename VT2 // Type of the right-hand side vector operand
3606  , typename ST2 > // Type of the scalar value
3608  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3609  {
3610  selectLargeAssignKernel( y, A, x, scalar );
3611  }
3612  //**********************************************************************************************
3613 
3614  //**BLAS-based assignment to dense vectors******************************************************
3615 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3616 
// BLAS-backed overload of the assignment kernel; it is only compiled when the surrounding
// preprocessor condition (BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION) holds.
//   y      - left-hand side target vector
//   A      - left-hand side matrix operand
//   x      - right-hand side vector operand
//   scalar - scalar value of the scaled expression
// NOTE(review): the return-type line (listing line 3633) is not visible in this listing;
// presumably it holds the condition that selects this overload - confirm against the header.
3629  template< typename VT1 // Type of the left-hand side target vector
3630  , typename MT1 // Type of the left-hand side matrix operand
3631  , typename VT2 // Type of the right-hand side vector operand
3632  , typename ST2 > // Type of the scalar value
3634  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3635  {
      // Element type of the target vector; the gemv coefficients are converted to it.
3636  using ET = ElementType_<VT1>;
3637 
      // Triangular matrices: y is first set to scalar * x, then passed to trmv() together with
      // A and the lower/upper flag.
      // NOTE(review): presumably trmv() multiplies y by A in place - confirm in blas/trmv.h.
3638  if( IsTriangular<MT1>::value ) {
3639  assign( y, scalar * x );
3640  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3641  }
      // All other matrices: a single gemv() call with the coefficients ET(scalar) and ET(0).
      // NOTE(review): presumably this computes y = scalar*A*x + 0*y - confirm in blas/gemv.h.
3642  else {
3643  gemv( y, A, x, ET(scalar), ET(0) );
3644  }
3645  }
3646 #endif
3647  //**********************************************************************************************
3648 
3649  //**Assignment to sparse vectors****************************************************************
// Assignment of the scaled matrix-vector product to a sparse vector. The expression is first
// evaluated serially into a temporary of type ResultType, and the temporary is then assigned
// to the target.
//   lhs - target sparse vector; its size must equal rhs.size() (checked by internal assert)
//   rhs - the scaled multiplication expression to be assigned
// NOTE(review): listing lines 3664-3669 are not visible here; anything that stood there
// (e.g. compile-time checks) cannot be documented from this listing.
3661  template< typename VT1 > // Type of the target sparse vector
3662  friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3663  {
3665 
3669 
3670  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3671 
3672  const ResultType tmp( serial( rhs ) );
3673  assign( ~lhs, tmp );
3674  }
3675  //**********************************************************************************************
3676 
3677  //**Addition assignment to dense vectors********************************************************
// Addition assignment of the scaled matrix-vector product to a dense vector. Both operands of
// the wrapped multiplication are evaluated serially, and the work is then handed to
// selectAddAssignKernel() together with the expression's scalar.
//   lhs - target dense vector; its size must equal rhs.size() (checked by internal assert)
//   rhs - the scaled multiplication expression to be added
// NOTE(review): listing lines 3692-3693 are not fully visible here; anything that stood there
// cannot be documented from this listing.
3689  template< typename VT1 > // Type of the target dense vector
3690  friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3691  {
3693 
3694  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3695 
      // Operands of the matrix-vector multiplication wrapped by this scaled expression.
3696  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
3697  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
3698 
      // Nothing is added when the matrix has no rows or no columns.
3699  if( left.rows() == 0UL || left.columns() == 0UL ) {
3700  return;
3701  }
3702 
3703  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3704  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3705 
3706  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3707  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3708  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3709  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3710 
3711  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
3712  }
3713  //**********************************************************************************************
3714 
3715  //**Addition assignment to dense vectors (kernel selection)*************************************
// Chooses the kernel for the addition assignment y += A * x * scalar.
// The small kernel is used if any of the following holds:
//   - MT1 is a diagonal matrix type,
//   - MT is a computation and evaluateMatrix is false (MT is not a template parameter of this
//     function; both names come from the enclosing scope),
//   - the matrix has fewer than DMATDVECMULT_THRESHOLD elements (rows * columns).
// Otherwise the call is forwarded to selectBlasAddAssignKernel().
//   y      - left-hand side target vector
//   A      - left-hand side matrix operand
//   x      - right-hand side vector operand
//   scalar - scalar value of the scaled expression
3726  template< typename VT1 // Type of the left-hand side target vector
3727  , typename MT1 // Type of the left-hand side matrix operand
3728  , typename VT2 // Type of the right-hand side vector operand
3729  , typename ST2 > // Type of the scalar value
3730  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3731  {
3732  if( ( IsDiagonal<MT1>::value ) ||
3733  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3734  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3735  selectSmallAddAssignKernel( y, A, x, scalar );
3736  else
3737  selectBlasAddAssignKernel( y, A, x, scalar );
3738  }
3739  //**********************************************************************************************
3740 
3741  //**Default addition assignment to dense vectors************************************************
// Default addition-assignment kernel: builds the expression A * x * scalar and passes it to
// the target vector's own addAssign() member.
//   y      - left-hand side target vector
//   A      - left-hand side matrix operand
//   x      - right-hand side vector operand
//   scalar - scalar value of the scaled expression
3755  template< typename VT1 // Type of the left-hand side target vector
3756  , typename MT1 // Type of the left-hand side matrix operand
3757  , typename VT2 // Type of the right-hand side vector operand
3758  , typename ST2 > // Type of the scalar value
3759  static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3760  {
3761  y.addAssign( A * x * scalar );
3762  }
3763  //**********************************************************************************************
3764 
3765  //**Default addition assignment to dense vectors (small matrices)*******************************
// Fallback overload of the small-matrix addition-assignment kernel. It does no computation of
// its own and forwards all four arguments unchanged to selectDefaultAddAssignKernel().
//   y      - left-hand side target vector
//   A      - left-hand side matrix operand
//   x      - right-hand side vector operand
//   scalar - scalar value of the scaled expression
// NOTE(review): the return-type line (listing line 3783) is not visible in this listing;
// presumably it holds the condition that separates this overload from the vectorized one
// that follows - confirm against the original header.
3779  template< typename VT1 // Type of the left-hand side target vector
3780  , typename MT1 // Type of the left-hand side matrix operand
3781  , typename VT2 // Type of the right-hand side vector operand
3782  , typename ST2 > // Type of the scalar value
3784  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3785  {
3786  selectDefaultAddAssignKernel( y, A, x, scalar );
3787  }
3788  //**********************************************************************************************
3789 
3790  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized overload of the small-matrix addition-assignment kernel. For every row i it adds
// to y[i] the sum of A(i,j) * x[j] over the column range [jbegin,jend) chosen for the row
// block, multiplied by scalar. Unlike the plain assignment kernel, the reduced sums are added
// to y (+=) instead of overwriting it.
// Rows are handled in blocks of 8, then 4, 3 and 2 rows, and finally one leftover row. Per
// block, one SIMD accumulator per row collects A.load(i,j) * x.load(j) over SIMDSIZE-wide
// column chunks; a scalar loop then adds the columns the SIMD loop did not cover.
//   y      - left-hand side target vector (accumulated into)
//   A      - left-hand side matrix operand
//   x      - right-hand side vector operand
//   scalar - scalar value of the scaled expression
// NOTE(review): the return-type line (listing line 3808) is not visible in this listing;
// presumably it holds the condition that selects this overload - confirm against the header.
3804  template< typename VT1 // Type of the left-hand side target vector
3805  , typename MT1 // Type of the left-hand side matrix operand
3806  , typename VT2 // Type of the right-hand side vector operand
3807  , typename ST2 > // Type of the scalar value
3809  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3810  {
      // True unless both MT1 and VT2 are padded; it enables the scalar clean-up loops below.
3811  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3812 
3813  const size_t M( A.rows() );
3814  const size_t N( A.columns() );
3815 
3816  size_t i( 0UL );
3817 
      // ---- Blocks of 8 rows ----
3818  for( ; (i+8UL) <= M; i+=8UL )
3819  {
      // Column range of this row block. Upper matrices start at the block's first row index
      // (i+1 if strictly upper), masked with size_t(-SIMDSIZE); lower matrices end one past
      // the block's last row index (one less if strictly lower). Otherwise the range is [0,N).
3820  const size_t jbegin( ( IsUpper<MT1>::value )
3821  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3822  :( 0UL ) );
3823  const size_t jend( ( IsLower<MT1>::value )
3824  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3825  :( N ) );
3826  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3827 
      // End of the SIMD loop: with a remainder this is jend masked with size_t(-SIMDSIZE)
      // (asserted below to equal jend rounded down to a multiple of SIMDSIZE), else jend.
3828  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3829  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3830 
      // One accumulator per row, only ever updated with +=.
      // NOTE(review): this relies on a default-constructed SIMDType being zero - confirm.
3831  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3832  size_t j( jbegin );
3833 
      // SIMD part: the chunk of x is loaded once and reused for all eight rows.
3834  for( ; j<jpos; j+=SIMDSIZE ) {
3835  const SIMDType x1( x.load(j) );
3836  xmm1 += A.load(i ,j) * x1;
3837  xmm2 += A.load(i+1UL,j) * x1;
3838  xmm3 += A.load(i+2UL,j) * x1;
3839  xmm4 += A.load(i+3UL,j) * x1;
3840  xmm5 += A.load(i+4UL,j) * x1;
3841  xmm6 += A.load(i+5UL,j) * x1;
3842  xmm7 += A.load(i+6UL,j) * x1;
3843  xmm8 += A.load(i+7UL,j) * x1;
3844  }
3845 
      // Reduce each accumulator and add the scaled sums to the existing target elements.
3846  y[i ] += sum( xmm1 ) * scalar;
3847  y[i+1UL] += sum( xmm2 ) * scalar;
3848  y[i+2UL] += sum( xmm3 ) * scalar;
3849  y[i+3UL] += sum( xmm4 ) * scalar;
3850  y[i+4UL] += sum( xmm5 ) * scalar;
3851  y[i+5UL] += sum( xmm6 ) * scalar;
3852  y[i+6UL] += sum( xmm7 ) * scalar;
3853  y[i+7UL] += sum( xmm8 ) * scalar;
3854 
      // Scalar clean-up for columns [jpos,jend).
3855  for( ; remainder && j<jend; ++j ) {
3856  y[i ] += A(i ,j) * x[j] * scalar;
3857  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3858  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3859  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3860  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3861  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3862  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3863  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3864  }
3865  }
3866 
      // ---- Blocks of 4 rows (same scheme; jend offsets adapted to the block height) ----
3867  for( ; (i+4UL) <= M; i+=4UL )
3868  {
3869  const size_t jbegin( ( IsUpper<MT1>::value )
3870  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3871  :( 0UL ) );
3872  const size_t jend( ( IsLower<MT1>::value )
3873  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3874  :( N ) );
3875  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3876 
3877  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3878  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3879 
3880  SIMDType xmm1, xmm2, xmm3, xmm4;
3881  size_t j( jbegin );
3882 
3883  for( ; j<jpos; j+=SIMDSIZE ) {
3884  const SIMDType x1( x.load(j) );
3885  xmm1 += A.load(i ,j) * x1;
3886  xmm2 += A.load(i+1UL,j) * x1;
3887  xmm3 += A.load(i+2UL,j) * x1;
3888  xmm4 += A.load(i+3UL,j) * x1;
3889  }
3890 
3891  y[i ] += sum( xmm1 ) * scalar;
3892  y[i+1UL] += sum( xmm2 ) * scalar;
3893  y[i+2UL] += sum( xmm3 ) * scalar;
3894  y[i+3UL] += sum( xmm4 ) * scalar;
3895 
3896  for( ; remainder && j<jend; ++j ) {
3897  y[i ] += A(i ,j) * x[j] * scalar;
3898  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3899  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3900  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3901  }
3902  }
3903 
      // ---- Blocks of 3 rows ----
3904  for( ; (i+3UL) <= M; i+=3UL )
3905  {
3906  const size_t jbegin( ( IsUpper<MT1>::value )
3907  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3908  :( 0UL ) );
3909  const size_t jend( ( IsLower<MT1>::value )
3910  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3911  :( N ) );
3912  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3913 
3914  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3915  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3916 
3917  SIMDType xmm1, xmm2, xmm3;
3918  size_t j( jbegin );
3919 
3920  for( ; j<jpos; j+=SIMDSIZE ) {
3921  const SIMDType x1( x.load(j) );
3922  xmm1 += A.load(i ,j) * x1;
3923  xmm2 += A.load(i+1UL,j) * x1;
3924  xmm3 += A.load(i+2UL,j) * x1;
3925  }
3926 
3927  y[i ] += sum( xmm1 ) * scalar;
3928  y[i+1UL] += sum( xmm2 ) * scalar;
3929  y[i+2UL] += sum( xmm3 ) * scalar;
3930 
3931  for( ; remainder && j<jend; ++j ) {
3932  y[i ] += A(i ,j) * x[j] * scalar;
3933  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3934  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3935  }
3936  }
3937 
      // ---- Blocks of 2 rows ----
3938  for( ; (i+2UL) <= M; i+=2UL )
3939  {
3940  const size_t jbegin( ( IsUpper<MT1>::value )
3941  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3942  :( 0UL ) );
3943  const size_t jend( ( IsLower<MT1>::value )
3944  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3945  :( N ) );
3946  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3947 
3948  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3949  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3950 
3951  SIMDType xmm1, xmm2;
3952  size_t j( jbegin );
3953 
3954  for( ; j<jpos; j+=SIMDSIZE ) {
3955  const SIMDType x1( x.load(j) );
3956  xmm1 += A.load(i ,j) * x1;
3957  xmm2 += A.load(i+1UL,j) * x1;
3958  }
3959 
3960  y[i ] += sum( xmm1 ) * scalar;
3961  y[i+1UL] += sum( xmm2 ) * scalar;
3962 
3963  for( ; remainder && j<jend; ++j ) {
3964  y[i ] += A(i ,j) * x[j] * scalar;
3965  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3966  }
3967  }
3968 
      // ---- Single leftover row (at most one remains after the 2-row loop) ----
3969  if( i < M )
3970  {
3971  const size_t jbegin( ( IsUpper<MT1>::value )
3972  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3973  :( 0UL ) );
3974  const size_t jend( ( IsLower<MT1>::value )
3975  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3976  :( N ) );
3977  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3978 
3979  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3980  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3981 
3982  SIMDType xmm1;
3983  size_t j( jbegin );
3984 
3985  for( ; j<jpos; j+=SIMDSIZE ) {
3986  xmm1 += A.load(i,j) * x.load(j);
3987  }
3988 
3989  y[i] += sum( xmm1 ) * scalar;
3990 
3991  for( ; remainder && j<jend; ++j ) {
3992  y[i] += A(i,j) * x[j] * scalar;
3993  }
3994  }
3995  }
3996  //**********************************************************************************************
3997 
3998  //**Default addition assignment to dense vectors (large matrices)*******************************
// Fallback overload of the large-matrix addition-assignment kernel. It does no computation of
// its own and forwards all four arguments unchanged to selectDefaultAddAssignKernel().
//   y      - left-hand side target vector
//   A      - left-hand side matrix operand
//   x      - right-hand side vector operand
//   scalar - scalar value of the scaled expression
// NOTE(review): the return-type line (listing line 4016) is not visible in this listing;
// presumably it holds the condition that separates this overload from the vectorized one
// that follows - confirm against the original header.
4012  template< typename VT1 // Type of the left-hand side target vector
4013  , typename MT1 // Type of the left-hand side matrix operand
4014  , typename VT2 // Type of the right-hand side vector operand
4015  , typename ST2 > // Type of the scalar value
4017  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4018  {
4019  selectDefaultAddAssignKernel( y, A, x, scalar );
4020  }
4021  //**********************************************************************************************
4022 
4023  //**Vectorized default addition assignment to dense vectors (large matrices)********************
4037  template< typename VT1 // Type of the left-hand side target vector
4038  , typename MT1 // Type of the left-hand side matrix operand
4039  , typename VT2 // Type of the right-hand side vector operand
4040  , typename ST2 > // Type of the scalar value
4042  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4043  {
4044  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4045 
4046  const size_t M( A.rows() );
4047  const size_t N( A.columns() );
4048 
4049  size_t i( 0UL );
4050 
4051  for( ; (i+8UL) <= M; i+=8UL )
4052  {
4053  const size_t jbegin( ( IsUpper<MT1>::value )
4054  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4055  :( 0UL ) );
4056  const size_t jend( ( IsLower<MT1>::value )
4057  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4058  :( N ) );
4059  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4060 
4061  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4062  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4063 
4064  size_t j( jbegin );
4065 
4066  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4067  const size_t j1( j+SIMDSIZE );
4068  const size_t j2( j+SIMDSIZE*2UL );
4069  const size_t j3( j+SIMDSIZE*3UL );
4070  const SIMDType x1( x.load(j ) );
4071  const SIMDType x2( x.load(j1) );
4072  const SIMDType x3( x.load(j2) );
4073  const SIMDType x4( x.load(j3) );
4074  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4075  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4076  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4077  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4078  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4079  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4080  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4081  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4082  }
4083 
4084  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4085  const size_t j1( j+SIMDSIZE );
4086  const SIMDType x1( x.load(j ) );
4087  const SIMDType x2( x.load(j1) );
4088  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4089  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4090  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4091  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4092  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4093  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4094  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4095  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4096  }
4097 
4098  for( ; j<jpos; j+=SIMDSIZE ) {
4099  const SIMDType x1( x.load(j) );
4100  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4101  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4102  y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
4103  y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
4104  y[i+4UL] += sum( A.load(i+4UL,j) * x1 ) * scalar;
4105  y[i+5UL] += sum( A.load(i+5UL,j) * x1 ) * scalar;
4106  y[i+6UL] += sum( A.load(i+6UL,j) * x1 ) * scalar;
4107  y[i+7UL] += sum( A.load(i+7UL,j) * x1 ) * scalar;
4108  }
4109 
4110  for( ; remainder && j<jend; ++j ) {
4111  y[i ] += A(i ,j) * x[j] * scalar;
4112  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4113  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4114  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4115  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4116  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4117  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4118  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
4119  }
4120  }
4121 
4122  for( ; (i+4UL) <= M; i+=4UL )
4123  {
4124  const size_t jbegin( ( IsUpper<MT1>::value )
4125  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4126  :( 0UL ) );
4127  const size_t jend( ( IsLower<MT1>::value )
4128  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4129  :( N ) );
4130  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4131 
4132  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4133  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4134 
4135  size_t j( jbegin );
4136 
4137  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4138  const size_t j1( j+SIMDSIZE );
4139  const size_t j2( j+SIMDSIZE*2UL );
4140  const size_t j3( j+SIMDSIZE*3UL );
4141  const SIMDType x1( x.load(j ) );
4142  const SIMDType x2( x.load(j1) );
4143  const SIMDType x3( x.load(j2) );
4144  const SIMDType x4( x.load(j3) );
4145  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4146  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4147  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4148  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4149  }
4150 
4151  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4152  const size_t j1( j+SIMDSIZE );
4153  const SIMDType x1( x.load(j ) );
4154  const SIMDType x2( x.load(j1) );
4155  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4156  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4157  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4158  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4159  }
4160 
4161  for( ; j<jpos; j+=SIMDSIZE ) {
4162  const SIMDType x1( x.load(j) );
4163  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4164  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4165  y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
4166  y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
4167  }
4168 
4169  for( ; remainder && j<jend; ++j ) {
4170  y[i ] += A(i ,j) * x[j] * scalar;
4171  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4172  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4173  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4174  }
4175  }
4176 
4177  for( ; (i+2UL) <= M; i+=2UL )
4178  {
4179  const size_t jbegin( ( IsUpper<MT1>::value )
4180  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4181  :( 0UL ) );
4182  const size_t jend( ( IsLower<MT1>::value )
4183  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4184  :( N ) );
4185  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4186 
4187  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4188  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4189 
4190  size_t j( jbegin );
4191 
4192  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4193  const size_t j1( j+SIMDSIZE );
4194  const size_t j2( j+SIMDSIZE*2UL );
4195  const size_t j3( j+SIMDSIZE*3UL );
4196  const SIMDType x1( x.load(j ) );
4197  const SIMDType x2( x.load(j1) );
4198  const SIMDType x3( x.load(j2) );
4199  const SIMDType x4( x.load(j3) );
4200  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4201  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4202  }
4203 
4204  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4205  const size_t j1( j+SIMDSIZE );
4206  const SIMDType x1( x.load(j ) );
4207  const SIMDType x2( x.load(j1) );
4208  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4209  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4210  }
4211 
4212  for( ; j<jpos; j+=SIMDSIZE ) {
4213  const SIMDType x1( x.load(j) );
4214  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4215  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4216  }
4217 
4218  for( ; remainder && j<jend; ++j ) {
4219  y[i ] += A(i ,j) * x[j] * scalar;
4220  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4221  }
4222  }
4223 
4224  if( i < M )
4225  {
4226  const size_t jbegin( ( IsUpper<MT1>::value )
4227  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4228  :( 0UL ) );
4229  const size_t jend( ( IsLower<MT1>::value )
4230  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4231  :( N ) );
4232  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4233 
4234  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4235  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4236 
4237  size_t j( jbegin );
4238 
4239  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4240  const size_t j1( j+SIMDSIZE );
4241  const size_t j2( j+SIMDSIZE*2UL );
4242  const size_t j3( j+SIMDSIZE*3UL );
4243  const SIMDType x1( x.load(j ) );
4244  const SIMDType x2( x.load(j1) );
4245  const SIMDType x3( x.load(j2) );
4246  const SIMDType x4( x.load(j3) );
4247  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4248  }
4249 
4250  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4251  const size_t j1( j+SIMDSIZE );
4252  const SIMDType x1( x.load(j ) );
4253  const SIMDType x2( x.load(j1) );
4254  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4255  }
4256 
4257  for( ; j<jpos; j+=SIMDSIZE ) {
4258  const SIMDType x1( x.load(j) );
4259  y[i] += sum( A.load(i,j) * x1 ) * scalar;
4260  }
4261 
4262  for( ; remainder && j<jend; ++j ) {
4263  y[i] += A(i,j) * x[j] * scalar;
4264  }
4265  }
4266  }
4267  //**********************************************************************************************
4268 
4269  //**BLAS-based addition assignment to dense vectors (default)***********************************
// Fallback overload of the BLAS-based addition assignment kernel for the scaled
// matrix/vector product: it performs no BLAS call itself and simply forwards all
// arguments to selectLargeAddAssignKernel().
// NOTE(review): the return-type line (listing line 4287) is elided from this listing;
// presumably it carries the SFINAE condition that selects this overload when the BLAS
// kernel is not applicable -- confirm against the full header.
4283  template< typename VT1 // Type of the left-hand side target vector
4284  , typename MT1 // Type of the left-hand side matrix operand
4285  , typename VT2 // Type of the right-hand side vector operand
4286  , typename ST2 > // Type of the scalar value
4288  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4289  {
 // Delegate to the large-matrix kernel with unchanged arguments.
4290  selectLargeAddAssignKernel( y, A, x, scalar );
4291  }
4292  //**********************************************************************************************
4293 
4294  //**BLAS-based addition assignment to dense vectors*********************************************
4295 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4296 
// BLAS-based addition assignment kernel for the scaled matrix/vector product
// (only compiled when BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION).
//   y      : target vector that is updated
//   A      : left-hand side matrix operand
//   x      : right-hand side vector operand
//   scalar : scaling factor of the product
// NOTE(review): the return-type line (listing line 4313) is elided from this listing;
// presumably it restricts this overload to BLAS-compatible operand types -- confirm.
4309  template< typename VT1 // Type of the left-hand side target vector
4310  , typename MT1 // Type of the left-hand side matrix operand
4311  , typename VT2 // Type of the right-hand side vector operand
4312  , typename ST2 > // Type of the scalar value
4314  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4315  {
4316  using ET = ElementType_<VT1>;
4317 
 // Triangular matrices: build tmp = scalar * x (evaluated serially), hand it to
 // trmv() together with A and a lower/upper flag chosen via IsLower<MT1>, then add
 // the result to y. Presumably trmv() overwrites tmp with A * tmp -- confirm
 // against the trmv wrapper.
4318  if( IsTriangular<MT1>::value ) {
4319  ResultType_<VT1> tmp( serial( scalar * x ) );
4320  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4321  addAssign( y, tmp );
4322  }
 // General matrices: single gemv() call; the scalar is converted to the element
 // type of the target vector and ET(1) is passed as the last argument (presumably
 // the factor applied to the existing content of y -- confirm).
4323  else {
4324  gemv( y, A, x, ET(scalar), ET(1) );
4325  }
4326  }
4327 #endif
4328  //**********************************************************************************************
4329 
4330  //**Addition assignment to sparse vectors*******************************************************
4331  // No special implementation for the addition assignment to sparse vectors.
4332  //**********************************************************************************************
4333 
4334  //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of a scaled dense matrix/dense vector product to a dense
// vector. Extracts the matrix and vector operands from the wrapped product
// (rhs.vector_), evaluates them serially and dispatches to selectSubAssignKernel()
// together with the scalar rhs.scalar_.
//   lhs : target dense vector
//   rhs : scaled multiplication expression to be subtracted
// NOTE(review): listing line 4349 is elided from this listing (likely a trace macro).
4346  template< typename VT1 > // Type of the target dense vector
4347  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4348  {
4350 
4351  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4352 
4353  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
4354  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
4355 
 // Nothing to subtract if the matrix has no rows or no columns.
4356  if( left.rows() == 0UL || left.columns() == 0UL ) {
4357  return;
4358  }
4359 
4360  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4361  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
4362 
 // Sanity checks: evaluation must not change any operand dimension.
4363  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4364  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4365  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4366  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
4367 
4368  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
4369  }
4370  //**********************************************************************************************
4371 
4372  //**Subtraction assignment to dense vectors (kernel selection)**********************************
// Kernel selection for the subtraction assignment of a scaled matrix/vector product.
// The small kernel is used if any of the following holds:
//   - MT1 is a diagonal matrix type,
//   - MT (the matrix type of the enclosing expression, not MT1) is a computation
//     and evaluateMatrix is false,
//   - the number of matrix elements (rows * columns) is below DMATDVECMULT_THRESHOLD.
// In all other cases the BLAS-based kernel is selected.
4383  template< typename VT1 // Type of the left-hand side target vector
4384  , typename MT1 // Type of the left-hand side matrix operand
4385  , typename VT2 // Type of the right-hand side vector operand
4386  , typename ST2 > // Type of the scalar value
4387  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4388  {
4389  if( ( IsDiagonal<MT1>::value ) ||
4390  ( IsComputation<MT>::value && !evaluateMatrix ) ||
4391  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4392  selectSmallSubAssignKernel( y, A, x, scalar );
4393  else
4394  selectBlasSubAssignKernel( y, A, x, scalar );
4395  }
4396  //**********************************************************************************************
4397 
4398  //**Default subtraction assignment to dense vectors*********************************************
// Default (non-specialized) subtraction assignment kernel: forms the expression
// A * x * scalar and passes it to the subAssign() member function of the target
// vector y.
4412  template< typename VT1 // Type of the left-hand side target vector
4413  , typename MT1 // Type of the left-hand side matrix operand
4414  , typename VT2 // Type of the right-hand side vector operand
4415  , typename ST2 > // Type of the scalar value
4416  static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4417  {
4418  y.subAssign( A * x * scalar );
4419  }
4420  //**********************************************************************************************
4421 
4422  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Small-matrix subtraction assignment kernel, fallback overload: forwards all
// arguments unchanged to selectDefaultSubAssignKernel().
// NOTE(review): the return-type line (listing line 4440) is elided from this listing;
// presumably it carries the SFINAE condition that disables this overload when the
// vectorized kernel applies -- confirm against the full header.
4436  template< typename VT1 // Type of the left-hand side target vector
4437  , typename MT1 // Type of the left-hand side matrix operand
4438  , typename VT2 // Type of the right-hand side vector operand
4439  , typename ST2 > // Type of the scalar value
4441  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4442  {
4443  selectDefaultSubAssignKernel( y, A, x, scalar );
4444  }
4445  //**********************************************************************************************
4446 
4447  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized small-matrix kernel for y -= ( A * x ) * scalar.
// Rows are processed in blocks of 8, 4, 3 and 2 rows, followed by a single trailing
// row. For every row block the column range [jbegin,jend) is traversed with SIMD
// loads of A and x; the products are summed in one SIMD accumulator per row, reduced
// with sum(), scaled and subtracted from y. Columns in [jpos,jend) that are not
// covered by the SIMD loop are handled by a scalar loop when 'remainder' is true.
// NOTE(review): the return-type line (listing line 4465) is elided from this listing;
// presumably it carries the SFINAE condition enabling the vectorized kernel -- confirm.
4461  template< typename VT1 // Type of the left-hand side target vector
4462  , typename MT1 // Type of the left-hand side matrix operand
4463  , typename VT2 // Type of the right-hand side vector operand
4464  , typename ST2 > // Type of the scalar value
4466  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4467  {
 // A scalar remainder loop is required unless both operand types are padded.
4468  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4469 
4470  const size_t M( A.rows() );
4471  const size_t N( A.columns() );
4472 
4473  size_t i( 0UL );
4474 
 // Blocks of 8 rows
4475  for( ; (i+8UL) <= M; i+=8UL )
4476  {
 // Upper matrices: start at the first column of row i (i+1 if strictly upper),
 // masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE
 // (assuming SIMDSIZE is a power of two). Otherwise start at column 0.
4477  const size_t jbegin( ( IsUpper<MT1>::value )
4478  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4479  :( 0UL ) );
 // Lower matrices: stop after the last column of the last row in this block
 // (one less if strictly lower). Otherwise traverse all N columns.
4480  const size_t jend( ( IsLower<MT1>::value )
4481  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4482  :( N ) );
4483  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4484 
 // End of the SIMD loop: jend masked down to a multiple of SIMDSIZE if a
 // remainder loop exists, otherwise jend itself.
4485  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4486  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4487 
 // One accumulator per row; they are only ever updated via +=, so this relies
 // on default-constructed SIMDType values being zero -- TODO confirm.
4488  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4489  size_t j( jbegin );
4490 
4491  for( ; j<jpos; j+=SIMDSIZE ) {
4492  const SIMDType x1( x.load(j) );
4493  xmm1 += A.load(i ,j) * x1;
4494  xmm2 += A.load(i+1UL,j) * x1;
4495  xmm3 += A.load(i+2UL,j) * x1;
4496  xmm4 += A.load(i+3UL,j) * x1;
4497  xmm5 += A.load(i+4UL,j) * x1;
4498  xmm6 += A.load(i+5UL,j) * x1;
4499  xmm7 += A.load(i+6UL,j) * x1;
4500  xmm8 += A.load(i+7UL,j) * x1;
4501  }
4502 
 // Horizontal reduction of each accumulator, scaling and subtraction.
4503  y[i ] -= sum( xmm1 ) * scalar;
4504  y[i+1UL] -= sum( xmm2 ) * scalar;
4505  y[i+2UL] -= sum( xmm3 ) * scalar;
4506  y[i+3UL] -= sum( xmm4 ) * scalar;
4507  y[i+4UL] -= sum( xmm5 ) * scalar;
4508  y[i+5UL] -= sum( xmm6 ) * scalar;
4509  y[i+6UL] -= sum( xmm7 ) * scalar;
4510  y[i+7UL] -= sum( xmm8 ) * scalar;
4511 
 // Scalar handling of the columns in [jpos,jend).
4512  for( ; remainder && j<jend; ++j ) {
4513  y[i ] -= A(i ,j) * x[j] * scalar;
4514  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4515  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4516  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4517  y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4518  y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4519  y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4520  y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4521  }
4522  }
4523 
 // Blocks of 4 rows (same scheme as above with four accumulators)
4524  for( ; (i+4UL) <= M; i+=4UL )
4525  {
4526  const size_t jbegin( ( IsUpper<MT1>::value )
4527  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4528  :( 0UL ) );
4529  const size_t jend( ( IsLower<MT1>::value )
4530  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4531  :( N ) );
4532  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4533 
4534  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4535  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4536 
4537  SIMDType xmm1, xmm2, xmm3, xmm4;
4538  size_t j( jbegin );
4539 
4540  for( ; j<jpos; j+=SIMDSIZE ) {
4541  const SIMDType x1( x.load(j) );
4542  xmm1 += A.load(i ,j) * x1;
4543  xmm2 += A.load(i+1UL,j) * x1;
4544  xmm3 += A.load(i+2UL,j) * x1;
4545  xmm4 += A.load(i+3UL,j) * x1;
4546  }
4547 
4548  y[i ] -= sum( xmm1 ) * scalar;
4549  y[i+1UL] -= sum( xmm2 ) * scalar;
4550  y[i+2UL] -= sum( xmm3 ) * scalar;
4551  y[i+3UL] -= sum( xmm4 ) * scalar;
4552 
4553  for( ; remainder && j<jend; ++j ) {
4554  y[i ] -= A(i ,j) * x[j] * scalar;
4555  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4556  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4557  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4558  }
4559  }
4560 
 // Blocks of 3 rows
4561  for( ; (i+3UL) <= M; i+=3UL )
4562  {
4563  const size_t jbegin( ( IsUpper<MT1>::value )
4564  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4565  :( 0UL ) );
4566  const size_t jend( ( IsLower<MT1>::value )
4567  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
4568  :( N ) );
4569  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4570 
4571  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4572  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4573 
4574  SIMDType xmm1, xmm2, xmm3;
4575  size_t j( jbegin );
4576 
4577  for( ; j<jpos; j+=SIMDSIZE ) {
4578  const SIMDType x1( x.load(j) );
4579  xmm1 += A.load(i ,j) * x1;
4580  xmm2 += A.load(i+1UL,j) * x1;
4581  xmm3 += A.load(i+2UL,j) * x1;
4582  }
4583 
4584  y[i ] -= sum( xmm1 ) * scalar;
4585  y[i+1UL] -= sum( xmm2 ) * scalar;
4586  y[i+2UL] -= sum( xmm3 ) * scalar;
4587 
4588  for( ; remainder && j<jend; ++j ) {
4589  y[i ] -= A(i ,j) * x[j] * scalar;
4590  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4591  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4592  }
4593  }
4594 
 // Blocks of 2 rows
4595  for( ; (i+2UL) <= M; i+=2UL )
4596  {
4597  const size_t jbegin( ( IsUpper<MT1>::value )
4598  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4599  :( 0UL ) );
4600  const size_t jend( ( IsLower<MT1>::value )
4601  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4602  :( N ) );
4603  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4604 
4605  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4606  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4607 
4608  SIMDType xmm1, xmm2;
4609  size_t j( jbegin );
4610 
4611  for( ; j<jpos; j+=SIMDSIZE ) {
4612  const SIMDType x1( x.load(j) );
4613  xmm1 += A.load(i ,j) * x1;
4614  xmm2 += A.load(i+1UL,j) * x1;
4615  }
4616 
4617  y[i ] -= sum( xmm1 ) * scalar;
4618  y[i+1UL] -= sum( xmm2 ) * scalar;
4619 
4620  for( ; remainder && j<jend; ++j ) {
4621  y[i ] -= A(i ,j) * x[j] * scalar;
4622  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4623  }
4624  }
4625 
 // Single trailing row (at most one row is left at this point)
4626  if( i < M )
4627  {
4628  const size_t jbegin( ( IsUpper<MT1>::value )
4629  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4630  :( 0UL ) );
4631  const size_t jend( ( IsLower<MT1>::value )
4632  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4633  :( N ) );
4634  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4635 
4636  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4637  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4638 
4639  SIMDType xmm1;
4640  size_t j( jbegin );
4641 
4642  for( ; j<jpos; j+=SIMDSIZE ) {
4643  xmm1 += A.load(i,j) * x.load(j);
4644  }
4645 
4646  y[i] -= sum( xmm1 ) * scalar;
4647 
4648  for( ; remainder && j<jend; ++j ) {
4649  y[i] -= A(i,j) * x[j] * scalar;
4650  }
4651  }
4652  }
4653  //**********************************************************************************************
4654 
4655  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Large-matrix subtraction assignment kernel, fallback overload: forwards all
// arguments unchanged to selectDefaultSubAssignKernel().
// NOTE(review): the return-type line (listing line 4673) is elided from this listing;
// presumably it carries the SFINAE condition that disables this overload when the
// vectorized kernel applies -- confirm against the full header.
4669  template< typename VT1 // Type of the left-hand side target vector
4670  , typename MT1 // Type of the left-hand side matrix operand
4671  , typename VT2 // Type of the right-hand side vector operand
4672  , typename ST2 > // Type of the scalar value
4674  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4675  {
4676  selectDefaultSubAssignKernel( y, A, x, scalar );
4677  }
4678  //**********************************************************************************************
4679 
4680  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized large-matrix kernel for y -= ( A * x ) * scalar.
// Rows are processed in blocks of 8, 4 and 2 rows, followed by a single trailing
// row. Within a row block the column range [jbegin,jpos) is traversed by three SIMD
// loops that consume four, two and one SIMD vector(s) of x per iteration; unlike the
// small kernel, every iteration reduces its partial products with sum() and updates
// y directly. Columns in [jpos,jend) are handled by a scalar loop when 'remainder'
// is true.
// NOTE(review): the return-type line (listing line 4698) is elided from this listing;
// presumably it carries the SFINAE condition enabling the vectorized kernel -- confirm.
4694  template< typename VT1 // Type of the left-hand side target vector
4695  , typename MT1 // Type of the left-hand side matrix operand
4696  , typename VT2 // Type of the right-hand side vector operand
4697  , typename ST2 > // Type of the scalar value
4699  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4700  {
 // A scalar remainder loop is required unless both operand types are padded.
4701  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4702 
4703  const size_t M( A.rows() );
4704  const size_t N( A.columns() );
4705 
4706  size_t i( 0UL );
4707 
 // Blocks of 8 rows
4708  for( ; (i+8UL) <= M; i+=8UL )
4709  {
 // Upper matrices: start at the first column of row i (i+1 if strictly upper),
 // masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE
 // (assuming SIMDSIZE is a power of two). Otherwise start at column 0.
4710  const size_t jbegin( ( IsUpper<MT1>::value )
4711  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4712  :( 0UL ) );
 // Lower matrices: stop after the last column of the last row in this block
 // (one less if strictly lower). Otherwise traverse all N columns.
4713  const size_t jend( ( IsLower<MT1>::value )
4714  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4715  :( N ) );
4716  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4717 
 // End of the SIMD loops: jend masked down to a multiple of SIMDSIZE if a
 // remainder loop exists, otherwise jend itself.
4718  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4719  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4720 
4721  size_t j( jbegin );
4722 
 // Four SIMD vectors of x per iteration
4723  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4724  const size_t j1( j+SIMDSIZE );
4725  const size_t j2( j+SIMDSIZE*2UL );
4726  const size_t j3( j+SIMDSIZE*3UL );
4727  const SIMDType x1( x.load(j ) );
4728  const SIMDType x2( x.load(j1) );
4729  const SIMDType x3( x.load(j2) );
4730  const SIMDType x4( x.load(j3) );
4731  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4732  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4733  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4734  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4735  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4736  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4737  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4738  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4739  }
4740 
 // Two SIMD vectors of x per iteration
4741  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4742  const size_t j1( j+SIMDSIZE );
4743  const SIMDType x1( x.load(j ) );
4744  const SIMDType x2( x.load(j1) );
4745  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4746  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4747  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4748  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4749  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4750  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4751  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4752  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4753  }
4754 
 // One SIMD vector of x per iteration
4755  for( ; j<jpos; j+=SIMDSIZE ) {
4756  const SIMDType x1( x.load(j) );
4757  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4758  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4759  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
4760  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
4761  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 ) * scalar;
4762  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 ) * scalar;
4763  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 ) * scalar;
4764  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 ) * scalar;
4765  }
4766 
 // Scalar handling of the columns in [jpos,jend)
4767  for( ; remainder && j<jend; ++j ) {
4768  y[i ] -= A(i ,j) * x[j] * scalar;
4769  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4770  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4771  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4772  y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4773  y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4774  y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4775  y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4776  }
4777  }
4778 
 // Blocks of 4 rows (same column scheme as above)
4779  for( ; (i+4UL) <= M; i+=4UL )
4780  {
4781  const size_t jbegin( ( IsUpper<MT1>::value )
4782  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4783  :( 0UL ) );
4784  const size_t jend( ( IsLower<MT1>::value )
4785  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4786  :( N ) );
4787  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4788 
4789  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4790  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4791 
4792  size_t j( jbegin );
4793 
4794  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4795  const size_t j1( j+SIMDSIZE );
4796  const size_t j2( j+SIMDSIZE*2UL );
4797  const size_t j3( j+SIMDSIZE*3UL );
4798  const SIMDType x1( x.load(j ) );
4799  const SIMDType x2( x.load(j1) );
4800  const SIMDType x3( x.load(j2) );
4801  const SIMDType x4( x.load(j3) );
4802  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4803  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4804  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4805  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4806  }
4807 
4808  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4809  const size_t j1( j+SIMDSIZE );
4810  const SIMDType x1( x.load(j ) );
4811  const SIMDType x2( x.load(j1) );
4812  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4813  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4814  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4815  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4816  }
4817 
4818  for( ; j<jpos; j+=SIMDSIZE ) {
4819  const SIMDType x1( x.load(j) );
4820  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4821  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4822  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
4823  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
4824  }
4825 
4826  for( ; remainder && j<jend; ++j ) {
4827  y[i ] -= A(i ,j) * x[j] * scalar;
4828  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4829  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4830  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4831  }
4832  }
4833 
 // Blocks of 2 rows
4834  for( ; (i+2UL) <= M; i+=2UL )
4835  {
4836  const size_t jbegin( ( IsUpper<MT1>::value )
4837  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4838  :( 0UL ) );
4839  const size_t jend( ( IsLower<MT1>::value )
4840  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4841  :( N ) );
4842  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4843 
4844  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4845  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4846 
4847  size_t j( jbegin );
4848 
4849  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4850  const size_t j1( j+SIMDSIZE );
4851  const size_t j2( j+SIMDSIZE*2UL );
4852  const size_t j3( j+SIMDSIZE*3UL );
4853  const SIMDType x1( x.load(j ) );
4854  const SIMDType x2( x.load(j1) );
4855  const SIMDType x3( x.load(j2) );
4856  const SIMDType x4( x.load(j3) );
4857  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4858  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4859  }
4860 
4861  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4862  const size_t j1( j+SIMDSIZE );
4863  const SIMDType x1( x.load(j ) );
4864  const SIMDType x2( x.load(j1) );
4865  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4866  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4867  }
4868 
4869  for( ; j<jpos; j+=SIMDSIZE ) {
4870  const SIMDType x1( x.load(j) );
4871  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4872  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4873  }
4874 
4875  for( ; remainder && j<jend; ++j ) {
4876  y[i ] -= A(i ,j) * x[j] * scalar;
4877  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4878  }
4879  }
4880 
 // Single trailing row (at most one row is left at this point)
4881  if( i < M )
4882  {
4883  const size_t jbegin( ( IsUpper<MT1>::value )
4884  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4885  :( 0UL ) );
4886  const size_t jend( ( IsLower<MT1>::value )
4887  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4888  :( N ) );
4889  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4890 
4891  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4892  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4893 
4894  size_t j( jbegin );
4895 
4896  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4897  const size_t j1( j+SIMDSIZE );
4898  const size_t j2( j+SIMDSIZE*2UL );
4899  const size_t j3( j+SIMDSIZE*3UL );
4900  const SIMDType x1( x.load(j ) );
4901  const SIMDType x2( x.load(j1) );
4902  const SIMDType x3( x.load(j2) );
4903  const SIMDType x4( x.load(j3) );
4904  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4905  }
4906 
4907  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4908  const size_t j1( j+SIMDSIZE );
4909  const SIMDType x1( x.load(j ) );
4910  const SIMDType x2( x.load(j1) );
4911  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4912  }
4913 
4914  for( ; j<jpos; j+=SIMDSIZE ) {
4915  const SIMDType x1( x.load(j) );
4916  y[i] -= sum( A.load(i,j) * x1 ) * scalar;
4917  }
4918 
4919  for( ; remainder && j<jend; ++j ) {
4920  y[i] -= A(i,j) * x[j] * scalar;
4921  }
4922  }
4923  }
4924  //**********************************************************************************************
4925 
4926  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback overload of the BLAS-based subtraction assignment kernel for the scaled
// matrix/vector product: it performs no BLAS call itself and simply forwards all
// arguments to selectLargeSubAssignKernel().
// NOTE(review): the return-type line (listing line 4944) is elided from this listing;
// presumably it carries the SFINAE condition that selects this overload when the BLAS
// kernel is not applicable -- confirm against the full header.
4940  template< typename VT1 // Type of the left-hand side target vector
4941  , typename MT1 // Type of the left-hand side matrix operand
4942  , typename VT2 // Type of the right-hand side vector operand
4943  , typename ST2 > // Type of the scalar value
4945  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4946  {
 // Delegate to the large-matrix kernel with unchanged arguments.
4947  selectLargeSubAssignKernel( y, A, x, scalar );
4948  }
4949  //**********************************************************************************************
4950 
4951  //**BLAS-based subtraction assignment to dense vectors******************************************
// The overload below is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION evaluate to non-zero.
4952 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4953 
// BLAS-backed subtraction assignment kernel operating on target y, matrix A, vector x and scalar.
// NOTE(review): the return-type line (listing line 4970) is elided in this extract, so the
// condition under which this overload is selected cannot be seen here.
4966  template< typename VT1 // Type of the left-hand side target vector
4967  , typename MT1 // Type of the left-hand side matrix operand
4968  , typename VT2 // Type of the right-hand side vector operand
4969  , typename ST2 > // Type of the scalar value
4971  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4972  {
4973  using ET = ElementType_<VT1>;
4974 
4975  if( IsTriangular<MT1>::value ) {
// Triangular A: the scaled vector is evaluated serially into a temporary, handed to trmv()
// together with A and a lower/upper flag (presumably computing tmp = A * tmp in place --
// confirm against the trmv wrapper), and the temporary is then subtracted from y.
4976  ResultType_<VT1> tmp( serial( scalar * x ) );
4977  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4978  subAssign( y, tmp );
4979  }
4980  else {
// General A: a single gemv() call with the negated scalar and a factor of one, both converted
// to the element type of y (presumably y = (-scalar)*A*x + 1*y -- confirm against the gemv wrapper).
4981  gemv( y, A, x, ET(-scalar), ET(1) );
4982  }
4983  }
4984 #endif
4985  //**********************************************************************************************
4986 
4987  //**Subtraction assignment to sparse vectors****************************************************
4988  // No special implementation for the subtraction assignment to sparse vectors.
4989  //**********************************************************************************************
4990 
4991  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of the scaled expression rhs to the dense vector lhs.
// The expression is first evaluated serially into a temporary of type ResultType, and the
// temporary is then passed to multAssign() together with the target.
// NOTE(review): listing lines 5006 and 5008-5010 are elided in this extract; only the
// statements shown below are documented.
5003  template< typename VT1 > // Type of the target dense vector
5004  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5005  {
5007 
5011 
5012  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5013 
5014  const ResultType tmp( serial( rhs ) ); // serial evaluation of the whole expression
5015  multAssign( ~lhs, tmp );
5016  }
5017  //**********************************************************************************************
5018 
5019  //**Multiplication assignment to sparse vectors*************************************************
5020  // No special implementation for the multiplication assignment to sparse vectors.
5021  //**********************************************************************************************
5022 
5023  //**Division assignment to dense vectors********************************************************
// Division assignment of the scaled expression rhs to the dense vector lhs.
// The expression is first evaluated serially into a temporary of type ResultType, and the
// temporary is then passed to divAssign() together with the target.
// NOTE(review): listing lines 5038 and 5040-5042 are elided in this extract; only the
// statements shown below are documented.
5035  template< typename VT1 > // Type of the target dense vector
5036  friend inline void divAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5037  {
5039 
5043 
5044  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5045 
5046  const ResultType tmp( serial( rhs ) ); // serial evaluation of the whole expression
5047  divAssign( ~lhs, tmp );
5048  }
5049  //**********************************************************************************************
5050 
5051  //**Division assignment to sparse vectors*******************************************************
5052  // No special implementation for the division assignment to sparse vectors.
5053  //**********************************************************************************************
5054 
5055  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of the scaled matrix/vector product to a dense vector, enabled via
// UseSMPAssign<VT1>.
// NOTE(review): the line carrying the function name and parameter list (listing line 5071) is
// missing from this extract; the body uses the names lhs and rhs and ends in smpAssign().
5069  template< typename VT1 > // Type of the target dense vector
5070  friend inline EnableIf_< UseSMPAssign<VT1> >
5072  {
5074 
5075  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5076 
5077  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5078  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5079 
// A matrix without rows leaves the target untouched; a matrix without columns resets the
// target before returning. No product is formed in either case.
5080  if( left.rows() == 0UL ) {
5081  return;
5082  }
5083  else if( left.columns() == 0UL ) {
5084  reset( ~lhs );
5085  return;
5086  }
5087 
5088  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5089  RT x( right ); // Evaluation of the right-hand side dense vector operand
5090 
5091  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5092  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5093  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5094  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5095 
// The expression is rebuilt from the evaluated operands and the stored scalar and handed to
// smpAssign().
5096  smpAssign( ~lhs, A * x * rhs.scalar_ );
5097  }
5098  //**********************************************************************************************
5099 
5100  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of the scaled matrix/vector product to a sparse vector, enabled via
// UseSMPAssign<VT1>. The expression is evaluated into a temporary of type ResultType, which is
// then passed to smpAssign().
// NOTE(review): the line carrying the function name and parameter list (listing line 5116) and
// listing lines 5118 and 5120-5122 are missing from this extract.
5114  template< typename VT1 > // Type of the target sparse vector
5115  friend inline EnableIf_< UseSMPAssign<VT1> >
5117  {
5119 
5123 
5124  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5125 
5126  const ResultType tmp( rhs ); // unlike multAssign()/divAssign(), no serial() wrapper is applied here
5127  smpAssign( ~lhs, tmp );
5128  }
5129  //**********************************************************************************************
5130 
5131  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of the scaled matrix/vector product to a dense vector, enabled via
// UseSMPAssign<VT1>.
// NOTE(review): the line carrying the function name and parameter list (listing line 5147) is
// missing from this extract; the body uses the names lhs and rhs and ends in smpAddAssign().
5145  template< typename VT1 > // Type of the target dense vector
5146  friend inline EnableIf_< UseSMPAssign<VT1> >
5148  {
5150 
5151  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5152 
5153  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5154  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5155 
// Nothing is added when the matrix has no rows or no columns, so the target stays unchanged.
5156  if( left.rows() == 0UL || left.columns() == 0UL ) {
5157  return;
5158  }
5159 
5160  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5161  RT x( right ); // Evaluation of the right-hand side dense vector operand
5162 
5163  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5164  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5165  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5166  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5167 
// The expression is rebuilt from the evaluated operands and the stored scalar and handed to
// smpAddAssign().
5168  smpAddAssign( ~lhs, A * x * rhs.scalar_ );
5169  }
5170  //**********************************************************************************************
5171 
5172  //**SMP addition assignment to sparse vectors***************************************************
5173  // No special implementation for the SMP addition assignment to sparse vectors.
5174  //**********************************************************************************************
5175 
5176  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of the scaled matrix/vector product to a dense vector, enabled via
// UseSMPAssign<VT1>.
// NOTE(review): the line carrying the function name and parameter list (listing line 5192) is
// missing from this extract; the body uses the names lhs and rhs and ends in smpSubAssign().
5190  template< typename VT1 > // Type of the target dense vector
5191  friend inline EnableIf_< UseSMPAssign<VT1> >
5193  {
5195 
5196  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5197 
5198  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5199  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5200 
// Nothing is subtracted when the matrix has no rows or no columns, so the target stays unchanged.
5201  if( left.rows() == 0UL || left.columns() == 0UL ) {
5202  return;
5203  }
5204 
5205  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5206  RT x( right ); // Evaluation of the right-hand side dense vector operand
5207 
5208  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5209  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5210  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5211  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5212 
// The expression is rebuilt from the evaluated operands and the stored scalar and handed to
// smpSubAssign().
5213  smpSubAssign( ~lhs, A * x * rhs.scalar_ );
5214  }
5215  //**********************************************************************************************
5216 
5217  //**SMP subtraction assignment to sparse vectors************************************************
5218  // No special implementation for the SMP subtraction assignment to sparse vectors.
5219  //**********************************************************************************************
5220 
5221  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of the scaled matrix/vector product to a dense vector, enabled
// via UseSMPAssign<VT1>. The expression is evaluated into a temporary of type ResultType, which
// is then passed to smpMultAssign().
// NOTE(review): the line carrying the function name and parameter list (listing line 5237) and
// listing lines 5239 and 5241-5243 are missing from this extract.
5235  template< typename VT1 > // Type of the target dense vector
5236  friend inline EnableIf_< UseSMPAssign<VT1> >
5238  {
5240 
5244 
5245  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5246 
5247  const ResultType tmp( rhs ); // evaluation of the whole expression into a temporary
5248  smpMultAssign( ~lhs, tmp );
5249  }
5250  //**********************************************************************************************
5251 
5252  //**SMP multiplication assignment to sparse vectors*********************************************
5253  // No special implementation for the SMP multiplication assignment to sparse vectors.
5254  //**********************************************************************************************
5255 
5256  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of the scaled matrix/vector product to a dense vector, enabled via
// UseSMPAssign<VT1>. The expression is evaluated into a temporary of type ResultType, which is
// then passed to smpDivAssign().
// NOTE(review): the line carrying the function name and parameter list (listing line 5272) and
// listing lines 5274 and 5276-5278 are missing from this extract.
5270  template< typename VT1 > // Type of the target dense vector
5271  friend inline EnableIf_< UseSMPAssign<VT1> >
5273  {
5275 
5279 
5280  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5281 
5282  const ResultType tmp( rhs ); // evaluation of the whole expression into a temporary
5283  smpDivAssign( ~lhs, tmp );
5284  }
5285  //**********************************************************************************************
5286 
5287  //**SMP division assignment to sparse vectors***************************************************
5288  // No special implementation for the SMP division assignment to sparse vectors.
5289  //**********************************************************************************************
5290 
5291  //**Compile time checks*************************************************************************
5300  //**********************************************************************************************
5301 };
5303 //*************************************************************************************************
5304 
5305 
5306 
5307 
5308 //=================================================================================================
5309 //
5310 // GLOBAL BINARY ARITHMETIC OPERATORS
5311 //
5312 //=================================================================================================
5313 
5314 //*************************************************************************************************
// Multiplication operator for a row-major dense matrix (DenseMatrix<MT,false>) and a
// non-transpose dense vector (DenseVector<VT,false>).
// Returns a const DMatDVecMultExpr<MT,VT> built from the two operands; no product is computed
// in this function itself.
// Raises an invalid-argument exception via BLAZE_THROW_INVALID_ARGUMENT when the number of
// matrix columns differs from the vector size.
// NOTE(review): listing lines 5349 and 5351 are elided in this extract.
5344 template< typename MT // Type of the left-hand side dense matrix
5345  , typename VT > // Type of the right-hand side dense vector
5346 inline decltype(auto)
5347  operator*( const DenseMatrix<MT,false>& mat, const DenseVector<VT,false>& vec )
5348 {
5350 
5352 
5353  if( (~mat).columns() != (~vec).size() ) {
5354  BLAZE_THROW_INVALID_ARGUMENT( "Matrix and vector sizes do not match" );
5355  }
5356 
5357  using ReturnType = const DMatDVecMultExpr<MT,VT>;
5358  return ReturnType( ~mat, ~vec );
5359 }
5360 //*************************************************************************************************
5361 
5362 
5363 
5364 
5365 //=================================================================================================
5366 //
5367 // GLOBAL RESTRUCTURING BINARY ARITHMETIC OPERATORS
5368 //
5369 //=================================================================================================
5370 
5371 //*************************************************************************************************
// Restructuring multiplication operator for a matrix/matrix multiplication expression and a
// dense vector. The product (A*B)*v is re-associated as A*(B*v): the right-hand operand of the
// matrix product is multiplied with vec first, and the left-hand operand is then multiplied with
// that result, so only matrix/vector products are formed.
// NOTE(review): listing line 5390 is elided in this extract.
5385 template< typename MT // Matrix base type of the left-hand side expression
5386  , typename VT > // Type of the right-hand side dense vector
5387 inline decltype(auto)
5388  operator*( const MatMatMultExpr<MT>& mat, const DenseVector<VT,false>& vec )
5389 {
5391 
5392  return (~mat).leftOperand() * ( (~mat).rightOperand() * vec );
5393 }
5395 //*************************************************************************************************
5396 
5397 
5398 
5399 
5400 //=================================================================================================
5401 //
5402 // SIZE SPECIALIZATIONS
5403 //
5404 //=================================================================================================
5405 
5406 //*************************************************************************************************
// Size trait specialization for DMatDVecMultExpr: the compile-time size along dimension 0 of the
// expression is inherited from Size<MT,0UL>, i.e. from dimension 0 of the matrix operand type.
5408 template< typename MT, typename VT >
5409 struct Size< DMatDVecMultExpr<MT,VT>, 0UL >
5410  : public Size<MT,0UL>
5411 {};
5413 //*************************************************************************************************
5414 
5415 
5416 
5417 
5418 //=================================================================================================
5419 //
5420 // ISALIGNED SPECIALIZATIONS
5421 //
5422 //=================================================================================================
5423 
5424 //*************************************************************************************************
// IsAligned trait specialization for DMatDVecMultExpr: the expression is reported as aligned
// only if both the matrix operand type and the vector operand type are aligned (logical And of
// IsAligned<MT> and IsAligned<VT>).
5426 template< typename MT, typename VT >
5427 struct IsAligned< DMatDVecMultExpr<MT,VT> >
5428  : public And< IsAligned<MT>, IsAligned<VT> >
5429 {};
5431 //*************************************************************************************************
5432 
5433 } // namespace blaze
5434 
5435 #endif
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Data type constraint.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:71
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
If_< IsExpression< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:213
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:222
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:262
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:219
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:130
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Expression object for dense matrix-dense vector multiplications.The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:122
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
If_< IsExpression< VT >, const VT, const VT &> RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:216
Header file for the IsSame and IsStrictlySame type traits.
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:128
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDVecMultExpr.h:208
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:133
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:519
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:87
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:341
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:353
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:205
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:373
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:207
Compile time check for the memory layout of data types.This type trait tests whether the given data t...
Definition: IsContiguous.h:86
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:131
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:309
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:506
Base class for all matrix/vector multiplication expression templates.The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:67
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:108
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:329
decltype(auto) operator*(const DenseMatrix< MT1, false > &lhs, const DenseMatrix< MT2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:8893
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:590
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:385
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:210
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
BLAZE_ALWAYS_INLINE ValueType_< T > sum(const SIMDi8< T > &a) noexcept
Returns the sum of all elements in the 8-bit integral SIMD vector.
Definition: Reduction.h:65
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:363
Header file for run time assertion macros.
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:132
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:386
DMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:248
Header file for the IsContiguous type trait.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:131
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:129
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:206
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:104
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of vectors and matrices.The Size type trait evaluates the size of...
Definition: Size.h:80
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation. The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Constraint on the data type.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:319
Header file for the MatVecMultExpr base class.
Constraint on the data type.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:296
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:209
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the function trace functionality.