DMatDVecMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/Exception.h>
60 #include <blaze/math/shims/Reset.h>
62 #include <blaze/math/SIMD.h>
82 #include <blaze/system/BLAS.h>
85 #include <blaze/util/Assert.h>
86 #include <blaze/util/Complex.h>
88 #include <blaze/util/DisableIf.h>
89 #include <blaze/util/EnableIf.h>
92 #include <blaze/util/mpl/And.h>
93 #include <blaze/util/mpl/If.h>
94 #include <blaze/util/Types.h>
102 
103 
104 namespace blaze {
105 
106 //=================================================================================================
107 //
108 // CLASS DMATDVECMULTEXPR
109 //
110 //=================================================================================================
111 
112 //*************************************************************************************************
119 template< typename MT // Type of the left-hand side dense matrix
120  , typename VT > // Type of the right-hand side dense vector
122  : public MatVecMultExpr< DenseVector< DMatDVecMultExpr<MT,VT>, false > >
123  , private Computation
124 {
125  private:
126  //**Type definitions****************************************************************************
133  //**********************************************************************************************
134 
135  //**********************************************************************************************
// Compile-time flag evaluateMatrix. The visible part of its condition requires that MT is a
// computation expression and that the types MET and VET are identical.
// NOTE(review): the remainder of the condition and the closing `};` are not present in this
// listing -- confirm the full expression against the complete header.
137  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
139  //**********************************************************************************************
140 
141  //**********************************************************************************************
// Compile-time flag evaluateVector: true when the vector operand type VT is a computation
// expression or when RequiresEvaluation<VT> reports that it has to be evaluated.
143  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
144  //**********************************************************************************************
145 
146  //**********************************************************************************************
148 
// Helper trait: its nested `value` is true when at least one of the two operands needs an
// intermediate evaluation (evaluateMatrix or evaluateVector).
// The template parameter T1 does not take part in the visible condition.
152  template< typename T1 >
153  struct UseSMPAssign {
154  enum : bool { value = ( evaluateMatrix || evaluateVector ) };
155  };
157  //**********************************************************************************************
158 
159  //**********************************************************************************************
161 
// Helper trait deciding about the BLAS kernel for the type triple (T1, T2, T3).
// Visible parts of the condition: all three types are SIMD-enabled and the element types of
// T1 and T3 are identical.
// NOTE(review): the `enum` opener and several condition lines are missing from this listing --
// confirm the complete condition against the full header.
164  template< typename T1, typename T2, typename T3 >
165  struct UseBlasKernel {
171  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
176  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
177  };
179  //**********************************************************************************************
180 
181  //**********************************************************************************************
183 
// Helper trait deciding about the vectorized default kernel for the type triple (T1, T2, T3).
// Visible parts of the condition: useOptimizedKernels is set and all three types are
// SIMD-enabled; a further check involving ElementType_<T3> is only partially visible.
// NOTE(review): several condition lines and the closing `};` of the enum are missing from
// this listing -- confirm the complete condition against the full header.
187  template< typename T1, typename T2, typename T3 >
188  struct UseVectorizedDefaultKernel {
189  enum : bool { value = useOptimizedKernels &&
191  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
194  , ElementType_<T3> >::value &&
197  };
199  //**********************************************************************************************
200 
201  public:
202  //**Type definitions****************************************************************************
// Type returned by the element access functions operator[] and at().
208  using ReturnType = const ElementType;
// Alias for the constant result type of the expression.
209  using CompositeType = const ResultType;
210 
// Type returned by leftOperand(): selected through If_ on IsExpression<MT>, with
// `const MT` and `const MT&` as the two candidates.
212  using LeftOperand = If_< IsExpression<MT>, const MT, const MT& >;
213 
// Type returned by rightOperand(): selected through If_ on IsExpression<VT>, with
// `const VT` and `const VT&` as the two candidates.
215  using RightOperand = If_< IsExpression<VT>, const VT, const VT& >;
216 
219 
222  //**********************************************************************************************
223 
224  //**Compilation flags***************************************************************************
// Compilation flag simdEnabled. Visible parts of the condition: MT is not a diagonal matrix
// type and both operand types are SIMD-enabled.
// NOTE(review): the end of the condition and the closing `};` are missing from this listing --
// confirm against the complete header.
226  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
227  MT::simdEnabled && VT::simdEnabled &&
230 
// Compilation flag smpAssignable: set only when neither operand needs an intermediate
// evaluation (evaluateMatrix / evaluateVector are both false) and both operand types are
// themselves SMP-assignable.
232  enum : bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
233  !evaluateVector && VT::smpAssignable };
234  //**********************************************************************************************
235 
236  //**SIMD properties*****************************************************************************
// SIMDSIZE mirrors SIMDTrait<ElementType>::size; the vectorized kernels advance their column
// index by this amount per SIMD load.
238  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
239  //**********************************************************************************************
240 
241  //**Constructor*********************************************************************************
// Constructor: stores the matrix operand in mat_ and the vector operand in vec_.
// The only consistency check is an internal assertion that the number of matrix columns
// equals the vector size; no exception is thrown here (the constructor is noexcept).
247  explicit inline DMatDVecMultExpr( const MT& mat, const VT& vec ) noexcept
248  : mat_( mat ) // Left-hand side dense matrix of the multiplication expression
249  , vec_( vec ) // Right-hand side dense vector of the multiplication expression
250  {
251  BLAZE_INTERNAL_ASSERT( mat_.columns() == vec_.size(), "Invalid matrix and vector sizes" );
252  }
253  //**********************************************************************************************
254 
255  //**Subscript operator**************************************************************************
// Subscript operator: computes and returns element `index` of the product vector.
// The index is validated only through BLAZE_INTERNAL_ASSERT (see at() for a checked access).
// The formula is chosen from structure traits of the matrix type MT:
//  - first branch: only the diagonal term mat_(index,index) * vec_[index];
//  - lower matrices (and index + 8 < rows): the row restricted to its first n columns;
//  - upper matrices (and index > 8): the row restricted to the columns [begin, columns());
//  - otherwise: the complete row multiplied with vec_.
// NOTE(review): the `if( ... )` line that guards the first branch is absent from this listing
// -- confirm its condition against the complete header.
261  inline ReturnType operator[]( size_t index ) const {
262  BLAZE_INTERNAL_ASSERT( index < mat_.rows(), "Invalid vector access index" );
263 
265  {
266  return mat_(index,index) * vec_[index];
267  }
268  else if( IsLower<MT>::value && ( index + 8UL < mat_.rows() ) )
269  {
// Number of leading columns used: the diagonal element is skipped for strictly lower types.
270  const size_t n( IsStrictlyLower<MT>::value ? index : index+1UL );
271  return subvector( row( mat_, index ), 0UL, n ) * subvector( vec_, 0UL, n );
272  }
273  else if( IsUpper<MT>::value && ( index > 8UL ) )
274  {
// First column used: the diagonal element is skipped for strictly upper types.
275  const size_t begin( IsStrictlyUpper<MT>::value ? index+1UL : index );
276  const size_t n ( mat_.columns() - begin );
277  return subvector( row( mat_, index ), begin, n ) * subvector( vec_, begin, n );
278  }
279  else
280  {
281  return row( mat_, index ) * vec_;
282  }
283  }
284  //**********************************************************************************************
285 
286  //**At function*********************************************************************************
293  inline ReturnType at( size_t index ) const {
294  if( index >= mat_.rows() ) {
295  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
296  }
297  return (*this)[index];
298  }
299  //**********************************************************************************************
300 
301  //**Size function*******************************************************************************
// Returns the size of the product vector, which is the number of rows of the matrix operand.
306  inline size_t size() const noexcept {
307  return mat_.rows();
308  }
309  //**********************************************************************************************
310 
311  //**Left operand access*************************************************************************
// Returns the left-hand side (matrix) operand of the multiplication expression.
316  inline LeftOperand leftOperand() const noexcept{
317  return mat_;
318  }
319  //**********************************************************************************************
320 
321  //**Right operand access************************************************************************
// Returns the right-hand side (vector) operand of the multiplication expression.
326  inline RightOperand rightOperand() const noexcept {
327  return vec_;
328  }
329  //**********************************************************************************************
330 
331  //**********************************************************************************************
337  template< typename T >
338  inline bool canAlias( const T* alias ) const noexcept {
339  return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
340  }
341  //**********************************************************************************************
342 
343  //**********************************************************************************************
349  template< typename T >
350  inline bool isAliased( const T* alias ) const noexcept {
351  return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
352  }
353  //**********************************************************************************************
354 
355  //**********************************************************************************************
360  inline bool isAligned() const noexcept {
361  return mat_.isAligned() && vec_.isAligned();
362  }
363  //**********************************************************************************************
364 
365  //**********************************************************************************************
// Reports whether the expression qualifies for an SMP assignment.
// Visible structure: a disjunction (BLAS mode disabled, or MT is a computation that is not
// evaluated, or rows*columns is below DMATDVECMULT_THRESHOLD, ...) combined with the
// requirement that size() exceeds SMP_DMATDVECMULT_THRESHOLD.
// NOTE(review): two lines of the disjunction are missing from this listing -- confirm the
// complete condition against the full header.
370  inline bool canSMPAssign() const noexcept {
371  return ( !BLAZE_BLAS_MODE ||
374  ( IsComputation<MT>::value && !evaluateMatrix ) ||
375  ( mat_.rows() * mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
376  ( size() > SMP_DMATDVECMULT_THRESHOLD );
377  }
378  //**********************************************************************************************
379 
380  private:
381  //**Member variables****************************************************************************
384  //**********************************************************************************************
385 
386  //**Assignment to dense vectors*****************************************************************
// Assignment of the matrix/vector product `rhs` to the dense vector `lhs`.
// Steps:
//  1. A matrix without rows leaves lhs untouched; a matrix without columns resets lhs.
//  2. Both operands are passed through serial() into the local objects A (type LT) and
//     x (type RT).
//  3. The actual computation is delegated to selectAssignKernel().
// Sizes are verified with internal assertions only.
// NOTE(review): one source line directly after the opening brace is absent from this listing,
// and the definitions of LT / RT are not visible here -- confirm against the full header.
399  template< typename VT1 > // Type of the target dense vector
400  friend inline void assign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
401  {
403 
404  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
405 
406  if( rhs.mat_.rows() == 0UL ) {
407  return;
408  }
409  else if( rhs.mat_.columns() == 0UL ) {
// No columns means there are no terms to sum, so the target is reset.
410  reset( ~lhs );
411  return;
412  }
413 
414  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
415  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
416 
417  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
418  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
419  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
420  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
421 
422  DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
423  }
425  //**********************************************************************************************
426 
427  //**Assignment to dense vectors (kernel selection)**********************************************
438  template< typename VT1 // Type of the left-hand side target vector
439  , typename MT1 // Type of the left-hand side matrix operand
440  , typename VT2 > // Type of the right-hand side vector operand
441  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
442  {
443  if( ( IsDiagonal<MT1>::value ) ||
444  ( IsComputation<MT>::value && !evaluateMatrix ) ||
445  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
446  selectSmallAssignKernel( y, A, x );
447  else
448  selectBlasAssignKernel( y, A, x );
449  }
451  //**********************************************************************************************
452 
453  //**Default assignment to dense vectors*********************************************************
// Default assignment kernel: hands the product expression A * x to the assign() member of
// the target vector y.
467  template< typename VT1 // Type of the left-hand side target vector
468  , typename MT1 // Type of the left-hand side matrix operand
469  , typename VT2 > // Type of the right-hand side vector operand
470  static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
471  {
472  y.assign( A * x );
473  }
475  //**********************************************************************************************
476 
477  //**Default assignment to dense vectors (small matrices)****************************************
// Small-matrix assignment kernel (non-vectorized variant): forwards to
// selectDefaultAssignKernel().
// NOTE(review): the return-type line of this overload is missing from the listing; presumably
// it carries the condition that separates this overload from the vectorized overload of the
// same name -- confirm against the full header.
491  template< typename VT1 // Type of the left-hand side target vector
492  , typename MT1 // Type of the left-hand side matrix operand
493  , typename VT2 > // Type of the right-hand side vector operand
495  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
496  {
497  selectDefaultAssignKernel( y, A, x );
498  }
500  //**********************************************************************************************
501 
502  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Small-matrix assignment kernel (vectorized variant) computing y = A * x.
// Rows are handled in blocks of 8, 4, 3 and 2 rows, followed by a single leftover row.
// For every block:
//  - [jbegin, jend) is the column range; it is narrowed for upper and lower matrix types,
//    and jbegin is masked with size_t(-SIMDSIZE);
//  - jpos is jend masked the same way when `remainder` is set (the assertion checks that it
//    equals jend - jend % SIMDSIZE), otherwise it is jend itself;
//  - columns in [jbegin, jpos) are processed SIMDSIZE at a time into one SIMDType
//    accumulator per row, which sum() then reduces into y;
//  - when `remainder` is set, the columns in [jpos, jend) are added element by element.
// `remainder` is true unless both MT1 and VT2 are padded types.
// NOTE(review): the accumulators are declared without an initializer and are only ever
// updated with +=, so the result relies on SIMDType default-constructing to zero -- confirm.
// NOTE(review): the return-type line of this overload is missing from the listing.
516  template< typename VT1 // Type of the left-hand side target vector
517  , typename MT1 // Type of the left-hand side matrix operand
518  , typename VT2 > // Type of the right-hand side vector operand
520  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
521  {
522  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
523 
524  const size_t M( A.rows() );
525  const size_t N( A.columns() );
526 
527  size_t i( 0UL );
528 
// Blocks of eight rows.
529  for( ; (i+8UL) <= M; i+=8UL )
530  {
531  const size_t jbegin( ( IsUpper<MT1>::value )
532  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
533  :( 0UL ) );
534  const size_t jend( ( IsLower<MT1>::value )
535  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
536  :( N ) );
537  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
538 
539  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
540  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
541 
542  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
543  size_t j( jbegin );
544 
// The vector chunk x1 is loaded once and reused for all eight rows.
545  for( ; j<jpos; j+=SIMDSIZE ) {
546  const SIMDType x1( x.load(j) );
547  xmm1 += A.load(i ,j) * x1;
548  xmm2 += A.load(i+1UL,j) * x1;
549  xmm3 += A.load(i+2UL,j) * x1;
550  xmm4 += A.load(i+3UL,j) * x1;
551  xmm5 += A.load(i+4UL,j) * x1;
552  xmm6 += A.load(i+5UL,j) * x1;
553  xmm7 += A.load(i+6UL,j) * x1;
554  xmm8 += A.load(i+7UL,j) * x1;
555  }
556 
// Plain assignment: any previous content of y is overwritten here.
557  y[i ] = sum( xmm1 );
558  y[i+1UL] = sum( xmm2 );
559  y[i+2UL] = sum( xmm3 );
560  y[i+3UL] = sum( xmm4 );
561  y[i+4UL] = sum( xmm5 );
562  y[i+5UL] = sum( xmm6 );
563  y[i+6UL] = sum( xmm7 );
564  y[i+7UL] = sum( xmm8 );
565 
// Element-wise tail for the columns that did not fill a complete SIMD chunk.
566  for( ; remainder && j<jend; ++j ) {
567  y[i ] += A(i ,j) * x[j];
568  y[i+1UL] += A(i+1UL,j) * x[j];
569  y[i+2UL] += A(i+2UL,j) * x[j];
570  y[i+3UL] += A(i+3UL,j) * x[j];
571  y[i+4UL] += A(i+4UL,j) * x[j];
572  y[i+5UL] += A(i+5UL,j) * x[j];
573  y[i+6UL] += A(i+6UL,j) * x[j];
574  y[i+7UL] += A(i+7UL,j) * x[j];
575  }
576  }
577 
// Blocks of four rows.
578  for( ; (i+4UL) <= M; i+=4UL )
579  {
580  const size_t jbegin( ( IsUpper<MT1>::value )
581  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
582  :( 0UL ) );
583  const size_t jend( ( IsLower<MT1>::value )
584  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
585  :( N ) );
586  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
587 
588  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
589  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
590 
591  SIMDType xmm1, xmm2, xmm3, xmm4;
592  size_t j( jbegin );
593 
594  for( ; j<jpos; j+=SIMDSIZE ) {
595  const SIMDType x1( x.load(j) );
596  xmm1 += A.load(i ,j) * x1;
597  xmm2 += A.load(i+1UL,j) * x1;
598  xmm3 += A.load(i+2UL,j) * x1;
599  xmm4 += A.load(i+3UL,j) * x1;
600  }
601 
602  y[i ] = sum( xmm1 );
603  y[i+1UL] = sum( xmm2 );
604  y[i+2UL] = sum( xmm3 );
605  y[i+3UL] = sum( xmm4 );
606 
607  for( ; remainder && j<jend; ++j ) {
608  y[i ] += A(i ,j) * x[j];
609  y[i+1UL] += A(i+1UL,j) * x[j];
610  y[i+2UL] += A(i+2UL,j) * x[j];
611  y[i+3UL] += A(i+3UL,j) * x[j];
612  }
613  }
614 
// Blocks of three rows.
615  for( ; (i+3UL) <= M; i+=3UL )
616  {
617  const size_t jbegin( ( IsUpper<MT1>::value )
618  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
619  :( 0UL ) );
620  const size_t jend( ( IsLower<MT1>::value )
621  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
622  :( N ) );
623  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
624 
625  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
626  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
627 
628  SIMDType xmm1, xmm2, xmm3;
629  size_t j( jbegin );
630 
631  for( ; j<jpos; j+=SIMDSIZE ) {
632  const SIMDType x1( x.load(j) );
633  xmm1 += A.load(i ,j) * x1;
634  xmm2 += A.load(i+1UL,j) * x1;
635  xmm3 += A.load(i+2UL,j) * x1;
636  }
637 
638  y[i ] = sum( xmm1 );
639  y[i+1UL] = sum( xmm2 );
640  y[i+2UL] = sum( xmm3 );
641 
642  for( ; remainder && j<jend; ++j ) {
643  y[i ] += A(i ,j) * x[j];
644  y[i+1UL] += A(i+1UL,j) * x[j];
645  y[i+2UL] += A(i+2UL,j) * x[j];
646  }
647  }
648 
// Blocks of two rows.
649  for( ; (i+2UL) <= M; i+=2UL )
650  {
651  const size_t jbegin( ( IsUpper<MT1>::value )
652  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
653  :( 0UL ) );
654  const size_t jend( ( IsLower<MT1>::value )
655  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
656  :( N ) );
657  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
658 
659  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
660  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
661 
662  SIMDType xmm1, xmm2;
663  size_t j( jbegin );
664 
665  for( ; j<jpos; j+=SIMDSIZE ) {
666  const SIMDType x1( x.load(j) );
667  xmm1 += A.load(i ,j) * x1;
668  xmm2 += A.load(i+1UL,j) * x1;
669  }
670 
671  y[i ] = sum( xmm1 );
672  y[i+1UL] = sum( xmm2 );
673 
674  for( ; remainder && j<jend; ++j ) {
675  y[i ] += A(i ,j) * x[j];
676  y[i+1UL] += A(i+1UL,j) * x[j];
677  }
678  }
679 
// At most one row is left at this point.
680  if( i < M )
681  {
682  const size_t jbegin( ( IsUpper<MT1>::value )
683  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
684  :( 0UL ) );
685  const size_t jend( ( IsLower<MT1>::value )
686  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
687  :( N ) );
688  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
689 
690  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
691  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
692 
693  SIMDType xmm1;
694  size_t j( jbegin );
695 
696  for( ; j<jpos; j+=SIMDSIZE ) {
697  xmm1 += A.load(i,j) * x.load(j);
698  }
699 
700  y[i] = sum( xmm1 );
701 
702  for( ; remainder && j<jend; ++j ) {
703  y[i] += A(i,j) * x[j];
704  }
705  }
706  }
708  //**********************************************************************************************
709 
710  //**Default assignment to dense vectors (large matrices)****************************************
// Large-matrix assignment kernel (non-vectorized variant): forwards to
// selectDefaultAssignKernel().
// NOTE(review): the return-type line of this overload is missing from the listing; presumably
// it carries the condition that separates this overload from the vectorized overload of the
// same name -- confirm against the full header.
724  template< typename VT1 // Type of the left-hand side target vector
725  , typename MT1 // Type of the left-hand side matrix operand
726  , typename VT2 > // Type of the right-hand side vector operand
728  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
729  {
730  selectDefaultAssignKernel( y, A, x );
731  }
733  //**********************************************************************************************
734 
735  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Large-matrix assignment kernel (vectorized variant) computing y = A * x.
// y is reset up front and every contribution is added with +=.
// Rows are handled in blocks of 8, 4 and 2 rows, followed by a single leftover row.
// For every block:
//  - [jbegin, jend) is the column range; it is narrowed for upper and lower matrix types,
//    and jbegin is masked with size_t(-SIMDSIZE);
//  - jpos is jend masked the same way when `remainder` is set (the assertion checks that it
//    equals jend - jend % SIMDSIZE), otherwise it is jend itself;
//  - the column loop is unrolled in three stages: 4*SIMDSIZE columns per iteration, then
//    2*SIMDSIZE, then SIMDSIZE; each iteration reduces its partial products with sum();
//  - when `remainder` is set, the columns in [jpos, jend) are added element by element.
// `remainder` is true unless both MT1 and VT2 are padded types.
// NOTE(review): the return-type line of this overload is missing from the listing.
749  template< typename VT1 // Type of the left-hand side target vector
750  , typename MT1 // Type of the left-hand side matrix operand
751  , typename VT2 > // Type of the right-hand side vector operand
753  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
754  {
755  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
756 
757  const size_t M( A.rows() );
758  const size_t N( A.columns() );
759 
// All later updates of y use +=, so the target has to start from its reset state.
760  reset( y );
761 
762  size_t i( 0UL );
763 
// Blocks of eight rows.
764  for( ; (i+8UL) <= M; i+=8UL )
765  {
766  const size_t jbegin( ( IsUpper<MT1>::value )
767  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
768  :( 0UL ) );
769  const size_t jend( ( IsLower<MT1>::value )
770  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
771  :( N ) );
772  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
773 
774  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
775  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
776 
777  size_t j( jbegin );
778 
// Stage 1: four SIMD chunks of x per iteration.
779  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
780  const size_t j1( j+SIMDSIZE );
781  const size_t j2( j+SIMDSIZE*2UL );
782  const size_t j3( j+SIMDSIZE*3UL );
783  const SIMDType x1( x.load(j ) );
784  const SIMDType x2( x.load(j1) );
785  const SIMDType x3( x.load(j2) );
786  const SIMDType x4( x.load(j3) );
787  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
788  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
789  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
790  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
791  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
792  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
793  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
794  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
795  }
796 
// Stage 2: two SIMD chunks of x per iteration.
797  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
798  const size_t j1( j+SIMDSIZE );
799  const SIMDType x1( x.load(j ) );
800  const SIMDType x2( x.load(j1) );
801  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
802  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
803  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
804  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
805  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
806  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
807  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
808  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
809  }
810 
// Stage 3: one SIMD chunk of x per iteration.
811  for( ; j<jpos; j+=SIMDSIZE ) {
812  const SIMDType x1( x.load(j) );
813  y[i ] += sum( A.load(i ,j) * x1 );
814  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
815  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
816  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
817  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
818  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
819  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
820  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
821  }
822 
// Element-wise tail for the columns that did not fill a complete SIMD chunk.
823  for( ; remainder && j<jend; ++j ) {
824  y[i ] += A(i ,j) * x[j];
825  y[i+1UL] += A(i+1UL,j) * x[j];
826  y[i+2UL] += A(i+2UL,j) * x[j];
827  y[i+3UL] += A(i+3UL,j) * x[j];
828  y[i+4UL] += A(i+4UL,j) * x[j];
829  y[i+5UL] += A(i+5UL,j) * x[j];
830  y[i+6UL] += A(i+6UL,j) * x[j];
831  y[i+7UL] += A(i+7UL,j) * x[j];
832  }
833  }
834 
// Blocks of four rows.
835  for( ; (i+4UL) <= M; i+=4UL )
836  {
837  const size_t jbegin( ( IsUpper<MT1>::value )
838  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
839  :( 0UL ) );
840  const size_t jend( ( IsLower<MT1>::value )
841  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
842  :( N ) );
843  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
844 
845  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
846  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
847 
848  size_t j( jbegin );
849 
850  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
851  const size_t j1( j+SIMDSIZE );
852  const size_t j2( j+SIMDSIZE*2UL );
853  const size_t j3( j+SIMDSIZE*3UL );
854  const SIMDType x1( x.load(j ) );
855  const SIMDType x2( x.load(j1) );
856  const SIMDType x3( x.load(j2) );
857  const SIMDType x4( x.load(j3) );
858  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
859  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
860  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
861  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
862  }
863 
864  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
865  const size_t j1( j+SIMDSIZE );
866  const SIMDType x1( x.load(j ) );
867  const SIMDType x2( x.load(j1) );
868  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
869  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
870  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
871  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
872  }
873 
874  for( ; j<jpos; j+=SIMDSIZE ) {
875  const SIMDType x1( x.load(j) );
876  y[i ] += sum( A.load(i ,j) * x1 );
877  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
878  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
879  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
880  }
881 
882  for( ; remainder && j<jend; ++j ) {
883  y[i ] += A(i ,j) * x[j];
884  y[i+1UL] += A(i+1UL,j) * x[j];
885  y[i+2UL] += A(i+2UL,j) * x[j];
886  y[i+3UL] += A(i+3UL,j) * x[j];
887  }
888  }
889 
// Blocks of two rows.
890  for( ; (i+2UL) <= M; i+=2UL )
891  {
892  const size_t jbegin( ( IsUpper<MT1>::value )
893  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
894  :( 0UL ) );
895  const size_t jend( ( IsLower<MT1>::value )
896  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
897  :( N ) );
898  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
899 
900  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
901  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
902 
903  size_t j( jbegin );
904 
905  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
906  const size_t j1( j+SIMDSIZE );
907  const size_t j2( j+SIMDSIZE*2UL );
908  const size_t j3( j+SIMDSIZE*3UL );
909  const SIMDType x1( x.load(j ) );
910  const SIMDType x2( x.load(j1) );
911  const SIMDType x3( x.load(j2) );
912  const SIMDType x4( x.load(j3) );
913  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
914  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
915  }
916 
917  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
918  const size_t j1( j+SIMDSIZE );
919  const SIMDType x1( x.load(j ) );
920  const SIMDType x2( x.load(j1) );
921  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
922  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
923  }
924 
925  for( ; j<jpos; j+=SIMDSIZE ) {
926  const SIMDType x1( x.load(j) );
927  y[i ] += sum( A.load(i ,j) * x1 );
928  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
929  }
930 
931  for( ; remainder && j<jend; ++j ) {
932  y[i ] += A(i ,j) * x[j];
933  y[i+1UL] += A(i+1UL,j) * x[j];
934  }
935  }
936 
// At most one row is left at this point.
937  if( i < M )
938  {
939  const size_t jbegin( ( IsUpper<MT1>::value )
940  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
941  :( 0UL ) );
942  const size_t jend( ( IsLower<MT1>::value )
943  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
944  :( N ) );
945  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
946 
947  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
948  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
949 
950  size_t j( jbegin );
951 
952  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
953  const size_t j1( j+SIMDSIZE );
954  const size_t j2( j+SIMDSIZE*2UL );
955  const size_t j3( j+SIMDSIZE*3UL );
956  const SIMDType x1( x.load(j ) );
957  const SIMDType x2( x.load(j1) );
958  const SIMDType x3( x.load(j2) );
959  const SIMDType x4( x.load(j3) );
960  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
961  }
962 
963  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
964  const size_t j1( j+SIMDSIZE );
965  const SIMDType x1( x.load(j ) );
966  const SIMDType x2( x.load(j1) );
967  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
968  }
969 
970  for( ; j<jpos; j+=SIMDSIZE ) {
971  const SIMDType x1( x.load(j) );
972  y[i] += sum( A.load(i,j) * x1 );
973  }
974 
975  for( ; remainder && j<jend; ++j ) {
976  y[i] += A(i,j) * x[j];
977  }
978  }
979  }
981  //**********************************************************************************************
982 
983  //**BLAS-based assignment to dense vectors (default)********************************************
// BLAS assignment kernel (fallback variant): forwards to selectLargeAssignKernel().
// NOTE(review): the return-type line of this overload is missing from the listing; presumably
// it carries the condition that separates this overload from the BLAS-backed overload of the
// same name -- confirm against the full header.
997  template< typename VT1 // Type of the left-hand side target vector
998  , typename MT1 // Type of the left-hand side matrix operand
999  , typename VT2 > // Type of the right-hand side vector operand
1001  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1002  {
1003  selectLargeAssignKernel( y, A, x );
1004  }
1006  //**********************************************************************************************
1007 
1008  //**BLAS-based assignment to dense vectors******************************************************
1009 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1010 
// BLAS assignment kernel (BLAS-backed variant); only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
//  - Triangular matrix types: x is first assigned to y, then trmv() is called on y with the
//    CblasLower or CblasUpper flag chosen from IsLower<MT1>.
//  - All other matrix types: gemv() is called with the scalars ET(1) and ET(0).
// NOTE(review): presumably gemv computes y = 1*A*x + 0*y and trmv multiplies y in place by A
// -- confirm against the gemv/trmv wrappers.
// NOTE(review): the return-type line of this overload is missing from the listing.
1023  template< typename VT1 // Type of the left-hand side target vector
1024  , typename MT1 // Type of the left-hand side matrix operand
1025  , typename VT2 > // Type of the right-hand side vector operand
1027  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1028  {
1029  using ET = ElementType_<VT1>;
1030 
1031  if( IsTriangular<MT1>::value ) {
1032  assign( y, x );
1033  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1034  }
1035  else {
1036  gemv( y, A, x, ET(1), ET(0) );
1037  }
1038  }
1040 #endif
1041  //**********************************************************************************************
1042 
1043  //**Assignment to sparse vectors****************************************************************
// Assignment of the matrix/vector product `rhs` to the sparse vector `lhs`.
// The expression is first evaluated (through serial()) into a temporary of type ResultType,
// and that temporary is then assigned to lhs.
// NOTE(review): several source lines after the opening brace are absent from this listing --
// confirm against the complete header.
1056  template< typename VT1 > // Type of the target sparse vector
1057  friend inline void assign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1058  {
1060 
1064 
1065  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1066 
1067  const ResultType tmp( serial( rhs ) );
1068  assign( ~lhs, tmp );
1069  }
1071  //**********************************************************************************************
1072 
1073  //**Addition assignment to dense vectors********************************************************
// Addition assignment of the matrix/vector product `rhs` to the dense vector `lhs`.
// Steps:
//  1. If the matrix has no rows or no columns, lhs is left untouched.
//  2. Both operands are passed through serial() into the local objects A (type LT) and
//     x (type RT).
//  3. The actual computation is delegated to selectAddAssignKernel().
// Sizes are verified with internal assertions only.
// NOTE(review): one source line directly after the opening brace is absent from this listing,
// and the definitions of LT / RT are not visible here -- confirm against the full header.
1086  template< typename VT1 > // Type of the target dense vector
1087  friend inline void addAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1088  {
1090 
1091  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1092 
1093  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1094  return;
1095  }
1096 
1097  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1098  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1099 
1100  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1101  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1102  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1103  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1104 
1105  DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
1106  }
1108  //**********************************************************************************************
1109 
1110  //**Addition assignment to dense vectors (kernel selection)*************************************
// Chooses the kernel for the addition assignment y += A * x.
// The "small" kernel is used when at least one of the following holds:
//  - MT1 is a diagonal matrix type,
//  - the expression's matrix operand type MT is a computation and the class
//    constant evaluateMatrix is false,
//  - the number of matrix elements (rows * columns) is below
//    DMATDVECMULT_THRESHOLD.
// In every other case the BLAS kernel selection is called.
// \param y  Target vector.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
1121  template< typename VT1 // Type of the left-hand side target vector
1122  , typename MT1 // Type of the left-hand side matrix operand
1123  , typename VT2 > // Type of the right-hand side vector operand
1124  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1125  {
1126  if( ( IsDiagonal<MT1>::value ) ||
1127  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1128  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1129  selectSmallAddAssignKernel( y, A, x );
1130  else
1131  selectBlasAddAssignKernel( y, A, x );
1132  }
1134  //**********************************************************************************************
1135 
1136  //**Default addition assignment to dense vectors************************************************
// Default (non-vectorized) kernel for the addition assignment: forwards the
// product expression A * x to the addAssign() member of the target vector.
// \param y  Target vector.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
1150  template< typename VT1 // Type of the left-hand side target vector
1151  , typename MT1 // Type of the left-hand side matrix operand
1152  , typename VT2 > // Type of the right-hand side vector operand
1153  static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1154  {
1155  y.addAssign( A * x );
1156  }
1158  //**********************************************************************************************
1159 
1160  //**Default addition assignment to dense vectors (small matrices)*******************************
// Small-matrix addition assignment, fallback overload: simply forwards to
// selectDefaultAddAssignKernel().
// \param y  Target vector.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
// NOTE(review): listing line 1177 (between the template header and the function
// name) is not visible here; it presumably carries the return type and the
// condition that selects this overload - confirm against the full header.
1174  template< typename VT1 // Type of the left-hand side target vector
1175  , typename MT1 // Type of the left-hand side matrix operand
1176  , typename VT2 > // Type of the right-hand side vector operand
1178  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1179  {
1180  selectDefaultAddAssignKernel( y, A, x );
1181  }
1183  //**********************************************************************************************
1184 
1185  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized kernel for the small-matrix addition assignment y += A * x.
// The rows of A are processed in blocks of 8, 4, 3 and 2 rows, followed by at
// most one single leftover row. Inside a block every row owns one SIMD
// accumulator: the columns [jbegin,jpos) are traversed in steps of SIMDSIZE,
// each accumulator is then reduced with sum() and added to the matching element
// of y, and the columns [j,jend) that are left are handled by a scalar loop.
// \param y  Target vector; updated in place.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
// NOTE(review): listing line 1202 (between the template header and the function
// name) is not visible here; it presumably carries the return type and the
// condition that selects this overload - confirm against the full header.
1199  template< typename VT1 // Type of the left-hand side target vector
1200  , typename MT1 // Type of the left-hand side matrix operand
1201  , typename VT2 > // Type of the right-hand side vector operand
1203  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1204  {
      // The scalar clean-up loops are only active if A or x is not padded.
1205  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1206 
1207  const size_t M( A.rows() );
1208  const size_t N( A.columns() );
1209 
1210  size_t i( 0UL );
1211 
      // Blocks of eight rows.
1212  for( ; (i+8UL) <= M; i+=8UL )
1213  {
      // Upper matrices: the start column is derived from the first row of the
      // block (i, or i+1 for strictly upper) and masked with -SIMDSIZE, which
      // presumably rounds it down to a multiple of SIMDSIZE - TODO confirm.
      // All other matrices start at column 0.
1214  const size_t jbegin( ( IsUpper<MT1>::value )
1215  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1216  :( 0UL ) );
      // Lower matrices: the end column is derived from the last row of the
      // block (i+8, or i+7 for strictly lower). All other matrices use all N
      // columns.
1217  const size_t jend( ( IsLower<MT1>::value )
1218  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1219  :( N ) );
1220  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1221 
      // End of the SIMD loop: jend masked with -SIMDSIZE when a scalar remainder
      // loop exists, otherwise jend itself.
1222  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1223  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1224 
      // NOTE(review): the accumulators are only ever updated with +=, so this
      // relies on a default-constructed SIMDType being zero - confirm.
1225  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1226  size_t j( jbegin );
1227 
      // SIMD part: one load of x is shared by all eight rows.
1228  for( ; j<jpos; j+=SIMDSIZE ) {
1229  const SIMDType x1( x.load(j) );
1230  xmm1 += A.load(i ,j) * x1;
1231  xmm2 += A.load(i+1UL,j) * x1;
1232  xmm3 += A.load(i+2UL,j) * x1;
1233  xmm4 += A.load(i+3UL,j) * x1;
1234  xmm5 += A.load(i+4UL,j) * x1;
1235  xmm6 += A.load(i+5UL,j) * x1;
1236  xmm7 += A.load(i+6UL,j) * x1;
1237  xmm8 += A.load(i+7UL,j) * x1;
1238  }
1239 
      // Reduce each accumulator and add the result to the target element.
1240  y[i ] += sum( xmm1 );
1241  y[i+1UL] += sum( xmm2 );
1242  y[i+2UL] += sum( xmm3 );
1243  y[i+3UL] += sum( xmm4 );
1244  y[i+4UL] += sum( xmm5 );
1245  y[i+5UL] += sum( xmm6 );
1246  y[i+6UL] += sum( xmm7 );
1247  y[i+7UL] += sum( xmm8 );
1248 
      // Scalar part for the columns the SIMD loop did not cover.
1249  for( ; remainder && j<jend; ++j ) {
1250  y[i ] += A(i ,j) * x[j];
1251  y[i+1UL] += A(i+1UL,j) * x[j];
1252  y[i+2UL] += A(i+2UL,j) * x[j];
1253  y[i+3UL] += A(i+3UL,j) * x[j];
1254  y[i+4UL] += A(i+4UL,j) * x[j];
1255  y[i+5UL] += A(i+5UL,j) * x[j];
1256  y[i+6UL] += A(i+6UL,j) * x[j];
1257  y[i+7UL] += A(i+7UL,j) * x[j];
1258  }
1259  }
1260 
      // Blocks of four rows (same scheme as above).
1261  for( ; (i+4UL) <= M; i+=4UL )
1262  {
1263  const size_t jbegin( ( IsUpper<MT1>::value )
1264  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1265  :( 0UL ) );
1266  const size_t jend( ( IsLower<MT1>::value )
1267  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1268  :( N ) );
1269  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1270 
1271  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1272  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1273 
1274  SIMDType xmm1, xmm2, xmm3, xmm4;
1275  size_t j( jbegin );
1276 
1277  for( ; j<jpos; j+=SIMDSIZE ) {
1278  const SIMDType x1( x.load(j) );
1279  xmm1 += A.load(i ,j) * x1;
1280  xmm2 += A.load(i+1UL,j) * x1;
1281  xmm3 += A.load(i+2UL,j) * x1;
1282  xmm4 += A.load(i+3UL,j) * x1;
1283  }
1284 
1285  y[i ] += sum( xmm1 );
1286  y[i+1UL] += sum( xmm2 );
1287  y[i+2UL] += sum( xmm3 );
1288  y[i+3UL] += sum( xmm4 );
1289 
1290  for( ; remainder && j<jend; ++j ) {
1291  y[i ] += A(i ,j) * x[j];
1292  y[i+1UL] += A(i+1UL,j) * x[j];
1293  y[i+2UL] += A(i+2UL,j) * x[j];
1294  y[i+3UL] += A(i+3UL,j) * x[j];
1295  }
1296  }
1297 
      // Blocks of three rows (same scheme as above).
1298  for( ; (i+3UL) <= M; i+=3UL )
1299  {
1300  const size_t jbegin( ( IsUpper<MT1>::value )
1301  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1302  :( 0UL ) );
1303  const size_t jend( ( IsLower<MT1>::value )
1304  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1305  :( N ) );
1306  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1307 
1308  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1309  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1310 
1311  SIMDType xmm1, xmm2, xmm3;
1312  size_t j( jbegin );
1313 
1314  for( ; j<jpos; j+=SIMDSIZE ) {
1315  const SIMDType x1( x.load(j) );
1316  xmm1 += A.load(i ,j) * x1;
1317  xmm2 += A.load(i+1UL,j) * x1;
1318  xmm3 += A.load(i+2UL,j) * x1;
1319  }
1320 
1321  y[i ] += sum( xmm1 );
1322  y[i+1UL] += sum( xmm2 );
1323  y[i+2UL] += sum( xmm3 );
1324 
1325  for( ; remainder && j<jend; ++j ) {
1326  y[i ] += A(i ,j) * x[j];
1327  y[i+1UL] += A(i+1UL,j) * x[j];
1328  y[i+2UL] += A(i+2UL,j) * x[j];
1329  }
1330  }
1331 
      // Blocks of two rows (same scheme as above).
1332  for( ; (i+2UL) <= M; i+=2UL )
1333  {
1334  const size_t jbegin( ( IsUpper<MT1>::value )
1335  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1336  :( 0UL ) );
1337  const size_t jend( ( IsLower<MT1>::value )
1338  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1339  :( N ) );
1340  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1341 
1342  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1343  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1344 
1345  SIMDType xmm1, xmm2;
1346  size_t j( jbegin );
1347 
1348  for( ; j<jpos; j+=SIMDSIZE ) {
1349  const SIMDType x1( x.load(j) );
1350  xmm1 += A.load(i ,j) * x1;
1351  xmm2 += A.load(i+1UL,j) * x1;
1352  }
1353 
1354  y[i ] += sum( xmm1 );
1355  y[i+1UL] += sum( xmm2 );
1356 
1357  for( ; remainder && j<jend; ++j ) {
1358  y[i ] += A(i ,j) * x[j];
1359  y[i+1UL] += A(i+1UL,j) * x[j];
1360  }
1361  }
1362 
      // At most one row can be left after the two-row loop.
1363  if( i < M )
1364  {
1365  const size_t jbegin( ( IsUpper<MT1>::value )
1366  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1367  :( 0UL ) );
1368  const size_t jend( ( IsLower<MT1>::value )
1369  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1370  :( N ) );
1371  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1372 
1373  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1374  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1375 
1376  SIMDType xmm1;
1377  size_t j( jbegin );
1378 
1379  for( ; j<jpos; j+=SIMDSIZE ) {
1380  xmm1 += A.load(i,j) * x.load(j);
1381  }
1382 
1383  y[i] += sum( xmm1 );
1384 
1385  for( ; remainder && j<jend; ++j ) {
1386  y[i] += A(i,j) * x[j];
1387  }
1388  }
1389  }
1391  //**********************************************************************************************
1392 
1393  //**Default addition assignment to dense vectors (large matrices)*******************************
// Large-matrix addition assignment, fallback overload: simply forwards to
// selectDefaultAddAssignKernel().
// \param y  Target vector.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
// NOTE(review): listing line 1410 (between the template header and the function
// name) is not visible here; it presumably carries the return type and the
// condition that selects this overload - confirm against the full header.
1407  template< typename VT1 // Type of the left-hand side target vector
1408  , typename MT1 // Type of the left-hand side matrix operand
1409  , typename VT2 > // Type of the right-hand side vector operand
1411  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1412  {
1413  selectDefaultAddAssignKernel( y, A, x );
1414  }
1416  //**********************************************************************************************
1417 
1418  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized kernel for the large-matrix addition assignment y += A * x.
// The rows of A are processed in blocks of 8, 4 and 2 rows, followed by at most
// one single leftover row. For every row block the columns are traversed with
// decreasing unrolling: steps of 4*SIMDSIZE, then 2*SIMDSIZE, then SIMDSIZE, and
// finally a scalar loop over [j,jend). Unlike the small-matrix kernel no SIMD
// accumulators are kept across iterations: every step reduces its products with
// sum() and adds the result directly to the element of y.
// \param y  Target vector; updated in place.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
// NOTE(review): listing line 1435 (between the template header and the function
// name) is not visible here; it presumably carries the return type and the
// condition that selects this overload - confirm against the full header.
1432  template< typename VT1 // Type of the left-hand side target vector
1433  , typename MT1 // Type of the left-hand side matrix operand
1434  , typename VT2 > // Type of the right-hand side vector operand
1436  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1437  {
      // The scalar clean-up loops are only active if A or x is not padded.
1438  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1439 
1440  const size_t M( A.rows() );
1441  const size_t N( A.columns() );
1442 
1443  size_t i( 0UL );
1444 
      // Blocks of eight rows.
1445  for( ; (i+8UL) <= M; i+=8UL )
1446  {
      // Upper matrices: the start column is derived from the first row of the
      // block (i, or i+1 for strictly upper) and masked with -SIMDSIZE, which
      // presumably rounds it down to a multiple of SIMDSIZE - TODO confirm.
      // All other matrices start at column 0.
1447  const size_t jbegin( ( IsUpper<MT1>::value )
1448  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1449  :( 0UL ) );
      // Lower matrices: the end column is derived from the last row of the
      // block (i+8, or i+7 for strictly lower). All other matrices use all N
      // columns.
1450  const size_t jend( ( IsLower<MT1>::value )
1451  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1452  :( N ) );
1453  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1454 
      // End of the SIMD loops: jend masked with -SIMDSIZE when a scalar remainder
      // loop exists, otherwise jend itself.
1455  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1456  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1457 
1458  size_t j( jbegin );
1459 
      // Four SIMD vectors of x per iteration.
1460  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1461  const size_t j1( j+SIMDSIZE );
1462  const size_t j2( j+SIMDSIZE*2UL );
1463  const size_t j3( j+SIMDSIZE*3UL );
1464  const SIMDType x1( x.load(j ) );
1465  const SIMDType x2( x.load(j1) );
1466  const SIMDType x3( x.load(j2) );
1467  const SIMDType x4( x.load(j3) );
1468  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1469  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1470  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1471  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1472  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1473  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1474  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1475  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
1476  }
1477 
      // Two SIMD vectors of x per iteration.
1478  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1479  const size_t j1( j+SIMDSIZE );
1480  const SIMDType x1( x.load(j ) );
1481  const SIMDType x2( x.load(j1) );
1482  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1483  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1484  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1485  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1486  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1487  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1488  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1489  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
1490  }
1491 
      // One SIMD vector of x per iteration.
1492  for( ; j<jpos; j+=SIMDSIZE ) {
1493  const SIMDType x1( x.load(j) );
1494  y[i ] += sum( A.load(i ,j) * x1 );
1495  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1496  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1497  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1498  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
1499  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
1500  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
1501  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
1502  }
1503 
      // Scalar part for the columns the SIMD loops did not cover.
1504  for( ; remainder && j<jend; ++j ) {
1505  y[i ] += A(i ,j) * x[j];
1506  y[i+1UL] += A(i+1UL,j) * x[j];
1507  y[i+2UL] += A(i+2UL,j) * x[j];
1508  y[i+3UL] += A(i+3UL,j) * x[j];
1509  y[i+4UL] += A(i+4UL,j) * x[j];
1510  y[i+5UL] += A(i+5UL,j) * x[j];
1511  y[i+6UL] += A(i+6UL,j) * x[j];
1512  y[i+7UL] += A(i+7UL,j) * x[j];
1513  }
1514  }
1515 
      // Blocks of four rows (same scheme as above).
1516  for( ; (i+4UL) <= M; i+=4UL )
1517  {
1518  const size_t jbegin( ( IsUpper<MT1>::value )
1519  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1520  :( 0UL ) );
1521  const size_t jend( ( IsLower<MT1>::value )
1522  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1523  :( N ) );
1524  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1525 
1526  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1527  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1528 
1529  size_t j( jbegin );
1530 
1531  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1532  const size_t j1( j+SIMDSIZE );
1533  const size_t j2( j+SIMDSIZE*2UL );
1534  const size_t j3( j+SIMDSIZE*3UL );
1535  const SIMDType x1( x.load(j ) );
1536  const SIMDType x2( x.load(j1) );
1537  const SIMDType x3( x.load(j2) );
1538  const SIMDType x4( x.load(j3) );
1539  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1540  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1541  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1542  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1543  }
1544 
1545  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1546  const size_t j1( j+SIMDSIZE );
1547  const SIMDType x1( x.load(j ) );
1548  const SIMDType x2( x.load(j1) );
1549  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1550  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1551  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1552  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1553  }
1554 
1555  for( ; j<jpos; j+=SIMDSIZE ) {
1556  const SIMDType x1( x.load(j) );
1557  y[i ] += sum( A.load(i ,j) * x1 );
1558  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1559  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1560  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1561  }
1562 
1563  for( ; remainder && j<jend; ++j ) {
1564  y[i ] += A(i ,j) * x[j];
1565  y[i+1UL] += A(i+1UL,j) * x[j];
1566  y[i+2UL] += A(i+2UL,j) * x[j];
1567  y[i+3UL] += A(i+3UL,j) * x[j];
1568  }
1569  }
1570 
      // Blocks of two rows (same scheme as above).
1571  for( ; (i+2UL) <= M; i+=2UL )
1572  {
1573  const size_t jbegin( ( IsUpper<MT1>::value )
1574  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1575  :( 0UL ) );
1576  const size_t jend( ( IsLower<MT1>::value )
1577  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1578  :( N ) );
1579  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1580 
1581  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1582  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1583 
1584  size_t j( jbegin );
1585 
1586  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1587  const size_t j1( j+SIMDSIZE );
1588  const size_t j2( j+SIMDSIZE*2UL );
1589  const size_t j3( j+SIMDSIZE*3UL );
1590  const SIMDType x1( x.load(j ) );
1591  const SIMDType x2( x.load(j1) );
1592  const SIMDType x3( x.load(j2) );
1593  const SIMDType x4( x.load(j3) );
1594  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1595  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1596  }
1597 
1598  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1599  const size_t j1( j+SIMDSIZE );
1600  const SIMDType x1( x.load(j ) );
1601  const SIMDType x2( x.load(j1) );
1602  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1603  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1604  }
1605 
1606  for( ; j<jpos; j+=SIMDSIZE ) {
1607  const SIMDType x1( x.load(j) );
1608  y[i ] += sum( A.load(i ,j) * x1 );
1609  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1610  }
1611 
1612  for( ; remainder && j<jend; ++j ) {
1613  y[i ] += A(i ,j) * x[j];
1614  y[i+1UL] += A(i+1UL,j) * x[j];
1615  }
1616  }
1617 
      // At most one row can be left after the two-row loop.
1618  if( i < M )
1619  {
1620  const size_t jbegin( ( IsUpper<MT1>::value )
1621  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1622  :( 0UL ) );
1623  const size_t jend( ( IsLower<MT1>::value )
1624  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1625  :( N ) );
1626  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1627 
1628  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1629  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1630 
1631  size_t j( jbegin );
1632 
1633  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1634  const size_t j1( j+SIMDSIZE );
1635  const size_t j2( j+SIMDSIZE*2UL );
1636  const size_t j3( j+SIMDSIZE*3UL );
1637  const SIMDType x1( x.load(j ) );
1638  const SIMDType x2( x.load(j1) );
1639  const SIMDType x3( x.load(j2) );
1640  const SIMDType x4( x.load(j3) );
1641  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1642  }
1643 
1644  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1645  const size_t j1( j+SIMDSIZE );
1646  const SIMDType x1( x.load(j ) );
1647  const SIMDType x2( x.load(j1) );
1648  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1649  }
1650 
1651  for( ; j<jpos; j+=SIMDSIZE ) {
1652  const SIMDType x1( x.load(j) );
1653  y[i] += sum( A.load(i,j) * x1 );
1654  }
1655 
1656  for( ; remainder && j<jend; ++j ) {
1657  y[i] += A(i,j) * x[j];
1658  }
1659  }
1660  }
1662  //**********************************************************************************************
1663 
1664  //**BLAS-based addition assignment to dense vectors (default)***********************************
// BLAS-based addition assignment, fallback overload: no BLAS routine is called
// here, the work is forwarded to selectLargeAddAssignKernel().
// \param y  Target vector.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
// NOTE(review): listing line 1681 (between the template header and the function
// name) is not visible here; it presumably carries the return type and the
// condition that selects this overload - confirm against the full header.
1678  template< typename VT1 // Type of the left-hand side target vector
1679  , typename MT1 // Type of the left-hand side matrix operand
1680  , typename VT2 > // Type of the right-hand side vector operand
1682  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1683  {
1684  selectLargeAddAssignKernel( y, A, x );
1685  }
1687  //**********************************************************************************************
1688 
1689  //**BLAS-based addition assignment to dense vectors*********************************************
1690 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1691 
// BLAS-based addition assignment y += A * x. This overload is only compiled when
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are both set
// (see the enclosing #if).
//  - Triangular MT1: x is copied into a temporary of type ResultType_<VT1>, trmv()
//    is applied to that temporary with CblasLower or CblasUpper depending on
//    IsLower<MT1>, and the temporary is then added to y via addAssign().
//  - Otherwise: gemv() is called with both scalar arguments set to ET(1).
// \param y  Target vector.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
// NOTE(review): listing line 1707 (between the template header and the function
// name) is not visible here; it presumably carries the return type and the
// condition that selects this overload - confirm against the full header.
1704  template< typename VT1 // Type of the left-hand side target vector
1705  , typename MT1 // Type of the left-hand side matrix operand
1706  , typename VT2 > // Type of the right-hand side vector operand
1708  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1709  {
1710  using ET = ElementType_<VT1>;
1711 
1712  if( IsTriangular<MT1>::value ) {
      // The product is formed in a temporary so that y keeps its old values
      // until the final addAssign(). Presumably trmv() overwrites its vector
      // argument with the product, as BLAS xTRMV does - TODO confirm.
1713  ResultType_<VT1> tmp( serial( x ) );
1714  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1715  addAssign( y, tmp );
1716  }
1717  else {
      // Presumably the two trailing arguments are the BLAS alpha and beta
      // factors, i.e. y = 1*A*x + 1*y - TODO confirm against the gemv wrapper.
1718  gemv( y, A, x, ET(1), ET(1) );
1719  }
1720  }
1722 #endif
1723  //**********************************************************************************************
1724 
1725  //**Addition assignment to sparse vectors*******************************************************
1726  // No special implementation for the addition assignment to sparse vectors.
1727  //**********************************************************************************************
1728 
1729  //**Subtraction assignment to dense vectors*****************************************************
1742  template< typename VT1 > // Type of the target dense vector
1743  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1744  {
1746 
1747  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1748 
1749  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1750  return;
1751  }
1752 
1753  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1754  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1755 
1756  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1757  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1758  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1759  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1760 
1761  DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1762  }
1764  //**********************************************************************************************
1765 
1766  //**Subtraction assignment to dense vectors (kernel selection)**********************************
// Chooses the kernel for the subtraction assignment y -= A * x.
// The "small" kernel is used when at least one of the following holds:
//  - MT1 is a diagonal matrix type,
//  - the expression's matrix operand type MT is a computation and the class
//    constant evaluateMatrix is false,
//  - the number of matrix elements (rows * columns) is below
//    DMATDVECMULT_THRESHOLD.
// In every other case the BLAS kernel selection is called.
// \param y  Target vector.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
1777  template< typename VT1 // Type of the left-hand side target vector
1778  , typename MT1 // Type of the left-hand side matrix operand
1779  , typename VT2 > // Type of the right-hand side vector operand
1780  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1781  {
1782  if( ( IsDiagonal<MT1>::value ) ||
1783  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1784  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1785  selectSmallSubAssignKernel( y, A, x );
1786  else
1787  selectBlasSubAssignKernel( y, A, x );
1788  }
1790  //**********************************************************************************************
1791 
1792  //**Default subtraction assignment to dense vectors*********************************************
// Default (non-vectorized) kernel for the subtraction assignment: forwards the
// product expression A * x to the subAssign() member of the target vector.
// \param y  Target vector.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
1806  template< typename VT1 // Type of the left-hand side target vector
1807  , typename MT1 // Type of the left-hand side matrix operand
1808  , typename VT2 > // Type of the right-hand side vector operand
1809  static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1810  {
1811  y.subAssign( A * x );
1812  }
1814  //**********************************************************************************************
1815 
1816  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Small-matrix subtraction assignment, fallback overload: simply forwards to
// selectDefaultSubAssignKernel().
// \param y  Target vector.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
// NOTE(review): listing line 1833 (between the template header and the function
// name) is not visible here; it presumably carries the return type and the
// condition that selects this overload - confirm against the full header.
1830  template< typename VT1 // Type of the left-hand side target vector
1831  , typename MT1 // Type of the left-hand side matrix operand
1832  , typename VT2 > // Type of the right-hand side vector operand
1834  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1835  {
1836  selectDefaultSubAssignKernel( y, A, x );
1837  }
1839  //**********************************************************************************************
1840 
1841  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized kernel for the small-matrix subtraction assignment y -= A * x.
// The rows of A are processed in blocks of 8, 4, 3 and 2 rows, followed by at
// most one single leftover row. Inside a block every row owns one SIMD
// accumulator: the columns [jbegin,jpos) are traversed in steps of SIMDSIZE,
// each accumulator is then reduced with sum() and subtracted from the matching
// element of y, and the columns [j,jend) that are left are handled by a scalar
// loop.
// \param y  Target vector; updated in place.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
// NOTE(review): listing line 1858 (between the template header and the function
// name) is not visible here; it presumably carries the return type and the
// condition that selects this overload - confirm against the full header.
1855  template< typename VT1 // Type of the left-hand side target vector
1856  , typename MT1 // Type of the left-hand side matrix operand
1857  , typename VT2 > // Type of the right-hand side vector operand
1859  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1860  {
      // The scalar clean-up loops are only active if A or x is not padded.
1861  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1862 
1863  const size_t M( A.rows() );
1864  const size_t N( A.columns() );
1865 
1866  size_t i( 0UL );
1867 
      // Blocks of eight rows.
1868  for( ; (i+8UL) <= M; i+=8UL )
1869  {
      // Upper matrices: the start column is derived from the first row of the
      // block (i, or i+1 for strictly upper) and masked with -SIMDSIZE, which
      // presumably rounds it down to a multiple of SIMDSIZE - TODO confirm.
      // All other matrices start at column 0.
1870  const size_t jbegin( ( IsUpper<MT1>::value )
1871  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1872  :( 0UL ) );
      // Lower matrices: the end column is derived from the last row of the
      // block (i+8, or i+7 for strictly lower). All other matrices use all N
      // columns.
1873  const size_t jend( ( IsLower<MT1>::value )
1874  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1875  :( N ) );
1876  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1877 
      // End of the SIMD loop: jend masked with -SIMDSIZE when a scalar remainder
      // loop exists, otherwise jend itself.
1878  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1879  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1880 
      // NOTE(review): the accumulators are only ever updated with +=, so this
      // relies on a default-constructed SIMDType being zero - confirm.
1881  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1882  size_t j( jbegin );
1883 
      // SIMD part: one load of x is shared by all eight rows.
1884  for( ; j<jpos; j+=SIMDSIZE ) {
1885  const SIMDType x1( x.load(j) );
1886  xmm1 += A.load(i ,j) * x1;
1887  xmm2 += A.load(i+1UL,j) * x1;
1888  xmm3 += A.load(i+2UL,j) * x1;
1889  xmm4 += A.load(i+3UL,j) * x1;
1890  xmm5 += A.load(i+4UL,j) * x1;
1891  xmm6 += A.load(i+5UL,j) * x1;
1892  xmm7 += A.load(i+6UL,j) * x1;
1893  xmm8 += A.load(i+7UL,j) * x1;
1894  }
1895 
      // Reduce each accumulator and subtract the result from the target element.
1896  y[i ] -= sum( xmm1 );
1897  y[i+1UL] -= sum( xmm2 );
1898  y[i+2UL] -= sum( xmm3 );
1899  y[i+3UL] -= sum( xmm4 );
1900  y[i+4UL] -= sum( xmm5 );
1901  y[i+5UL] -= sum( xmm6 );
1902  y[i+6UL] -= sum( xmm7 );
1903  y[i+7UL] -= sum( xmm8 );
1904 
      // Scalar part for the columns the SIMD loop did not cover.
1905  for( ; remainder && j<jend; ++j ) {
1906  y[i ] -= A(i ,j) * x[j];
1907  y[i+1UL] -= A(i+1UL,j) * x[j];
1908  y[i+2UL] -= A(i+2UL,j) * x[j];
1909  y[i+3UL] -= A(i+3UL,j) * x[j];
1910  y[i+4UL] -= A(i+4UL,j) * x[j];
1911  y[i+5UL] -= A(i+5UL,j) * x[j];
1912  y[i+6UL] -= A(i+6UL,j) * x[j];
1913  y[i+7UL] -= A(i+7UL,j) * x[j];
1914  }
1915  }
1916 
      // Blocks of four rows (same scheme as above).
1917  for( ; (i+4UL) <= M; i+=4UL )
1918  {
1919  const size_t jbegin( ( IsUpper<MT1>::value )
1920  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1921  :( 0UL ) );
1922  const size_t jend( ( IsLower<MT1>::value )
1923  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1924  :( N ) );
1925  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1926 
1927  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1928  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1929 
1930  SIMDType xmm1, xmm2, xmm3, xmm4;
1931  size_t j( jbegin );
1932 
1933  for( ; j<jpos; j+=SIMDSIZE ) {
1934  const SIMDType x1( x.load(j) );
1935  xmm1 += A.load(i ,j) * x1;
1936  xmm2 += A.load(i+1UL,j) * x1;
1937  xmm3 += A.load(i+2UL,j) * x1;
1938  xmm4 += A.load(i+3UL,j) * x1;
1939  }
1940 
1941  y[i ] -= sum( xmm1 );
1942  y[i+1UL] -= sum( xmm2 );
1943  y[i+2UL] -= sum( xmm3 );
1944  y[i+3UL] -= sum( xmm4 );
1945 
1946  for( ; remainder && j<jend; ++j ) {
1947  y[i ] -= A(i ,j) * x[j];
1948  y[i+1UL] -= A(i+1UL,j) * x[j];
1949  y[i+2UL] -= A(i+2UL,j) * x[j];
1950  y[i+3UL] -= A(i+3UL,j) * x[j];
1951  }
1952  }
1953 
      // Blocks of three rows (same scheme as above).
1954  for( ; (i+3UL) <= M; i+=3UL )
1955  {
1956  const size_t jbegin( ( IsUpper<MT1>::value )
1957  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1958  :( 0UL ) );
1959  const size_t jend( ( IsLower<MT1>::value )
1960  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1961  :( N ) );
1962  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1963 
1964  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1965  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1966 
1967  SIMDType xmm1, xmm2, xmm3;
1968  size_t j( jbegin );
1969 
1970  for( ; j<jpos; j+=SIMDSIZE ) {
1971  const SIMDType x1( x.load(j) );
1972  xmm1 += A.load(i ,j) * x1;
1973  xmm2 += A.load(i+1UL,j) * x1;
1974  xmm3 += A.load(i+2UL,j) * x1;
1975  }
1976 
1977  y[i ] -= sum( xmm1 );
1978  y[i+1UL] -= sum( xmm2 );
1979  y[i+2UL] -= sum( xmm3 );
1980 
1981  for( ; remainder && j<jend; ++j ) {
1982  y[i ] -= A(i ,j) * x[j];
1983  y[i+1UL] -= A(i+1UL,j) * x[j];
1984  y[i+2UL] -= A(i+2UL,j) * x[j];
1985  }
1986  }
1987 
      // Blocks of two rows (same scheme as above).
1988  for( ; (i+2UL) <= M; i+=2UL )
1989  {
1990  const size_t jbegin( ( IsUpper<MT1>::value )
1991  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1992  :( 0UL ) );
1993  const size_t jend( ( IsLower<MT1>::value )
1994  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1995  :( N ) );
1996  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1997 
1998  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1999  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2000 
2001  SIMDType xmm1, xmm2;
2002  size_t j( jbegin );
2003 
2004  for( ; j<jpos; j+=SIMDSIZE ) {
2005  const SIMDType x1( x.load(j) );
2006  xmm1 += A.load(i ,j) * x1;
2007  xmm2 += A.load(i+1UL,j) * x1;
2008  }
2009 
2010  y[i ] -= sum( xmm1 );
2011  y[i+1UL] -= sum( xmm2 );
2012 
2013  for( ; remainder && j<jend; ++j ) {
2014  y[i ] -= A(i ,j) * x[j];
2015  y[i+1UL] -= A(i+1UL,j) * x[j];
2016  }
2017  }
2018 
      // At most one row can be left after the two-row loop.
2019  if( i < M )
2020  {
2021  const size_t jbegin( ( IsUpper<MT1>::value )
2022  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2023  :( 0UL ) );
2024  const size_t jend( ( IsLower<MT1>::value )
2025  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2026  :( N ) );
2027  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2028 
2029  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2030  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2031 
2032  SIMDType xmm1;
2033  size_t j( jbegin );
2034 
2035  for( ; j<jpos; j+=SIMDSIZE ) {
2036  xmm1 += A.load(i,j) * x.load(j);
2037  }
2038 
2039  y[i] -= sum( xmm1 );
2040 
2041  for( ; remainder && j<jend; ++j ) {
2042  y[i] -= A(i,j) * x[j];
2043  }
2044  }
2045  }
2047  //**********************************************************************************************
2048 
2049  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Large-matrix subtraction assignment, fallback overload: simply forwards to
// selectDefaultSubAssignKernel().
// \param y  Target vector.
// \param A  Left-hand side matrix operand.
// \param x  Right-hand side vector operand.
// NOTE(review): listing line 2066 (between the template header and the function
// name) is not visible here; it presumably carries the return type and the
// condition that selects this overload - confirm against the full header.
2063  template< typename VT1 // Type of the left-hand side target vector
2064  , typename MT1 // Type of the left-hand side matrix operand
2065  , typename VT2 > // Type of the right-hand side vector operand
2067  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2068  {
2069  selectDefaultSubAssignKernel( y, A, x );
2070  }
2072  //**********************************************************************************************
2073 
2074  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized "large matrix" kernel for the subtraction assignment y -= A * x.
//
//   y  target vector; every y[i] is updated in place
//   A  left-hand side matrix operand, read via A.load(i,j) (SIMD) and A(i,j) (scalar)
//   x  right-hand side vector operand, read via x.load(j) (SIMD) and x[j] (scalar)
//
// Rows are processed in blocks of 8, then 4, then 2, followed by at most one leftover
// row. Inside each row block the columns are traversed four SIMD vectors per iteration,
// then two, then one, and finally element by element for the tail that is only needed
// when the operands are not both padded. Every step subtracts the horizontally reduced
// ( sum() ) partial dot product directly from y.
//
// NOTE(review): the return-type line (listing line 2091) is elided in this listing; it
// presumably holds the SFINAE condition that enables this overload -- confirm.
2088  template< typename VT1 // Type of the left-hand side target vector
2089  , typename MT1 // Type of the left-hand side matrix operand
2090  , typename VT2 > // Type of the right-hand side vector operand
2092  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2093  {
// A scalar tail loop is required as soon as either operand is not padded.
2094  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
2095 
2096  const size_t M( A.rows() );
2097  const size_t N( A.columns() );
2098 
2099  size_t i( 0UL );
2100 
// ---- Blocks of 8 rows ----
2101  for( ; (i+8UL) <= M; i+=8UL )
2102  {
// First column to visit: for upper matrices start at the diagonal (one past it for
// strictly upper matrices), rounded down to a multiple of SIMDSIZE by the mask;
// otherwise start at column 0.
// NOTE(review): rounding down touches elements left of the diagonal, which are
// presumably zero for upper matrices -- confirm.
2103  const size_t jbegin( ( IsUpper<MT1>::value )
2104  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2105  :( 0UL ) );
// One past the last column to visit: for lower matrices this is bounded by the last
// row of the block (i+8, or i+7 for strictly lower matrices); otherwise all N columns.
2106  const size_t jend( ( IsLower<MT1>::value )
2107  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
2108  :( N ) );
2109  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2110 
// End of the SIMD range: jend rounded down to a multiple of SIMDSIZE when a scalar
// tail is required, otherwise jend itself.
2111  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2112  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2113 
2114  size_t j( jbegin );
2115 
// Four SIMD vectors of x per iteration; each x vector is loaded once and reused for
// all eight rows.
2116  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2117  const size_t j1( j+SIMDSIZE );
2118  const size_t j2( j+SIMDSIZE*2UL );
2119  const size_t j3( j+SIMDSIZE*3UL );
2120  const SIMDType x1( x.load(j ) );
2121  const SIMDType x2( x.load(j1) );
2122  const SIMDType x3( x.load(j2) );
2123  const SIMDType x4( x.load(j3) );
2124  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2125  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2126  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2127  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2128  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2129  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2130  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2131  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
2132  }
2133 
// Two SIMD vectors of x per iteration.
2134  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2135  const size_t j1( j+SIMDSIZE );
2136  const SIMDType x1( x.load(j ) );
2137  const SIMDType x2( x.load(j1) );
2138  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2139  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2140  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2141  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2142  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2143  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2144  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2145  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
2146  }
2147 
// One SIMD vector of x per iteration.
2148  for( ; j<jpos; j+=SIMDSIZE ) {
2149  const SIMDType x1( x.load(j) );
2150  y[i ] -= sum( A.load(i ,j) * x1 );
2151  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2152  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2153  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2154  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 );
2155  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 );
2156  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 );
2157  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 );
2158  }
2159 
// Scalar tail for the columns in [jpos, jend); skipped entirely when both operands
// are padded ( remainder == false ).
2160  for( ; remainder && j<jend; ++j ) {
2161  y[i ] -= A(i ,j) * x[j];
2162  y[i+1UL] -= A(i+1UL,j) * x[j];
2163  y[i+2UL] -= A(i+2UL,j) * x[j];
2164  y[i+3UL] -= A(i+3UL,j) * x[j];
2165  y[i+4UL] -= A(i+4UL,j) * x[j];
2166  y[i+5UL] -= A(i+5UL,j) * x[j];
2167  y[i+6UL] -= A(i+6UL,j) * x[j];
2168  y[i+7UL] -= A(i+7UL,j) * x[j];
2169  }
2170  }
2171 
// ---- Blocks of 4 rows (same column scheme; jend is bounded by i+4 / i+3) ----
2172  for( ; (i+4UL) <= M; i+=4UL )
2173  {
2174  const size_t jbegin( ( IsUpper<MT1>::value )
2175  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2176  :( 0UL ) );
2177  const size_t jend( ( IsLower<MT1>::value )
2178  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
2179  :( N ) );
2180  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2181 
2182  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2183  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2184 
2185  size_t j( jbegin );
2186 
2187  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2188  const size_t j1( j+SIMDSIZE );
2189  const size_t j2( j+SIMDSIZE*2UL );
2190  const size_t j3( j+SIMDSIZE*3UL );
2191  const SIMDType x1( x.load(j ) );
2192  const SIMDType x2( x.load(j1) );
2193  const SIMDType x3( x.load(j2) );
2194  const SIMDType x4( x.load(j3) );
2195  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2196  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2197  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2198  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2199  }
2200 
2201  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2202  const size_t j1( j+SIMDSIZE );
2203  const SIMDType x1( x.load(j ) );
2204  const SIMDType x2( x.load(j1) );
2205  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2206  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2207  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2208  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2209  }
2210 
2211  for( ; j<jpos; j+=SIMDSIZE ) {
2212  const SIMDType x1( x.load(j) );
2213  y[i ] -= sum( A.load(i ,j) * x1 );
2214  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2215  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2216  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2217  }
2218 
2219  for( ; remainder && j<jend; ++j ) {
2220  y[i ] -= A(i ,j) * x[j];
2221  y[i+1UL] -= A(i+1UL,j) * x[j];
2222  y[i+2UL] -= A(i+2UL,j) * x[j];
2223  y[i+3UL] -= A(i+3UL,j) * x[j];
2224  }
2225  }
2226 
// ---- Blocks of 2 rows (jend is bounded by i+2 / i+1) ----
2227  for( ; (i+2UL) <= M; i+=2UL )
2228  {
2229  const size_t jbegin( ( IsUpper<MT1>::value )
2230  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2231  :( 0UL ) );
2232  const size_t jend( ( IsLower<MT1>::value )
2233  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
2234  :( N ) );
2235  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2236 
2237  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2238  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2239 
2240  size_t j( jbegin );
2241 
2242  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2243  const size_t j1( j+SIMDSIZE );
2244  const size_t j2( j+SIMDSIZE*2UL );
2245  const size_t j3( j+SIMDSIZE*3UL );
2246  const SIMDType x1( x.load(j ) );
2247  const SIMDType x2( x.load(j1) );
2248  const SIMDType x3( x.load(j2) );
2249  const SIMDType x4( x.load(j3) );
2250  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2251  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2252  }
2253 
2254  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2255  const size_t j1( j+SIMDSIZE );
2256  const SIMDType x1( x.load(j ) );
2257  const SIMDType x2( x.load(j1) );
2258  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2259  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2260  }
2261 
2262  for( ; j<jpos; j+=SIMDSIZE ) {
2263  const SIMDType x1( x.load(j) );
2264  y[i ] -= sum( A.load(i ,j) * x1 );
2265  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2266  }
2267 
2268  for( ; remainder && j<jend; ++j ) {
2269  y[i ] -= A(i ,j) * x[j];
2270  y[i+1UL] -= A(i+1UL,j) * x[j];
2271  }
2272  }
2273 
// ---- Leftover row: after the 2-row loop at most one row can remain ----
2274  if( i < M )
2275  {
2276  const size_t jbegin( ( IsUpper<MT1>::value )
2277  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2278  :( 0UL ) );
2279  const size_t jend( ( IsLower<MT1>::value )
2280  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2281  :( N ) );
2282  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2283 
2284  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2285  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2286 
2287  size_t j( jbegin );
2288 
2289  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2290  const size_t j1( j+SIMDSIZE );
2291  const size_t j2( j+SIMDSIZE*2UL );
2292  const size_t j3( j+SIMDSIZE*3UL );
2293  const SIMDType x1( x.load(j ) );
2294  const SIMDType x2( x.load(j1) );
2295  const SIMDType x3( x.load(j2) );
2296  const SIMDType x4( x.load(j3) );
2297  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2298  }
2299 
2300  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2301  const size_t j1( j+SIMDSIZE );
2302  const SIMDType x1( x.load(j ) );
2303  const SIMDType x2( x.load(j1) );
2304  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2305  }
2306 
2307  for( ; j<jpos; j+=SIMDSIZE ) {
2308  const SIMDType x1( x.load(j) );
2309  y[i] -= sum( A.load(i,j) * x1 );
2310  }
2311 
2312  for( ; remainder && j<jend; ++j ) {
2313  y[i] -= A(i,j) * x[j];
2314  }
2315  }
2316  }
2318  //**********************************************************************************************
2319 
2320  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Default variant of the BLAS-based subtraction assignment kernel for y -= A * x.
// It issues no BLAS call itself and simply forwards to selectLargeSubAssignKernel().
//   y  target vector that is updated in place
//   A  left-hand side matrix operand
//   x  right-hand side vector operand
// NOTE(review): the return-type line (listing line 2337) is elided in this listing; it
// presumably restricts this overload to operand types for which the BLAS kernel is not
// applicable -- confirm.
2334  template< typename VT1 // Type of the left-hand side target vector
2335  , typename MT1 // Type of the left-hand side matrix operand
2336  , typename VT2 > // Type of the right-hand side vector operand
2338  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2339  {
2340  selectLargeSubAssignKernel( y, A, x );
2341  }
2343  //**********************************************************************************************
2344 
2345  //**BLAS-based subtraction assignment to dense vectors******************************************
2346 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2347 
// BLAS-based kernel for the subtraction assignment y -= A * x. It is only compiled when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled
// (see the enclosing #if).
//   y  target vector that is updated in place
//   A  left-hand side matrix operand
//   x  right-hand side vector operand
// NOTE(review): the return-type line (listing line 2363) is elided in this listing;
// presumably it enables this overload only for BLAS-compatible operands -- confirm.
2360  template< typename VT1 // Type of the left-hand side target vector
2361  , typename MT1 // Type of the left-hand side matrix operand
2362  , typename VT2 > // Type of the right-hand side vector operand
2364  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2365  {
2366  using ET = ElementType_<VT1>;
2367 
// Triangular matrices: evaluate x into a temporary, hand it to trmv() together with the
// matching lower/upper flag, and subtract the temporary from y afterwards.
// NOTE(review): trmv() presumably overwrites tmp with A * tmp in place -- confirm.
2368  if( IsTriangular<MT1>::value ) {
2369  ResultType_<VT1> tmp( serial( x ) );
2370  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2371  subAssign( y, tmp );
2372  }
// General matrices: a single gemv() call with the scalar factors -1 and 1.
// NOTE(review): presumably this evaluates y = (-1)*A*x + 1*y -- confirm the argument
// order of the gemv() wrapper.
2373  else {
2374  gemv( y, A, x, ET(-1), ET(1) );
2375  }
2376  }
2378 #endif
2379  //**********************************************************************************************
2380 
2381  //**Subtraction assignment to sparse vectors****************************************************
2382  // No special implementation for the subtraction assignment to sparse vectors.
2383  //**********************************************************************************************
2384 
2385  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of a dense matrix/dense vector product to a dense vector.
//   lhs  target dense vector
//   rhs  matrix/vector multiplication expression
// The expression is first evaluated serially into a temporary of type ResultType; the
// temporary is then passed on to multAssign() for the actual update of lhs.
2398  template< typename VT1 > // Type of the target dense vector
2399  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2400  {
2402 
2406 
2407  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2408 
2409  const ResultType tmp( serial( rhs ) );
2410  multAssign( ~lhs, tmp );
2411  }
2413  //**********************************************************************************************
2414 
2415  //**Multiplication assignment to sparse vectors*************************************************
2416  // No special implementation for the multiplication assignment to sparse vectors.
2417  //**********************************************************************************************
2418 
2419  //**Division assignment to dense vectors********************************************************
// Division assignment of a dense matrix/dense vector product to a dense vector.
//   lhs  target dense vector
//   rhs  matrix/vector multiplication expression acting as divisor
// The expression is first evaluated serially into a temporary of type ResultType; the
// temporary is then passed on to divAssign() for the actual update of lhs.
2432  template< typename VT1 > // Type of the target dense vector
2433  friend inline void divAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2434  {
2436 
2440 
2441  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2442 
2443  const ResultType tmp( serial( rhs ) );
2444  divAssign( ~lhs, tmp );
2445  }
2447  //**********************************************************************************************
2448 
2449  //**Division assignment to sparse vectors*******************************************************
2450  // No special implementation for the division assignment to sparse vectors.
2451  //**********************************************************************************************
2452 
2453  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of a dense matrix/dense vector product to a dense vector; participates
// in overload resolution only if UseSMPAssign<VT1> holds.
// NOTE(review): the line with the function name and parameter list (listing line 2470)
// is elided in this listing; judging by the body the parameters are `lhs` (target
// vector) and `rhs` (multiplication expression) -- confirm.
2468  template< typename VT1 > // Type of the target dense vector
2469  friend inline EnableIf_< UseSMPAssign<VT1> >
2471  {
2473 
2474  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2475 
// A matrix without rows leaves nothing to assign; a matrix without columns yields an
// empty sum per element, so the target is reset instead.
2476  if( rhs.mat_.rows() == 0UL ) {
2477  return;
2478  }
2479  else if( rhs.mat_.columns() == 0UL ) {
2480  reset( ~lhs );
2481  return;
2482  }
2483 
2484  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2485  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2486 
2487  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2488  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2489  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2490  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2491 
// Re-dispatch the product of the evaluated operands to the SMP assignment.
2492  smpAssign( ~lhs, A * x );
2493  }
2495  //**********************************************************************************************
2496 
2497  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of a dense matrix/dense vector product to a sparse vector; participates
// in overload resolution only if UseSMPAssign<VT1> holds. The expression is evaluated
// into a temporary of type ResultType, which is then SMP-assigned to the target.
// NOTE(review): the line with the function name and parameter list (listing line 2514)
// is elided in this listing -- confirm the exact signature.
2512  template< typename VT1 > // Type of the target sparse vector
2513  friend inline EnableIf_< UseSMPAssign<VT1> >
2515  {
2517 
2521 
2522  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2523 
2524  const ResultType tmp( rhs );
2525  smpAssign( ~lhs, tmp );
2526  }
2528  //**********************************************************************************************
2529 
2530  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of a dense matrix/dense vector product to a dense vector;
// participates in overload resolution only if UseSMPAssign<VT1> holds.
// NOTE(review): the line with the function name and parameter list (listing line 2547)
// is elided in this listing -- confirm the exact signature.
2545  template< typename VT1 > // Type of the target dense vector
2546  friend inline EnableIf_< UseSMPAssign<VT1> >
2548  {
2550 
2551  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2552 
// With an empty matrix there is nothing to add, so the target stays untouched.
2553  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2554  return;
2555  }
2556 
2557  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2558  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2559 
2560  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2561  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2562  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2563  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2564 
// Re-dispatch the product of the evaluated operands to the SMP addition assignment.
2565  smpAddAssign( ~lhs, A * x );
2566  }
2568  //**********************************************************************************************
2569 
2570  //**SMP addition assignment to sparse vectors***************************************************
2571  // No special implementation for the SMP addition assignment to sparse vectors.
2572  //**********************************************************************************************
2573 
2574  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of a dense matrix/dense vector product to a dense vector;
// participates in overload resolution only if UseSMPAssign<VT1> holds.
// NOTE(review): the line with the function name and parameter list (listing line 2591)
// is elided in this listing -- confirm the exact signature.
2589  template< typename VT1 > // Type of the target dense vector
2590  friend inline EnableIf_< UseSMPAssign<VT1> >
2592  {
2594 
2595  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2596 
// With an empty matrix there is nothing to subtract, so the target stays untouched.
2597  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2598  return;
2599  }
2600 
2601  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2602  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2603 
2604  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2605  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2606  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2607  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2608 
// Re-dispatch the product of the evaluated operands to the SMP subtraction assignment.
2609  smpSubAssign( ~lhs, A * x );
2610  }
2612  //**********************************************************************************************
2613 
2614  //**SMP subtraction assignment to sparse vectors************************************************
2615  // No special implementation for the SMP subtraction assignment to sparse vectors.
2616  //**********************************************************************************************
2617 
2618  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of a dense matrix/dense vector product to a dense
// vector; participates in overload resolution only if UseSMPAssign<VT1> holds. The
// expression is evaluated into a temporary of type ResultType, which is then passed to
// smpMultAssign().
// NOTE(review): the line with the function name and parameter list (listing line 2635)
// is elided in this listing -- confirm the exact signature.
2633  template< typename VT1 > // Type of the target dense vector
2634  friend inline EnableIf_< UseSMPAssign<VT1> >
2636  {
2638 
2642 
2643  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2644 
2645  const ResultType tmp( rhs );
2646  smpMultAssign( ~lhs, tmp );
2647  }
2649  //**********************************************************************************************
2650 
2651  //**SMP multiplication assignment to sparse vectors*********************************************
2652  // No special implementation for the SMP multiplication assignment to sparse vectors.
2653  //**********************************************************************************************
2654 
2655  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of a dense matrix/dense vector product to a dense vector;
// participates in overload resolution only if UseSMPAssign<VT1> holds. The expression
// is evaluated into a temporary of type ResultType, which is then passed to
// smpDivAssign().
// NOTE(review): the line with the function name and parameter list (listing line 2672)
// is elided in this listing -- confirm the exact signature.
2670  template< typename VT1 > // Type of the target dense vector
2671  friend inline EnableIf_< UseSMPAssign<VT1> >
2673  {
2675 
2679 
2680  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2681 
2682  const ResultType tmp( rhs );
2683  smpDivAssign( ~lhs, tmp );
2684  }
2686  //**********************************************************************************************
2687 
2688  //**SMP division assignment to sparse vectors***************************************************
2689  // No special implementation for the SMP division assignment to sparse vectors.
2690  //**********************************************************************************************
2691 
2692  //**Compile time checks*************************************************************************
2700  //**********************************************************************************************
2701 };
2702 //*************************************************************************************************
2703 
2704 
2705 
2706 
2707 //=================================================================================================
2708 //
2709 // DVECSCALARMULTEXPR SPECIALIZATION
2710 //
2711 //=================================================================================================
2712 
2713 //*************************************************************************************************
2721 template< typename MT // Type of the left-hand side dense matrix
2722  , typename VT // Type of the right-hand side dense vector
2723  , typename ST > // Type of the scalar value
2724 class DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >
2725  : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false > >
2726  , private Computation
2727 {
2728  private:
2729  //**Type definitions****************************************************************************
// Private shorthand aliases for the wrapped matrix/vector product and its operands.
// Type of the dense matrix/dense vector multiplication expression being scaled.
2730  using MVM = DMatDVecMultExpr<MT,VT>;
// Result type of the multiplication expression.
2731  using RES = ResultType_<MVM>;
// Result type of the left-hand side dense matrix operand.
2732  using MRT = ResultType_<MT>;
// Result type of the right-hand side dense vector operand.
2733  using VRT = ResultType_<VT>;
// Element type of the matrix result type.
2734  using MET = ElementType_<MRT>;
// Element type of the vector result type.
2735  using VET = ElementType_<VRT>;
// Composite type of the left-hand side dense matrix operand.
2736  using MCT = CompositeType_<MT>;
// Composite type of the right-hand side dense vector operand.
2737  using VCT = CompositeType_<VT>;
2738  //**********************************************************************************************
2739 
2740  //**********************************************************************************************
// Compile-time switch named evaluateMatrix; the visible part of the condition requires
// the matrix operand to be a computation and the matrix and vector element types to be
// the same.
// NOTE(review): the remainder of this expression (listing line 2743) is elided in this
// listing, so the complete condition cannot be read off here -- consult the header.
2742  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2744  //**********************************************************************************************
2745 
2746  //**********************************************************************************************
// Compile-time switch named evaluateVector: set when the vector operand is a computation
// or when RequiresEvaluation reports true.
// NOTE(review): the second term tests RequiresEvaluation<MT> (the matrix type), not
// RequiresEvaluation<VT>; confirm whether the vector type was intended here.
2748  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<MT>::value };
2749  //**********************************************************************************************
2750 
2751  //**********************************************************************************************
2753 
// Helper trait selecting the SMP assignment overloads: `value` is true as soon as either
// the matrix or the vector operand has to be evaluated. The template parameter T1 does
// not enter the condition.
2756  template< typename T1 >
2757  struct UseSMPAssign {
2758  enum : bool { value = ( evaluateMatrix || evaluateVector ) };
2759  };
2760  //**********************************************************************************************
2761 
2762  //**********************************************************************************************
2764 
// Helper trait named UseBlasKernel, parameterized on four types T1..T4.
// NOTE(review): most lines of the condition (listing lines 2768-2772, 2774-2777 and
// 2779) are elided in this listing. The visible fragments require T1, T2 and T3 to be
// SIMD enabled and the element types of T1 and T3 to be identical; the remaining terms
// must be taken from the original header.
2766  template< typename T1, typename T2, typename T3, typename T4 >
2767  struct UseBlasKernel {
2773  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2778  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2780  };
2781  //**********************************************************************************************
2782 
2783  //**********************************************************************************************
2785 
// Helper trait named UseVectorizedDefaultKernel, parameterized on four types T1..T4.
// The visible terms require useOptimizedKernels, SIMD-enabled T1, T2 and T3, and
// available SIMD addition and multiplication for the element types of T2 and T3.
// NOTE(review): listing lines 2791 and 2793-2795 are elided in this listing, so part of
// the condition (the expression ending in ", T4 >::value") is not visible -- consult the
// original header.
2788  template< typename T1, typename T2, typename T3, typename T4 >
2789  struct UseVectorizedDefaultKernel {
2790  enum : bool { value = useOptimizedKernels &&
2792  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2796  , T4 >::value &&
2797  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2798  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
2799  };
2800  //**********************************************************************************************
2801 
2802  public:
2803  //**Type definitions****************************************************************************
// Public type aliases of the scaled matrix/vector product.
// NOTE(review): several aliases (e.g. ElementType, and the LT/RT types used by assign())
// are declared on lines elided from this listing.
// Result type: multiplication trait of the product's result type and the scalar type.
2805  using ResultType = MultTrait_<RES,ST>;
// Return type of the element access functions (a const value).
2809  using ReturnType = const ElementType;
// Composite type: the expression is represented by a const ResultType.
2810  using CompositeType = const ResultType;
2811 
// Left operand: the wrapped dense matrix/dense vector multiplication expression.
2813  using LeftOperand = const DMatDVecMultExpr<MT,VT>;
2814 
// Right operand: the scalar, held by value.
2816  using RightOperand = ST;
2817 
2820 
2823  //**********************************************************************************************
2824 
2825  //**Compilation flags***************************************************************************
// Compilation flag for SIMD optimization; the visible terms require a non-diagonal
// matrix type and SIMD-enabled matrix and vector operands.
// NOTE(review): the rest of this expression (listing lines 2829-2831) is elided in this
// listing -- consult the original header for the full condition.
2827  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
2828  MT::simdEnabled && VT::simdEnabled &&
2832 
// Compilation flag for SMP assignments: set only if neither operand has to be evaluated
// and both operand types are themselves SMP-assignable.
2834  enum : bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2835  !evaluateVector && VT::smpAssignable };
2836  //**********************************************************************************************
2837 
2838  //**SIMD properties*****************************************************************************
// SIMD width for this expression, taken from SIMDTrait<ElementType>::size.
2840  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
2841  //**********************************************************************************************
2842 
2843  //**Constructor*********************************************************************************
// Constructs the scaled expression from its two operands.
//   vector  the dense matrix/dense vector multiplication expression (left operand)
//   scalar  the scalar factor (right operand)
2849  explicit inline DVecScalarMultExpr( const MVM& vector, ST scalar )
2850  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2851  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2852  {}
2853  //**********************************************************************************************
2854 
2855  //**Subscript operator**************************************************************************
// Element access without a runtime bounds check (the index is only verified by an
// internal assertion).
//   index  access index, expected to be smaller than size()
// Returns the element of the wrapped product at `index` multiplied by the scalar.
2861  inline ReturnType operator[]( size_t index ) const {
2862  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2863  return vector_[index] * scalar_;
2864  }
2865  //**********************************************************************************************
2866 
2867  //**At function*********************************************************************************
// Checked element access.
//   index  access index
// Invokes BLAZE_THROW_OUT_OF_RANGE with "Invalid vector access index" when `index` is
// not smaller than size(); otherwise returns the same value as operator[].
2874  inline ReturnType at( size_t index ) const {
2875  if( index >= vector_.size() ) {
2876  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2877  }
2878  return (*this)[index];
2879  }
2880  //**********************************************************************************************
2881 
2882  //**Size function*******************************************************************************
// Returns the size of the expression, which is the size of the wrapped product.
2887  inline size_t size() const {
2888  return vector_.size();
2889  }
2890  //**********************************************************************************************
2891 
2892  //**Left operand access*************************************************************************
// Returns the left operand, i.e. the wrapped dense matrix/dense vector multiplication
// expression.
2897  inline LeftOperand leftOperand() const {
2898  return vector_;
2899  }
2900  //**********************************************************************************************
2901 
2902  //**Right operand access************************************************************************
// Returns the right operand, i.e. the scalar factor.
2907  inline RightOperand rightOperand() const {
2908  return scalar_;
2909  }
2910  //**********************************************************************************************
2911 
2912  //**********************************************************************************************
// Reports whether the expression can alias with the given address.
//   alias  address to check
// The query is delegated unchanged to the wrapped multiplication expression; the scalar
// is not consulted.
2918  template< typename T >
2919  inline bool canAlias( const T* alias ) const {
2920  return vector_.canAlias( alias );
2921  }
2922  //**********************************************************************************************
2923 
2924  //**********************************************************************************************
// Reports whether the expression is aliased with the given address.
//   alias  address to check
// The query is delegated unchanged to the wrapped multiplication expression; the scalar
// is not consulted.
2930  template< typename T >
2931  inline bool isAliased( const T* alias ) const {
2932  return vector_.isAliased( alias );
2933  }
2934  //**********************************************************************************************
2935 
2936  //**********************************************************************************************
// Reports whether the operands are properly aligned; delegated to the wrapped
// multiplication expression.
2941  inline bool isAligned() const {
2942  return vector_.isAligned();
2943  }
2944  //**********************************************************************************************
2945 
2946  //**********************************************************************************************
// Reports whether the expression can be used in SMP assignments.
// The result is true only if size() exceeds SMP_DMATDVECMULT_THRESHOLD and at least one
// of the alternatives in the first parenthesis holds. Visible alternatives: BLAS mode is
// disabled, the matrix operand is a computation that is not evaluated, or the matrix has
// fewer than DMATDVECMULT_THRESHOLD elements (rows * columns).
// NOTE(review): two further alternatives (listing lines 2954-2955) are elided in this
// listing -- consult the original header.
2951  inline bool canSMPAssign() const noexcept {
2952  LeftOperand_<MVM> A( vector_.leftOperand() );
2953  return ( !BLAZE_BLAS_MODE ||
2956  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2957  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2958  ( size() > SMP_DMATDVECMULT_THRESHOLD );
2959  }
2960  //**********************************************************************************************
2961 
2962  private:
2963  //**Member variables****************************************************************************
// Left operand: the wrapped dense matrix/dense vector multiplication expression.
2964  LeftOperand vector_;
// Right operand: the scalar factor applied to the product.
2965  RightOperand scalar_;
2966  //**********************************************************************************************
2967 
2968  //**Assignment to dense vectors*****************************************************************
// Assignment of a scaled dense matrix/dense vector product to a dense vector.
//   lhs  target dense vector
//   rhs  scaled multiplication expression
// The matrix and vector operands are extracted from the wrapped product, evaluated
// serially, and handed to selectAssignKernel() together with the scalar.
// NOTE(review): the operand types LT and RT are declared on lines elided from this
// listing.
2980  template< typename VT1 > // Type of the target dense vector
2981  friend inline void assign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
2982  {
2984 
2985  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2986 
2987  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
2988  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
2989 
// A matrix without rows leaves nothing to assign; a matrix without columns yields an
// empty sum per element, so the target is reset instead.
2990  if( left.rows() == 0UL ) {
2991  return;
2992  }
2993  else if( left.columns() == 0UL ) {
2994  reset( ~lhs );
2995  return;
2996  }
2997 
2998  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
2999  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3000 
3001  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3002  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3003  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3004  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3005 
3006  DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
3007  }
3008  //**********************************************************************************************
3009 
3010  //**Assignment to dense vectors (kernel selection)**********************************************
// Selects the kernel for the assignment y = A * x * scalar.
//   y       target vector
//   A       evaluated left-hand side matrix operand
//   x       evaluated right-hand side vector operand
//   scalar  scaling factor
// The small-matrix kernel is chosen when A is diagonal, when the matrix operand is a
// computation that is not evaluated, or when A has fewer than DMATDVECMULT_THRESHOLD
// elements (rows * columns); in all other cases the BLAS kernel selection is used.
3021  template< typename VT1 // Type of the left-hand side target vector
3022  , typename MT1 // Type of the left-hand side matrix operand
3023  , typename VT2 // Type of the right-hand side vector operand
3024  , typename ST2 > // Type of the scalar value
3025  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3026  {
3027  if( ( IsDiagonal<MT1>::value ) ||
3028  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3029  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3030  selectSmallAssignKernel( y, A, x, scalar );
3031  else
3032  selectBlasAssignKernel( y, A, x, scalar );
3033  }
3034  //**********************************************************************************************
3035 
3036  //**Default assignment to dense vectors*********************************************************
// Default kernel for the assignment of a scaled matrix/vector product: hands the
// expression A * x * scalar to the assign() member function of the target vector.
//   y       target vector
//   A       left-hand side matrix operand
//   x       right-hand side vector operand
//   scalar  scaling factor
// NOTE(review): the return-type line (listing line 3054) is elided in this listing --
// confirm the enabling condition against the original header.
3050  template< typename VT1 // Type of the left-hand side target vector
3051  , typename MT1 // Type of the left-hand side matrix operand
3052  , typename VT2 // Type of the right-hand side vector operand
3053  , typename ST2 > // Type of the scalar value
3055  selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3056  {
3057  y.assign( A * x * scalar );
3058  }
3059  //**********************************************************************************************
3060 
3061  //**Default assignment to dense vectors (small matrices)****************************************
// Default "small matrix" kernel for the assignment of a scaled matrix/vector product.
// It does no work of its own and simply forwards to selectDefaultAssignKernel().
//   y       target vector
//   A       left-hand side matrix operand
//   x       right-hand side vector operand
//   scalar  scaling factor
// NOTE(review): the return-type line (listing line 3079) is elided in this listing; it
// presumably carries the SFINAE condition that separates this overload from the
// vectorized one that follows -- confirm.
3075  template< typename VT1 // Type of the left-hand side target vector
3076  , typename MT1 // Type of the left-hand side matrix operand
3077  , typename VT2 // Type of the right-hand side vector operand
3078  , typename ST2 > // Type of the scalar value
3080  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3081  {
3082  selectDefaultAssignKernel( y, A, x, scalar );
3083  }
3084  //**********************************************************************************************
3085 
3086  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Vectorized small-matrix kernel for the scaled assignment y = ( A * x ) * scalar.
//
// Rows of A are handled in groups of 8, then 4, 3 and 2, followed by at most one single
// trailing row. For every group the dot products of the rows with x are accumulated in SIMD
// registers, reduced with sum(), multiplied by \a scalar and written to y with plain
// assignment (any previous content of y is overwritten). Columns that do not fill a complete
// SIMD vector are added afterwards by a scalar loop.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type/constraint line of this overload is not visible in this
// listing; presumably an EnableIf on the vectorization trait - confirm against the header.
3100  template< typename VT1 // Type of the left-hand side target vector
3101  , typename MT1 // Type of the left-hand side matrix operand
3102  , typename VT2 // Type of the right-hand side vector operand
3103  , typename ST2 > // Type of the scalar value
3105  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3106  {
// A scalar remainder loop is required as soon as one of the two operands is not padded.
3107  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3108 
3109  const size_t M( A.rows() );
3110  const size_t N( A.columns() );
3111 
3112  size_t i( 0UL );
3113 
// Groups of eight rows.
3114  for( ; (i+8UL) <= M; i+=8UL )
3115  {
// Upper matrix types: start at the diagonal (one past it if strictly upper), masked with
// size_t(-SIMDSIZE) so that the start column is a multiple of SIMDSIZE.
3116  const size_t jbegin( ( IsUpper<MT1>::value )
3117  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3118  :( 0UL ) );
// Lower matrix types: stop at the diagonal position of the last row of the group.
3119  const size_t jend( ( IsLower<MT1>::value )
3120  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3121  :( N ) );
3122  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3123 
// End of the SIMD loop: jend rounded down to a multiple of SIMDSIZE if a remainder loop
// is needed, otherwise jend itself.
3124  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3125  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3126 
// One accumulator per row; no explicit initializer is given.
// NOTE(review): presumably SIMDType default-constructs to zero - confirm.
3127  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3128  size_t j( jbegin );
3129 
3130  for( ; j<jpos; j+=SIMDSIZE ) {
3131  const SIMDType x1( x.load(j) );
3132  xmm1 += A.load(i ,j) * x1;
3133  xmm2 += A.load(i+1UL,j) * x1;
3134  xmm3 += A.load(i+2UL,j) * x1;
3135  xmm4 += A.load(i+3UL,j) * x1;
3136  xmm5 += A.load(i+4UL,j) * x1;
3137  xmm6 += A.load(i+5UL,j) * x1;
3138  xmm7 += A.load(i+6UL,j) * x1;
3139  xmm8 += A.load(i+7UL,j) * x1;
3140  }
3141 
// Horizontal reduction, scaling and assignment (overwrites y).
3142  y[i ] = sum( xmm1 ) * scalar;
3143  y[i+1UL] = sum( xmm2 ) * scalar;
3144  y[i+2UL] = sum( xmm3 ) * scalar;
3145  y[i+3UL] = sum( xmm4 ) * scalar;
3146  y[i+4UL] = sum( xmm5 ) * scalar;
3147  y[i+5UL] = sum( xmm6 ) * scalar;
3148  y[i+6UL] = sum( xmm7 ) * scalar;
3149  y[i+7UL] = sum( xmm8 ) * scalar;
3150 
// Scalar tail for the columns in [jpos,jend); skipped entirely if both operands are padded.
3151  for( ; remainder && j<jend; ++j ) {
3152  y[i ] += A(i ,j) * x[j] * scalar;
3153  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3154  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3155  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3156  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3157  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3158  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3159  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3160  }
3161  }
3162 
// Groups of four rows (same scheme as above).
3163  for( ; (i+4UL) <= M; i+=4UL )
3164  {
3165  const size_t jbegin( ( IsUpper<MT1>::value )
3166  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3167  :( 0UL ) );
3168  const size_t jend( ( IsLower<MT1>::value )
3169  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3170  :( N ) );
3171  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3172 
3173  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3174  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3175 
3176  SIMDType xmm1, xmm2, xmm3, xmm4;
3177  size_t j( jbegin );
3178 
3179  for( ; j<jpos; j+=SIMDSIZE ) {
3180  const SIMDType x1( x.load(j) );
3181  xmm1 += A.load(i ,j) * x1;
3182  xmm2 += A.load(i+1UL,j) * x1;
3183  xmm3 += A.load(i+2UL,j) * x1;
3184  xmm4 += A.load(i+3UL,j) * x1;
3185  }
3186 
3187  y[i ] = sum( xmm1 ) * scalar;
3188  y[i+1UL] = sum( xmm2 ) * scalar;
3189  y[i+2UL] = sum( xmm3 ) * scalar;
3190  y[i+3UL] = sum( xmm4 ) * scalar;
3191 
3192  for( ; remainder && j<jend; ++j ) {
3193  y[i ] += A(i ,j) * x[j] * scalar;
3194  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3195  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3196  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3197  }
3198  }
3199 
// Groups of three rows.
3200  for( ; (i+3UL) <= M; i+=3UL )
3201  {
3202  const size_t jbegin( ( IsUpper<MT1>::value )
3203  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3204  :( 0UL ) );
3205  const size_t jend( ( IsLower<MT1>::value )
3206  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3207  :( N ) );
3208  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3209 
3210  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3211  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3212 
3213  SIMDType xmm1, xmm2, xmm3;
3214  size_t j( jbegin );
3215 
3216  for( ; j<jpos; j+=SIMDSIZE ) {
3217  const SIMDType x1( x.load(j) );
3218  xmm1 += A.load(i ,j) * x1;
3219  xmm2 += A.load(i+1UL,j) * x1;
3220  xmm3 += A.load(i+2UL,j) * x1;
3221  }
3222 
3223  y[i ] = sum( xmm1 ) * scalar;
3224  y[i+1UL] = sum( xmm2 ) * scalar;
3225  y[i+2UL] = sum( xmm3 ) * scalar;
3226 
3227  for( ; remainder && j<jend; ++j ) {
3228  y[i ] += A(i ,j) * x[j] * scalar;
3229  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3230  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3231  }
3232  }
3233 
// Groups of two rows.
3234  for( ; (i+2UL) <= M; i+=2UL )
3235  {
3236  const size_t jbegin( ( IsUpper<MT1>::value )
3237  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3238  :( 0UL ) );
3239  const size_t jend( ( IsLower<MT1>::value )
3240  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3241  :( N ) );
3242  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3243 
3244  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3245  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3246 
3247  SIMDType xmm1, xmm2;
3248  size_t j( jbegin );
3249 
3250  for( ; j<jpos; j+=SIMDSIZE ) {
3251  const SIMDType x1( x.load(j) );
3252  xmm1 += A.load(i ,j) * x1;
3253  xmm2 += A.load(i+1UL,j) * x1;
3254  }
3255 
3256  y[i ] = sum( xmm1 ) * scalar;
3257  y[i+1UL] = sum( xmm2 ) * scalar;
3258 
3259  for( ; remainder && j<jend; ++j ) {
3260  y[i ] += A(i ,j) * x[j] * scalar;
3261  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3262  }
3263  }
3264 
// Single trailing row, if one is left after the grouped loops.
3265  if( i < M )
3266  {
3267  const size_t jbegin( ( IsUpper<MT1>::value )
3268  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3269  :( 0UL ) );
3270  const size_t jend( ( IsLower<MT1>::value )
3271  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3272  :( N ) );
3273  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3274 
3275  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3276  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3277 
3278  SIMDType xmm1;
3279  size_t j( jbegin );
3280 
3281  for( ; j<jpos; j+=SIMDSIZE ) {
3282  xmm1 += A.load(i,j) * x.load(j);
3283  }
3284 
3285  y[i] = sum( xmm1 ) * scalar;
3286 
3287  for( ; remainder && j<jend; ++j ) {
3288  y[i] += A(i,j) * x[j] * scalar;
3289  }
3290  }
3291  }
3292  //**********************************************************************************************
3293 
3294  //**Default assignment to dense vectors (large matrices)****************************************
// Non-vectorized fallback of the large-matrix assignment kernel for y = ( A * x ) * scalar.
// It does no work of its own and simply forwards all arguments to selectDefaultAssignKernel().
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type/constraint line of this overload is not visible in this
// listing; presumably a DisableIf on the vectorization trait - confirm against the header.
3308  template< typename VT1 // Type of the left-hand side target vector
3309  , typename MT1 // Type of the left-hand side matrix operand
3310  , typename VT2 // Type of the right-hand side vector operand
3311  , typename ST2 > // Type of the scalar value
3313  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3314  {
3315  selectDefaultAssignKernel( y, A, x, scalar );
3316  }
3317  //**********************************************************************************************
3318 
3319  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Vectorized large-matrix kernel for the scaled assignment y = ( A * x ) * scalar.
//
// The target vector is reset first and then used directly as the accumulator. Rows of A are
// handled in groups of 8, then 4 and 2, followed by at most one single trailing row. Within
// a group the column range is traversed with a SIMD loop unrolled over four vectors, then
// over two vectors, then over a single vector; each step reduces its partial products with
// sum() and adds them to y. A scalar loop adds the columns that do not fill a SIMD vector,
// and finally every element of the group is multiplied by \a scalar.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type/constraint line of this overload is not visible in this
// listing; presumably an EnableIf on the vectorization trait - confirm against the header.
3333  template< typename VT1 // Type of the left-hand side target vector
3334  , typename MT1 // Type of the left-hand side matrix operand
3335  , typename VT2 // Type of the right-hand side vector operand
3336  , typename ST2 > // Type of the scalar value
3338  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3339  {
// A scalar remainder loop is required as soon as one of the two operands is not padded.
3340  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3341 
3342  const size_t M( A.rows() );
3343  const size_t N( A.columns() );
3344 
// All following updates of y use +=, so y has to be cleared up front.
// NOTE(review): presumably reset() sets every element of y to its default (zero) - confirm.
3345  reset( y );
3346 
3347  size_t i( 0UL );
3348 
// Groups of eight rows.
3349  for( ; (i+8UL) <= M; i+=8UL )
3350  {
// Upper matrix types: start at the diagonal (one past it if strictly upper), masked with
// size_t(-SIMDSIZE) so that the start column is a multiple of SIMDSIZE.
3351  const size_t jbegin( ( IsUpper<MT1>::value )
3352  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3353  :( 0UL ) );
// Lower matrix types: stop at the diagonal position of the last row of the group.
3354  const size_t jend( ( IsLower<MT1>::value )
3355  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3356  :( N ) );
3357  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3358 
// End of the SIMD loops: jend rounded down to a multiple of SIMDSIZE if a remainder loop
// is needed, otherwise jend itself.
3359  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3360  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3361 
3362  size_t j( jbegin );
3363 
// Four SIMD vectors per iteration; runs while the fourth vector still starts before jpos.
3364  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3365  const size_t j1( j+SIMDSIZE );
3366  const size_t j2( j+SIMDSIZE*2UL );
3367  const size_t j3( j+SIMDSIZE*3UL );
3368  const SIMDType x1( x.load(j ) );
3369  const SIMDType x2( x.load(j1) );
3370  const SIMDType x3( x.load(j2) );
3371  const SIMDType x4( x.load(j3) );
3372  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3373  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3374  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3375  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3376  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3377  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3378  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3379  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
3380  }
3381 
// Two SIMD vectors per iteration.
3382  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3383  const size_t j1( j+SIMDSIZE );
3384  const SIMDType x1( x.load(j ) );
3385  const SIMDType x2( x.load(j1) );
3386  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3387  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3388  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3389  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3390  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3391  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3392  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3393  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
3394  }
3395 
// One SIMD vector per iteration.
3396  for( ; j<jpos; j+=SIMDSIZE ) {
3397  const SIMDType x1( x.load(j) );
3398  y[i ] += sum( A.load(i ,j) * x1 );
3399  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3400  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3401  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3402  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
3403  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
3404  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
3405  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
3406  }
3407 
// Scalar tail for the columns in [jpos,jend); skipped entirely if both operands are padded.
3408  for( ; remainder && j<jend; ++j ) {
3409  y[i ] += A(i ,j) * x[j];
3410  y[i+1UL] += A(i+1UL,j) * x[j];
3411  y[i+2UL] += A(i+2UL,j) * x[j];
3412  y[i+3UL] += A(i+3UL,j) * x[j];
3413  y[i+4UL] += A(i+4UL,j) * x[j];
3414  y[i+5UL] += A(i+5UL,j) * x[j];
3415  y[i+6UL] += A(i+6UL,j) * x[j];
3416  y[i+7UL] += A(i+7UL,j) * x[j];
3417  }
3418 
// The scaling is applied once per element after the complete dot product is available.
3419  y[i ] *= scalar;
3420  y[i+1UL] *= scalar;
3421  y[i+2UL] *= scalar;
3422  y[i+3UL] *= scalar;
3423  y[i+4UL] *= scalar;
3424  y[i+5UL] *= scalar;
3425  y[i+6UL] *= scalar;
3426  y[i+7UL] *= scalar;
3427  }
3428 
// Groups of four rows (same scheme as above).
3429  for( ; (i+4UL) <= M; i+=4UL )
3430  {
3431  const size_t jbegin( ( IsUpper<MT1>::value )
3432  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3433  :( 0UL ) );
3434  const size_t jend( ( IsLower<MT1>::value )
3435  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3436  :( N ) );
3437  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3438 
3439  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3440  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3441 
3442  size_t j( jbegin );
3443 
3444  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3445  const size_t j1( j+SIMDSIZE );
3446  const size_t j2( j+SIMDSIZE*2UL );
3447  const size_t j3( j+SIMDSIZE*3UL );
3448  const SIMDType x1( x.load(j ) );
3449  const SIMDType x2( x.load(j1) );
3450  const SIMDType x3( x.load(j2) );
3451  const SIMDType x4( x.load(j3) );
3452  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3453  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3454  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3455  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3456  }
3457 
3458  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3459  const size_t j1( j+SIMDSIZE );
3460  const SIMDType x1( x.load(j ) );
3461  const SIMDType x2( x.load(j1) );
3462  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3463  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3464  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3465  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3466  }
3467 
3468  for( ; j<jpos; j+=SIMDSIZE ) {
3469  const SIMDType x1( x.load(j) );
3470  y[i ] += sum( A.load(i ,j) * x1 );
3471  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3472  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3473  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3474  }
3475 
3476  for( ; remainder && j<jend; ++j ) {
3477  y[i ] += A(i ,j) * x[j];
3478  y[i+1UL] += A(i+1UL,j) * x[j];
3479  y[i+2UL] += A(i+2UL,j) * x[j];
3480  y[i+3UL] += A(i+3UL,j) * x[j];
3481  }
3482 
3483  y[i ] *= scalar;
3484  y[i+1UL] *= scalar;
3485  y[i+2UL] *= scalar;
3486  y[i+3UL] *= scalar;
3487  }
3488 
// Groups of two rows.
3489  for( ; (i+2UL) <= M; i+=2UL )
3490  {
3491  const size_t jbegin( ( IsUpper<MT1>::value )
3492  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3493  :( 0UL ) );
3494  const size_t jend( ( IsLower<MT1>::value )
3495  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3496  :( N ) );
3497  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3498 
3499  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3500  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3501 
3502  size_t j( jbegin );
3503 
3504  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3505  const size_t j1( j+SIMDSIZE );
3506  const size_t j2( j+SIMDSIZE*2UL );
3507  const size_t j3( j+SIMDSIZE*3UL );
3508  const SIMDType x1( x.load(j ) );
3509  const SIMDType x2( x.load(j1) );
3510  const SIMDType x3( x.load(j2) );
3511  const SIMDType x4( x.load(j3) );
3512  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3513  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3514  }
3515 
3516  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3517  const size_t j1( j+SIMDSIZE );
3518  const SIMDType x1( x.load(j ) );
3519  const SIMDType x2( x.load(j1) );
3520  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3521  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3522  }
3523 
3524  for( ; j<jpos; j+=SIMDSIZE ) {
3525  const SIMDType x1( x.load(j) );
3526  y[i ] += sum( A.load(i ,j) * x1 );
3527  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3528  }
3529 
3530  for( ; remainder && j<jend; ++j ) {
3531  y[i ] += A(i ,j) * x[j];
3532  y[i+1UL] += A(i+1UL,j) * x[j];
3533  }
3534 
3535  y[i ] *= scalar;
3536  y[i+1UL] *= scalar;
3537  }
3538 
// Single trailing row, if one is left after the grouped loops.
3539  if( i < M )
3540  {
3541  const size_t jbegin( ( IsUpper<MT1>::value )
3542  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3543  :( 0UL ) );
3544  const size_t jend( ( IsLower<MT1>::value )
3545  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3546  :( N ) );
3547  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3548 
3549  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3550  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3551 
3552  size_t j( jbegin );
3553 
3554  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3555  const size_t j1( j+SIMDSIZE );
3556  const size_t j2( j+SIMDSIZE*2UL );
3557  const size_t j3( j+SIMDSIZE*3UL );
3558  const SIMDType x1( x.load(j ) );
3559  const SIMDType x2( x.load(j1) );
3560  const SIMDType x3( x.load(j2) );
3561  const SIMDType x4( x.load(j3) );
3562  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3563  }
3564 
3565  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3566  const size_t j1( j+SIMDSIZE );
3567  const SIMDType x1( x.load(j ) );
3568  const SIMDType x2( x.load(j1) );
3569  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3570  }
3571 
3572  for( ; j<jpos; j+=SIMDSIZE ) {
3573  const SIMDType x1( x.load(j) );
3574  y[i] += sum( A.load(i,j) * x1 );
3575  }
3576 
3577  for( ; remainder && j<jend; ++j ) {
3578  y[i] += A(i,j) * x[j];
3579  }
3580 
3581  y[i] *= scalar;
3582  }
3583  }
3584  //**********************************************************************************************
3585 
3586  //**BLAS-based assignment to dense vectors (default)********************************************
// Fallback of the BLAS-based assignment kernel for y = ( A * x ) * scalar.
// It performs no BLAS call and simply forwards all arguments to selectLargeAssignKernel().
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type/constraint line of this overload is not visible in this
// listing; presumably it is selected when the BLAS kernel is not applicable - confirm.
3600  template< typename VT1 // Type of the left-hand side target vector
3601  , typename MT1 // Type of the left-hand side matrix operand
3602  , typename VT2 // Type of the right-hand side vector operand
3603  , typename ST2 > // Type of the scalar value
3605  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3606  {
3607  selectLargeAssignKernel( y, A, x, scalar );
3608  }
3609  //**********************************************************************************************
3610 
3611  //**BLAS-based assignment to dense vectors******************************************************
3612 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3613 
// BLAS-based kernel for the scaled assignment y = ( A * x ) * scalar. This overload is only
// compiled if both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set.
//
// For triangular matrix types the scaled vector is first copied into y and then multiplied
// by A via trmv(); for all other matrix types a single gemv() call is used.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type/constraint line of this overload is not visible in this
// listing; presumably an EnableIf on a BLAS applicability trait - confirm against the header.
3626  template< typename VT1 // Type of the left-hand side target vector
3627  , typename MT1 // Type of the left-hand side matrix operand
3628  , typename VT2 // Type of the right-hand side vector operand
3629  , typename ST2 > // Type of the scalar value
3631  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3632  {
// Element type of the target vector; the scalar is converted to it for the gemv() call.
3633  using ET = ElementType_<VT1>;
3634 
3635  if( IsTriangular<MT1>::value ) {
// trmv() works on a single vector argument, so y has to hold scalar*x beforehand.
3636  assign( y, scalar * x );
3637  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3638  }
3639  else {
// Passes ET(scalar) and ET(0) as the two scalar factors of the gemv() call.
3640  gemv( y, A, x, ET(scalar), ET(0) );
3641  }
3642  }
3643 #endif
3644  //**********************************************************************************************
3645 
3646  //**Assignment to sparse vectors****************************************************************
// Assignment of a scaled dense matrix-dense vector multiplication to a sparse vector.
// The expression is evaluated serially into a temporary of type ResultType, which is then
// assigned to the sparse target vector.
//
// \param lhs The target left-hand side sparse vector.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// NOTE(review): several lines of the function body are not visible in this listing
// (presumably tracing and compile-time constraint macros) - confirm against the header.
3658  template< typename VT1 > // Type of the target sparse vector
3659  friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3660  {
3662 
3666 
3667  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3668 
// Evaluate the complete expression first; the result is then handed to the sparse target.
3669  const ResultType tmp( serial( rhs ) );
3670  assign( ~lhs, tmp );
3671  }
3672  //**********************************************************************************************
3673 
3674  //**Addition assignment to dense vectors********************************************************
// Addition assignment of a scaled dense matrix-dense vector multiplication to a dense vector
// ( y += s * A * x ). The two operands of the wrapped multiplication are evaluated serially
// and the actual computation is delegated to selectAddAssignKernel().
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// NOTE(review): one line of the function body is not visible in this listing (presumably a
// tracing macro) - confirm against the header.
3686  template< typename VT1 > // Type of the target dense vector
3687  friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3688  {
3690 
3691  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3692 
3693  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
3694  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
3695 
// Nothing is added for a matrix without rows or without columns.
3696  if( left.rows() == 0UL || left.columns() == 0UL ) {
3697  return;
3698  }
3699 
3700  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3701  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3702 
3703  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3704  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3705  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3706  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3707 
3708  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
3709  }
3710  //**********************************************************************************************
3711 
3712  //**Addition assignment to dense vectors (kernel selection)*************************************
// Kernel selection for the scaled addition assignment y += ( A * x ) * scalar.
//
// The small-matrix kernel is used if the matrix type is diagonal, if the original matrix
// operand type MT is a computation that is not evaluated (evaluateMatrix is false), or if
// the number of matrix elements is below DMATDVECMULT_THRESHOLD. In all other cases the
// BLAS add-assign kernel is selected.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
3723  template< typename VT1 // Type of the left-hand side target vector
3724  , typename MT1 // Type of the left-hand side matrix operand
3725  , typename VT2 // Type of the right-hand side vector operand
3726  , typename ST2 > // Type of the scalar value
3727  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3728  {
// Note: the second condition tests the class template parameter MT, not the kernel's MT1.
3729  if( ( IsDiagonal<MT1>::value ) ||
3730  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3731  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3732  selectSmallAddAssignKernel( y, A, x, scalar );
3733  else
3734  selectBlasAddAssignKernel( y, A, x, scalar );
3735  }
3736  //**********************************************************************************************
3737 
3738  //**Default addition assignment to dense vectors************************************************
// Default kernel for the scaled addition assignment y += ( A * x ) * scalar.
// The product is written as the expression A * x * scalar and handed to the addAssign()
// member function of the target vector.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
3752  template< typename VT1 // Type of the left-hand side target vector
3753  , typename MT1 // Type of the left-hand side matrix operand
3754  , typename VT2 // Type of the right-hand side vector operand
3755  , typename ST2 > // Type of the scalar value
3756  static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3757  {
3758  y.addAssign( A * x * scalar );
3759  }
3760  //**********************************************************************************************
3761 
3762  //**Default addition assignment to dense vectors (small matrices)*******************************
// Non-vectorized fallback of the small-matrix addition assignment kernel for
// y += ( A * x ) * scalar. It simply forwards all arguments to selectDefaultAddAssignKernel().
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type/constraint line of this overload is not visible in this
// listing; presumably a DisableIf on the vectorization trait - confirm against the header.
3776  template< typename VT1 // Type of the left-hand side target vector
3777  , typename MT1 // Type of the left-hand side matrix operand
3778  , typename VT2 // Type of the right-hand side vector operand
3779  , typename ST2 > // Type of the scalar value
3781  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3782  {
3783  selectDefaultAddAssignKernel( y, A, x, scalar );
3784  }
3785  //**********************************************************************************************
3786 
3787  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized small-matrix kernel for the scaled addition assignment y += ( A * x ) * scalar.
//
// Rows of A are handled in groups of 8, then 4, 3 and 2, followed by at most one single
// trailing row. For every group the dot products of the rows with x are accumulated in SIMD
// registers, reduced with sum(), multiplied by \a scalar and added to the existing content
// of y. Columns that do not fill a complete SIMD vector are added afterwards by a scalar loop.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type/constraint line of this overload is not visible in this
// listing; presumably an EnableIf on the vectorization trait - confirm against the header.
3801  template< typename VT1 // Type of the left-hand side target vector
3802  , typename MT1 // Type of the left-hand side matrix operand
3803  , typename VT2 // Type of the right-hand side vector operand
3804  , typename ST2 > // Type of the scalar value
3806  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3807  {
// A scalar remainder loop is required as soon as one of the two operands is not padded.
3808  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3809 
3810  const size_t M( A.rows() );
3811  const size_t N( A.columns() );
3812 
3813  size_t i( 0UL );
3814 
// Groups of eight rows.
3815  for( ; (i+8UL) <= M; i+=8UL )
3816  {
// Upper matrix types: start at the diagonal (one past it if strictly upper), masked with
// size_t(-SIMDSIZE) so that the start column is a multiple of SIMDSIZE.
3817  const size_t jbegin( ( IsUpper<MT1>::value )
3818  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3819  :( 0UL ) );
// Lower matrix types: stop at the diagonal position of the last row of the group.
3820  const size_t jend( ( IsLower<MT1>::value )
3821  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3822  :( N ) );
3823  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3824 
// End of the SIMD loop: jend rounded down to a multiple of SIMDSIZE if a remainder loop
// is needed, otherwise jend itself.
3825  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3826  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3827 
// One accumulator per row; no explicit initializer is given.
// NOTE(review): presumably SIMDType default-constructs to zero - confirm.
3828  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3829  size_t j( jbegin );
3830 
3831  for( ; j<jpos; j+=SIMDSIZE ) {
3832  const SIMDType x1( x.load(j) );
3833  xmm1 += A.load(i ,j) * x1;
3834  xmm2 += A.load(i+1UL,j) * x1;
3835  xmm3 += A.load(i+2UL,j) * x1;
3836  xmm4 += A.load(i+3UL,j) * x1;
3837  xmm5 += A.load(i+4UL,j) * x1;
3838  xmm6 += A.load(i+5UL,j) * x1;
3839  xmm7 += A.load(i+6UL,j) * x1;
3840  xmm8 += A.load(i+7UL,j) * x1;
3841  }
3842 
// Horizontal reduction and scaling; the result is added to the existing value of y.
3843  y[i ] += sum( xmm1 ) * scalar;
3844  y[i+1UL] += sum( xmm2 ) * scalar;
3845  y[i+2UL] += sum( xmm3 ) * scalar;
3846  y[i+3UL] += sum( xmm4 ) * scalar;
3847  y[i+4UL] += sum( xmm5 ) * scalar;
3848  y[i+5UL] += sum( xmm6 ) * scalar;
3849  y[i+6UL] += sum( xmm7 ) * scalar;
3850  y[i+7UL] += sum( xmm8 ) * scalar;
3851 
// Scalar tail for the columns in [jpos,jend); skipped entirely if both operands are padded.
3852  for( ; remainder && j<jend; ++j ) {
3853  y[i ] += A(i ,j) * x[j] * scalar;
3854  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3855  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3856  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3857  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3858  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3859  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3860  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3861  }
3862  }
3863 
// Groups of four rows (same scheme as above).
3864  for( ; (i+4UL) <= M; i+=4UL )
3865  {
3866  const size_t jbegin( ( IsUpper<MT1>::value )
3867  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3868  :( 0UL ) );
3869  const size_t jend( ( IsLower<MT1>::value )
3870  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3871  :( N ) );
3872  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3873 
3874  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3875  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3876 
3877  SIMDType xmm1, xmm2, xmm3, xmm4;
3878  size_t j( jbegin );
3879 
3880  for( ; j<jpos; j+=SIMDSIZE ) {
3881  const SIMDType x1( x.load(j) );
3882  xmm1 += A.load(i ,j) * x1;
3883  xmm2 += A.load(i+1UL,j) * x1;
3884  xmm3 += A.load(i+2UL,j) * x1;
3885  xmm4 += A.load(i+3UL,j) * x1;
3886  }
3887 
3888  y[i ] += sum( xmm1 ) * scalar;
3889  y[i+1UL] += sum( xmm2 ) * scalar;
3890  y[i+2UL] += sum( xmm3 ) * scalar;
3891  y[i+3UL] += sum( xmm4 ) * scalar;
3892 
3893  for( ; remainder && j<jend; ++j ) {
3894  y[i ] += A(i ,j) * x[j] * scalar;
3895  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3896  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3897  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3898  }
3899  }
3900 
// Groups of three rows.
3901  for( ; (i+3UL) <= M; i+=3UL )
3902  {
3903  const size_t jbegin( ( IsUpper<MT1>::value )
3904  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3905  :( 0UL ) );
3906  const size_t jend( ( IsLower<MT1>::value )
3907  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3908  :( N ) );
3909  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3910 
3911  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3912  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3913 
3914  SIMDType xmm1, xmm2, xmm3;
3915  size_t j( jbegin );
3916 
3917  for( ; j<jpos; j+=SIMDSIZE ) {
3918  const SIMDType x1( x.load(j) );
3919  xmm1 += A.load(i ,j) * x1;
3920  xmm2 += A.load(i+1UL,j) * x1;
3921  xmm3 += A.load(i+2UL,j) * x1;
3922  }
3923 
3924  y[i ] += sum( xmm1 ) * scalar;
3925  y[i+1UL] += sum( xmm2 ) * scalar;
3926  y[i+2UL] += sum( xmm3 ) * scalar;
3927 
3928  for( ; remainder && j<jend; ++j ) {
3929  y[i ] += A(i ,j) * x[j] * scalar;
3930  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3931  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3932  }
3933  }
3934 
// Groups of two rows.
3935  for( ; (i+2UL) <= M; i+=2UL )
3936  {
3937  const size_t jbegin( ( IsUpper<MT1>::value )
3938  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3939  :( 0UL ) );
3940  const size_t jend( ( IsLower<MT1>::value )
3941  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3942  :( N ) );
3943  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3944 
3945  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3946  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3947 
3948  SIMDType xmm1, xmm2;
3949  size_t j( jbegin );
3950 
3951  for( ; j<jpos; j+=SIMDSIZE ) {
3952  const SIMDType x1( x.load(j) );
3953  xmm1 += A.load(i ,j) * x1;
3954  xmm2 += A.load(i+1UL,j) * x1;
3955  }
3956 
3957  y[i ] += sum( xmm1 ) * scalar;
3958  y[i+1UL] += sum( xmm2 ) * scalar;
3959 
3960  for( ; remainder && j<jend; ++j ) {
3961  y[i ] += A(i ,j) * x[j] * scalar;
3962  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3963  }
3964  }
3965 
// Single trailing row, if one is left after the grouped loops.
3966  if( i < M )
3967  {
3968  const size_t jbegin( ( IsUpper<MT1>::value )
3969  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3970  :( 0UL ) );
3971  const size_t jend( ( IsLower<MT1>::value )
3972  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3973  :( N ) );
3974  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3975 
3976  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3977  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3978 
3979  SIMDType xmm1;
3980  size_t j( jbegin );
3981 
3982  for( ; j<jpos; j+=SIMDSIZE ) {
3983  xmm1 += A.load(i,j) * x.load(j);
3984  }
3985 
3986  y[i] += sum( xmm1 ) * scalar;
3987 
3988  for( ; remainder && j<jend; ++j ) {
3989  y[i] += A(i,j) * x[j] * scalar;
3990  }
3991  }
3992  }
3993  //**********************************************************************************************
3994 
3995  //**Default addition assignment to dense vectors (large matrices)*******************************
// Non-vectorized fallback of the large-matrix addition assignment kernel for
// y += ( A * x ) * scalar. It simply forwards all arguments to selectDefaultAddAssignKernel().
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type/constraint line of this overload is not visible in this
// listing; presumably a DisableIf on the vectorization trait - confirm against the header.
4009  template< typename VT1 // Type of the left-hand side target vector
4010  , typename MT1 // Type of the left-hand side matrix operand
4011  , typename VT2 // Type of the right-hand side vector operand
4012  , typename ST2 > // Type of the scalar value
4014  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4015  {
4016  selectDefaultAddAssignKernel( y, A, x, scalar );
4017  }
4018  //**********************************************************************************************
4019 
4020  //**Vectorized default addition assignment to dense vectors (large matrices)********************
4034  template< typename VT1 // Type of the left-hand side target vector
4035  , typename MT1 // Type of the left-hand side matrix operand
4036  , typename VT2 // Type of the right-hand side vector operand
4037  , typename ST2 > // Type of the scalar value
4039  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4040  {
4041  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4042 
4043  const size_t M( A.rows() );
4044  const size_t N( A.columns() );
4045 
4046  size_t i( 0UL );
4047 
4048  for( ; (i+8UL) <= M; i+=8UL )
4049  {
4050  const size_t jbegin( ( IsUpper<MT1>::value )
4051  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4052  :( 0UL ) );
4053  const size_t jend( ( IsLower<MT1>::value )
4054  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4055  :( N ) );
4056  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4057 
4058  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4059  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4060 
4061  size_t j( jbegin );
4062 
4063  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4064  const size_t j1( j+SIMDSIZE );
4065  const size_t j2( j+SIMDSIZE*2UL );
4066  const size_t j3( j+SIMDSIZE*3UL );
4067  const SIMDType x1( x.load(j ) );
4068  const SIMDType x2( x.load(j1) );
4069  const SIMDType x3( x.load(j2) );
4070  const SIMDType x4( x.load(j3) );
4071  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4072  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4073  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4074  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4075  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4076  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4077  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4078  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4079  }
4080 
4081  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4082  const size_t j1( j+SIMDSIZE );
4083  const SIMDType x1( x.load(j ) );
4084  const SIMDType x2( x.load(j1) );
4085  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4086  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4087  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4088  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4089  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4090  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4091  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4092  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4093  }
4094 
4095  for( ; j<jpos; j+=SIMDSIZE ) {
4096  const SIMDType x1( x.load(j) );
4097  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4098  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4099  y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
4100  y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
4101  y[i+4UL] += sum( A.load(i+4UL,j) * x1 ) * scalar;
4102  y[i+5UL] += sum( A.load(i+5UL,j) * x1 ) * scalar;
4103  y[i+6UL] += sum( A.load(i+6UL,j) * x1 ) * scalar;
4104  y[i+7UL] += sum( A.load(i+7UL,j) * x1 ) * scalar;
4105  }
4106 
4107  for( ; remainder && j<jend; ++j ) {
4108  y[i ] += A(i ,j) * x[j] * scalar;
4109  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4110  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4111  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4112  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4113  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4114  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4115  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
4116  }
4117  }
4118 
4119  for( ; (i+4UL) <= M; i+=4UL )
4120  {
4121  const size_t jbegin( ( IsUpper<MT1>::value )
4122  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4123  :( 0UL ) );
4124  const size_t jend( ( IsLower<MT1>::value )
4125  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4126  :( N ) );
4127  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4128 
4129  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4130  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4131 
4132  size_t j( jbegin );
4133 
4134  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4135  const size_t j1( j+SIMDSIZE );
4136  const size_t j2( j+SIMDSIZE*2UL );
4137  const size_t j3( j+SIMDSIZE*3UL );
4138  const SIMDType x1( x.load(j ) );
4139  const SIMDType x2( x.load(j1) );
4140  const SIMDType x3( x.load(j2) );
4141  const SIMDType x4( x.load(j3) );
4142  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4143  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4144  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4145  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4146  }
4147 
4148  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4149  const size_t j1( j+SIMDSIZE );
4150  const SIMDType x1( x.load(j ) );
4151  const SIMDType x2( x.load(j1) );
4152  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4153  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4154  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4155  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4156  }
4157 
4158  for( ; j<jpos; j+=SIMDSIZE ) {
4159  const SIMDType x1( x.load(j) );
4160  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4161  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4162  y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
4163  y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
4164  }
4165 
4166  for( ; remainder && j<jend; ++j ) {
4167  y[i ] += A(i ,j) * x[j] * scalar;
4168  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4169  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4170  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4171  }
4172  }
4173 
4174  for( ; (i+2UL) <= M; i+=2UL )
4175  {
4176  const size_t jbegin( ( IsUpper<MT1>::value )
4177  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4178  :( 0UL ) );
4179  const size_t jend( ( IsLower<MT1>::value )
4180  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4181  :( N ) );
4182  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4183 
4184  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4185  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4186 
4187  size_t j( jbegin );
4188 
4189  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4190  const size_t j1( j+SIMDSIZE );
4191  const size_t j2( j+SIMDSIZE*2UL );
4192  const size_t j3( j+SIMDSIZE*3UL );
4193  const SIMDType x1( x.load(j ) );
4194  const SIMDType x2( x.load(j1) );
4195  const SIMDType x3( x.load(j2) );
4196  const SIMDType x4( x.load(j3) );
4197  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4198  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4199  }
4200 
4201  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4202  const size_t j1( j+SIMDSIZE );
4203  const SIMDType x1( x.load(j ) );
4204  const SIMDType x2( x.load(j1) );
4205  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4206  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4207  }
4208 
4209  for( ; j<jpos; j+=SIMDSIZE ) {
4210  const SIMDType x1( x.load(j) );
4211  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4212  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4213  }
4214 
4215  for( ; remainder && j<jend; ++j ) {
4216  y[i ] += A(i ,j) * x[j] * scalar;
4217  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4218  }
4219  }
4220 
4221  if( i < M )
4222  {
4223  const size_t jbegin( ( IsUpper<MT1>::value )
4224  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4225  :( 0UL ) );
4226  const size_t jend( ( IsLower<MT1>::value )
4227  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4228  :( N ) );
4229  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4230 
4231  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4232  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4233 
4234  size_t j( jbegin );
4235 
4236  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4237  const size_t j1( j+SIMDSIZE );
4238  const size_t j2( j+SIMDSIZE*2UL );
4239  const size_t j3( j+SIMDSIZE*3UL );
4240  const SIMDType x1( x.load(j ) );
4241  const SIMDType x2( x.load(j1) );
4242  const SIMDType x3( x.load(j2) );
4243  const SIMDType x4( x.load(j3) );
4244  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4245  }
4246 
4247  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4248  const size_t j1( j+SIMDSIZE );
4249  const SIMDType x1( x.load(j ) );
4250  const SIMDType x2( x.load(j1) );
4251  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4252  }
4253 
4254  for( ; j<jpos; j+=SIMDSIZE ) {
4255  const SIMDType x1( x.load(j) );
4256  y[i] += sum( A.load(i,j) * x1 ) * scalar;
4257  }
4258 
4259  for( ; remainder && j<jend; ++j ) {
4260  y[i] += A(i,j) * x[j] * scalar;
4261  }
4262  }
4263  }
4264  //**********************************************************************************************
4265 
4266  //**BLAS-based addition assignment to dense vectors (default)***********************************
// Fallback overload of the BLAS-based addition-assignment kernel.
// It performs no BLAS call itself: the request is forwarded unchanged to
// selectLargeAddAssignKernel(), i.e. the large-matrix kernel computes y += A * x * scalar.
//
// \param y      Target vector that is updated in place.
// \param A      Matrix operand.
// \param x      Vector operand.
// \param scalar Scaling factor, passed through unchanged.
//
// NOTE(review): the declaration line carrying the return type (listing line 4284) is not
// part of this listing; presumably it holds the constraint that selects this overload when
// the BLAS kernel is not applicable -- confirm against the original header.
4280  template< typename VT1 // Type of the left-hand side target vector
4281  , typename MT1 // Type of the left-hand side matrix operand
4282  , typename VT2 // Type of the right-hand side vector operand
4283  , typename ST2 > // Type of the scalar value
4285  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4286  {
4287  selectLargeAddAssignKernel( y, A, x, scalar );
4288  }
4289  //**********************************************************************************************
4290 
4291  //**BLAS-based addition assignment to dense vectors*********************************************
4292 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4293 
// BLAS-based addition-assignment kernel; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled (see the surrounding #if).
//
// Two paths, chosen at compile time via IsTriangular<MT1>:
//  - triangular A: the scaled vector (scalar * x) is evaluated serially into a temporary,
//    trmv() is applied to that temporary with the matching lower/upper flag, and the
//    result is added to y;
//  - otherwise: a single gemv() call with the factors ET(scalar) and ET(1), where ET is the
//    element type of the target vector (presumably y = scalar*A*x + 1*y -- confirm against
//    the gemv wrapper documentation).
//
// \param y      Target vector that is updated in place.
// \param A      Matrix operand.
// \param x      Vector operand.
// \param scalar Scaling factor.
//
// NOTE(review): the declaration line with the return type (listing line 4310) is not part
// of this listing.
4306  template< typename VT1 // Type of the left-hand side target vector
4307  , typename MT1 // Type of the left-hand side matrix operand
4308  , typename VT2 // Type of the right-hand side vector operand
4309  , typename ST2 > // Type of the scalar value
4311  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4312  {
4313  using ET = ElementType_<VT1>;
4314 
4315  if( IsTriangular<MT1>::value ) {
      // trmv() works on 'tmp' in place, hence the scaling is folded into the temporary first.
4316  ResultType_<VT1> tmp( serial( scalar * x ) );
4317  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4318  addAssign( y, tmp );
4319  }
4320  else {
4321  gemv( y, A, x, ET(scalar), ET(1) );
4322  }
4323  }
4324 #endif
4325  //**********************************************************************************************
4326 
4327  //**Addition assignment to sparse vectors*******************************************************
4328  // No special implementation for the addition assignment to sparse vectors.
4329  //**********************************************************************************************
4330 
4331  //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of a scaled dense matrix/dense vector product to a dense vector.
//
// Steps performed:
//  1. fetch the matrix and vector operands of the wrapped product (rhs.vector_);
//  2. return immediately, leaving lhs untouched, if the matrix has no rows or no columns;
//  3. evaluate both operands serially into the local objects A and x;
//  4. hand over to selectSubAssignKernel() together with the scalar factor rhs.scalar_.
//
// \param lhs Target dense vector; modified in place.
// \param rhs Scaled multiplication expression to be subtracted.
//
// NOTE(review): listing line 4346 (first statement of the body) is not part of this listing.
4343  template< typename VT1 > // Type of the target dense vector
4344  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4345  {
4347 
4348  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4349 
4350  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
4351  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
4352 
      // Empty matrix: the product contributes nothing, so there is nothing to subtract.
4353  if( left.rows() == 0UL || left.columns() == 0UL ) {
4354  return;
4355  }
4356 
4357  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4358  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
4359 
      // Sanity checks: evaluation must not have changed any dimension.
4360  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4361  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4362  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4363  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
4364 
4365  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
4366  }
4367  //**********************************************************************************************
4368 
4369  //**Subtraction assignment to dense vectors (kernel selection)**********************************
// Kernel dispatcher for the subtraction assignment y -= A * x * scalar.
//
// The small kernel is chosen if at least one of the following holds:
//  - MT1 is a diagonal matrix type;
//  - the matrix type MT of the enclosing expression (note: MT, not MT1) is a computation
//    and 'evaluateMatrix' is false;
//  - the number of matrix elements (rows * columns) is below DMATDVECMULT_THRESHOLD.
// In every other case the BLAS kernel entry point is used.
//
// \param y      Target vector that is updated in place.
// \param A      Matrix operand.
// \param x      Vector operand.
// \param scalar Scaling factor.
4380  template< typename VT1 // Type of the left-hand side target vector
4381  , typename MT1 // Type of the left-hand side matrix operand
4382  , typename VT2 // Type of the right-hand side vector operand
4383  , typename ST2 > // Type of the scalar value
4384  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4385  {
4386  if( ( IsDiagonal<MT1>::value ) ||
4387  ( IsComputation<MT>::value && !evaluateMatrix ) ||
4388  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4389  selectSmallSubAssignKernel( y, A, x, scalar );
4390  else
4391  selectBlasSubAssignKernel( y, A, x, scalar );
4392  }
4393  //**********************************************************************************************
4394 
4395  //**Default subtraction assignment to dense vectors*********************************************
// Default (non-specialized) subtraction-assignment kernel.
// Builds the expression A * x * scalar and passes it to the subAssign() member of the
// target vector; no manual loops are involved.
//
// \param y      Target vector that is updated in place.
// \param A      Matrix operand.
// \param x      Vector operand.
// \param scalar Scaling factor.
4409  template< typename VT1 // Type of the left-hand side target vector
4410  , typename MT1 // Type of the left-hand side matrix operand
4411  , typename VT2 // Type of the right-hand side vector operand
4412  , typename ST2 > // Type of the scalar value
4413  static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4414  {
4415  y.subAssign( A * x * scalar );
4416  }
4417  //**********************************************************************************************
4418 
4419  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Small-matrix subtraction-assignment kernel, fallback overload.
// Simply forwards all arguments to selectDefaultSubAssignKernel().
//
// \param y      Target vector that is updated in place.
// \param A      Matrix operand.
// \param x      Vector operand.
// \param scalar Scaling factor.
//
// NOTE(review): the declaration line with the return type (listing line 4437) is not part
// of this listing; presumably it constrains this overload to the non-vectorizable case --
// confirm against the original header.
4433  template< typename VT1 // Type of the left-hand side target vector
4434  , typename MT1 // Type of the left-hand side matrix operand
4435  , typename VT2 // Type of the right-hand side vector operand
4436  , typename ST2 > // Type of the scalar value
4438  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4439  {
4440  selectDefaultSubAssignKernel( y, A, x, scalar );
4441  }
4442  //**********************************************************************************************
4443 
4444  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized small-matrix kernel for y -= A * x * scalar.
//
// Rows of A are handled in groups of 8, then 4, 3, 2 and finally a single leftover row.
// For every group:
//  - [jbegin, jend) is the column range that is visited. For upper matrix types jbegin
//    starts at the diagonal (one past it for strictly upper types) with the low bits masked
//    off via size_t(-SIMDSIZE); for lower matrix types jend stops after the last row of the
//    group (one earlier for strictly lower types); otherwise the full range [0, N) is used.
//  - jpos is the end of the SIMD part: if a scalar remainder loop is required it is jend
//    rounded down to a multiple of SIMDSIZE (checked by the assertion), otherwise jend.
//  - one SIMD accumulator per row collects A(row, j..) * x(j..); afterwards its horizontal
//    sum multiplied by 'scalar' is subtracted from the matching element of y.
//  - leftover columns [jpos, jend) are processed element by element.
//
// 'remainder' is true if the matrix type or the vector type is not padded.
//
// \param y      Target vector that is updated in place.
// \param A      Matrix operand.
// \param x      Vector operand.
// \param scalar Scaling factor.
//
// NOTE(review): the accumulators are declared without initializer, so the code depends on
// SIMDType's default constructor producing zero -- confirm. The bit mask size_t(-SIMDSIZE)
// only rounds down correctly if SIMDSIZE is a power of two -- confirm.
// NOTE(review): the declaration line with the return type (listing line 4462) is not part
// of this listing.
4458  template< typename VT1 // Type of the left-hand side target vector
4459  , typename MT1 // Type of the left-hand side matrix operand
4460  , typename VT2 // Type of the right-hand side vector operand
4461  , typename ST2 > // Type of the scalar value
4463  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4464  {
4465  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4466 
4467  const size_t M( A.rows() );
4468  const size_t N( A.columns() );
4469 
4470  size_t i( 0UL );
4471 
      // Groups of eight rows.
4472  for( ; (i+8UL) <= M; i+=8UL )
4473  {
4474  const size_t jbegin( ( IsUpper<MT1>::value )
4475  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4476  :( 0UL ) );
4477  const size_t jend( ( IsLower<MT1>::value )
4478  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4479  :( N ) );
4480  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4481 
4482  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4483  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4484 
4485  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4486  size_t j( jbegin );
4487 
      // SIMD part: x is loaded once per step and reused for all eight rows.
4488  for( ; j<jpos; j+=SIMDSIZE ) {
4489  const SIMDType x1( x.load(j) );
4490  xmm1 += A.load(i ,j) * x1;
4491  xmm2 += A.load(i+1UL,j) * x1;
4492  xmm3 += A.load(i+2UL,j) * x1;
4493  xmm4 += A.load(i+3UL,j) * x1;
4494  xmm5 += A.load(i+4UL,j) * x1;
4495  xmm6 += A.load(i+5UL,j) * x1;
4496  xmm7 += A.load(i+6UL,j) * x1;
4497  xmm8 += A.load(i+7UL,j) * x1;
4498  }
4499 
      // Reduce every accumulator and apply the scaled result to y.
4500  y[i ] -= sum( xmm1 ) * scalar;
4501  y[i+1UL] -= sum( xmm2 ) * scalar;
4502  y[i+2UL] -= sum( xmm3 ) * scalar;
4503  y[i+3UL] -= sum( xmm4 ) * scalar;
4504  y[i+4UL] -= sum( xmm5 ) * scalar;
4505  y[i+5UL] -= sum( xmm6 ) * scalar;
4506  y[i+6UL] -= sum( xmm7 ) * scalar;
4507  y[i+7UL] -= sum( xmm8 ) * scalar;
4508 
      // Scalar part for the columns that did not fill a complete SIMD vector.
4509  for( ; remainder && j<jend; ++j ) {
4510  y[i ] -= A(i ,j) * x[j] * scalar;
4511  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4512  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4513  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4514  y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4515  y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4516  y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4517  y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4518  }
4519  }
4520 
      // Groups of four rows (same scheme as above).
4521  for( ; (i+4UL) <= M; i+=4UL )
4522  {
4523  const size_t jbegin( ( IsUpper<MT1>::value )
4524  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4525  :( 0UL ) );
4526  const size_t jend( ( IsLower<MT1>::value )
4527  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4528  :( N ) );
4529  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4530 
4531  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4532  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4533 
4534  SIMDType xmm1, xmm2, xmm3, xmm4;
4535  size_t j( jbegin );
4536 
4537  for( ; j<jpos; j+=SIMDSIZE ) {
4538  const SIMDType x1( x.load(j) );
4539  xmm1 += A.load(i ,j) * x1;
4540  xmm2 += A.load(i+1UL,j) * x1;
4541  xmm3 += A.load(i+2UL,j) * x1;
4542  xmm4 += A.load(i+3UL,j) * x1;
4543  }
4544 
4545  y[i ] -= sum( xmm1 ) * scalar;
4546  y[i+1UL] -= sum( xmm2 ) * scalar;
4547  y[i+2UL] -= sum( xmm3 ) * scalar;
4548  y[i+3UL] -= sum( xmm4 ) * scalar;
4549 
4550  for( ; remainder && j<jend; ++j ) {
4551  y[i ] -= A(i ,j) * x[j] * scalar;
4552  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4553  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4554  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4555  }
4556  }
4557 
      // Groups of three rows.
4558  for( ; (i+3UL) <= M; i+=3UL )
4559  {
4560  const size_t jbegin( ( IsUpper<MT1>::value )
4561  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4562  :( 0UL ) );
4563  const size_t jend( ( IsLower<MT1>::value )
4564  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
4565  :( N ) );
4566  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4567 
4568  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4569  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4570 
4571  SIMDType xmm1, xmm2, xmm3;
4572  size_t j( jbegin );
4573 
4574  for( ; j<jpos; j+=SIMDSIZE ) {
4575  const SIMDType x1( x.load(j) );
4576  xmm1 += A.load(i ,j) * x1;
4577  xmm2 += A.load(i+1UL,j) * x1;
4578  xmm3 += A.load(i+2UL,j) * x1;
4579  }
4580 
4581  y[i ] -= sum( xmm1 ) * scalar;
4582  y[i+1UL] -= sum( xmm2 ) * scalar;
4583  y[i+2UL] -= sum( xmm3 ) * scalar;
4584 
4585  for( ; remainder && j<jend; ++j ) {
4586  y[i ] -= A(i ,j) * x[j] * scalar;
4587  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4588  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4589  }
4590  }
4591 
      // Groups of two rows.
4592  for( ; (i+2UL) <= M; i+=2UL )
4593  {
4594  const size_t jbegin( ( IsUpper<MT1>::value )
4595  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4596  :( 0UL ) );
4597  const size_t jend( ( IsLower<MT1>::value )
4598  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4599  :( N ) );
4600  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4601 
4602  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4603  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4604 
4605  SIMDType xmm1, xmm2;
4606  size_t j( jbegin );
4607 
4608  for( ; j<jpos; j+=SIMDSIZE ) {
4609  const SIMDType x1( x.load(j) );
4610  xmm1 += A.load(i ,j) * x1;
4611  xmm2 += A.load(i+1UL,j) * x1;
4612  }
4613 
4614  y[i ] -= sum( xmm1 ) * scalar;
4615  y[i+1UL] -= sum( xmm2 ) * scalar;
4616 
4617  for( ; remainder && j<jend; ++j ) {
4618  y[i ] -= A(i ,j) * x[j] * scalar;
4619  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4620  }
4621  }
4622 
      // At most one row can be left at this point.
4623  if( i < M )
4624  {
4625  const size_t jbegin( ( IsUpper<MT1>::value )
4626  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4627  :( 0UL ) );
4628  const size_t jend( ( IsLower<MT1>::value )
4629  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4630  :( N ) );
4631  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4632 
4633  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4634  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4635 
4636  SIMDType xmm1;
4637  size_t j( jbegin );
4638 
4639  for( ; j<jpos; j+=SIMDSIZE ) {
4640  xmm1 += A.load(i,j) * x.load(j);
4641  }
4642 
4643  y[i] -= sum( xmm1 ) * scalar;
4644 
4645  for( ; remainder && j<jend; ++j ) {
4646  y[i] -= A(i,j) * x[j] * scalar;
4647  }
4648  }
4649  }
4650  //**********************************************************************************************
4651 
4652  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Large-matrix subtraction-assignment kernel, fallback overload.
// Simply forwards all arguments to selectDefaultSubAssignKernel().
//
// \param y      Target vector that is updated in place.
// \param A      Matrix operand.
// \param x      Vector operand.
// \param scalar Scaling factor.
//
// NOTE(review): the declaration line with the return type (listing line 4670) is not part
// of this listing; presumably it constrains this overload to the non-vectorizable case --
// confirm against the original header.
4666  template< typename VT1 // Type of the left-hand side target vector
4667  , typename MT1 // Type of the left-hand side matrix operand
4668  , typename VT2 // Type of the right-hand side vector operand
4669  , typename ST2 > // Type of the scalar value
4671  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4672  {
4673  selectDefaultSubAssignKernel( y, A, x, scalar );
4674  }
4675  //**********************************************************************************************
4676 
4677  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized large-matrix kernel for y -= A * x * scalar.
//
// Rows of A are handled in groups of 8, then 4, 2 and finally a single leftover row.
// Within a group the column range [jbegin, jend) is derived exactly as in the other
// vectorized kernels of this class (restricted for upper/lower matrix types, otherwise
// [0, N)), and jpos marks the end of the SIMD part.
//
// The column loop is unrolled in three stages: four SIMD vectors per step, then two, then
// one. In contrast to an accumulator-based scheme, every step reduces its partial products
// with sum(), multiplies by 'scalar' and subtracts the result from y right away. Columns
// in [jpos, jend) are handled by a final scalar loop, which is only active when
// 'remainder' is true (matrix type or vector type not padded).
//
// \param y      Target vector that is updated in place.
// \param A      Matrix operand.
// \param x      Vector operand.
// \param scalar Scaling factor.
//
// NOTE(review): the bit mask size_t(-SIMDSIZE) only rounds down correctly if SIMDSIZE is a
// power of two -- confirm.
// NOTE(review): the declaration line with the return type (listing line 4695) is not part
// of this listing.
4691  template< typename VT1 // Type of the left-hand side target vector
4692  , typename MT1 // Type of the left-hand side matrix operand
4693  , typename VT2 // Type of the right-hand side vector operand
4694  , typename ST2 > // Type of the scalar value
4696  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4697  {
4698  constexpr bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4699 
4700  const size_t M( A.rows() );
4701  const size_t N( A.columns() );
4702 
4703  size_t i( 0UL );
4704 
      // Groups of eight rows.
4705  for( ; (i+8UL) <= M; i+=8UL )
4706  {
4707  const size_t jbegin( ( IsUpper<MT1>::value )
4708  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4709  :( 0UL ) );
4710  const size_t jend( ( IsLower<MT1>::value )
4711  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4712  :( N ) );
4713  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4714 
4715  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4716  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4717 
4718  size_t j( jbegin );
4719 
      // Stage 1: four SIMD vectors of x per step.
4720  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4721  const size_t j1( j+SIMDSIZE );
4722  const size_t j2( j+SIMDSIZE*2UL );
4723  const size_t j3( j+SIMDSIZE*3UL );
4724  const SIMDType x1( x.load(j ) );
4725  const SIMDType x2( x.load(j1) );
4726  const SIMDType x3( x.load(j2) );
4727  const SIMDType x4( x.load(j3) );
4728  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4729  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4730  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4731  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4732  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4733  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4734  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4735  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4736  }
4737 
      // Stage 2: two SIMD vectors per step.
4738  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4739  const size_t j1( j+SIMDSIZE );
4740  const SIMDType x1( x.load(j ) );
4741  const SIMDType x2( x.load(j1) );
4742  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4743  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4744  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4745  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4746  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4747  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4748  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4749  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4750  }
4751 
      // Stage 3: one SIMD vector per step.
4752  for( ; j<jpos; j+=SIMDSIZE ) {
4753  const SIMDType x1( x.load(j) );
4754  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4755  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4756  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
4757  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
4758  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 ) * scalar;
4759  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 ) * scalar;
4760  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 ) * scalar;
4761  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 ) * scalar;
4762  }
4763 
      // Scalar part for the columns that did not fill a complete SIMD vector.
4764  for( ; remainder && j<jend; ++j ) {
4765  y[i ] -= A(i ,j) * x[j] * scalar;
4766  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4767  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4768  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4769  y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4770  y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4771  y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4772  y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4773  }
4774  }
4775 
      // Groups of four rows (same three-stage scheme).
4776  for( ; (i+4UL) <= M; i+=4UL )
4777  {
4778  const size_t jbegin( ( IsUpper<MT1>::value )
4779  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4780  :( 0UL ) );
4781  const size_t jend( ( IsLower<MT1>::value )
4782  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4783  :( N ) );
4784  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4785 
4786  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4787  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4788 
4789  size_t j( jbegin );
4790 
4791  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4792  const size_t j1( j+SIMDSIZE );
4793  const size_t j2( j+SIMDSIZE*2UL );
4794  const size_t j3( j+SIMDSIZE*3UL );
4795  const SIMDType x1( x.load(j ) );
4796  const SIMDType x2( x.load(j1) );
4797  const SIMDType x3( x.load(j2) );
4798  const SIMDType x4( x.load(j3) );
4799  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4800  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4801  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4802  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4803  }
4804 
4805  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4806  const size_t j1( j+SIMDSIZE );
4807  const SIMDType x1( x.load(j ) );
4808  const SIMDType x2( x.load(j1) );
4809  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4810  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4811  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4812  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4813  }
4814 
4815  for( ; j<jpos; j+=SIMDSIZE ) {
4816  const SIMDType x1( x.load(j) );
4817  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4818  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4819  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
4820  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
4821  }
4822 
4823  for( ; remainder && j<jend; ++j ) {
4824  y[i ] -= A(i ,j) * x[j] * scalar;
4825  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4826  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4827  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4828  }
4829  }
4830 
      // Groups of two rows.
4831  for( ; (i+2UL) <= M; i+=2UL )
4832  {
4833  const size_t jbegin( ( IsUpper<MT1>::value )
4834  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4835  :( 0UL ) );
4836  const size_t jend( ( IsLower<MT1>::value )
4837  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4838  :( N ) );
4839  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4840 
4841  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4842  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4843 
4844  size_t j( jbegin );
4845 
4846  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4847  const size_t j1( j+SIMDSIZE );
4848  const size_t j2( j+SIMDSIZE*2UL );
4849  const size_t j3( j+SIMDSIZE*3UL );
4850  const SIMDType x1( x.load(j ) );
4851  const SIMDType x2( x.load(j1) );
4852  const SIMDType x3( x.load(j2) );
4853  const SIMDType x4( x.load(j3) );
4854  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4855  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4856  }
4857 
4858  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4859  const size_t j1( j+SIMDSIZE );
4860  const SIMDType x1( x.load(j ) );
4861  const SIMDType x2( x.load(j1) );
4862  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4863  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4864  }
4865 
4866  for( ; j<jpos; j+=SIMDSIZE ) {
4867  const SIMDType x1( x.load(j) );
4868  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4869  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4870  }
4871 
4872  for( ; remainder && j<jend; ++j ) {
4873  y[i ] -= A(i ,j) * x[j] * scalar;
4874  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4875  }
4876  }
4877 
      // At most one row can be left at this point.
4878  if( i < M )
4879  {
4880  const size_t jbegin( ( IsUpper<MT1>::value )
4881  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4882  :( 0UL ) );
4883  const size_t jend( ( IsLower<MT1>::value )
4884  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4885  :( N ) );
4886  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4887 
4888  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4889  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4890 
4891  size_t j( jbegin );
4892 
4893  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4894  const size_t j1( j+SIMDSIZE );
4895  const size_t j2( j+SIMDSIZE*2UL );
4896  const size_t j3( j+SIMDSIZE*3UL );
4897  const SIMDType x1( x.load(j ) );
4898  const SIMDType x2( x.load(j1) );
4899  const SIMDType x3( x.load(j2) );
4900  const SIMDType x4( x.load(j3) );
4901  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4902  }
4903 
4904  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4905  const size_t j1( j+SIMDSIZE );
4906  const SIMDType x1( x.load(j ) );
4907  const SIMDType x2( x.load(j1) );
4908  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4909  }
4910 
4911  for( ; j<jpos; j+=SIMDSIZE ) {
4912  const SIMDType x1( x.load(j) );
4913  y[i] -= sum( A.load(i,j) * x1 ) * scalar;
4914  }
4915 
4916  for( ; remainder && j<jend; ++j ) {
4917  y[i] -= A(i,j) * x[j] * scalar;
4918  }
4919  }
4920  }
4921  //**********************************************************************************************
4922 
4923  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback overload of the BLAS-based subtraction-assignment kernel.
// It performs no BLAS call itself: the request is forwarded unchanged to
// selectLargeSubAssignKernel().
//
// \param y      Target vector that is updated in place.
// \param A      Matrix operand.
// \param x      Vector operand.
// \param scalar Scaling factor, passed through unchanged.
//
// NOTE(review): the declaration line carrying the return type (listing line 4941) is not
// part of this listing; presumably it holds the constraint that selects this overload when
// the BLAS kernel is not applicable -- confirm against the original header.
4937  template< typename VT1 // Type of the left-hand side target vector
4938  , typename MT1 // Type of the left-hand side matrix operand
4939  , typename VT2 // Type of the right-hand side vector operand
4940  , typename ST2 > // Type of the scalar value
4942  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4943  {
4944  selectLargeSubAssignKernel( y, A, x, scalar );
4945  }
4946  //**********************************************************************************************
4947 
4948  //**BLAS-based subtraction assignment to dense vectors******************************************
4949 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4950 
// BLAS-based subtraction-assignment kernel; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled (see the surrounding #if).
//
// Two paths, chosen at compile time via IsTriangular<MT1>:
//  - triangular A: the scaled vector (scalar * x) is evaluated serially into a temporary,
//    trmv() is applied to that temporary with the matching lower/upper flag, and the
//    result is subtracted from y;
//  - otherwise: a single gemv() call with the negated factor ET(-scalar) and ET(1), where
//    ET is the element type of the target vector (presumably y = -scalar*A*x + 1*y --
//    confirm against the gemv wrapper documentation).
//
// \param y      Target vector that is updated in place.
// \param A      Matrix operand.
// \param x      Vector operand.
// \param scalar Scaling factor.
//
// NOTE(review): the declaration line with the return type (listing line 4967) is not part
// of this listing.
4963  template< typename VT1 // Type of the left-hand side target vector
4964  , typename MT1 // Type of the left-hand side matrix operand
4965  , typename VT2 // Type of the right-hand side vector operand
4966  , typename ST2 > // Type of the scalar value
4968  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4969  {
4970  using ET = ElementType_<VT1>;
4971 
4972  if( IsTriangular<MT1>::value ) {
      // trmv() works on 'tmp' in place, hence the scaling is folded into the temporary first.
4973  ResultType_<VT1> tmp( serial( scalar * x ) );
4974  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4975  subAssign( y, tmp );
4976  }
4977  else {
      // The subtraction is expressed through the sign of the first factor.
4978  gemv( y, A, x, ET(-scalar), ET(1) );
4979  }
4980  }
4981 #endif
4982  //**********************************************************************************************
4983 
4984  //**Subtraction assignment to sparse vectors****************************************************
4985  // No special implementation for the subtraction assignment to sparse vectors.
4986  //**********************************************************************************************
4987 
4988  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of a scaled expression (DVecScalarMultExpr) to a dense vector.
//   lhs : target dense vector
//   rhs : right-hand side scaled expression
// There is no fused kernel here: rhs is first evaluated serially into a temporary of type
// ResultType, and that temporary is then passed to multAssign() together with ~lhs.
5000  template< typename VT1 > // Type of the target dense vector
5001  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5002  {
5004 
5008 
// Internal sanity check: both sides must have the same size.
5009  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5010 
5011  const ResultType tmp( serial( rhs ) );
5012  multAssign( ~lhs, tmp );
5013  }
5014  //**********************************************************************************************
5015 
5016  //**Multiplication assignment to sparse vectors*************************************************
5017  // No special implementation for the multiplication assignment to sparse vectors.
5018  //**********************************************************************************************
5019 
5020  //**Division assignment to dense vectors********************************************************
// Division assignment of a scaled expression (DVecScalarMultExpr) to a dense vector.
//   lhs : target dense vector
//   rhs : right-hand side scaled expression
// rhs is first evaluated serially into a temporary of type ResultType, and that temporary
// is then passed to divAssign() together with ~lhs.
5032  template< typename VT1 > // Type of the target dense vector
5033  friend inline void divAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5034  {
5036 
5040 
// Internal sanity check: both sides must have the same size.
5041  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5042 
5043  const ResultType tmp( serial( rhs ) );
5044  divAssign( ~lhs, tmp );
5045  }
5046  //**********************************************************************************************
5047 
5048  //**Division assignment to sparse vectors*******************************************************
5049  // No special implementation for the division assignment to sparse vectors.
5050  //**********************************************************************************************
5051 
5052  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of a scaled matrix/vector product to a dense vector; enabled only when
// UseSMPAssign<VT1> holds.
// NOTE(review): the declarator line (original line 5068, function name and parameter list)
// is missing from this listing; the body uses the parameters `lhs` (target) and `rhs`
// (scaled expression) - confirm the exact signature against the original header.
5066  template< typename VT1 > // Type of the target dense vector
5067  friend inline EnableIf_< UseSMPAssign<VT1> >
5069  {
5071 
5072  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5073 
// Matrix and vector operands of the product stored inside the scaled expression.
5074  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5075  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5076 
// A matrix without rows leaves the target untouched; a matrix without columns resets the
// target instead of running the product.
5077  if( left.rows() == 0UL ) {
5078  return;
5079  }
5080  else if( left.columns() == 0UL ) {
5081  reset( ~lhs );
5082  return;
5083  }
5084 
5085  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5086  RT x( right ); // Evaluation of the right-hand side dense vector operand
5087 
5088  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5089  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5090  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5091  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5092 
// The scaled product is re-formed from the evaluated operands and passed to smpAssign().
5093  smpAssign( ~lhs, A * x * rhs.scalar_ );
5094  }
5095  //**********************************************************************************************
5096 
5097  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of the scaled expression to a sparse vector; enabled only when
// UseSMPAssign<VT1> holds.
// NOTE(review): the declarator line (original line 5113, function name and parameter list)
// is missing from this listing; the body uses the parameters `lhs` and `rhs`.
// rhs is evaluated into a temporary of type ResultType (without a serial() wrapper), and
// that temporary is passed to smpAssign() together with ~lhs.
5111  template< typename VT1 > // Type of the target sparse vector
5112  friend inline EnableIf_< UseSMPAssign<VT1> >
5114  {
5116 
5120 
5121  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5122 
5123  const ResultType tmp( rhs );
5124  smpAssign( ~lhs, tmp );
5125  }
5126  //**********************************************************************************************
5127 
5128  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of a scaled matrix/vector product to a dense vector; enabled only
// when UseSMPAssign<VT1> holds.
// NOTE(review): the declarator line (original line 5144, function name and parameter list)
// is missing from this listing; the body uses the parameters `lhs` (target) and `rhs`
// (scaled expression) - confirm the exact signature against the original header.
5142  template< typename VT1 > // Type of the target dense vector
5143  friend inline EnableIf_< UseSMPAssign<VT1> >
5145  {
5147 
5148  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5149 
// Matrix and vector operands of the product stored inside the scaled expression.
5150  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5151  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5152 
// With zero rows or zero columns the function returns without modifying the target.
5153  if( left.rows() == 0UL || left.columns() == 0UL ) {
5154  return;
5155  }
5156 
5157  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5158  RT x( right ); // Evaluation of the right-hand side dense vector operand
5159 
5160  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5161  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5162  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5163  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5164 
// The scaled product is re-formed from the evaluated operands and passed to smpAddAssign().
5165  smpAddAssign( ~lhs, A * x * rhs.scalar_ );
5166  }
5167  //**********************************************************************************************
5168 
5169  //**SMP addition assignment to sparse vectors***************************************************
5170  // No special implementation for the SMP addition assignment to sparse vectors.
5171  //**********************************************************************************************
5172 
5173  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of a scaled matrix/vector product to a dense vector; enabled
// only when UseSMPAssign<VT1> holds.
// NOTE(review): the declarator line (original line 5189, function name and parameter list)
// is missing from this listing; the body uses the parameters `lhs` (target) and `rhs`
// (scaled expression) - confirm the exact signature against the original header.
5187  template< typename VT1 > // Type of the target dense vector
5188  friend inline EnableIf_< UseSMPAssign<VT1> >
5190  {
5192 
5193  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5194 
// Matrix and vector operands of the product stored inside the scaled expression.
5195  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5196  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5197 
// With zero rows or zero columns the function returns without modifying the target.
5198  if( left.rows() == 0UL || left.columns() == 0UL ) {
5199  return;
5200  }
5201 
5202  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5203  RT x( right ); // Evaluation of the right-hand side dense vector operand
5204 
5205  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5206  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5207  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5208  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5209 
// The scaled product is re-formed from the evaluated operands and passed to smpSubAssign().
5210  smpSubAssign( ~lhs, A * x * rhs.scalar_ );
5211  }
5212  //**********************************************************************************************
5213 
5214  //**SMP subtraction assignment to sparse vectors************************************************
5215  // No special implementation for the SMP subtraction assignment to sparse vectors.
5216  //**********************************************************************************************
5217 
5218  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of the scaled expression to a dense vector; enabled only
// when UseSMPAssign<VT1> holds.
// NOTE(review): the declarator line (original line 5234, function name and parameter list)
// is missing from this listing; the body uses the parameters `lhs` and `rhs`.
// rhs is evaluated into a temporary of type ResultType, and that temporary is passed to
// smpMultAssign() together with ~lhs.
5232  template< typename VT1 > // Type of the target dense vector
5233  friend inline EnableIf_< UseSMPAssign<VT1> >
5235  {
5237 
5241 
5242  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5243 
5244  const ResultType tmp( rhs );
5245  smpMultAssign( ~lhs, tmp );
5246  }
5247  //**********************************************************************************************
5248 
5249  //**SMP multiplication assignment to sparse vectors*********************************************
5250  // No special implementation for the SMP multiplication assignment to sparse vectors.
5251  //**********************************************************************************************
5252 
5253  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of the scaled expression to a dense vector; enabled only when
// UseSMPAssign<VT1> holds.
// NOTE(review): the declarator line (original line 5269, function name and parameter list)
// is missing from this listing; the body uses the parameters `lhs` and `rhs`.
// rhs is evaluated into a temporary of type ResultType, and that temporary is passed to
// smpDivAssign() together with ~lhs.
5267  template< typename VT1 > // Type of the target dense vector
5268  friend inline EnableIf_< UseSMPAssign<VT1> >
5270  {
5272 
5276 
5277  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5278 
5279  const ResultType tmp( rhs );
5280  smpDivAssign( ~lhs, tmp );
5281  }
5282  //**********************************************************************************************
5283 
5284  //**SMP division assignment to sparse vectors***************************************************
5285  // No special implementation for the SMP division assignment to sparse vectors.
5286  //**********************************************************************************************
5287 
5288  //**Compile time checks*************************************************************************
5297  //**********************************************************************************************
5298 };
5300 //*************************************************************************************************
5301 
5302 
5303 
5304 
5305 //=================================================================================================
5306 //
5307 // GLOBAL BINARY ARITHMETIC OPERATORS
5308 //
5309 //=================================================================================================
5310 
5311 //*************************************************************************************************
// Multiplication operator for a dense matrix (second DenseMatrix template argument `false`)
// and a dense vector (second DenseVector template argument `false`).
//   mat : left-hand side dense matrix
//   vec : right-hand side dense vector
// Returns a const DMatDVecMultExpr<MT,VT> expression object built from both operands; the
// product itself is not computed in this function.
// Raises an invalid-argument exception via BLAZE_THROW_INVALID_ARGUMENT when the number of
// matrix columns differs from the vector size.
5341 template< typename MT // Type of the left-hand side dense matrix
5342  , typename VT > // Type of the right-hand side dense vector
5343 inline decltype(auto)
5344  operator*( const DenseMatrix<MT,false>& mat, const DenseVector<VT,false>& vec )
5345 {
5347 
5349 
// Run-time dimension check: columns of the matrix must match the size of the vector.
5350  if( (~mat).columns() != (~vec).size() ) {
5351  BLAZE_THROW_INVALID_ARGUMENT( "Matrix and vector sizes do not match" );
5352  }
5353 
5354  using ReturnType = const DMatDVecMultExpr<MT,VT>;
5355  return ReturnType( ~mat, ~vec );
5356 }
5357 //*************************************************************************************************
5358 
5359 
5360 
5361 
5362 //=================================================================================================
5363 //
5364 // GLOBAL RESTRUCTURING BINARY ARITHMETIC OPERATORS
5365 //
5366 //=================================================================================================
5367 
5368 //*************************************************************************************************
// Restructuring multiplication operator for a matrix/matrix multiplication expression and a
// dense vector.
//   mat : left-hand side matrix/matrix multiplication expression
//   vec : right-hand side dense vector
// The expression (A*B)*v is re-associated as A*(B*v): the right operand of `mat` is first
// multiplied with `vec`, and the left operand of `mat` is then multiplied with that result.
// NOTE(review): presumably this avoids evaluating the matrix/matrix product - confirm.
5382 template< typename MT // Matrix base type of the left-hand side expression
5383  , typename VT > // Type of the right-hand side dense vector
5384 inline decltype(auto)
5385  operator*( const MatMatMultExpr<MT>& mat, const DenseVector<VT,false>& vec )
5386 {
5388 
5389  return (~mat).leftOperand() * ( (~mat).rightOperand() * vec );
5390 }
5392 //*************************************************************************************************
5393 
5394 
5395 
5396 
5397 //=================================================================================================
5398 //
5399 // SIZE SPECIALIZATIONS
5400 //
5401 //=================================================================================================
5402 
5403 //*************************************************************************************************
// Size specialization for DMatDVecMultExpr: the trait derives from Rows<MT>, i.e. the size
// of the resulting vector is taken from the row count of the matrix type MT.
5405 template< typename MT, typename VT >
5406 struct Size< DMatDVecMultExpr<MT,VT> >
5407  : public Rows<MT>
5408 {};
5410 //*************************************************************************************************
5411 
5412 
5413 
5414 
5415 //=================================================================================================
5416 //
5417 // ISALIGNED SPECIALIZATIONS
5418 //
5419 //=================================================================================================
5420 
5421 //*************************************************************************************************
// IsAligned specialization for DMatDVecMultExpr: the trait is true only if both the matrix
// type MT and the vector type VT are themselves reported as aligned by IsAligned.
5423 template< typename MT, typename VT >
5424 struct IsAligned< DMatDVecMultExpr<MT,VT> >
5425  : public BoolConstant< And< IsAligned<MT>, IsAligned<VT> >::value >
5426 {};
5428 //*************************************************************************************************
5429 
5430 } // namespace blaze
5431 
5432 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
If_< IsExpression< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:212
Header file for the Rows type trait.
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:221
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:261
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:218
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:129
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Expression object for dense matrix-dense vector multiplications.The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:121
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
If_< IsExpression< VT >, const VT, const VT &> RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:215
Header file for the IsSame and IsStrictlySame type traits.
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:127
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDVecMultExpr.h:207
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:132
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:521
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:338
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:350
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:204
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:370
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Row< MT > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:124
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:206
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:130
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:306
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:340
Base class for all matrix/vector multiplication expression templates.The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:67
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:88
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:109
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:326
decltype(auto) operator*(const DenseMatrix< MT1, false > &lhs, const DenseMatrix< MT2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A = B \cdot C$ ).
Definition: DMatDMatMultExpr.h:8893
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:382
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:209
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:593
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
BLAZE_ALWAYS_INLINE ValueType_< T > sum(const SIMDi8< T > &a) noexcept
Returns the sum of all elements in the 8-bit integral SIMD vector.
Definition: Reduction.h:65
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:360
Header file for run time assertion macros.
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:131
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:383
DMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:247
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:128
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:205
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:106
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of a vector.The Size type trait evaluates the size of the given v...
Definition: Size.h:74
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:75
Header file for the IsComplex type trait.
Constraint on the data type.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:316
Header file for the MatVecMultExpr base class.
Constraint on the data type.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:293
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:208
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the function trace functionality.