DMatDVecMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/Exception.h>
58 #include <blaze/math/shims/Reset.h>
60 #include <blaze/math/SIMD.h>
84 #include <blaze/system/BLAS.h>
87 #include <blaze/util/Assert.h>
88 #include <blaze/util/Complex.h>
91 #include <blaze/util/DisableIf.h>
92 #include <blaze/util/EnableIf.h>
95 #include <blaze/util/mpl/And.h>
96 #include <blaze/util/mpl/If.h>
97 #include <blaze/util/Types.h>
106 
107 
108 namespace blaze {
109 
110 //=================================================================================================
111 //
112 // CLASS DMATDVECMULTEXPR
113 //
114 //=================================================================================================
115 
116 //*************************************************************************************************
123 template< typename MT // Type of the left-hand side dense matrix
124  , typename VT > // Type of the right-hand side dense vector
125 class DMatDVecMultExpr : public DenseVector< DMatDVecMultExpr<MT,VT>, false >
126  , private MatVecMultExpr
127  , private Computation
128 {
129  private:
130  //**Type definitions****************************************************************************
137  //**********************************************************************************************
138 
139  //**********************************************************************************************
141  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
143  //**********************************************************************************************
144 
145  //**********************************************************************************************
147  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
148  //**********************************************************************************************
149 
150  //**********************************************************************************************
152 
156  template< typename T1 >
157  struct UseSMPAssign {
158  enum : bool { value = ( evaluateMatrix || evaluateVector ) };
159  };
161  //**********************************************************************************************
162 
163  //**********************************************************************************************
165 
168  template< typename T1, typename T2, typename T3 >
169  struct UseBlasKernel {
175  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
180  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
181  };
183  //**********************************************************************************************
184 
185  //**********************************************************************************************
187 
191  template< typename T1, typename T2, typename T3 >
192  struct UseVectorizedDefaultKernel {
193  enum : bool { value = useOptimizedKernels &&
195  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
198  , ElementType_<T3> >::value &&
201  };
203  //**********************************************************************************************
204 
205  public:
206  //**Type definitions****************************************************************************
212  typedef const ElementType ReturnType;
213  typedef const ResultType CompositeType;
214 
216  typedef If_< IsExpression<MT>, const MT, const MT& > LeftOperand;
217 
219  typedef If_< IsExpression<VT>, const VT, const VT& > RightOperand;
220 
223 
226  //**********************************************************************************************
227 
228  //**Compilation flags***************************************************************************
230  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
231  MT::simdEnabled && VT::simdEnabled &&
234 
236  enum : bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
237  !evaluateVector && VT::smpAssignable };
238  //**********************************************************************************************
239 
240  //**SIMD properties*****************************************************************************
242  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
243  //**********************************************************************************************
244 
245  //**Constructor*********************************************************************************
251  explicit inline DMatDVecMultExpr( const MT& mat, const VT& vec ) noexcept
252  : mat_( mat ) // Left-hand side dense matrix of the multiplication expression
253  , vec_( vec ) // Right-hand side dense vector of the multiplication expression
254  {
255  BLAZE_INTERNAL_ASSERT( mat_.columns() == vec_.size(), "Invalid matrix and vector sizes" );
256  }
257  //**********************************************************************************************
258 
259  //**Subscript operator**************************************************************************
265  inline ReturnType operator[]( size_t index ) const {
266  BLAZE_INTERNAL_ASSERT( index < mat_.rows(), "Invalid vector access index" );
267 
269  {
270  return mat_(index,index) * vec_[index];
271  }
272  else if( IsLower<MT>::value && ( index + 8UL < mat_.rows() ) )
273  {
274  const size_t n( IsStrictlyLower<MT>::value ? index : index+1UL );
275  return subvector( row( mat_, index ), 0UL, n ) * subvector( vec_, 0UL, n );
276  }
277  else if( IsUpper<MT>::value && ( index > 8UL ) )
278  {
279  const size_t begin( IsStrictlyUpper<MT>::value ? index+1UL : index );
280  const size_t n ( mat_.columns() - begin );
281  return subvector( row( mat_, index ), begin, n ) * subvector( vec_, begin, n );
282  }
283  else
284  {
285  return row( mat_, index ) * vec_;
286  }
287  }
288  //**********************************************************************************************
289 
290  //**At function*********************************************************************************
297  inline ReturnType at( size_t index ) const {
298  if( index >= mat_.rows() ) {
299  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
300  }
301  return (*this)[index];
302  }
303  //**********************************************************************************************
304 
305  //**Size function*******************************************************************************
310  inline size_t size() const noexcept {
311  return mat_.rows();
312  }
313  //**********************************************************************************************
314 
315  //**Left operand access*************************************************************************
320  inline LeftOperand leftOperand() const noexcept{
321  return mat_;
322  }
323  //**********************************************************************************************
324 
325  //**Right operand access************************************************************************
330  inline RightOperand rightOperand() const noexcept {
331  return vec_;
332  }
333  //**********************************************************************************************
334 
335  //**********************************************************************************************
341  template< typename T >
342  inline bool canAlias( const T* alias ) const noexcept {
343  return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
344  }
345  //**********************************************************************************************
346 
347  //**********************************************************************************************
353  template< typename T >
354  inline bool isAliased( const T* alias ) const noexcept {
355  return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
356  }
357  //**********************************************************************************************
358 
359  //**********************************************************************************************
364  inline bool isAligned() const noexcept {
365  return mat_.isAligned() && vec_.isAligned();
366  }
367  //**********************************************************************************************
368 
369  //**********************************************************************************************
374  inline bool canSMPAssign() const noexcept {
375  return ( !BLAZE_BLAS_IS_PARALLEL ||
376  ( IsComputation<MT>::value && !evaluateMatrix ) ||
377  ( mat_.rows() * mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
378  ( size() > SMP_DMATDVECMULT_THRESHOLD );
379  }
380  //**********************************************************************************************
381 
382  private:
383  //**Member variables****************************************************************************
384  LeftOperand mat_;
385  RightOperand vec_;
386  //**********************************************************************************************
387 
388  //**Assignment to dense vectors*****************************************************************
401  template< typename VT1 > // Type of the target dense vector
402  friend inline void assign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
403  {
405 
406  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
407 
408  if( rhs.mat_.rows() == 0UL ) {
409  return;
410  }
411  else if( rhs.mat_.columns() == 0UL ) {
412  reset( ~lhs );
413  return;
414  }
415 
416  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
417  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
418 
419  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
420  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
421  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
422  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
423 
424  DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
425  }
427  //**********************************************************************************************
428 
429  //**Assignment to dense vectors (kernel selection)**********************************************
440  template< typename VT1 // Type of the left-hand side target vector
441  , typename MT1 // Type of the left-hand side matrix operand
442  , typename VT2 > // Type of the right-hand side vector operand
443  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
444  {
445  if( ( IsDiagonal<MT1>::value ) ||
446  ( IsComputation<MT>::value && !evaluateMatrix ) ||
447  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
448  selectSmallAssignKernel( y, A, x );
449  else
450  selectBlasAssignKernel( y, A, x );
451  }
453  //**********************************************************************************************
454 
455  //**Default assignment to dense vectors*********************************************************
/*!\brief Default (non-vectorized) assignment of a dense matrix-dense vector multiplication.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
//
// The product expression \a A * \a x is passed to the assign() member of the target vector.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
{
   y.assign( A * x );
}
477  //**********************************************************************************************
478 
479  //**Default assignment to dense vectors (small matrices)****************************************
493  template< typename VT1 // Type of the left-hand side target vector
494  , typename MT1 // Type of the left-hand side matrix operand
495  , typename VT2 > // Type of the right-hand side vector operand
496  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
497  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
498  {
499  selectDefaultAssignKernel( y, A, x );
500  }
502  //**********************************************************************************************
503 
504  //**Vectorized default assignment to dense vectors (small matrices)*****************************
518  template< typename VT1 // Type of the left-hand side target vector
519  , typename MT1 // Type of the left-hand side matrix operand
520  , typename VT2 > // Type of the right-hand side vector operand
521  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
522  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
523  {
524  const size_t M( A.rows() );
525  const size_t N( A.columns() );
526 
527  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
528 
529  size_t i( 0UL );
530 
531  for( ; (i+8UL) <= M; i+=8UL )
532  {
533  const size_t jbegin( ( IsUpper<MT1>::value )
534  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
535  :( 0UL ) );
536  const size_t jend( ( IsLower<MT1>::value )
537  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
538  :( N ) );
539  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
540 
541  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
542  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
543 
544  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
545  size_t j( jbegin );
546 
547  for( ; j<jpos; j+=SIMDSIZE ) {
548  const SIMDType x1( x.load(j) );
549  xmm1 = xmm1 + A.load(i ,j) * x1;
550  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
551  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
552  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
553  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
554  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
555  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
556  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
557  }
558 
559  y[i ] = sum( xmm1 );
560  y[i+1UL] = sum( xmm2 );
561  y[i+2UL] = sum( xmm3 );
562  y[i+3UL] = sum( xmm4 );
563  y[i+4UL] = sum( xmm5 );
564  y[i+5UL] = sum( xmm6 );
565  y[i+6UL] = sum( xmm7 );
566  y[i+7UL] = sum( xmm8 );
567 
568  for( ; remainder && j<jend; ++j ) {
569  y[i ] += A(i ,j) * x[j];
570  y[i+1UL] += A(i+1UL,j) * x[j];
571  y[i+2UL] += A(i+2UL,j) * x[j];
572  y[i+3UL] += A(i+3UL,j) * x[j];
573  y[i+4UL] += A(i+4UL,j) * x[j];
574  y[i+5UL] += A(i+5UL,j) * x[j];
575  y[i+6UL] += A(i+6UL,j) * x[j];
576  y[i+7UL] += A(i+7UL,j) * x[j];
577  }
578  }
579 
580  for( ; (i+4UL) <= M; i+=4UL )
581  {
582  const size_t jbegin( ( IsUpper<MT1>::value )
583  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
584  :( 0UL ) );
585  const size_t jend( ( IsLower<MT1>::value )
586  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
587  :( N ) );
588  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
589 
590  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
591  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
592 
593  SIMDType xmm1, xmm2, xmm3, xmm4;
594  size_t j( jbegin );
595 
596  for( ; j<jpos; j+=SIMDSIZE ) {
597  const SIMDType x1( x.load(j) );
598  xmm1 = xmm1 + A.load(i ,j) * x1;
599  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
600  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
601  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
602  }
603 
604  y[i ] = sum( xmm1 );
605  y[i+1UL] = sum( xmm2 );
606  y[i+2UL] = sum( xmm3 );
607  y[i+3UL] = sum( xmm4 );
608 
609  for( ; remainder && j<jend; ++j ) {
610  y[i ] += A(i ,j) * x[j];
611  y[i+1UL] += A(i+1UL,j) * x[j];
612  y[i+2UL] += A(i+2UL,j) * x[j];
613  y[i+3UL] += A(i+3UL,j) * x[j];
614  }
615  }
616 
617  for( ; (i+3UL) <= M; i+=3UL )
618  {
619  const size_t jbegin( ( IsUpper<MT1>::value )
620  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
621  :( 0UL ) );
622  const size_t jend( ( IsLower<MT1>::value )
623  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
624  :( N ) );
625  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
626 
627  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
628  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
629 
630  SIMDType xmm1, xmm2, xmm3;
631  size_t j( jbegin );
632 
633  for( ; j<jpos; j+=SIMDSIZE ) {
634  const SIMDType x1( x.load(j) );
635  xmm1 = xmm1 + A.load(i ,j) * x1;
636  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
637  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
638  }
639 
640  y[i ] = sum( xmm1 );
641  y[i+1UL] = sum( xmm2 );
642  y[i+2UL] = sum( xmm3 );
643 
644  for( ; remainder && j<jend; ++j ) {
645  y[i ] += A(i ,j) * x[j];
646  y[i+1UL] += A(i+1UL,j) * x[j];
647  y[i+2UL] += A(i+2UL,j) * x[j];
648  }
649  }
650 
651  for( ; (i+2UL) <= M; i+=2UL )
652  {
653  const size_t jbegin( ( IsUpper<MT1>::value )
654  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
655  :( 0UL ) );
656  const size_t jend( ( IsLower<MT1>::value )
657  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
658  :( N ) );
659  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
660 
661  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
662  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
663 
664  SIMDType xmm1, xmm2;
665  size_t j( jbegin );
666 
667  for( ; j<jpos; j+=SIMDSIZE ) {
668  const SIMDType x1( x.load(j) );
669  xmm1 = xmm1 + A.load(i ,j) * x1;
670  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
671  }
672 
673  y[i ] = sum( xmm1 );
674  y[i+1UL] = sum( xmm2 );
675 
676  for( ; remainder && j<jend; ++j ) {
677  y[i ] += A(i ,j) * x[j];
678  y[i+1UL] += A(i+1UL,j) * x[j];
679  }
680  }
681 
682  if( i < M )
683  {
684  const size_t jbegin( ( IsUpper<MT1>::value )
685  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
686  :( 0UL ) );
687  const size_t jend( ( IsLower<MT1>::value )
688  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
689  :( N ) );
690  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
691 
692  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
693  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
694 
695  SIMDType xmm1;
696  size_t j( jbegin );
697 
698  for( ; j<jpos; j+=SIMDSIZE ) {
699  xmm1 = xmm1 + A.load(i,j) * x.load(j);
700  }
701 
702  y[i] = sum( xmm1 );
703 
704  for( ; remainder && j<jend; ++j ) {
705  y[i] += A(i,j) * x[j];
706  }
707  }
708  }
710  //**********************************************************************************************
711 
712  //**Default assignment to dense vectors (large matrices)****************************************
726  template< typename VT1 // Type of the left-hand side target vector
727  , typename MT1 // Type of the left-hand side matrix operand
728  , typename VT2 > // Type of the right-hand side vector operand
729  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
730  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
731  {
732  selectDefaultAssignKernel( y, A, x );
733  }
735  //**********************************************************************************************
736 
737  //**Vectorized default assignment to dense vectors (large matrices)*****************************
751  template< typename VT1 // Type of the left-hand side target vector
752  , typename MT1 // Type of the left-hand side matrix operand
753  , typename VT2 > // Type of the right-hand side vector operand
754  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
755  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
756  {
757  const size_t M( A.rows() );
758  const size_t N( A.columns() );
759 
760  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
761 
762  reset( y );
763 
764  size_t i( 0UL );
765 
766  for( ; (i+8UL) <= M; i+=8UL )
767  {
768  const size_t jbegin( ( IsUpper<MT1>::value )
769  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
770  :( 0UL ) );
771  const size_t jend( ( IsLower<MT1>::value )
772  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
773  :( N ) );
774  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
775 
776  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
777  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
778 
779  size_t j( jbegin );
780 
781  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
782  const size_t j1( j+SIMDSIZE );
783  const size_t j2( j+SIMDSIZE*2UL );
784  const size_t j3( j+SIMDSIZE*3UL );
785  const SIMDType x1( x.load(j ) );
786  const SIMDType x2( x.load(j1) );
787  const SIMDType x3( x.load(j2) );
788  const SIMDType x4( x.load(j3) );
789  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
790  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
791  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
792  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
793  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
794  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
795  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
796  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
797  }
798 
799  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
800  const size_t j1( j+SIMDSIZE );
801  const SIMDType x1( x.load(j ) );
802  const SIMDType x2( x.load(j1) );
803  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
804  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
805  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
806  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
807  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
808  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
809  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
810  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
811  }
812 
813  for( ; j<jpos; j+=SIMDSIZE ) {
814  const SIMDType x1( x.load(j) );
815  y[i ] += sum( A.load(i ,j) * x1 );
816  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
817  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
818  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
819  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
820  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
821  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
822  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
823  }
824 
825  for( ; remainder && j<jend; ++j ) {
826  y[i ] += A(i ,j) * x[j];
827  y[i+1UL] += A(i+1UL,j) * x[j];
828  y[i+2UL] += A(i+2UL,j) * x[j];
829  y[i+3UL] += A(i+3UL,j) * x[j];
830  y[i+4UL] += A(i+4UL,j) * x[j];
831  y[i+5UL] += A(i+5UL,j) * x[j];
832  y[i+6UL] += A(i+6UL,j) * x[j];
833  y[i+7UL] += A(i+7UL,j) * x[j];
834  }
835  }
836 
837  for( ; (i+4UL) <= M; i+=4UL )
838  {
839  const size_t jbegin( ( IsUpper<MT1>::value )
840  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
841  :( 0UL ) );
842  const size_t jend( ( IsLower<MT1>::value )
843  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
844  :( N ) );
845  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
846 
847  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
848  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
849 
850  size_t j( jbegin );
851 
852  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
853  const size_t j1( j+SIMDSIZE );
854  const size_t j2( j+SIMDSIZE*2UL );
855  const size_t j3( j+SIMDSIZE*3UL );
856  const SIMDType x1( x.load(j ) );
857  const SIMDType x2( x.load(j1) );
858  const SIMDType x3( x.load(j2) );
859  const SIMDType x4( x.load(j3) );
860  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
861  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
862  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
863  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
864  }
865 
866  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
867  const size_t j1( j+SIMDSIZE );
868  const SIMDType x1( x.load(j ) );
869  const SIMDType x2( x.load(j1) );
870  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
871  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
872  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
873  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
874  }
875 
876  for( ; j<jpos; j+=SIMDSIZE ) {
877  const SIMDType x1( x.load(j) );
878  y[i ] += sum( A.load(i ,j) * x1 );
879  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
880  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
881  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
882  }
883 
884  for( ; remainder && j<jend; ++j ) {
885  y[i ] += A(i ,j) * x[j];
886  y[i+1UL] += A(i+1UL,j) * x[j];
887  y[i+2UL] += A(i+2UL,j) * x[j];
888  y[i+3UL] += A(i+3UL,j) * x[j];
889  }
890  }
891 
892  for( ; (i+2UL) <= M; i+=2UL )
893  {
894  const size_t jbegin( ( IsUpper<MT1>::value )
895  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
896  :( 0UL ) );
897  const size_t jend( ( IsLower<MT1>::value )
898  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
899  :( N ) );
900  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
901 
902  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
903  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
904 
905  size_t j( jbegin );
906 
907  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
908  const size_t j1( j+SIMDSIZE );
909  const size_t j2( j+SIMDSIZE*2UL );
910  const size_t j3( j+SIMDSIZE*3UL );
911  const SIMDType x1( x.load(j ) );
912  const SIMDType x2( x.load(j1) );
913  const SIMDType x3( x.load(j2) );
914  const SIMDType x4( x.load(j3) );
915  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
916  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
917  }
918 
919  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
920  const size_t j1( j+SIMDSIZE );
921  const SIMDType x1( x.load(j ) );
922  const SIMDType x2( x.load(j1) );
923  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
924  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
925  }
926 
927  for( ; j<jpos; j+=SIMDSIZE ) {
928  const SIMDType x1( x.load(j) );
929  y[i ] += sum( A.load(i ,j) * x1 );
930  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
931  }
932 
933  for( ; remainder && j<jend; ++j ) {
934  y[i ] += A(i ,j) * x[j];
935  y[i+1UL] += A(i+1UL,j) * x[j];
936  }
937  }
938 
939  if( i < M )
940  {
941  const size_t jbegin( ( IsUpper<MT1>::value )
942  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
943  :( 0UL ) );
944  const size_t jend( ( IsLower<MT1>::value )
945  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
946  :( N ) );
947  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
948 
949  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
950  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
951 
952  size_t j( jbegin );
953 
954  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
955  const size_t j1( j+SIMDSIZE );
956  const size_t j2( j+SIMDSIZE*2UL );
957  const size_t j3( j+SIMDSIZE*3UL );
958  const SIMDType x1( x.load(j ) );
959  const SIMDType x2( x.load(j1) );
960  const SIMDType x3( x.load(j2) );
961  const SIMDType x4( x.load(j3) );
962  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
963  }
964 
965  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
966  const size_t j1( j+SIMDSIZE );
967  const SIMDType x1( x.load(j ) );
968  const SIMDType x2( x.load(j1) );
969  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
970  }
971 
972  for( ; j<jpos; j+=SIMDSIZE ) {
973  const SIMDType x1( x.load(j) );
974  y[i] += sum( A.load(i,j) * x1 );
975  }
976 
977  for( ; remainder && j<jend; ++j ) {
978  y[i] += A(i,j) * x[j];
979  }
980  }
981  }
983  //**********************************************************************************************
984 
985  //**BLAS-based assignment to dense vectors (default)********************************************
999  template< typename VT1 // Type of the left-hand side target vector
1000  , typename MT1 // Type of the left-hand side matrix operand
1001  , typename VT2 > // Type of the right-hand side vector operand
1002  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
1003  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1004  {
1005  selectLargeAssignKernel( y, A, x );
1006  }
1008  //**********************************************************************************************
1009 
1010  //**BLAS-based assignment to dense vectors******************************************************
1011 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1012 
1025  template< typename VT1 // Type of the left-hand side target vector
1026  , typename MT1 // Type of the left-hand side matrix operand
1027  , typename VT2 > // Type of the right-hand side vector operand
1028  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
1029  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1030  {
1031  typedef ElementType_<VT1> ET;
1032 
1033  if( IsTriangular<MT1>::value ) {
1034  assign( y, x );
1035  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1036  }
1037  else {
1038  gemv( y, A, x, ET(1), ET(0) );
1039  }
1040  }
1042 #endif
1043  //**********************************************************************************************
1044 
1045  //**Assignment to sparse vectors****************************************************************
1058  template< typename VT1 > // Type of the target sparse vector
1059  friend inline void assign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1060  {
1062 
1065  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
1066 
1067  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1068 
1069  const ResultType tmp( serial( rhs ) );
1070  assign( ~lhs, tmp );
1071  }
1073  //**********************************************************************************************
1074 
1075  //**Addition assignment to dense vectors********************************************************
1088  template< typename VT1 > // Type of the target dense vector
1089  friend inline void addAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1090  {
1092 
1093  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1094 
1095  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1096  return;
1097  }
1098 
1099  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1100  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1101 
1102  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1103  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1104  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1105  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1106 
1107  DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
1108  }
1110  //**********************************************************************************************
1111 
1112  //**Addition assignment to dense vectors (kernel selection)*************************************
1123  template< typename VT1 // Type of the left-hand side target vector
1124  , typename MT1 // Type of the left-hand side matrix operand
1125  , typename VT2 > // Type of the right-hand side vector operand
1126  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1127  {
1128  if( ( IsDiagonal<MT1>::value ) ||
1129  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1130  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1131  selectSmallAddAssignKernel( y, A, x );
1132  else
1133  selectBlasAddAssignKernel( y, A, x );
1134  }
1136  //**********************************************************************************************
1137 
1138  //**Default addition assignment to dense vectors************************************************
// Default (non-vectorized) addition assignment kernel: y += A * x.
//
// Parameters:
//   y  target vector; its addAssign() member receives the product
//   A  left-hand side matrix operand
//   x  right-hand side vector operand
//
// The product expression A * x is passed directly to y.addAssign().
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
{
   y.addAssign( A * x );
}
1160  //**********************************************************************************************
1161 
1162  //**Default addition assignment to dense vectors (small matrices)*******************************
1176  template< typename VT1 // Type of the left-hand side target vector
1177  , typename MT1 // Type of the left-hand side matrix operand
1178  , typename VT2 > // Type of the right-hand side vector operand
1179  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1180  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1181  {
1182  selectDefaultAddAssignKernel( y, A, x );
1183  }
1185  //**********************************************************************************************
1186 
1187  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized addition assignment kernel for small matrices: y += A * x.
// This overload is selected (via EnableIf_) when UseVectorizedDefaultKernel<VT1,MT1,VT2> holds.
//
// Parameters:
//   y  target vector, updated in place
//   A  left-hand side matrix operand, read through A.load(i,j) and A(i,j)
//   x  right-hand side vector operand, read through x.load(j) and x[j]
//
// Rows are processed in blocks of 8, 4, 3 and 2 rows, followed by at most one single row.
// For every row block the column range [jbegin,jend) is traversed in SIMD steps up to jpos,
// accumulating one SIMD register per row; the register sums are then added to y and a scalar
// loop handles the columns [jpos,jend) if a remainder is possible.
1201  template< typename VT1 // Type of the left-hand side target vector
1202  , typename MT1 // Type of the left-hand side matrix operand
1203  , typename VT2 > // Type of the right-hand side vector operand
1204  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1205  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1206  {
1207  const size_t M( A.rows() );
1208  const size_t N( A.columns() );
1209 
// A scalar remainder loop is required unless both the matrix and the vector type are padded
1210  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1211 
1212  size_t i( 0UL );
1213 
// Blocks of 8 rows
1214  for( ; (i+8UL) <= M; i+=8UL )
1215  {
// Upper matrices: start at the block's first row index (i+1 if strictly upper), rounded down
// by the SIMDSIZE mask (assumes SIMDSIZE is a power of two); otherwise start at column 0
1216  const size_t jbegin( ( IsUpper<MT1>::value )
1217  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1218  :( 0UL ) );
// Lower matrices: stop right after the block's last row index (one earlier if strictly lower);
// otherwise run over all N columns
1219  const size_t jend( ( IsLower<MT1>::value )
1220  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1221  :( N ) );
1222  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1223 
// End of the SIMD range: jend rounded down to a multiple of SIMDSIZE if a remainder loop
// follows, else jend itself (NOTE(review): presumably padding makes the full range loadable - confirm)
1224  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1225  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1226 
// One accumulator per row of the block
// NOTE(review): relies on default-constructed SIMDType objects being zero - confirm
1227  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1228  size_t j( jbegin );
1229 
// SIMD part: each loaded chunk of x is reused for all 8 rows
1230  for( ; j<jpos; j+=SIMDSIZE ) {
1231  const SIMDType x1( x.load(j) );
1232  xmm1 = xmm1 + A.load(i ,j) * x1;
1233  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1234  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1235  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1236  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
1237  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
1238  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
1239  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
1240  }
1241 
// Reduce every accumulator with sum() and add the result to the target element
1242  y[i ] += sum( xmm1 );
1243  y[i+1UL] += sum( xmm2 );
1244  y[i+2UL] += sum( xmm3 );
1245  y[i+3UL] += sum( xmm4 );
1246  y[i+4UL] += sum( xmm5 );
1247  y[i+5UL] += sum( xmm6 );
1248  y[i+6UL] += sum( xmm7 );
1249  y[i+7UL] += sum( xmm8 );
1250 
// Scalar remainder for the columns [jpos,jend)
1251  for( ; remainder && j<jend; ++j ) {
1252  y[i ] += A(i ,j) * x[j];
1253  y[i+1UL] += A(i+1UL,j) * x[j];
1254  y[i+2UL] += A(i+2UL,j) * x[j];
1255  y[i+3UL] += A(i+3UL,j) * x[j];
1256  y[i+4UL] += A(i+4UL,j) * x[j];
1257  y[i+5UL] += A(i+5UL,j) * x[j];
1258  y[i+6UL] += A(i+6UL,j) * x[j];
1259  y[i+7UL] += A(i+7UL,j) * x[j];
1260  }
1261  }
1262 
// Blocks of 4 rows (same scheme as above)
1263  for( ; (i+4UL) <= M; i+=4UL )
1264  {
1265  const size_t jbegin( ( IsUpper<MT1>::value )
1266  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1267  :( 0UL ) );
1268  const size_t jend( ( IsLower<MT1>::value )
1269  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1270  :( N ) );
1271  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1272 
1273  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1274  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1275 
1276  SIMDType xmm1, xmm2, xmm3, xmm4;
1277  size_t j( jbegin );
1278 
1279  for( ; j<jpos; j+=SIMDSIZE ) {
1280  const SIMDType x1( x.load(j) );
1281  xmm1 = xmm1 + A.load(i ,j) * x1;
1282  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1283  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1284  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1285  }
1286 
1287  y[i ] += sum( xmm1 );
1288  y[i+1UL] += sum( xmm2 );
1289  y[i+2UL] += sum( xmm3 );
1290  y[i+3UL] += sum( xmm4 );
1291 
1292  for( ; remainder && j<jend; ++j ) {
1293  y[i ] += A(i ,j) * x[j];
1294  y[i+1UL] += A(i+1UL,j) * x[j];
1295  y[i+2UL] += A(i+2UL,j) * x[j];
1296  y[i+3UL] += A(i+3UL,j) * x[j];
1297  }
1298  }
1299 
// Blocks of 3 rows
1300  for( ; (i+3UL) <= M; i+=3UL )
1301  {
1302  const size_t jbegin( ( IsUpper<MT1>::value )
1303  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1304  :( 0UL ) );
1305  const size_t jend( ( IsLower<MT1>::value )
1306  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1307  :( N ) );
1308  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1309 
1310  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1311  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1312 
1313  SIMDType xmm1, xmm2, xmm3;
1314  size_t j( jbegin );
1315 
1316  for( ; j<jpos; j+=SIMDSIZE ) {
1317  const SIMDType x1( x.load(j) );
1318  xmm1 = xmm1 + A.load(i ,j) * x1;
1319  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1320  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1321  }
1322 
1323  y[i ] += sum( xmm1 );
1324  y[i+1UL] += sum( xmm2 );
1325  y[i+2UL] += sum( xmm3 );
1326 
1327  for( ; remainder && j<jend; ++j ) {
1328  y[i ] += A(i ,j) * x[j];
1329  y[i+1UL] += A(i+1UL,j) * x[j];
1330  y[i+2UL] += A(i+2UL,j) * x[j];
1331  }
1332  }
1333 
// Blocks of 2 rows
1334  for( ; (i+2UL) <= M; i+=2UL )
1335  {
1336  const size_t jbegin( ( IsUpper<MT1>::value )
1337  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1338  :( 0UL ) );
1339  const size_t jend( ( IsLower<MT1>::value )
1340  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1341  :( N ) );
1342  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1343 
1344  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1345  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1346 
1347  SIMDType xmm1, xmm2;
1348  size_t j( jbegin );
1349 
1350  for( ; j<jpos; j+=SIMDSIZE ) {
1351  const SIMDType x1( x.load(j) );
1352  xmm1 = xmm1 + A.load(i ,j) * x1;
1353  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1354  }
1355 
1356  y[i ] += sum( xmm1 );
1357  y[i+1UL] += sum( xmm2 );
1358 
1359  for( ; remainder && j<jend; ++j ) {
1360  y[i ] += A(i ,j) * x[j];
1361  y[i+1UL] += A(i+1UL,j) * x[j];
1362  }
1363  }
1364 
// At most one row is left at this point
1365  if( i < M )
1366  {
1367  const size_t jbegin( ( IsUpper<MT1>::value )
1368  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1369  :( 0UL ) );
1370  const size_t jend( ( IsLower<MT1>::value )
1371  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1372  :( N ) );
1373  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1374 
1375  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1376  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1377 
1378  SIMDType xmm1;
1379  size_t j( jbegin );
1380 
1381  for( ; j<jpos; j+=SIMDSIZE ) {
1382  xmm1 = xmm1 + A.load(i,j) * x.load(j);
1383  }
1384 
1385  y[i] += sum( xmm1 );
1386 
1387  for( ; remainder && j<jend; ++j ) {
1388  y[i] += A(i,j) * x[j];
1389  }
1390  }
1391  }
1393  //**********************************************************************************************
1394 
1395  //**Default addition assignment to dense vectors (large matrices)*******************************
1409  template< typename VT1 // Type of the left-hand side target vector
1410  , typename MT1 // Type of the left-hand side matrix operand
1411  , typename VT2 > // Type of the right-hand side vector operand
1412  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1413  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1414  {
1415  selectDefaultAddAssignKernel( y, A, x );
1416  }
1418  //**********************************************************************************************
1419 
1420  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized addition assignment kernel for large matrices: y += A * x.
// This overload is selected (via EnableIf_) when UseVectorizedDefaultKernel<VT1,MT1,VT2> holds.
//
// Parameters:
//   y  target vector, updated in place
//   A  left-hand side matrix operand, read through A.load(i,j) and A(i,j)
//   x  right-hand side vector operand, read through x.load(j) and x[j]
//
// Rows are processed in blocks of 8, 4 and 2 rows, followed by at most one single row.
// For every row block the column range [jbegin,jend) is traversed with SIMD steps covering
// four, two and one SIMD vectors per iteration; each step's partial result is reduced with
// sum() and added directly to y. A scalar loop handles the columns [jpos,jend) if a
// remainder is possible.
1434  template< typename VT1 // Type of the left-hand side target vector
1435  , typename MT1 // Type of the left-hand side matrix operand
1436  , typename VT2 > // Type of the right-hand side vector operand
1437  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1438  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1439  {
1440  const size_t M( A.rows() );
1441  const size_t N( A.columns() );
1442 
// A scalar remainder loop is required unless both the matrix and the vector type are padded
1443  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1444 
1445  size_t i( 0UL );
1446 
// Blocks of 8 rows
1447  for( ; (i+8UL) <= M; i+=8UL )
1448  {
// Upper matrices: start at the block's first row index (i+1 if strictly upper), rounded down
// by the SIMDSIZE mask (assumes SIMDSIZE is a power of two); otherwise start at column 0
1449  const size_t jbegin( ( IsUpper<MT1>::value )
1450  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1451  :( 0UL ) );
// Lower matrices: stop right after the block's last row index (one earlier if strictly lower);
// otherwise run over all N columns
1452  const size_t jend( ( IsLower<MT1>::value )
1453  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1454  :( N ) );
1455  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1456 
// End of the SIMD range: jend rounded down to a multiple of SIMDSIZE if a remainder loop
// follows, else jend itself (NOTE(review): presumably padding makes the full range loadable - confirm)
1457  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1458  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1459 
1460  size_t j( jbegin );
1461 
// Four SIMD vectors per iteration: runs while the start of the fourth vector (j3) is below jpos
1462  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1463  const size_t j1( j+SIMDSIZE );
1464  const size_t j2( j+SIMDSIZE*2UL );
1465  const size_t j3( j+SIMDSIZE*3UL );
1466  const SIMDType x1( x.load(j ) );
1467  const SIMDType x2( x.load(j1) );
1468  const SIMDType x3( x.load(j2) );
1469  const SIMDType x4( x.load(j3) );
1470  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1471  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1472  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1473  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1474  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1475  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1476  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1477  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
1478  }
1479 
// Two SIMD vectors per iteration
1480  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1481  const size_t j1( j+SIMDSIZE );
1482  const SIMDType x1( x.load(j ) );
1483  const SIMDType x2( x.load(j1) );
1484  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1485  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1486  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1487  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1488  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1489  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1490  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1491  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
1492  }
1493 
// One SIMD vector per iteration
1494  for( ; j<jpos; j+=SIMDSIZE ) {
1495  const SIMDType x1( x.load(j) );
1496  y[i ] += sum( A.load(i ,j) * x1 );
1497  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1498  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1499  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1500  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
1501  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
1502  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
1503  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
1504  }
1505 
// Scalar remainder for the columns [jpos,jend)
1506  for( ; remainder && j<jend; ++j ) {
1507  y[i ] += A(i ,j) * x[j];
1508  y[i+1UL] += A(i+1UL,j) * x[j];
1509  y[i+2UL] += A(i+2UL,j) * x[j];
1510  y[i+3UL] += A(i+3UL,j) * x[j];
1511  y[i+4UL] += A(i+4UL,j) * x[j];
1512  y[i+5UL] += A(i+5UL,j) * x[j];
1513  y[i+6UL] += A(i+6UL,j) * x[j];
1514  y[i+7UL] += A(i+7UL,j) * x[j];
1515  }
1516  }
1517 
// Blocks of 4 rows (same scheme as above)
1518  for( ; (i+4UL) <= M; i+=4UL )
1519  {
1520  const size_t jbegin( ( IsUpper<MT1>::value )
1521  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1522  :( 0UL ) );
1523  const size_t jend( ( IsLower<MT1>::value )
1524  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1525  :( N ) );
1526  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1527 
1528  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1529  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1530 
1531  size_t j( jbegin );
1532 
1533  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1534  const size_t j1( j+SIMDSIZE );
1535  const size_t j2( j+SIMDSIZE*2UL );
1536  const size_t j3( j+SIMDSIZE*3UL );
1537  const SIMDType x1( x.load(j ) );
1538  const SIMDType x2( x.load(j1) );
1539  const SIMDType x3( x.load(j2) );
1540  const SIMDType x4( x.load(j3) );
1541  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1542  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1543  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1544  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1545  }
1546 
1547  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1548  const size_t j1( j+SIMDSIZE );
1549  const SIMDType x1( x.load(j ) );
1550  const SIMDType x2( x.load(j1) );
1551  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1552  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1553  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1554  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1555  }
1556 
1557  for( ; j<jpos; j+=SIMDSIZE ) {
1558  const SIMDType x1( x.load(j) );
1559  y[i ] += sum( A.load(i ,j) * x1 );
1560  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1561  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1562  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1563  }
1564 
1565  for( ; remainder && j<jend; ++j ) {
1566  y[i ] += A(i ,j) * x[j];
1567  y[i+1UL] += A(i+1UL,j) * x[j];
1568  y[i+2UL] += A(i+2UL,j) * x[j];
1569  y[i+3UL] += A(i+3UL,j) * x[j];
1570  }
1571  }
1572 
// Blocks of 2 rows
1573  for( ; (i+2UL) <= M; i+=2UL )
1574  {
1575  const size_t jbegin( ( IsUpper<MT1>::value )
1576  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1577  :( 0UL ) );
1578  const size_t jend( ( IsLower<MT1>::value )
1579  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1580  :( N ) );
1581  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1582 
1583  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1584  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1585 
1586  size_t j( jbegin );
1587 
1588  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1589  const size_t j1( j+SIMDSIZE );
1590  const size_t j2( j+SIMDSIZE*2UL );
1591  const size_t j3( j+SIMDSIZE*3UL );
1592  const SIMDType x1( x.load(j ) );
1593  const SIMDType x2( x.load(j1) );
1594  const SIMDType x3( x.load(j2) );
1595  const SIMDType x4( x.load(j3) );
1596  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1597  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1598  }
1599 
1600  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1601  const size_t j1( j+SIMDSIZE );
1602  const SIMDType x1( x.load(j ) );
1603  const SIMDType x2( x.load(j1) );
1604  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1605  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1606  }
1607 
1608  for( ; j<jpos; j+=SIMDSIZE ) {
1609  const SIMDType x1( x.load(j) );
1610  y[i ] += sum( A.load(i ,j) * x1 );
1611  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1612  }
1613 
1614  for( ; remainder && j<jend; ++j ) {
1615  y[i ] += A(i ,j) * x[j];
1616  y[i+1UL] += A(i+1UL,j) * x[j];
1617  }
1618  }
1619 
// At most one row is left at this point
1620  if( i < M )
1621  {
1622  const size_t jbegin( ( IsUpper<MT1>::value )
1623  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1624  :( 0UL ) );
1625  const size_t jend( ( IsLower<MT1>::value )
1626  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1627  :( N ) );
1628  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1629 
1630  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1631  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1632 
1633  size_t j( jbegin );
1634 
1635  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
1636  const size_t j1( j+SIMDSIZE );
1637  const size_t j2( j+SIMDSIZE*2UL );
1638  const size_t j3( j+SIMDSIZE*3UL );
1639  const SIMDType x1( x.load(j ) );
1640  const SIMDType x2( x.load(j1) );
1641  const SIMDType x3( x.load(j2) );
1642  const SIMDType x4( x.load(j3) );
1643  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1644  }
1645 
1646  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
1647  const size_t j1( j+SIMDSIZE );
1648  const SIMDType x1( x.load(j ) );
1649  const SIMDType x2( x.load(j1) );
1650  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1651  }
1652 
1653  for( ; j<jpos; j+=SIMDSIZE ) {
1654  const SIMDType x1( x.load(j) );
1655  y[i] += sum( A.load(i,j) * x1 );
1656  }
1657 
1658  for( ; remainder && j<jend; ++j ) {
1659  y[i] += A(i,j) * x[j];
1660  }
1661  }
1662  }
1664  //**********************************************************************************************
1665 
1666  //**BLAS-based addition assignment to dense vectors (default)***********************************
1680  template< typename VT1 // Type of the left-hand side target vector
1681  , typename MT1 // Type of the left-hand side matrix operand
1682  , typename VT2 > // Type of the right-hand side vector operand
1683  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
1684  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1685  {
1686  selectLargeAddAssignKernel( y, A, x );
1687  }
1689  //**********************************************************************************************
1690 
1691  //**BLAS-based addition assignment to dense vectors*********************************************
1692 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1693 
1706  template< typename VT1 // Type of the left-hand side target vector
1707  , typename MT1 // Type of the left-hand side matrix operand
1708  , typename VT2 > // Type of the right-hand side vector operand
1709  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
1710  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1711  {
1712  typedef ElementType_<VT1> ET;
1713 
1714  if( IsTriangular<MT1>::value ) {
1715  ResultType_<VT1> tmp( serial( x ) );
1716  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1717  addAssign( y, tmp );
1718  }
1719  else {
1720  gemv( y, A, x, ET(1), ET(1) );
1721  }
1722  }
1724 #endif
1725  //**********************************************************************************************
1726 
1727  //**Addition assignment to sparse vectors*******************************************************
1728  // No special implementation for the addition assignment to sparse vectors.
1729  //**********************************************************************************************
1730 
1731  //**Subtraction assignment to dense vectors*****************************************************
1744  template< typename VT1 > // Type of the target dense vector
1745  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1746  {
1748 
1749  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1750 
1751  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1752  return;
1753  }
1754 
1755  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1756  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1757 
1758  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1759  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1760  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1761  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1762 
1763  DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1764  }
1766  //**********************************************************************************************
1767 
1768  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1779  template< typename VT1 // Type of the left-hand side target vector
1780  , typename MT1 // Type of the left-hand side matrix operand
1781  , typename VT2 > // Type of the right-hand side vector operand
1782  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1783  {
1784  if( ( IsDiagonal<MT1>::value ) ||
1785  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1786  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1787  selectSmallSubAssignKernel( y, A, x );
1788  else
1789  selectBlasSubAssignKernel( y, A, x );
1790  }
1792  //**********************************************************************************************
1793 
1794  //**Default subtraction assignment to dense vectors*********************************************
// Default (non-vectorized) subtraction assignment kernel: y -= A * x.
//
// Parameters:
//   y  target vector; its subAssign() member receives the product
//   A  left-hand side matrix operand
//   x  right-hand side vector operand
//
// The product expression A * x is passed directly to y.subAssign().
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
{
   y.subAssign( A * x );
}
1816  //**********************************************************************************************
1817 
1818  //**Default subtraction assignment to dense vectors (small matrices)****************************
1832  template< typename VT1 // Type of the left-hand side target vector
1833  , typename MT1 // Type of the left-hand side matrix operand
1834  , typename VT2 > // Type of the right-hand side vector operand
1835  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1836  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1837  {
1838  selectDefaultSubAssignKernel( y, A, x );
1839  }
1841  //**********************************************************************************************
1842 
1843  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized subtraction assignment kernel for small matrices: y -= A * x.
// This overload is selected (via EnableIf_) when UseVectorizedDefaultKernel<VT1,MT1,VT2> holds.
//
// Parameters:
//   y  target vector, updated in place
//   A  left-hand side matrix operand, read through A.load(i,j) and A(i,j)
//   x  right-hand side vector operand, read through x.load(j) and x[j]
//
// Rows are processed in blocks of 8, 4, 3 and 2 rows, followed by at most one single row.
// For every row block the column range [jbegin,jend) is traversed in SIMD steps up to jpos,
// accumulating one SIMD register per row; the register sums are then subtracted from y and a
// scalar loop handles the columns [jpos,jend) if a remainder is possible.
1857  template< typename VT1 // Type of the left-hand side target vector
1858  , typename MT1 // Type of the left-hand side matrix operand
1859  , typename VT2 > // Type of the right-hand side vector operand
1860  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1861  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1862  {
1863  const size_t M( A.rows() );
1864  const size_t N( A.columns() );
1865 
// A scalar remainder loop is required unless both the matrix and the vector type are padded
1866  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1867 
1868  size_t i( 0UL );
1869 
// Blocks of 8 rows
1870  for( ; (i+8UL) <= M; i+=8UL )
1871  {
// Upper matrices: start at the block's first row index (i+1 if strictly upper), rounded down
// by the SIMDSIZE mask (assumes SIMDSIZE is a power of two); otherwise start at column 0
1872  const size_t jbegin( ( IsUpper<MT1>::value )
1873  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1874  :( 0UL ) );
// Lower matrices: stop right after the block's last row index (one earlier if strictly lower);
// otherwise run over all N columns
1875  const size_t jend( ( IsLower<MT1>::value )
1876  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1877  :( N ) );
1878  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1879 
// End of the SIMD range: jend rounded down to a multiple of SIMDSIZE if a remainder loop
// follows, else jend itself (NOTE(review): presumably padding makes the full range loadable - confirm)
1880  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1881  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1882 
// One accumulator per row of the block
// NOTE(review): relies on default-constructed SIMDType objects being zero - confirm
1883  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1884  size_t j( jbegin );
1885 
// SIMD part: each loaded chunk of x is reused for all 8 rows
1886  for( ; j<jpos; j+=SIMDSIZE ) {
1887  const SIMDType x1( x.load(j) );
1888  xmm1 = xmm1 + A.load(i ,j) * x1;
1889  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1890  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1891  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1892  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
1893  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
1894  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
1895  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
1896  }
1897 
// Reduce every accumulator with sum() and subtract the result from the target element
1898  y[i ] -= sum( xmm1 );
1899  y[i+1UL] -= sum( xmm2 );
1900  y[i+2UL] -= sum( xmm3 );
1901  y[i+3UL] -= sum( xmm4 );
1902  y[i+4UL] -= sum( xmm5 );
1903  y[i+5UL] -= sum( xmm6 );
1904  y[i+6UL] -= sum( xmm7 );
1905  y[i+7UL] -= sum( xmm8 );
1906 
// Scalar remainder for the columns [jpos,jend)
1907  for( ; remainder && j<jend; ++j ) {
1908  y[i ] -= A(i ,j) * x[j];
1909  y[i+1UL] -= A(i+1UL,j) * x[j];
1910  y[i+2UL] -= A(i+2UL,j) * x[j];
1911  y[i+3UL] -= A(i+3UL,j) * x[j];
1912  y[i+4UL] -= A(i+4UL,j) * x[j];
1913  y[i+5UL] -= A(i+5UL,j) * x[j];
1914  y[i+6UL] -= A(i+6UL,j) * x[j];
1915  y[i+7UL] -= A(i+7UL,j) * x[j];
1916  }
1917  }
1918 
// Blocks of 4 rows (same scheme as above)
1919  for( ; (i+4UL) <= M; i+=4UL )
1920  {
1921  const size_t jbegin( ( IsUpper<MT1>::value )
1922  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1923  :( 0UL ) );
1924  const size_t jend( ( IsLower<MT1>::value )
1925  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1926  :( N ) );
1927  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1928 
1929  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1930  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1931 
1932  SIMDType xmm1, xmm2, xmm3, xmm4;
1933  size_t j( jbegin );
1934 
1935  for( ; j<jpos; j+=SIMDSIZE ) {
1936  const SIMDType x1( x.load(j) );
1937  xmm1 = xmm1 + A.load(i ,j) * x1;
1938  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1939  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1940  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1941  }
1942 
1943  y[i ] -= sum( xmm1 );
1944  y[i+1UL] -= sum( xmm2 );
1945  y[i+2UL] -= sum( xmm3 );
1946  y[i+3UL] -= sum( xmm4 );
1947 
1948  for( ; remainder && j<jend; ++j ) {
1949  y[i ] -= A(i ,j) * x[j];
1950  y[i+1UL] -= A(i+1UL,j) * x[j];
1951  y[i+2UL] -= A(i+2UL,j) * x[j];
1952  y[i+3UL] -= A(i+3UL,j) * x[j];
1953  }
1954  }
1955 
// Blocks of 3 rows
1956  for( ; (i+3UL) <= M; i+=3UL )
1957  {
1958  const size_t jbegin( ( IsUpper<MT1>::value )
1959  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1960  :( 0UL ) );
1961  const size_t jend( ( IsLower<MT1>::value )
1962  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1963  :( N ) );
1964  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1965 
1966  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1967  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
1968 
1969  SIMDType xmm1, xmm2, xmm3;
1970  size_t j( jbegin );
1971 
1972  for( ; j<jpos; j+=SIMDSIZE ) {
1973  const SIMDType x1( x.load(j) );
1974  xmm1 = xmm1 + A.load(i ,j) * x1;
1975  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1976  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1977  }
1978 
1979  y[i ] -= sum( xmm1 );
1980  y[i+1UL] -= sum( xmm2 );
1981  y[i+2UL] -= sum( xmm3 );
1982 
1983  for( ; remainder && j<jend; ++j ) {
1984  y[i ] -= A(i ,j) * x[j];
1985  y[i+1UL] -= A(i+1UL,j) * x[j];
1986  y[i+2UL] -= A(i+2UL,j) * x[j];
1987  }
1988  }
1989 
// Blocks of 2 rows
1990  for( ; (i+2UL) <= M; i+=2UL )
1991  {
1992  const size_t jbegin( ( IsUpper<MT1>::value )
1993  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
1994  :( 0UL ) );
1995  const size_t jend( ( IsLower<MT1>::value )
1996  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1997  :( N ) );
1998  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1999 
2000  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2001  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2002 
2003  SIMDType xmm1, xmm2;
2004  size_t j( jbegin );
2005 
2006  for( ; j<jpos; j+=SIMDSIZE ) {
2007  const SIMDType x1( x.load(j) );
2008  xmm1 = xmm1 + A.load(i ,j) * x1;
2009  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2010  }
2011 
2012  y[i ] -= sum( xmm1 );
2013  y[i+1UL] -= sum( xmm2 );
2014 
2015  for( ; remainder && j<jend; ++j ) {
2016  y[i ] -= A(i ,j) * x[j];
2017  y[i+1UL] -= A(i+1UL,j) * x[j];
2018  }
2019  }
2020 
// At most one row is left at this point
2021  if( i < M )
2022  {
2023  const size_t jbegin( ( IsUpper<MT1>::value )
2024  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2025  :( 0UL ) );
2026  const size_t jend( ( IsLower<MT1>::value )
2027  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2028  :( N ) );
2029  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2030 
2031  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2032  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2033 
2034  SIMDType xmm1;
2035  size_t j( jbegin );
2036 
2037  for( ; j<jpos; j+=SIMDSIZE ) {
2038  xmm1 = xmm1 + A.load(i,j) * x.load(j);
2039  }
2040 
2041  y[i] -= sum( xmm1 );
2042 
2043  for( ; remainder && j<jend; ++j ) {
2044  y[i] -= A(i,j) * x[j];
2045  }
2046  }
2047  }
2049  //**********************************************************************************************
2050 
2051  //**Default subtraction assignment to dense vectors (large matrices)****************************
2065  template< typename VT1 // Type of the left-hand side target vector
2066  , typename MT1 // Type of the left-hand side matrix operand
2067  , typename VT2 > // Type of the right-hand side vector operand
2068  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
2069  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2070  {
2071  selectDefaultSubAssignKernel( y, A, x );
2072  }
2074  //**********************************************************************************************
2075 
2076  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
/*!\brief Vectorized subtraction assignment kernel for large matrices (\f$ \vec{y}-=A*\vec{x} \f$).
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \return void
//
// This overload is enabled when UseVectorizedDefaultKernel<VT1,MT1,VT2> holds. The rows of \a A
// are handled in blocks of 8, 4 and 2 rows, followed by a single leftover row. Within each row
// block the columns are traversed in SIMD chunks that are unrolled four times, twice and once,
// followed by an element-wise remainder loop. The horizontal sum of every chunk is subtracted
// directly from the affected elements of \a y.
*/
2090  template< typename VT1 // Type of the left-hand side target vector
2091  , typename MT1 // Type of the left-hand side matrix operand
2092  , typename VT2 > // Type of the right-hand side vector operand
2093  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
2094  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2095  {
2096  const size_t M( A.rows() );
2097  const size_t N( A.columns() );
2098 
// An element-wise tail loop is required unless both the matrix and the vector are padded
2099  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
2100 
2101  size_t i( 0UL );
2102 
// Row blocks of 8: rows i .. i+7 share every loaded chunk of x
2103  for( ; (i+8UL) <= M; i+=8UL )
2104  {
// Upper matrices: start at column i (i+1 if strictly upper), masked with size_t(-SIMDSIZE);
// all other matrices start at column 0
2105  const size_t jbegin( ( IsUpper<MT1>::value )
2106  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2107  :( 0UL ) );
// Lower matrices: stop at column i+8 (i+7 if strictly lower); all other matrices use all N columns
2108  const size_t jend( ( IsLower<MT1>::value )
2109  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
2110  :( N ) );
2111  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2112 
// End of the SIMD range: jend rounded down to a multiple of SIMDSIZE if a tail loop follows,
// otherwise jend itself (see the assertion below)
2113  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2114  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2115 
2116  size_t j( jbegin );
2117 
// Four SIMD chunks per iteration
2118  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2119  const size_t j1( j+SIMDSIZE );
2120  const size_t j2( j+SIMDSIZE*2UL );
2121  const size_t j3( j+SIMDSIZE*3UL );
2122  const SIMDType x1( x.load(j ) );
2123  const SIMDType x2( x.load(j1) );
2124  const SIMDType x3( x.load(j2) );
2125  const SIMDType x4( x.load(j3) );
2126  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2127  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2128  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2129  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2130  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2131  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2132  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2133  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
2134  }
2135 
// Two SIMD chunks per iteration
2136  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2137  const size_t j1( j+SIMDSIZE );
2138  const SIMDType x1( x.load(j ) );
2139  const SIMDType x2( x.load(j1) );
2140  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2141  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2142  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2143  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2144  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2145  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2146  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2147  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
2148  }
2149 
// One SIMD chunk per iteration
2150  for( ; j<jpos; j+=SIMDSIZE ) {
2151  const SIMDType x1( x.load(j) );
2152  y[i ] -= sum( A.load(i ,j) * x1 );
2153  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2154  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2155  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2156  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 );
2157  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 );
2158  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 );
2159  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 );
2160  }
2161 
// Element-wise tail for the columns in [jpos, jend); skipped entirely if no remainder is needed
2162  for( ; remainder && j<jend; ++j ) {
2163  y[i ] -= A(i ,j) * x[j];
2164  y[i+1UL] -= A(i+1UL,j) * x[j];
2165  y[i+2UL] -= A(i+2UL,j) * x[j];
2166  y[i+3UL] -= A(i+3UL,j) * x[j];
2167  y[i+4UL] -= A(i+4UL,j) * x[j];
2168  y[i+5UL] -= A(i+5UL,j) * x[j];
2169  y[i+6UL] -= A(i+6UL,j) * x[j];
2170  y[i+7UL] -= A(i+7UL,j) * x[j];
2171  }
2172  }
2173 
// Row blocks of 4: same column scheme as above, applied to rows i .. i+3
2174  for( ; (i+4UL) <= M; i+=4UL )
2175  {
2176  const size_t jbegin( ( IsUpper<MT1>::value )
2177  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2178  :( 0UL ) );
// Lower matrices: stop at column i+4 (i+3 if strictly lower)
2179  const size_t jend( ( IsLower<MT1>::value )
2180  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
2181  :( N ) );
2182  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2183 
2184  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2185  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2186 
2187  size_t j( jbegin );
2188 
2189  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2190  const size_t j1( j+SIMDSIZE );
2191  const size_t j2( j+SIMDSIZE*2UL );
2192  const size_t j3( j+SIMDSIZE*3UL );
2193  const SIMDType x1( x.load(j ) );
2194  const SIMDType x2( x.load(j1) );
2195  const SIMDType x3( x.load(j2) );
2196  const SIMDType x4( x.load(j3) );
2197  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2198  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2199  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2200  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2201  }
2202 
2203  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2204  const size_t j1( j+SIMDSIZE );
2205  const SIMDType x1( x.load(j ) );
2206  const SIMDType x2( x.load(j1) );
2207  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2208  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2209  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2210  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2211  }
2212 
2213  for( ; j<jpos; j+=SIMDSIZE ) {
2214  const SIMDType x1( x.load(j) );
2215  y[i ] -= sum( A.load(i ,j) * x1 );
2216  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2217  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2218  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2219  }
2220 
2221  for( ; remainder && j<jend; ++j ) {
2222  y[i ] -= A(i ,j) * x[j];
2223  y[i+1UL] -= A(i+1UL,j) * x[j];
2224  y[i+2UL] -= A(i+2UL,j) * x[j];
2225  y[i+3UL] -= A(i+3UL,j) * x[j];
2226  }
2227  }
2228 
// Row blocks of 2: rows i and i+1
2229  for( ; (i+2UL) <= M; i+=2UL )
2230  {
2231  const size_t jbegin( ( IsUpper<MT1>::value )
2232  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2233  :( 0UL ) );
// Lower matrices: stop at column i+2 (i+1 if strictly lower)
2234  const size_t jend( ( IsLower<MT1>::value )
2235  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
2236  :( N ) );
2237  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2238 
2239  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2240  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2241 
2242  size_t j( jbegin );
2243 
2244  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2245  const size_t j1( j+SIMDSIZE );
2246  const size_t j2( j+SIMDSIZE*2UL );
2247  const size_t j3( j+SIMDSIZE*3UL );
2248  const SIMDType x1( x.load(j ) );
2249  const SIMDType x2( x.load(j1) );
2250  const SIMDType x3( x.load(j2) );
2251  const SIMDType x4( x.load(j3) );
2252  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2253  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2254  }
2255 
2256  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2257  const size_t j1( j+SIMDSIZE );
2258  const SIMDType x1( x.load(j ) );
2259  const SIMDType x2( x.load(j1) );
2260  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2261  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2262  }
2263 
2264  for( ; j<jpos; j+=SIMDSIZE ) {
2265  const SIMDType x1( x.load(j) );
2266  y[i ] -= sum( A.load(i ,j) * x1 );
2267  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2268  }
2269 
2270  for( ; remainder && j<jend; ++j ) {
2271  y[i ] -= A(i ,j) * x[j];
2272  y[i+1UL] -= A(i+1UL,j) * x[j];
2273  }
2274  }
2275 
// At most one row is left after the blocks of 8, 4 and 2
2276  if( i < M )
2277  {
2278  const size_t jbegin( ( IsUpper<MT1>::value )
2279  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
2280  :( 0UL ) );
// Lower matrices: stop at column i+1 (i if strictly lower)
2281  const size_t jend( ( IsLower<MT1>::value )
2282  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2283  :( N ) );
2284  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2285 
2286  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
2287  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
2288 
2289  size_t j( jbegin );
2290 
2291  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
2292  const size_t j1( j+SIMDSIZE );
2293  const size_t j2( j+SIMDSIZE*2UL );
2294  const size_t j3( j+SIMDSIZE*3UL );
2295  const SIMDType x1( x.load(j ) );
2296  const SIMDType x2( x.load(j1) );
2297  const SIMDType x3( x.load(j2) );
2298  const SIMDType x4( x.load(j3) );
2299  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2300  }
2301 
2302  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
2303  const size_t j1( j+SIMDSIZE );
2304  const SIMDType x1( x.load(j ) );
2305  const SIMDType x2( x.load(j1) );
2306  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2307  }
2308 
2309  for( ; j<jpos; j+=SIMDSIZE ) {
2310  const SIMDType x1( x.load(j) );
2311  y[i] -= sum( A.load(i,j) * x1 );
2312  }
2313 
2314  for( ; remainder && j<jend; ++j ) {
2315  y[i] -= A(i,j) * x[j];
2316  }
2317  }
2318  }
2320  //**********************************************************************************************
2321 
2322  //**BLAS-based subtraction assignment to dense vectors (default)********************************
2336  template< typename VT1 // Type of the left-hand side target vector
2337  , typename MT1 // Type of the left-hand side matrix operand
2338  , typename VT2 > // Type of the right-hand side vector operand
2339  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
2340  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2341  {
2342  selectLargeSubAssignKernel( y, A, x );
2343  }
2345  //**********************************************************************************************
2346 
2347  //**BLAS-based subtraction assignment to dense vectors******************************************
2348 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2349 
2362  template< typename VT1 // Type of the left-hand side target vector
2363  , typename MT1 // Type of the left-hand side matrix operand
2364  , typename VT2 > // Type of the right-hand side vector operand
2365  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
2366  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2367  {
2368  typedef ElementType_<VT1> ET;
2369 
2370  if( IsTriangular<MT1>::value ) {
2371  ResultType_<VT1> tmp( serial( x ) );
2372  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2373  subAssign( y, tmp );
2374  }
2375  else {
2376  gemv( y, A, x, ET(-1), ET(1) );
2377  }
2378  }
2380 #endif
2381  //**********************************************************************************************
2382 
2383  //**Subtraction assignment to sparse vectors****************************************************
2384  // No special implementation for the subtraction assignment to sparse vectors.
2385  //**********************************************************************************************
2386 
2387  //**Multiplication assignment to dense vectors**************************************************
2400  template< typename VT1 > // Type of the target dense vector
2401  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2402  {
2404 
2407  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2408 
2409  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2410 
2411  const ResultType tmp( serial( rhs ) );
2412  multAssign( ~lhs, tmp );
2413  }
2415  //**********************************************************************************************
2416 
2417  //**Multiplication assignment to sparse vectors*************************************************
2418  // No special implementation for the multiplication assignment to sparse vectors.
2419  //**********************************************************************************************
2420 
2421  //**Division assignment to dense vectors********************************************************
2434  template< typename VT1 > // Type of the target dense vector
2435  friend inline void divAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2436  {
2438 
2441  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2442 
2443  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2444 
2445  const ResultType tmp( serial( rhs ) );
2446  divAssign( ~lhs, tmp );
2447  }
2449  //**********************************************************************************************
2450 
2451  //**Division assignment to sparse vectors*******************************************************
2452  // No special implementation for the division assignment to sparse vectors.
2453  //**********************************************************************************************
2454 
2455  //**SMP assignment to dense vectors*************************************************************
2470  template< typename VT1 > // Type of the target dense vector
2471  friend inline EnableIf_< UseSMPAssign<VT1> >
2472  smpAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2473  {
2475 
2476  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2477 
2478  if( rhs.mat_.rows() == 0UL ) {
2479  return;
2480  }
2481  else if( rhs.mat_.columns() == 0UL ) {
2482  reset( ~lhs );
2483  return;
2484  }
2485 
2486  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2487  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2488 
2489  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2490  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2491  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2492  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2493 
2494  smpAssign( ~lhs, A * x );
2495  }
2497  //**********************************************************************************************
2498 
2499  //**SMP assignment to sparse vectors************************************************************
2514  template< typename VT1 > // Type of the target sparse vector
2515  friend inline EnableIf_< UseSMPAssign<VT1> >
2516  smpAssign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2517  {
2519 
2522  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2523 
2524  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2525 
2526  const ResultType tmp( rhs );
2527  smpAssign( ~lhs, tmp );
2528  }
2530  //**********************************************************************************************
2531 
2532  //**SMP addition assignment to dense vectors****************************************************
2547  template< typename VT1 > // Type of the target dense vector
2548  friend inline EnableIf_< UseSMPAssign<VT1> >
2549  smpAddAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2550  {
2552 
2553  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2554 
2555  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2556  return;
2557  }
2558 
2559  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2560  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2561 
2562  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2563  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2564  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2565  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2566 
2567  smpAddAssign( ~lhs, A * x );
2568  }
2570  //**********************************************************************************************
2571 
2572  //**SMP addition assignment to sparse vectors***************************************************
2573  // No special implementation for the SMP addition assignment to sparse vectors.
2574  //**********************************************************************************************
2575 
2576  //**SMP subtraction assignment to dense vectors*************************************************
2591  template< typename VT1 > // Type of the target dense vector
2592  friend inline EnableIf_< UseSMPAssign<VT1> >
2593  smpSubAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2594  {
2596 
2597  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2598 
2599  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2600  return;
2601  }
2602 
2603  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2604  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2605 
2606  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2607  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2608  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2609  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2610 
2611  smpSubAssign( ~lhs, A * x );
2612  }
2614  //**********************************************************************************************
2615 
2616  //**SMP subtraction assignment to sparse vectors************************************************
2617  // No special implementation for the SMP subtraction assignment to sparse vectors.
2618  //**********************************************************************************************
2619 
2620  //**SMP multiplication assignment to dense vectors**********************************************
2635  template< typename VT1 > // Type of the target dense vector
2636  friend inline EnableIf_< UseSMPAssign<VT1> >
2637  smpMultAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2638  {
2640 
2643  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2644 
2645  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2646 
2647  const ResultType tmp( rhs );
2648  smpMultAssign( ~lhs, tmp );
2649  }
2651  //**********************************************************************************************
2652 
2653  //**SMP multiplication assignment to sparse vectors*********************************************
2654  // No special implementation for the SMP multiplication assignment to sparse vectors.
2655  //**********************************************************************************************
2656 
2657  //**SMP division assignment to dense vectors****************************************************
2672  template< typename VT1 > // Type of the target dense vector
2673  friend inline EnableIf_< UseSMPAssign<VT1> >
2674  smpDivAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2675  {
2677 
2680  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2681 
2682  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2683 
2684  const ResultType tmp( rhs );
2685  smpDivAssign( ~lhs, tmp );
2686  }
2688  //**********************************************************************************************
2689 
2690  //**SMP division assignment to sparse vectors***************************************************
2691  // No special implementation for the SMP division assignment to sparse vectors.
2692  //**********************************************************************************************
2693 
2694  //**Compile time checks*************************************************************************
2702  //**********************************************************************************************
2703 };
2704 //*************************************************************************************************
2705 
2706 
2707 
2708 
2709 //=================================================================================================
2710 //
2711 // DVECSCALARMULTEXPR SPECIALIZATION
2712 //
2713 //=================================================================================================
2714 
2715 //*************************************************************************************************
2723 template< typename MT // Type of the left-hand side dense matrix
2724  , typename VT // Type of the right-hand side dense vector
2725  , typename ST > // Type of the scalar value
2726 class DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >
2727  : public DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false >
2728  , private VecScalarMultExpr
2729  , private Computation
2730 {
2731  private:
2732  //**Type definitions****************************************************************************
// Private shorthand types for the wrapped matrix/vector multiplication and its operands
2733  typedef DMatDVecMultExpr<MT,VT> MVM; //!< Type of the dense matrix/dense vector multiplication expression.
2734  typedef ResultType_<MVM> RES; //!< Result type of the dense matrix/dense vector multiplication expression.
2735  typedef ResultType_<MT> MRT; //!< Result type of the left-hand side dense matrix expression.
2736  typedef ResultType_<VT> VRT; //!< Result type of the right-hand side dense vector expression.
2737  typedef ElementType_<MRT> MET; //!< Element type of the left-hand side dense matrix expression.
2738  typedef ElementType_<VRT> VET; //!< Element type of the right-hand side dense vector expression.
2739  typedef CompositeType_<MT> MCT; //!< Composite type of the left-hand side dense matrix expression.
2740  typedef CompositeType_<VT> VCT; //!< Composite type of the right-hand side dense vector expression.
2741  //**********************************************************************************************
2742 
2743  //**********************************************************************************************
//! Compilation switch for the type used to evaluate the left-hand side dense matrix operand.
/*! The switch is \a true if the matrix type is a computation whose element type equals the
    vector element type and is BLAS compatible, or if the matrix type requires an evaluation.
    It selects between \a MRT and \a MCT for the \a LT typedef of this class. */
2745  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2746  IsBLASCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
2747  //**********************************************************************************************
2748 
2749  //**********************************************************************************************
2751  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<MT>::value };
2752  //**********************************************************************************************
2753 
2754  //**********************************************************************************************
2756 
//! Helper structure for the SFINAE-based selection of the SMP assignment functions.
/*! The nested \a value is \a true if either operand has to be evaluated (see \a evaluateMatrix
    and \a evaluateVector). The template parameter \a T1 does not take part in the decision. */
2759  template< typename T1 >
2760  struct UseSMPAssign {
2761  enum : bool { value = ( evaluateMatrix || evaluateVector ) };
2762  };
2763  //**********************************************************************************************
2764 
2765  //**********************************************************************************************
2767 
2769  template< typename T1, typename T2, typename T3, typename T4 >
2770  struct UseBlasKernel {
2772  HasMutableDataAccess<T1>::value &&
2773  HasConstDataAccess<T2>::value &&
2774  HasConstDataAccess<T3>::value &&
2775  !IsDiagonal<T2>::value &&
2776  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2777  IsBLASCompatible< ElementType_<T1> >::value &&
2778  IsBLASCompatible< ElementType_<T2> >::value &&
2779  IsBLASCompatible< ElementType_<T3> >::value &&
2780  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
2781  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2782  !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
2783  };
2784  //**********************************************************************************************
2785 
2786  //**********************************************************************************************
2788 
//! Helper structure for the SFINAE-based selection of the vectorized default kernels.
/*! The nested \a value is \a true if optimized kernels are enabled, \a T2 is not a diagonal
    matrix, all three operand types are SIMD-enabled, the three element types and \a T4 are SIMD
    combinable, and SIMD addition and multiplication are available for the element types of
    \a T2 and \a T3. The kernels instantiate it as <VT1,MT1,VT2,ST2>, i.e. target vector,
    matrix operand, vector operand and scalar type. */
2791  template< typename T1, typename T2, typename T3, typename T4 >
2792  struct UseVectorizedDefaultKernel {
2793  enum : bool { value = useOptimizedKernels &&
2794  !IsDiagonal<T2>::value &&
2795  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2796  AreSIMDCombinable< ElementType_<T1>
2797  , ElementType_<T2>
2798  , ElementType_<T3>
2799  , T4 >::value &&
2800  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2801  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
2802  };
2803  //**********************************************************************************************
2804 
2805  public:
2806  //**Type definitions****************************************************************************
// Public type interface of the scaled matrix/vector multiplication expression
2807  typedef DVecScalarMultExpr<MVM,ST,false> This; //!< Type of this DVecScalarMultExpr instance.
2808  typedef MultTrait_<RES,ST> ResultType; //!< Result type of multiplying the product's result type by the scalar type.
2809  typedef TransposeType_<ResultType> TransposeType; //!< Transpose type of the result type.
2810  typedef ElementType_<ResultType> ElementType; //!< Element type of the result type.
2811  typedef SIMDTrait_<ElementType> SIMDType; //!< SIMD type associated with the element type.
2812  typedef const ElementType ReturnType; //!< Return type of the element access functions.
2813  typedef const ResultType CompositeType; //!< Composite type: the expression is represented by its evaluated result.
2814 
//! Type of the left-hand side operand: the wrapped dense matrix/dense vector multiplication.
2816  typedef const DMatDVecMultExpr<MT,VT> LeftOperand;
2817 
//! Type of the right-hand side operand: the scalar value.
2819  typedef ST RightOperand;
2820 
//! Type used to evaluate the matrix operand: \a MRT if \a evaluateMatrix is set, \a MCT otherwise.
2822  typedef IfTrue_< evaluateMatrix, const MRT, MCT > LT;
2823 
//! Type used to evaluate the vector operand: \a VRT if \a evaluateVector is set, \a VCT otherwise.
2825  typedef IfTrue_< evaluateVector, const VRT, VCT > RT;
2826  //**********************************************************************************************
2827 
2828  //**Compilation flags***************************************************************************
//! Compilation switch for the expression template evaluation strategy.
/*! \a true if the matrix is not diagonal, both operand types are SIMD-enabled, the matrix
    element type, the vector element type and the scalar type are SIMD combinable, and SIMD
    addition and multiplication are available for the two element types. */
2830  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
2831  MT::simdEnabled && VT::simdEnabled &&
2832  AreSIMDCombinable<MET,VET,ST>::value &&
2833  HasSIMDAdd<MET,VET>::value &&
2834  HasSIMDMult<MET,VET>::value };
2835 
//! Compilation switch for the expression template assignment strategy.
/*! \a true only if neither operand has to be evaluated and both operand types are themselves
    SMP assignable. */
2837  enum : bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2838  !evaluateVector && VT::smpAssignable };
2839  //**********************************************************************************************
2840 
2841  //**SIMD properties*****************************************************************************
//! Number of elements reported by SIMDTrait for the element type of this expression.
2843  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
2844  //**********************************************************************************************
2845 
2846  //**Constructor*********************************************************************************
2852  explicit inline DVecScalarMultExpr( const MVM& vector, ST scalar )
2853  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2854  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2855  {}
2856  //**********************************************************************************************
2857 
2858  //**Subscript operator**************************************************************************
2864  inline ReturnType operator[]( size_t index ) const {
2865  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2866  return vector_[index] * scalar_;
2867  }
2868  //**********************************************************************************************
2869 
2870  //**At function*********************************************************************************
2877  inline ReturnType at( size_t index ) const {
2878  if( index >= vector_.size() ) {
2879  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2880  }
2881  return (*this)[index];
2882  }
2883  //**********************************************************************************************
2884 
2885  //**Size function*******************************************************************************
2890  inline size_t size() const {
2891  return vector_.size();
2892  }
2893  //**********************************************************************************************
2894 
2895  //**Left operand access*************************************************************************
2900  inline LeftOperand leftOperand() const {
2901  return vector_;
2902  }
2903  //**********************************************************************************************
2904 
2905  //**Right operand access************************************************************************
2910  inline RightOperand rightOperand() const {
2911  return scalar_;
2912  }
2913  //**********************************************************************************************
2914 
2915  //**********************************************************************************************
2921  template< typename T >
2922  inline bool canAlias( const T* alias ) const {
2923  return vector_.canAlias( alias );
2924  }
2925  //**********************************************************************************************
2926 
2927  //**********************************************************************************************
2933  template< typename T >
2934  inline bool isAliased( const T* alias ) const {
2935  return vector_.isAliased( alias );
2936  }
2937  //**********************************************************************************************
2938 
2939  //**********************************************************************************************
2944  inline bool isAligned() const {
2945  return vector_.isAligned();
2946  }
2947  //**********************************************************************************************
2948 
2949  //**********************************************************************************************
2954  inline bool canSMPAssign() const noexcept {
2955  LeftOperand_<MVM> A( vector_.leftOperand() );
2956  return ( !BLAZE_BLAS_IS_PARALLEL ||
2957  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2958  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2959  ( size() > SMP_DMATDVECMULT_THRESHOLD );
2960  }
2961  //**********************************************************************************************
2962 
2963  private:
2964  //**Member variables****************************************************************************
2965  LeftOperand vector_; //!< Left-hand side dense vector of the multiplication expression.
2966  RightOperand scalar_; //!< Right-hand side scalar of the multiplication expression.
2967  //**********************************************************************************************
2968 
2969  //**Assignment to dense vectors*****************************************************************
2981  template< typename VT1 > // Type of the target dense vector
2982  friend inline void assign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
2983  {
2985 
2986  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2987 
2988  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
2989  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
2990 
2991  if( left.rows() == 0UL ) {
2992  return;
2993  }
2994  else if( left.columns() == 0UL ) {
2995  reset( ~lhs );
2996  return;
2997  }
2998 
2999  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3000  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3001 
3002  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3003  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3004  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3005  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3006 
3007  DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
3008  }
3009  //**********************************************************************************************
3010 
3011  //**Assignment to dense vectors (kernel selection)**********************************************
3022  template< typename VT1 // Type of the left-hand side target vector
3023  , typename MT1 // Type of the left-hand side matrix operand
3024  , typename VT2 // Type of the right-hand side vector operand
3025  , typename ST2 > // Type of the scalar value
3026  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3027  {
3028  if( ( IsDiagonal<MT1>::value ) ||
3029  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3030  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3031  selectSmallAssignKernel( y, A, x, scalar );
3032  else
3033  selectBlasAssignKernel( y, A, x, scalar );
3034  }
3035  //**********************************************************************************************
3036 
3037  //**Default assignment to dense vectors*********************************************************
3051  template< typename VT1 // Type of the left-hand side target vector
3052  , typename MT1 // Type of the left-hand side matrix operand
3053  , typename VT2 // Type of the right-hand side vector operand
3054  , typename ST2 > // Type of the scalar value
3055  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3056  selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3057  {
3058  y.assign( A * x * scalar );
3059  }
3060  //**********************************************************************************************
3061 
3062  //**Default assignment to dense vectors (small matrices)****************************************
3076  template< typename VT1 // Type of the left-hand side target vector
3077  , typename MT1 // Type of the left-hand side matrix operand
3078  , typename VT2 // Type of the right-hand side vector operand
3079  , typename ST2 > // Type of the scalar value
3080  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3081  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3082  {
3083  selectDefaultAssignKernel( y, A, x, scalar );
3084  }
3085  //**********************************************************************************************
3086 
3087  //**Vectorized default assignment to dense vectors (small matrices)*****************************
/*!\brief Vectorized assignment of a small scaled dense matrix-dense vector multiplication.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is enabled if UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> holds. The rows of
// \a A are processed in blocks of 8, 4, 3 and 2 rows, followed by a single trailing row. For
// every block the products A(i,j)*x[j] are accumulated in one SIMD register per row, reduced
// via sum(), multiplied by \a scalar and written to \a y (overwriting the previous content).
// Columns that do not fill a complete SIMD vector are handled by a scalar loop.
*/
3101  template< typename VT1 // Type of the left-hand side target vector
3102  , typename MT1 // Type of the left-hand side matrix operand
3103  , typename VT2 // Type of the right-hand side vector operand
3104  , typename ST2 > // Type of the scalar value
3105  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3106  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3107  {
3108  const size_t M( A.rows() );
3109  const size_t N( A.columns() );
3110 
      // A scalar remainder loop is required unless both the matrix and the vector are padded
3111  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3112 
3113  size_t i( 0UL );
3114 
      // Blocks of 8 rows
3115  for( ; (i+8UL) <= M; i+=8UL )
3116  {
      // First column: 0 in general; for upper matrices the row index i (i+1 if strictly upper),
      // masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE
      // (assuming SIMDSIZE is a power of two -- TODO confirm)
3117  const size_t jbegin( ( IsUpper<MT1>::value )
3118  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3119  :( 0UL ) );
      // End column (exclusive): N in general; for lower matrices one past the last row index
      // of the block (reduced by one if strictly lower)
3120  const size_t jend( ( IsLower<MT1>::value )
3121  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3122  :( N ) );
3123  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3124 
      // End of the SIMD loop: jend rounded down to a multiple of SIMDSIZE if a remainder loop
      // is needed, otherwise jend itself (presumably relying on the operands' padding -- confirm)
3125  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3126  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3127 
      // NOTE(review): the accumulators are used in 'xmm = xmm + ...' right after default
      // construction; presumably SIMDType's default constructor zero-initializes -- confirm
3128  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3129  size_t j( jbegin );
3130 
      // SIMD loop: x is loaded once per step and reused for all 8 rows
3131  for( ; j<jpos; j+=SIMDSIZE ) {
3132  const SIMDType x1( x.load(j) );
3133  xmm1 = xmm1 + A.load(i ,j) * x1;
3134  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3135  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3136  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3137  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
3138  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
3139  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
3140  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
3141  }
3142 
      // Reduce every accumulator, scale it and overwrite the target element
3143  y[i ] = sum( xmm1 ) * scalar;
3144  y[i+1UL] = sum( xmm2 ) * scalar;
3145  y[i+2UL] = sum( xmm3 ) * scalar;
3146  y[i+3UL] = sum( xmm4 ) * scalar;
3147  y[i+4UL] = sum( xmm5 ) * scalar;
3148  y[i+5UL] = sum( xmm6 ) * scalar;
3149  y[i+6UL] = sum( xmm7 ) * scalar;
3150  y[i+7UL] = sum( xmm8 ) * scalar;
3151 
      // Scalar tail for the columns [jpos,jend); only executed if 'remainder' is set
3152  for( ; remainder && j<jend; ++j ) {
3153  y[i ] += A(i ,j) * x[j] * scalar;
3154  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3155  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3156  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3157  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3158  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3159  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3160  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3161  }
3162  }
3163 
      // Blocks of 4 rows (same scheme as above)
3164  for( ; (i+4UL) <= M; i+=4UL )
3165  {
3166  const size_t jbegin( ( IsUpper<MT1>::value )
3167  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3168  :( 0UL ) );
3169  const size_t jend( ( IsLower<MT1>::value )
3170  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3171  :( N ) );
3172  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3173 
3174  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3175  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3176 
3177  SIMDType xmm1, xmm2, xmm3, xmm4;
3178  size_t j( jbegin );
3179 
3180  for( ; j<jpos; j+=SIMDSIZE ) {
3181  const SIMDType x1( x.load(j) );
3182  xmm1 = xmm1 + A.load(i ,j) * x1;
3183  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3184  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3185  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3186  }
3187 
3188  y[i ] = sum( xmm1 ) * scalar;
3189  y[i+1UL] = sum( xmm2 ) * scalar;
3190  y[i+2UL] = sum( xmm3 ) * scalar;
3191  y[i+3UL] = sum( xmm4 ) * scalar;
3192 
3193  for( ; remainder && j<jend; ++j ) {
3194  y[i ] += A(i ,j) * x[j] * scalar;
3195  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3196  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3197  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3198  }
3199  }
3200 
      // Blocks of 3 rows (same scheme as above)
3201  for( ; (i+3UL) <= M; i+=3UL )
3202  {
3203  const size_t jbegin( ( IsUpper<MT1>::value )
3204  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3205  :( 0UL ) );
3206  const size_t jend( ( IsLower<MT1>::value )
3207  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3208  :( N ) );
3209  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3210 
3211  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3212  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3213 
3214  SIMDType xmm1, xmm2, xmm3;
3215  size_t j( jbegin );
3216 
3217  for( ; j<jpos; j+=SIMDSIZE ) {
3218  const SIMDType x1( x.load(j) );
3219  xmm1 = xmm1 + A.load(i ,j) * x1;
3220  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3221  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3222  }
3223 
3224  y[i ] = sum( xmm1 ) * scalar;
3225  y[i+1UL] = sum( xmm2 ) * scalar;
3226  y[i+2UL] = sum( xmm3 ) * scalar;
3227 
3228  for( ; remainder && j<jend; ++j ) {
3229  y[i ] += A(i ,j) * x[j] * scalar;
3230  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3231  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3232  }
3233  }
3234 
      // Blocks of 2 rows (same scheme as above)
3235  for( ; (i+2UL) <= M; i+=2UL )
3236  {
3237  const size_t jbegin( ( IsUpper<MT1>::value )
3238  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3239  :( 0UL ) );
3240  const size_t jend( ( IsLower<MT1>::value )
3241  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3242  :( N ) );
3243  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3244 
3245  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3246  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3247 
3248  SIMDType xmm1, xmm2;
3249  size_t j( jbegin );
3250 
3251  for( ; j<jpos; j+=SIMDSIZE ) {
3252  const SIMDType x1( x.load(j) );
3253  xmm1 = xmm1 + A.load(i ,j) * x1;
3254  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3255  }
3256 
3257  y[i ] = sum( xmm1 ) * scalar;
3258  y[i+1UL] = sum( xmm2 ) * scalar;
3259 
3260  for( ; remainder && j<jend; ++j ) {
3261  y[i ] += A(i ,j) * x[j] * scalar;
3262  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3263  }
3264  }
3265 
      // At most one row is left at this point
3266  if( i < M )
3267  {
3268  const size_t jbegin( ( IsUpper<MT1>::value )
3269  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3270  :( 0UL ) );
3271  const size_t jend( ( IsLower<MT1>::value )
3272  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3273  :( N ) );
3274  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3275 
3276  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3277  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3278 
3279  SIMDType xmm1;
3280  size_t j( jbegin );
3281 
3282  for( ; j<jpos; j+=SIMDSIZE ) {
3283  xmm1 = xmm1 + A.load(i,j) * x.load(j);
3284  }
3285 
3286  y[i] = sum( xmm1 ) * scalar;
3287 
3288  for( ; remainder && j<jend; ++j ) {
3289  y[i] += A(i,j) * x[j] * scalar;
3290  }
3291  }
3292  }
3293  //**********************************************************************************************
3294 
3295  //**Default assignment to dense vectors (large matrices)****************************************
3309  template< typename VT1 // Type of the left-hand side target vector
3310  , typename MT1 // Type of the left-hand side matrix operand
3311  , typename VT2 // Type of the right-hand side vector operand
3312  , typename ST2 > // Type of the scalar value
3313  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3314  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3315  {
3316  selectDefaultAssignKernel( y, A, x, scalar );
3317  }
3318  //**********************************************************************************************
3319 
3320  //**Vectorized default assignment to dense vectors (large matrices)*****************************
/*!\brief Vectorized assignment of a large scaled dense matrix-dense vector multiplication.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is enabled if UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> holds. The target
// vector is reset first; afterwards the rows of \a A are processed in blocks of 8, 4 and 2
// rows, followed by a single trailing row. Within a block the columns are traversed in steps
// of 4, 2 and 1 SIMD vectors plus a scalar tail, the partial sums are added directly to \a y,
// and finally the elements of the block are multiplied by \a scalar.
*/
3334  template< typename VT1 // Type of the left-hand side target vector
3335  , typename MT1 // Type of the left-hand side matrix operand
3336  , typename VT2 // Type of the right-hand side vector operand
3337  , typename ST2 > // Type of the scalar value
3338  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3339  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3340  {
3341  const size_t M( A.rows() );
3342  const size_t N( A.columns() );
3343 
      // A scalar remainder loop is required unless both the matrix and the vector are padded
3344  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3345 
      // All following updates of y use '+=', hence the target is reset up front
3346  reset( y );
3347 
3348  size_t i( 0UL );
3349 
      // Blocks of 8 rows
3350  for( ; (i+8UL) <= M; i+=8UL )
3351  {
      // First column: 0 in general; for upper matrices the row index i (i+1 if strictly upper),
      // masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE
      // (assuming SIMDSIZE is a power of two -- TODO confirm)
3352  const size_t jbegin( ( IsUpper<MT1>::value )
3353  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3354  :( 0UL ) );
      // End column (exclusive): N in general; for lower matrices one past the last row index
      // of the block (reduced by one if strictly lower)
3355  const size_t jend( ( IsLower<MT1>::value )
3356  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3357  :( N ) );
3358  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3359 
      // End of the SIMD loops: jend rounded down to a multiple of SIMDSIZE if a remainder loop
      // is needed, otherwise jend itself (presumably relying on the operands' padding -- confirm)
3360  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3361  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3362 
3363  size_t j( jbegin );
3364 
      // Four SIMD vectors per step; the condition keeps the last load (at j+3*SIMDSIZE) below jpos
3365  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3366  const size_t j1( j+SIMDSIZE );
3367  const size_t j2( j+SIMDSIZE*2UL );
3368  const size_t j3( j+SIMDSIZE*3UL );
3369  const SIMDType x1( x.load(j ) );
3370  const SIMDType x2( x.load(j1) );
3371  const SIMDType x3( x.load(j2) );
3372  const SIMDType x4( x.load(j3) );
3373  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3374  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3375  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3376  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3377  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3378  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3379  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3380  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
3381  }
3382 
      // Two SIMD vectors per step
3383  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3384  const size_t j1( j+SIMDSIZE );
3385  const SIMDType x1( x.load(j ) );
3386  const SIMDType x2( x.load(j1) );
3387  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3388  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3389  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3390  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3391  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3392  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3393  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3394  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
3395  }
3396 
      // One SIMD vector per step
3397  for( ; j<jpos; j+=SIMDSIZE ) {
3398  const SIMDType x1( x.load(j) );
3399  y[i ] += sum( A.load(i ,j) * x1 );
3400  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3401  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3402  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3403  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
3404  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
3405  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
3406  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
3407  }
3408 
      // Scalar tail for the columns [jpos,jend); only executed if 'remainder' is set
3409  for( ; remainder && j<jend; ++j ) {
3410  y[i ] += A(i ,j) * x[j];
3411  y[i+1UL] += A(i+1UL,j) * x[j];
3412  y[i+2UL] += A(i+2UL,j) * x[j];
3413  y[i+3UL] += A(i+3UL,j) * x[j];
3414  y[i+4UL] += A(i+4UL,j) * x[j];
3415  y[i+5UL] += A(i+5UL,j) * x[j];
3416  y[i+6UL] += A(i+6UL,j) * x[j];
3417  y[i+7UL] += A(i+7UL,j) * x[j];
3418  }
3419 
      // The scaling is applied once per element after the accumulation
3420  y[i ] *= scalar;
3421  y[i+1UL] *= scalar;
3422  y[i+2UL] *= scalar;
3423  y[i+3UL] *= scalar;
3424  y[i+4UL] *= scalar;
3425  y[i+5UL] *= scalar;
3426  y[i+6UL] *= scalar;
3427  y[i+7UL] *= scalar;
3428  }
3429 
      // Blocks of 4 rows (same scheme as above)
3430  for( ; (i+4UL) <= M; i+=4UL )
3431  {
3432  const size_t jbegin( ( IsUpper<MT1>::value )
3433  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3434  :( 0UL ) );
3435  const size_t jend( ( IsLower<MT1>::value )
3436  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3437  :( N ) );
3438  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3439 
3440  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3441  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3442 
3443  size_t j( jbegin );
3444 
3445  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3446  const size_t j1( j+SIMDSIZE );
3447  const size_t j2( j+SIMDSIZE*2UL );
3448  const size_t j3( j+SIMDSIZE*3UL );
3449  const SIMDType x1( x.load(j ) );
3450  const SIMDType x2( x.load(j1) );
3451  const SIMDType x3( x.load(j2) );
3452  const SIMDType x4( x.load(j3) );
3453  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3454  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3455  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3456  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3457  }
3458 
3459  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3460  const size_t j1( j+SIMDSIZE );
3461  const SIMDType x1( x.load(j ) );
3462  const SIMDType x2( x.load(j1) );
3463  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3464  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3465  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3466  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3467  }
3468 
3469  for( ; j<jpos; j+=SIMDSIZE ) {
3470  const SIMDType x1( x.load(j) );
3471  y[i ] += sum( A.load(i ,j) * x1 );
3472  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3473  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3474  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3475  }
3476 
3477  for( ; remainder && j<jend; ++j ) {
3478  y[i ] += A(i ,j) * x[j];
3479  y[i+1UL] += A(i+1UL,j) * x[j];
3480  y[i+2UL] += A(i+2UL,j) * x[j];
3481  y[i+3UL] += A(i+3UL,j) * x[j];
3482  }
3483 
3484  y[i ] *= scalar;
3485  y[i+1UL] *= scalar;
3486  y[i+2UL] *= scalar;
3487  y[i+3UL] *= scalar;
3488  }
3489 
      // Blocks of 2 rows (same scheme as above)
3490  for( ; (i+2UL) <= M; i+=2UL )
3491  {
3492  const size_t jbegin( ( IsUpper<MT1>::value )
3493  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3494  :( 0UL ) );
3495  const size_t jend( ( IsLower<MT1>::value )
3496  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3497  :( N ) );
3498  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3499 
3500  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3501  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3502 
3503  size_t j( jbegin );
3504 
3505  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3506  const size_t j1( j+SIMDSIZE );
3507  const size_t j2( j+SIMDSIZE*2UL );
3508  const size_t j3( j+SIMDSIZE*3UL );
3509  const SIMDType x1( x.load(j ) );
3510  const SIMDType x2( x.load(j1) );
3511  const SIMDType x3( x.load(j2) );
3512  const SIMDType x4( x.load(j3) );
3513  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3514  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3515  }
3516 
3517  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3518  const size_t j1( j+SIMDSIZE );
3519  const SIMDType x1( x.load(j ) );
3520  const SIMDType x2( x.load(j1) );
3521  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3522  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3523  }
3524 
3525  for( ; j<jpos; j+=SIMDSIZE ) {
3526  const SIMDType x1( x.load(j) );
3527  y[i ] += sum( A.load(i ,j) * x1 );
3528  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3529  }
3530 
3531  for( ; remainder && j<jend; ++j ) {
3532  y[i ] += A(i ,j) * x[j];
3533  y[i+1UL] += A(i+1UL,j) * x[j];
3534  }
3535 
3536  y[i ] *= scalar;
3537  y[i+1UL] *= scalar;
3538  }
3539 
      // At most one row is left at this point
3540  if( i < M )
3541  {
3542  const size_t jbegin( ( IsUpper<MT1>::value )
3543  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3544  :( 0UL ) );
3545  const size_t jend( ( IsLower<MT1>::value )
3546  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3547  :( N ) );
3548  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3549 
3550  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3551  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3552 
3553  size_t j( jbegin );
3554 
3555  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
3556  const size_t j1( j+SIMDSIZE );
3557  const size_t j2( j+SIMDSIZE*2UL );
3558  const size_t j3( j+SIMDSIZE*3UL );
3559  const SIMDType x1( x.load(j ) );
3560  const SIMDType x2( x.load(j1) );
3561  const SIMDType x3( x.load(j2) );
3562  const SIMDType x4( x.load(j3) );
3563  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3564  }
3565 
3566  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
3567  const size_t j1( j+SIMDSIZE );
3568  const SIMDType x1( x.load(j ) );
3569  const SIMDType x2( x.load(j1) );
3570  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3571  }
3572 
3573  for( ; j<jpos; j+=SIMDSIZE ) {
3574  const SIMDType x1( x.load(j) );
3575  y[i] += sum( A.load(i,j) * x1 );
3576  }
3577 
3578  for( ; remainder && j<jend; ++j ) {
3579  y[i] += A(i,j) * x[j];
3580  }
3581 
3582  y[i] *= scalar;
3583  }
3584  }
3585  //**********************************************************************************************
3586 
3587  //**BLAS-based assignment to dense vectors (default)********************************************
3601  template< typename VT1 // Type of the left-hand side target vector
3602  , typename MT1 // Type of the left-hand side matrix operand
3603  , typename VT2 // Type of the right-hand side vector operand
3604  , typename ST2 > // Type of the scalar value
3605  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3606  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3607  {
3608  selectLargeAssignKernel( y, A, x, scalar );
3609  }
3610  //**********************************************************************************************
3611 
3612  //**BLAS-based assignment to dense vectors******************************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
/*!\brief BLAS-based assignment of a scaled dense matrix-dense vector multiplication.
//
// \tparam VT1 Type of the left-hand side target vector.
// \tparam MT1 Type of the left-hand side matrix operand.
// \tparam VT2 Type of the right-hand side vector operand.
// \tparam ST2 Type of the scalar value.
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
// \return void
//
// Enabled if UseBlasKernel<VT1,MT1,VT2,ST2> holds. Non-triangular matrices are handled by a
// single gemv() call with \a scalar and zero as the two scalar arguments. For triangular
// matrices the scaled vector is first assigned to \a y and trmv() is then applied to \a y.
*/
template< typename VT1, typename MT1, typename VT2, typename ST2 >
static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
   selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
{
   using ET = ElementType_<VT1>;

   if( !IsTriangular<MT1>::value ) {
      gemv( y, A, x, ET(scalar), ET(0) );
      return;
   }

   // Triangular case: trmv() operates on y, which therefore has to hold scalar*x first
   assign( y, scalar * x );
   trmv( y, A, ( IsLower<MT1>::value ? CblasLower : CblasUpper ) );
}
#endif
3645  //**********************************************************************************************
3646 
3647  //**Assignment to sparse vectors****************************************************************
3659  template< typename VT1 > // Type of the target sparse vector
3660  friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3661  {
3663 
3666  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
3667 
3668  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3669 
3670  const ResultType tmp( serial( rhs ) );
3671  assign( ~lhs, tmp );
3672  }
3673  //**********************************************************************************************
3674 
3675  //**Addition assignment to dense vectors********************************************************
3687  template< typename VT1 > // Type of the target dense vector
3688  friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3689  {
3691 
3692  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3693 
3694  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
3695  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
3696 
3697  if( left.rows() == 0UL || left.columns() == 0UL ) {
3698  return;
3699  }
3700 
3701  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3702  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3703 
3704  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3705  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3706  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3707  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3708 
3709  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
3710  }
3711  //**********************************************************************************************
3712 
3713  //**Addition assignment to dense vectors (kernel selection)*************************************
3724  template< typename VT1 // Type of the left-hand side target vector
3725  , typename MT1 // Type of the left-hand side matrix operand
3726  , typename VT2 // Type of the right-hand side vector operand
3727  , typename ST2 > // Type of the scalar value
3728  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3729  {
3730  if( ( IsDiagonal<MT1>::value ) ||
3731  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3732  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3733  selectSmallAddAssignKernel( y, A, x, scalar );
3734  else
3735  selectBlasAddAssignKernel( y, A, x, scalar );
3736  }
3737  //**********************************************************************************************
3738 
3739  //**Default addition assignment to dense vectors************************************************
/*!\brief Default addition assignment of a scaled dense matrix-dense vector multiplication.
//
// \tparam VT1 Type of the left-hand side target vector.
// \tparam MT1 Type of the left-hand side matrix operand.
// \tparam VT2 Type of the right-hand side vector operand.
// \tparam ST2 Type of the scalar value.
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
// \return void
//
// Passes the expression \a A * \a x * \a scalar to the addAssign() member of the target vector.
*/
template< typename VT1, typename MT1, typename VT2, typename ST2 >
static inline void
   selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
{
   y.addAssign( A * x * scalar );
}
3761  //**********************************************************************************************
3762 
3763  //**Default addition assignment to dense vectors (small matrices)*******************************
3777  template< typename VT1 // Type of the left-hand side target vector
3778  , typename MT1 // Type of the left-hand side matrix operand
3779  , typename VT2 // Type of the right-hand side vector operand
3780  , typename ST2 > // Type of the scalar value
3781  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3782  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3783  {
3784  selectDefaultAddAssignKernel( y, A, x, scalar );
3785  }
3786  //**********************************************************************************************
3787 
3788  //**Vectorized default addition assignment to dense vectors (small matrices)********************
/*!\brief Vectorized addition assignment of a small scaled dense matrix-dense vector
//        multiplication.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is enabled if UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> holds. The rows of
// \a A are processed in blocks of 8, 4, 3 and 2 rows, followed by a single trailing row. For
// every block the products A(i,j)*x[j] are accumulated in one SIMD register per row, reduced
// via sum(), multiplied by \a scalar and added to the existing content of \a y. Columns that
// do not fill a complete SIMD vector are handled by a scalar loop.
*/
3802  template< typename VT1 // Type of the left-hand side target vector
3803  , typename MT1 // Type of the left-hand side matrix operand
3804  , typename VT2 // Type of the right-hand side vector operand
3805  , typename ST2 > // Type of the scalar value
3806  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3807  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3808  {
3809  const size_t M( A.rows() );
3810  const size_t N( A.columns() );
3811 
      // A scalar remainder loop is required unless both the matrix and the vector are padded
3812  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3813 
3814  size_t i( 0UL );
3815 
      // Blocks of 8 rows
3816  for( ; (i+8UL) <= M; i+=8UL )
3817  {
      // First column: 0 in general; for upper matrices the row index i (i+1 if strictly upper),
      // masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE
      // (assuming SIMDSIZE is a power of two -- TODO confirm)
3818  const size_t jbegin( ( IsUpper<MT1>::value )
3819  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3820  :( 0UL ) );
      // End column (exclusive): N in general; for lower matrices one past the last row index
      // of the block (reduced by one if strictly lower)
3821  const size_t jend( ( IsLower<MT1>::value )
3822  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3823  :( N ) );
3824  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3825 
      // End of the SIMD loop: jend rounded down to a multiple of SIMDSIZE if a remainder loop
      // is needed, otherwise jend itself (presumably relying on the operands' padding -- confirm)
3826  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3827  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3828 
      // NOTE(review): the accumulators are used in 'xmm = xmm + ...' right after default
      // construction; presumably SIMDType's default constructor zero-initializes -- confirm
3829  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3830  size_t j( jbegin );
3831 
      // SIMD loop: x is loaded once per step and reused for all 8 rows
3832  for( ; j<jpos; j+=SIMDSIZE ) {
3833  const SIMDType x1( x.load(j) );
3834  xmm1 = xmm1 + A.load(i ,j) * x1;
3835  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3836  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3837  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3838  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
3839  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
3840  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
3841  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
3842  }
3843 
      // Reduce every accumulator, scale it and add it to the target element
3844  y[i ] += sum( xmm1 ) * scalar;
3845  y[i+1UL] += sum( xmm2 ) * scalar;
3846  y[i+2UL] += sum( xmm3 ) * scalar;
3847  y[i+3UL] += sum( xmm4 ) * scalar;
3848  y[i+4UL] += sum( xmm5 ) * scalar;
3849  y[i+5UL] += sum( xmm6 ) * scalar;
3850  y[i+6UL] += sum( xmm7 ) * scalar;
3851  y[i+7UL] += sum( xmm8 ) * scalar;
3852 
      // Scalar tail for the columns [jpos,jend); only executed if 'remainder' is set
3853  for( ; remainder && j<jend; ++j ) {
3854  y[i ] += A(i ,j) * x[j] * scalar;
3855  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3856  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3857  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3858  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3859  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3860  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3861  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3862  }
3863  }
3864 
      // Blocks of 4 rows (same scheme as above)
3865  for( ; (i+4UL) <= M; i+=4UL )
3866  {
3867  const size_t jbegin( ( IsUpper<MT1>::value )
3868  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3869  :( 0UL ) );
3870  const size_t jend( ( IsLower<MT1>::value )
3871  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3872  :( N ) );
3873  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3874 
3875  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3876  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3877 
3878  SIMDType xmm1, xmm2, xmm3, xmm4;
3879  size_t j( jbegin );
3880 
3881  for( ; j<jpos; j+=SIMDSIZE ) {
3882  const SIMDType x1( x.load(j) );
3883  xmm1 = xmm1 + A.load(i ,j) * x1;
3884  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3885  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3886  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3887  }
3888 
3889  y[i ] += sum( xmm1 ) * scalar;
3890  y[i+1UL] += sum( xmm2 ) * scalar;
3891  y[i+2UL] += sum( xmm3 ) * scalar;
3892  y[i+3UL] += sum( xmm4 ) * scalar;
3893 
3894  for( ; remainder && j<jend; ++j ) {
3895  y[i ] += A(i ,j) * x[j] * scalar;
3896  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3897  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3898  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3899  }
3900  }
3901 
      // Blocks of 3 rows (same scheme as above)
3902  for( ; (i+3UL) <= M; i+=3UL )
3903  {
3904  const size_t jbegin( ( IsUpper<MT1>::value )
3905  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3906  :( 0UL ) );
3907  const size_t jend( ( IsLower<MT1>::value )
3908  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3909  :( N ) );
3910  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3911 
3912  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3913  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3914 
3915  SIMDType xmm1, xmm2, xmm3;
3916  size_t j( jbegin );
3917 
3918  for( ; j<jpos; j+=SIMDSIZE ) {
3919  const SIMDType x1( x.load(j) );
3920  xmm1 = xmm1 + A.load(i ,j) * x1;
3921  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3922  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3923  }
3924 
3925  y[i ] += sum( xmm1 ) * scalar;
3926  y[i+1UL] += sum( xmm2 ) * scalar;
3927  y[i+2UL] += sum( xmm3 ) * scalar;
3928 
3929  for( ; remainder && j<jend; ++j ) {
3930  y[i ] += A(i ,j) * x[j] * scalar;
3931  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3932  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3933  }
3934  }
3935 
      // Blocks of 2 rows (same scheme as above)
3936  for( ; (i+2UL) <= M; i+=2UL )
3937  {
3938  const size_t jbegin( ( IsUpper<MT1>::value )
3939  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3940  :( 0UL ) );
3941  const size_t jend( ( IsLower<MT1>::value )
3942  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3943  :( N ) );
3944  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3945 
3946  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3947  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3948 
3949  SIMDType xmm1, xmm2;
3950  size_t j( jbegin );
3951 
3952  for( ; j<jpos; j+=SIMDSIZE ) {
3953  const SIMDType x1( x.load(j) );
3954  xmm1 = xmm1 + A.load(i ,j) * x1;
3955  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3956  }
3957 
3958  y[i ] += sum( xmm1 ) * scalar;
3959  y[i+1UL] += sum( xmm2 ) * scalar;
3960 
3961  for( ; remainder && j<jend; ++j ) {
3962  y[i ] += A(i ,j) * x[j] * scalar;
3963  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3964  }
3965  }
3966 
      // At most one row is left at this point
3967  if( i < M )
3968  {
3969  const size_t jbegin( ( IsUpper<MT1>::value )
3970  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
3971  :( 0UL ) );
3972  const size_t jend( ( IsLower<MT1>::value )
3973  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3974  :( N ) );
3975  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3976 
3977  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3978  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
3979 
3980  SIMDType xmm1;
3981  size_t j( jbegin );
3982 
3983  for( ; j<jpos; j+=SIMDSIZE ) {
3984  xmm1 = xmm1 + A.load(i,j) * x.load(j);
3985  }
3986 
3987  y[i] += sum( xmm1 ) * scalar;
3988 
3989  for( ; remainder && j<jend; ++j ) {
3990  y[i] += A(i,j) * x[j] * scalar;
3991  }
3992  }
3993  }
3994  //**********************************************************************************************
3995 
3996  //**Default addition assignment to dense vectors (large matrices)*******************************
4010  template< typename VT1 // Type of the left-hand side target vector
4011  , typename MT1 // Type of the left-hand side matrix operand
4012  , typename VT2 // Type of the right-hand side vector operand
4013  , typename ST2 > // Type of the scalar value
4014  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4015  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4016  {
4017  selectDefaultAddAssignKernel( y, A, x, scalar );
4018  }
4019  //**********************************************************************************************
4020 
4021  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized addition assignment kernel for large matrices: y += A * x * scalar.
//
// This overload is enabled (via EnableIf_) when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2>
// is fulfilled. The rows of A are traversed in blocks of 8, 4 and 2 rows, followed by a
// single trailing row if one is left. For every row block the columns are traversed with
// 4, 2 and 1 SIMD vectors per iteration; each partial product is reduced with sum(), scaled
// and added to the matching element of y right away.
//
//   y      - target vector, updated in place
//   A      - matrix operand (accessed via A.load(i,j) and A(i,j))
//   x      - vector operand (accessed via x.load(j) and x[j])
//   scalar - factor every reduced partial sum is multiplied with
4035  template< typename VT1 // Type of the left-hand side target vector
4036  , typename MT1 // Type of the left-hand side matrix operand
4037  , typename VT2 // Type of the right-hand side vector operand
4038  , typename ST2 > // Type of the scalar value
4039  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4040  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4041  {
4042  const size_t M( A.rows() );
4043  const size_t N( A.columns() );
4044
      // Element-wise clean-up loops are required unless both the matrix and the vector type
      // are padded.
4045  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4046
4047  size_t i( 0UL );
4048
      // Blocks of eight rows, as long as at least eight rows are left.
4049  for( ; (i+8UL) <= M; i+=8UL )
4050  {
      // Column range [jbegin,jend) visited for this row block. Upper matrices start at the
      // block's first row index (plus one if strictly upper), masked with size_t(-SIMDSIZE);
      // lower matrices end one past the block's last row index (one less if strictly lower).
      // All other matrices use the full range [0,N).
      // NOTE(review): presumably the skipped columns hold only zeros for these matrix kinds.
4051  const size_t jbegin( ( IsUpper<MT1>::value )
4052  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4053  :( 0UL ) );
4054  const size_t jend( ( IsLower<MT1>::value )
4055  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4056  :( N ) );
4057  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4058
      // End of the SIMD loops: with a remainder it is jend masked with size_t(-SIMDSIZE)
      // (asserted to equal jend rounded down to a multiple of SIMDSIZE), otherwise jend.
4059  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4060  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4061
4062  size_t j( jbegin );
4063
      // Four SIMD vectors of x per iteration, shared by all eight rows.
4064  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4065  const size_t j1( j+SIMDSIZE );
4066  const size_t j2( j+SIMDSIZE*2UL );
4067  const size_t j3( j+SIMDSIZE*3UL );
4068  const SIMDType x1( x.load(j ) );
4069  const SIMDType x2( x.load(j1) );
4070  const SIMDType x3( x.load(j2) );
4071  const SIMDType x4( x.load(j3) );
4072  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4073  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4074  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4075  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4076  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4077  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4078  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4079  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4080  }
4081
      // Two SIMD vectors of x per iteration.
4082  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4083  const size_t j1( j+SIMDSIZE );
4084  const SIMDType x1( x.load(j ) );
4085  const SIMDType x2( x.load(j1) );
4086  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4087  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4088  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4089  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4090  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4091  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4092  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4093  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4094  }
4095
      // One SIMD vector of x per iteration.
4096  for( ; j<jpos; j+=SIMDSIZE ) {
4097  const SIMDType x1( x.load(j) );
4098  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4099  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4100  y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
4101  y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
4102  y[i+4UL] += sum( A.load(i+4UL,j) * x1 ) * scalar;
4103  y[i+5UL] += sum( A.load(i+5UL,j) * x1 ) * scalar;
4104  y[i+6UL] += sum( A.load(i+6UL,j) * x1 ) * scalar;
4105  y[i+7UL] += sum( A.load(i+7UL,j) * x1 ) * scalar;
4106  }
4107
      // Element-wise handling of the columns left up to jend (only if 'remainder' is set).
4108  for( ; remainder && j<jend; ++j ) {
4109  y[i ] += A(i ,j) * x[j] * scalar;
4110  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4111  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4112  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4113  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4114  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4115  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4116  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
4117  }
4118  }
4119
      // Blocks of four rows; same column scheme as above.
4120  for( ; (i+4UL) <= M; i+=4UL )
4121  {
4122  const size_t jbegin( ( IsUpper<MT1>::value )
4123  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4124  :( 0UL ) );
4125  const size_t jend( ( IsLower<MT1>::value )
4126  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4127  :( N ) );
4128  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4129
4130  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4131  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4132
4133  size_t j( jbegin );
4134
      // Four SIMD vectors of x per iteration.
4135  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4136  const size_t j1( j+SIMDSIZE );
4137  const size_t j2( j+SIMDSIZE*2UL );
4138  const size_t j3( j+SIMDSIZE*3UL );
4139  const SIMDType x1( x.load(j ) );
4140  const SIMDType x2( x.load(j1) );
4141  const SIMDType x3( x.load(j2) );
4142  const SIMDType x4( x.load(j3) );
4143  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4144  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4145  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4146  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4147  }
4148
      // Two SIMD vectors of x per iteration.
4149  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4150  const size_t j1( j+SIMDSIZE );
4151  const SIMDType x1( x.load(j ) );
4152  const SIMDType x2( x.load(j1) );
4153  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4154  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4155  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4156  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4157  }
4158
      // One SIMD vector of x per iteration.
4159  for( ; j<jpos; j+=SIMDSIZE ) {
4160  const SIMDType x1( x.load(j) );
4161  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4162  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4163  y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
4164  y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
4165  }
4166
      // Element-wise clean-up of the columns left up to jend.
4167  for( ; remainder && j<jend; ++j ) {
4168  y[i ] += A(i ,j) * x[j] * scalar;
4169  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4170  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4171  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4172  }
4173  }
4174
      // Blocks of two rows; same column scheme as above.
4175  for( ; (i+2UL) <= M; i+=2UL )
4176  {
4177  const size_t jbegin( ( IsUpper<MT1>::value )
4178  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4179  :( 0UL ) );
4180  const size_t jend( ( IsLower<MT1>::value )
4181  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4182  :( N ) );
4183  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4184
4185  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4186  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4187
4188  size_t j( jbegin );
4189
      // Four SIMD vectors of x per iteration.
4190  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4191  const size_t j1( j+SIMDSIZE );
4192  const size_t j2( j+SIMDSIZE*2UL );
4193  const size_t j3( j+SIMDSIZE*3UL );
4194  const SIMDType x1( x.load(j ) );
4195  const SIMDType x2( x.load(j1) );
4196  const SIMDType x3( x.load(j2) );
4197  const SIMDType x4( x.load(j3) );
4198  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4199  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4200  }
4201
      // Two SIMD vectors of x per iteration.
4202  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4203  const size_t j1( j+SIMDSIZE );
4204  const SIMDType x1( x.load(j ) );
4205  const SIMDType x2( x.load(j1) );
4206  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4207  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4208  }
4209
      // One SIMD vector of x per iteration.
4210  for( ; j<jpos; j+=SIMDSIZE ) {
4211  const SIMDType x1( x.load(j) );
4212  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4213  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4214  }
4215
      // Element-wise clean-up of the columns left up to jend.
4216  for( ; remainder && j<jend; ++j ) {
4217  y[i ] += A(i ,j) * x[j] * scalar;
4218  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4219  }
4220  }
4221
      // At most one row is left at this point; it is handled with the same column scheme.
4222  if( i < M )
4223  {
4224  const size_t jbegin( ( IsUpper<MT1>::value )
4225  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4226  :( 0UL ) );
4227  const size_t jend( ( IsLower<MT1>::value )
4228  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4229  :( N ) );
4230  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4231
4232  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4233  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4234
4235  size_t j( jbegin );
4236
      // Four SIMD vectors of x per iteration.
4237  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4238  const size_t j1( j+SIMDSIZE );
4239  const size_t j2( j+SIMDSIZE*2UL );
4240  const size_t j3( j+SIMDSIZE*3UL );
4241  const SIMDType x1( x.load(j ) );
4242  const SIMDType x2( x.load(j1) );
4243  const SIMDType x3( x.load(j2) );
4244  const SIMDType x4( x.load(j3) );
4245  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4246  }
4247
      // Two SIMD vectors of x per iteration.
4248  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4249  const size_t j1( j+SIMDSIZE );
4250  const SIMDType x1( x.load(j ) );
4251  const SIMDType x2( x.load(j1) );
4252  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4253  }
4254
      // One SIMD vector of x per iteration.
4255  for( ; j<jpos; j+=SIMDSIZE ) {
4256  const SIMDType x1( x.load(j) );
4257  y[i] += sum( A.load(i,j) * x1 ) * scalar;
4258  }
4259
      // Element-wise clean-up of the columns left up to jend.
4260  for( ; remainder && j<jend; ++j ) {
4261  y[i] += A(i,j) * x[j] * scalar;
4262  }
4263  }
4264  }
4265  //**********************************************************************************************
4266 
4267  //**BLAS-based addition assignment to dense vectors (default)***********************************
4281  template< typename VT1 // Type of the left-hand side target vector
4282  , typename MT1 // Type of the left-hand side matrix operand
4283  , typename VT2 // Type of the right-hand side vector operand
4284  , typename ST2 > // Type of the scalar value
4285  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4286  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4287  {
4288  selectLargeAddAssignKernel( y, A, x, scalar );
4289  }
4290  //**********************************************************************************************
4291 
4292  //**BLAS-based addition assignment to dense vectors*********************************************
4293 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4294 
4307  template< typename VT1 // Type of the left-hand side target vector
4308  , typename MT1 // Type of the left-hand side matrix operand
4309  , typename VT2 // Type of the right-hand side vector operand
4310  , typename ST2 > // Type of the scalar value
4311  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4312  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4313  {
4314  typedef ElementType_<VT1> ET;
4315 
4316  if( IsTriangular<MT1>::value ) {
4317  ResultType_<VT1> tmp( serial( scalar * x ) );
4318  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4319  addAssign( y, tmp );
4320  }
4321  else {
4322  gemv( y, A, x, ET(scalar), ET(1) );
4323  }
4324  }
4325 #endif
4326  //**********************************************************************************************
4327 
4328  //**Addition assignment to sparse vectors*******************************************************
4329  // No special implementation for the addition assignment to sparse vectors.
4330  //**********************************************************************************************
4331 
4332  //**Subtraction assignment to dense vectors*****************************************************
4344  template< typename VT1 > // Type of the target dense vector
4345  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4346  {
4348 
4349  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4350 
4351  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
4352  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
4353 
4354  if( left.rows() == 0UL || left.columns() == 0UL ) {
4355  return;
4356  }
4357 
4358  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4359  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
4360 
4361  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4362  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4363  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4364  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
4365 
4366  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
4367  }
4368  //**********************************************************************************************
4369 
4370  //**Subtraction assignment to dense vectors (kernel selection)**********************************
4381  template< typename VT1 // Type of the left-hand side target vector
4382  , typename MT1 // Type of the left-hand side matrix operand
4383  , typename VT2 // Type of the right-hand side vector operand
4384  , typename ST2 > // Type of the scalar value
4385  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4386  {
4387  if( ( IsDiagonal<MT1>::value ) ||
4388  ( IsComputation<MT>::value && !evaluateMatrix ) ||
4389  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4390  selectSmallSubAssignKernel( y, A, x, scalar );
4391  else
4392  selectBlasSubAssignKernel( y, A, x, scalar );
4393  }
4394  //**********************************************************************************************
4395 
4396  //**Default subtraction assignment to dense vectors*********************************************
/*!\brief Default subtraction assignment kernel.
//
// \param y The target vector on the left-hand side of the subtraction assignment.
// \param A The matrix operand of the multiplication.
// \param x The vector operand of the multiplication.
// \param scalar The scaling factor.
//
// The kernel builds the expression A * x * scalar and passes it to the subAssign() member
// function of the target vector.
*/
template< typename VT1, typename MT1, typename VT2, typename ST2 >
static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
{
   y.subAssign( A * x * scalar );
}
4418  //**********************************************************************************************
4419 
4420  //**Default subtraction assignment to dense vectors (small matrices)****************************
4434  template< typename VT1 // Type of the left-hand side target vector
4435  , typename MT1 // Type of the left-hand side matrix operand
4436  , typename VT2 // Type of the right-hand side vector operand
4437  , typename ST2 > // Type of the scalar value
4438  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4439  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4440  {
4441  selectDefaultSubAssignKernel( y, A, x, scalar );
4442  }
4443  //**********************************************************************************************
4444 
4445  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized subtraction assignment kernel for small matrices: y -= A * x * scalar.
//
// This overload is enabled (via EnableIf_) when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2>
// is fulfilled. The rows of A are traversed in blocks of 8, 4, 3 and 2 rows, followed by a
// single trailing row if one is left. Every row of a block owns one SIMD accumulator that
// collects the products over the SIMD-sized column chunks; after the column loop each
// accumulator is reduced with sum(), scaled and subtracted from the matching element of y.
//
//   y      - target vector, updated in place
//   A      - matrix operand (accessed via A.load(i,j) and A(i,j))
//   x      - vector operand (accessed via x.load(j) and x[j])
//   scalar - factor every reduced row sum is multiplied with
4459  template< typename VT1 // Type of the left-hand side target vector
4460  , typename MT1 // Type of the left-hand side matrix operand
4461  , typename VT2 // Type of the right-hand side vector operand
4462  , typename ST2 > // Type of the scalar value
4463  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4464  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4465  {
4466  const size_t M( A.rows() );
4467  const size_t N( A.columns() );
4468
      // Element-wise clean-up loops are required unless both the matrix and the vector type
      // are padded.
4469  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4470
4471  size_t i( 0UL );
4472
      // Blocks of eight rows, as long as at least eight rows are left.
4473  for( ; (i+8UL) <= M; i+=8UL )
4474  {
      // Column range [jbegin,jend) visited for this row block. Upper matrices start at the
      // block's first row index (plus one if strictly upper), masked with size_t(-SIMDSIZE);
      // lower matrices end one past the block's last row index (one less if strictly lower).
      // All other matrices use the full range [0,N).
      // NOTE(review): presumably the skipped columns hold only zeros for these matrix kinds.
4475  const size_t jbegin( ( IsUpper<MT1>::value )
4476  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4477  :( 0UL ) );
4478  const size_t jend( ( IsLower<MT1>::value )
4479  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4480  :( N ) );
4481  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4482
      // End of the SIMD loop: with a remainder it is jend masked with size_t(-SIMDSIZE)
      // (asserted to equal jend rounded down to a multiple of SIMDSIZE), otherwise jend.
4483  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4484  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4485
      // One accumulator per row of the block.
      // NOTE(review): the accumulators are only default-constructed; this relies on the
      // default constructor of SIMDType producing zeros - confirm.
4486  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4487  size_t j( jbegin );
4488
      // One SIMD vector of x per iteration, shared by all eight rows.
4489  for( ; j<jpos; j+=SIMDSIZE ) {
4490  const SIMDType x1( x.load(j) );
4491  xmm1 = xmm1 + A.load(i ,j) * x1;
4492  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4493  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4494  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
4495  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
4496  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
4497  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
4498  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
4499  }
4500
      // Reduce each accumulator, scale it and subtract it from the target element.
4501  y[i ] -= sum( xmm1 ) * scalar;
4502  y[i+1UL] -= sum( xmm2 ) * scalar;
4503  y[i+2UL] -= sum( xmm3 ) * scalar;
4504  y[i+3UL] -= sum( xmm4 ) * scalar;
4505  y[i+4UL] -= sum( xmm5 ) * scalar;
4506  y[i+5UL] -= sum( xmm6 ) * scalar;
4507  y[i+6UL] -= sum( xmm7 ) * scalar;
4508  y[i+7UL] -= sum( xmm8 ) * scalar;
4509
      // Element-wise handling of the columns left up to jend (only if 'remainder' is set).
4510  for( ; remainder && j<jend; ++j ) {
4511  y[i ] -= A(i ,j) * x[j] * scalar;
4512  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4513  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4514  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4515  y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4516  y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4517  y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4518  y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4519  }
4520  }
4521
      // Blocks of four rows; same scheme as above with four accumulators.
4522  for( ; (i+4UL) <= M; i+=4UL )
4523  {
4524  const size_t jbegin( ( IsUpper<MT1>::value )
4525  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4526  :( 0UL ) );
4527  const size_t jend( ( IsLower<MT1>::value )
4528  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4529  :( N ) );
4530  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4531
4532  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4533  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4534
4535  SIMDType xmm1, xmm2, xmm3, xmm4;
4536  size_t j( jbegin );
4537
      // Accumulate the products of one SIMD chunk per iteration.
4538  for( ; j<jpos; j+=SIMDSIZE ) {
4539  const SIMDType x1( x.load(j) );
4540  xmm1 = xmm1 + A.load(i ,j) * x1;
4541  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4542  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4543  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
4544  }
4545
      // Reduce, scale and subtract.
4546  y[i ] -= sum( xmm1 ) * scalar;
4547  y[i+1UL] -= sum( xmm2 ) * scalar;
4548  y[i+2UL] -= sum( xmm3 ) * scalar;
4549  y[i+3UL] -= sum( xmm4 ) * scalar;
4550
      // Element-wise clean-up of the columns left up to jend.
4551  for( ; remainder && j<jend; ++j ) {
4552  y[i ] -= A(i ,j) * x[j] * scalar;
4553  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4554  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4555  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4556  }
4557  }
4558
      // Blocks of three rows; same scheme as above with three accumulators.
4559  for( ; (i+3UL) <= M; i+=3UL )
4560  {
4561  const size_t jbegin( ( IsUpper<MT1>::value )
4562  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4563  :( 0UL ) );
4564  const size_t jend( ( IsLower<MT1>::value )
4565  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
4566  :( N ) );
4567  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4568
4569  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4570  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4571
4572  SIMDType xmm1, xmm2, xmm3;
4573  size_t j( jbegin );
4574
      // Accumulate the products of one SIMD chunk per iteration.
4575  for( ; j<jpos; j+=SIMDSIZE ) {
4576  const SIMDType x1( x.load(j) );
4577  xmm1 = xmm1 + A.load(i ,j) * x1;
4578  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4579  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4580  }
4581
      // Reduce, scale and subtract.
4582  y[i ] -= sum( xmm1 ) * scalar;
4583  y[i+1UL] -= sum( xmm2 ) * scalar;
4584  y[i+2UL] -= sum( xmm3 ) * scalar;
4585
      // Element-wise clean-up of the columns left up to jend.
4586  for( ; remainder && j<jend; ++j ) {
4587  y[i ] -= A(i ,j) * x[j] * scalar;
4588  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4589  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4590  }
4591  }
4592
      // Blocks of two rows; same scheme as above with two accumulators.
4593  for( ; (i+2UL) <= M; i+=2UL )
4594  {
4595  const size_t jbegin( ( IsUpper<MT1>::value )
4596  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4597  :( 0UL ) );
4598  const size_t jend( ( IsLower<MT1>::value )
4599  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4600  :( N ) );
4601  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4602
4603  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4604  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4605
4606  SIMDType xmm1, xmm2;
4607  size_t j( jbegin );
4608
      // Accumulate the products of one SIMD chunk per iteration.
4609  for( ; j<jpos; j+=SIMDSIZE ) {
4610  const SIMDType x1( x.load(j) );
4611  xmm1 = xmm1 + A.load(i ,j) * x1;
4612  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4613  }
4614
      // Reduce, scale and subtract.
4615  y[i ] -= sum( xmm1 ) * scalar;
4616  y[i+1UL] -= sum( xmm2 ) * scalar;
4617
      // Element-wise clean-up of the columns left up to jend.
4618  for( ; remainder && j<jend; ++j ) {
4619  y[i ] -= A(i ,j) * x[j] * scalar;
4620  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4621  }
4622  }
4623
      // At most one row is left at this point; it is handled with a single accumulator.
4624  if( i < M )
4625  {
4626  const size_t jbegin( ( IsUpper<MT1>::value )
4627  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4628  :( 0UL ) );
4629  const size_t jend( ( IsLower<MT1>::value )
4630  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4631  :( N ) );
4632  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4633
4634  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4635  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4636
4637  SIMDType xmm1;
4638  size_t j( jbegin );
4639
      // Accumulate the products of one SIMD chunk per iteration.
4640  for( ; j<jpos; j+=SIMDSIZE ) {
4641  xmm1 = xmm1 + A.load(i,j) * x.load(j);
4642  }
4643
      // Reduce, scale and subtract.
4644  y[i] -= sum( xmm1 ) * scalar;
4645
      // Element-wise clean-up of the columns left up to jend.
4646  for( ; remainder && j<jend; ++j ) {
4647  y[i] -= A(i,j) * x[j] * scalar;
4648  }
4649  }
4650  }
4651  //**********************************************************************************************
4652 
4653  //**Default subtraction assignment to dense vectors (large matrices)****************************
4667  template< typename VT1 // Type of the left-hand side target vector
4668  , typename MT1 // Type of the left-hand side matrix operand
4669  , typename VT2 // Type of the right-hand side vector operand
4670  , typename ST2 > // Type of the scalar value
4671  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4672  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4673  {
4674  selectDefaultSubAssignKernel( y, A, x, scalar );
4675  }
4676  //**********************************************************************************************
4677 
4678  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
4692  template< typename VT1 // Type of the left-hand side target vector
4693  , typename MT1 // Type of the left-hand side matrix operand
4694  , typename VT2 // Type of the right-hand side vector operand
4695  , typename ST2 > // Type of the scalar value
4696  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4697  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4698  {
4699  const size_t M( A.rows() );
4700  const size_t N( A.columns() );
4701 
4702  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4703 
4704  size_t i( 0UL );
4705 
4706  for( ; (i+8UL) <= M; i+=8UL )
4707  {
4708  const size_t jbegin( ( IsUpper<MT1>::value )
4709  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4710  :( 0UL ) );
4711  const size_t jend( ( IsLower<MT1>::value )
4712  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4713  :( N ) );
4714  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4715 
4716  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4717  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4718 
4719  size_t j( jbegin );
4720 
4721  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4722  const size_t j1( j+SIMDSIZE );
4723  const size_t j2( j+SIMDSIZE*2UL );
4724  const size_t j3( j+SIMDSIZE*3UL );
4725  const SIMDType x1( x.load(j ) );
4726  const SIMDType x2( x.load(j1) );
4727  const SIMDType x3( x.load(j2) );
4728  const SIMDType x4( x.load(j3) );
4729  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4730  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4731  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4732  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4733  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4734  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4735  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4736  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4737  }
4738 
4739  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4740  const size_t j1( j+SIMDSIZE );
4741  const SIMDType x1( x.load(j ) );
4742  const SIMDType x2( x.load(j1) );
4743  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4744  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4745  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4746  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4747  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4748  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4749  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4750  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4751  }
4752 
4753  for( ; j<jpos; j+=SIMDSIZE ) {
4754  const SIMDType x1( x.load(j) );
4755  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4756  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4757  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
4758  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
4759  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 ) * scalar;
4760  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 ) * scalar;
4761  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 ) * scalar;
4762  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 ) * scalar;
4763  }
4764 
4765  for( ; remainder && j<jend; ++j ) {
4766  y[i ] -= A(i ,j) * x[j] * scalar;
4767  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4768  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4769  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4770  y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4771  y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4772  y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4773  y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4774  }
4775  }
4776 
4777  for( ; (i+4UL) <= M; i+=4UL )
4778  {
4779  const size_t jbegin( ( IsUpper<MT1>::value )
4780  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4781  :( 0UL ) );
4782  const size_t jend( ( IsLower<MT1>::value )
4783  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4784  :( N ) );
4785  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4786 
4787  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4788  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4789 
4790  size_t j( jbegin );
4791 
4792  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4793  const size_t j1( j+SIMDSIZE );
4794  const size_t j2( j+SIMDSIZE*2UL );
4795  const size_t j3( j+SIMDSIZE*3UL );
4796  const SIMDType x1( x.load(j ) );
4797  const SIMDType x2( x.load(j1) );
4798  const SIMDType x3( x.load(j2) );
4799  const SIMDType x4( x.load(j3) );
4800  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4801  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4802  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4803  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4804  }
4805 
4806  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4807  const size_t j1( j+SIMDSIZE );
4808  const SIMDType x1( x.load(j ) );
4809  const SIMDType x2( x.load(j1) );
4810  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4811  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4812  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4813  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4814  }
4815 
4816  for( ; j<jpos; j+=SIMDSIZE ) {
4817  const SIMDType x1( x.load(j) );
4818  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4819  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4820  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
4821  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
4822  }
4823 
4824  for( ; remainder && j<jend; ++j ) {
4825  y[i ] -= A(i ,j) * x[j] * scalar;
4826  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4827  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4828  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4829  }
4830  }
4831 
4832  for( ; (i+2UL) <= M; i+=2UL )
4833  {
4834  const size_t jbegin( ( IsUpper<MT1>::value )
4835  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4836  :( 0UL ) );
4837  const size_t jend( ( IsLower<MT1>::value )
4838  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4839  :( N ) );
4840  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4841 
4842  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4843  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4844 
4845  size_t j( jbegin );
4846 
4847  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4848  const size_t j1( j+SIMDSIZE );
4849  const size_t j2( j+SIMDSIZE*2UL );
4850  const size_t j3( j+SIMDSIZE*3UL );
4851  const SIMDType x1( x.load(j ) );
4852  const SIMDType x2( x.load(j1) );
4853  const SIMDType x3( x.load(j2) );
4854  const SIMDType x4( x.load(j3) );
4855  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4856  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4857  }
4858 
4859  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4860  const size_t j1( j+SIMDSIZE );
4861  const SIMDType x1( x.load(j ) );
4862  const SIMDType x2( x.load(j1) );
4863  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4864  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4865  }
4866 
4867  for( ; j<jpos; j+=SIMDSIZE ) {
4868  const SIMDType x1( x.load(j) );
4869  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4870  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4871  }
4872 
4873  for( ; remainder && j<jend; ++j ) {
4874  y[i ] -= A(i ,j) * x[j] * scalar;
4875  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4876  }
4877  }
4878 
4879  if( i < M )
4880  {
4881  const size_t jbegin( ( IsUpper<MT1>::value )
4882  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-SIMDSIZE) )
4883  :( 0UL ) );
4884  const size_t jend( ( IsLower<MT1>::value )
4885  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4886  :( N ) );
4887  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4888 
4889  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4890  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (SIMDSIZE) ) ) == jpos, "Invalid end calculation" );
4891 
4892  size_t j( jbegin );
4893 
4894  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) {
4895  const size_t j1( j+SIMDSIZE );
4896  const size_t j2( j+SIMDSIZE*2UL );
4897  const size_t j3( j+SIMDSIZE*3UL );
4898  const SIMDType x1( x.load(j ) );
4899  const SIMDType x2( x.load(j1) );
4900  const SIMDType x3( x.load(j2) );
4901  const SIMDType x4( x.load(j3) );
4902  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4903  }
4904 
4905  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) {
4906  const size_t j1( j+SIMDSIZE );
4907  const SIMDType x1( x.load(j ) );
4908  const SIMDType x2( x.load(j1) );
4909  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4910  }
4911 
4912  for( ; j<jpos; j+=SIMDSIZE ) {
4913  const SIMDType x1( x.load(j) );
4914  y[i] -= sum( A.load(i,j) * x1 ) * scalar;
4915  }
4916 
4917  for( ; remainder && j<jend; ++j ) {
4918  y[i] -= A(i,j) * x[j] * scalar;
4919  }
4920  }
4921  }
4922  //**********************************************************************************************
4923 
4924  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback overload of selectBlasSubAssignKernel(): its return type is
// DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >, so it is the candidate when the
// BLAS kernel condition does not hold for the given operand types.
// It issues no BLAS call itself; the four arguments (target vector y, matrix A,
// vector x, scalar) are handed on unchanged to selectLargeSubAssignKernel().
4938  template< typename VT1 // Type of the left-hand side target vector
4939  , typename MT1 // Type of the left-hand side matrix operand
4940  , typename VT2 // Type of the right-hand side vector operand
4941  , typename ST2 > // Type of the scalar value
4942  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4943  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4944  {
4945  selectLargeSubAssignKernel( y, A, x, scalar );
4946  }
4947  //**********************************************************************************************
4948 
4949  //**BLAS-based subtraction assignment to dense vectors******************************************
4950 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4951 
4964  template< typename VT1 // Type of the left-hand side target vector
4965  , typename MT1 // Type of the left-hand side matrix operand
4966  , typename VT2 // Type of the right-hand side vector operand
4967  , typename ST2 > // Type of the scalar value
4968  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4969  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4970  {
4971  typedef ElementType_<VT1> ET;
4972 
4973  if( IsTriangular<MT1>::value ) {
4974  ResultType_<VT1> tmp( serial( scalar * x ) );
4975  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4976  subAssign( y, tmp );
4977  }
4978  else {
4979  gemv( y, A, x, ET(-scalar), ET(1) );
4980  }
4981  }
4982 #endif
4983  //**********************************************************************************************
4984 
4985  //**Subtraction assignment to sparse vectors****************************************************
4986  // No special implementation for the subtraction assignment to sparse vectors.
4987  //**********************************************************************************************
4988 
4989  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of a scaled matrix/vector product expression (rhs)
// to a dense vector (lhs).
// The expression is first evaluated serially into a temporary of type ResultType;
// that temporary is then passed to multAssign() together with the unwrapped
// target (~lhs). The sizes of lhs and rhs are checked with an internal assertion.
// NOTE(review): this listing skips source lines 5004 and 5006-5007, so any
// statements located there are not visible here.
5001  template< typename VT1 > // Type of the target dense vector
5002  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5003  {
5005 
5008  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
5009 
5010  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5011 
5012  const ResultType tmp( serial( rhs ) );
5013  multAssign( ~lhs, tmp );
5014  }
5015  //**********************************************************************************************
5016 
5017  //**Multiplication assignment to sparse vectors*************************************************
5018  // No special implementation for the multiplication assignment to sparse vectors.
5019  //**********************************************************************************************
5020 
5021  //**Division assignment to dense vectors********************************************************
// Division assignment of a scaled matrix/vector product expression (rhs)
// to a dense vector (lhs).
// The expression is first evaluated serially into a temporary of type ResultType;
// that temporary is then passed to divAssign() together with the unwrapped
// target (~lhs). The sizes of lhs and rhs are checked with an internal assertion.
// NOTE(review): this listing skips source lines 5036 and 5038-5039, so any
// statements located there are not visible here.
5033  template< typename VT1 > // Type of the target dense vector
5034  friend inline void divAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5035  {
5037 
5040  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
5041 
5042  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5043 
5044  const ResultType tmp( serial( rhs ) );
5045  divAssign( ~lhs, tmp );
5046  }
5047  //**********************************************************************************************
5048 
5049  //**Division assignment to sparse vectors*******************************************************
5050  // No special implementation for the division assignment to sparse vectors.
5051  //**********************************************************************************************
5052 
5053  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of a scaled matrix/vector product expression (rhs) to a dense
// vector (lhs); only enabled when UseSMPAssign<VT1> holds.
// Steps visible in this block:
//  1. The matrix and vector operands are taken from the wrapped product rhs.vector_.
//  2. A matrix with zero rows leaves lhs untouched; a matrix with zero columns
//     resets lhs via reset() and returns.
//  3. Both operands are evaluated into A (type LT) and x (type RT), their
//     dimensions are cross-checked with internal assertions, and the expression
//     A * x * rhs.scalar_ is forwarded to smpAssign() on the unwrapped target.
// NOTE(review): this listing skips source line 5071, so a statement located there
// is not visible here.
5067  template< typename VT1 > // Type of the target dense vector
5068  friend inline EnableIf_< UseSMPAssign<VT1> >
5069  smpAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5070  {
5072 
5073  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5074 
5075  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5076  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5077 
5078  if( left.rows() == 0UL ) {
5079  return;
5080  }
5081  else if( left.columns() == 0UL ) {
5082  reset( ~lhs );
5083  return;
5084  }
5085 
5086  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5087  RT x( right ); // Evaluation of the right-hand side dense vector operand
5088 
5089  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5090  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5091  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5092  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5093 
5094  smpAssign( ~lhs, A * x * rhs.scalar_ );
5095  }
5096  //**********************************************************************************************
5097 
5098  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of a scaled matrix/vector product expression (rhs) to a sparse
// vector (lhs); only enabled when UseSMPAssign<VT1> holds.
// The expression is evaluated into a temporary of type ResultType (constructed
// directly from rhs, without a serial() wrapper), and that temporary is passed to
// smpAssign() together with the unwrapped target (~lhs).
// NOTE(review): this listing skips source lines 5116 and 5118-5119, so any
// statements located there are not visible here.
5112  template< typename VT1 > // Type of the target sparse vector
5113  friend inline EnableIf_< UseSMPAssign<VT1> >
5114  smpAssign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5115  {
5117 
5120  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
5121 
5122  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5123 
5124  const ResultType tmp( rhs );
5125  smpAssign( ~lhs, tmp );
5126  }
5127  //**********************************************************************************************
5128 
5129  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of a scaled matrix/vector product expression (rhs) to
// a dense vector (lhs); only enabled when UseSMPAssign<VT1> holds.
// Steps visible in this block:
//  1. The matrix and vector operands are taken from the wrapped product rhs.vector_.
//  2. If the matrix has zero rows or zero columns the function returns without
//     touching lhs.
//  3. Both operands are evaluated into A (type LT) and x (type RT), their
//     dimensions are cross-checked with internal assertions, and the expression
//     A * x * rhs.scalar_ is forwarded to smpAddAssign() on the unwrapped target.
// NOTE(review): this listing skips source line 5147, so a statement located there
// is not visible here.
5143  template< typename VT1 > // Type of the target dense vector
5144  friend inline EnableIf_< UseSMPAssign<VT1> >
5145  smpAddAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5146  {
5148 
5149  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5150 
5151  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5152  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5153 
5154  if( left.rows() == 0UL || left.columns() == 0UL ) {
5155  return;
5156  }
5157 
5158  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5159  RT x( right ); // Evaluation of the right-hand side dense vector operand
5160 
5161  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5162  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5163  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5164  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5165 
5166  smpAddAssign( ~lhs, A * x * rhs.scalar_ );
5167  }
5168  //**********************************************************************************************
5169 
5170  //**SMP addition assignment to sparse vectors***************************************************
5171  // No special implementation for the SMP addition assignment to sparse vectors.
5172  //**********************************************************************************************
5173 
5174  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of a scaled matrix/vector product expression (rhs)
// to a dense vector (lhs); only enabled when UseSMPAssign<VT1> holds.
// Steps visible in this block:
//  1. The matrix and vector operands are taken from the wrapped product rhs.vector_.
//  2. If the matrix has zero rows or zero columns the function returns without
//     touching lhs.
//  3. Both operands are evaluated into A (type LT) and x (type RT), their
//     dimensions are cross-checked with internal assertions, and the expression
//     A * x * rhs.scalar_ is forwarded to smpSubAssign() on the unwrapped target.
// NOTE(review): this listing skips source line 5192, so a statement located there
// is not visible here.
5188  template< typename VT1 > // Type of the target dense vector
5189  friend inline EnableIf_< UseSMPAssign<VT1> >
5190  smpSubAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5191  {
5193 
5194  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5195 
5196  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
5197  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
5198 
5199  if( left.rows() == 0UL || left.columns() == 0UL ) {
5200  return;
5201  }
5202 
5203  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5204  RT x( right ); // Evaluation of the right-hand side dense vector operand
5205 
5206  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5207  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5208  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5209  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5210 
5211  smpSubAssign( ~lhs, A * x * rhs.scalar_ );
5212  }
5213  //**********************************************************************************************
5214 
5215  //**SMP subtraction assignment to sparse vectors************************************************
5216  // No special implementation for the SMP subtraction assignment to sparse vectors.
5217  //**********************************************************************************************
5218 
5219  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of a scaled matrix/vector product expression
// (rhs) to a dense vector (lhs); only enabled when UseSMPAssign<VT1> holds.
// The expression is evaluated into a temporary of type ResultType (constructed
// directly from rhs), and that temporary is passed to smpMultAssign() together
// with the unwrapped target (~lhs).
// NOTE(review): this listing skips source lines 5237 and 5239-5240, so any
// statements located there are not visible here.
5233  template< typename VT1 > // Type of the target dense vector
5234  friend inline EnableIf_< UseSMPAssign<VT1> >
5235  smpMultAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5236  {
5238 
5241  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
5242 
5243  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5244 
5245  const ResultType tmp( rhs );
5246  smpMultAssign( ~lhs, tmp );
5247  }
5248  //**********************************************************************************************
5249 
5250  //**SMP multiplication assignment to sparse vectors*********************************************
5251  // No special implementation for the SMP multiplication assignment to sparse vectors.
5252  //**********************************************************************************************
5253 
5254  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of a scaled matrix/vector product expression (rhs) to
// a dense vector (lhs); only enabled when UseSMPAssign<VT1> holds.
// The expression is evaluated into a temporary of type ResultType (constructed
// directly from rhs), and that temporary is passed to smpDivAssign() together
// with the unwrapped target (~lhs).
// NOTE(review): this listing skips source lines 5272 and 5274-5275, so any
// statements located there are not visible here.
5268  template< typename VT1 > // Type of the target dense vector
5269  friend inline EnableIf_< UseSMPAssign<VT1> >
5270  smpDivAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5271  {
5273 
5276  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
5277 
5278  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5279 
5280  const ResultType tmp( rhs );
5281  smpDivAssign( ~lhs, tmp );
5282  }
5283  //**********************************************************************************************
5284 
5285  //**SMP division assignment to sparse vectors***************************************************
5286  // No special implementation for the SMP division assignment to sparse vectors.
5287  //**********************************************************************************************
5288 
5289  //**Compile time checks*************************************************************************
// Compile-time guard: the scalar type ST and the RightOperand type of this
// expression must be the same type, otherwise compilation fails.
// NOTE(review): the listing jumps from source line 5289 to 5297, so further
// checks in this section may exist but are not visible here.
5297  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
5298  //**********************************************************************************************
5299 };
5301 //*************************************************************************************************
5302 
5303 
5304 
5305 
5306 //=================================================================================================
5307 //
5308 // GLOBAL BINARY ARITHMETIC OPERATORS
5309 //
5310 //=================================================================================================
5311 
5312 //*************************************************************************************************
// Multiplication of a dense matrix with a dense vector, producing the lazy
// expression object DMatDVecMultExpr<T1,T2>.
// The DisableIf_ return type excludes this overload when T1 is itself a
// matrix/matrix multiplication expression (IsMatMatMultExpr<T1>).
// Throws via BLAZE_THROW_INVALID_ARGUMENT when the number of matrix columns
// differs from the vector size; otherwise returns the expression built from the
// unwrapped operands (~mat, ~vec).
// NOTE(review): the declarator line (source line 5345, which names the function
// and declares the parameters `mat` and `vec`) and source line 5347 are missing
// from this listing -- restore them from the original header before compiling.
5342 template< typename T1 // Type of the left-hand side dense matrix
5343  , typename T2 > // Type of the right-hand side dense vector
5344 inline const DisableIf_< IsMatMatMultExpr<T1>, DMatDVecMultExpr<T1,T2> >
5346 {
5348 
5349  if( (~mat).columns() != (~vec).size() ) {
5350  BLAZE_THROW_INVALID_ARGUMENT( "Matrix and vector sizes do not match" );
5351  }
5352 
5353  return DMatDVecMultExpr<T1,T2>( ~mat, ~vec );
5354 }
5355 //*************************************************************************************************
5356 
5357 
5358 
5359 
5360 //=================================================================================================
5361 //
5362 // GLOBAL RESTRUCTURING BINARY ARITHMETIC OPERATORS
5363 //
5364 //=================================================================================================
5365 
5366 //*************************************************************************************************
// Restructuring multiplication: enabled (EnableIf_) only when T1 is a
// matrix/matrix multiplication expression (IsMatMatMultExpr<T1>).
// Instead of multiplying the matrix product with the vector, the product is
// re-associated: the right-hand matrix operand is multiplied with the vector
// first, and the left-hand matrix operand is then multiplied with that result,
// i.e. (A*B)*v is expressed as A*(B*v).
// NOTE(review): the declarator line (source line 5383, which names the function
// and declares the parameters `mat` and `vec`) and source lines 5385 and 5387 are
// missing from this listing -- restore them from the original header before
// compiling.
5379 template< typename T1 // Type of the left-hand side dense matrix
5380  , bool SO // Storage order of the left-hand side dense matrix
5381  , typename T2 > // Type of the right-hand side dense vector
5382 inline const EnableIf_< IsMatMatMultExpr<T1>, MultExprTrait_<T1,T2> >
5384 {
5386 
5388 
5389  return (~mat).leftOperand() * ( (~mat).rightOperand() * vec );
5390 }
5391 //*************************************************************************************************
5392 
5393 
5394 
5395 
5396 //=================================================================================================
5397 //
5398 // SIZE SPECIALIZATIONS
5399 //
5400 //=================================================================================================
5401 
5402 //*************************************************************************************************
// Size trait for DMatDVecMultExpr: the specialization inherits from Rows<MT>,
// so the size reported for the product vector is whatever Rows reports for the
// left-hand side matrix type.
5404 template< typename MT, typename VT >
5405 struct Size< DMatDVecMultExpr<MT,VT> > : public Rows<MT>
5406 {};
5408 //*************************************************************************************************
5409 
5410 
5411 
5412 
5413 //=================================================================================================
5414 //
5415 // ISALIGNED SPECIALIZATIONS
5416 //
5417 //=================================================================================================
5418 
5419 //*************************************************************************************************
// Alignment trait for DMatDVecMultExpr: the expression counts as aligned only if
// both IsAligned<MT> and IsAligned<VT> hold (combined through And<>).
5421 template< typename MT, typename VT >
5422 struct IsAligned< DMatDVecMultExpr<MT,VT> >
5423  : public BoolConstant< And< IsAligned<MT>, IsAligned<VT> >::value >
5424 {};
5426 //*************************************************************************************************
5427 
5428 
5429 
5430 
5431 //=================================================================================================
5432 //
5433 // EXPRESSION TRAIT SPECIALIZATIONS
5434 //
5435 //=================================================================================================
5436 
5437 //*************************************************************************************************
// Subvector expression trait for DMatDVecMultExpr.
// The nested Type is the multiplication expression type (MultExprTrait_) formed
// from a submatrix expression of `const MT` and a subvector expression of
// `const VT`, both using the same alignment flag AF.
5439 template< typename MT, typename VT, bool AF >
5440 struct SubvectorExprTrait< DMatDVecMultExpr<MT,VT>, AF >
5441 {
5442  public:
5443  //**********************************************************************************************
5444  using Type = MultExprTrait_< SubmatrixExprTrait_<const MT,AF>
5445  , SubvectorExprTrait_<const VT,AF> >;
5446  //**********************************************************************************************
5447 };
5449 //*************************************************************************************************
5450 
5451 } // namespace blaze
5452 
5453 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
If_< IsExpression< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:216
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDVecMultExpr.h:211
Header file for auxiliary alias declarations.
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:7800
Header file for basic type definitions.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:354
BLAZE_ALWAYS_INLINE const complex< int8_t > sum(const SIMDcint8 &a) noexcept
Returns the sum of all elements in the 8-bit integral complex SIMD vector.
Definition: Reduction.h:63
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:374
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
Expression object for dense matrix-dense vector multiplications.The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:125
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:136
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
If_< IsExpression< VT >, const VT, const VT & > RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:219
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
DMatDVecMultExpr< MT, VT > This
Type of this DMatDVecMultExpr instance.
Definition: DMatDVecMultExpr.h:207
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:133
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: AreSIMDCombinable.h:121
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:225
Header file for the IsComplexDouble type trait.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:364
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:135
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:210
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:330
Base class for all matrix/vector multiplication expression templates.The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:66
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:110
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:310
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:384
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:342
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:208
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
System settings for the BLAS mode.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:297
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:385
DMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:251
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:160
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:134
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:209
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:213
Constraint on the data type.
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:74
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS general matrix/vector multiplication functions (gemv)
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:132
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
Header file for the AreSIMDCombinable type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:320
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:212
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:330
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Constraint on the data type.
Header file for the complex data type.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:265
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the MatVecMultExpr base class.
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:222
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:131