DMatDVecMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
56 #include <blaze/math/Intrinsics.h>
57 #include <blaze/math/shims/Reset.h>
79 #include <blaze/system/BLAS.h>
82 #include <blaze/util/Assert.h>
83 #include <blaze/util/Complex.h>
86 #include <blaze/util/DisableIf.h>
87 #include <blaze/util/EnableIf.h>
88 #include <blaze/util/Exception.h>
90 #include <blaze/util/SelectType.h>
91 #include <blaze/util/Types.h>
100 
101 
102 namespace blaze {
103 
104 //=================================================================================================
105 //
106 // CLASS DMATDVECMULTEXPR
107 //
108 //=================================================================================================
109 
110 //*************************************************************************************************
117 template< typename MT // Type of the left-hand side dense matrix
118  , typename VT > // Type of the right-hand side dense vector
119 class DMatDVecMultExpr : public DenseVector< DMatDVecMultExpr<MT,VT>, false >
120  , private MatVecMultExpr
121  , private Computation
122 {
123  private:
124  //**Type definitions****************************************************************************
// Private shorthand aliases derived from the two class template parameters
// MT (left-hand side matrix type) and VT (right-hand side vector type).
// MRT/VRT: the operands' nested ResultType.
125  typedef typename MT::ResultType MRT;
126  typedef typename VT::ResultType VRT;
// MET/VET: element types of those result types.
127  typedef typename MRT::ElementType MET;
128  typedef typename VRT::ElementType VET;
// MCT/VCT: the operands' nested CompositeType.
129  typedef typename MT::CompositeType MCT;
130  typedef typename VT::CompositeType VCT;
131  //**********************************************************************************************
132 
133  //**********************************************************************************************
// Compile-time flag evaluateMatrix. The visible part of its condition requires MT to be a
// computation expression and the matrix and vector element types (MET, VET) to be identical.
// NOTE(review): the initializer ends in a dangling `&&` and the closing `) };` is not visible
// in this listing -- at least one continuation line appears to be missing. Restore it from the
// upstream header before compiling.
135  enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
137  //**********************************************************************************************
138 
139  //**********************************************************************************************
// Compile-time flag evaluateVector: non-zero when VT is a computation expression or when
// RequiresEvaluation<VT> is set. It feeds UseSMPAssign::value and smpAssignable in this class.
// Presumably it also decides whether the vector operand is evaluated into a temporary
// before a kernel runs -- confirm against the (not visible) RT typedef.
141  enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
142  //**********************************************************************************************
143 
144  //**********************************************************************************************
146 
150  template< typename T1 >
151  struct UseSMPAssign {
152  enum { value = ( evaluateMatrix || evaluateVector ) };
153  };
155  //**********************************************************************************************
156 
157  //**********************************************************************************************
159 
// Compile-time predicate over three types. The visible part of \a value requires
// BLAZE_BLAS_MODE to be enabled and all three types to be vectorizable. Within this file it
// selects between the two selectBlasAssignKernel() overloads, instantiated as <VT1,MT1,VT2>.
// NOTE(review): the enum initializer below ends in a dangling `&&` and the closing brace of
// the enum is not visible -- several condition lines appear to be missing from this listing.
// Confirm against the upstream header before relying on it.
162  template< typename T1, typename T2, typename T3 >
163  struct UseBlasKernel {
164  enum { value = BLAZE_BLAS_MODE &&
169  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
175  };
177  //**********************************************************************************************
178 
179  //**********************************************************************************************
181 
// Compile-time predicate over three types. The visible part of \a value requires
// useOptimizedKernels and all three types to be vectorizable. Within this file it selects
// between the scalar and the intrinsic-based overloads of selectSmallAssignKernel() and
// selectLargeAssignKernel(), instantiated as <VT1,MT1,VT2>.
// NOTE(review): the enum initializer below ends in a dangling `&&` and the closing brace of
// the enum is not visible -- some condition lines appear to be missing from this listing.
// Confirm against the upstream header before relying on it.
185  template< typename T1, typename T2, typename T3 >
186  struct UseVectorizedDefaultKernel {
187  enum { value = useOptimizedKernels &&
189  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
194  };
196  //**********************************************************************************************
197 
198  public:
199  //**Type definitions****************************************************************************
// ReturnType: what operator[] and at() return -- a const element by value.
// NOTE(review): ElementType and ResultType are not declared in the visible lines; presumably
// their typedefs were dropped from this listing -- confirm.
205  typedef const ElementType ReturnType;
// CompositeType: a const ResultType.
206  typedef const ResultType CompositeType;
207 
// LeftOperand: type of the stored matrix operand. SelectType is given `const MT` and
// `const MT&` and switches on IsExpression<MT>; presumably expressions are held by value and
// everything else by const reference -- confirm the argument order of SelectType.
209  typedef typename SelectType< IsExpression<MT>::value, const MT, const MT& >::Type LeftOperand;
210 
// RightOperand: same scheme for the vector operand.
212  typedef typename SelectType< IsExpression<VT>::value, const VT, const VT& >::Type RightOperand;
213 
216 
219  //**********************************************************************************************
220 
221  //**Compilation flags***************************************************************************
// Compilation flag vectorizable. The visible part of the condition requires that MT is not a
// diagonal matrix and that both operand types are themselves vectorizable.
// NOTE(review): the initializer ends in a dangling `&&` and the closing brace of the enum is not
// visible -- further condition lines appear to be missing from this listing.
223  enum { vectorizable = !IsDiagonal<MT>::value &&
224  MT::vectorizable && VT::vectorizable &&
228 
// Compilation flag smpAssignable: set only if neither operand is flagged for evaluation
// (evaluateMatrix / evaluateVector) and both operand types report smpAssignable themselves.
230  enum { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
231  !evaluateVector && VT::smpAssignable };
232  //**********************************************************************************************
233 
234  //**Constructor*********************************************************************************
240  explicit inline DMatDVecMultExpr( const MT& mat, const VT& vec )
241  : mat_( mat ) // Left-hand side dense matrix of the multiplication expression
242  , vec_( vec ) // Right-hand side dense vector of the multiplication expression
243  {
244  BLAZE_INTERNAL_ASSERT( mat_.columns() == vec_.size(), "Invalid matrix and vector sizes" );
245  }
246  //**********************************************************************************************
247 
248  //**Subscript operator**************************************************************************
254  inline ReturnType operator[]( size_t index ) const {
255  BLAZE_INTERNAL_ASSERT( index < mat_.rows(), "Invalid vector access index" );
256 
257  if( ( IsStrictlyLower<MT>::value && index == 0UL ) ||
258  ( IsStrictlyUpper<MT>::value && index == mat_.rows()-1UL ) ||
259  mat_.columns() == 0UL )
260  return ElementType();
261 
263  return mat_(index,index) * vec_[index];
264 
265  const size_t jbegin( ( IsUpper<MT>::value )
266  ?( IsStrictlyUpper<MT>::value ? index+1UL : index )
267  :( 0UL ) );
268  const size_t jend( ( IsLower<MT>::value )
269  ?( IsStrictlyLower<MT>::value ? index : index+1UL )
270  :( mat_.columns() ) );
271  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
272 
273  const size_t jnum( jend - jbegin );
274  const size_t jpos( jbegin + ( ( jnum - 1UL ) & size_t(-2) ) + 1UL );
275 
276  ElementType res( mat_(index,jbegin) * vec_[jbegin] );
277 
278  for( size_t j=jbegin+1UL; j<jpos; j+=2UL ) {
279  res += mat_(index,j) * vec_[j] + mat_(index,j+1UL) * vec_[j+1UL];
280  }
281  if( jpos < jend ) {
282  res += mat_(index,jpos) * vec_[jpos];
283  }
284 
285  return res;
286  }
287  //**********************************************************************************************
288 
289  //**At function*********************************************************************************
296  inline ReturnType at( size_t index ) const {
297  if( index >= mat_.rows() ) {
298  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
299  }
300  return (*this)[index];
301  }
302  //**********************************************************************************************
303 
304  //**Size function*******************************************************************************
309  inline size_t size() const {
310  return mat_.rows();
311  }
312  //**********************************************************************************************
313 
314  //**Left operand access*************************************************************************
319  inline LeftOperand leftOperand() const {
320  return mat_;
321  }
322  //**********************************************************************************************
323 
324  //**Right operand access************************************************************************
329  inline RightOperand rightOperand() const {
330  return vec_;
331  }
332  //**********************************************************************************************
333 
334  //**********************************************************************************************
340  template< typename T >
341  inline bool canAlias( const T* alias ) const {
342  return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
343  }
344  //**********************************************************************************************
345 
346  //**********************************************************************************************
352  template< typename T >
353  inline bool isAliased( const T* alias ) const {
354  return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
355  }
356  //**********************************************************************************************
357 
358  //**********************************************************************************************
363  inline bool isAligned() const {
364  return mat_.isAligned() && vec_.isAligned();
365  }
366  //**********************************************************************************************
367 
368  //**********************************************************************************************
373  inline bool canSMPAssign() const {
374  return ( !BLAZE_BLAS_IS_PARALLEL ||
375  ( IsComputation<MT>::value && !evaluateMatrix ) ||
376  ( mat_.rows() * mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
377  ( size() > SMP_DMATDVECMULT_THRESHOLD );
378  }
379  //**********************************************************************************************
380 
381  private:
382  //**Member variables****************************************************************************
// Left-hand side dense matrix operand of the multiplication expression (set by the constructor).
383  LeftOperand mat_;
// Right-hand side dense vector operand of the multiplication expression (set by the constructor).
384  RightOperand vec_;
385  //**********************************************************************************************
386 
387  //**Assignment to dense vectors*****************************************************************
400  template< typename VT1 > // Type of the target dense vector
401  friend inline void assign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
402  {
404 
405  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
406 
407  if( rhs.mat_.rows() == 0UL ) {
408  return;
409  }
410  else if( rhs.mat_.columns() == 0UL ) {
411  reset( ~lhs );
412  return;
413  }
414 
415  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
416  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
417 
418  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
419  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
420  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
421  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
422 
423  DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
424  }
426  //**********************************************************************************************
427 
428  //**Assignment to dense vectors (kernel selection)**********************************************
439  template< typename VT1 // Type of the left-hand side target vector
440  , typename MT1 // Type of the left-hand side matrix operand
441  , typename VT2 > // Type of the right-hand side vector operand
442  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
443  {
444  if( ( IsDiagonal<MT1>::value ) ||
445  ( IsComputation<MT>::value && !evaluateMatrix ) ||
446  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
447  selectSmallAssignKernel( y, A, x );
448  else
449  selectBlasAssignKernel( y, A, x );
450  }
452  //**********************************************************************************************
453 
454  //**Default assignment to dense vectors*********************************************************
// Default (non-specialized) kernel for the assignment y = A * x.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
//
// Forwards the product of the two operands to the assign() member of the target vector.
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2 >  // Type of the right-hand side vector operand
static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
{
   y.assign( A * x );
}
476  //**********************************************************************************************
477 
478  //**Default assignment to dense vectors (small matrices)****************************************
492  template< typename VT1 // Type of the left-hand side target vector
493  , typename MT1 // Type of the left-hand side matrix operand
494  , typename VT2 > // Type of the right-hand side vector operand
495  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
496  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
497  {
498  selectDefaultAssignKernel( y, A, x );
499  }
501  //**********************************************************************************************
502 
503  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Intrinsic-based small-matrix kernel for the assignment y = A * x.
// This overload is selected (via EnableIf) when UseVectorizedDefaultKernel<VT1,MT1,VT2> holds.
//
// y: target vector; every element y[0]..y[M-1] is overwritten.
// A: matrix operand, read through A.load(i,j) (packed) and A(i,j) (scalar).
// x: vector operand, read through x.load(j) (packed) and x[j] (scalar).
//
// Rows are processed in blocks of 8, 4, 3 and 2 rows, followed by at most one single row.
// Each block follows the same scheme:
//   1. determine the column range [jbegin,jend) from the IsUpper/IsLower structure of MT1,
//   2. accumulate A.load(row,j) * x.load(j) per row in one intrinsic accumulator (xmm*),
//      advancing j by IT::size up to jpos,
//   3. reduce each accumulator with sum() and store it to y,
//   4. add the columns in [jpos,jend) with scalar operations if a remainder loop is needed.
517  template< typename VT1 // Type of the left-hand side target vector
518  , typename MT1 // Type of the left-hand side matrix operand
519  , typename VT2 > // Type of the right-hand side vector operand
520  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
521  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
522  {
523  typedef IntrinsicTrait<ElementType> IT;
524 
525  const size_t M( A.rows() );
526  const size_t N( A.columns() );
527 
// A scalar remainder loop is required unless both the matrix and the vector type are padded.
528  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
529 
530  size_t i( 0UL );
531 
// ---- Blocks of 8 rows ----
532  for( ; (i+8UL) <= M; i+=8UL )
533  {
// Upper matrices: start at column i (i+1 if strictly upper), masked with size_t(-IT::size).
// Presumably this rounds down to a multiple of IT::size (assumes a power-of-two pack size
// -- TODO confirm).
534  const size_t jbegin( ( IsUpper<MT1>::value )
535  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
536  :( 0UL ) );
// Lower matrices: stop after the diagonal element of the last row of the block
// (one column earlier if strictly lower); otherwise use all N columns.
537  const size_t jend( ( IsLower<MT1>::value )
538  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
539  :( N ) );
540  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
541 
// jpos: end of the packed section. With a remainder loop it is jend masked the same way as
// jbegin; without one the packed loop runs up to jend.
542  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
543  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
544 
// NOTE(review): the accumulators take their initial value from IntrinsicType's default
// constructor -- presumably zero; confirm. IntrinsicType itself is not declared in the
// visible lines.
545  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
546  size_t j( jbegin );
547 
// Packed section: one load of x is shared by all eight rows.
548  for( ; j<jpos; j+=IT::size ) {
549  const IntrinsicType x1( x.load(j) );
550  xmm1 = xmm1 + A.load(i ,j) * x1;
551  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
552  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
553  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
554  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
555  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
556  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
557  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
558  }
559 
// Reduce each accumulator and overwrite the corresponding target element.
560  y[i ] = sum( xmm1 );
561  y[i+1UL] = sum( xmm2 );
562  y[i+2UL] = sum( xmm3 );
563  y[i+3UL] = sum( xmm4 );
564  y[i+4UL] = sum( xmm5 );
565  y[i+5UL] = sum( xmm6 );
566  y[i+6UL] = sum( xmm7 );
567  y[i+7UL] = sum( xmm8 );
568 
// Scalar remainder for the columns in [jpos,jend).
569  for( ; remainder && j<jend; ++j ) {
570  y[i ] += A(i ,j) * x[j];
571  y[i+1UL] += A(i+1UL,j) * x[j];
572  y[i+2UL] += A(i+2UL,j) * x[j];
573  y[i+3UL] += A(i+3UL,j) * x[j];
574  y[i+4UL] += A(i+4UL,j) * x[j];
575  y[i+5UL] += A(i+5UL,j) * x[j];
576  y[i+6UL] += A(i+6UL,j) * x[j];
577  y[i+7UL] += A(i+7UL,j) * x[j];
578  }
579  }
580 
// ---- Blocks of 4 rows (same scheme as above) ----
581  for( ; (i+4UL) <= M; i+=4UL )
582  {
583  const size_t jbegin( ( IsUpper<MT1>::value )
584  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
585  :( 0UL ) );
586  const size_t jend( ( IsLower<MT1>::value )
587  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
588  :( N ) );
589  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
590 
591  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
592  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
593 
594  IntrinsicType xmm1, xmm2, xmm3, xmm4;
595  size_t j( jbegin );
596 
597  for( ; j<jpos; j+=IT::size ) {
598  const IntrinsicType x1( x.load(j) );
599  xmm1 = xmm1 + A.load(i ,j) * x1;
600  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
601  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
602  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
603  }
604 
605  y[i ] = sum( xmm1 );
606  y[i+1UL] = sum( xmm2 );
607  y[i+2UL] = sum( xmm3 );
608  y[i+3UL] = sum( xmm4 );
609 
610  for( ; remainder && j<jend; ++j ) {
611  y[i ] += A(i ,j) * x[j];
612  y[i+1UL] += A(i+1UL,j) * x[j];
613  y[i+2UL] += A(i+2UL,j) * x[j];
614  y[i+3UL] += A(i+3UL,j) * x[j];
615  }
616  }
617 
// ---- Blocks of 3 rows (same scheme as above) ----
618  for( ; (i+3UL) <= M; i+=3UL )
619  {
620  const size_t jbegin( ( IsUpper<MT1>::value )
621  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
622  :( 0UL ) );
623  const size_t jend( ( IsLower<MT1>::value )
624  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
625  :( N ) );
626  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
627 
628  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
629  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
630 
631  IntrinsicType xmm1, xmm2, xmm3;
632  size_t j( jbegin );
633 
634  for( ; j<jpos; j+=IT::size ) {
635  const IntrinsicType x1( x.load(j) );
636  xmm1 = xmm1 + A.load(i ,j) * x1;
637  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
638  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
639  }
640 
641  y[i ] = sum( xmm1 );
642  y[i+1UL] = sum( xmm2 );
643  y[i+2UL] = sum( xmm3 );
644 
645  for( ; remainder && j<jend; ++j ) {
646  y[i ] += A(i ,j) * x[j];
647  y[i+1UL] += A(i+1UL,j) * x[j];
648  y[i+2UL] += A(i+2UL,j) * x[j];
649  }
650  }
651 
// ---- Blocks of 2 rows (same scheme as above) ----
652  for( ; (i+2UL) <= M; i+=2UL )
653  {
654  const size_t jbegin( ( IsUpper<MT1>::value )
655  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
656  :( 0UL ) );
657  const size_t jend( ( IsLower<MT1>::value )
658  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
659  :( N ) );
660  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
661 
662  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
663  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
664 
665  IntrinsicType xmm1, xmm2;
666  size_t j( jbegin );
667 
668  for( ; j<jpos; j+=IT::size ) {
669  const IntrinsicType x1( x.load(j) );
670  xmm1 = xmm1 + A.load(i ,j) * x1;
671  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
672  }
673 
674  y[i ] = sum( xmm1 );
675  y[i+1UL] = sum( xmm2 );
676 
677  for( ; remainder && j<jend; ++j ) {
678  y[i ] += A(i ,j) * x[j];
679  y[i+1UL] += A(i+1UL,j) * x[j];
680  }
681  }
682 
// ---- Final single row, if one is left ----
683  if( i < M )
684  {
685  const size_t jbegin( ( IsUpper<MT1>::value )
686  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
687  :( 0UL ) );
688  const size_t jend( ( IsLower<MT1>::value )
689  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
690  :( N ) );
691  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
692 
693  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
694  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
695 
696  IntrinsicType xmm1;
697  size_t j( jbegin );
698 
699  for( ; j<jpos; j+=IT::size ) {
700  xmm1 = xmm1 + A.load(i,j) * x.load(j);
701  }
702 
703  y[i] = sum( xmm1 );
704 
705  for( ; remainder && j<jend; ++j ) {
706  y[i] += A(i,j) * x[j];
707  }
708  }
709  }
711  //**********************************************************************************************
712 
713  //**Default assignment to dense vectors (large matrices)****************************************
727  template< typename VT1 // Type of the left-hand side target vector
728  , typename MT1 // Type of the left-hand side matrix operand
729  , typename VT2 > // Type of the right-hand side vector operand
730  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
731  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
732  {
733  selectDefaultAssignKernel( y, A, x );
734  }
736  //**********************************************************************************************
737 
738  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Intrinsic-based large-matrix kernel for the assignment y = A * x.
// This overload is selected (via EnableIf) when UseVectorizedDefaultKernel<VT1,MT1,VT2> holds.
//
// y: target vector; it is reset first and then accumulated into with +=.
// A: matrix operand, read through A.load(i,j) (packed) and A(i,j) (scalar).
// x: vector operand, read through x.load(j) (packed) and x[j] (scalar).
//
// Rows are processed in blocks of 8, 4 and 2 rows, followed by at most one single row.
// Within each block the column range [jbegin,jpos) is consumed in steps of four packs
// (IT::size*4), then two packs, then one pack; every step is reduced with sum() and added to
// y immediately. Columns in [jpos,jend) are added with scalar operations if a remainder loop
// is needed.
752  template< typename VT1 // Type of the left-hand side target vector
753  , typename MT1 // Type of the left-hand side matrix operand
754  , typename VT2 > // Type of the right-hand side vector operand
755  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
756  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
757  {
758  typedef IntrinsicTrait<ElementType> IT;
759 
760  const size_t M( A.rows() );
761  const size_t N( A.columns() );
762 
// A scalar remainder loop is required unless both the matrix and the vector type are padded.
763  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
764 
// All loops below only add to y, so the target has to start from its reset state.
765  reset( y );
766 
767  size_t i( 0UL );
768 
// ---- Blocks of 8 rows ----
769  for( ; (i+8UL) <= M; i+=8UL )
770  {
// Upper matrices: start at column i (i+1 if strictly upper), masked with size_t(-IT::size).
// Presumably this rounds down to a multiple of IT::size (assumes a power-of-two pack size
// -- TODO confirm).
771  const size_t jbegin( ( IsUpper<MT1>::value )
772  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
773  :( 0UL ) );
// Lower matrices: stop after the diagonal element of the last row of the block
// (one column earlier if strictly lower); otherwise use all N columns.
774  const size_t jend( ( IsLower<MT1>::value )
775  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
776  :( N ) );
777  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
778 
// jpos: end of the packed section (jend masked like jbegin when a remainder loop exists).
779  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
780  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
781 
782  size_t j( jbegin );
783 
// Four packs per iteration; the four loads of x are shared by all rows of the block.
784  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
785  const size_t j1( j+IT::size );
786  const size_t j2( j+IT::size*2UL );
787  const size_t j3( j+IT::size*3UL );
788  const IntrinsicType x1( x.load(j ) );
789  const IntrinsicType x2( x.load(j1) );
790  const IntrinsicType x3( x.load(j2) );
791  const IntrinsicType x4( x.load(j3) );
792  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
793  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
794  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
795  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
796  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
797  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
798  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
799  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
800  }
801 
// Two packs per iteration.
802  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
803  const size_t j1( j+IT::size );
804  const IntrinsicType x1( x.load(j ) );
805  const IntrinsicType x2( x.load(j1) );
806  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
807  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
808  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
809  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
810  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
811  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
812  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
813  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
814  }
815 
// One pack per iteration.
816  for( ; j<jpos; j+=IT::size ) {
817  const IntrinsicType x1( x.load(j) );
818  y[i ] += sum( A.load(i ,j) * x1 );
819  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
820  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
821  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
822  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
823  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
824  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
825  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
826  }
827 
// Scalar remainder for the columns in [jpos,jend).
828  for( ; remainder && j<jend; ++j ) {
829  y[i ] += A(i ,j) * x[j];
830  y[i+1UL] += A(i+1UL,j) * x[j];
831  y[i+2UL] += A(i+2UL,j) * x[j];
832  y[i+3UL] += A(i+3UL,j) * x[j];
833  y[i+4UL] += A(i+4UL,j) * x[j];
834  y[i+5UL] += A(i+5UL,j) * x[j];
835  y[i+6UL] += A(i+6UL,j) * x[j];
836  y[i+7UL] += A(i+7UL,j) * x[j];
837  }
838  }
839 
// ---- Blocks of 4 rows (same scheme as above) ----
840  for( ; (i+4UL) <= M; i+=4UL )
841  {
842  const size_t jbegin( ( IsUpper<MT1>::value )
843  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
844  :( 0UL ) );
845  const size_t jend( ( IsLower<MT1>::value )
846  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
847  :( N ) );
848  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
849 
850  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
851  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
852 
853  size_t j( jbegin );
854 
855  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
856  const size_t j1( j+IT::size );
857  const size_t j2( j+IT::size*2UL );
858  const size_t j3( j+IT::size*3UL );
859  const IntrinsicType x1( x.load(j ) );
860  const IntrinsicType x2( x.load(j1) );
861  const IntrinsicType x3( x.load(j2) );
862  const IntrinsicType x4( x.load(j3) );
863  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
864  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
865  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
866  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
867  }
868 
869  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
870  const size_t j1( j+IT::size );
871  const IntrinsicType x1( x.load(j ) );
872  const IntrinsicType x2( x.load(j1) );
873  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
874  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
875  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
876  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
877  }
878 
879  for( ; j<jpos; j+=IT::size ) {
880  const IntrinsicType x1( x.load(j) );
881  y[i ] += sum( A.load(i ,j) * x1 );
882  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
883  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
884  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
885  }
886 
887  for( ; remainder && j<jend; ++j ) {
888  y[i ] += A(i ,j) * x[j];
889  y[i+1UL] += A(i+1UL,j) * x[j];
890  y[i+2UL] += A(i+2UL,j) * x[j];
891  y[i+3UL] += A(i+3UL,j) * x[j];
892  }
893  }
894 
// ---- Blocks of 2 rows (same scheme as above) ----
895  for( ; (i+2UL) <= M; i+=2UL )
896  {
897  const size_t jbegin( ( IsUpper<MT1>::value )
898  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
899  :( 0UL ) );
900  const size_t jend( ( IsLower<MT1>::value )
901  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
902  :( N ) );
903  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
904 
905  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
906  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
907 
908  size_t j( jbegin );
909 
910  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
911  const size_t j1( j+IT::size );
912  const size_t j2( j+IT::size*2UL );
913  const size_t j3( j+IT::size*3UL );
914  const IntrinsicType x1( x.load(j ) );
915  const IntrinsicType x2( x.load(j1) );
916  const IntrinsicType x3( x.load(j2) );
917  const IntrinsicType x4( x.load(j3) );
918  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
919  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
920  }
921 
922  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
923  const size_t j1( j+IT::size );
924  const IntrinsicType x1( x.load(j ) );
925  const IntrinsicType x2( x.load(j1) );
926  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
927  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
928  }
929 
930  for( ; j<jpos; j+=IT::size ) {
931  const IntrinsicType x1( x.load(j) );
932  y[i ] += sum( A.load(i ,j) * x1 );
933  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
934  }
935 
936  for( ; remainder && j<jend; ++j ) {
937  y[i ] += A(i ,j) * x[j];
938  y[i+1UL] += A(i+1UL,j) * x[j];
939  }
940  }
941 
// ---- Final single row, if one is left ----
942  if( i < M )
943  {
944  const size_t jbegin( ( IsUpper<MT1>::value )
945  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
946  :( 0UL ) );
947  const size_t jend( ( IsLower<MT1>::value )
948  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
949  :( N ) );
950  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
951 
952  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
953  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
954 
955  size_t j( jbegin );
956 
957  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
958  const size_t j1( j+IT::size );
959  const size_t j2( j+IT::size*2UL );
960  const size_t j3( j+IT::size*3UL );
961  const IntrinsicType x1( x.load(j ) );
962  const IntrinsicType x2( x.load(j1) );
963  const IntrinsicType x3( x.load(j2) );
964  const IntrinsicType x4( x.load(j3) );
965  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
966  }
967 
968  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
969  const size_t j1( j+IT::size );
970  const IntrinsicType x1( x.load(j ) );
971  const IntrinsicType x2( x.load(j1) );
972  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
973  }
974 
975  for( ; j<jpos; j+=IT::size ) {
976  const IntrinsicType x1( x.load(j) );
977  y[i] += sum( A.load(i,j) * x1 );
978  }
979 
980  for( ; remainder && j<jend; ++j ) {
981  y[i] += A(i,j) * x[j];
982  }
983  }
984  }
986  //**********************************************************************************************
987 
988  //**BLAS-based assignment to dense vectors (default)********************************************
1002  template< typename VT1 // Type of the left-hand side target vector
1003  , typename MT1 // Type of the left-hand side matrix operand
1004  , typename VT2 > // Type of the right-hand side vector operand
1005  static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
1006  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1007  {
1008  selectLargeAssignKernel( y, A, x );
1009  }
1011  //**********************************************************************************************
1012 
1013  //**BLAS-based assignment to dense vectors******************************************************
1014 #if BLAZE_BLAS_MODE
1015 
1028  template< typename VT1 // Type of the left-hand side target vector
1029  , typename MT1 // Type of the left-hand side matrix operand
1030  , typename VT2 > // Type of the right-hand side vector operand
1031  static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
1032  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
1033  {
1034  typedef typename VT1::ElementType ET;
1035 
1036  if( IsTriangular<MT1>::value ) {
1037  assign( y, x );
1038  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1039  }
1040  else {
1041  gemv( y, A, x, ET(1), ET(0) );
1042  }
1043  }
1045 #endif
1046  //**********************************************************************************************
1047 
1048  //**Assignment to sparse vectors****************************************************************
1061  template< typename VT1 > // Type of the target sparse vector
1062  friend inline void assign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1063  {
1065 
1069 
1070  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1071 
1072  const ResultType tmp( serial( rhs ) );
1073  assign( ~lhs, tmp );
1074  }
1076  //**********************************************************************************************
1077 
1078  //**Addition assignment to dense vectors********************************************************
1091  template< typename VT1 > // Type of the target dense vector
1092  friend inline void addAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1093  {
1095 
1096  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1097 
1098  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1099  return;
1100  }
1101 
1102  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1103  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1104 
1105  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1106  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1107  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1108  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1109 
1110  DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
1111  }
1113  //**********************************************************************************************
1114 
1115  //**Addition assignment to dense vectors (kernel selection)*************************************
1126  template< typename VT1 // Type of the left-hand side target vector
1127  , typename MT1 // Type of the left-hand side matrix operand
1128  , typename VT2 > // Type of the right-hand side vector operand
1129  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1130  {
1131  if( ( IsDiagonal<MT1>::value ) ||
1132  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1133  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1134  selectSmallAddAssignKernel( y, A, x );
1135  else
1136  selectBlasAddAssignKernel( y, A, x );
1137  }
1139  //**********************************************************************************************
1140 
1141  //**Default addition assignment to dense vectors************************************************
// Default (non-vectorized) addition assignment kernel: builds the expression A * x and hands
// it to the addAssign() member function of the target vector y.
//   y  Target vector.
//   A  Left-hand side matrix operand.
//   x  Right-hand side vector operand.
1155  template< typename VT1 // Type of the left-hand side target vector
1156  , typename MT1 // Type of the left-hand side matrix operand
1157  , typename VT2 > // Type of the right-hand side vector operand
1158  static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1159  {
1160  y.addAssign( A * x );
1161  }
1163  //**********************************************************************************************
1164 
1165  //**Default addition assignment to dense vectors (small matrices)*******************************
// Small-matrix addition assignment kernel for operand combinations that do not use the
// vectorized kernel (selected through DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >).
// It forwards the operands unchanged to selectDefaultAddAssignKernel().
//   y  Target vector.
//   A  Left-hand side matrix operand.
//   x  Right-hand side vector operand.
1179  template< typename VT1 // Type of the left-hand side target vector
1180  , typename MT1 // Type of the left-hand side matrix operand
1181  , typename VT2 > // Type of the right-hand side vector operand
1182  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1183  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1184  {
1185  selectDefaultAddAssignKernel( y, A, x );
1186  }
1188  //**********************************************************************************************
1189 
1190  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized addition assignment kernel for small matrices: computes y += A * x using
// intrinsic loads. Enabled through EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >.
//
// The rows of A are consumed in blocks of 8, 4, 3 and 2 rows, followed by a single trailing
// row. Every block follows the same scheme:
//   1. Determine the column range [jbegin,jend) from the IsUpper/IsLower/IsStrictly* traits
//      of MT1 (the full range [0,N) is used when neither trait applies).
//   2. Accumulate A.load(row,j) * x.load(j) into one intrinsic accumulator per row while
//      stepping j by IT::size up to jpos.
//   3. Reduce each accumulator with sum() and add it to the matching element of y.
//   4. If 'remainder' is set, finish the columns [jpos,jend) with scalar element access.
//
//   y  Target vector; its elements are updated in place via +=.
//   A  Left-hand side matrix operand.
//   x  Right-hand side vector operand.
1204  template< typename VT1 // Type of the left-hand side target vector
1205  , typename MT1 // Type of the left-hand side matrix operand
1206  , typename VT2 > // Type of the right-hand side vector operand
1207  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1208  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1209  {
1210  typedef IntrinsicTrait<ElementType> IT;
1211
1212  const size_t M( A.rows() );
1213  const size_t N( A.columns() );
1214
// A scalar tail loop is only required if A or x is not padded.
1215  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1216
1217  size_t i( 0UL );
1218
// Blocks of 8 rows.
1219  for( ; (i+8UL) <= M; i+=8UL )
1220  {
// Upper matrices: start at the diagonal column of the first row of the block (one column
// later for strictly upper matrices). The index is masked with -IT::size, which rounds it
// down to a multiple of IT::size (assuming IT::size is a power of two).
1221  const size_t jbegin( ( IsUpper<MT1>::value )
1222  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1223  :( 0UL ) );
// Lower matrices: stop behind the diagonal column of the last row of the block (one column
// earlier for strictly lower matrices).
1224  const size_t jend( ( IsLower<MT1>::value )
1225  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1226  :( N ) );
1227  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1228
// End of the vectorized part: jend masked with -IT::size if a remainder loop exists,
// otherwise jend itself.
1229  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1230  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1231
// NOTE(review): the accumulators are default-constructed and immediately added to; this
// presumably relies on the default constructor of IntrinsicType producing zeros - confirm.
1232  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1233  size_t j( jbegin );
1234
// Vectorized part: each chunk of x is loaded once and shared by all 8 rows.
1235  for( ; j<jpos; j+=IT::size ) {
1236  const IntrinsicType x1( x.load(j) );
1237  xmm1 = xmm1 + A.load(i ,j) * x1;
1238  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1239  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1240  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1241  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
1242  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
1243  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
1244  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
1245  }
1246
// Horizontal reduction of the accumulators into the target elements.
1247  y[i ] += sum( xmm1 );
1248  y[i+1UL] += sum( xmm2 );
1249  y[i+2UL] += sum( xmm3 );
1250  y[i+3UL] += sum( xmm4 );
1251  y[i+4UL] += sum( xmm5 );
1252  y[i+5UL] += sum( xmm6 );
1253  y[i+6UL] += sum( xmm7 );
1254  y[i+7UL] += sum( xmm8 );
1255
// Scalar handling of the remaining columns [jpos,jend).
1256  for( ; remainder && j<jend; ++j ) {
1257  y[i ] += A(i ,j) * x[j];
1258  y[i+1UL] += A(i+1UL,j) * x[j];
1259  y[i+2UL] += A(i+2UL,j) * x[j];
1260  y[i+3UL] += A(i+3UL,j) * x[j];
1261  y[i+4UL] += A(i+4UL,j) * x[j];
1262  y[i+5UL] += A(i+5UL,j) * x[j];
1263  y[i+6UL] += A(i+6UL,j) * x[j];
1264  y[i+7UL] += A(i+7UL,j) * x[j];
1265  }
1266  }
1267
// Blocks of 4 rows (same scheme as the 8-row block).
1268  for( ; (i+4UL) <= M; i+=4UL )
1269  {
1270  const size_t jbegin( ( IsUpper<MT1>::value )
1271  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1272  :( 0UL ) );
1273  const size_t jend( ( IsLower<MT1>::value )
1274  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1275  :( N ) );
1276  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1277
1278  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1279  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1280
1281  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1282  size_t j( jbegin );
1283
1284  for( ; j<jpos; j+=IT::size ) {
1285  const IntrinsicType x1( x.load(j) );
1286  xmm1 = xmm1 + A.load(i ,j) * x1;
1287  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1288  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1289  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1290  }
1291
1292  y[i ] += sum( xmm1 );
1293  y[i+1UL] += sum( xmm2 );
1294  y[i+2UL] += sum( xmm3 );
1295  y[i+3UL] += sum( xmm4 );
1296
1297  for( ; remainder && j<jend; ++j ) {
1298  y[i ] += A(i ,j) * x[j];
1299  y[i+1UL] += A(i+1UL,j) * x[j];
1300  y[i+2UL] += A(i+2UL,j) * x[j];
1301  y[i+3UL] += A(i+3UL,j) * x[j];
1302  }
1303  }
1304
// Blocks of 3 rows (same scheme).
1305  for( ; (i+3UL) <= M; i+=3UL )
1306  {
1307  const size_t jbegin( ( IsUpper<MT1>::value )
1308  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1309  :( 0UL ) );
1310  const size_t jend( ( IsLower<MT1>::value )
1311  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1312  :( N ) );
1313  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1314
1315  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1316  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1317
1318  IntrinsicType xmm1, xmm2, xmm3;
1319  size_t j( jbegin );
1320
1321  for( ; j<jpos; j+=IT::size ) {
1322  const IntrinsicType x1( x.load(j) );
1323  xmm1 = xmm1 + A.load(i ,j) * x1;
1324  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1325  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1326  }
1327
1328  y[i ] += sum( xmm1 );
1329  y[i+1UL] += sum( xmm2 );
1330  y[i+2UL] += sum( xmm3 );
1331
1332  for( ; remainder && j<jend; ++j ) {
1333  y[i ] += A(i ,j) * x[j];
1334  y[i+1UL] += A(i+1UL,j) * x[j];
1335  y[i+2UL] += A(i+2UL,j) * x[j];
1336  }
1337  }
1338
// Blocks of 2 rows (same scheme).
1339  for( ; (i+2UL) <= M; i+=2UL )
1340  {
1341  const size_t jbegin( ( IsUpper<MT1>::value )
1342  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1343  :( 0UL ) );
1344  const size_t jend( ( IsLower<MT1>::value )
1345  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1346  :( N ) );
1347  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1348
1349  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1350  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1351
1352  IntrinsicType xmm1, xmm2;
1353  size_t j( jbegin );
1354
1355  for( ; j<jpos; j+=IT::size ) {
1356  const IntrinsicType x1( x.load(j) );
1357  xmm1 = xmm1 + A.load(i ,j) * x1;
1358  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1359  }
1360
1361  y[i ] += sum( xmm1 );
1362  y[i+1UL] += sum( xmm2 );
1363
1364  for( ; remainder && j<jend; ++j ) {
1365  y[i ] += A(i ,j) * x[j];
1366  y[i+1UL] += A(i+1UL,j) * x[j];
1367  }
1368  }
1369
// Single trailing row, if any is left.
1370  if( i < M )
1371  {
1372  const size_t jbegin( ( IsUpper<MT1>::value )
1373  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1374  :( 0UL ) );
1375  const size_t jend( ( IsLower<MT1>::value )
1376  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1377  :( N ) );
1378  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1379
1380  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1381  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1382
1383  IntrinsicType xmm1;
1384  size_t j( jbegin );
1385
1386  for( ; j<jpos; j+=IT::size ) {
1387  xmm1 = xmm1 + A.load(i,j) * x.load(j);
1388  }
1389
1390  y[i] += sum( xmm1 );
1391
1392  for( ; remainder && j<jend; ++j ) {
1393  y[i] += A(i,j) * x[j];
1394  }
1395  }
1396  }
1398  //**********************************************************************************************
1399 
1400  //**Default addition assignment to dense vectors (large matrices)*******************************
// Large-matrix addition assignment kernel for operand combinations that do not use the
// vectorized kernel (selected through DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >).
// It forwards the operands unchanged to selectDefaultAddAssignKernel().
//   y  Target vector.
//   A  Left-hand side matrix operand.
//   x  Right-hand side vector operand.
1414  template< typename VT1 // Type of the left-hand side target vector
1415  , typename MT1 // Type of the left-hand side matrix operand
1416  , typename VT2 > // Type of the right-hand side vector operand
1417  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1418  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1419  {
1420  selectDefaultAddAssignKernel( y, A, x );
1421  }
1423  //**********************************************************************************************
1424 
1425  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized addition assignment kernel for large matrices: computes y += A * x using
// intrinsic loads. Enabled through EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >.
//
// The rows of A are consumed in blocks of 8, 4 and 2 rows, followed by a single trailing row.
// Within each block the column range [jbegin,jend) is derived from the
// IsUpper/IsLower/IsStrictly* traits of MT1 and traversed in three vectorized stages that
// advance j by 4*IT::size, 2*IT::size and IT::size. In contrast to keeping accumulators
// across iterations, every stage reduces its partial products with sum() and adds the result
// directly to y. If 'remainder' is set, the columns [jpos,jend) are finished with scalar
// element access.
//
//   y  Target vector; its elements are updated in place via +=.
//   A  Left-hand side matrix operand.
//   x  Right-hand side vector operand.
1439  template< typename VT1 // Type of the left-hand side target vector
1440  , typename MT1 // Type of the left-hand side matrix operand
1441  , typename VT2 > // Type of the right-hand side vector operand
1442  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1443  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1444  {
1445  typedef IntrinsicTrait<ElementType> IT;
1446
1447  const size_t M( A.rows() );
1448  const size_t N( A.columns() );
1449
// A scalar tail loop is only required if A or x is not padded.
1450  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1451
1452  size_t i( 0UL );
1453
// Blocks of 8 rows.
1454  for( ; (i+8UL) <= M; i+=8UL )
1455  {
// Upper matrices: start at the diagonal column of the first row of the block (one column
// later for strictly upper matrices). The index is masked with -IT::size, which rounds it
// down to a multiple of IT::size (assuming IT::size is a power of two).
1456  const size_t jbegin( ( IsUpper<MT1>::value )
1457  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1458  :( 0UL ) );
// Lower matrices: stop behind the diagonal column of the last row of the block (one column
// earlier for strictly lower matrices).
1459  const size_t jend( ( IsLower<MT1>::value )
1460  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1461  :( N ) );
1462  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1463
// End of the vectorized part: jend masked with -IT::size if a remainder loop exists,
// otherwise jend itself.
1464  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1465  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1466
1467  size_t j( jbegin );
1468
// Stage 1: four intrinsic chunks of x per iteration.
1469  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
1470  const size_t j1( j+IT::size );
1471  const size_t j2( j+IT::size*2UL );
1472  const size_t j3( j+IT::size*3UL );
1473  const IntrinsicType x1( x.load(j ) );
1474  const IntrinsicType x2( x.load(j1) );
1475  const IntrinsicType x3( x.load(j2) );
1476  const IntrinsicType x4( x.load(j3) );
1477  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1478  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1479  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1480  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1481  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1482  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1483  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1484  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
1485  }
1486
// Stage 2: two intrinsic chunks of x per iteration.
1487  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
1488  const size_t j1( j+IT::size );
1489  const IntrinsicType x1( x.load(j ) );
1490  const IntrinsicType x2( x.load(j1) );
1491  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1492  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1493  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1494  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1495  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1496  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1497  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1498  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
1499  }
1500
// Stage 3: one intrinsic chunk of x per iteration.
1501  for( ; j<jpos; j+=IT::size ) {
1502  const IntrinsicType x1( x.load(j) );
1503  y[i ] += sum( A.load(i ,j) * x1 );
1504  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1505  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1506  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1507  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
1508  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
1509  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
1510  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
1511  }
1512
// Scalar handling of the remaining columns [jpos,jend).
1513  for( ; remainder && j<jend; ++j ) {
1514  y[i ] += A(i ,j) * x[j];
1515  y[i+1UL] += A(i+1UL,j) * x[j];
1516  y[i+2UL] += A(i+2UL,j) * x[j];
1517  y[i+3UL] += A(i+3UL,j) * x[j];
1518  y[i+4UL] += A(i+4UL,j) * x[j];
1519  y[i+5UL] += A(i+5UL,j) * x[j];
1520  y[i+6UL] += A(i+6UL,j) * x[j];
1521  y[i+7UL] += A(i+7UL,j) * x[j];
1522  }
1523  }
1524
// Blocks of 4 rows (same three stages plus scalar tail).
1525  for( ; (i+4UL) <= M; i+=4UL )
1526  {
1527  const size_t jbegin( ( IsUpper<MT1>::value )
1528  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1529  :( 0UL ) );
1530  const size_t jend( ( IsLower<MT1>::value )
1531  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1532  :( N ) );
1533  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1534
1535  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1536  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1537
1538  size_t j( jbegin );
1539
1540  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
1541  const size_t j1( j+IT::size );
1542  const size_t j2( j+IT::size*2UL );
1543  const size_t j3( j+IT::size*3UL );
1544  const IntrinsicType x1( x.load(j ) );
1545  const IntrinsicType x2( x.load(j1) );
1546  const IntrinsicType x3( x.load(j2) );
1547  const IntrinsicType x4( x.load(j3) );
1548  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1549  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1550  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1551  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1552  }
1553
1554  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
1555  const size_t j1( j+IT::size );
1556  const IntrinsicType x1( x.load(j ) );
1557  const IntrinsicType x2( x.load(j1) );
1558  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1559  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1560  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1561  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1562  }
1563
1564  for( ; j<jpos; j+=IT::size ) {
1565  const IntrinsicType x1( x.load(j) );
1566  y[i ] += sum( A.load(i ,j) * x1 );
1567  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1568  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
1569  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
1570  }
1571
1572  for( ; remainder && j<jend; ++j ) {
1573  y[i ] += A(i ,j) * x[j];
1574  y[i+1UL] += A(i+1UL,j) * x[j];
1575  y[i+2UL] += A(i+2UL,j) * x[j];
1576  y[i+3UL] += A(i+3UL,j) * x[j];
1577  }
1578  }
1579
// Blocks of 2 rows (same three stages plus scalar tail).
1580  for( ; (i+2UL) <= M; i+=2UL )
1581  {
1582  const size_t jbegin( ( IsUpper<MT1>::value )
1583  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1584  :( 0UL ) );
1585  const size_t jend( ( IsLower<MT1>::value )
1586  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1587  :( N ) );
1588  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1589
1590  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1591  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1592
1593  size_t j( jbegin );
1594
1595  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
1596  const size_t j1( j+IT::size );
1597  const size_t j2( j+IT::size*2UL );
1598  const size_t j3( j+IT::size*3UL );
1599  const IntrinsicType x1( x.load(j ) );
1600  const IntrinsicType x2( x.load(j1) );
1601  const IntrinsicType x3( x.load(j2) );
1602  const IntrinsicType x4( x.load(j3) );
1603  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1604  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1605  }
1606
1607  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
1608  const size_t j1( j+IT::size );
1609  const IntrinsicType x1( x.load(j ) );
1610  const IntrinsicType x2( x.load(j1) );
1611  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1612  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1613  }
1614
1615  for( ; j<jpos; j+=IT::size ) {
1616  const IntrinsicType x1( x.load(j) );
1617  y[i ] += sum( A.load(i ,j) * x1 );
1618  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
1619  }
1620
1621  for( ; remainder && j<jend; ++j ) {
1622  y[i ] += A(i ,j) * x[j];
1623  y[i+1UL] += A(i+1UL,j) * x[j];
1624  }
1625  }
1626
// Single trailing row, if any is left.
1627  if( i < M )
1628  {
1629  const size_t jbegin( ( IsUpper<MT1>::value )
1630  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1631  :( 0UL ) );
1632  const size_t jend( ( IsLower<MT1>::value )
1633  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1634  :( N ) );
1635  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1636
1637  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1638  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1639
1640  size_t j( jbegin );
1641
1642  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
1643  const size_t j1( j+IT::size );
1644  const size_t j2( j+IT::size*2UL );
1645  const size_t j3( j+IT::size*3UL );
1646  const IntrinsicType x1( x.load(j ) );
1647  const IntrinsicType x2( x.load(j1) );
1648  const IntrinsicType x3( x.load(j2) );
1649  const IntrinsicType x4( x.load(j3) );
1650  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1651  }
1652
1653  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
1654  const size_t j1( j+IT::size );
1655  const IntrinsicType x1( x.load(j ) );
1656  const IntrinsicType x2( x.load(j1) );
1657  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1658  }
1659
1660  for( ; j<jpos; j+=IT::size ) {
1661  const IntrinsicType x1( x.load(j) );
1662  y[i] += sum( A.load(i,j) * x1 );
1663  }
1664
1665  for( ; remainder && j<jend; ++j ) {
1666  y[i] += A(i,j) * x[j];
1667  }
1668  }
1669  }
1671  //**********************************************************************************************
1672 
1673  //**BLAS-based addition assignment to dense vectors (default)***********************************
// Fallback overload of the BLAS-based addition assignment kernel. It is selected through
// DisableIf< UseBlasKernel<VT1,MT1,VT2> > and performs no BLAS call itself: the three
// operands are forwarded unchanged to selectLargeAddAssignKernel().
//   y  Target vector.
//   A  Left-hand side matrix operand.
//   x  Right-hand side vector operand.
1687  template< typename VT1 // Type of the left-hand side target vector
1688  , typename MT1 // Type of the left-hand side matrix operand
1689  , typename VT2 > // Type of the right-hand side vector operand
1690  static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
1691  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1692  {
1693  selectLargeAddAssignKernel( y, A, x );
1694  }
1696  //**********************************************************************************************
1697 
1698  //**BLAS-based addition assignment to dense vectors*********************************************
1699 #if BLAZE_BLAS_MODE
1700 
1713  template< typename VT1 // Type of the left-hand side target vector
1714  , typename MT1 // Type of the left-hand side matrix operand
1715  , typename VT2 > // Type of the right-hand side vector operand
1716  static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
1717  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1718  {
1719  typedef typename VT1::ElementType ET;
1720 
1721  if( IsTriangular<MT1>::value ) {
1722  typename VT1::ResultType tmp( serial( x ) );
1723  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1724  addAssign( y, tmp );
1725  }
1726  else {
1727  gemv( y, A, x, ET(1), ET(1) );
1728  }
1729  }
1731 #endif
1732  //**********************************************************************************************
1733 
1734  //**Addition assignment to sparse vectors*******************************************************
1735  // No special implementation for the addition assignment to sparse vectors.
1736  //**********************************************************************************************
1737 
1738  //**Subtraction assignment to dense vectors*****************************************************
1751  template< typename VT1 > // Type of the target dense vector
1752  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
1753  {
1755 
1756  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1757 
1758  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1759  return;
1760  }
1761 
1762  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1763  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1764 
1765  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1766  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1767  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1768  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1769 
1770  DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1771  }
1773  //**********************************************************************************************
1774 
1775  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1786  template< typename VT1 // Type of the left-hand side target vector
1787  , typename MT1 // Type of the left-hand side matrix operand
1788  , typename VT2 > // Type of the right-hand side vector operand
1789  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1790  {
1791  if( ( IsDiagonal<MT1>::value ) ||
1792  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1793  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1794  selectSmallSubAssignKernel( y, A, x );
1795  else
1796  selectBlasSubAssignKernel( y, A, x );
1797  }
1799  //**********************************************************************************************
1800 
1801  //**Default subtraction assignment to dense vectors*********************************************
// Default (non-vectorized) subtraction assignment kernel: builds the expression A * x and
// hands it to the subAssign() member function of the target vector y.
//   y  Target vector.
//   A  Left-hand side matrix operand.
//   x  Right-hand side vector operand.
1815  template< typename VT1 // Type of the left-hand side target vector
1816  , typename MT1 // Type of the left-hand side matrix operand
1817  , typename VT2 > // Type of the right-hand side vector operand
1818  static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1819  {
1820  y.subAssign( A * x );
1821  }
1823  //**********************************************************************************************
1824 
1825  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Small-matrix subtraction assignment kernel for operand combinations that do not use the
// vectorized kernel (selected through DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >).
// It forwards the operands unchanged to selectDefaultSubAssignKernel().
//   y  Target vector.
//   A  Left-hand side matrix operand.
//   x  Right-hand side vector operand.
1839  template< typename VT1 // Type of the left-hand side target vector
1840  , typename MT1 // Type of the left-hand side matrix operand
1841  , typename VT2 > // Type of the right-hand side vector operand
1842  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1843  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1844  {
1845  selectDefaultSubAssignKernel( y, A, x );
1846  }
1848  //**********************************************************************************************
1849 
1850  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized subtraction assignment kernel for small matrices: computes y -= A * x using
// intrinsic loads. Enabled through EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >.
//
// The rows of A are consumed in blocks of 8, 4, 3 and 2 rows, followed by a single trailing
// row. Every block follows the same scheme:
//   1. Determine the column range [jbegin,jend) from the IsUpper/IsLower/IsStrictly* traits
//      of MT1 (the full range [0,N) is used when neither trait applies).
//   2. Accumulate A.load(row,j) * x.load(j) into one intrinsic accumulator per row while
//      stepping j by IT::size up to jpos.
//   3. Reduce each accumulator with sum() and subtract it from the matching element of y.
//   4. If 'remainder' is set, finish the columns [jpos,jend) with scalar element access.
//
//   y  Target vector; its elements are updated in place via -=.
//   A  Left-hand side matrix operand.
//   x  Right-hand side vector operand.
1864  template< typename VT1 // Type of the left-hand side target vector
1865  , typename MT1 // Type of the left-hand side matrix operand
1866  , typename VT2 > // Type of the right-hand side vector operand
1867  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1868  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1869  {
1870  typedef IntrinsicTrait<ElementType> IT;
1871
1872  const size_t M( A.rows() );
1873  const size_t N( A.columns() );
1874
// A scalar tail loop is only required if A or x is not padded.
1875  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1876
1877  size_t i( 0UL );
1878
// Blocks of 8 rows.
1879  for( ; (i+8UL) <= M; i+=8UL )
1880  {
// Upper matrices: start at the diagonal column of the first row of the block (one column
// later for strictly upper matrices). The index is masked with -IT::size, which rounds it
// down to a multiple of IT::size (assuming IT::size is a power of two).
1881  const size_t jbegin( ( IsUpper<MT1>::value )
1882  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1883  :( 0UL ) );
// Lower matrices: stop behind the diagonal column of the last row of the block (one column
// earlier for strictly lower matrices).
1884  const size_t jend( ( IsLower<MT1>::value )
1885  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1886  :( N ) );
1887  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1888
// End of the vectorized part: jend masked with -IT::size if a remainder loop exists,
// otherwise jend itself.
1889  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1890  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1891
// NOTE(review): the accumulators are default-constructed and immediately added to; this
// presumably relies on the default constructor of IntrinsicType producing zeros - confirm.
1892  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1893  size_t j( jbegin );
1894
// Vectorized part: each chunk of x is loaded once and shared by all 8 rows.
1895  for( ; j<jpos; j+=IT::size ) {
1896  const IntrinsicType x1( x.load(j) );
1897  xmm1 = xmm1 + A.load(i ,j) * x1;
1898  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1899  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1900  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1901  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
1902  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
1903  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
1904  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
1905  }
1906
// Horizontal reduction of the accumulators, subtracted from the target elements.
1907  y[i ] -= sum( xmm1 );
1908  y[i+1UL] -= sum( xmm2 );
1909  y[i+2UL] -= sum( xmm3 );
1910  y[i+3UL] -= sum( xmm4 );
1911  y[i+4UL] -= sum( xmm5 );
1912  y[i+5UL] -= sum( xmm6 );
1913  y[i+6UL] -= sum( xmm7 );
1914  y[i+7UL] -= sum( xmm8 );
1915
// Scalar handling of the remaining columns [jpos,jend).
1916  for( ; remainder && j<jend; ++j ) {
1917  y[i ] -= A(i ,j) * x[j];
1918  y[i+1UL] -= A(i+1UL,j) * x[j];
1919  y[i+2UL] -= A(i+2UL,j) * x[j];
1920  y[i+3UL] -= A(i+3UL,j) * x[j];
1921  y[i+4UL] -= A(i+4UL,j) * x[j];
1922  y[i+5UL] -= A(i+5UL,j) * x[j];
1923  y[i+6UL] -= A(i+6UL,j) * x[j];
1924  y[i+7UL] -= A(i+7UL,j) * x[j];
1925  }
1926  }
1927
// Blocks of 4 rows (same scheme as the 8-row block).
1928  for( ; (i+4UL) <= M; i+=4UL )
1929  {
1930  const size_t jbegin( ( IsUpper<MT1>::value )
1931  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1932  :( 0UL ) );
1933  const size_t jend( ( IsLower<MT1>::value )
1934  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1935  :( N ) );
1936  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1937
1938  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1939  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1940
1941  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1942  size_t j( jbegin );
1943
1944  for( ; j<jpos; j+=IT::size ) {
1945  const IntrinsicType x1( x.load(j) );
1946  xmm1 = xmm1 + A.load(i ,j) * x1;
1947  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1948  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1949  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1950  }
1951
1952  y[i ] -= sum( xmm1 );
1953  y[i+1UL] -= sum( xmm2 );
1954  y[i+2UL] -= sum( xmm3 );
1955  y[i+3UL] -= sum( xmm4 );
1956
1957  for( ; remainder && j<jend; ++j ) {
1958  y[i ] -= A(i ,j) * x[j];
1959  y[i+1UL] -= A(i+1UL,j) * x[j];
1960  y[i+2UL] -= A(i+2UL,j) * x[j];
1961  y[i+3UL] -= A(i+3UL,j) * x[j];
1962  }
1963  }
1964
// Blocks of 3 rows (same scheme).
1965  for( ; (i+3UL) <= M; i+=3UL )
1966  {
1967  const size_t jbegin( ( IsUpper<MT1>::value )
1968  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
1969  :( 0UL ) );
1970  const size_t jend( ( IsLower<MT1>::value )
1971  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1972  :( N ) );
1973  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1974
1975  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
1976  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
1977
1978  IntrinsicType xmm1, xmm2, xmm3;
1979  size_t j( jbegin );
1980
1981  for( ; j<jpos; j+=IT::size ) {
1982  const IntrinsicType x1( x.load(j) );
1983  xmm1 = xmm1 + A.load(i ,j) * x1;
1984  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1985  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1986  }
1987
1988  y[i ] -= sum( xmm1 );
1989  y[i+1UL] -= sum( xmm2 );
1990  y[i+2UL] -= sum( xmm3 );
1991
1992  for( ; remainder && j<jend; ++j ) {
1993  y[i ] -= A(i ,j) * x[j];
1994  y[i+1UL] -= A(i+1UL,j) * x[j];
1995  y[i+2UL] -= A(i+2UL,j) * x[j];
1996  }
1997  }
1998
// Blocks of 2 rows (same scheme).
1999  for( ; (i+2UL) <= M; i+=2UL )
2000  {
2001  const size_t jbegin( ( IsUpper<MT1>::value )
2002  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
2003  :( 0UL ) );
2004  const size_t jend( ( IsLower<MT1>::value )
2005  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
2006  :( N ) );
2007  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2008
2009  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
2010  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
2011
2012  IntrinsicType xmm1, xmm2;
2013  size_t j( jbegin );
2014
2015  for( ; j<jpos; j+=IT::size ) {
2016  const IntrinsicType x1( x.load(j) );
2017  xmm1 = xmm1 + A.load(i ,j) * x1;
2018  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2019  }
2020
2021  y[i ] -= sum( xmm1 );
2022  y[i+1UL] -= sum( xmm2 );
2023
2024  for( ; remainder && j<jend; ++j ) {
2025  y[i ] -= A(i ,j) * x[j];
2026  y[i+1UL] -= A(i+1UL,j) * x[j];
2027  }
2028  }
2029
// Single trailing row, if any is left.
2030  if( i < M )
2031  {
2032  const size_t jbegin( ( IsUpper<MT1>::value )
2033  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
2034  :( 0UL ) );
2035  const size_t jend( ( IsLower<MT1>::value )
2036  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2037  :( N ) );
2038  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2039
2040  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
2041  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
2042
2043  IntrinsicType xmm1;
2044  size_t j( jbegin );
2045
2046  for( ; j<jpos; j+=IT::size ) {
2047  xmm1 = xmm1 + A.load(i,j) * x.load(j);
2048  }
2049
2050  y[i] -= sum( xmm1 );
2051
2052  for( ; remainder && j<jend; ++j ) {
2053  y[i] -= A(i,j) * x[j];
2054  }
2055  }
2056  }
2058  //**********************************************************************************************
2059 
2060  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Fallback overload of the large-matrix subtraction assignment kernel (y -= A * x).
// DisableIf removes this overload whenever UseVectorizedDefaultKernel<VT1,MT1,VT2>
// applies; when it is selected it only forwards to the default subtraction kernel.
2074  template< typename VT1 // Type of the left-hand side target vector
2075  , typename MT1 // Type of the left-hand side matrix operand
2076  , typename VT2 > // Type of the right-hand side vector operand
2077  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
2078  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2079  {
2080  selectDefaultSubAssignKernel( y, A, x );
2081  }
2083  //**********************************************************************************************
2084 
2085  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized large-matrix subtraction assignment kernel: y -= A * x.
// Enabled (EnableIf) only when UseVectorizedDefaultKernel<VT1,MT1,VT2> applies.
//
// Rows of A are processed in blocks of 8, then 4, then 2, then a single trailing row.
// Within each row block the columns are processed in chunks of 4*IT::size, 2*IT::size
// and IT::size elements via load(), followed by an element-wise loop for the remaining
// columns. Every chunk is reduced with sum() and subtracted from y immediately.
//
// y : target vector that is updated in place
// A : left-hand side matrix operand
// x : right-hand side vector operand
2099  template< typename VT1 // Type of the left-hand side target vector
2100  , typename MT1 // Type of the left-hand side matrix operand
2101  , typename VT2 > // Type of the right-hand side vector operand
2102  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
2103  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2104  {
2105  typedef IntrinsicTrait<ElementType> IT;
2106 
2107  const size_t M( A.rows() );
2108  const size_t N( A.columns() );
2109 
      // A scalar remainder loop is only needed if at least one operand is not padded.
      // NOTE(review): with both operands padded the vector loops run up to jend itself,
      // which presumably relies on the padding elements being safe to load - confirm.
2110  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
2111 
2112  size_t i( 0UL );
2113 
      // Row blocks of 8
2114  for( ; (i+8UL) <= M; i+=8UL )
2115  {
      // First column: for IsUpper matrices start at the block's first row index (one past
      // it if strictly upper), masked with size_t(-IT::size); otherwise start at column 0.
      // NOTE(review): the mask rounds down to a multiple of IT::size only if IT::size is
      // a power of two - confirm.
2116  const size_t jbegin( ( IsUpper<MT1>::value )
2117  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
2118  :( 0UL ) );
      // One-past-last column: for IsLower matrices bounded by the last row of the block
      // (i+7, exclusive, if strictly lower; i+8 otherwise); otherwise all N columns.
2119  const size_t jend( ( IsLower<MT1>::value )
2120  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
2121  :( N ) );
2122  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2123 
      // End of the vectorized range: jend masked the same way when a remainder loop is used.
2124  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
2125  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
2126 
2127  size_t j( jbegin );
2128 
      // Four vector loads of x per iteration, shared by all eight rows
2129  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
2130  const size_t j1( j+IT::size );
2131  const size_t j2( j+IT::size*2UL );
2132  const size_t j3( j+IT::size*3UL );
2133  const IntrinsicType x1( x.load(j ) );
2134  const IntrinsicType x2( x.load(j1) );
2135  const IntrinsicType x3( x.load(j2) );
2136  const IntrinsicType x4( x.load(j3) );
2137  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2138  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2139  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2140  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2141  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2142  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2143  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2144  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
2145  }
2146 
      // Two vector loads of x per iteration
2147  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
2148  const size_t j1( j+IT::size );
2149  const IntrinsicType x1( x.load(j ) );
2150  const IntrinsicType x2( x.load(j1) );
2151  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2152  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2153  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2154  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2155  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2156  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2157  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2158  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
2159  }
2160 
      // Single vector load of x per iteration
2161  for( ; j<jpos; j+=IT::size ) {
2162  const IntrinsicType x1( x.load(j) );
2163  y[i ] -= sum( A.load(i ,j) * x1 );
2164  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2165  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2166  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2167  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 );
2168  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 );
2169  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 );
2170  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 );
2171  }
2172 
      // Element-wise handling of the columns in [jpos,jend); skipped when no remainder is needed
2173  for( ; remainder && j<jend; ++j ) {
2174  y[i ] -= A(i ,j) * x[j];
2175  y[i+1UL] -= A(i+1UL,j) * x[j];
2176  y[i+2UL] -= A(i+2UL,j) * x[j];
2177  y[i+3UL] -= A(i+3UL,j) * x[j];
2178  y[i+4UL] -= A(i+4UL,j) * x[j];
2179  y[i+5UL] -= A(i+5UL,j) * x[j];
2180  y[i+6UL] -= A(i+6UL,j) * x[j];
2181  y[i+7UL] -= A(i+7UL,j) * x[j];
2182  }
2183  }
2184 
      // Row blocks of 4 (same column scheme as above; lower-matrix bound is i+3 / i+4)
2185  for( ; (i+4UL) <= M; i+=4UL )
2186  {
2187  const size_t jbegin( ( IsUpper<MT1>::value )
2188  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
2189  :( 0UL ) );
2190  const size_t jend( ( IsLower<MT1>::value )
2191  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
2192  :( N ) );
2193  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2194 
2195  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
2196  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
2197 
2198  size_t j( jbegin );
2199 
2200  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
2201  const size_t j1( j+IT::size );
2202  const size_t j2( j+IT::size*2UL );
2203  const size_t j3( j+IT::size*3UL );
2204  const IntrinsicType x1( x.load(j ) );
2205  const IntrinsicType x2( x.load(j1) );
2206  const IntrinsicType x3( x.load(j2) );
2207  const IntrinsicType x4( x.load(j3) );
2208  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2209  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2210  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2211  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2212  }
2213 
2214  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
2215  const size_t j1( j+IT::size );
2216  const IntrinsicType x1( x.load(j ) );
2217  const IntrinsicType x2( x.load(j1) );
2218  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2219  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2220  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2221  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2222  }
2223 
2224  for( ; j<jpos; j+=IT::size ) {
2225  const IntrinsicType x1( x.load(j) );
2226  y[i ] -= sum( A.load(i ,j) * x1 );
2227  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2228  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 );
2229  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 );
2230  }
2231 
2232  for( ; remainder && j<jend; ++j ) {
2233  y[i ] -= A(i ,j) * x[j];
2234  y[i+1UL] -= A(i+1UL,j) * x[j];
2235  y[i+2UL] -= A(i+2UL,j) * x[j];
2236  y[i+3UL] -= A(i+3UL,j) * x[j];
2237  }
2238  }
2239 
      // Row blocks of 2 (lower-matrix bound is i+1 / i+2)
2240  for( ; (i+2UL) <= M; i+=2UL )
2241  {
2242  const size_t jbegin( ( IsUpper<MT1>::value )
2243  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
2244  :( 0UL ) );
2245  const size_t jend( ( IsLower<MT1>::value )
2246  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
2247  :( N ) );
2248  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2249 
2250  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
2251  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
2252 
2253  size_t j( jbegin );
2254 
2255  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
2256  const size_t j1( j+IT::size );
2257  const size_t j2( j+IT::size*2UL );
2258  const size_t j3( j+IT::size*3UL );
2259  const IntrinsicType x1( x.load(j ) );
2260  const IntrinsicType x2( x.load(j1) );
2261  const IntrinsicType x3( x.load(j2) );
2262  const IntrinsicType x4( x.load(j3) );
2263  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2264  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2265  }
2266 
2267  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
2268  const size_t j1( j+IT::size );
2269  const IntrinsicType x1( x.load(j ) );
2270  const IntrinsicType x2( x.load(j1) );
2271  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2272  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2273  }
2274 
2275  for( ; j<jpos; j+=IT::size ) {
2276  const IntrinsicType x1( x.load(j) );
2277  y[i ] -= sum( A.load(i ,j) * x1 );
2278  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 );
2279  }
2280 
2281  for( ; remainder && j<jend; ++j ) {
2282  y[i ] -= A(i ,j) * x[j];
2283  y[i+1UL] -= A(i+1UL,j) * x[j];
2284  }
2285  }
2286 
      // At most one row is left at this point (lower-matrix bound is i / i+1)
2287  if( i < M )
2288  {
2289  const size_t jbegin( ( IsUpper<MT1>::value )
2290  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
2291  :( 0UL ) );
2292  const size_t jend( ( IsLower<MT1>::value )
2293  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2294  :( N ) );
2295  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2296 
2297  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
2298  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
2299 
2300  size_t j( jbegin );
2301 
2302  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
2303  const size_t j1( j+IT::size );
2304  const size_t j2( j+IT::size*2UL );
2305  const size_t j3( j+IT::size*3UL );
2306  const IntrinsicType x1( x.load(j ) );
2307  const IntrinsicType x2( x.load(j1) );
2308  const IntrinsicType x3( x.load(j2) );
2309  const IntrinsicType x4( x.load(j3) );
2310  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2311  }
2312 
2313  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
2314  const size_t j1( j+IT::size );
2315  const IntrinsicType x1( x.load(j ) );
2316  const IntrinsicType x2( x.load(j1) );
2317  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2318  }
2319 
2320  for( ; j<jpos; j+=IT::size ) {
2321  const IntrinsicType x1( x.load(j) );
2322  y[i] -= sum( A.load(i,j) * x1 );
2323  }
2324 
2325  for( ; remainder && j<jend; ++j ) {
2326  y[i] -= A(i,j) * x[j];
2327  }
2328  }
2329  }
2331  //**********************************************************************************************
2332 
2333  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback overload of the BLAS-based subtraction assignment kernel (y -= A * x).
// DisableIf removes this overload whenever UseBlasKernel<VT1,MT1,VT2> applies; when it
// is selected the work is delegated to the large-matrix default kernel.
2347  template< typename VT1 // Type of the left-hand side target vector
2348  , typename MT1 // Type of the left-hand side matrix operand
2349  , typename VT2 > // Type of the right-hand side vector operand
2350  static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
2351  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2352  {
2353  selectLargeSubAssignKernel( y, A, x );
2354  }
2356  //**********************************************************************************************
2357 
2358  //**BLAS-based subtraction assignment to dense vectors******************************************
2359 #if BLAZE_BLAS_MODE
2360 
// BLAS-based subtraction assignment kernel (y -= A * x).
// Enabled (EnableIf) only when UseBlasKernel<VT1,MT1,VT2> applies, and compiled only
// inside the surrounding BLAZE_BLAS_MODE preprocessor section.
2373  template< typename VT1 // Type of the left-hand side target vector
2374  , typename MT1 // Type of the left-hand side matrix operand
2375  , typename VT2 > // Type of the right-hand side vector operand
2376  static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
2377  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2378  {
2379  typedef typename VT1::ElementType ET;
2380 
2381  if( IsTriangular<MT1>::value ) {
      // Triangular case: work on a serially evaluated copy of x, then subtract it from y.
      // NOTE(review): trmv() presumably overwrites tmp with A * tmp - confirm against the
      // trmv wrapper.
2382  typename VT1::ResultType tmp( serial( x ) );
2383  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2384  subAssign( y, tmp );
2385  }
2386  else {
      // General case: a single gemv call with the scalar factors ET(-1) and ET(1).
      // NOTE(review): presumably computes y = (-1)*A*x + (1)*y - confirm the argument
      // order of the gemv wrapper.
2387  gemv( y, A, x, ET(-1), ET(1) );
2388  }
2389  }
2391 #endif
2392  //**********************************************************************************************
2393 
2394  //**Subtraction assignment to sparse vectors****************************************************
2395  // No special implementation for the subtraction assignment to sparse vectors.
2396  //**********************************************************************************************
2397 
2398  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of a dense matrix/dense vector product to a dense vector.
// The product expression is first evaluated serially into a temporary of type
// ResultType, which is then multiplication-assigned to the target vector.
//
// lhs : target dense vector
// rhs : matrix/vector multiplication expression to be evaluated
2411  template< typename VT1 > // Type of the target dense vector
2412  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2413  {
2415 
2419 
2420  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2421 
2422  const ResultType tmp( serial( rhs ) );
2423  multAssign( ~lhs, tmp );
2424  }
2426  //**********************************************************************************************
2427 
2428  //**Multiplication assignment to sparse vectors*************************************************
2429  // No special implementation for the multiplication assignment to sparse vectors.
2430  //**********************************************************************************************
2431 
2432  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of a dense matrix/dense vector product to a dense vector.
// Participates in overload resolution only when UseSMPAssign<VT1> applies (EnableIf).
//
// lhs : target dense vector
// rhs : matrix/vector multiplication expression to be assigned
2447  template< typename VT1 > // Type of the target dense vector
2448  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2449  smpAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2450  {
2452 
2453  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2454 
      // No rows: nothing to assign. No columns: the product has no terms, so the
      // target is reset instead.
2455  if( rhs.mat_.rows() == 0UL ) {
2456  return;
2457  }
2458  else if( rhs.mat_.columns() == 0UL ) {
2459  reset( ~lhs );
2460  return;
2461  }
2462 
2463  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2464  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2465 
2466  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2467  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2468  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2469  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2470 
      // Re-dispatch the SMP assignment on the product of the evaluated operands
2471  smpAssign( ~lhs, A * x );
2472  }
2474  //**********************************************************************************************
2475 
2476  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of a dense matrix/dense vector product to a sparse vector.
// Participates in overload resolution only when UseSMPAssign<VT1> applies (EnableIf).
// The expression is evaluated into a temporary of type ResultType, which is then
// SMP-assigned to the target.
//
// lhs : target sparse vector
// rhs : matrix/vector multiplication expression to be assigned
2491  template< typename VT1 > // Type of the target sparse vector
2492  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2493  smpAssign( SparseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2494  {
2496 
2500 
2501  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2502 
2503  const ResultType tmp( rhs );
2504  smpAssign( ~lhs, tmp );
2505  }
2507  //**********************************************************************************************
2508 
2509  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of a dense matrix/dense vector product to a dense vector.
// Participates in overload resolution only when UseSMPAssign<VT1> applies (EnableIf).
//
// lhs : target dense vector
// rhs : matrix/vector multiplication expression to be added
2524  template< typename VT1 > // Type of the target dense vector
2525  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2526  smpAddAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2527  {
2529 
2530  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2531 
      // An empty matrix contributes nothing, so the target stays untouched
2532  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2533  return;
2534  }
2535 
2536  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2537  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2538 
2539  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2540  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2541  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2542  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2543 
      // Re-dispatch the SMP addition assignment on the product of the evaluated operands
2544  smpAddAssign( ~lhs, A * x );
2545  }
2547  //**********************************************************************************************
2548 
2549  //**SMP addition assignment to sparse vectors***************************************************
2550  // No special implementation for the SMP addition assignment to sparse vectors.
2551  //**********************************************************************************************
2552 
2553  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of a dense matrix/dense vector product to a dense vector.
// Participates in overload resolution only when UseSMPAssign<VT1> applies (EnableIf).
//
// lhs : target dense vector
// rhs : matrix/vector multiplication expression to be subtracted
2568  template< typename VT1 > // Type of the target dense vector
2569  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2570  smpSubAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2571  {
2573 
2574  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2575 
      // An empty matrix contributes nothing, so the target stays untouched
2576  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2577  return;
2578  }
2579 
2580  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2581  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2582 
2583  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2584  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2585  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2586  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2587 
      // Re-dispatch the SMP subtraction assignment on the product of the evaluated operands
2588  smpSubAssign( ~lhs, A * x );
2589  }
2591  //**********************************************************************************************
2592 
2593  //**SMP subtraction assignment to sparse vectors************************************************
2594  // No special implementation for the SMP subtraction assignment to sparse vectors.
2595  //**********************************************************************************************
2596 
2597  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of a dense matrix/dense vector product to a dense vector.
// Participates in overload resolution only when UseSMPAssign<VT1> applies (EnableIf).
// The expression is evaluated into a temporary of type ResultType, which is then
// SMP-multiplication-assigned to the target.
//
// lhs : target dense vector
// rhs : matrix/vector multiplication expression to be multiplied
2612  template< typename VT1 > // Type of the target dense vector
2613  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2614  smpMultAssign( DenseVector<VT1,false>& lhs, const DMatDVecMultExpr& rhs )
2615  {
2617 
2621 
2622  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2623 
2624  const ResultType tmp( rhs );
2625  smpMultAssign( ~lhs, tmp );
2626  }
2628  //**********************************************************************************************
2629 
2630  //**SMP multiplication assignment to sparse vectors*********************************************
2631  // No special implementation for the SMP multiplication assignment to sparse vectors.
2632  //**********************************************************************************************
2633 
2634  //**Compile time checks*************************************************************************
2642  //**********************************************************************************************
2643 };
2644 //*************************************************************************************************
2645 
2646 
2647 
2648 
2649 //=================================================================================================
2650 //
2651 // DVECSCALARMULTEXPR SPECIALIZATION
2652 //
2653 //=================================================================================================
2654 
2655 //*************************************************************************************************
2663 template< typename MT // Type of the left-hand side dense matrix
2664  , typename VT // Type of the right-hand side dense vector
2665  , typename ST > // Type of the scalar value
2666 class DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >
2667  : public DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false >
2668  , private VecScalarMultExpr
2669  , private Computation
2670 {
2671  private:
2672  //**Type definitions****************************************************************************
2673  typedef DMatDVecMultExpr<MT,VT> MVM; // Type of the wrapped matrix/vector multiplication expression
2674  typedef typename MVM::ResultType RES; // Result type of the matrix/vector multiplication expression
2675  typedef typename MT::ResultType MRT; // Result type of the matrix operand
2676  typedef typename VT::ResultType VRT; // Result type of the vector operand
2677  typedef typename MRT::ElementType MET; // Element type of the matrix operand's result type
2678  typedef typename VRT::ElementType VET; // Element type of the vector operand's result type
2679  typedef typename MT::CompositeType MCT; // Composite type of the matrix operand
2680  typedef typename VT::CompositeType VCT; // Composite type of the vector operand
2681  //**********************************************************************************************
2682 
2683  //**********************************************************************************************
// Compilation switch for the matrix operand: set when MT is a computation whose element
// type equals the vector's element type and is BLAS compatible, or when MT requires an
// evaluation. It selects the type LT used to hold the matrix operand in the kernels.
2685  enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2686  IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
2687  //**********************************************************************************************
2688 
2689  //**********************************************************************************************
// Compilation switch for the vector operand: set when VT is a computation or requires an
// evaluation. It selects the type RT used to hold the vector operand in the kernels.
// The second term tests the vector type VT; it previously tested the matrix type MT,
// which forced an unnecessary copy of the vector whenever only the matrix needed
// evaluation.
2691  enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2692  //**********************************************************************************************
2693 
2694  //**********************************************************************************************
2696 
// Helper for the SFINAE selection of the SMP assignment functions: its value is set
// whenever either operand has to be evaluated first. The template parameter T1 does not
// enter the condition.
2699  template< typename T1 >
2700  struct UseSMPAssign {
2701  enum { value = ( evaluateMatrix || evaluateVector ) };
2702  };
2703  //**********************************************************************************************
2704 
2705  //**********************************************************************************************
2707 
// Helper deciding whether a BLAS kernel may be used for target vector type T1, matrix
// type T2, vector type T3 and scalar type T4. All of the following must hold: BLAS mode
// is active, T1 offers mutable and T2/T3 constant data access, T2 is not diagonal, all
// three types are vectorizable, their element types are BLAS compatible and identical,
// and the combination "built-in element type with complex scalar" is excluded.
2709  template< typename T1, typename T2, typename T3, typename T4 >
2710  struct UseBlasKernel {
2711  enum { value = BLAZE_BLAS_MODE &&
2712  HasMutableDataAccess<T1>::value &&
2713  HasConstDataAccess<T2>::value &&
2714  HasConstDataAccess<T3>::value &&
2715  !IsDiagonal<T2>::value &&
2716  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2717  IsBlasCompatible<typename T1::ElementType>::value &&
2718  IsBlasCompatible<typename T2::ElementType>::value &&
2719  IsBlasCompatible<typename T3::ElementType>::value &&
2720  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
2721  IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
2722  !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
2723  };
2724  //**********************************************************************************************
2725 
2726  //**********************************************************************************************
2728 
// Helper deciding whether the vectorized default kernels may be used for target vector
// type T1, matrix type T2, vector type T3 and scalar type T4. Requires optimized kernels
// to be switched on, a non-diagonal matrix, three vectorizable operand types whose
// element types are identical to each other and to the scalar type, and intrinsic
// support for addition and multiplication of that element type.
2731  template< typename T1, typename T2, typename T3, typename T4 >
2732  struct UseVectorizedDefaultKernel {
2733  enum { value = useOptimizedKernels &&
2734  !IsDiagonal<T2>::value &&
2735  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2736  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2737  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2738  IsSame<typename T1::ElementType,T4>::value &&
2739  IntrinsicTrait<typename T1::ElementType>::addition &&
2740  IntrinsicTrait<typename T1::ElementType>::multiplication };
2741  };
2742  //**********************************************************************************************
2743 
2744  public:
2745  //**Type definitions****************************************************************************
2746  typedef DVecScalarMultExpr<MVM,ST,false> This; // Type of this expression specialization
2747  typedef typename MultTrait<RES,ST>::Type ResultType; // Result type obtained from MultTrait for RES and the scalar type
2748  typedef typename ResultType::TransposeType TransposeType; // Transpose type of the result type
2749  typedef typename ResultType::ElementType ElementType; // Element type of the result type
2750  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType; // Intrinsic type belonging to the element type
2751  typedef const ElementType ReturnType; // Return type of the element access functions
2752  typedef const ResultType CompositeType; // Composite type: the (constant) result type
2753 
      // Type of the left-hand side operand: the wrapped matrix/vector multiplication expression
2755  typedef const DMatDVecMultExpr<MT,VT> LeftOperand;
2756 
      // Type of the right-hand side operand: the scalar
2758  typedef ST RightOperand;
2759 
      // Type used for the matrix operand inside the kernels, chosen via evaluateMatrix
      // between the evaluated result type (const MRT) and the composite type (MCT)
2761  typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type LT;
2762 
      // Type used for the vector operand inside the kernels, chosen via evaluateVector
      // between the evaluated result type (const VRT) and the composite type (VCT)
2764  typedef typename SelectType< evaluateVector, const VRT, VCT >::Type RT;
2765  //**********************************************************************************************
2766 
2767  //**Compilation flags***************************************************************************
// Compilation flag for vectorization: requires a non-diagonal, vectorizable matrix type,
// a vectorizable vector type, identical element types for matrix, vector and scalar,
// and intrinsic support for addition and multiplication of the element type.
2769  enum { vectorizable = !IsDiagonal<MT>::value &&
2770  MT::vectorizable && VT::vectorizable &&
2771  IsSame<MET,VET>::value &&
2772  IsSame<MET,ST>::value &&
2773  IntrinsicTrait<MET>::addition &&
2774  IntrinsicTrait<MET>::multiplication };
2775 
      // Compilation flag for SMP assignments: set only if neither operand has to be
      // evaluated and both operand types are themselves SMP-assignable.
2777  enum { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2778  !evaluateVector && VT::smpAssignable };
2779  //**********************************************************************************************
2780 
2781  //**Constructor*********************************************************************************
// Constructor: stores the matrix/vector multiplication expression and the scalar.
//
// vector : left-hand side matrix/vector multiplication expression
// scalar : right-hand side scalar of the multiplication expression
2787  explicit inline DVecScalarMultExpr( const MVM& vector, ST scalar )
2788  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2789  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2790  {}
2791  //**********************************************************************************************
2792 
2793  //**Subscript operator**************************************************************************
// Subscript operator: returns the element of the wrapped expression at the given index
// multiplied by the scalar. The index is only checked by an internal assertion.
2799  inline ReturnType operator[]( size_t index ) const {
2800  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2801  return vector_[index] * scalar_;
2802  }
2803  //**********************************************************************************************
2804 
2805  //**At function*********************************************************************************
// Checked element access: reports an out-of-range error via BLAZE_THROW_OUT_OF_RANGE
// if the index is not smaller than the size of the expression, otherwise returns the
// same value as the subscript operator.
2812  inline ReturnType at( size_t index ) const {
2813  if( index >= vector_.size() ) {
2814  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2815  }
2816  return (*this)[index];
2817  }
2818  //**********************************************************************************************
2819 
2820  //**Size function*******************************************************************************
// Returns the size of the expression, i.e. the size of the wrapped multiplication expression.
2825  inline size_t size() const {
2826  return vector_.size();
2827  }
2828  //**********************************************************************************************
2829 
2830  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the wrapped matrix/vector multiplication expression.
2835  inline LeftOperand leftOperand() const {
2836  return vector_;
2837  }
2838  //**********************************************************************************************
2839 
2840  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the scalar.
2845  inline RightOperand rightOperand() const {
2846  return scalar_;
2847  }
2848  //**********************************************************************************************
2849 
2850  //**********************************************************************************************
// Returns whether the expression can alias with the given address; the query is
// forwarded unchanged to the wrapped multiplication expression.
2856  template< typename T >
2857  inline bool canAlias( const T* alias ) const {
2858  return vector_.canAlias( alias );
2859  }
2860  //**********************************************************************************************
2861 
2862  //**********************************************************************************************
// Returns whether the expression is aliased with the given address; the query is
// forwarded unchanged to the wrapped multiplication expression.
2868  template< typename T >
2869  inline bool isAliased( const T* alias ) const {
2870  return vector_.isAliased( alias );
2871  }
2872  //**********************************************************************************************
2873 
2874  //**********************************************************************************************
// Returns the alignment state reported by the wrapped multiplication expression.
2879  inline bool isAligned() const {
2880  return vector_.isAligned();
2881  }
2882  //**********************************************************************************************
2883 
2884  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments. This is the case if
// the vector is larger than SMP_DMATDVECMULT_THRESHOLD and at least one of the
// following holds: BLAS is not parallel, MT is a computation that is not evaluated,
// or the matrix has fewer than DMATDVECMULT_THRESHOLD elements.
2889  inline bool canSMPAssign() const {
2890  typename MVM::LeftOperand A( vector_.leftOperand() );
2891  return ( !BLAZE_BLAS_IS_PARALLEL ||
2892  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2893  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2894  ( size() > SMP_DMATDVECMULT_THRESHOLD );
2895  }
2896  //**********************************************************************************************
2897 
2898  private:
2899  //**Member variables****************************************************************************
2900  LeftOperand vector_; // Left-hand side operand: the wrapped matrix/vector multiplication expression
2901  RightOperand scalar_; // Right-hand side operand: the scalar of the multiplication expression
2902  //**********************************************************************************************
2903 
2904  //**Assignment to dense vectors*****************************************************************
// Assignment of a scaled dense matrix/dense vector product to a dense vector.
// The matrix and vector operands of the wrapped expression are evaluated serially and
// handed, together with the scalar, to the kernel selection.
//
// lhs : target dense vector
// rhs : scaled matrix/vector multiplication expression to be assigned
2916  template< typename VT1 > // Type of the target dense vector
2917  friend inline void assign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
2918  {
2920 
2921  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2922 
2923  typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2924  typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2925 
      // No rows: nothing to assign. No columns: the product has no terms, so the
      // target is reset instead.
2926  if( left.rows() == 0UL ) {
2927  return;
2928  }
2929  else if( left.columns() == 0UL ) {
2930  reset( ~lhs );
2931  return;
2932  }
2933 
2934  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
2935  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
2936 
2937  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2938  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
2939  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
2940  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2941 
2942  DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
2943  }
2944  //**********************************************************************************************
2945 
2946  //**Assignment to dense vectors (kernel selection)**********************************************
// Kernel selection for the assignment y = A * x * scalar.
// The small-matrix kernel is chosen if the matrix type is diagonal, if MT is a
// computation that is not evaluated, or if the matrix has fewer than
// DMATDVECMULT_THRESHOLD elements; otherwise the BLAS kernel path is taken.
2957  template< typename VT1 // Type of the left-hand side target vector
2958  , typename MT1 // Type of the left-hand side matrix operand
2959  , typename VT2 // Type of the right-hand side vector operand
2960  , typename ST2 > // Type of the scalar value
2961  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
2962  {
2963  if( ( IsDiagonal<MT1>::value ) ||
2964  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2965  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
2966  selectSmallAssignKernel( y, A, x, scalar );
2967  else
2968  selectBlasAssignKernel( y, A, x, scalar );
2969  }
2970  //**********************************************************************************************
2971 
2972  //**Default assignment to dense vectors*********************************************************
2986  template< typename VT1 // Type of the left-hand side target vector
2987  , typename MT1 // Type of the left-hand side matrix operand
2988  , typename VT2 // Type of the right-hand side vector operand
2989  , typename ST2 > // Type of the scalar value
2990  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2991  selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
2992  {
2993  y.assign( A * x * scalar );
2994  }
2995  //**********************************************************************************************
2996 
2997  //**Default assignment to dense vectors (small matrices)****************************************
3011  template< typename VT1 // Type of the left-hand side target vector
3012  , typename MT1 // Type of the left-hand side matrix operand
3013  , typename VT2 // Type of the right-hand side vector operand
3014  , typename ST2 > // Type of the scalar value
3015  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3016  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3017  {
3018  selectDefaultAssignKernel( y, A, x, scalar );
3019  }
3020  //**********************************************************************************************
3021 
3022  //**Vectorized default assignment to dense vectors (small matrices)*****************************
3036  template< typename VT1 // Type of the left-hand side target vector
3037  , typename MT1 // Type of the left-hand side matrix operand
3038  , typename VT2 // Type of the right-hand side vector operand
3039  , typename ST2 > // Type of the scalar value
3040  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3041  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3042  {
3043  typedef IntrinsicTrait<ElementType> IT;
3044 
3045  const size_t M( A.rows() );
3046  const size_t N( A.columns() );
3047 
3048  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3049 
3050  size_t i( 0UL );
3051 
3052  for( ; (i+8UL) <= M; i+=8UL )
3053  {
3054  const size_t jbegin( ( IsUpper<MT1>::value )
3055  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3056  :( 0UL ) );
3057  const size_t jend( ( IsLower<MT1>::value )
3058  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3059  :( N ) );
3060  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3061 
3062  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3063  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3064 
3065  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3066  size_t j( jbegin );
3067 
3068  for( ; j<jpos; j+=IT::size ) {
3069  const IntrinsicType x1( x.load(j) );
3070  xmm1 = xmm1 + A.load(i ,j) * x1;
3071  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3072  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3073  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3074  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
3075  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
3076  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
3077  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
3078  }
3079 
3080  y[i ] = sum( xmm1 ) * scalar;
3081  y[i+1UL] = sum( xmm2 ) * scalar;
3082  y[i+2UL] = sum( xmm3 ) * scalar;
3083  y[i+3UL] = sum( xmm4 ) * scalar;
3084  y[i+4UL] = sum( xmm5 ) * scalar;
3085  y[i+5UL] = sum( xmm6 ) * scalar;
3086  y[i+6UL] = sum( xmm7 ) * scalar;
3087  y[i+7UL] = sum( xmm8 ) * scalar;
3088 
3089  for( ; remainder && j<jend; ++j ) {
3090  y[i ] += A(i ,j) * x[j] * scalar;
3091  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3092  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3093  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3094  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3095  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3096  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3097  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3098  }
3099  }
3100 
3101  for( ; (i+4UL) <= M; i+=4UL )
3102  {
3103  const size_t jbegin( ( IsUpper<MT1>::value )
3104  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3105  :( 0UL ) );
3106  const size_t jend( ( IsLower<MT1>::value )
3107  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3108  :( N ) );
3109  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3110 
3111  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3112  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3113 
3114  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3115  size_t j( jbegin );
3116 
3117  for( ; j<jpos; j+=IT::size ) {
3118  const IntrinsicType x1( x.load(j) );
3119  xmm1 = xmm1 + A.load(i ,j) * x1;
3120  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3121  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3122  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3123  }
3124 
3125  y[i ] = sum( xmm1 ) * scalar;
3126  y[i+1UL] = sum( xmm2 ) * scalar;
3127  y[i+2UL] = sum( xmm3 ) * scalar;
3128  y[i+3UL] = sum( xmm4 ) * scalar;
3129 
3130  for( ; remainder && j<jend; ++j ) {
3131  y[i ] += A(i ,j) * x[j] * scalar;
3132  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3133  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3134  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3135  }
3136  }
3137 
3138  for( ; (i+3UL) <= M; i+=3UL )
3139  {
3140  const size_t jbegin( ( IsUpper<MT1>::value )
3141  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3142  :( 0UL ) );
3143  const size_t jend( ( IsLower<MT1>::value )
3144  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3145  :( N ) );
3146  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3147 
3148  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3149  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3150 
3151  IntrinsicType xmm1, xmm2, xmm3;
3152  size_t j( jbegin );
3153 
3154  for( ; j<jpos; j+=IT::size ) {
3155  const IntrinsicType x1( x.load(j) );
3156  xmm1 = xmm1 + A.load(i ,j) * x1;
3157  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3158  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3159  }
3160 
3161  y[i ] = sum( xmm1 ) * scalar;
3162  y[i+1UL] = sum( xmm2 ) * scalar;
3163  y[i+2UL] = sum( xmm3 ) * scalar;
3164 
3165  for( ; remainder && j<jend; ++j ) {
3166  y[i ] += A(i ,j) * x[j] * scalar;
3167  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3168  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3169  }
3170  }
3171 
3172  for( ; (i+2UL) <= M; i+=2UL )
3173  {
3174  const size_t jbegin( ( IsUpper<MT1>::value )
3175  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3176  :( 0UL ) );
3177  const size_t jend( ( IsLower<MT1>::value )
3178  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3179  :( N ) );
3180  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3181 
3182  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3183  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3184 
3185  IntrinsicType xmm1, xmm2;
3186  size_t j( jbegin );
3187 
3188  for( ; j<jpos; j+=IT::size ) {
3189  const IntrinsicType x1( x.load(j) );
3190  xmm1 = xmm1 + A.load(i ,j) * x1;
3191  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3192  }
3193 
3194  y[i ] = sum( xmm1 ) * scalar;
3195  y[i+1UL] = sum( xmm2 ) * scalar;
3196 
3197  for( ; remainder && j<jend; ++j ) {
3198  y[i ] += A(i ,j) * x[j] * scalar;
3199  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3200  }
3201  }
3202 
3203  if( i < M )
3204  {
3205  const size_t jbegin( ( IsUpper<MT1>::value )
3206  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3207  :( 0UL ) );
3208  const size_t jend( ( IsLower<MT1>::value )
3209  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3210  :( N ) );
3211  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3212 
3213  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3214  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3215 
3216  IntrinsicType xmm1;
3217  size_t j( jbegin );
3218 
3219  for( ; j<jpos; j+=IT::size ) {
3220  xmm1 = xmm1 + A.load(i,j) * x.load(j);
3221  }
3222 
3223  y[i] = sum( xmm1 ) * scalar;
3224 
3225  for( ; remainder && j<jend; ++j ) {
3226  y[i] += A(i,j) * x[j] * scalar;
3227  }
3228  }
3229  }
3230  //**********************************************************************************************
3231 
3232  //**Default assignment to dense vectors (large matrices)****************************************
3246  template< typename VT1 // Type of the left-hand side target vector
3247  , typename MT1 // Type of the left-hand side matrix operand
3248  , typename VT2 // Type of the right-hand side vector operand
3249  , typename ST2 > // Type of the scalar value
3250  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3251  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3252  {
3253  selectDefaultAssignKernel( y, A, x, scalar );
3254  }
3255  //**********************************************************************************************
3256 
3257  //**Vectorized default assignment to dense vectors (large matrices)*****************************
3271  template< typename VT1 // Type of the left-hand side target vector
3272  , typename MT1 // Type of the left-hand side matrix operand
3273  , typename VT2 // Type of the right-hand side vector operand
3274  , typename ST2 > // Type of the scalar value
3275  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3276  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3277  {
3278  typedef IntrinsicTrait<ElementType> IT;
3279 
3280  const size_t M( A.rows() );
3281  const size_t N( A.columns() );
3282 
3283  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3284 
3285  reset( y );
3286 
3287  size_t i( 0UL );
3288 
3289  for( ; (i+8UL) <= M; i+=8UL )
3290  {
3291  const size_t jbegin( ( IsUpper<MT1>::value )
3292  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3293  :( 0UL ) );
3294  const size_t jend( ( IsLower<MT1>::value )
3295  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3296  :( N ) );
3297  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3298 
3299  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3300  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3301 
3302  size_t j( jbegin );
3303 
3304  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
3305  const size_t j1( j+IT::size );
3306  const size_t j2( j+IT::size*2UL );
3307  const size_t j3( j+IT::size*3UL );
3308  const IntrinsicType x1( x.load(j ) );
3309  const IntrinsicType x2( x.load(j1) );
3310  const IntrinsicType x3( x.load(j2) );
3311  const IntrinsicType x4( x.load(j3) );
3312  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3313  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3314  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3315  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3316  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3317  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3318  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3319  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
3320  }
3321 
3322  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
3323  const size_t j1( j+IT::size );
3324  const IntrinsicType x1( x.load(j ) );
3325  const IntrinsicType x2( x.load(j1) );
3326  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3327  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3328  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3329  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3330  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3331  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3332  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3333  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
3334  }
3335 
3336  for( ; j<jpos; j+=IT::size ) {
3337  const IntrinsicType x1( x.load(j) );
3338  y[i ] += sum( A.load(i ,j) * x1 );
3339  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3340  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3341  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3342  y[i+4UL] += sum( A.load(i+4UL,j) * x1 );
3343  y[i+5UL] += sum( A.load(i+5UL,j) * x1 );
3344  y[i+6UL] += sum( A.load(i+6UL,j) * x1 );
3345  y[i+7UL] += sum( A.load(i+7UL,j) * x1 );
3346  }
3347 
3348  for( ; remainder && j<jend; ++j ) {
3349  y[i ] += A(i ,j) * x[j];
3350  y[i+1UL] += A(i+1UL,j) * x[j];
3351  y[i+2UL] += A(i+2UL,j) * x[j];
3352  y[i+3UL] += A(i+3UL,j) * x[j];
3353  y[i+4UL] += A(i+4UL,j) * x[j];
3354  y[i+5UL] += A(i+5UL,j) * x[j];
3355  y[i+6UL] += A(i+6UL,j) * x[j];
3356  y[i+7UL] += A(i+7UL,j) * x[j];
3357  }
3358 
3359  y[i ] *= scalar;
3360  y[i+1UL] *= scalar;
3361  y[i+2UL] *= scalar;
3362  y[i+3UL] *= scalar;
3363  y[i+4UL] *= scalar;
3364  y[i+5UL] *= scalar;
3365  y[i+6UL] *= scalar;
3366  y[i+7UL] *= scalar;
3367  }
3368 
3369  for( ; (i+4UL) <= M; i+=4UL )
3370  {
3371  const size_t jbegin( ( IsUpper<MT1>::value )
3372  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3373  :( 0UL ) );
3374  const size_t jend( ( IsLower<MT1>::value )
3375  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3376  :( N ) );
3377  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3378 
3379  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3380  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3381 
3382  size_t j( jbegin );
3383 
3384  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
3385  const size_t j1( j+IT::size );
3386  const size_t j2( j+IT::size*2UL );
3387  const size_t j3( j+IT::size*3UL );
3388  const IntrinsicType x1( x.load(j ) );
3389  const IntrinsicType x2( x.load(j1) );
3390  const IntrinsicType x3( x.load(j2) );
3391  const IntrinsicType x4( x.load(j3) );
3392  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3393  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3394  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3395  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3396  }
3397 
3398  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
3399  const size_t j1( j+IT::size );
3400  const IntrinsicType x1( x.load(j ) );
3401  const IntrinsicType x2( x.load(j1) );
3402  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3403  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3404  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3405  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3406  }
3407 
3408  for( ; j<jpos; j+=IT::size ) {
3409  const IntrinsicType x1( x.load(j) );
3410  y[i ] += sum( A.load(i ,j) * x1 );
3411  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3412  y[i+2UL] += sum( A.load(i+2UL,j) * x1 );
3413  y[i+3UL] += sum( A.load(i+3UL,j) * x1 );
3414  }
3415 
3416  for( ; remainder && j<jend; ++j ) {
3417  y[i ] += A(i ,j) * x[j];
3418  y[i+1UL] += A(i+1UL,j) * x[j];
3419  y[i+2UL] += A(i+2UL,j) * x[j];
3420  y[i+3UL] += A(i+3UL,j) * x[j];
3421  }
3422 
3423  y[i ] *= scalar;
3424  y[i+1UL] *= scalar;
3425  y[i+2UL] *= scalar;
3426  y[i+3UL] *= scalar;
3427  }
3428 
3429  for( ; (i+2UL) <= M; i+=2UL )
3430  {
3431  const size_t jbegin( ( IsUpper<MT1>::value )
3432  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3433  :( 0UL ) );
3434  const size_t jend( ( IsLower<MT1>::value )
3435  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3436  :( N ) );
3437  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3438 
3439  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3440  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3441 
3442  size_t j( jbegin );
3443 
3444  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
3445  const size_t j1( j+IT::size );
3446  const size_t j2( j+IT::size*2UL );
3447  const size_t j3( j+IT::size*3UL );
3448  const IntrinsicType x1( x.load(j ) );
3449  const IntrinsicType x2( x.load(j1) );
3450  const IntrinsicType x3( x.load(j2) );
3451  const IntrinsicType x4( x.load(j3) );
3452  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3453  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3454  }
3455 
3456  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
3457  const size_t j1( j+IT::size );
3458  const IntrinsicType x1( x.load(j ) );
3459  const IntrinsicType x2( x.load(j1) );
3460  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3461  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3462  }
3463 
3464  for( ; j<jpos; j+=IT::size ) {
3465  const IntrinsicType x1( x.load(j) );
3466  y[i ] += sum( A.load(i ,j) * x1 );
3467  y[i+1UL] += sum( A.load(i+1UL,j) * x1 );
3468  }
3469 
3470  for( ; remainder && j<jend; ++j ) {
3471  y[i ] += A(i ,j) * x[j];
3472  y[i+1UL] += A(i+1UL,j) * x[j];
3473  }
3474 
3475  y[i ] *= scalar;
3476  y[i+1UL] *= scalar;
3477  }
3478 
3479  if( i < M )
3480  {
3481  const size_t jbegin( ( IsUpper<MT1>::value )
3482  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3483  :( 0UL ) );
3484  const size_t jend( ( IsLower<MT1>::value )
3485  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3486  :( N ) );
3487  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3488 
3489  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3490  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3491 
3492  size_t j( jbegin );
3493 
3494  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
3495  const size_t j1( j+IT::size );
3496  const size_t j2( j+IT::size*2UL );
3497  const size_t j3( j+IT::size*3UL );
3498  const IntrinsicType x1( x.load(j ) );
3499  const IntrinsicType x2( x.load(j1) );
3500  const IntrinsicType x3( x.load(j2) );
3501  const IntrinsicType x4( x.load(j3) );
3502  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3503  }
3504 
3505  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
3506  const size_t j1( j+IT::size );
3507  const IntrinsicType x1( x.load(j ) );
3508  const IntrinsicType x2( x.load(j1) );
3509  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3510  }
3511 
3512  for( ; j<jpos; j+=IT::size ) {
3513  const IntrinsicType x1( x.load(j) );
3514  y[i] += sum( A.load(i,j) * x1 );
3515  }
3516 
3517  for( ; remainder && j<jend; ++j ) {
3518  y[i] += A(i,j) * x[j];
3519  }
3520 
3521  y[i] *= scalar;
3522  }
3523  }
3524  //**********************************************************************************************
3525 
3526  //**BLAS-based assignment to dense vectors (default)********************************************
3540  template< typename VT1 // Type of the left-hand side target vector
3541  , typename MT1 // Type of the left-hand side matrix operand
3542  , typename VT2 // Type of the right-hand side vector operand
3543  , typename ST2 > // Type of the scalar value
3544  static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
3545  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3546  {
3547  selectLargeAssignKernel( y, A, x, scalar );
3548  }
3549  //**********************************************************************************************
3550 
3551  //**BLAS-based assignment to dense vectors******************************************************
3552 #if BLAZE_BLAS_MODE
3553 
3566  template< typename VT1 // Type of the left-hand side target vector
3567  , typename MT1 // Type of the left-hand side matrix operand
3568  , typename VT2 // Type of the right-hand side vector operand
3569  , typename ST2 > // Type of the scalar value
3570  static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
3571  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3572  {
3573  typedef typename VT1::ElementType ET;
3574 
3575  if( IsTriangular<MT1>::value ) {
3576  assign( y, scalar * x );
3577  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3578  }
3579  else {
3580  gemv( y, A, x, ET(scalar), ET(0) );
3581  }
3582  }
3583 #endif
3584  //**********************************************************************************************
3585 
3586  //**Assignment to sparse vectors****************************************************************
// Assignment of the scaled dense matrix-dense vector multiplication expression to a sparse
// vector.
//
// lhs  The target left-hand side sparse vector; its size has to match rhs.size(), which is
//      checked via an internal assertion.
// rhs  The right-hand side scaled multiplication expression to be assigned.
//
// The expression is not assigned directly: it is evaluated serially into a temporary of type
// ResultType first, and that temporary is then assigned to the sparse target.
//
// NOTE(review): the line numbers of this listing skip 3601 and 3603-3605 inside the function
// body, so statements may be missing here -- confirm against the original header.
3598  template< typename VT1 > // Type of the target sparse vector
3599  friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3600  {
3602 
3606 
3607  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3608 
// Serial evaluation of the complete expression into a temporary result vector
3609  const ResultType tmp( serial( rhs ) );
3610  assign( ~lhs, tmp );
3611  }
3612  //**********************************************************************************************
3613 
3614  //**Addition assignment to dense vectors********************************************************
// Addition assignment of the scaled dense matrix-dense vector multiplication expression to a
// dense vector.
//
// lhs  The target left-hand side dense vector; its size has to match rhs.size(), which is
//      checked via an internal assertion.
// rhs  The right-hand side scaled multiplication expression to be added.
//
// The matrix and vector operands of the wrapped multiplication are extracted and evaluated
// serially, then the call is forwarded to selectAddAssignKernel() together with the scalar of
// the expression.
//
// NOTE(review): the line numbers of this listing skip 3629 inside the function body, so a
// statement may be missing here -- confirm against the original header.
3626  template< typename VT1 > // Type of the target dense vector
3627  friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3628  {
3630 
3631  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3632 
// Operands of the wrapped matrix/vector multiplication
3633  typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
3634  typename MVM::RightOperand right( rhs.vector_.rightOperand() );
3635 
// Nothing is added if the matrix has no rows or no columns
3636  if( left.rows() == 0UL || left.columns() == 0UL ) {
3637  return;
3638  }
3639 
3640  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3641  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3642 
// The evaluated operands have to keep the dimensions of the original operands
3643  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3644  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3645  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3646  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3647 
3648  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
3649  }
3650  //**********************************************************************************************
3651 
3652  //**Addition assignment to dense vectors (kernel selection)*************************************
3663  template< typename VT1 // Type of the left-hand side target vector
3664  , typename MT1 // Type of the left-hand side matrix operand
3665  , typename VT2 // Type of the right-hand side vector operand
3666  , typename ST2 > // Type of the scalar value
3667  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3668  {
3669  if( ( IsDiagonal<MT1>::value ) ||
3670  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3671  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3672  selectSmallAddAssignKernel( y, A, x, scalar );
3673  else
3674  selectBlasAddAssignKernel( y, A, x, scalar );
3675  }
3676  //**********************************************************************************************
3677 
3678  //**Default addition assignment to dense vectors************************************************
/*!\brief Default addition assignment of a scaled dense matrix-dense vector multiplication to
//        a dense vector (\f$ \vec{y}+=s*A*\vec{x} \f$).
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
// \return void
//
// The scaled product expression is handed to the addAssign() member of the target.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename MT1    // Type of the left-hand side matrix operand
        , typename VT2    // Type of the right-hand side vector operand
        , typename ST2 >  // Type of the scalar value
static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
{
   // Same left-to-right grouping as the unparenthesized expression
   y.addAssign( ( A * x ) * scalar );
}
3700  //**********************************************************************************************
3701 
3702  //**Default addition assignment to dense vectors (small matrices)*******************************
3716  template< typename VT1 // Type of the left-hand side target vector
3717  , typename MT1 // Type of the left-hand side matrix operand
3718  , typename VT2 // Type of the right-hand side vector operand
3719  , typename ST2 > // Type of the scalar value
3720  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3721  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3722  {
3723  selectDefaultAddAssignKernel( y, A, x, scalar );
3724  }
3725  //**********************************************************************************************
3726 
3727  //**Vectorized default addition assignment to dense vectors (small matrices)********************
3741  template< typename VT1 // Type of the left-hand side target vector
3742  , typename MT1 // Type of the left-hand side matrix operand
3743  , typename VT2 // Type of the right-hand side vector operand
3744  , typename ST2 > // Type of the scalar value
3745  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3746  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3747  {
3748  typedef IntrinsicTrait<ElementType> IT;
3749 
3750  const size_t M( A.rows() );
3751  const size_t N( A.columns() );
3752 
3753  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3754 
3755  size_t i( 0UL );
3756 
3757  for( ; (i+8UL) <= M; i+=8UL )
3758  {
3759  const size_t jbegin( ( IsUpper<MT1>::value )
3760  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3761  :( 0UL ) );
3762  const size_t jend( ( IsLower<MT1>::value )
3763  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3764  :( N ) );
3765  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3766 
3767  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3768  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3769 
3770  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3771  size_t j( jbegin );
3772 
3773  for( ; j<jpos; j+=IT::size ) {
3774  const IntrinsicType x1( x.load(j) );
3775  xmm1 = xmm1 + A.load(i ,j) * x1;
3776  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3777  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3778  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3779  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
3780  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
3781  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
3782  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
3783  }
3784 
3785  y[i ] += sum( xmm1 ) * scalar;
3786  y[i+1UL] += sum( xmm2 ) * scalar;
3787  y[i+2UL] += sum( xmm3 ) * scalar;
3788  y[i+3UL] += sum( xmm4 ) * scalar;
3789  y[i+4UL] += sum( xmm5 ) * scalar;
3790  y[i+5UL] += sum( xmm6 ) * scalar;
3791  y[i+6UL] += sum( xmm7 ) * scalar;
3792  y[i+7UL] += sum( xmm8 ) * scalar;
3793 
3794  for( ; remainder && j<jend; ++j ) {
3795  y[i ] += A(i ,j) * x[j] * scalar;
3796  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3797  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3798  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3799  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3800  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3801  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3802  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3803  }
3804  }
3805 
3806  for( ; (i+4UL) <= M; i+=4UL )
3807  {
3808  const size_t jbegin( ( IsUpper<MT1>::value )
3809  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3810  :( 0UL ) );
3811  const size_t jend( ( IsLower<MT1>::value )
3812  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3813  :( N ) );
3814  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3815 
3816  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3817  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3818 
3819  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3820  size_t j( jbegin );
3821 
3822  for( ; j<jpos; j+=IT::size ) {
3823  const IntrinsicType x1( x.load(j) );
3824  xmm1 = xmm1 + A.load(i ,j) * x1;
3825  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3826  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3827  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3828  }
3829 
3830  y[i ] += sum( xmm1 ) * scalar;
3831  y[i+1UL] += sum( xmm2 ) * scalar;
3832  y[i+2UL] += sum( xmm3 ) * scalar;
3833  y[i+3UL] += sum( xmm4 ) * scalar;
3834 
3835  for( ; remainder && j<jend; ++j ) {
3836  y[i ] += A(i ,j) * x[j] * scalar;
3837  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3838  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3839  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3840  }
3841  }
3842 
3843  for( ; (i+3UL) <= M; i+=3UL )
3844  {
3845  const size_t jbegin( ( IsUpper<MT1>::value )
3846  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3847  :( 0UL ) );
3848  const size_t jend( ( IsLower<MT1>::value )
3849  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3850  :( N ) );
3851  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3852 
3853  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3854  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3855 
3856  IntrinsicType xmm1, xmm2, xmm3;
3857  size_t j( jbegin );
3858 
3859  for( ; j<jpos; j+=IT::size ) {
3860  const IntrinsicType x1( x.load(j) );
3861  xmm1 = xmm1 + A.load(i ,j) * x1;
3862  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3863  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3864  }
3865 
3866  y[i ] += sum( xmm1 ) * scalar;
3867  y[i+1UL] += sum( xmm2 ) * scalar;
3868  y[i+2UL] += sum( xmm3 ) * scalar;
3869 
3870  for( ; remainder && j<jend; ++j ) {
3871  y[i ] += A(i ,j) * x[j] * scalar;
3872  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3873  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3874  }
3875  }
3876 
3877  for( ; (i+2UL) <= M; i+=2UL )
3878  {
3879  const size_t jbegin( ( IsUpper<MT1>::value )
3880  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3881  :( 0UL ) );
3882  const size_t jend( ( IsLower<MT1>::value )
3883  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3884  :( N ) );
3885  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3886 
3887  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3888  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3889 
3890  IntrinsicType xmm1, xmm2;
3891  size_t j( jbegin );
3892 
3893  for( ; j<jpos; j+=IT::size ) {
3894  const IntrinsicType x1( x.load(j) );
3895  xmm1 = xmm1 + A.load(i ,j) * x1;
3896  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3897  }
3898 
3899  y[i ] += sum( xmm1 ) * scalar;
3900  y[i+1UL] += sum( xmm2 ) * scalar;
3901 
3902  for( ; remainder && j<jend; ++j ) {
3903  y[i ] += A(i ,j) * x[j] * scalar;
3904  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3905  }
3906  }
3907 
3908  if( i < M )
3909  {
3910  const size_t jbegin( ( IsUpper<MT1>::value )
3911  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3912  :( 0UL ) );
3913  const size_t jend( ( IsLower<MT1>::value )
3914  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3915  :( N ) );
3916  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3917 
3918  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
3919  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
3920 
3921  IntrinsicType xmm1;
3922  size_t j( jbegin );
3923 
3924  for( ; j<jpos; j+=IT::size ) {
3925  xmm1 = xmm1 + A.load(i,j) * x.load(j);
3926  }
3927 
3928  y[i] += sum( xmm1 ) * scalar;
3929 
3930  for( ; remainder && j<jend; ++j ) {
3931  y[i] += A(i,j) * x[j] * scalar;
3932  }
3933  }
3934  }
3935  //**********************************************************************************************
3936 
3937  //**Default addition assignment to dense vectors (large matrices)*******************************
3951  template< typename VT1 // Type of the left-hand side target vector
3952  , typename MT1 // Type of the left-hand side matrix operand
3953  , typename VT2 // Type of the right-hand side vector operand
3954  , typename ST2 > // Type of the scalar value
3955  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3956  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3957  {
3958  selectDefaultAddAssignKernel( y, A, x, scalar );
3959  }
3960  //**********************************************************************************************
3961 
3962  //**Vectorized default addition assignment to dense vectors (large matrices)********************
/*!\brief Vectorized addition assignment kernel for large matrices
//        (\f$ \vec{y}+=s*A*\vec{x} \f$).
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is enabled via EnableIf when UseVectorizedDefaultKernel is fulfilled. The rows
// of \a A are processed in blocks of 8, 4 and 2 rows, followed by at most one single remaining
// row. Within each row block the columns are traversed with intrinsic loads in steps of
// 4*IT::size, 2*IT::size and IT::size elements; in each step the products are reduced via
// sum(), multiplied with \a scalar and directly added to the according elements of \a y. In
// case at least one of the two operands is not padded, the remaining columns are handled by a
// scalar loop. For lower/upper matrices (IsLower/IsUpper) the traversed column range is
// restricted per row block.
*/
3976  template< typename VT1 // Type of the left-hand side target vector
3977  , typename MT1 // Type of the left-hand side matrix operand
3978  , typename VT2 // Type of the right-hand side vector operand
3979  , typename ST2 > // Type of the scalar value
3980  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3981  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3982  {
3983  typedef IntrinsicTrait<ElementType> IT;
3984 
3985  const size_t M( A.rows() );
3986  const size_t N( A.columns() );
3987 
// A scalar clean-up loop over the trailing columns is only required if the matrix or the
// vector operand is not padded.
3988  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
3989 
3990  size_t i( 0UL );
3991 
// Row blocks of 8: rows i .. i+7 share every load of x.
3992  for( ; (i+8UL) <= M; i+=8UL )
3993  {
// First column: for upper matrices the (strictly) upper start index of row i, rounded down
// by masking with size_t(-IT::size) (NOTE(review): the mask presumes IT::size is a power of
// two - confirm); otherwise column 0.
3994  const size_t jbegin( ( IsUpper<MT1>::value )
3995  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
3996  :( 0UL ) );
// Exclusive last column: for lower matrices it is derived from the last row of the block
// (i+7), without the diagonal element in the strictly lower case; otherwise N.
3997  const size_t jend( ( IsLower<MT1>::value )
3998  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3999  :( N ) );
4000  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4001 
// End of the intrinsic part: jend rounded down to a multiple of IT::size if a scalar
// remainder loop follows, jend itself otherwise (presumably relying on padding - TODO confirm).
4002  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4003  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4004 
4005  size_t j( jbegin );
4006 
// Four intrinsic vectors of x per iteration
4007  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
4008  const size_t j1( j+IT::size );
4009  const size_t j2( j+IT::size*2UL );
4010  const size_t j3( j+IT::size*3UL );
4011  const IntrinsicType x1( x.load(j ) );
4012  const IntrinsicType x2( x.load(j1) );
4013  const IntrinsicType x3( x.load(j2) );
4014  const IntrinsicType x4( x.load(j3) );
4015  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4016  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4017  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4018  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4019  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4020  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4021  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4022  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4023  }
4024 
// Two intrinsic vectors of x per iteration
4025  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
4026  const size_t j1( j+IT::size );
4027  const IntrinsicType x1( x.load(j ) );
4028  const IntrinsicType x2( x.load(j1) );
4029  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4030  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4031  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4032  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4033  y[i+4UL] += sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4034  y[i+5UL] += sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4035  y[i+6UL] += sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4036  y[i+7UL] += sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4037  }
4038 
// One intrinsic vector of x per iteration
4039  for( ; j<jpos; j+=IT::size ) {
4040  const IntrinsicType x1( x.load(j) );
4041  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4042  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4043  y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
4044  y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
4045  y[i+4UL] += sum( A.load(i+4UL,j) * x1 ) * scalar;
4046  y[i+5UL] += sum( A.load(i+5UL,j) * x1 ) * scalar;
4047  y[i+6UL] += sum( A.load(i+6UL,j) * x1 ) * scalar;
4048  y[i+7UL] += sum( A.load(i+7UL,j) * x1 ) * scalar;
4049  }
4050 
// Scalar treatment of the columns in [jpos,jend); skipped entirely if both operands are padded
4051  for( ; remainder && j<jend; ++j ) {
4052  y[i ] += A(i ,j) * x[j] * scalar;
4053  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4054  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4055  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4056  y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4057  y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4058  y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4059  y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
4060  }
4061  }
4062 
// Row blocks of 4: same scheme as above for rows i .. i+3
4063  for( ; (i+4UL) <= M; i+=4UL )
4064  {
4065  const size_t jbegin( ( IsUpper<MT1>::value )
4066  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4067  :( 0UL ) );
4068  const size_t jend( ( IsLower<MT1>::value )
4069  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4070  :( N ) );
4071  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4072 
4073  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4074  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4075 
4076  size_t j( jbegin );
4077 
4078  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
4079  const size_t j1( j+IT::size );
4080  const size_t j2( j+IT::size*2UL );
4081  const size_t j3( j+IT::size*3UL );
4082  const IntrinsicType x1( x.load(j ) );
4083  const IntrinsicType x2( x.load(j1) );
4084  const IntrinsicType x3( x.load(j2) );
4085  const IntrinsicType x4( x.load(j3) );
4086  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4087  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4088  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4089  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4090  }
4091 
4092  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
4093  const size_t j1( j+IT::size );
4094  const IntrinsicType x1( x.load(j ) );
4095  const IntrinsicType x2( x.load(j1) );
4096  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4097  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4098  y[i+2UL] += sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4099  y[i+3UL] += sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4100  }
4101 
4102  for( ; j<jpos; j+=IT::size ) {
4103  const IntrinsicType x1( x.load(j) );
4104  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4105  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4106  y[i+2UL] += sum( A.load(i+2UL,j) * x1 ) * scalar;
4107  y[i+3UL] += sum( A.load(i+3UL,j) * x1 ) * scalar;
4108  }
4109 
4110  for( ; remainder && j<jend; ++j ) {
4111  y[i ] += A(i ,j) * x[j] * scalar;
4112  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4113  y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4114  y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4115  }
4116  }
4117 
// Row blocks of 2: same scheme as above for rows i and i+1
4118  for( ; (i+2UL) <= M; i+=2UL )
4119  {
4120  const size_t jbegin( ( IsUpper<MT1>::value )
4121  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4122  :( 0UL ) );
4123  const size_t jend( ( IsLower<MT1>::value )
4124  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4125  :( N ) );
4126  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4127 
4128  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4129  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4130 
4131  size_t j( jbegin );
4132 
4133  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
4134  const size_t j1( j+IT::size );
4135  const size_t j2( j+IT::size*2UL );
4136  const size_t j3( j+IT::size*3UL );
4137  const IntrinsicType x1( x.load(j ) );
4138  const IntrinsicType x2( x.load(j1) );
4139  const IntrinsicType x3( x.load(j2) );
4140  const IntrinsicType x4( x.load(j3) );
4141  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4142  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4143  }
4144 
4145  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
4146  const size_t j1( j+IT::size );
4147  const IntrinsicType x1( x.load(j ) );
4148  const IntrinsicType x2( x.load(j1) );
4149  y[i ] += sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4150  y[i+1UL] += sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4151  }
4152 
4153  for( ; j<jpos; j+=IT::size ) {
4154  const IntrinsicType x1( x.load(j) );
4155  y[i ] += sum( A.load(i ,j) * x1 ) * scalar;
4156  y[i+1UL] += sum( A.load(i+1UL,j) * x1 ) * scalar;
4157  }
4158 
4159  for( ; remainder && j<jend; ++j ) {
4160  y[i ] += A(i ,j) * x[j] * scalar;
4161  y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4162  }
4163  }
4164 
// At most one row is left at this point
4165  if( i < M )
4166  {
4167  const size_t jbegin( ( IsUpper<MT1>::value )
4168  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4169  :( 0UL ) );
4170  const size_t jend( ( IsLower<MT1>::value )
4171  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4172  :( N ) );
4173  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4174 
4175  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4176  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4177 
4178  size_t j( jbegin );
4179 
4180  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
4181  const size_t j1( j+IT::size );
4182  const size_t j2( j+IT::size*2UL );
4183  const size_t j3( j+IT::size*3UL );
4184  const IntrinsicType x1( x.load(j ) );
4185  const IntrinsicType x2( x.load(j1) );
4186  const IntrinsicType x3( x.load(j2) );
4187  const IntrinsicType x4( x.load(j3) );
4188  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4189  }
4190 
4191  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
4192  const size_t j1( j+IT::size );
4193  const IntrinsicType x1( x.load(j ) );
4194  const IntrinsicType x2( x.load(j1) );
4195  y[i] += sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4196  }
4197 
4198  for( ; j<jpos; j+=IT::size ) {
4199  const IntrinsicType x1( x.load(j) );
4200  y[i] += sum( A.load(i,j) * x1 ) * scalar;
4201  }
4202 
4203  for( ; remainder && j<jend; ++j ) {
4204  y[i] += A(i,j) * x[j] * scalar;
4205  }
4206  }
4207  }
4208  //**********************************************************************************************
4209 
4210  //**BLAS-based addition assignment to dense vectors (default)***********************************
4224  template< typename VT1 // Type of the left-hand side target vector
4225  , typename MT1 // Type of the left-hand side matrix operand
4226  , typename VT2 // Type of the right-hand side vector operand
4227  , typename ST2 > // Type of the scalar value
4228  static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
4229  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4230  {
4231  selectLargeAddAssignKernel( y, A, x, scalar );
4232  }
4233  //**********************************************************************************************
4234 
4235  //**BLAS-based addition assignment to dense vectors*********************************************
4236 #if BLAZE_BLAS_MODE
4237 
4250  template< typename VT1 // Type of the left-hand side target vector
4251  , typename MT1 // Type of the left-hand side matrix operand
4252  , typename VT2 // Type of the right-hand side vector operand
4253  , typename ST2 > // Type of the scalar value
4254  static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
4255  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4256  {
4257  typedef typename VT1::ElementType ET;
4258 
4259  if( IsTriangular<MT1>::value ) {
4260  typename VT1::ResultType tmp( serial( scalar * x ) );
4261  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4262  addAssign( y, tmp );
4263  }
4264  else {
4265  gemv( y, A, x, ET(scalar), ET(1) );
4266  }
4267  }
4268 #endif
4269  //**********************************************************************************************
4270 
4271  //**Addition assignment to sparse vectors*******************************************************
4272  // No special implementation for the addition assignment to sparse vectors.
4273  //**********************************************************************************************
4274 
4275  //**Subtraction assignment to dense vectors*****************************************************
4287  template< typename VT1 > // Type of the target dense vector
4288  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4289  {
4291 
4292  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4293 
4294  typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
4295  typename MVM::RightOperand right( rhs.vector_.rightOperand() );
4296 
4297  if( left.rows() == 0UL || left.columns() == 0UL ) {
4298  return;
4299  }
4300 
4301  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4302  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
4303 
4304  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4305  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4306  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4307  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
4308 
4309  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
4310  }
4311  //**********************************************************************************************
4312 
4313  //**Subtraction assignment to dense vectors (kernel selection)**********************************
4324  template< typename VT1 // Type of the left-hand side target vector
4325  , typename MT1 // Type of the left-hand side matrix operand
4326  , typename VT2 // Type of the right-hand side vector operand
4327  , typename ST2 > // Type of the scalar value
4328  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4329  {
4330  if( ( IsDiagonal<MT1>::value ) ||
4331  ( IsComputation<MT>::value && !evaluateMatrix ) ||
4332  ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4333  selectSmallSubAssignKernel( y, A, x, scalar );
4334  else
4335  selectBlasSubAssignKernel( y, A, x, scalar );
4336  }
4337  //**********************************************************************************************
4338 
4339  //**Default subtraction assignment to dense vectors*********************************************
/*!\brief Default subtraction assignment of a scaled dense matrix-dense vector multiplication
//        (\f$ \vec{y}-=s*A*\vec{x} \f$).
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
// \return void
//
// The kernel builds the expression \a A * \a x * \a scalar and hands it to the subAssign()
// member function of the target vector.
*/
template< typename VT1    // Target vector type (left-hand side)
        , typename MT1    // Matrix operand type
        , typename VT2    // Vector operand type
        , typename ST2 >  // Scalar type
static inline void
   selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
{
   // The target vector performs the subtraction of the scaled product itself.
   y.subAssign( A * x * scalar );
}
4361  //**********************************************************************************************
4362 
4363  //**Default subtraction assignment to dense vectors (small matrices)****************************
4377  template< typename VT1 // Type of the left-hand side target vector
4378  , typename MT1 // Type of the left-hand side matrix operand
4379  , typename VT2 // Type of the right-hand side vector operand
4380  , typename ST2 > // Type of the scalar value
4381  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4382  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4383  {
4384  selectDefaultSubAssignKernel( y, A, x, scalar );
4385  }
4386  //**********************************************************************************************
4387 
4388  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
/*!\brief Vectorized subtraction assignment kernel for small matrices
//        (\f$ \vec{y}-=s*A*\vec{x} \f$).
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is enabled via EnableIf when UseVectorizedDefaultKernel is fulfilled. The rows
// of \a A are processed in blocks of 8, 4, 3 and 2 rows, followed by at most one single
// remaining row. For every row of a block one intrinsic accumulator gathers the products of
// the row with \a x over the intrinsic part of the column range; afterwards the accumulator is
// reduced via sum(), multiplied with \a scalar and subtracted from the according element of
// \a y. In case at least one of the two operands is not padded, the remaining columns are
// handled by a scalar loop. For lower/upper matrices (IsLower/IsUpper) the traversed column
// range is restricted per row block.
*/
4402  template< typename VT1 // Type of the left-hand side target vector
4403  , typename MT1 // Type of the left-hand side matrix operand
4404  , typename VT2 // Type of the right-hand side vector operand
4405  , typename ST2 > // Type of the scalar value
4406  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4407  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4408  {
4409  typedef IntrinsicTrait<ElementType> IT;
4410 
4411  const size_t M( A.rows() );
4412  const size_t N( A.columns() );
4413 
// A scalar clean-up loop over the trailing columns is only required if the matrix or the
// vector operand is not padded.
4414  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4415 
4416  size_t i( 0UL );
4417 
// Row blocks of 8: rows i .. i+7 share every load of x.
4418  for( ; (i+8UL) <= M; i+=8UL )
4419  {
// First column: for upper matrices the (strictly) upper start index of row i, rounded down
// by masking with size_t(-IT::size) (NOTE(review): the mask presumes IT::size is a power of
// two - confirm); otherwise column 0.
4420  const size_t jbegin( ( IsUpper<MT1>::value )
4421  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4422  :( 0UL ) );
// Exclusive last column: for lower matrices it is derived from the last row of the block
// (i+7), without the diagonal element in the strictly lower case; otherwise N.
4423  const size_t jend( ( IsLower<MT1>::value )
4424  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4425  :( N ) );
4426  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4427 
// End of the intrinsic part: jend rounded down to a multiple of IT::size if a scalar
// remainder loop follows, jend itself otherwise (presumably relying on padding - TODO confirm).
4428  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4429  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4430 
// One accumulator per row; NOTE(review): the accumulators are used without explicit
// initialization, so IntrinsicType is presumably zero-initialized by its default
// constructor - confirm.
4431  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4432  size_t j( jbegin );
4433 
4434  for( ; j<jpos; j+=IT::size ) {
4435  const IntrinsicType x1( x.load(j) );
4436  xmm1 = xmm1 + A.load(i ,j) * x1;
4437  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4438  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4439  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
4440  xmm5 = xmm5 + A.load(i+4UL,j) * x1;
4441  xmm6 = xmm6 + A.load(i+5UL,j) * x1;
4442  xmm7 = xmm7 + A.load(i+6UL,j) * x1;
4443  xmm8 = xmm8 + A.load(i+7UL,j) * x1;
4444  }
4445 
// Reduction of each accumulator, scaling and subtraction from the target element
4446  y[i ] -= sum( xmm1 ) * scalar;
4447  y[i+1UL] -= sum( xmm2 ) * scalar;
4448  y[i+2UL] -= sum( xmm3 ) * scalar;
4449  y[i+3UL] -= sum( xmm4 ) * scalar;
4450  y[i+4UL] -= sum( xmm5 ) * scalar;
4451  y[i+5UL] -= sum( xmm6 ) * scalar;
4452  y[i+6UL] -= sum( xmm7 ) * scalar;
4453  y[i+7UL] -= sum( xmm8 ) * scalar;
4454 
// Scalar treatment of the columns in [jpos,jend); skipped entirely if both operands are padded
4455  for( ; remainder && j<jend; ++j ) {
4456  y[i ] -= A(i ,j) * x[j] * scalar;
4457  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4458  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4459  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4460  y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4461  y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4462  y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4463  y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4464  }
4465  }
4466 
// Row blocks of 4: same scheme as above for rows i .. i+3
4467  for( ; (i+4UL) <= M; i+=4UL )
4468  {
4469  const size_t jbegin( ( IsUpper<MT1>::value )
4470  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4471  :( 0UL ) );
4472  const size_t jend( ( IsLower<MT1>::value )
4473  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4474  :( N ) );
4475  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4476 
4477  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4478  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4479 
4480  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4481  size_t j( jbegin );
4482 
4483  for( ; j<jpos; j+=IT::size ) {
4484  const IntrinsicType x1( x.load(j) );
4485  xmm1 = xmm1 + A.load(i ,j) * x1;
4486  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4487  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4488  xmm4 = xmm4 + A.load(i+3UL,j) * x1;
4489  }
4490 
4491  y[i ] -= sum( xmm1 ) * scalar;
4492  y[i+1UL] -= sum( xmm2 ) * scalar;
4493  y[i+2UL] -= sum( xmm3 ) * scalar;
4494  y[i+3UL] -= sum( xmm4 ) * scalar;
4495 
4496  for( ; remainder && j<jend; ++j ) {
4497  y[i ] -= A(i ,j) * x[j] * scalar;
4498  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4499  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4500  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4501  }
4502  }
4503 
// Row blocks of 3: same scheme as above for rows i .. i+2
4504  for( ; (i+3UL) <= M; i+=3UL )
4505  {
4506  const size_t jbegin( ( IsUpper<MT1>::value )
4507  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4508  :( 0UL ) );
4509  const size_t jend( ( IsLower<MT1>::value )
4510  ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
4511  :( N ) );
4512  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4513 
4514  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4515  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4516 
4517  IntrinsicType xmm1, xmm2, xmm3;
4518  size_t j( jbegin );
4519 
4520  for( ; j<jpos; j+=IT::size ) {
4521  const IntrinsicType x1( x.load(j) );
4522  xmm1 = xmm1 + A.load(i ,j) * x1;
4523  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4524  xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4525  }
4526 
4527  y[i ] -= sum( xmm1 ) * scalar;
4528  y[i+1UL] -= sum( xmm2 ) * scalar;
4529  y[i+2UL] -= sum( xmm3 ) * scalar;
4530 
4531  for( ; remainder && j<jend; ++j ) {
4532  y[i ] -= A(i ,j) * x[j] * scalar;
4533  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4534  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4535  }
4536  }
4537 
// Row blocks of 2: same scheme as above for rows i and i+1
4538  for( ; (i+2UL) <= M; i+=2UL )
4539  {
4540  const size_t jbegin( ( IsUpper<MT1>::value )
4541  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4542  :( 0UL ) );
4543  const size_t jend( ( IsLower<MT1>::value )
4544  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4545  :( N ) );
4546  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4547 
4548  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4549  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4550 
4551  IntrinsicType xmm1, xmm2;
4552  size_t j( jbegin );
4553 
4554  for( ; j<jpos; j+=IT::size ) {
4555  const IntrinsicType x1( x.load(j) );
4556  xmm1 = xmm1 + A.load(i ,j) * x1;
4557  xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4558  }
4559 
4560  y[i ] -= sum( xmm1 ) * scalar;
4561  y[i+1UL] -= sum( xmm2 ) * scalar;
4562 
4563  for( ; remainder && j<jend; ++j ) {
4564  y[i ] -= A(i ,j) * x[j] * scalar;
4565  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4566  }
4567  }
4568 
// At most one row is left at this point
4569  if( i < M )
4570  {
4571  const size_t jbegin( ( IsUpper<MT1>::value )
4572  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4573  :( 0UL ) );
4574  const size_t jend( ( IsLower<MT1>::value )
4575  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4576  :( N ) );
4577  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4578 
4579  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4580  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4581 
4582  IntrinsicType xmm1;
4583  size_t j( jbegin );
4584 
4585  for( ; j<jpos; j+=IT::size ) {
4586  xmm1 = xmm1 + A.load(i,j) * x.load(j);
4587  }
4588 
4589  y[i] -= sum( xmm1 ) * scalar;
4590 
4591  for( ; remainder && j<jend; ++j ) {
4592  y[i] -= A(i,j) * x[j] * scalar;
4593  }
4594  }
4595  }
4596  //**********************************************************************************************
4597 
4598  //**Default subtraction assignment to dense vectors (large matrices)****************************
4612  template< typename VT1 // Type of the left-hand side target vector
4613  , typename MT1 // Type of the left-hand side matrix operand
4614  , typename VT2 // Type of the right-hand side vector operand
4615  , typename ST2 > // Type of the scalar value
4616  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4617  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4618  {
4619  selectDefaultSubAssignKernel( y, A, x, scalar );
4620  }
4621  //**********************************************************************************************
4622 
4623  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
4637  template< typename VT1 // Type of the left-hand side target vector
4638  , typename MT1 // Type of the left-hand side matrix operand
4639  , typename VT2 // Type of the right-hand side vector operand
4640  , typename ST2 > // Type of the scalar value
4641  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4642  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4643  {
4644  typedef IntrinsicTrait<ElementType> IT;
4645 
4646  const size_t M( A.rows() );
4647  const size_t N( A.columns() );
4648 
4649  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
4650 
4651  size_t i( 0UL );
4652 
4653  for( ; (i+8UL) <= M; i+=8UL )
4654  {
4655  const size_t jbegin( ( IsUpper<MT1>::value )
4656  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4657  :( 0UL ) );
4658  const size_t jend( ( IsLower<MT1>::value )
4659  ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4660  :( N ) );
4661  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4662 
4663  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4664  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4665 
4666  size_t j( jbegin );
4667 
4668  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
4669  const size_t j1( j+IT::size );
4670  const size_t j2( j+IT::size*2UL );
4671  const size_t j3( j+IT::size*3UL );
4672  const IntrinsicType x1( x.load(j ) );
4673  const IntrinsicType x2( x.load(j1) );
4674  const IntrinsicType x3( x.load(j2) );
4675  const IntrinsicType x4( x.load(j3) );
4676  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4677  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4678  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4679  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4680  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4681  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4682  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4683  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4684  }
4685 
4686  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
4687  const size_t j1( j+IT::size );
4688  const IntrinsicType x1( x.load(j ) );
4689  const IntrinsicType x2( x.load(j1) );
4690  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4691  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4692  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4693  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4694  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4695  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4696  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4697  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4698  }
4699 
4700  for( ; j<jpos; j+=IT::size ) {
4701  const IntrinsicType x1( x.load(j) );
4702  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4703  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4704  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
4705  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
4706  y[i+4UL] -= sum( A.load(i+4UL,j) * x1 ) * scalar;
4707  y[i+5UL] -= sum( A.load(i+5UL,j) * x1 ) * scalar;
4708  y[i+6UL] -= sum( A.load(i+6UL,j) * x1 ) * scalar;
4709  y[i+7UL] -= sum( A.load(i+7UL,j) * x1 ) * scalar;
4710  }
4711 
4712  for( ; remainder && j<jend; ++j ) {
4713  y[i ] -= A(i ,j) * x[j] * scalar;
4714  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4715  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4716  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4717  y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4718  y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4719  y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4720  y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
4721  }
4722  }
4723 
4724  for( ; (i+4UL) <= M; i+=4UL )
4725  {
4726  const size_t jbegin( ( IsUpper<MT1>::value )
4727  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4728  :( 0UL ) );
4729  const size_t jend( ( IsLower<MT1>::value )
4730  ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4731  :( N ) );
4732  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4733 
4734  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4735  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4736 
4737  size_t j( jbegin );
4738 
4739  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
4740  const size_t j1( j+IT::size );
4741  const size_t j2( j+IT::size*2UL );
4742  const size_t j3( j+IT::size*3UL );
4743  const IntrinsicType x1( x.load(j ) );
4744  const IntrinsicType x2( x.load(j1) );
4745  const IntrinsicType x3( x.load(j2) );
4746  const IntrinsicType x4( x.load(j3) );
4747  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4748  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4749  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4750  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4751  }
4752 
4753  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
4754  const size_t j1( j+IT::size );
4755  const IntrinsicType x1( x.load(j ) );
4756  const IntrinsicType x2( x.load(j1) );
4757  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4758  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4759  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4760  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4761  }
4762 
4763  for( ; j<jpos; j+=IT::size ) {
4764  const IntrinsicType x1( x.load(j) );
4765  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4766  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4767  y[i+2UL] -= sum( A.load(i+2UL,j) * x1 ) * scalar;
4768  y[i+3UL] -= sum( A.load(i+3UL,j) * x1 ) * scalar;
4769  }
4770 
4771  for( ; remainder && j<jend; ++j ) {
4772  y[i ] -= A(i ,j) * x[j] * scalar;
4773  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4774  y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4775  y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4776  }
4777  }
4778 
4779  for( ; (i+2UL) <= M; i+=2UL )
4780  {
4781  const size_t jbegin( ( IsUpper<MT1>::value )
4782  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4783  :( 0UL ) );
4784  const size_t jend( ( IsLower<MT1>::value )
4785  ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4786  :( N ) );
4787  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4788 
4789  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4790  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4791 
4792  size_t j( jbegin );
4793 
4794  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
4795  const size_t j1( j+IT::size );
4796  const size_t j2( j+IT::size*2UL );
4797  const size_t j3( j+IT::size*3UL );
4798  const IntrinsicType x1( x.load(j ) );
4799  const IntrinsicType x2( x.load(j1) );
4800  const IntrinsicType x3( x.load(j2) );
4801  const IntrinsicType x4( x.load(j3) );
4802  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4803  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4804  }
4805 
4806  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
4807  const size_t j1( j+IT::size );
4808  const IntrinsicType x1( x.load(j ) );
4809  const IntrinsicType x2( x.load(j1) );
4810  y[i ] -= sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4811  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4812  }
4813 
4814  for( ; j<jpos; j+=IT::size ) {
4815  const IntrinsicType x1( x.load(j) );
4816  y[i ] -= sum( A.load(i ,j) * x1 ) * scalar;
4817  y[i+1UL] -= sum( A.load(i+1UL,j) * x1 ) * scalar;
4818  }
4819 
4820  for( ; remainder && j<jend; ++j ) {
4821  y[i ] -= A(i ,j) * x[j] * scalar;
4822  y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4823  }
4824  }
4825 
4826  if( i < M )
4827  {
4828  const size_t jbegin( ( IsUpper<MT1>::value )
4829  ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) & size_t(-IT::size) )
4830  :( 0UL ) );
4831  const size_t jend( ( IsLower<MT1>::value )
4832  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4833  :( N ) );
4834  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4835 
4836  const size_t jpos( remainder ? ( jend & size_t(-IT::size) ) : jend );
4837  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % (IT::size) ) ) == jpos, "Invalid end calculation" );
4838 
4839  size_t j( jbegin );
4840 
4841  for( ; (j+IT::size*3UL) < jpos; j+=IT::size*4UL ) {
4842  const size_t j1( j+IT::size );
4843  const size_t j2( j+IT::size*2UL );
4844  const size_t j3( j+IT::size*3UL );
4845  const IntrinsicType x1( x.load(j ) );
4846  const IntrinsicType x2( x.load(j1) );
4847  const IntrinsicType x3( x.load(j2) );
4848  const IntrinsicType x4( x.load(j3) );
4849  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4850  }
4851 
4852  for( ; (j+IT::size) < jpos; j+=IT::size*2UL ) {
4853  const size_t j1( j+IT::size );
4854  const IntrinsicType x1( x.load(j ) );
4855  const IntrinsicType x2( x.load(j1) );
4856  y[i] -= sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4857  }
4858 
4859  for( ; j<jpos; j+=IT::size ) {
4860  const IntrinsicType x1( x.load(j) );
4861  y[i] -= sum( A.load(i,j) * x1 ) * scalar;
4862  }
4863 
4864  for( ; remainder && j<jend; ++j ) {
4865  y[i] -= A(i,j) * x[j] * scalar;
4866  }
4867  }
4868  }
4869  //**********************************************************************************************
4870 
4871  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback overload of the BLAS-based subtraction assignment kernel.
//
// This overload takes part in overload resolution only when
// UseBlasKernel<VT1,MT1,VT2,ST2> does not hold (DisableIf). It performs no
// computation of its own and forwards all arguments unchanged to
// selectLargeSubAssignKernel().
//
// \param y The left-hand side target vector (modified by the forwarded kernel).
// \param A The left-hand side matrix operand.
// \param x The right-hand side vector operand.
// \param scalar The scalar value.
// \return void
4885  template< typename VT1 // Type of the left-hand side target vector
4886  , typename MT1 // Type of the left-hand side matrix operand
4887  , typename VT2 // Type of the right-hand side vector operand
4888  , typename ST2 > // Type of the scalar value
4889  static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
4890  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4891  {
4892  selectLargeSubAssignKernel( y, A, x, scalar );
4893  }
4894  //**********************************************************************************************
4895 
4896  //**BLAS-based subtraction assignment to dense vectors******************************************
4897 #if BLAZE_BLAS_MODE
4898 
// BLAS-backed overload of the subtraction assignment kernel.
//
// This overload takes part in overload resolution only when
// UseBlasKernel<VT1,MT1,VT2,ST2> holds (EnableIf); it is compiled only in
// BLAS mode (enclosing BLAZE_BLAS_MODE guard).
//
// \param y The left-hand side target vector; the scaled product is subtracted from it.
// \param A The left-hand side matrix operand.
// \param x The right-hand side vector operand.
// \param scalar The scalar value.
// \return void
4911  template< typename VT1 // Type of the left-hand side target vector
4912  , typename MT1 // Type of the left-hand side matrix operand
4913  , typename VT2 // Type of the right-hand side vector operand
4914  , typename ST2 > // Type of the scalar value
4915  static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
4916  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4917  {
4918  typedef typename VT1::ElementType ET;
4919 
      // Triangular matrix: evaluate scalar*x serially into a temporary, hand the
      // temporary to trmv together with the lower/upper flag, then subtract the
      // temporary from y.
      // NOTE(review): trmv is presumably an in-place tmp = A*tmp (BLAS trmv
      // semantics) -- confirm against blaze/math/blas/trmv.h.
4920  if( IsTriangular<MT1>::value ) {
4921  typename VT1::ResultType tmp( serial( scalar * x ) );
4922  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4923  subAssign( y, tmp );
4924  }
      // General matrix: a single gemv call with the factors -scalar and 1.
      // NOTE(review): presumably alpha=-scalar, beta=1, i.e. y = -scalar*A*x + y
      // in BLAS gemv terms -- confirm against blaze/math/blas/gemv.h.
4925  else {
4926  gemv( y, A, x, ET(-scalar), ET(1) );
4927  }
4928  }
4929 #endif
4930  //**********************************************************************************************
4931 
4932  //**Subtraction assignment to sparse vectors****************************************************
4933  // No special implementation for the subtraction assignment to sparse vectors.
4934  //**********************************************************************************************
4935 
4936  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of the scaled expression to a dense vector.
//
// The complete right-hand side expression is first evaluated serially into a
// temporary of type ResultType; the temporary is then multiplication-assigned
// to the target vector.
//
// \param lhs The target dense vector.
// \param rhs The right-hand side scaled expression.
// \return void
//
// The sizes of both vectors are only verified by an internal assertion.
4948  template< typename VT1 > // Type of the target dense vector
4949  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4950  {
4952 
4956 
4957  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4958 
      // serial() forces a serial evaluation of rhs while building the temporary.
4959  const ResultType tmp( serial( rhs ) );
4960  multAssign( ~lhs, tmp );
4961  }
4962  //**********************************************************************************************
4963 
4964  //**Multiplication assignment to sparse vectors*************************************************
4965  // No special implementation for the multiplication assignment to sparse vectors.
4966  //**********************************************************************************************
4967 
4968  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of the scaled matrix/vector product to a dense vector.
//
// Takes part in overload resolution only when UseSMPAssign<VT1> holds.
// The two operands of the wrapped multiplication (rhs.vector_) are evaluated
// into the local objects A and x, and the expression A * x * rhs.scalar_ is
// passed on to smpAssign().
//
// \param lhs The target dense vector.
// \param rhs The right-hand side scaled expression.
// \return void
4982  template< typename VT1 > // Type of the target dense vector
4983  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
4984  smpAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4985  {
4987 
4988  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4989 
4990  typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
4991  typename MVM::RightOperand right( rhs.vector_.rightOperand() );
4992 
      // No rows: nothing to assign. No columns: every element of the product is
      // an empty sum, so the target is reset instead of being computed.
4993  if( left.rows() == 0UL ) {
4994  return;
4995  }
4996  else if( left.columns() == 0UL ) {
4997  reset( ~lhs );
4998  return;
4999  }
5000 
5001  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5002  RT x( right ); // Evaluation of the right-hand side dense vector operand
5003 
5004  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5005  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5006  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5007  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5008 
5009  smpAssign( ~lhs, A * x * rhs.scalar_ );
5010  }
5011  //**********************************************************************************************
5012 
5013  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of the scaled matrix/vector product to a sparse vector.
//
// Takes part in overload resolution only when UseSMPAssign<VT1> holds.
// The right-hand side expression is evaluated into a temporary of type
// ResultType, which is then SMP-assigned to the target vector.
//
// \param lhs The target sparse vector.
// \param rhs The right-hand side scaled expression.
// \return void
5027  template< typename VT1 > // Type of the target sparse vector
5028  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5029  smpAssign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5030  {
5032 
5036 
5037  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5038 
5039  const ResultType tmp( rhs );
5040  smpAssign( ~lhs, tmp );
5041  }
5042  //**********************************************************************************************
5043 
5044  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of the scaled matrix/vector product to a dense vector.
//
// Takes part in overload resolution only when UseSMPAssign<VT1> holds.
// The two operands of the wrapped multiplication (rhs.vector_) are evaluated
// into the local objects A and x, and the expression A * x * rhs.scalar_ is
// passed on to smpAddAssign().
//
// \param lhs The target dense vector.
// \param rhs The right-hand side scaled expression.
// \return void
5058  template< typename VT1 > // Type of the target dense vector
5059  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5060  smpAddAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5061  {
5063 
5064  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5065 
5066  typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
5067  typename MVM::RightOperand right( rhs.vector_.rightOperand() );
5068 
      // A matrix without rows or without columns contributes nothing to the
      // target, so the function returns without touching lhs.
5069  if( left.rows() == 0UL || left.columns() == 0UL ) {
5070  return;
5071  }
5072 
5073  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5074  RT x( right ); // Evaluation of the right-hand side dense vector operand
5075 
5076  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5077  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5078  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5079  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5080 
5081  smpAddAssign( ~lhs, A * x * rhs.scalar_ );
5082  }
5083  //**********************************************************************************************
5084 
5085  //**SMP addition assignment to sparse vectors***************************************************
5086  // No special implementation for the SMP addition assignment to sparse vectors.
5087  //**********************************************************************************************
5088 
5089  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of the scaled matrix/vector product to a dense vector.
//
// Takes part in overload resolution only when UseSMPAssign<VT1> holds.
// The two operands of the wrapped multiplication (rhs.vector_) are evaluated
// into the local objects A and x, and the expression A * x * rhs.scalar_ is
// passed on to smpSubAssign().
//
// \param lhs The target dense vector.
// \param rhs The right-hand side scaled expression.
// \return void
5103  template< typename VT1 > // Type of the target dense vector
5104  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5105  smpSubAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5106  {
5108 
5109  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5110 
5111  typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
5112  typename MVM::RightOperand right( rhs.vector_.rightOperand() );
5113 
      // A matrix without rows or without columns contributes nothing to the
      // target, so the function returns without touching lhs.
5114  if( left.rows() == 0UL || left.columns() == 0UL ) {
5115  return;
5116  }
5117 
5118  LT A( left ); // Evaluation of the left-hand side dense matrix operand
5119  RT x( right ); // Evaluation of the right-hand side dense vector operand
5120 
5121  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5122  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
5123  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
5124  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
5125 
5126  smpSubAssign( ~lhs, A * x * rhs.scalar_ );
5127  }
5128  //**********************************************************************************************
5129 
5130  //**SMP subtraction assignment to sparse vectors************************************************
5131  // No special implementation for the SMP subtraction assignment to sparse vectors.
5132  //**********************************************************************************************
5133 
5134  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of the scaled matrix/vector product to a dense vector.
//
// Takes part in overload resolution only when UseSMPAssign<VT1> holds.
// The right-hand side expression is evaluated into a temporary of type
// ResultType, which is then passed on to smpMultAssign().
//
// \param lhs The target dense vector.
// \param rhs The right-hand side scaled expression.
// \return void
5148  template< typename VT1 > // Type of the target dense vector
5149  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5150  smpMultAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
5151  {
5153 
5157 
5158  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5159 
5160  const ResultType tmp( rhs );
5161  smpMultAssign( ~lhs, tmp );
5162  }
5163  //**********************************************************************************************
5164 
5165  //**SMP multiplication assignment to sparse vectors*********************************************
5166  // No special implementation for the SMP multiplication assignment to sparse vectors.
5167  //**********************************************************************************************
5168 
5169  //**Compile time checks*************************************************************************
5177  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
5178  //**********************************************************************************************
5179 };
5181 //*************************************************************************************************
5182 
5183 
5184 
5185 
5186 //=================================================================================================
5187 //
5188 // GLOBAL BINARY ARITHMETIC OPERATORS
5189 //
5190 //=================================================================================================
5191 
5192 //*************************************************************************************************
// Multiplication of a dense matrix with a dense vector.
//
// Returns a DMatDVecMultExpr<T1,T2> expression object built from the two
// operands. The overload is disabled when T1 is a matrix/matrix multiplication
// expression (DisableIf< IsMatMatMultExpr<T1>, ... >).
//
// \exception std::invalid_argument (via BLAZE_THROW_INVALID_ARGUMENT) when the
//            number of matrix columns differs from the vector size.
//
// NOTE(review): the declarator line between the return type and the opening
// brace (listing line 5225, which names the parameters mat and vec) is not
// present in this extract -- restore it from the original header.
5222 template< typename T1 // Type of the left-hand side dense matrix
5223  , typename T2 > // Type of the right-hand side dense vector
5224 inline const typename DisableIf< IsMatMatMultExpr<T1>, DMatDVecMultExpr<T1,T2> >::Type
5226 {
5228 
5229  if( (~mat).columns() != (~vec).size() ) {
5230  BLAZE_THROW_INVALID_ARGUMENT( "Matrix and vector sizes do not match" );
5231  }
5232 
5233  return DMatDVecMultExpr<T1,T2>( ~mat, ~vec );
5234 }
5235 //*************************************************************************************************
5236 
5237 
5238 
5239 
5240 //=================================================================================================
5241 //
5242 // GLOBAL RESTRUCTURING BINARY ARITHMETIC OPERATORS
5243 //
5244 //=================================================================================================
5245 
5246 //*************************************************************************************************
// Restructuring multiplication of a matrix/matrix product with a dense vector.
//
// Enabled only when T1 is a matrix/matrix multiplication expression
// (EnableIf< IsMatMatMultExpr<T1>, ... >). Instead of evaluating (A*B)*x, the
// operands are regrouped to A*(B*x): the right operand of the matrix product is
// multiplied with the vector first, and the left operand is applied to that result.
//
// NOTE(review): the declarator line between the return type and the opening
// brace (listing line 5263, which names the parameters mat and vec) is not
// present in this extract -- restore it from the original header.
5259 template< typename T1 // Type of the left-hand side dense matrix
5260  , bool SO // Storage order of the left-hand side dense matrix
5261  , typename T2 > // Type of the right-hand side dense vector
5262 inline const typename EnableIf< IsMatMatMultExpr<T1>, typename MultExprTrait<T1,T2>::Type >::Type
5264 {
5266 
5268 
5269  return (~mat).leftOperand() * ( (~mat).rightOperand() * vec );
5270 }
5271 //*************************************************************************************************
5272 
5273 
5274 
5275 
5276 //=================================================================================================
5277 //
5278 // SIZE SPECIALIZATIONS
5279 //
5280 //=================================================================================================
5281 
5282 //*************************************************************************************************
// Size trait for dense matrix/dense vector multiplications: the specialization
// derives from Rows<MT>, i.e. it reports whatever Rows yields for the matrix operand.
5284 template< typename MT, typename VT >
5285 struct Size< DMatDVecMultExpr<MT,VT> > : public Rows<MT>
5286 {};
5288 //*************************************************************************************************
5289 
5290 
5291 
5292 
5293 //=================================================================================================
5294 //
5295 // ISALIGNED SPECIALIZATIONS
5296 //
5297 //=================================================================================================
5298 
5299 //*************************************************************************************************
// Alignment trait for dense matrix/dense vector multiplications: the expression
// counts as aligned only if both IsAligned<MT> and IsAligned<VT> hold.
5301 template< typename MT, typename VT >
5302 struct IsAligned< DMatDVecMultExpr<MT,VT> >
5303  : public IsTrue< And< IsAligned<MT>, IsAligned<VT> >::value >
5304 {};
5306 //*************************************************************************************************
5307 
5308 
5309 
5310 
5311 //=================================================================================================
5312 //
5313 // EXPRESSION TRAIT SPECIALIZATIONS
5314 //
5315 //=================================================================================================
5316 
5317 //*************************************************************************************************
// Subvector expression trait for dense matrix/dense vector multiplications.
//
// The nested Type is the multiplication expression type (MultExprTrait) formed
// from the submatrix expression type of const MT and the subvector expression
// type of const VT, both taken with the same AF flag.
5319 template< typename MT, typename VT, bool AF >
5320 struct SubvectorExprTrait< DMatDVecMultExpr<MT,VT>, AF >
5321 {
5322  public:
5323  //**********************************************************************************************
5324  typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT,AF>::Type
5325  , typename SubvectorExprTrait<const VT,AF>::Type >::Type Type;
5326  //**********************************************************************************************
5327 };
5329 //*************************************************************************************************
5330 
5331 } // namespace blaze
5332 
5333 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:89
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:212
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
RightOperand rightOperand() const
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:329
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:7820
Header file for basic type definitions.
DMatDVecMultExpr(const MT &mat, const VT &vec)
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:240
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:79
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
Expression object for dense matrix-dense vector multiplications.The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:119
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
MT::ResultType MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:125
Header file for the IsSame and IsStrictlySame type traits.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:363
MultTrait< MRT, VRT >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:201
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:90
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
DMatDVecMultExpr< MT, VT > This
Type of this DMatDVecMultExpr instance.
Definition: DMatDVecMultExpr.h:200
Header file for the VecScalarMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
size_t size() const
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:309
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
VT::CompositeType VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:130
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
MRT::ElementType MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:127
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:261
SelectType< evaluateVector, const VRT, VCT >::Type RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:218
Compile time type selection.The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exceptionThis macro encapsulates the default way of Bla...
Definition: Exception.h:331
Header file for the IsBlasCompatible type trait.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
MT::CompositeType MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:129
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:319
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:373
Base class for all matrix/vector multiplication expression templates.The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:66
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:166
SelectType< evaluateMatrix, const MRT, MCT >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:215
Constraint on the data type.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:203
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:383
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:116
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:79
Intrinsic characteristics of data types.The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Base template for the MultTrait class.
Definition: MultTrait.h:138
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:296
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:384
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:341
VT::ResultType VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:126
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:206
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:209
Constraint on the data type.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:353
Header file for the HasMutableDataAccess type trait.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:202
BLAZE_ALWAYS_INLINE int16_t sum(const simd_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:63
Header file for all intrinsic functionality.
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatDVecMultExpr.h:204
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix)
Returns the current number of columns of the matrix.
Definition: Matrix.h:324
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:205
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Constraint on the data type.
Header file for the complex data type.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:254
Header file for the IsUpper type trait.
Header file for exception macros.
Header file for the MatVecMultExpr base class.
VRT::ElementType VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:128
EnableIf< IsDenseVector< VT1 > >::Type smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:189
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.