TDVecTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
55 #include <blaze/math/Intrinsics.h>
56 #include <blaze/math/shims/Reset.h>
78 #include <blaze/system/BLAS.h>
81 #include <blaze/util/Assert.h>
82 #include <blaze/util/Complex.h>
85 #include <blaze/util/DisableIf.h>
86 #include <blaze/util/EnableIf.h>
87 #include <blaze/util/Exception.h>
89 #include <blaze/util/SelectType.h>
90 #include <blaze/util/Types.h>
99 
100 
101 namespace blaze {
102 
103 //=================================================================================================
104 //
105 // CLASS TDVECTDMATMULTEXPR
106 //
107 //=================================================================================================
108 
109 //*************************************************************************************************
116 template< typename VT // Type of the left-hand side dense vector
117  , typename MT > // Type of the right-hand side dense matrix
118 class TDVecTDMatMultExpr : public DenseVector< TDVecTDMatMultExpr<VT,MT>, true >
119  , private TVecMatMultExpr
120  , private Computation
121 {
122  private:
123  //**Type definitions****************************************************************************
124  typedef typename VT::ResultType VRT;  //!< Result type of the left-hand side dense vector expression.
125  typedef typename MT::ResultType MRT;  //!< Result type of the right-hand side dense matrix expression.
126  typedef typename VRT::ElementType VET;  //!< Element type of the left-hand side vector's result type.
127  typedef typename MRT::ElementType MET;  //!< Element type of the right-hand side matrix's result type.
128  typedef typename VT::CompositeType VCT;  //!< Composite type of the left-hand side dense vector expression.
129  typedef typename MT::CompositeType MCT;  //!< Composite type of the right-hand side dense matrix expression.
130  //**********************************************************************************************
131 
132  //**********************************************************************************************
//! Compile-time switch for the left-hand side vector operand: non-zero when \a VT is a
//! computation (IsComputation) or requires an evaluation (RequiresEvaluation).
134  enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
135  //**********************************************************************************************
136 
137  //**********************************************************************************************
139  enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
141  //**********************************************************************************************
142 
143  //**********************************************************************************************
145 
149  template< typename T1 >
150  struct UseSMPAssign {
151  enum { value = ( evaluateVector || evaluateMatrix ) };
152  };
154  //**********************************************************************************************
155 
156  //**********************************************************************************************
158 
161  template< typename T1, typename T2, typename T3 >
162  struct UseBlasKernel {
163  enum { value = BLAZE_BLAS_MODE &&
164  HasMutableDataAccess<T1>::value &&
165  HasConstDataAccess<T2>::value &&
166  HasConstDataAccess<T3>::value &&
167  !IsDiagonal<T3>::value &&
168  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
169  IsBlasCompatible<typename T1::ElementType>::value &&
170  IsBlasCompatible<typename T2::ElementType>::value &&
171  IsBlasCompatible<typename T3::ElementType>::value &&
172  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
173  IsSame< typename T1::ElementType, typename T3::ElementType >::value };
174  };
176  //**********************************************************************************************
177 
178  //**********************************************************************************************
180 
184  template< typename T1, typename T2, typename T3 >
185  struct UseVectorizedDefaultKernel {
186  enum { value = useOptimizedKernels &&
187  !IsDiagonal<T3>::value &&
188  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
189  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
190  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
191  IntrinsicTrait<typename T1::ElementType>::addition &&
192  IntrinsicTrait<typename T1::ElementType>::multiplication };
193  };
195  //**********************************************************************************************
196 
197  public:
198  //**Type definitions****************************************************************************
// NOTE(review): ElementType and ResultType are used below but are not declared in the visible
// part of this listing; presumably they are declared on lines the listing omits - confirm.
204  typedef const ElementType ReturnType;  //!< Type returned by operator[] and at(): a const ElementType.
205  typedef const ResultType CompositeType;  //!< Composite type of this expression: a const ResultType.
206 
//! Member type of the left operand: \a VT by value if it is an expression (IsExpression),
//! otherwise a reference-to-const \a VT.
208  typedef typename SelectType< IsExpression<VT>::value, const VT, const VT& >::Type LeftOperand;
209 
//! Member type of the right operand: \a MT by value if it is an expression (IsExpression),
//! otherwise a reference-to-const \a MT.
211  typedef typename SelectType< IsExpression<MT>::value, const MT, const MT& >::Type RightOperand;
212 
215 
218  //**********************************************************************************************
219 
220  //**Compilation flags***************************************************************************
222  enum { vectorizable = !IsDiagonal<MT>::value &&
223  VT::vectorizable && MT::vectorizable &&
227 
//! Compilation switch for the expression template assignment strategy: non-zero only if neither
//! operand needs an intermediate evaluation and both operand types are themselves SMP assignable.
229  enum { smpAssignable = !evaluateVector && VT::smpAssignable &&
230  !evaluateMatrix && MT::smpAssignable };
231  //**********************************************************************************************
232 
233  //**Constructor*********************************************************************************
239  explicit inline TDVecTDMatMultExpr( const VT& vec, const MT& mat )
240  : vec_( vec ) // Left-hand side dense vector of the multiplication expression
241  , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
242  {
243  BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
244  }
245  //**********************************************************************************************
246 
247  //**Subscript operator**************************************************************************
253  inline ReturnType operator[]( size_t index ) const {
254  BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
255 
256  if( ( IsStrictlyLower<MT>::value && index == mat_.columns()-1UL ) ||
257  ( IsStrictlyUpper<MT>::value && index == 0UL ) ||
258  mat_.rows() == 0UL )
259  return ElementType();
260 
262  return vec_[index] * mat_(index,index);
263 
264  const size_t ibegin( ( IsLower<MT>::value )
265  ?( IsStrictlyLower<MT>::value ? index+1UL : index )
266  :( 0UL ) );
267  const size_t iend( ( IsUpper<MT>::value )
268  ?( IsStrictlyUpper<MT>::value ? index : index+1UL )
269  :( mat_.rows() ) );
270  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
271 
272  const size_t inum( iend - ibegin );
273  const size_t ipos( ibegin + ( ( inum - 1UL ) & size_t(-2) ) + 1UL );
274 
275  ElementType res( vec_[ibegin] * mat_(ibegin,index) );
276 
277  for( size_t i=ibegin+1UL; i<ipos; i+=2UL ) {
278  res += vec_[i] * mat_(i,index) + vec_[i+1UL] * mat_(i+1UL,index);
279  }
280  if( ipos < iend ) {
281  res += vec_[ipos] * mat_(ipos,index);
282  }
283 
284  return res;
285  }
286  //**********************************************************************************************
287 
288  //**At function*********************************************************************************
295  inline ReturnType at( size_t index ) const {
296  if( index >= mat_.columns() ) {
297  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
298  }
299  return (*this)[index];
300  }
301  //**********************************************************************************************
302 
303  //**Size function*******************************************************************************
308  inline size_t size() const {
309  return mat_.columns();
310  }
311  //**********************************************************************************************
312 
313  //**Left operand access*************************************************************************
318  inline LeftOperand leftOperand() const {
319  return vec_;
320  }
321  //**********************************************************************************************
322 
323  //**Right operand access************************************************************************
328  inline RightOperand rightOperand() const {
329  return mat_;
330  }
331  //**********************************************************************************************
332 
333  //**********************************************************************************************
339  template< typename T >
340  inline bool canAlias( const T* alias ) const {
341  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
342  }
343  //**********************************************************************************************
344 
345  //**********************************************************************************************
351  template< typename T >
352  inline bool isAliased( const T* alias ) const {
353  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
354  }
355  //**********************************************************************************************
356 
357  //**********************************************************************************************
362  inline bool isAligned() const {
363  return vec_.isAligned() && mat_.isAligned();
364  }
365  //**********************************************************************************************
366 
367  //**********************************************************************************************
372  inline bool canSMPAssign() const {
373  return ( !BLAZE_BLAS_IS_PARALLEL ||
374  ( IsComputation<MT>::value && !evaluateMatrix ) ||
375  ( mat_.rows() * mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
376  ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
377  }
378  //**********************************************************************************************
379 
380  private:
381  //**Member variables****************************************************************************
382  LeftOperand vec_;  //!< Left-hand side dense vector of the multiplication expression.
383  RightOperand mat_;  //!< Right-hand side dense matrix of the multiplication expression.
384  //**********************************************************************************************
385 
386  //**Assignment to dense vectors*****************************************************************
/*!\brief Assignment of a transpose dense vector-transpose dense matrix multiplication to a
//        transpose dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression to be assigned.
// \return void
//
// Both operands of \a rhs are passed through serial() into local objects of type LT and RT, and
// the result is then computed by selectAssignKernel().
// NOTE(review): LT and RT are not declared in the visible part of this listing - confirm their
// definition before relying on whether these locals are copies or references.
*/
399  template< typename VT1 > // Type of the target dense vector
400  friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
401  {
403 
404  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
405 
     // A matrix without rows yields a reset target vector; a matrix without columns yields an
     // empty result, so there is nothing to do.
406  if( rhs.mat_.rows() == 0UL ) {
407  reset( ~lhs );
408  return;
409  }
410  else if( rhs.mat_.columns() == 0UL ) {
411  return;
412  }
413 
414  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
415  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
416 
     // Sanity checks: the local operands must have the dimensions of the original operands
417  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
418  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
419  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
420  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
421 
422  TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
423  }
425  //**********************************************************************************************
426 
427  //**Assignment to dense vectors (kernel selection)**********************************************
438  template< typename VT1 // Type of the left-hand side target vector
439  , typename VT2 // Type of the left-hand side vector operand
440  , typename MT1 > // Type of the right-hand side matrix operand
441  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
442  {
443  if( ( IsDiagonal<MT1>::value ) ||
444  ( IsComputation<MT>::value && !evaluateMatrix ) ||
445  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
446  selectSmallAssignKernel( y, x, A );
447  else
448  selectBlasAssignKernel( y, x, A );
449  }
451  //**********************************************************************************************
452 
453  //**Default assignment to dense vectors*********************************************************
/*!\brief Default assignment of a transpose dense vector-transpose dense matrix multiplication.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// Forwards the product expression of the two operands to the assign() member of the target.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1 >  // Type of the right-hand side matrix operand
static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   y.assign( x * A );
}
475  //**********************************************************************************************
476 
477  //**Default assignment to dense vectors (small matrices)****************************************
491  template< typename VT1 // Type of the left-hand side target vector
492  , typename VT2 // Type of the left-hand side vector operand
493  , typename MT1 > // Type of the right-hand side matrix operand
494  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
495  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
496  {
497  selectDefaultAssignKernel( y, x, A );
498  }
500  //**********************************************************************************************
501 
502  //**Vectorized default assignment to dense vectors (small matrices)*****************************
/*!\brief Vectorized default assignment of a small transpose dense vector-transpose dense matrix
//        multiplication.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// This overload is selected when UseVectorizedDefaultKernel is set for the three types. The
// columns of \a A are processed in blocks of 8, 4, 3, 2 and finally a single column. For each
// block, one intrinsic accumulator per column is filled by the intrinsic loop and reduced via
// sum() into \a y. Rows between ipos and iend are added by a scalar loop if at least one of the
// operands is not padded.
// NOTE(review): the accumulators (xmm1, ...) are read without an explicit initialization; this
// presumably relies on the default constructor of IntrinsicType producing a zero vector - confirm.
*/
516  template< typename VT1 // Type of the left-hand side target vector
517  , typename VT2 // Type of the left-hand side vector operand
518  , typename MT1 > // Type of the right-hand side matrix operand
519  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
520  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
521  {
522  typedef IntrinsicTrait<ElementType> IT;
523 
524  const size_t M( A.rows() );
525  const size_t N( A.columns() );
526 
     // A scalar remainder loop is required unless both operands are padded
527  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
528 
529  size_t j( 0UL );
530 
     // Blocks of 8 columns
531  for( ; (j+8UL) <= N; j+=8UL )
532  {
     // First row to visit: 0 for general matrices; for lower matrices the first row of column j
     // that can be non-zero, masked with -IT::size (presumably rounding down to a multiple of
     // the intrinsic vector width).
533  const size_t ibegin( ( IsLower<MT1>::value )
534  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
535  :( 0UL ) );
     // One past the last row to visit: M for general matrices; for upper matrices bounded by
     // the last column of the block.
536  const size_t iend( ( IsUpper<MT1>::value )
537  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
538  :( M ) );
539  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
540 
     // End of the intrinsic loop: iend rounded down to a multiple of IT::size if a remainder
     // loop is needed (see the assertion below), otherwise iend itself.
541  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
542  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
543 
544  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
545  size_t i( ibegin );
546 
     // Intrinsic loop: one load of x is reused for all 8 columns
547  for( ; i<ipos; i+=IT::size ) {
548  const IntrinsicType x1( x.load(i) );
549  xmm1 = xmm1 + x1 * A.load(i,j );
550  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
551  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
552  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
553  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
554  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
555  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
556  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
557  }
558 
     // Reduce every accumulator and overwrite the corresponding target element
559  y[j ] = sum( xmm1 );
560  y[j+1UL] = sum( xmm2 );
561  y[j+2UL] = sum( xmm3 );
562  y[j+3UL] = sum( xmm4 );
563  y[j+4UL] = sum( xmm5 );
564  y[j+5UL] = sum( xmm6 );
565  y[j+6UL] = sum( xmm7 );
566  y[j+7UL] = sum( xmm8 );
567 
     // Scalar remainder loop for the rows in [ipos,iend)
568  for( ; remainder && i<iend; ++i ) {
569  y[j ] += x[i] * A(i,j );
570  y[j+1UL] += x[i] * A(i,j+1UL);
571  y[j+2UL] += x[i] * A(i,j+2UL);
572  y[j+3UL] += x[i] * A(i,j+3UL);
573  y[j+4UL] += x[i] * A(i,j+4UL);
574  y[j+5UL] += x[i] * A(i,j+5UL);
575  y[j+6UL] += x[i] * A(i,j+6UL);
576  y[j+7UL] += x[i] * A(i,j+7UL);
577  }
578  }
579 
     // Blocks of 4 columns (same scheme as above)
580  for( ; (j+4UL) <= N; j+=4UL )
581  {
582  const size_t ibegin( ( IsLower<MT1>::value )
583  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
584  :( 0UL ) );
585  const size_t iend( ( IsUpper<MT1>::value )
586  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
587  :( M ) );
588  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
589 
590  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
591  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
592 
593  IntrinsicType xmm1, xmm2, xmm3, xmm4;
594  size_t i( ibegin );
595 
596  for( ; i<ipos; i+=IT::size ) {
597  const IntrinsicType x1( x.load(i) );
598  xmm1 = xmm1 + x1 * A.load(i,j );
599  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
600  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
601  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
602  }
603 
604  y[j ] = sum( xmm1 );
605  y[j+1UL] = sum( xmm2 );
606  y[j+2UL] = sum( xmm3 );
607  y[j+3UL] = sum( xmm4 );
608 
609  for( ; remainder && i<iend; ++i ) {
610  y[j ] += x[i] * A(i,j );
611  y[j+1UL] += x[i] * A(i,j+1UL);
612  y[j+2UL] += x[i] * A(i,j+2UL);
613  y[j+3UL] += x[i] * A(i,j+3UL);
614  }
615  }
616 
     // Blocks of 3 columns (same scheme as above)
617  for( ; (j+3UL) <= N; j+=3UL )
618  {
619  const size_t ibegin( ( IsLower<MT1>::value )
620  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
621  :( 0UL ) );
622  const size_t iend( ( IsUpper<MT1>::value )
623  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
624  :( M ) );
625  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
626 
627  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
628  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
629 
630  IntrinsicType xmm1, xmm2, xmm3;
631  size_t i( ibegin );
632 
633  for( ; i<ipos; i+=IT::size ) {
634  const IntrinsicType x1( x.load(i) );
635  xmm1 = xmm1 + x1 * A.load(i,j );
636  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
637  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
638  }
639 
640  y[j ] = sum( xmm1 );
641  y[j+1UL] = sum( xmm2 );
642  y[j+2UL] = sum( xmm3 );
643 
644  for( ; remainder && i<iend; ++i ) {
645  y[j ] += x[i] * A(i,j );
646  y[j+1UL] += x[i] * A(i,j+1UL);
647  y[j+2UL] += x[i] * A(i,j+2UL);
648  }
649  }
650 
     // Blocks of 2 columns (same scheme as above)
651  for( ; (j+2UL) <= N; j+=2UL )
652  {
653  const size_t ibegin( ( IsLower<MT1>::value )
654  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
655  :( 0UL ) );
656  const size_t iend( ( IsUpper<MT1>::value )
657  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
658  :( M ) );
659  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
660 
661  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
662  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
663 
664  IntrinsicType xmm1, xmm2;
665  size_t i( ibegin );
666 
667  for( ; i<ipos; i+=IT::size ) {
668  const IntrinsicType x1( x.load(i) );
669  xmm1 = xmm1 + x1 * A.load(i,j );
670  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
671  }
672 
673  y[j ] = sum( xmm1 );
674  y[j+1UL] = sum( xmm2 );
675 
676  for( ; remainder && i<iend; ++i ) {
677  y[j ] += x[i] * A(i,j );
678  y[j+1UL] += x[i] * A(i,j+1UL);
679  }
680  }
681 
     // At most one column is left at this point
682  if( j < N )
683  {
684  const size_t ibegin( ( IsLower<MT1>::value )
685  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
686  :( 0UL ) );
687  const size_t iend( ( IsUpper<MT1>::value )
688  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
689  :( M ) );
690  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
691 
692  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
693  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
694 
695  IntrinsicType xmm1;
696  size_t i( ibegin );
697 
698  for( ; i<ipos; i+=IT::size ) {
699  xmm1 = xmm1 + x.load(i) * A.load(i,j);
700  }
701 
702  y[j] = sum( xmm1 );
703 
704  for( ; remainder && i<iend; ++i ) {
705  y[j] += x[i] * A(i,j);
706  }
707  }
708  }
710  //**********************************************************************************************
711 
712  //**Default assignment to dense vectors (large matrices)****************************************
726  template< typename VT1 // Type of the left-hand side target vector
727  , typename VT2 // Type of the left-hand side vector operand
728  , typename MT1 > // Type of the right-hand side matrix operand
729  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
730  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
731  {
732  selectDefaultAssignKernel( y, x, A );
733  }
735  //**********************************************************************************************
736 
737  //**Vectorized default assignment to dense vectors (large matrices)*****************************
/*!\brief Vectorized default assignment of a large transpose dense vector-transpose dense matrix
//        multiplication.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// This overload is selected when UseVectorizedDefaultKernel is set for the three types. The
// target is reset first and all contributions are accumulated directly into \a y. The columns of
// \a A are processed in blocks of 8, 4, 2 and finally a single column; within each block the rows
// are traversed in steps of four, two and one intrinsic vector(s). Rows between ipos and iend are
// added by a scalar loop if at least one of the operands is not padded.
*/
751  template< typename VT1 // Type of the left-hand side target vector
752  , typename VT2 // Type of the left-hand side vector operand
753  , typename MT1 > // Type of the right-hand side matrix operand
754  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
755  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
756  {
757  typedef IntrinsicTrait<ElementType> IT;
758 
759  const size_t M( A.rows() );
760  const size_t N( A.columns() );
761 
     // A scalar remainder loop is required unless both operands are padded
762  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
763 
     // All loops below accumulate with +=, hence the target is reset up front
764  reset( y );
765 
766  size_t j( 0UL );
767 
     // Blocks of 8 columns
768  for( ; (j+8UL) <= N; j+=8UL )
769  {
     // First row to visit: 0 for general matrices; for lower matrices the first row of column j
     // that can be non-zero, masked with -IT::size (presumably rounding down to a multiple of
     // the intrinsic vector width).
770  const size_t ibegin( ( IsLower<MT1>::value )
771  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
772  :( 0UL ) );
     // One past the last row to visit: M for general matrices; for upper matrices bounded by
     // the last column of the block.
773  const size_t iend( ( IsUpper<MT1>::value )
774  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
775  :( M ) );
776  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
777 
     // End of the intrinsic loops: iend rounded down to a multiple of IT::size if a remainder
     // loop is needed (see the assertion below), otherwise iend itself.
778  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
779  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
780 
781  size_t i( ibegin );
782 
     // Four intrinsic vectors of x per iteration
783  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
784  const size_t i1( i+IT::size );
785  const size_t i2( i+IT::size*2UL );
786  const size_t i3( i+IT::size*3UL );
787  const IntrinsicType x1( x.load(i ) );
788  const IntrinsicType x2( x.load(i1) );
789  const IntrinsicType x3( x.load(i2) );
790  const IntrinsicType x4( x.load(i3) );
791  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
792  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
793  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
794  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
795  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
796  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
797  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
798  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
799  }
800 
     // Two intrinsic vectors of x per iteration
801  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
802  const size_t i1( i+IT::size );
803  const IntrinsicType x1( x.load(i ) );
804  const IntrinsicType x2( x.load(i1) );
805  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
806  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
807  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
808  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
809  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
810  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
811  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
812  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
813  }
814 
     // One intrinsic vector of x per iteration
815  for( ; i<ipos; i+=IT::size ) {
816  const IntrinsicType x1( x.load(i) );
817  y[j ] += sum( x1 * A.load(i,j ) );
818  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
819  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
820  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
821  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
822  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
823  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
824  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
825  }
826 
     // Scalar remainder loop for the rows in [ipos,iend)
827  for( ; remainder && i<iend; ++i ) {
828  y[j ] += x[i] * A(i,j );
829  y[j+1UL] += x[i] * A(i,j+1UL);
830  y[j+2UL] += x[i] * A(i,j+2UL);
831  y[j+3UL] += x[i] * A(i,j+3UL);
832  y[j+4UL] += x[i] * A(i,j+4UL);
833  y[j+5UL] += x[i] * A(i,j+5UL);
834  y[j+6UL] += x[i] * A(i,j+6UL);
835  y[j+7UL] += x[i] * A(i,j+7UL);
836  }
837  }
838 
     // Blocks of 4 columns (same scheme as above)
839  for( ; (j+4UL) <= N; j+=4UL )
840  {
841  const size_t ibegin( ( IsLower<MT1>::value )
842  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
843  :( 0UL ) );
844  const size_t iend( ( IsUpper<MT1>::value )
845  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
846  :( M ) );
847  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
848 
849  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
850  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
851 
852  size_t i( ibegin );
853 
854  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
855  const size_t i1( i+IT::size );
856  const size_t i2( i+IT::size*2UL );
857  const size_t i3( i+IT::size*3UL );
858  const IntrinsicType x1( x.load(i ) );
859  const IntrinsicType x2( x.load(i1) );
860  const IntrinsicType x3( x.load(i2) );
861  const IntrinsicType x4( x.load(i3) );
862  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
863  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
864  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
865  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
866  }
867 
868  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
869  const size_t i1( i+IT::size );
870  const IntrinsicType x1( x.load(i ) );
871  const IntrinsicType x2( x.load(i1) );
872  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
873  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
874  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
875  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
876  }
877 
878  for( ; i<ipos; i+=IT::size ) {
879  const IntrinsicType x1( x.load(i) );
880  y[j ] += sum( x1 * A.load(i,j ) );
881  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
882  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
883  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
884  }
885 
886  for( ; remainder && i<iend; ++i ) {
887  y[j ] += x[i] * A(i,j );
888  y[j+1UL] += x[i] * A(i,j+1UL);
889  y[j+2UL] += x[i] * A(i,j+2UL);
890  y[j+3UL] += x[i] * A(i,j+3UL);
891  }
892  }
893 
     // Blocks of 2 columns (same scheme as above)
894  for( ; (j+2UL) <= N; j+=2UL )
895  {
896  const size_t ibegin( ( IsLower<MT1>::value )
897  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
898  :( 0UL ) );
899  const size_t iend( ( IsUpper<MT1>::value )
900  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
901  :( M ) );
902  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
903 
904  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
905  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
906 
907  size_t i( ibegin );
908 
909  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
910  const size_t i1( i+IT::size );
911  const size_t i2( i+IT::size*2UL );
912  const size_t i3( i+IT::size*3UL );
913  const IntrinsicType x1( x.load(i ) );
914  const IntrinsicType x2( x.load(i1) );
915  const IntrinsicType x3( x.load(i2) );
916  const IntrinsicType x4( x.load(i3) );
917  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
918  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
919  }
920 
921  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
922  const size_t i1( i+IT::size );
923  const IntrinsicType x1( x.load(i ) );
924  const IntrinsicType x2( x.load(i1) );
925  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
926  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
927  }
928 
929  for( ; i<ipos; i+=IT::size ) {
930  const IntrinsicType x1( x.load(i) );
931  y[j ] += sum( x1 * A.load(i,j ) );
932  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
933  }
934 
935  for( ; remainder && i<iend; ++i ) {
936  y[j ] += x[i] * A(i,j );
937  y[j+1UL] += x[i] * A(i,j+1UL);
938  }
939  }
940 
     // At most one column is left at this point
941  if( j < N )
942  {
943  const size_t ibegin( ( IsLower<MT1>::value )
944  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
945  :( 0UL ) );
946  const size_t iend( ( IsUpper<MT1>::value )
947  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
948  :( M ) );
949  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
950 
951  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
952  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
953 
954  size_t i( ibegin );
955 
956  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
957  const size_t i1( i+IT::size );
958  const size_t i2( i+IT::size*2UL );
959  const size_t i3( i+IT::size*3UL );
960  const IntrinsicType x1( x.load(i ) );
961  const IntrinsicType x2( x.load(i1) );
962  const IntrinsicType x3( x.load(i2) );
963  const IntrinsicType x4( x.load(i3) );
964  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
965  }
966 
967  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
968  const size_t i1( i+IT::size );
969  const IntrinsicType x1( x.load(i ) );
970  const IntrinsicType x2( x.load(i1) );
971  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
972  }
973 
974  for( ; i<ipos; i+=IT::size ) {
975  const IntrinsicType x1( x.load(i) );
976  y[j] += sum( x1 * A.load(i,j) );
977  }
978 
979  for( ; remainder && i<iend; ++i ) {
980  y[j] += x[i] * A(i,j);
981  }
982  }
983  }
985  //**********************************************************************************************
986 
987  //**BLAS-based assignment to dense vectors (default)********************************************
1001  template< typename VT1 // Type of the left-hand side target vector
1002  , typename VT2 // Type of the left-hand side vector operand
1003  , typename MT1 > // Type of the right-hand side matrix operand
1004  static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
1005  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1006  {
1007  selectLargeAssignKernel( y, x, A );
1008  }
1010  //**********************************************************************************************
1011 
1012  //**BLAS-based assignment to dense vectors******************************************************
#if BLAZE_BLAS_MODE

/*!\brief BLAS-based assignment of a transpose dense vector-transpose dense matrix multiplication.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// This overload is selected when UseBlasKernel is set for the three types. For non-triangular
// matrices gemv() is called with the scalar factors ET(1) and ET(0). For triangular matrices
// \a x is first assigned to \a y and trmv() is then applied to \a y, passing CblasLower or
// CblasUpper depending on IsLower.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1 >  // Type of the right-hand side matrix operand
static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   typedef typename VT1::ElementType ET;

   // General matrices: a single gemv call
   if( !IsTriangular<MT1>::value ) {
      gemv( y, x, A, ET(1), ET(0) );
      return;
   }

   // Triangular matrices: copy the operand into the target, then multiply in place
   assign( y, x );
   trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
}
#endif
1045  //**********************************************************************************************
1046 
1047  //**Assignment to sparse vectors****************************************************************
/*!\brief Assignment of the multiplication expression to a sparse (transpose) vector.
//
// \param lhs The target left-hand side sparse vector.
// \param rhs The right-hand side multiplication expression to be assigned.
// \return void
//
// The expression is not evaluated directly into the sparse target. Instead \a rhs is wrapped
// in serial(), evaluated into a temporary of type ResultType, and that temporary is then
// assigned to the target obtained via ~lhs. The sizes of both sides are checked by an
// internal assertion only.
*/
1060  template< typename VT1 > // Type of the target sparse vector
1061  friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1062  {
1064 
1068 
1069  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1070 
      // Evaluate into a temporary first, then hand the temporary to the sparse assignment
1071  const ResultType tmp( serial( rhs ) );
1072  assign( ~lhs, tmp );
1073  }
1075  //**********************************************************************************************
1076 
1077  //**Addition assignment to dense vectors********************************************************
/*!\brief Addition assignment of the multiplication expression to a dense (transpose) vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression to be added.
// \return void
//
// If the matrix operand has zero rows or zero columns the function returns immediately and
// \a lhs is left untouched. Otherwise both operands are wrapped in serial() and bound to
// objects of type LT and RT, their dimensions are checked by internal assertions, and the
// work is delegated to selectAddAssignKernel().
*/
1090  template< typename VT1 > // Type of the target dense vector
1091  friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1092  {
1094 
1095  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1096 
      // Nothing to add for an empty matrix operand
1097  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1098  return;
1099  }
1100 
1101  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1102  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1103 
1104  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1105  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1106  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1107  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1108 
1109  TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1110  }
1112  //**********************************************************************************************
1113 
1114  //**Addition assignment to dense vectors (kernel selection)*************************************
/*!\brief Kernel selection for the addition assignment (\a y += \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// selectSmallAddAssignKernel() is used if at least one of the following holds:
//  - IsDiagonal<MT1> is true,
//  - the class-level matrix type MT is a computation (IsComputation<MT>) and evaluateMatrix
//    is false (note that this test uses MT, not the kernel parameter type MT1),
//  - the element count A.rows() * A.columns() is below TDVECTDMATMULT_THRESHOLD.
// In all other cases selectBlasAddAssignKernel() is called.
*/
1125  template< typename VT1 // Type of the left-hand side target vector
1126  , typename VT2 // Type of the left-hand side vector operand
1127  , typename MT1 > // Type of the right-hand side matrix operand
1128  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1129  {
1130  if( ( IsDiagonal<MT1>::value ) ||
1131  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1132  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1133  selectSmallAddAssignKernel( y, x, A );
1134  else
1135  selectBlasAddAssignKernel( y, x, A );
1136  }
1138  //**********************************************************************************************
1139 
1140  //**Default addition assignment to dense vectors************************************************
/*!\brief Default (non-vectorized) addition assignment kernel (\a y += \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// The kernel contains no loops of its own: it forms the expression x * A from the two
// operands and passes it to the addAssign() member function of the target vector.
*/
1154  template< typename VT1 // Type of the left-hand side target vector
1155  , typename VT2 // Type of the left-hand side vector operand
1156  , typename MT1 > // Type of the right-hand side matrix operand
1157  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1158  {
1159  y.addAssign( x * A );
1160  }
1162  //**********************************************************************************************
1163 
1164  //**Default addition assignment to dense vectors (small matrices)*******************************
/*!\brief Non-vectorized addition assignment kernel for small matrices (\a y += \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload takes part in overload resolution only if
// UseVectorizedDefaultKernel<VT1,VT2,MT1> does not hold (DisableIf). It forwards all three
// arguments unchanged to selectDefaultAddAssignKernel().
*/
1178  template< typename VT1 // Type of the left-hand side target vector
1179  , typename VT2 // Type of the left-hand side vector operand
1180  , typename MT1 > // Type of the right-hand side matrix operand
1181  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1182  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1183  {
1184  selectDefaultAddAssignKernel( y, x, A );
1185  }
1187  //**********************************************************************************************
1188 
1189  //**Vectorized default addition assignment to dense vectors (small matrices)********************
/*!\brief Vectorized addition assignment kernel for small matrices (\a y += \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload is enabled only if UseVectorizedDefaultKernel<VT1,VT2,MT1> holds (EnableIf).
// The columns of \a A are traversed in groups of 8, 4, 3 and 2 columns, followed by at most
// one single column. For every group
//  - the row range [ibegin,iend) is narrowed by means of the IsLower/IsStrictlyLower and
//    IsUpper/IsStrictlyUpper traits of MT1 (full range [0,M) otherwise),
//  - one IntrinsicType accumulator per column gathers the products of x.load(i) and
//    A.load(i,column) over the rows below ipos in steps of IT::size,
//  - each accumulator is reduced with sum() and added to the matching element of \a y,
//  - if \a x or \a A is not padded (remainder), the rows left up to iend are added
//    element by element.
// NOTE(review): the masking with size_t(-IT::size) rounds down to a multiple of IT::size only
// if IT::size is a power of two; the "Invalid end calculation" assertion checks the modulo
// form for ipos.
// NOTE(review): the accumulators are declared without an initializer; this presumably relies
// on IntrinsicType being zero after default construction - confirm in the intrinsics headers.
*/
1204  template< typename VT1 // Type of the left-hand side target vector
1205  , typename VT2 // Type of the left-hand side vector operand
1206  , typename MT1 > // Type of the right-hand side matrix operand
1207  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1208  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1209  {
1210  typedef IntrinsicTrait<ElementType> IT;
1211 
1212  const size_t M( A.rows() );
1213  const size_t N( A.columns() );
1214 
      // The element-wise clean-up loops are active unless both operands are padded
1215  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1216 
1217  size_t j( 0UL );
1218 
      // Groups of eight columns
1219  for( ; (j+8UL) <= N; j+=8UL )
1220  {
      // First row of the group: 0 unless MT1 is lower; then j (or j+1 if strictly lower), masked
1221  const size_t ibegin( ( IsLower<MT1>::value )
1222  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1223  :( 0UL ) );
      // One past the last row of the group: M unless MT1 is upper
1224  const size_t iend( ( IsUpper<MT1>::value )
1225  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1226  :( M ) );
1227  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1228 
      // End of the vectorized row loop: iend itself if no remainder handling is required
1229  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1230  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1231 
      // One accumulator per column of the group
1232  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1233  size_t i( ibegin );
1234 
      // The loaded part of x is reused for all eight columns
1235  for( ; i<ipos; i+=IT::size ) {
1236  const IntrinsicType x1( x.load(i) );
1237  xmm1 = xmm1 + x1 * A.load(i,j );
1238  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1239  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1240  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1241  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1242  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1243  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1244  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
1245  }
1246 
      // Reduce every accumulator and add the result to y
1247  y[j ] += sum( xmm1 );
1248  y[j+1UL] += sum( xmm2 );
1249  y[j+2UL] += sum( xmm3 );
1250  y[j+3UL] += sum( xmm4 );
1251  y[j+4UL] += sum( xmm5 );
1252  y[j+5UL] += sum( xmm6 );
1253  y[j+6UL] += sum( xmm7 );
1254  y[j+7UL] += sum( xmm8 );
1255 
      // Element-wise treatment of the rows not covered by the vectorized loop
1256  for( ; remainder && i<iend; ++i ) {
1257  y[j ] += x[i] * A(i,j );
1258  y[j+1UL] += x[i] * A(i,j+1UL);
1259  y[j+2UL] += x[i] * A(i,j+2UL);
1260  y[j+3UL] += x[i] * A(i,j+3UL);
1261  y[j+4UL] += x[i] * A(i,j+4UL);
1262  y[j+5UL] += x[i] * A(i,j+5UL);
1263  y[j+6UL] += x[i] * A(i,j+6UL);
1264  y[j+7UL] += x[i] * A(i,j+7UL);
1265  }
1266  }
1267 
      // Groups of four columns (same scheme as above)
1268  for( ; (j+4UL) <= N; j+=4UL )
1269  {
1270  const size_t ibegin( ( IsLower<MT1>::value )
1271  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1272  :( 0UL ) );
1273  const size_t iend( ( IsUpper<MT1>::value )
1274  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1275  :( M ) );
1276  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1277 
1278  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1279  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1280 
1281  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1282  size_t i( ibegin );
1283 
1284  for( ; i<ipos; i+=IT::size ) {
1285  const IntrinsicType x1( x.load(i) );
1286  xmm1 = xmm1 + x1 * A.load(i,j );
1287  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1288  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1289  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1290  }
1291 
1292  y[j ] += sum( xmm1 );
1293  y[j+1UL] += sum( xmm2 );
1294  y[j+2UL] += sum( xmm3 );
1295  y[j+3UL] += sum( xmm4 );
1296 
1297  for( ; remainder && i<iend; ++i ) {
1298  y[j ] += x[i] * A(i,j );
1299  y[j+1UL] += x[i] * A(i,j+1UL);
1300  y[j+2UL] += x[i] * A(i,j+2UL);
1301  y[j+3UL] += x[i] * A(i,j+3UL);
1302  }
1303  }
1304 
      // Groups of three columns
1305  for( ; (j+3UL) <= N; j+=3UL )
1306  {
1307  const size_t ibegin( ( IsLower<MT1>::value )
1308  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1309  :( 0UL ) );
1310  const size_t iend( ( IsUpper<MT1>::value )
1311  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1312  :( M ) );
1313  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1314 
1315  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1316  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1317 
1318  IntrinsicType xmm1, xmm2, xmm3;
1319  size_t i( ibegin );
1320 
1321  for( ; i<ipos; i+=IT::size ) {
1322  const IntrinsicType x1( x.load(i) );
1323  xmm1 = xmm1 + x1 * A.load(i,j );
1324  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1325  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1326  }
1327 
1328  y[j ] += sum( xmm1 );
1329  y[j+1UL] += sum( xmm2 );
1330  y[j+2UL] += sum( xmm3 );
1331 
1332  for( ; remainder && i<iend; ++i ) {
1333  y[j ] += x[i] * A(i,j );
1334  y[j+1UL] += x[i] * A(i,j+1UL);
1335  y[j+2UL] += x[i] * A(i,j+2UL);
1336  }
1337  }
1338 
      // Groups of two columns
1339  for( ; (j+2UL) <= N; j+=2UL )
1340  {
1341  const size_t ibegin( ( IsLower<MT1>::value )
1342  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1343  :( 0UL ) );
1344  const size_t iend( ( IsUpper<MT1>::value )
1345  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1346  :( M ) );
1347  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1348 
1349  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1350  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1351 
1352  IntrinsicType xmm1, xmm2;
1353  size_t i( ibegin );
1354 
1355  for( ; i<ipos; i+=IT::size ) {
1356  const IntrinsicType x1( x.load(i) );
1357  xmm1 = xmm1 + x1 * A.load(i,j );
1358  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1359  }
1360 
1361  y[j ] += sum( xmm1 );
1362  y[j+1UL] += sum( xmm2 );
1363 
1364  for( ; remainder && i<iend; ++i ) {
1365  y[j ] += x[i] * A(i,j );
1366  y[j+1UL] += x[i] * A(i,j+1UL);
1367  }
1368  }
1369 
      // At most one column is left at this point
1370  if( j < N )
1371  {
1372  const size_t ibegin( ( IsLower<MT1>::value )
1373  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1374  :( 0UL ) );
1375  const size_t iend( ( IsUpper<MT1>::value )
1376  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1377  :( M ) );
1378  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1379 
1380  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1381  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1382 
1383  IntrinsicType xmm1;
1384  size_t i( ibegin );
1385 
1386  for( ; i<ipos; i+=IT::size ) {
1387  xmm1 = xmm1 + A.load(i,j) * x.load(i);
1388  }
1389 
1390  y[j] += sum( xmm1 );
1391 
1392  for( ; remainder && i<iend; ++i ) {
1393  y[j] += x[i] * A(i,j);
1394  }
1395  }
1396  }
1398  //**********************************************************************************************
1399 
1400  //**Default addition assignment to dense vectors (large matrices)*******************************
/*!\brief Non-vectorized addition assignment kernel for large matrices (\a y += \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload takes part in overload resolution only if
// UseVectorizedDefaultKernel<VT1,VT2,MT1> does not hold (DisableIf). It forwards all three
// arguments unchanged to selectDefaultAddAssignKernel().
*/
1414  template< typename VT1 // Type of the left-hand side target vector
1415  , typename VT2 // Type of the left-hand side vector operand
1416  , typename MT1 > // Type of the right-hand side matrix operand
1417  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1418  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1419  {
1420  selectDefaultAddAssignKernel( y, x, A );
1421  }
1423  //**********************************************************************************************
1424 
1425  //**Vectorized default addition assignment to dense vectors (large matrices)********************
/*!\brief Vectorized addition assignment kernel for large matrices (\a y += \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload is enabled only if UseVectorizedDefaultKernel<VT1,VT2,MT1> holds (EnableIf).
// The columns of \a A are traversed in groups of 8, 4 and 2 columns, followed by at most one
// single column. For every group
//  - the row range [ibegin,iend) is narrowed by means of the IsLower/IsStrictlyLower and
//    IsUpper/IsStrictlyUpper traits of MT1 (full range [0,M) otherwise),
//  - the rows below ipos are processed by three successive loops that consume four, two and
//    one IT::size-wide part(s) of \a x per iteration; within each iteration the products for
//    a column are combined, reduced with sum() and added directly to the element of \a y,
//  - if \a x or \a A is not padded (remainder), the rows left up to iend are added
//    element by element.
// NOTE(review): the masking with size_t(-IT::size) rounds down to a multiple of IT::size only
// if IT::size is a power of two; the "Invalid end calculation" assertion checks the modulo
// form for ipos.
*/
1440  template< typename VT1 // Type of the left-hand side target vector
1441  , typename VT2 // Type of the left-hand side vector operand
1442  , typename MT1 > // Type of the right-hand side matrix operand
1443  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1444  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1445  {
1446  typedef IntrinsicTrait<ElementType> IT;
1447 
1448  const size_t M( A.rows() );
1449  const size_t N( A.columns() );
1450 
      // The element-wise clean-up loops are active unless both operands are padded
1451  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1452 
1453  size_t j( 0UL );
1454 
      // Groups of eight columns
1455  for( ; (j+8UL) <= N; j+=8UL )
1456  {
      // First row of the group: 0 unless MT1 is lower; then j (or j+1 if strictly lower), masked
1457  const size_t ibegin( ( IsLower<MT1>::value )
1458  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1459  :( 0UL ) );
      // One past the last row of the group: M unless MT1 is upper
1460  const size_t iend( ( IsUpper<MT1>::value )
1461  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1462  :( M ) );
1463  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1464 
      // End of the vectorized row loops: iend itself if no remainder handling is required
1465  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1466  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1467 
1468  size_t i( ibegin );
1469 
      // Four IT::size-wide parts of x per iteration
1470  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
1471  const size_t i1( i+IT::size );
1472  const size_t i2( i+IT::size*2UL );
1473  const size_t i3( i+IT::size*3UL );
1474  const IntrinsicType x1( x.load(i ) );
1475  const IntrinsicType x2( x.load(i1) );
1476  const IntrinsicType x3( x.load(i2) );
1477  const IntrinsicType x4( x.load(i3) );
1478  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1479  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1480  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1481  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1482  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1483  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1484  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1485  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
1486  }
1487 
      // Two IT::size-wide parts of x per iteration
1488  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
1489  const size_t i1( i+IT::size );
1490  const IntrinsicType x1( x.load(i ) );
1491  const IntrinsicType x2( x.load(i1) );
1492  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1493  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1494  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1495  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1496  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1497  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1498  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1499  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
1500  }
1501 
      // One IT::size-wide part of x per iteration
1502  for( ; i<ipos; i+=IT::size ) {
1503  const IntrinsicType x1( x.load(i) );
1504  y[j ] += sum( x1 * A.load(i,j ) );
1505  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1506  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1507  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1508  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
1509  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
1510  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
1511  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
1512  }
1513 
      // Element-wise treatment of the rows not covered by the vectorized loops
1514  for( ; remainder && i<iend; ++i ) {
1515  y[j ] += x[i] * A(i,j );
1516  y[j+1UL] += x[i] * A(i,j+1UL);
1517  y[j+2UL] += x[i] * A(i,j+2UL);
1518  y[j+3UL] += x[i] * A(i,j+3UL);
1519  y[j+4UL] += x[i] * A(i,j+4UL);
1520  y[j+5UL] += x[i] * A(i,j+5UL);
1521  y[j+6UL] += x[i] * A(i,j+6UL);
1522  y[j+7UL] += x[i] * A(i,j+7UL);
1523  }
1524  }
1525 
      // Groups of four columns (same scheme as above)
1526  for( ; (j+4UL) <= N; j+=4UL )
1527  {
1528  const size_t ibegin( ( IsLower<MT1>::value )
1529  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1530  :( 0UL ) );
1531  const size_t iend( ( IsUpper<MT1>::value )
1532  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1533  :( M ) );
1534  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1535 
1536  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1537  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1538 
1539  size_t i( ibegin );
1540 
1541  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
1542  const size_t i1( i+IT::size );
1543  const size_t i2( i+IT::size*2UL );
1544  const size_t i3( i+IT::size*3UL );
1545  const IntrinsicType x1( x.load(i ) );
1546  const IntrinsicType x2( x.load(i1) );
1547  const IntrinsicType x3( x.load(i2) );
1548  const IntrinsicType x4( x.load(i3) );
1549  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1550  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1551  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1552  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1553  }
1554 
1555  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
1556  const size_t i1( i+IT::size );
1557  const IntrinsicType x1( x.load(i ) );
1558  const IntrinsicType x2( x.load(i1) );
1559  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1560  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1561  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1562  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1563  }
1564 
1565  for( ; i<ipos; i+=IT::size ) {
1566  const IntrinsicType x1( x.load(i) );
1567  y[j ] += sum( x1 * A.load(i,j ) );
1568  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1569  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1570  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1571  }
1572 
1573  for( ; remainder && i<iend; ++i ) {
1574  y[j ] += x[i] * A(i,j );
1575  y[j+1UL] += x[i] * A(i,j+1UL);
1576  y[j+2UL] += x[i] * A(i,j+2UL);
1577  y[j+3UL] += x[i] * A(i,j+3UL);
1578  }
1579  }
1580 
      // Groups of two columns
1581  for( ; (j+2UL) <= N; j+=2UL )
1582  {
1583  const size_t ibegin( ( IsLower<MT1>::value )
1584  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1585  :( 0UL ) );
1586  const size_t iend( ( IsUpper<MT1>::value )
1587  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1588  :( M ) );
1589  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1590 
1591  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1592  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1593 
1594  size_t i( ibegin );
1595 
1596  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
1597  const size_t i1( i+IT::size );
1598  const size_t i2( i+IT::size*2UL );
1599  const size_t i3( i+IT::size*3UL );
1600  const IntrinsicType x1( x.load(i ) );
1601  const IntrinsicType x2( x.load(i1) );
1602  const IntrinsicType x3( x.load(i2) );
1603  const IntrinsicType x4( x.load(i3) );
1604  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1605  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1606  }
1607 
1608  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
1609  const size_t i1( i+IT::size );
1610  const IntrinsicType x1( x.load(i ) );
1611  const IntrinsicType x2( x.load(i1) );
1612  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1613  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1614  }
1615 
1616  for( ; i<ipos; i+=IT::size ) {
1617  const IntrinsicType x1( x.load(i) );
1618  y[j ] += sum( x1 * A.load(i,j ) );
1619  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1620  }
1621 
1622  for( ; remainder && i<iend; ++i ) {
1623  y[j ] += x[i] * A(i,j );
1624  y[j+1UL] += x[i] * A(i,j+1UL);
1625  }
1626  }
1627 
      // At most one column is left at this point
1628  if( j < N )
1629  {
1630  const size_t ibegin( ( IsLower<MT1>::value )
1631  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1632  :( 0UL ) );
1633  const size_t iend( ( IsUpper<MT1>::value )
1634  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1635  :( M ) );
1636  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1637 
1638  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1639  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1640 
1641  size_t i( ibegin );
1642 
1643  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
1644  const size_t i1( i+IT::size );
1645  const size_t i2( i+IT::size*2UL );
1646  const size_t i3( i+IT::size*3UL );
1647  const IntrinsicType x1( x.load(i ) );
1648  const IntrinsicType x2( x.load(i1) );
1649  const IntrinsicType x3( x.load(i2) );
1650  const IntrinsicType x4( x.load(i3) );
1651  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1652  }
1653 
1654  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
1655  const size_t i1( i+IT::size );
1656  const IntrinsicType x1( x.load(i ) );
1657  const IntrinsicType x2( x.load(i1) );
1658  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1659  }
1660 
1661  for( ; i<ipos; i+=IT::size ) {
1662  const IntrinsicType x1( x.load(i) );
1663  y[j] += sum( x1 * A.load(i,j) );
1664  }
1665 
1666  for( ; remainder && i<iend; ++i ) {
1667  y[j] += x[i] * A(i,j);
1668  }
1669  }
1670  }
1672  //**********************************************************************************************
1673 
1674  //**BLAS-based addition assignment to dense vectors (default)***********************************
/*!\brief Fallback of the BLAS-based addition assignment kernel (\a y += \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload takes part in overload resolution only if UseBlasKernel<VT1,VT2,MT1> does not
// hold (DisableIf). It issues no BLAS call itself and forwards all three arguments unchanged
// to selectLargeAddAssignKernel().
*/
1688  template< typename VT1 // Type of the left-hand side target vector
1689  , typename VT2 // Type of the left-hand side vector operand
1690  , typename MT1 > // Type of the right-hand side matrix operand
1691  static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
1692  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1693  {
1694  selectLargeAddAssignKernel( y, x, A );
1695  }
1697  //**********************************************************************************************
1698 
1699  //**BLAS-based addition assignment to dense vectors*********************************************
1700 #if BLAZE_BLAS_MODE
1701 
/*!\brief BLAS-based addition assignment kernel (\a y += \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload is only compiled if BLAZE_BLAS_MODE is set and is only enabled if
// UseBlasKernel<VT1,VT2,MT1> holds (EnableIf). Two paths exist:
//  - IsTriangular<MT1>: a temporary of type VT1::ResultType is created from serial( x ),
//    trmv() is called on that temporary and \a A with CblasLower or CblasUpper (chosen by
//    IsLower<MT1>), and the temporary is then passed to addAssign() together with \a y.
//    The target itself is therefore never handed to trmv().
//  - otherwise: gemv() is called with the scalar arguments ET(1) and ET(1).
// NOTE(review): the exact meaning of the two gemv() scalars (presumably the scaling of the
// product and of the previous content of \a y) is defined in blaze/math/blas/gemv.h - confirm.
*/
1714  template< typename VT1 // Type of the left-hand side target vector
1715  , typename VT2 // Type of the left-hand side vector operand
1716  , typename MT1 > // Type of the right-hand side matrix operand
1717  static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
1718  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1719  {
1720  typedef typename VT1::ElementType ET;
1721 
      // Triangular matrices: run trmv() on a copy of x and add the copy to y afterwards
1722  if( IsTriangular<MT1>::value ) {
1723  typename VT1::ResultType tmp( serial( x ) );
1724  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1725  addAssign( y, tmp );
1726  }
      // All other matrices: general matrix/vector BLAS routine
1727  else {
1728  gemv( y, x, A, ET(1), ET(1) );
1729  }
1730  }
1732 #endif
1733  //**********************************************************************************************
1734 
1735  //**Addition assignment to sparse vectors*******************************************************
1736  // No special implementation for the addition assignment to sparse vectors.
1737  //**********************************************************************************************
1738 
1739  //**Subtraction assignment to dense vectors*****************************************************
/*!\brief Subtraction assignment of the multiplication expression to a dense (transpose) vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression to be subtracted.
// \return void
//
// If the matrix operand has zero rows or zero columns the function returns immediately and
// \a lhs is left untouched. Otherwise both operands are wrapped in serial() and bound to
// objects of type LT and RT, their dimensions are checked by internal assertions, and the
// work is delegated to selectSubAssignKernel().
*/
1752  template< typename VT1 > // Type of the target dense vector
1753  friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1754  {
1756 
1757  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1758 
      // Nothing to subtract for an empty matrix operand
1759  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1760  return;
1761  }
1762 
1763  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1764  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1765 
1766  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1767  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1768  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1769  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1770 
1771  TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1772  }
1774  //**********************************************************************************************
1775 
1776  //**Subtraction assignment to dense vectors (kernel selection)**********************************
/*!\brief Kernel selection for the subtraction assignment (\a y -= \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// selectSmallSubAssignKernel() is used if at least one of the following holds:
//  - IsDiagonal<MT1> is true,
//  - the class-level matrix type MT is a computation (IsComputation<MT>) and evaluateMatrix
//    is false (note that this test uses MT, not the kernel parameter type MT1),
//  - the element count A.rows() * A.columns() is below TDVECTDMATMULT_THRESHOLD.
// In all other cases selectBlasSubAssignKernel() is called.
*/
1787  template< typename VT1 // Type of the left-hand side target vector
1788  , typename VT2 // Type of the left-hand side vector operand
1789  , typename MT1 > // Type of the right-hand side matrix operand
1790  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1791  {
1792  if( ( IsDiagonal<MT1>::value ) ||
1793  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1794  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1795  selectSmallSubAssignKernel( y, x, A );
1796  else
1797  selectBlasSubAssignKernel( y, x, A );
1798  }
1800  //**********************************************************************************************
1801 
1802  //**Default subtraction assignment to dense vectors*********************************************
/*!\brief Default (non-vectorized) subtraction assignment kernel (\a y -= \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// The kernel contains no loops of its own: it forms the expression x * A from the two
// operands and passes it to the subAssign() member function of the target vector.
*/
1816  template< typename VT1 // Type of the left-hand side target vector
1817  , typename VT2 // Type of the left-hand side vector operand
1818  , typename MT1 > // Type of the right-hand side matrix operand
1819  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1820  {
1821  y.subAssign( x * A );
1822  }
1824  //**********************************************************************************************
1825 
1826  //**Default subtraction assignment to dense vectors (small matrices)****************************
/*!\brief Non-vectorized subtraction assignment kernel for small matrices (\a y -= \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload takes part in overload resolution only if
// UseVectorizedDefaultKernel<VT1,VT2,MT1> does not hold (DisableIf). It forwards all three
// arguments unchanged to selectDefaultSubAssignKernel().
*/
1840  template< typename VT1 // Type of the left-hand side target vector
1841  , typename VT2 // Type of the left-hand side vector operand
1842  , typename MT1 > // Type of the right-hand side matrix operand
1843  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1844  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1845  {
1846  selectDefaultSubAssignKernel( y, x, A );
1847  }
1849  //**********************************************************************************************
1850 
1851  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
/*!\brief Vectorized subtraction assignment kernel for small matrices (\a y -= \a x * \a A).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload is enabled only if UseVectorizedDefaultKernel<VT1,VT2,MT1> holds (EnableIf).
// The columns of \a A are traversed in groups of 8, 4, 3 and 2 columns, followed by at most
// one single column. For every group
//  - the row range [ibegin,iend) is narrowed by means of the IsLower/IsStrictlyLower and
//    IsUpper/IsStrictlyUpper traits of MT1 (full range [0,M) otherwise),
//  - one IntrinsicType accumulator per column gathers the products of x.load(i) and
//    A.load(i,column) over the rows below ipos in steps of IT::size,
//  - each accumulator is reduced with sum() and subtracted from the matching element of \a y,
//  - if \a x or \a A is not padded (remainder), the rows left up to iend are subtracted
//    element by element.
// NOTE(review): the masking with size_t(-IT::size) rounds down to a multiple of IT::size only
// if IT::size is a power of two; the "Invalid end calculation" assertion checks the modulo
// form for ipos.
// NOTE(review): the accumulators are declared without an initializer; this presumably relies
// on IntrinsicType being zero after default construction - confirm in the intrinsics headers.
*/
1866  template< typename VT1 // Type of the left-hand side target vector
1867  , typename VT2 // Type of the left-hand side vector operand
1868  , typename MT1 > // Type of the right-hand side matrix operand
1869  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1870  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1871  {
1872  typedef IntrinsicTrait<ElementType> IT;
1873 
1874  const size_t M( A.rows() );
1875  const size_t N( A.columns() );
1876 
      // The element-wise clean-up loops are active unless both operands are padded
1877  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1878 
1879  size_t j( 0UL );
1880 
      // Groups of eight columns
1881  for( ; (j+8UL) <= N; j+=8UL )
1882  {
      // First row of the group: 0 unless MT1 is lower; then j (or j+1 if strictly lower), masked
1883  const size_t ibegin( ( IsLower<MT1>::value )
1884  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1885  :( 0UL ) );
      // One past the last row of the group: M unless MT1 is upper
1886  const size_t iend( ( IsUpper<MT1>::value )
1887  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1888  :( M ) );
1889  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1890 
      // End of the vectorized row loop: iend itself if no remainder handling is required
1891  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1892  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1893 
      // One accumulator per column of the group
1894  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1895  size_t i( ibegin );
1896 
      // The loaded part of x is reused for all eight columns
1897  for( ; i<ipos; i+=IT::size ) {
1898  const IntrinsicType x1( x.load(i) );
1899  xmm1 = xmm1 + x1 * A.load(i,j );
1900  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1901  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1902  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1903  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1904  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1905  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1906  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
1907  }
1908 
      // Reduce every accumulator and subtract the result from y
1909  y[j ] -= sum( xmm1 );
1910  y[j+1UL] -= sum( xmm2 );
1911  y[j+2UL] -= sum( xmm3 );
1912  y[j+3UL] -= sum( xmm4 );
1913  y[j+4UL] -= sum( xmm5 );
1914  y[j+5UL] -= sum( xmm6 );
1915  y[j+6UL] -= sum( xmm7 );
1916  y[j+7UL] -= sum( xmm8 );
1917 
      // Element-wise treatment of the rows not covered by the vectorized loop
1918  for( ; remainder && i<iend; ++i ) {
1919  y[j ] -= x[i] * A(i,j );
1920  y[j+1UL] -= x[i] * A(i,j+1UL);
1921  y[j+2UL] -= x[i] * A(i,j+2UL);
1922  y[j+3UL] -= x[i] * A(i,j+3UL);
1923  y[j+4UL] -= x[i] * A(i,j+4UL);
1924  y[j+5UL] -= x[i] * A(i,j+5UL);
1925  y[j+6UL] -= x[i] * A(i,j+6UL);
1926  y[j+7UL] -= x[i] * A(i,j+7UL);
1927  }
1928  }
1929 
      // Groups of four columns (same scheme as above)
1930  for( ; (j+4UL) <= N; j+=4UL )
1931  {
1932  const size_t ibegin( ( IsLower<MT1>::value )
1933  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1934  :( 0UL ) );
1935  const size_t iend( ( IsUpper<MT1>::value )
1936  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1937  :( M ) );
1938  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1939 
1940  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1941  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1942 
1943  IntrinsicType xmm1, xmm2, xmm3, xmm4;
1944  size_t i( ibegin );
1945 
1946  for( ; i<ipos; i+=IT::size ) {
1947  const IntrinsicType x1( x.load(i) );
1948  xmm1 = xmm1 + x1 * A.load(i,j );
1949  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1950  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1951  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1952  }
1953 
1954  y[j ] -= sum( xmm1 );
1955  y[j+1UL] -= sum( xmm2 );
1956  y[j+2UL] -= sum( xmm3 );
1957  y[j+3UL] -= sum( xmm4 );
1958 
1959  for( ; remainder && i<iend; ++i ) {
1960  y[j ] -= x[i] * A(i,j );
1961  y[j+1UL] -= x[i] * A(i,j+1UL);
1962  y[j+2UL] -= x[i] * A(i,j+2UL);
1963  y[j+3UL] -= x[i] * A(i,j+3UL);
1964  }
1965  }
1966 
      // Groups of three columns
1967  for( ; (j+3UL) <= N; j+=3UL )
1968  {
1969  const size_t ibegin( ( IsLower<MT1>::value )
1970  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
1971  :( 0UL ) );
1972  const size_t iend( ( IsUpper<MT1>::value )
1973  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1974  :( M ) );
1975  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1976 
1977  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
1978  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
1979 
1980  IntrinsicType xmm1, xmm2, xmm3;
1981  size_t i( ibegin );
1982 
1983  for( ; i<ipos; i+=IT::size ) {
1984  const IntrinsicType x1( x.load(i) );
1985  xmm1 = xmm1 + x1 * A.load(i,j );
1986  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1987  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1988  }
1989 
1990  y[j ] -= sum( xmm1 );
1991  y[j+1UL] -= sum( xmm2 );
1992  y[j+2UL] -= sum( xmm3 );
1993 
1994  for( ; remainder && i<iend; ++i ) {
1995  y[j ] -= x[i] * A(i,j );
1996  y[j+1UL] -= x[i] * A(i,j+1UL);
1997  y[j+2UL] -= x[i] * A(i,j+2UL);
1998  }
1999  }
2000 
      // Groups of two columns
2001  for( ; (j+2UL) <= N; j+=2UL )
2002  {
2003  const size_t ibegin( ( IsLower<MT1>::value )
2004  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
2005  :( 0UL ) );
2006  const size_t iend( ( IsUpper<MT1>::value )
2007  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2008  :( M ) );
2009  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2010 
2011  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
2012  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
2013 
2014  IntrinsicType xmm1, xmm2;
2015  size_t i( ibegin );
2016 
2017  for( ; i<ipos; i+=IT::size ) {
2018  const IntrinsicType x1( x.load(i) );
2019  xmm1 = xmm1 + x1 * A.load(i,j );
2020  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2021  }
2022 
2023  y[j ] -= sum( xmm1 );
2024  y[j+1UL] -= sum( xmm2 );
2025 
2026  for( ; remainder && i<iend; ++i ) {
2027  y[j ] -= x[i] * A(i,j );
2028  y[j+1UL] -= x[i] * A(i,j+1UL);
2029  }
2030  }
2031 
      // At most one column is left at this point
2032  if( j < N )
2033  {
2034  const size_t ibegin( ( IsLower<MT1>::value )
2035  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
2036  :( 0UL ) );
2037  const size_t iend( ( IsUpper<MT1>::value )
2038  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2039  :( M ) );
2040  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2041 
2042  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
2043  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
2044 
2045  IntrinsicType xmm1;
2046  size_t i( ibegin );
2047 
2048  for( ; i<ipos; i+=IT::size ) {
2049  xmm1 = xmm1 + A.load(i,j) * x.load(i);
2050  }
2051 
2052  y[j] -= sum( xmm1 );
2053 
2054  for( ; remainder && i<iend; ++i ) {
2055  y[j] -= x[i] * A(i,j);
2056  }
2057  }
2058  }
2060  //**********************************************************************************************
2061 
2062  //**Default subtraction assignment to dense vectors (large matrices)****************************
2076  template< typename VT1 // Type of the left-hand side target vector
2077  , typename VT2 // Type of the left-hand side vector operand
2078  , typename MT1 > // Type of the right-hand side matrix operand
2079  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
2080  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2081  {
2082  selectDefaultSubAssignKernel( y, x, A );
2083  }
2085  //**********************************************************************************************
2086 
2087  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
/*!\brief Vectorized default subtraction assignment of a large transpose dense vector-transpose
//        dense matrix multiplication (\f$ \vec{y}^T-=\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// This overload is enabled if UseVectorizedDefaultKernel holds for the given types. The columns
// of A are processed in blocks of eight, four and two columns, followed by a single remaining
// column. Within each column block the rows are traversed in steps of four, two and one
// intrinsic vector(s) of IT::size elements; any remaining rows are handled element by element.
// Every vectorized step is reduced via sum() and subtracted directly from the affected
// elements of y.
*/
2102  template< typename VT1 // Type of the left-hand side target vector
2103  , typename VT2 // Type of the left-hand side vector operand
2104  , typename MT1 > // Type of the right-hand side matrix operand
2105  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
2106  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2107  {
2108  typedef IntrinsicTrait<ElementType> IT;
2109 
2110  const size_t M( A.rows() );
2111  const size_t N( A.columns() );
2112 
// The element-wise remainder loops are only executed if x or A is not padded.
2113  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
2114 
2115  size_t j( 0UL );
2116 
// Column blocks of width 8
2117  for( ; (j+8UL) <= N; j+=8UL )
2118  {
// Lower matrices: start at the (strictly) lower part of column j; the bit mask rounds the
// start index down to a multiple of IT::size.
// NOTE(review): the mask presumably relies on IT::size being a power of two -- confirm.
2119  const size_t ibegin( ( IsLower<MT1>::value )
2120  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
2121  :( 0UL ) );
// Upper matrices: stop at the last row that is covered by the columns of this block.
2122  const size_t iend( ( IsUpper<MT1>::value )
2123  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
2124  :( M ) );
2125  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2126 
// End of the vectorized range: iend rounded down to a multiple of IT::size if a remainder
// loop is required, iend itself otherwise.
2127  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
2128  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
2129 
2130  size_t i( ibegin );
2131 
// Four intrinsic vectors of x per iteration
2132  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
2133  const size_t i1( i+IT::size );
2134  const size_t i2( i+IT::size*2UL );
2135  const size_t i3( i+IT::size*3UL );
2136  const IntrinsicType x1( x.load(i ) );
2137  const IntrinsicType x2( x.load(i1) );
2138  const IntrinsicType x3( x.load(i2) );
2139  const IntrinsicType x4( x.load(i3) );
2140  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2141  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2142  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2143  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2144  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2145  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2146  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2147  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
2148  }
2149 
// Two intrinsic vectors of x per iteration
2150  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
2151  const size_t i1( i+IT::size );
2152  const IntrinsicType x1( x.load(i ) );
2153  const IntrinsicType x2( x.load(i1) );
2154  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2155  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2156  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2157  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2158  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2159  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2160  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2161  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
2162  }
2163 
// One intrinsic vector of x per iteration
2164  for( ; i<ipos; i+=IT::size ) {
2165  const IntrinsicType x1( x.load(i) );
2166  y[j ] -= sum( x1 * A.load(i,j ) );
2167  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2168  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2169  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2170  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) );
2171  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) );
2172  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) );
2173  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) );
2174  }
2175 
// Element-wise handling of the rows in the range [ipos,iend)
2176  for( ; remainder && i<iend; ++i ) {
2177  y[j ] -= x[i] * A(i,j );
2178  y[j+1UL] -= x[i] * A(i,j+1UL);
2179  y[j+2UL] -= x[i] * A(i,j+2UL);
2180  y[j+3UL] -= x[i] * A(i,j+3UL);
2181  y[j+4UL] -= x[i] * A(i,j+4UL);
2182  y[j+5UL] -= x[i] * A(i,j+5UL);
2183  y[j+6UL] -= x[i] * A(i,j+6UL);
2184  y[j+7UL] -= x[i] * A(i,j+7UL);
2185  }
2186  }
2187 
// Column blocks of width 4 (same row traversal scheme as for the blocks of width 8)
2188  for( ; (j+4UL) <= N; j+=4UL )
2189  {
2190  const size_t ibegin( ( IsLower<MT1>::value )
2191  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
2192  :( 0UL ) );
2193  const size_t iend( ( IsUpper<MT1>::value )
2194  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
2195  :( M ) );
2196  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2197 
2198  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
2199  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
2200 
2201  size_t i( ibegin );
2202 
2203  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
2204  const size_t i1( i+IT::size );
2205  const size_t i2( i+IT::size*2UL );
2206  const size_t i3( i+IT::size*3UL );
2207  const IntrinsicType x1( x.load(i ) );
2208  const IntrinsicType x2( x.load(i1) );
2209  const IntrinsicType x3( x.load(i2) );
2210  const IntrinsicType x4( x.load(i3) );
2211  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2212  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2213  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2214  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2215  }
2216 
2217  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
2218  const size_t i1( i+IT::size );
2219  const IntrinsicType x1( x.load(i ) );
2220  const IntrinsicType x2( x.load(i1) );
2221  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2222  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2223  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2224  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2225  }
2226 
2227  for( ; i<ipos; i+=IT::size ) {
2228  const IntrinsicType x1( x.load(i) );
2229  y[j ] -= sum( x1 * A.load(i,j ) );
2230  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2231  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2232  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2233  }
2234 
2235  for( ; remainder && i<iend; ++i ) {
2236  y[j ] -= x[i] * A(i,j );
2237  y[j+1UL] -= x[i] * A(i,j+1UL);
2238  y[j+2UL] -= x[i] * A(i,j+2UL);
2239  y[j+3UL] -= x[i] * A(i,j+3UL);
2240  }
2241  }
2242 
// Column blocks of width 2
2243  for( ; (j+2UL) <= N; j+=2UL )
2244  {
2245  const size_t ibegin( ( IsLower<MT1>::value )
2246  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
2247  :( 0UL ) );
2248  const size_t iend( ( IsUpper<MT1>::value )
2249  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2250  :( M ) );
2251  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2252 
2253  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
2254  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
2255 
2256  size_t i( ibegin );
2257 
2258  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
2259  const size_t i1( i+IT::size );
2260  const size_t i2( i+IT::size*2UL );
2261  const size_t i3( i+IT::size*3UL );
2262  const IntrinsicType x1( x.load(i ) );
2263  const IntrinsicType x2( x.load(i1) );
2264  const IntrinsicType x3( x.load(i2) );
2265  const IntrinsicType x4( x.load(i3) );
2266  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2267  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2268  }
2269 
2270  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
2271  const size_t i1( i+IT::size );
2272  const IntrinsicType x1( x.load(i ) );
2273  const IntrinsicType x2( x.load(i1) );
2274  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2275  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2276  }
2277 
2278  for( ; i<ipos; i+=IT::size ) {
2279  const IntrinsicType x1( x.load(i) );
2280  y[j ] -= sum( x1 * A.load(i,j ) );
2281  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2282  }
2283 
2284  for( ; remainder && i<iend; ++i ) {
2285  y[j ] -= x[i] * A(i,j );
2286  y[j+1UL] -= x[i] * A(i,j+1UL);
2287  }
2288  }
2289 
// At most one column is left after the blocks of width 2
2290  if( j < N )
2291  {
2292  const size_t ibegin( ( IsLower<MT1>::value )
2293  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
2294  :( 0UL ) );
2295  const size_t iend( ( IsUpper<MT1>::value )
2296  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2297  :( M ) );
2298  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2299 
2300  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
2301  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
2302 
2303  size_t i( ibegin );
2304 
2305  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
2306  const size_t i1( i+IT::size );
2307  const size_t i2( i+IT::size*2UL );
2308  const size_t i3( i+IT::size*3UL );
2309  const IntrinsicType x1( x.load(i ) );
2310  const IntrinsicType x2( x.load(i1) );
2311  const IntrinsicType x3( x.load(i2) );
2312  const IntrinsicType x4( x.load(i3) );
2313  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
2314  }
2315 
2316  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
2317  const size_t i1( i+IT::size );
2318  const IntrinsicType x1( x.load(i ) );
2319  const IntrinsicType x2( x.load(i1) );
2320  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
2321  }
2322 
2323  for( ; i<ipos; i+=IT::size ) {
2324  const IntrinsicType x1( x.load(i) );
2325  y[j] -= sum( x1 * A.load(i,j) );
2326  }
2327 
2328  for( ; remainder && i<iend; ++i ) {
2329  y[j] -= x[i] * A(i,j);
2330  }
2331  }
2332  }
2334  //**********************************************************************************************
2335 
2336  //**BLAS-based subtraction assignment to dense vectors (default)********************************
2350  template< typename VT1 // Type of the left-hand side target vector
2351  , typename VT2 // Type of the left-hand side vector operand
2352  , typename MT1 > // Type of the right-hand side matrix operand
2353  static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
2354  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2355  {
2356  selectLargeSubAssignKernel( y, x, A );
2357  }
2359  //**********************************************************************************************
2360 
2361  //**BLAS-based subtraction assignment to dense vectors******************************************
2362 #if BLAZE_BLAS_MODE
2363 
2376  template< typename VT1 // Type of the left-hand side target vector
2377  , typename VT2 // Type of the left-hand side vector operand
2378  , typename MT1 > // Type of the right-hand side matrix operand
2379  static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
2380  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2381  {
2382  typedef typename VT1::ElementType ET;
2383 
2384  if( IsTriangular<MT1>::value ) {
2385  typename VT1::ResultType tmp( serial( x ) );
2386  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2387  subAssign( y, tmp );
2388  }
2389  else {
2390  gemv( y, x, A, ET(-1), ET(1) );
2391  }
2392  }
2394 #endif
2395  //**********************************************************************************************
2396 
2397  //**Subtraction assignment to sparse vectors****************************************************
2398  // No special implementation for the subtraction assignment to sparse vectors.
2399  //**********************************************************************************************
2400 
2401  //**Multiplication assignment to dense vectors**************************************************
2414  template< typename VT1 > // Type of the target dense vector
2415  friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2416  {
2418 
2422 
2423  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2424 
2425  const ResultType tmp( serial( rhs ) );
2426  multAssign( ~lhs, tmp );
2427  }
2429  //**********************************************************************************************
2430 
2431  //**Multiplication assignment to sparse vectors*************************************************
2432  // No special implementation for the multiplication assignment to sparse vectors.
2433  //**********************************************************************************************
2434 
2435  //**SMP assignment to dense vectors*************************************************************
2450  template< typename VT1 > // Type of the target dense vector
2451  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2452  smpAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2453  {
2455 
2456  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2457 
2458  if( rhs.mat_.rows() == 0UL ) {
2459  reset( ~lhs );
2460  return;
2461  }
2462  else if( rhs.mat_.columns() == 0UL ) {
2463  return;
2464  }
2465 
2466  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2467  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2468 
2469  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2470  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2471  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2472  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2473 
2474  smpAssign( ~lhs, x * A );
2475  }
2477  //**********************************************************************************************
2478 
2479  //**SMP assignment to sparse vectors************************************************************
2494  template< typename VT1 > // Type of the target sparse vector
2495  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2496  smpAssign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2497  {
2499 
2503 
2504  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2505 
2506  const ResultType tmp( rhs );
2507  smpAssign( ~lhs, tmp );
2508  }
2510  //**********************************************************************************************
2511 
2512  //**SMP addition assignment to dense vectors****************************************************
2527  template< typename VT1 > // Type of the target dense vector
2528  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2529  smpAddAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2530  {
2532 
2533  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2534 
2535  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2536  return;
2537  }
2538 
2539  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2540  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2541 
2542  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2543  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2544  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2545  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2546 
2547  smpAddAssign( ~lhs, x * A );
2548  }
2550  //**********************************************************************************************
2551 
2552  //**SMP addition assignment to sparse vectors***************************************************
2553  // No special implementation for the SMP addition assignment to sparse vectors.
2554  //**********************************************************************************************
2555 
2556  //**SMP subtraction assignment to dense vectors*************************************************
2571  template< typename VT1 > // Type of the target dense vector
2572  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2573  smpSubAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2574  {
2576 
2577  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2578 
2579  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2580  return;
2581  }
2582 
2583  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2584  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2585 
2586  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2587  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2588  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2589  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2590 
2591  smpSubAssign( ~lhs, x * A );
2592  }
2594  //**********************************************************************************************
2595 
2596  //**SMP subtraction assignment to sparse vectors************************************************
2597  // No special implementation for the SMP subtraction assignment to sparse vectors.
2598  //**********************************************************************************************
2599 
2600  //**SMP multiplication assignment to dense vectors**********************************************
2615  template< typename VT1 > // Type of the target dense vector
2616  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2617  smpMultAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2618  {
2620 
2624 
2625  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2626 
2627  const ResultType tmp( rhs );
2628  smpMultAssign( ~lhs, tmp );
2629  }
2631  //**********************************************************************************************
2632 
2633  //**SMP multiplication assignment to sparse vectors*********************************************
2634  // No special implementation for the SMP multiplication assignment to sparse vectors.
2635  //**********************************************************************************************
2636 
2637  //**Compile time checks*************************************************************************
2645  //**********************************************************************************************
2646 };
2647 //*************************************************************************************************
2648 
2649 
2650 
2651 
2652 //=================================================================================================
2653 //
2654 // DVECSCALARMULTEXPR SPECIALIZATION
2655 //
2656 //=================================================================================================
2657 
2658 //*************************************************************************************************
2666 template< typename VT // Type of the left-hand side dense vector
2667  , typename MT // Type of the right-hand side dense matrix
2668  , typename ST > // Type of the side scalar value
2669 class DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >
2670  : public DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true >
2671  , private VecScalarMultExpr
2672  , private Computation
2673 {
2674  private:
2675  //**Type definitions****************************************************************************
// Private shorthand types for the wrapped vector/matrix multiplication and its operands.
2676  typedef TDVecTDMatMultExpr<VT,MT> VMM; //!< Type of the dense vector/dense matrix multiplication expression.
2677  typedef typename VMM::ResultType RES; //!< Result type of the dense vector/dense matrix multiplication expression.
2678  typedef typename VT::ResultType VRT; //!< Result type of the left-hand side dense vector expression.
2679  typedef typename MT::ResultType MRT; //!< Result type of the right-hand side dense matrix expression.
2680  typedef typename VRT::ElementType VET; //!< Element type of the left-hand side dense vector expression.
2681  typedef typename MRT::ElementType MET; //!< Element type of the right-hand side dense matrix expression.
2682  typedef typename VT::CompositeType VCT; //!< Composite type of the left-hand side dense vector expression.
2683  typedef typename MT::CompositeType MCT; //!< Composite type of the right-hand side dense matrix expression.
2684  //**********************************************************************************************
2685 
2686  //**********************************************************************************************
// Compilation switch for the evaluation of the left-hand side vector operand: set if VT is a
// computation or requires an evaluation. It selects the type LT and enters UseSMPAssign.
2688  enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2689  //**********************************************************************************************
2690 
2691  //**********************************************************************************************
// Compilation switch for the evaluation of the right-hand side matrix operand: set if MT
// requires an evaluation, or if MT is a computation whose element type is BLAS compatible and
// identical to the element type of the vector operand. It selects the type RT.
2693  enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2694  IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
2695  //**********************************************************************************************
2696 
2697  //**********************************************************************************************
2699 
2702  template< typename T1 >
2703  struct UseSMPAssign {
2704  enum { value = T1::smpAssignable && ( evaluateVector || evaluateMatrix ) };
2705  };
2706  //**********************************************************************************************
2707 
2708  //**********************************************************************************************
2710 
2712  template< typename T1, typename T2, typename T3, typename T4 >
2713  struct UseBlasKernel {
2714  enum { value = BLAZE_BLAS_MODE &&
2715  HasMutableDataAccess<T1>::value &&
2716  HasConstDataAccess<T2>::value &&
2717  HasConstDataAccess<T3>::value &&
2718  !IsDiagonal<T3>::value &&
2719  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2720  IsBlasCompatible<typename T1::ElementType>::value &&
2721  IsBlasCompatible<typename T2::ElementType>::value &&
2722  IsBlasCompatible<typename T3::ElementType>::value &&
2723  IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
2724  IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
2725  !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
2726  };
2727  //**********************************************************************************************
2728 
2729  //**********************************************************************************************
2731 
2734  template< typename T1, typename T2, typename T3, typename T4 >
2735  struct UseVectorizedDefaultKernel {
2736  enum { value = useOptimizedKernels &&
2737  !IsDiagonal<T3>::value &&
2738  T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2739  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2740  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2741  IsSame<typename T1::ElementType,T4>::value &&
2742  IntrinsicTrait<typename T1::ElementType>::addition &&
2743  IntrinsicTrait<typename T1::ElementType>::multiplication };
2744  };
2745  //**********************************************************************************************
2746 
2747  public:
2748  //**Type definitions****************************************************************************
2749  typedef DVecScalarMultExpr<VMM,ST,true> This; //!< Type of this DVecScalarMultExpr instance.
2750  typedef typename MultTrait<RES,ST>::Type ResultType; //!< Result type for expression template evaluations.
2751  typedef typename ResultType::TransposeType TransposeType; //!< Transpose type for expression template evaluations.
2752  typedef typename ResultType::ElementType ElementType; //!< Resulting element type.
2753  typedef typename IntrinsicTrait<ElementType>::Type IntrinsicType; //!< Resulting intrinsic element type.
2754  typedef const ElementType ReturnType; //!< Return type for expression template evaluations.
2755  typedef const ResultType CompositeType; //!< Data type for composite expression templates.
2756 
// Type of the left-hand side operand: the dense vector/dense matrix multiplication expression.
2758  typedef const TDVecTDMatMultExpr<VT,MT> LeftOperand;
2759 
// Type of the right-hand side operand: the scalar value.
2761  typedef ST RightOperand;
2762 
// Type for the assignment of the vector operand of the left-hand side expression:
// const VRT if evaluateVector is set, VCT otherwise.
2764  typedef typename SelectType< evaluateVector, const VRT, VCT >::Type LT;
2765 
// Type for the assignment of the matrix operand of the left-hand side expression:
// const MRT if evaluateMatrix is set, MCT otherwise.
2767  typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type RT;
2768  //**********************************************************************************************
2769 
2770  //**Compilation flags***************************************************************************
// Compilation switch for the expression template evaluation strategy: set if the matrix type
// is not diagonal, both operand types are vectorizable, vector elements, matrix elements and
// scalar share one type, and intrinsic addition and multiplication exist for that type.
2772  enum { vectorizable = !IsDiagonal<MT>::value &&
2773  VT::vectorizable && MT::vectorizable &&
2774  IsSame<VET,MET>::value &&
2775  IsSame<VET,ST>::value &&
2776  IntrinsicTrait<VET>::addition &&
2777  IntrinsicTrait<VET>::multiplication };
2778 
// Compilation switch for the expression template assignment strategy: set if neither operand
// has to be evaluated and both operand types are SMP-assignable themselves.
2780  enum { smpAssignable = !evaluateVector && VT::smpAssignable &&
2781  !evaluateMatrix && MT::smpAssignable };
2782  //**********************************************************************************************
2783 
2784  //**Constructor*********************************************************************************
2790  explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )
2791  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2792  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2793  {}
2794  //**********************************************************************************************
2795 
2796  //**Subscript operator**************************************************************************
2802  inline ReturnType operator[]( size_t index ) const {
2803  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2804  return vector_[index] * scalar_;
2805  }
2806  //**********************************************************************************************
2807 
2808  //**At function*********************************************************************************
2815  inline ReturnType at( size_t index ) const {
2816  if( index >= vector_.size() ) {
2817  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2818  }
2819  return (*this)[index];
2820  }
2821  //**********************************************************************************************
2822 
2823  //**Size function*******************************************************************************
2828  inline size_t size() const {
2829  return vector_.size();
2830  }
2831  //**********************************************************************************************
2832 
2833  //**Left operand access*************************************************************************
2838  inline LeftOperand leftOperand() const {
2839  return vector_;
2840  }
2841  //**********************************************************************************************
2842 
2843  //**Right operand access************************************************************************
2848  inline RightOperand rightOperand() const {
2849  return scalar_;
2850  }
2851  //**********************************************************************************************
2852 
2853  //**********************************************************************************************
2859  template< typename T >
2860  inline bool canAlias( const T* alias ) const {
2861  return vector_.canAlias( alias );
2862  }
2863  //**********************************************************************************************
2864 
2865  //**********************************************************************************************
2871  template< typename T >
2872  inline bool isAliased( const T* alias ) const {
2873  return vector_.isAliased( alias );
2874  }
2875  //**********************************************************************************************
2876 
2877  //**********************************************************************************************
2882  inline bool isAligned() const {
2883  return vector_.isAligned();
2884  }
2885  //**********************************************************************************************
2886 
2887  //**********************************************************************************************
2892  inline bool canSMPAssign() const {
2893  typename VMM::RightOperand A( vector_.rightOperand() );
2894  return ( !BLAZE_BLAS_IS_PARALLEL ||
2895  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2896  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
2897  ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
2898  }
2899  //**********************************************************************************************
2900 
2901  private:
2902  //**Member variables****************************************************************************
2903  LeftOperand vector_; //!< Left-hand side dense vector of the multiplication expression.
2904  RightOperand scalar_; //!< Right-hand side scalar of the multiplication expression.
2905  //**********************************************************************************************
2906 
2907  //**Assignment to dense vectors*****************************************************************
2919  template< typename VT1 // Type of the target dense vector
2920  , bool TF > // Transpose flag of the target dense vector
2921  friend inline void assign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
2922  {
2924 
2925  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2926 
2927  typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2928  typename VMM::RightOperand right( rhs.vector_.rightOperand() );
2929 
2930  if( right.rows() == 0UL ) {
2931  reset( ~lhs );
2932  return;
2933  }
2934  else if( right.columns() == 0UL ) {
2935  return;
2936  }
2937 
2938  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
2939  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
2940 
2941  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
2942  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
2943  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
2944  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2945 
2946  DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
2947  }
2948  //**********************************************************************************************
2949 
2950  //**Assignment to dense vectors (kernel selection)**********************************************
2961  template< typename VT1 // Type of the left-hand side target vector
2962  , typename VT2 // Type of the left-hand side vector operand
2963  , typename MT1 // Type of the right-hand side matrix operand
2964  , typename ST2 > // Type of the scalar value
2965  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2966  {
2967  if( ( IsDiagonal<MT1>::value ) ||
2968  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2969  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
2970  selectSmallAssignKernel( y, x, A, scalar );
2971  else
2972  selectBlasAssignKernel( y, x, A, scalar );
2973  }
2974  //**********************************************************************************************
2975 
2976  //**Default assignment to dense vectors*********************************************************
/*!\brief Default assignment of the scaled multiplication (\f$ \vec{y}^T=s*\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// The expression is rebuilt from the evaluated operands and handed to the target's own
// assign() member.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.assign( ( x * A ) * scalar );
}
2998  //**********************************************************************************************
2999 
3000  //**Default assignment to dense vectors (small matrices)****************************************
3014  template< typename VT1 // Type of the left-hand side target vector
3015  , typename VT2 // Type of the left-hand side vector operand
3016  , typename MT1 // Type of the right-hand side matrix operand
3017  , typename ST2 > // Type of the scalar value
3018  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3019  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3020  {
3021  selectDefaultAssignKernel( y, x, A, scalar );
3022  }
3023  //**********************************************************************************************
3024 
3025  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Vectorized assignment kernel for small matrices: y = ( x * A ) * scalar.
//
//  y      - target vector; every element y[j] with 0 <= j < A.columns() is overwritten
//  x      - left-hand side vector operand, read via x.load(i) and x[i]
//  A      - right-hand side matrix operand, read via A.load(i,j) and A(i,j)
//  scalar - scaling factor applied to each accumulated dot product
//
// This overload is enabled when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> is satisfied.
// The columns are processed in blocks of 8, 4, 3 and 2, followed by one final single column.
// For every block the row range [ibegin,iend) is narrowed according to the IsLower/IsUpper
// traits of MT1; rows [ibegin,ipos) are accumulated with intrinsic loads, rows [ipos,iend)
// are added element by element.
// NOTE(review): the xmm accumulators are declared without an initializer, so the kernel
// presumably relies on IntrinsicType default-constructing to zero - confirm.
3040  template< typename VT1 // Type of the left-hand side target vector
3041  , typename VT2 // Type of the left-hand side vector operand
3042  , typename MT1 // Type of the right-hand side matrix operand
3043  , typename ST2 > // Type of the scalar value
3044  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3045  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3046  {
3047  typedef IntrinsicTrait<ElementType> IT;
3048
3049  const size_t M( A.rows() );
3050  const size_t N( A.columns() );
3051
      // An element-wise clean-up pass is required unless both operands are padded
3052  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3053
3054  size_t j( 0UL );
3055
      // Blocks of eight columns
3056  for( ; (j+8UL) <= N; j+=8UL )
3057  {
      // First row to visit: for lower matrices the rows before the block's first column index
      // are skipped; the masking rounds the start down to a multiple of IT::size
      // (assumes IT::size is a power of two - TODO confirm)
3058  const size_t ibegin( ( IsLower<MT1>::value )
3059  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3060  :( 0UL ) );
      // One past the last row to visit: for upper matrices the range ends with the column
      // block (one row earlier for strictly upper matrices)
3061  const size_t iend( ( IsUpper<MT1>::value )
3062  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3063  :( M ) );
3064  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3065
      // End of the vectorized range: iend rounded down to a multiple of IT::size if a
      // remainder pass is needed, iend itself otherwise
3066  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3067  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3068
3069  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3070  size_t i( ibegin );
3071
      // Vectorized accumulation: one accumulator per column of the block
3072  for( ; i<ipos; i+=IT::size ) {
3073  const IntrinsicType x1( x.load(i) );
3074  xmm1 = xmm1 + x1 * A.load(i,j );
3075  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3076  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3077  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3078  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3079  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3080  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3081  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
3082  }
3083
      // Horizontal reduction and scaling; this overwrites the target elements
3084  y[j ] = sum( xmm1 ) * scalar;
3085  y[j+1UL] = sum( xmm2 ) * scalar;
3086  y[j+2UL] = sum( xmm3 ) * scalar;
3087  y[j+3UL] = sum( xmm4 ) * scalar;
3088  y[j+4UL] = sum( xmm5 ) * scalar;
3089  y[j+5UL] = sum( xmm6 ) * scalar;
3090  y[j+6UL] = sum( xmm7 ) * scalar;
3091  y[j+7UL] = sum( xmm8 ) * scalar;
3092
      // Element-wise treatment of the rows the vectorized loop did not cover
3093  for( ; remainder && i<iend; ++i ) {
3094  y[j ] += x[i] * A(i,j ) * scalar;
3095  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3096  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3097  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3098  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3099  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3100  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3101  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3102  }
3103  }
3104
      // Blocks of four columns (same scheme as above)
3105  for( ; (j+4UL) <= N; j+=4UL )
3106  {
3107  const size_t ibegin( ( IsLower<MT1>::value )
3108  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3109  :( 0UL ) );
3110  const size_t iend( ( IsUpper<MT1>::value )
3111  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3112  :( M ) );
3113  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3114
3115  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3116  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3117
3118  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3119  size_t i( ibegin );
3120
3121  for( ; i<ipos; i+=IT::size ) {
3122  const IntrinsicType x1( x.load(i) );
3123  xmm1 = xmm1 + x1 * A.load(i,j );
3124  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3125  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3126  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3127  }
3128
3129  y[j ] = sum( xmm1 ) * scalar;
3130  y[j+1UL] = sum( xmm2 ) * scalar;
3131  y[j+2UL] = sum( xmm3 ) * scalar;
3132  y[j+3UL] = sum( xmm4 ) * scalar;
3133
3134  for( ; remainder && i<iend; ++i ) {
3135  y[j ] += x[i] * A(i,j ) * scalar;
3136  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3137  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3138  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3139  }
3140  }
3141
      // Blocks of three columns
3142  for( ; (j+3UL) <= N; j+=3UL )
3143  {
3144  const size_t ibegin( ( IsLower<MT1>::value )
3145  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3146  :( 0UL ) );
3147  const size_t iend( ( IsUpper<MT1>::value )
3148  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3149  :( M ) );
3150  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3151
3152  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3153  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3154
3155  IntrinsicType xmm1, xmm2, xmm3;
3156  size_t i( ibegin );
3157
3158  for( ; i<ipos; i+=IT::size ) {
3159  const IntrinsicType x1( x.load(i) );
3160  xmm1 = xmm1 + x1 * A.load(i,j );
3161  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3162  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3163  }
3164
3165  y[j ] = sum( xmm1 ) * scalar;
3166  y[j+1UL] = sum( xmm2 ) * scalar;
3167  y[j+2UL] = sum( xmm3 ) * scalar;
3168
3169  for( ; remainder && i<iend; ++i ) {
3170  y[j ] += x[i] * A(i,j ) * scalar;
3171  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3172  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3173  }
3174  }
3175
      // Blocks of two columns
3176  for( ; (j+2UL) <= N; j+=2UL )
3177  {
3178  const size_t ibegin( ( IsLower<MT1>::value )
3179  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3180  :( 0UL ) );
3181  const size_t iend( ( IsUpper<MT1>::value )
3182  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3183  :( M ) );
3184  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3185
3186  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3187  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3188
3189  IntrinsicType xmm1, xmm2;
3190  size_t i( ibegin );
3191
3192  for( ; i<ipos; i+=IT::size ) {
3193  const IntrinsicType x1( x.load(i) );
3194  xmm1 = xmm1 + x1 * A.load(i,j );
3195  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3196  }
3197
3198  y[j ] = sum( xmm1 ) * scalar;
3199  y[j+1UL] = sum( xmm2 ) * scalar;
3200
3201  for( ; remainder && i<iend; ++i ) {
3202  y[j ] += x[i] * A(i,j ) * scalar;
3203  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3204  }
3205  }
3206
      // At most one column is left at this point
3207  if( j < N )
3208  {
3209  const size_t ibegin( ( IsLower<MT1>::value )
3210  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3211  :( 0UL ) );
3212  const size_t iend( ( IsUpper<MT1>::value )
3213  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3214  :( M ) );
3215  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3216
3217  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3218  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3219
3220  IntrinsicType xmm1;
3221  size_t i( ibegin );
3222
3223  for( ; i<ipos; i+=IT::size ) {
3224  xmm1 = xmm1 + A.load(i,j) * x.load(i);
3225  }
3226
3227  y[j] = sum( xmm1 ) * scalar;
3228
3229  for( ; remainder && i<iend; ++i ) {
3230  y[j] += x[i] * A(i,j) * scalar;
3231  }
3232  }
3233  }
3234  //**********************************************************************************************
3235 
3236  //**Default assignment to dense vectors (large matrices)****************************************
3250  template< typename VT1 // Type of the left-hand side target vector
3251  , typename VT2 // Type of the left-hand side vector operand
3252  , typename MT1 // Type of the right-hand side matrix operand
3253  , typename ST2 > // Type of the scalar value
3254  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3255  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3256  {
3257  selectDefaultAssignKernel( y, x, A, scalar );
3258  }
3259  //**********************************************************************************************
3260 
3261  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Vectorized assignment kernel for large matrices: y = ( x * A ) * scalar.
//
//  y      - target vector; it is reset first and then accumulated into
//  x      - left-hand side vector operand, read via x.load(i) and x[i]
//  A      - right-hand side matrix operand, read via A.load(i,j) and A(i,j)
//  scalar - scaling factor; each y[j] is multiplied by it after its accumulation is complete
//
// This overload is enabled when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> is satisfied.
// The columns are processed in blocks of 8, 4 and 2, followed by one final single column.
// Within a block the rows [ibegin,ipos) are traversed with intrinsic loads in steps of four,
// two and one intrinsic vector(s); the rows [ipos,iend) are added element by element.
3276  template< typename VT1 // Type of the left-hand side target vector
3277  , typename VT2 // Type of the left-hand side vector operand
3278  , typename MT1 // Type of the right-hand side matrix operand
3279  , typename ST2 > // Type of the scalar value
3280  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3281  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3282  {
3283  typedef IntrinsicTrait<ElementType> IT;
3284
3285  const size_t M( A.rows() );
3286  const size_t N( A.columns() );
3287
      // An element-wise clean-up pass is required unless both operands are padded
3288  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3289
      // All loops below accumulate into y via +=, hence the target is cleared up front
3290  reset( y );
3291
3292  size_t j( 0UL );
3293
      // Blocks of eight columns
3294  for( ; (j+8UL) <= N; j+=8UL )
3295  {
      // First row to visit: for lower matrices the rows before the block's first column index
      // are skipped; the masking rounds the start down to a multiple of IT::size
      // (assumes IT::size is a power of two - TODO confirm)
3296  const size_t ibegin( ( IsLower<MT1>::value )
3297  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3298  :( 0UL ) );
      // One past the last row to visit: for upper matrices the range ends with the column
      // block (one row earlier for strictly upper matrices)
3299  const size_t iend( ( IsUpper<MT1>::value )
3300  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3301  :( M ) );
3302  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3303
      // End of the vectorized range: iend rounded down to a multiple of IT::size if a
      // remainder pass is needed, iend itself otherwise
3304  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3305  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3306
3307  size_t i( ibegin );
3308
      // Four intrinsic vectors of rows per iteration
3309  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
3310  const size_t i1( i+IT::size );
3311  const size_t i2( i+IT::size*2UL );
3312  const size_t i3( i+IT::size*3UL );
3313  const IntrinsicType x1( x.load(i ) );
3314  const IntrinsicType x2( x.load(i1) );
3315  const IntrinsicType x3( x.load(i2) );
3316  const IntrinsicType x4( x.load(i3) );
3317  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3318  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3319  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3320  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3321  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3322  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3323  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3324  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
3325  }
3326
      // Two intrinsic vectors of rows per iteration
3327  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
3328  const size_t i1( i+IT::size );
3329  const IntrinsicType x1( x.load(i ) );
3330  const IntrinsicType x2( x.load(i1) );
3331  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3332  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3333  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3334  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3335  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3336  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3337  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3338  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
3339  }
3340
      // One intrinsic vector of rows per iteration
3341  for( ; i<ipos; i+=IT::size ) {
3342  const IntrinsicType x1( x.load(i) );
3343  y[j ] += sum( x1 * A.load(i,j ) );
3344  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3345  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3346  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3347  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
3348  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
3349  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
3350  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
3351  }
3352
      // Element-wise treatment of the rows the vectorized loops did not cover
3353  for( ; remainder && i<iend; ++i ) {
3354  y[j ] += x[i] * A(i,j );
3355  y[j+1UL] += x[i] * A(i,j+1UL);
3356  y[j+2UL] += x[i] * A(i,j+2UL);
3357  y[j+3UL] += x[i] * A(i,j+3UL);
3358  y[j+4UL] += x[i] * A(i,j+4UL);
3359  y[j+5UL] += x[i] * A(i,j+5UL);
3360  y[j+6UL] += x[i] * A(i,j+6UL);
3361  y[j+7UL] += x[i] * A(i,j+7UL);
3362  }
3363
      // Scaling of the finished dot products of this block
3364  y[j ] *= scalar;
3365  y[j+1UL] *= scalar;
3366  y[j+2UL] *= scalar;
3367  y[j+3UL] *= scalar;
3368  y[j+4UL] *= scalar;
3369  y[j+5UL] *= scalar;
3370  y[j+6UL] *= scalar;
3371  y[j+7UL] *= scalar;
3372  }
3373
      // Blocks of four columns (same scheme as above)
3374  for( ; (j+4UL) <= N; j+=4UL )
3375  {
3376  const size_t ibegin( ( IsLower<MT1>::value )
3377  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3378  :( 0UL ) );
3379  const size_t iend( ( IsUpper<MT1>::value )
3380  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3381  :( M ) );
3382  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3383
3384  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3385  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3386
3387  size_t i( ibegin );
3388
3389  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
3390  const size_t i1( i+IT::size );
3391  const size_t i2( i+IT::size*2UL );
3392  const size_t i3( i+IT::size*3UL );
3393  const IntrinsicType x1( x.load(i ) );
3394  const IntrinsicType x2( x.load(i1) );
3395  const IntrinsicType x3( x.load(i2) );
3396  const IntrinsicType x4( x.load(i3) );
3397  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3398  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3399  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3400  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3401  }
3402
3403  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
3404  const size_t i1( i+IT::size );
3405  const IntrinsicType x1( x.load(i ) );
3406  const IntrinsicType x2( x.load(i1) );
3407  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3408  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3409  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3410  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3411  }
3412
3413  for( ; i<ipos; i+=IT::size ) {
3414  const IntrinsicType x1( x.load(i) );
3415  y[j ] += sum( x1 * A.load(i,j ) );
3416  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3417  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3418  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3419  }
3420
3421  for( ; remainder && i<iend; ++i ) {
3422  y[j ] += x[i] * A(i,j );
3423  y[j+1UL] += x[i] * A(i,j+1UL);
3424  y[j+2UL] += x[i] * A(i,j+2UL);
3425  y[j+3UL] += x[i] * A(i,j+3UL);
3426  }
3427
3428  y[j ] *= scalar;
3429  y[j+1UL] *= scalar;
3430  y[j+2UL] *= scalar;
3431  y[j+3UL] *= scalar;
3432  }
3433
      // Blocks of two columns
3434  for( ; (j+2UL) <= N; j+=2UL )
3435  {
3436  const size_t ibegin( ( IsLower<MT1>::value )
3437  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3438  :( 0UL ) );
3439  const size_t iend( ( IsUpper<MT1>::value )
3440  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3441  :( M ) );
3442  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3443
3444  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3445  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3446
3447  size_t i( ibegin );
3448
3449  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
3450  const size_t i1( i+IT::size );
3451  const size_t i2( i+IT::size*2UL );
3452  const size_t i3( i+IT::size*3UL );
3453  const IntrinsicType x1( x.load(i ) );
3454  const IntrinsicType x2( x.load(i1) );
3455  const IntrinsicType x3( x.load(i2) );
3456  const IntrinsicType x4( x.load(i3) );
3457  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3458  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3459  }
3460
3461  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
3462  const size_t i1( i+IT::size );
3463  const IntrinsicType x1( x.load(i ) );
3464  const IntrinsicType x2( x.load(i1) );
3465  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3466  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3467  }
3468
3469  for( ; i<ipos; i+=IT::size ) {
3470  const IntrinsicType x1( x.load(i) );
3471  y[j ] += sum( x1 * A.load(i,j ) );
3472  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3473  }
3474
3475  for( ; remainder && i<iend; ++i ) {
3476  y[j ] += x[i] * A(i,j );
3477  y[j+1UL] += x[i] * A(i,j+1UL);
3478  }
3479
3480  y[j ] *= scalar;
3481  y[j+1UL] *= scalar;
3482  }
3483
      // At most one column is left at this point
3484  if( j < N )
3485  {
3486  const size_t ibegin( ( IsLower<MT1>::value )
3487  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3488  :( 0UL ) );
3489  const size_t iend( ( IsUpper<MT1>::value )
3490  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3491  :( M ) );
3492  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3493
3494  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3495  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3496
3497  size_t i( ibegin );
3498
3499  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
3500  const size_t i1( i+IT::size );
3501  const size_t i2( i+IT::size*2UL );
3502  const size_t i3( i+IT::size*3UL );
3503  const IntrinsicType x1( x.load(i ) );
3504  const IntrinsicType x2( x.load(i1) );
3505  const IntrinsicType x3( x.load(i2) );
3506  const IntrinsicType x4( x.load(i3) );
3507  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
3508  }
3509
3510  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
3511  const size_t i1( i+IT::size );
3512  const IntrinsicType x1( x.load(i ) );
3513  const IntrinsicType x2( x.load(i1) );
3514  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
3515  }
3516
3517  for( ; i<ipos; i+=IT::size ) {
3518  const IntrinsicType x1( x.load(i) );
3519  y[j] += sum( x1 * A.load(i,j) );
3520  }
3521
3522  for( ; remainder && i<iend; ++i ) {
3523  y[j] += x[i] * A(i,j);
3524  }
3525
3526  y[j] *= scalar;
3527  }
3528  }
3529  //**********************************************************************************************
3530 
3531  //**BLAS-based assignment to dense vectors (default)********************************************
3544  template< typename VT1 // Type of the left-hand side target vector
3545  , typename VT2 // Type of the left-hand side vector operand
3546  , typename MT1 // Type of the right-hand side matrix operand
3547  , typename ST2 > // Type of the scalar value
3548  static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
3549  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3550  {
3551  selectLargeAssignKernel( y, x, A, scalar );
3552  }
3553  //**********************************************************************************************
3554 
3555  //**BLAS-based assignment to dense vectors******************************************************
3556 #if BLAZE_BLAS_MODE
3557 
3570  template< typename VT1 // Type of the left-hand side target vector
3571  , typename VT2 // Type of the left-hand side vector operand
3572  , typename MT1 // Type of the right-hand side matrix operand
3573  , typename ST2 > // Type of the scalar value
3574  static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
3575  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3576  {
3577  typedef typename VT1::ElementType ET;
3578 
3579  if( IsTriangular<MT1>::value ) {
3580  assign( y, scalar * x );
3581  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3582  }
3583  else {
3584  gemv( y, x, A, ET(scalar), ET(0) );
3585  }
3586  }
3587 #endif
3588  //**********************************************************************************************
3589 
3590  //**Assignment to sparse vectors****************************************************************
3602  template< typename VT1 // Type of the target sparse vector
3603  , bool TF > // Transpose flag of the target sparse vector
3604  friend inline void assign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3605  {
3607 
3611 
3612  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3613 
3614  const ResultType tmp( serial( rhs ) );
3615  assign( ~lhs, tmp );
3616  }
3617  //**********************************************************************************************
3618 
3619  //**Addition assignment to dense vectors********************************************************
3631  template< typename VT1 // Type of the target dense vector
3632  , bool TF > // Transpose flag of the target dense vector
3633  friend inline void addAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3634  {
3636 
3637  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3638 
3639  typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
3640  typename VMM::RightOperand right( rhs.vector_.rightOperand() );
3641 
3642  if( right.rows() == 0UL || right.columns() == 0UL ) {
3643  return;
3644  }
3645 
3646  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3647  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3648 
3649  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3650  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3651  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3652  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3653 
3654  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3655  }
3656  //**********************************************************************************************
3657 
3658  //**Addition assignment to dense vectors (kernel selection)*************************************
3669  template< typename VT1 // Type of the left-hand side target vector
3670  , typename VT2 // Type of the left-hand side vector operand
3671  , typename MT1 // Type of the right-hand side matrix operand
3672  , typename ST2 > // Type of the scalar value
3673  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3674  {
3675  if( ( IsDiagonal<MT1>::value ) ||
3676  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3677  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3678  selectSmallAddAssignKernel( y, x, A, scalar );
3679  else
3680  selectBlasAddAssignKernel( y, x, A, scalar );
3681  }
3682  //**********************************************************************************************
3683 
3684  //**Default addition assignment to dense vectors************************************************
/*!\brief Default addition assignment of the scaled multiplication
//        (\f$ \vec{y}^T+=s*\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// The expression is rebuilt from the evaluated operands and handed to the target's own
// addAssign() member.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.addAssign( ( x * A ) * scalar );
}
3706  //**********************************************************************************************
3707 
3708  //**Default addition assignment to dense vectors (small matrices)*******************************
3722  template< typename VT1 // Type of the left-hand side target vector
3723  , typename VT2 // Type of the left-hand side vector operand
3724  , typename MT1 // Type of the right-hand side matrix operand
3725  , typename ST2 > // Type of the scalar value
3726  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3727  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3728  {
3729  selectDefaultAddAssignKernel( y, x, A, scalar );
3730  }
3731  //**********************************************************************************************
3732 
3733  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized addition assignment kernel for small matrices: y += ( x * A ) * scalar.
//
//  y      - target vector; the scaled dot products are added to its existing elements
//  x      - left-hand side vector operand, read via x.load(i) and x[i]
//  A      - right-hand side matrix operand, read via A.load(i,j) and A(i,j)
//  scalar - scaling factor applied to each accumulated dot product
//
// This overload is enabled when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> is satisfied.
// The columns are processed in blocks of 8, 4, 3 and 2, followed by one final single column.
// For every block the row range [ibegin,iend) is narrowed according to the IsLower/IsUpper
// traits of MT1; rows [ibegin,ipos) are accumulated with intrinsic loads, rows [ipos,iend)
// are added element by element.
// NOTE(review): the xmm accumulators are declared without an initializer, so the kernel
// presumably relies on IntrinsicType default-constructing to zero - confirm.
3748  template< typename VT1 // Type of the left-hand side target vector
3749  , typename VT2 // Type of the left-hand side vector operand
3750  , typename MT1 // Type of the right-hand side matrix operand
3751  , typename ST2 > // Type of the scalar value
3752  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3753  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3754  {
3755  typedef IntrinsicTrait<ElementType> IT;
3756
3757  const size_t M( A.rows() );
3758  const size_t N( A.columns() );
3759
      // An element-wise clean-up pass is required unless both operands are padded
3760  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3761
3762  size_t j( 0UL );
3763
      // Blocks of eight columns
3764  for( ; (j+8UL) <= N; j+=8UL )
3765  {
      // First row to visit: for lower matrices the rows before the block's first column index
      // are skipped; the masking rounds the start down to a multiple of IT::size
      // (assumes IT::size is a power of two - TODO confirm)
3766  const size_t ibegin( ( IsLower<MT1>::value )
3767  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3768  :( 0UL ) );
      // One past the last row to visit: for upper matrices the range ends with the column
      // block (one row earlier for strictly upper matrices)
3769  const size_t iend( ( IsUpper<MT1>::value )
3770  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3771  :( M ) );
3772  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3773
      // End of the vectorized range: iend rounded down to a multiple of IT::size if a
      // remainder pass is needed, iend itself otherwise
3774  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3775  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3776
3777  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3778  size_t i( ibegin );
3779
      // Vectorized accumulation: one accumulator per column of the block
3780  for( ; i<ipos; i+=IT::size ) {
3781  const IntrinsicType x1( x.load(i) );
3782  xmm1 = xmm1 + x1 * A.load(i,j );
3783  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3784  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3785  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3786  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3787  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3788  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3789  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
3790  }
3791
      // Horizontal reduction and scaling; the result is added to the target elements
3792  y[j ] += sum( xmm1 ) * scalar;
3793  y[j+1UL] += sum( xmm2 ) * scalar;
3794  y[j+2UL] += sum( xmm3 ) * scalar;
3795  y[j+3UL] += sum( xmm4 ) * scalar;
3796  y[j+4UL] += sum( xmm5 ) * scalar;
3797  y[j+5UL] += sum( xmm6 ) * scalar;
3798  y[j+6UL] += sum( xmm7 ) * scalar;
3799  y[j+7UL] += sum( xmm8 ) * scalar;
3800
      // Element-wise treatment of the rows the vectorized loop did not cover
3801  for( ; remainder && i<iend; ++i ) {
3802  y[j ] += x[i] * A(i,j ) * scalar;
3803  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3804  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3805  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3806  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3807  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3808  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3809  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3810  }
3811  }
3812
      // Blocks of four columns (same scheme as above)
3813  for( ; (j+4UL) <= N; j+=4UL )
3814  {
3815  const size_t ibegin( ( IsLower<MT1>::value )
3816  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3817  :( 0UL ) );
3818  const size_t iend( ( IsUpper<MT1>::value )
3819  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3820  :( M ) );
3821  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3822
3823  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3824  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3825
3826  IntrinsicType xmm1, xmm2, xmm3, xmm4;
3827  size_t i( ibegin );
3828
3829  for( ; i<ipos; i+=IT::size ) {
3830  const IntrinsicType x1( x.load(i) );
3831  xmm1 = xmm1 + x1 * A.load(i,j );
3832  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3833  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3834  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3835  }
3836
3837  y[j ] += sum( xmm1 ) * scalar;
3838  y[j+1UL] += sum( xmm2 ) * scalar;
3839  y[j+2UL] += sum( xmm3 ) * scalar;
3840  y[j+3UL] += sum( xmm4 ) * scalar;
3841
3842  for( ; remainder && i<iend; ++i ) {
3843  y[j ] += x[i] * A(i,j ) * scalar;
3844  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3845  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3846  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3847  }
3848  }
3849
      // Blocks of three columns
3850  for( ; (j+3UL) <= N; j+=3UL )
3851  {
3852  const size_t ibegin( ( IsLower<MT1>::value )
3853  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3854  :( 0UL ) );
3855  const size_t iend( ( IsUpper<MT1>::value )
3856  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3857  :( M ) );
3858  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3859
3860  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3861  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3862
3863  IntrinsicType xmm1, xmm2, xmm3;
3864  size_t i( ibegin );
3865
3866  for( ; i<ipos; i+=IT::size ) {
3867  const IntrinsicType x1( x.load(i) );
3868  xmm1 = xmm1 + x1 * A.load(i,j );
3869  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3870  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3871  }
3872
3873  y[j ] += sum( xmm1 ) * scalar;
3874  y[j+1UL] += sum( xmm2 ) * scalar;
3875  y[j+2UL] += sum( xmm3 ) * scalar;
3876
3877  for( ; remainder && i<iend; ++i ) {
3878  y[j ] += x[i] * A(i,j ) * scalar;
3879  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3880  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3881  }
3882  }
3883
      // Blocks of two columns
3884  for( ; (j+2UL) <= N; j+=2UL )
3885  {
3886  const size_t ibegin( ( IsLower<MT1>::value )
3887  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3888  :( 0UL ) );
3889  const size_t iend( ( IsUpper<MT1>::value )
3890  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3891  :( M ) );
3892  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3893
3894  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3895  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3896
3897  IntrinsicType xmm1, xmm2;
3898  size_t i( ibegin );
3899
3900  for( ; i<ipos; i+=IT::size ) {
3901  const IntrinsicType x1( x.load(i) );
3902  xmm1 = xmm1 + x1 * A.load(i,j );
3903  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3904  }
3905
3906  y[j ] += sum( xmm1 ) * scalar;
3907  y[j+1UL] += sum( xmm2 ) * scalar;
3908
3909  for( ; remainder && i<iend; ++i ) {
3910  y[j ] += x[i] * A(i,j ) * scalar;
3911  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3912  }
3913  }
3914
      // At most one column is left at this point
3915  if( j < N )
3916  {
3917  const size_t ibegin( ( IsLower<MT1>::value )
3918  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
3919  :( 0UL ) );
3920  const size_t iend( ( IsUpper<MT1>::value )
3921  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3922  :( M ) );
3923  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3924
3925  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
3926  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
3927
3928  IntrinsicType xmm1;
3929  size_t i( ibegin );
3930
3931  for( ; i<ipos; i+=IT::size ) {
3932  xmm1 = xmm1 + A.load(i,j) * x.load(i);
3933  }
3934
3935  y[j] += sum( xmm1 ) * scalar;
3936
3937  for( ; remainder && i<iend; ++i ) {
3938  y[j] += x[i] * A(i,j) * scalar;
3939  }
3940  }
3941  }
3942  //**********************************************************************************************
3943 
3944  //**Default addition assignment to dense vectors (large matrices)*******************************
3958  template< typename VT1 // Type of the left-hand side target vector
3959  , typename VT2 // Type of the left-hand side vector operand
3960  , typename MT1 // Type of the right-hand side matrix operand
3961  , typename ST2 > // Type of the scalar value
3962  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3963  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3964  {
3965  selectDefaultAddAssignKernel( y, x, A, scalar );
3966  }
3967  //**********************************************************************************************
3968 
3969  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized large-matrix kernel for the scaled addition assignment: every y[j] is increased
// by the contributions x[i] * A(i,j) * scalar of the visited rows i. This overload is enabled
// (via EnableIf) only when UseVectorizedDefaultKernel holds for the operand types.
//
// Columns are processed in blocks of 8, 4, 2 and finally a single leftover column. Within each
// block the rows are traversed with SIMD loads, unrolled by 4, 2 and 1 vectors of IT::size
// elements; each SIMD partial result is reduced with sum(), multiplied by 'scalar' and added
// to y right away. Rows that are not covered by the SIMD loops are finished by a scalar loop
// when 'remainder' is set.
//
// \param y       Target vector, updated in place.
// \param x       Vector operand of the product.
// \param A       Matrix operand of the product.
// \param scalar  Scaling factor applied to every contribution.
3984  template< typename VT1 // Type of the left-hand side target vector
3985  , typename VT2 // Type of the left-hand side vector operand
3986  , typename MT1 // Type of the right-hand side matrix operand
3987  , typename ST2 > // Type of the scalar value
3988  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3989  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3990  {
3991  typedef IntrinsicTrait<ElementType> IT;
3992 
3993  const size_t M( A.rows() );
3994  const size_t N( A.columns() );
3995 
      // True unless both the vector and the matrix operand are padded. When set, the SIMD loops
      // stop at a multiple of IT::size and the trailing rows are handled element by element.
3996  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3997 
3998  size_t j( 0UL );
3999 
      // Blocks of eight columns (j .. j+7).
4000  for( ; (j+8UL) <= N; j+=8UL )
4001  {
      // First row to visit: for lower matrices row j (j+1 if strictly lower), masked with
      // size_t(-IT::size), i.e. rounded down to a multiple of IT::size (assuming IT::size is a
      // power of two); row 0 otherwise.
4002  const size_t ibegin( ( IsLower<MT1>::value )
4003  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4004  :( 0UL ) );
      // One past the last row to visit: for upper matrices j+8 (j+7 if strictly upper),
      // all M rows otherwise.
4005  const size_t iend( ( IsUpper<MT1>::value )
4006  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4007  :( M ) );
4008  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4009 
      // End of the SIMD-processed row range: iend rounded down to a multiple of IT::size when
      // a scalar remainder loop is required, iend itself otherwise.
4010  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4011  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4012 
4013  size_t i( ibegin );
4014 
      // Four SIMD vectors of rows per iteration.
4015  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
4016  const size_t i1( i+IT::size );
4017  const size_t i2( i+IT::size*2UL );
4018  const size_t i3( i+IT::size*3UL );
4019  const IntrinsicType x1( x.load(i ) );
4020  const IntrinsicType x2( x.load(i1) );
4021  const IntrinsicType x3( x.load(i2) );
4022  const IntrinsicType x4( x.load(i3) );
4023  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4024  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4025  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4026  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4027  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4028  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4029  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4030  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4031  }
4032 
      // Two SIMD vectors of rows per iteration.
4033  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
4034  const size_t i1( i+IT::size );
4035  const IntrinsicType x1( x.load(i ) );
4036  const IntrinsicType x2( x.load(i1) );
4037  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4038  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4039  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4040  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4041  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4042  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4043  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4044  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4045  }
4046 
      // One SIMD vector of rows per iteration.
4047  for( ; i<ipos; i+=IT::size ) {
4048  const IntrinsicType x1( x.load(i) );
4049  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4050  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4051  y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4052  y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4053  y[j+4UL] += sum( x1 * A.load(i,j+4UL) ) * scalar;
4054  y[j+5UL] += sum( x1 * A.load(i,j+5UL) ) * scalar;
4055  y[j+6UL] += sum( x1 * A.load(i,j+6UL) ) * scalar;
4056  y[j+7UL] += sum( x1 * A.load(i,j+7UL) ) * scalar;
4057  }
4058 
      // Scalar tail for the rows in [ipos, iend); only entered when 'remainder' is set.
4059  for( ; remainder && i<iend; ++i ) {
4060  y[j ] += x[i] * A(i,j ) * scalar;
4061  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4062  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4063  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4064  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4065  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4066  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4067  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4068  }
4069  }
4070 
      // Blocks of four columns (j .. j+3); same row traversal as above.
4071  for( ; (j+4UL) <= N; j+=4UL )
4072  {
4073  const size_t ibegin( ( IsLower<MT1>::value )
4074  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4075  :( 0UL ) );
4076  const size_t iend( ( IsUpper<MT1>::value )
4077  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4078  :( M ) );
4079  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4080 
4081  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4082  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4083 
4084  size_t i( ibegin );
4085 
4086  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
4087  const size_t i1( i+IT::size );
4088  const size_t i2( i+IT::size*2UL );
4089  const size_t i3( i+IT::size*3UL );
4090  const IntrinsicType x1( x.load(i ) );
4091  const IntrinsicType x2( x.load(i1) );
4092  const IntrinsicType x3( x.load(i2) );
4093  const IntrinsicType x4( x.load(i3) );
4094  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4095  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4096  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4097  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4098  }
4099 
4100  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
4101  const size_t i1( i+IT::size );
4102  const IntrinsicType x1( x.load(i ) );
4103  const IntrinsicType x2( x.load(i1) );
4104  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4105  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4106  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4107  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4108  }
4109 
4110  for( ; i<ipos; i+=IT::size ) {
4111  const IntrinsicType x1( x.load(i) );
4112  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4113  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4114  y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4115  y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4116  }
4117 
4118  for( ; remainder && i<iend; ++i ) {
4119  y[j ] += x[i] * A(i,j ) * scalar;
4120  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4121  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4122  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4123  }
4124  }
4125 
      // Blocks of two columns (j, j+1).
4126  for( ; (j+2UL) <= N; j+=2UL )
4127  {
4128  const size_t ibegin( ( IsLower<MT1>::value )
4129  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4130  :( 0UL ) );
4131  const size_t iend( ( IsUpper<MT1>::value )
4132  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4133  :( M ) );
4134  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4135 
4136  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4137  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4138 
4139  size_t i( ibegin );
4140 
4141  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
4142  const size_t i1( i+IT::size );
4143  const size_t i2( i+IT::size*2UL );
4144  const size_t i3( i+IT::size*3UL );
4145  const IntrinsicType x1( x.load(i ) );
4146  const IntrinsicType x2( x.load(i1) );
4147  const IntrinsicType x3( x.load(i2) );
4148  const IntrinsicType x4( x.load(i3) );
4149  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4150  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4151  }
4152 
4153  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
4154  const size_t i1( i+IT::size );
4155  const IntrinsicType x1( x.load(i ) );
4156  const IntrinsicType x2( x.load(i1) );
4157  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4158  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4159  }
4160 
4161  for( ; i<ipos; i+=IT::size ) {
4162  const IntrinsicType x1( x.load(i) );
4163  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4164  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4165  }
4166 
4167  for( ; remainder && i<iend; ++i ) {
4168  y[j ] += x[i] * A(i,j ) * scalar;
4169  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4170  }
4171  }
4172 
      // At most one column is left after the two-column blocks.
4173  if( j < N )
4174  {
4175  const size_t ibegin( ( IsLower<MT1>::value )
4176  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4177  :( 0UL ) );
4178  const size_t iend( ( IsUpper<MT1>::value )
4179  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4180  :( M ) );
4181  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4182 
4183  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4184  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4185 
4186  size_t i( ibegin );
4187 
4188  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
4189  const size_t i1( i+IT::size );
4190  const size_t i2( i+IT::size*2UL );
4191  const size_t i3( i+IT::size*3UL );
4192  const IntrinsicType x1( x.load(i ) );
4193  const IntrinsicType x2( x.load(i1) );
4194  const IntrinsicType x3( x.load(i2) );
4195  const IntrinsicType x4( x.load(i3) );
4196  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4197  }
4198 
4199  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
4200  const size_t i1( i+IT::size );
4201  const IntrinsicType x1( x.load(i ) );
4202  const IntrinsicType x2( x.load(i1) );
4203  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4204  }
4205 
4206  for( ; i<ipos; i+=IT::size ) {
4207  const IntrinsicType x1( x.load(i) );
4208  y[j] += sum( x1 * A.load(i,j) ) * scalar;
4209  }
4210 
4211  for( ; remainder && i<iend; ++i ) {
4212  y[j] += x[i] * A(i,j) * scalar;
4213  }
4214  }
4215  }
4216  //**********************************************************************************************
4217 
4218  //**BLAS-based addition assignment to dense vectors (default)***********************************
4233  template< typename VT1 // Type of the left-hand side target vector
4234  , typename VT2 // Type of the left-hand side vector operand
4235  , typename MT1 // Type of the right-hand side matrix operand
4236  , typename ST2 > // Type of the scalar value
4237  static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
4238  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4239  {
4240  selectLargeAddAssignKernel( y, x, A, scalar );
4241  }
4242  //**********************************************************************************************
4243 
4244  //**BLAS-based addition assignment to dense vectors*********************************************
4245 #if BLAZE_BLAS_MODE
4246 
4259  template< typename VT1 // Type of the left-hand side target vector
4260  , typename VT2 // Type of the left-hand side vector operand
4261  , typename MT1 // Type of the right-hand side matrix operand
4262  , typename ST2 > // Type of the scalar value
4263  static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
4264  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4265  {
4266  typedef typename VT1::ElementType ET;
4267 
4268  if( IsTriangular<MT1>::value ) {
4269  typename VT1::ResultType tmp( serial( scalar * x ) );
4270  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4271  addAssign( y, tmp );
4272  }
4273  else {
4274  gemv( y, x, A, ET(scalar), ET(1) );
4275  }
4276  }
4277 #endif
4278  //**********************************************************************************************
4279 
4280  //**Addition assignment to sparse vectors*******************************************************
4281  // No special implementation for the addition assignment to sparse vectors.
4282  //**********************************************************************************************
4283 
4284  //**Subtraction assignment to dense vectors*****************************************************
4296  template< typename VT1 // Type of the target dense vector
4297  , bool TF > // Transpose flag of the target dense vector
4298  friend inline void subAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
4299  {
4301 
4302  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4303 
4304  typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
4305  typename VMM::RightOperand right( rhs.vector_.rightOperand() );
4306 
4307  if( right.rows() == 0UL || right.columns() == 0UL ) {
4308  return;
4309  }
4310 
4311  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
4312  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4313 
4314  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4315  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4316  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4317  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4318 
4319  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
4320  }
4321  //**********************************************************************************************
4322 
4323  //**Subtraction assignment to dense vectors (kernel selection)**********************************
4334  template< typename VT1 // Type of the left-hand side target vector
4335  , typename VT2 // Type of the left-hand side vector operand
4336  , typename MT1 // Type of the right-hand side matrix operand
4337  , typename ST2 > // Type of the scalar value
4338  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4339  {
4340  if( ( IsDiagonal<MT1>::value ) ||
4341  ( IsComputation<MT>::value && !evaluateMatrix ) ||
4342  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
4343  selectSmallSubAssignKernel( y, x, A, scalar );
4344  else
4345  selectBlasSubAssignKernel( y, x, A, scalar );
4346  }
4347  //**********************************************************************************************
4348 
4349  //**Default subtraction assignment to dense vectors*********************************************
// Default kernel for the scaled subtraction assignment.
//
// Builds the expression x * A * scalar and hands it to the subAssign() member of the target
// vector; no explicit loops are written here.
//
// \param y       Target vector of the subtraction assignment.
// \param x       Vector operand of the product.
// \param A       Matrix operand of the product.
// \param scalar  Scaling factor of the product.
template< typename VT1    // Type of the target vector
        , typename VT2    // Type of the vector operand
        , typename MT1    // Type of the matrix operand
        , typename ST2 >  // Type of the scaling factor
static inline void
   selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.subAssign( ( x * A ) * scalar );
}
4371  //**********************************************************************************************
4372 
4373  //**Default subtraction assignment to dense vectors (small matrices)****************************
4387  template< typename VT1 // Type of the left-hand side target vector
4388  , typename VT2 // Type of the left-hand side vector operand
4389  , typename MT1 // Type of the right-hand side matrix operand
4390  , typename ST2 > // Type of the scalar value
4391  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4392  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4393  {
4394  selectDefaultSubAssignKernel( y, x, A, scalar );
4395  }
4396  //**********************************************************************************************
4397 
4398  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized small-matrix kernel for the scaled subtraction assignment: every y[j] is
// decreased by the contributions x[i] * A(i,j) * scalar of the visited rows i. This overload
// is enabled (via EnableIf) only when UseVectorizedDefaultKernel holds for the operand types.
//
// Columns are processed in blocks of 8, 4, 3, 2 and finally a single leftover column. For
// each block one SIMD accumulator per column (xmm1, xmm2, ...) collects the products of the
// loaded x and A values over the SIMD-processed rows; after the loop each accumulator is
// reduced with sum(), multiplied by 'scalar' and subtracted from y. Rows that are not covered
// by the SIMD loop are finished by a scalar loop when 'remainder' is set.
// NOTE(review): the accumulators are default-constructed and never explicitly cleared here;
// this assumes IntrinsicType's default constructor yields zero - confirm.
//
// \param y       Target vector, updated in place.
// \param x       Vector operand of the product.
// \param A       Matrix operand of the product.
// \param scalar  Scaling factor applied to every contribution.
4413  template< typename VT1 // Type of the left-hand side target vector
4414  , typename VT2 // Type of the left-hand side vector operand
4415  , typename MT1 // Type of the right-hand side matrix operand
4416  , typename ST2 > // Type of the scalar value
4417  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4418  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4419  {
4420  typedef IntrinsicTrait<ElementType> IT;
4421 
4422  const size_t M( A.rows() );
4423  const size_t N( A.columns() );
4424 
      // True unless both the vector and the matrix operand are padded. When set, the SIMD loops
      // stop at a multiple of IT::size and the trailing rows are handled element by element.
4425  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4426 
4427  size_t j( 0UL );
4428 
      // Blocks of eight columns (j .. j+7).
4429  for( ; (j+8UL) <= N; j+=8UL )
4430  {
      // First row to visit: for lower matrices row j (j+1 if strictly lower), masked with
      // size_t(-IT::size), i.e. rounded down to a multiple of IT::size (assuming IT::size is a
      // power of two); row 0 otherwise.
4431  const size_t ibegin( ( IsLower<MT1>::value )
4432  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4433  :( 0UL ) );
      // One past the last row to visit: for upper matrices j+8 (j+7 if strictly upper),
      // all M rows otherwise.
4434  const size_t iend( ( IsUpper<MT1>::value )
4435  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4436  :( M ) );
4437  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4438 
      // End of the SIMD-processed row range: iend rounded down to a multiple of IT::size when
      // a scalar remainder loop is required, iend itself otherwise.
4439  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4440  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4441 
      // One SIMD accumulator per column of the block.
4442  IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4443  size_t i( ibegin );
4444 
      // The loaded x values are reused for all eight columns.
4445  for( ; i<ipos; i+=IT::size ) {
4446  const IntrinsicType x1( x.load(i) );
4447  xmm1 = xmm1 + x1 * A.load(i,j );
4448  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4449  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4450  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
4451  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
4452  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
4453  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
4454  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
4455  }
4456 
      // Reduce each accumulator, scale it and subtract it from the target.
4457  y[j ] -= sum( xmm1 ) * scalar;
4458  y[j+1UL] -= sum( xmm2 ) * scalar;
4459  y[j+2UL] -= sum( xmm3 ) * scalar;
4460  y[j+3UL] -= sum( xmm4 ) * scalar;
4461  y[j+4UL] -= sum( xmm5 ) * scalar;
4462  y[j+5UL] -= sum( xmm6 ) * scalar;
4463  y[j+6UL] -= sum( xmm7 ) * scalar;
4464  y[j+7UL] -= sum( xmm8 ) * scalar;
4465 
      // Scalar tail for the rows in [ipos, iend); only entered when 'remainder' is set.
4466  for( ; remainder && i<iend; ++i ) {
4467  y[j ] -= x[i] * A(i,j ) * scalar;
4468  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4469  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4470  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4471  y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4472  y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4473  y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4474  y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4475  }
4476  }
4477 
      // Blocks of four columns (j .. j+3); same scheme as above.
4478  for( ; (j+4UL) <= N; j+=4UL )
4479  {
4480  const size_t ibegin( ( IsLower<MT1>::value )
4481  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4482  :( 0UL ) );
4483  const size_t iend( ( IsUpper<MT1>::value )
4484  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4485  :( M ) );
4486  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4487 
4488  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4489  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4490 
4491  IntrinsicType xmm1, xmm2, xmm3, xmm4;
4492  size_t i( ibegin );
4493 
4494  for( ; i<ipos; i+=IT::size ) {
4495  const IntrinsicType x1( x.load(i) );
4496  xmm1 = xmm1 + x1 * A.load(i,j );
4497  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4498  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4499  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
4500  }
4501 
4502  y[j ] -= sum( xmm1 ) * scalar;
4503  y[j+1UL] -= sum( xmm2 ) * scalar;
4504  y[j+2UL] -= sum( xmm3 ) * scalar;
4505  y[j+3UL] -= sum( xmm4 ) * scalar;
4506 
4507  for( ; remainder && i<iend; ++i ) {
4508  y[j ] -= x[i] * A(i,j ) * scalar;
4509  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4510  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4511  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4512  }
4513  }
4514 
      // Blocks of three columns (j .. j+2).
4515  for( ; (j+3UL) <= N; j+=3UL )
4516  {
4517  const size_t ibegin( ( IsLower<MT1>::value )
4518  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4519  :( 0UL ) );
4520  const size_t iend( ( IsUpper<MT1>::value )
4521  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
4522  :( M ) );
4523  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4524 
4525  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4526  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4527 
4528  IntrinsicType xmm1, xmm2, xmm3;
4529  size_t i( ibegin );
4530 
4531  for( ; i<ipos; i+=IT::size ) {
4532  const IntrinsicType x1( x.load(i) );
4533  xmm1 = xmm1 + x1 * A.load(i,j );
4534  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4535  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4536  }
4537 
4538  y[j ] -= sum( xmm1 ) * scalar;
4539  y[j+1UL] -= sum( xmm2 ) * scalar;
4540  y[j+2UL] -= sum( xmm3 ) * scalar;
4541 
4542  for( ; remainder && i<iend; ++i ) {
4543  y[j ] -= x[i] * A(i,j ) * scalar;
4544  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4545  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4546  }
4547  }
4548 
      // Blocks of two columns (j, j+1).
4549  for( ; (j+2UL) <= N; j+=2UL )
4550  {
4551  const size_t ibegin( ( IsLower<MT1>::value )
4552  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4553  :( 0UL ) );
4554  const size_t iend( ( IsUpper<MT1>::value )
4555  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4556  :( M ) );
4557  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4558 
4559  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4560  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4561 
4562  IntrinsicType xmm1, xmm2;
4563  size_t i( ibegin );
4564 
4565  for( ; i<ipos; i+=IT::size ) {
4566  const IntrinsicType x1( x.load(i) );
4567  xmm1 = xmm1 + x1 * A.load(i,j );
4568  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4569  }
4570 
4571  y[j ] -= sum( xmm1 ) * scalar;
4572  y[j+1UL] -= sum( xmm2 ) * scalar;
4573 
4574  for( ; remainder && i<iend; ++i ) {
4575  y[j ] -= x[i] * A(i,j ) * scalar;
4576  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4577  }
4578  }
4579 
      // At most one column is left after the two-column blocks.
4580  if( j < N )
4581  {
4582  const size_t ibegin( ( IsLower<MT1>::value )
4583  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4584  :( 0UL ) );
4585  const size_t iend( ( IsUpper<MT1>::value )
4586  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4587  :( M ) );
4588  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4589 
4590  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4591  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4592 
4593  IntrinsicType xmm1;
4594  size_t i( ibegin );
4595 
4596  for( ; i<ipos; i+=IT::size ) {
4597  xmm1 = xmm1 + A.load(i,j) * x.load(i);
4598  }
4599 
4600  y[j] -= sum( xmm1 ) * scalar;
4601 
4602  for( ; remainder && i<iend; ++i ) {
4603  y[j] -= x[i] * A(i,j) * scalar;
4604  }
4605  }
4606  }
4607  //**********************************************************************************************
4608 
4609  //**Default subtraction assignment to dense vectors (large matrices)****************************
4623  template< typename VT1 // Type of the left-hand side target vector
4624  , typename VT2 // Type of the left-hand side vector operand
4625  , typename MT1 // Type of the right-hand side matrix operand
4626  , typename ST2 > // Type of the scalar value
4627  static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4628  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4629  {
4630  selectDefaultSubAssignKernel( y, x, A, scalar );
4631  }
4632  //**********************************************************************************************
4633 
4634  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
4649  template< typename VT1 // Type of the left-hand side target vector
4650  , typename VT2 // Type of the left-hand side vector operand
4651  , typename MT1 // Type of the right-hand side matrix operand
4652  , typename ST2 > // Type of the scalar value
4653  static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4654  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4655  {
4656  typedef IntrinsicTrait<ElementType> IT;
4657 
4658  const size_t M( A.rows() );
4659  const size_t N( A.columns() );
4660 
4661  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4662 
4663  size_t j( 0UL );
4664 
4665  for( ; (j+8UL) <= N; j+=8UL )
4666  {
4667  const size_t ibegin( ( IsLower<MT1>::value )
4668  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4669  :( 0UL ) );
4670  const size_t iend( ( IsUpper<MT1>::value )
4671  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4672  :( M ) );
4673  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4674 
4675  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4676  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4677 
4678  size_t i( ibegin );
4679 
4680  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
4681  const size_t i1( i+IT::size );
4682  const size_t i2( i+IT::size*2UL );
4683  const size_t i3( i+IT::size*3UL );
4684  const IntrinsicType x1( x.load(i ) );
4685  const IntrinsicType x2( x.load(i1) );
4686  const IntrinsicType x3( x.load(i2) );
4687  const IntrinsicType x4( x.load(i3) );
4688  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4689  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4690  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4691  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4692  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4693  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4694  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4695  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4696  }
4697 
4698  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
4699  const size_t i1( i+IT::size );
4700  const IntrinsicType x1( x.load(i ) );
4701  const IntrinsicType x2( x.load(i1) );
4702  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4703  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4704  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4705  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4706  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4707  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4708  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4709  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4710  }
4711 
4712  for( ; i<ipos; i+=IT::size ) {
4713  const IntrinsicType x1( x.load(i) );
4714  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4715  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4716  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
4717  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
4718  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) ) * scalar;
4719  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) ) * scalar;
4720  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) ) * scalar;
4721  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) ) * scalar;
4722  }
4723 
4724  for( ; remainder && i<iend; ++i ) {
4725  y[j ] -= x[i] * A(i,j ) * scalar;
4726  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4727  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4728  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4729  y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4730  y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4731  y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4732  y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4733  }
4734  }
4735 
4736  for( ; (j+4UL) <= N; j+=4UL )
4737  {
4738  const size_t ibegin( ( IsLower<MT1>::value )
4739  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4740  :( 0UL ) );
4741  const size_t iend( ( IsUpper<MT1>::value )
4742  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4743  :( M ) );
4744  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4745 
4746  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4747  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4748 
4749  size_t i( ibegin );
4750 
4751  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
4752  const size_t i1( i+IT::size );
4753  const size_t i2( i+IT::size*2UL );
4754  const size_t i3( i+IT::size*3UL );
4755  const IntrinsicType x1( x.load(i ) );
4756  const IntrinsicType x2( x.load(i1) );
4757  const IntrinsicType x3( x.load(i2) );
4758  const IntrinsicType x4( x.load(i3) );
4759  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4760  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4761  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4762  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4763  }
4764 
4765  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
4766  const size_t i1( i+IT::size );
4767  const IntrinsicType x1( x.load(i ) );
4768  const IntrinsicType x2( x.load(i1) );
4769  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4770  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4771  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4772  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4773  }
4774 
4775  for( ; i<ipos; i+=IT::size ) {
4776  const IntrinsicType x1( x.load(i) );
4777  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4778  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4779  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
4780  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
4781  }
4782 
4783  for( ; remainder && i<iend; ++i ) {
4784  y[j ] -= x[i] * A(i,j ) * scalar;
4785  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4786  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4787  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4788  }
4789  }
4790 
4791  for( ; (j+2UL) <= N; j+=2UL )
4792  {
4793  const size_t ibegin( ( IsLower<MT1>::value )
4794  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4795  :( 0UL ) );
4796  const size_t iend( ( IsUpper<MT1>::value )
4797  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4798  :( M ) );
4799  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4800 
4801  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4802  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4803 
4804  size_t i( ibegin );
4805 
4806  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
4807  const size_t i1( i+IT::size );
4808  const size_t i2( i+IT::size*2UL );
4809  const size_t i3( i+IT::size*3UL );
4810  const IntrinsicType x1( x.load(i ) );
4811  const IntrinsicType x2( x.load(i1) );
4812  const IntrinsicType x3( x.load(i2) );
4813  const IntrinsicType x4( x.load(i3) );
4814  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4815  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4816  }
4817 
4818  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
4819  const size_t i1( i+IT::size );
4820  const IntrinsicType x1( x.load(i ) );
4821  const IntrinsicType x2( x.load(i1) );
4822  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4823  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4824  }
4825 
4826  for( ; i<ipos; i+=IT::size ) {
4827  const IntrinsicType x1( x.load(i) );
4828  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4829  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4830  }
4831 
4832  for( ; remainder && i<iend; ++i ) {
4833  y[j ] -= x[i] * A(i,j ) * scalar;
4834  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4835  }
4836  }
4837 
4838  if( j < N )
4839  {
4840  const size_t ibegin( ( IsLower<MT1>::value )
4841  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-IT::size) )
4842  :( 0UL ) );
4843  const size_t iend( ( IsUpper<MT1>::value )
4844  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4845  :( M ) );
4846  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4847 
4848  const size_t ipos( remainder ? ( iend & size_t(-IT::size) ) : iend );
4849  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (IT::size) ) ) == ipos, "Invalid end calculation" );
4850 
4851  size_t i( ibegin );
4852 
4853  for( ; (i+IT::size*3UL) < ipos; i+=IT::size*4UL ) {
4854  const size_t i1( i+IT::size );
4855  const size_t i2( i+IT::size*2UL );
4856  const size_t i3( i+IT::size*3UL );
4857  const IntrinsicType x1( x.load(i ) );
4858  const IntrinsicType x2( x.load(i1) );
4859  const IntrinsicType x3( x.load(i2) );
4860  const IntrinsicType x4( x.load(i3) );
4861  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4862  }
4863 
4864  for( ; (i+IT::size) < ipos; i+=IT::size*2UL ) {
4865  const size_t i1( i+IT::size );
4866  const IntrinsicType x1( x.load(i ) );
4867  const IntrinsicType x2( x.load(i1) );
4868  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4869  }
4870 
4871  for( ; i<ipos; i+=IT::size ) {
4872  const IntrinsicType x1( x.load(i) );
4873  y[j] -= sum( x1 * A.load(i,j) ) * scalar;
4874  }
4875 
4876  for( ; remainder && i<iend; ++i ) {
4877  y[j] -= x[i] * A(i,j) * scalar;
4878  }
4879  }
4880  }
4881  //**********************************************************************************************
4882 
4883  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback overload of selectBlasSubAssignKernel(): it is selected when
// UseBlasKernel<VT1,VT2,MT1,ST2> does not hold (DisableIf) and simply forwards all
// arguments, unchanged, to selectLargeSubAssignKernel().
//   y      : target vector (taken by non-const reference)
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scalar value passed through to the large kernel
4898  template< typename VT1 // Type of the left-hand side target vector
4899  , typename VT2 // Type of the left-hand side vector operand
4900  , typename MT1 // Type of the right-hand side matrix operand
4901  , typename ST2 > // Type of the scalar value
4902  static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
4903  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4904  {
4905  selectLargeSubAssignKernel( y, x, A, scalar );
4906  }
4907  //**********************************************************************************************
4908 
4909  //**BLAS-based subtraction assignment to dense vectors******************************************
4910 #if BLAZE_BLAS_MODE
4911 
// BLAS-backed overload of selectBlasSubAssignKernel(): it is enabled when
// UseBlasKernel<VT1,VT2,MT1,ST2> holds (EnableIf) and is only compiled inside the
// surrounding BLAZE_BLAS_MODE preprocessor guard.
//   y      : target vector (taken by non-const reference)
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scalar value of the scaled product
4924  template< typename VT1 // Type of the left-hand side target vector
4925  , typename VT2 // Type of the left-hand side vector operand
4926  , typename MT1 // Type of the right-hand side matrix operand
4927  , typename ST2 > // Type of the scalar value
4928  static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
4929  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4930  {
4931  typedef typename VT1::ElementType ET;
4932 
      // Triangular matrix: evaluate scalar*x serially into a temporary, hand the temporary and A
      // to trmv() with the lower/upper flag chosen from IsLower<MT1>, then subtract it from y.
4933  if( IsTriangular<MT1>::value ) {
4934  typename VT1::ResultType tmp( serial( scalar * x ) );
4935  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4936  subAssign( y, tmp );
4937  }
      // Any other matrix: one gemv() call with the factors ET(-scalar) and ET(1).
      // NOTE(review): presumably these act as BLAS alpha/beta, i.e. y = -scalar*(x*A) + y;
      // confirm against the gemv() wrapper.
4938  else {
4939  gemv( y, x, A, ET(-scalar), ET(1) );
4940  }
4941  }
4942 #endif
4943  //**********************************************************************************************
4944 
4945  //**Subtraction assignment to sparse vectors****************************************************
4946  // No special implementation for the subtraction assignment to sparse vectors.
4947  //**********************************************************************************************
4948 
4949  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of a DVecScalarMultExpr expression to a dense vector.
// The expression is first evaluated serially into a temporary of type ResultType; the
// temporary is then multiplication-assigned to the target vector.
//   lhs : target dense vector
//   rhs : expression to be multiplied into the target; an internal assertion requires its
//         size to equal the size of lhs
4961  template< typename VT1 // Type of the target dense vector
4962  , bool TF > // Transpose flag of the target dense vector
4963  friend inline void multAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
4964  {
4966 
4970 
4971  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4972 
4973  const ResultType tmp( serial( rhs ) ); // serial evaluation of the whole expression
4974  multAssign( ~lhs, tmp );
4975  }
4976  //**********************************************************************************************
4977 
4978  //**Multiplication assignment to sparse vectors*************************************************
4979  // No special implementation for the multiplication assignment to sparse vectors.
4980  //**********************************************************************************************
4981 
4982  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of a scaled vector/matrix product to a dense vector.
// Enabled only when UseSMPAssign<VT1> holds (EnableIf).
//   lhs : target dense vector
//   rhs : expression whose nested product (rhs.vector_) and scalar (rhs.scalar_) are assigned
// The operands of the nested product are evaluated into x and A, and the rebuilt expression
// x * A * rhs.scalar_ is passed to smpAssign() for the target.
4996  template< typename VT1 // Type of the target dense vector
4997  , bool TF > // Transpose flag of the target dense vector
4998  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
4999  smpAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5000  {
5002 
5003  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5004 
5005  typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
5006  typename VMM::RightOperand right( rhs.vector_.rightOperand() );
5007 
      // Degenerate shapes: a matrix without rows resets the target; a matrix without columns
      // leaves the target untouched.
5008  if( right.rows() == 0UL ) {
5009  reset( ~lhs );
5010  return;
5011  }
5012  else if( right.columns() == 0UL ) {
5013  return;
5014  }
5015 
5016  LT x( left ); // Evaluation of the left-hand side dense vector operand
5017  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5018 
5019  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5020  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5021  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5022  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5023 
5024  smpAssign( ~lhs, x * A * rhs.scalar_ );
5025  }
5026  //**********************************************************************************************
5027 
5028  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of a DVecScalarMultExpr expression to a sparse vector.
// Enabled only when UseSMPAssign<VT1> holds (EnableIf).
// The expression is evaluated into a temporary of type ResultType, which is then passed to
// smpAssign() for the target.
//   lhs : target sparse vector
//   rhs : expression to be assigned; an internal assertion requires its size to equal the
//         size of lhs
5042  template< typename VT1 // Type of the target sparse vector
5043  , bool TF > // Transpose flag of the target sparse vector
5044  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5045  smpAssign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5046  {
5048 
5052 
5053  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5054 
5055  const ResultType tmp( rhs ); // evaluate the expression into a temporary first
5056  smpAssign( ~lhs, tmp );
5057  }
5058  //**********************************************************************************************
5059 
5060  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of a scaled vector/matrix product to a dense vector.
// Enabled only when UseSMPAssign<VT1> holds (EnableIf).
//   lhs : target dense vector
//   rhs : expression whose nested product (rhs.vector_) and scalar (rhs.scalar_) are added
// The operands of the nested product are evaluated into x and A, and the rebuilt expression
// x * A * rhs.scalar_ is passed to smpAddAssign() for the target.
5074  template< typename VT1 // Type of the target dense vector
5075  , bool TF > // Transpose flag of the target dense vector
5076  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5077  smpAddAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5078  {
5080 
5081  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5082 
5083  typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
5084  typename VMM::RightOperand right( rhs.vector_.rightOperand() );
5085 
      // A matrix without rows or without columns leaves the target untouched.
5086  if( right.rows() == 0UL || right.columns() == 0UL ) {
5087  return;
5088  }
5089 
5090  LT x( left ); // Evaluation of the left-hand side dense vector operand
5091  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5092 
5093  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5094  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5095  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5096  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5097 
5098  smpAddAssign( ~lhs, x * A * rhs.scalar_ );
5099  }
5100  //**********************************************************************************************
5101 
5102  //**SMP addition assignment to sparse vectors***************************************************
5103  // No special implementation for the SMP addition assignment to sparse vectors.
5104  //**********************************************************************************************
5105 
5106  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of a scaled vector/matrix product to a dense vector.
// Enabled only when UseSMPAssign<VT1> holds (EnableIf).
//   lhs : target dense vector
//   rhs : expression whose nested product (rhs.vector_) and scalar (rhs.scalar_) are subtracted
// The operands of the nested product are evaluated into x and A, and the rebuilt expression
// x * A * rhs.scalar_ is passed to smpSubAssign() for the target.
5120  template< typename VT1 // Type of the target dense vector
5121  , bool TF > // Transpose flag of the target dense vector
5122  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5123  smpSubAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5124  {
5126 
5127  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5128 
5129  typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
5130  typename VMM::RightOperand right( rhs.vector_.rightOperand() );
5131 
      // A matrix without rows or without columns leaves the target untouched.
5132  if( right.rows() == 0UL || right.columns() == 0UL ) {
5133  return;
5134  }
5135 
5136  LT x( left ); // Evaluation of the left-hand side dense vector operand
5137  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5138 
5139  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5140  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5141  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5142  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5143 
5144  smpSubAssign( ~lhs, x * A * rhs.scalar_ );
5145  }
5146  //**********************************************************************************************
5147 
5148  //**SMP subtraction assignment to sparse vectors************************************************
5149  // No special implementation for the SMP subtraction assignment to sparse vectors.
5150  //**********************************************************************************************
5151 
5152  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of a DVecScalarMultExpr expression to a dense vector.
// Enabled only when UseSMPAssign<VT1> holds (EnableIf).
// The expression is evaluated into a temporary of type ResultType, which is then passed to
// smpMultAssign() for the target.
//   lhs : target dense vector
//   rhs : expression to be multiplied into the target; an internal assertion requires its
//         size to equal the size of lhs
5166  template< typename VT1 // Type of the target dense vector
5167  , bool TF > // Transpose flag of the target dense vector
5168  friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5169  smpMultAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5170  {
5172 
5176 
5177  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5178 
5179  const ResultType tmp( rhs ); // evaluate the expression into a temporary first
5180  smpMultAssign( ~lhs, tmp );
5181  }
5182  //**********************************************************************************************
5183 
5184  //**SMP multiplication assignment to sparse vectors*********************************************
5185  // No special implementation for the SMP multiplication assignment to sparse vectors.
5186  //**********************************************************************************************
5187 
5188  //**Compile time checks*************************************************************************
5196  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
5197  //**********************************************************************************************
5198 };
5200 //*************************************************************************************************
5201 
5202 
5203 
5204 
5205 //=================================================================================================
5206 //
5207 // GLOBAL BINARY ARITHMETIC OPERATORS
5208 //
5209 //=================================================================================================
5210 
5211 //*************************************************************************************************
// Multiplication operator for a transpose dense vector and a column-major dense matrix.
// The overload is disabled (DisableIf) when T2 is a matrix/matrix multiplication expression.
//   vec : left-hand side transpose dense vector
//   mat : right-hand side column-major dense matrix
// Returns the TDVecTDMatMultExpr expression object built from the two operands.
// Raises an invalid-argument exception via BLAZE_THROW_INVALID_ARGUMENT when the size of the
// vector differs from the number of rows of the matrix.
5242 template< typename T1 // Type of the left-hand side dense vector
5243  , typename T2 > // Type of the right-hand side dense matrix
5244 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecTDMatMultExpr<T1,T2> >::Type
5245  operator*( const DenseVector<T1,true>& vec, const DenseMatrix<T2,true>& mat )
5246 {
5248 
5249  if( (~vec).size() != (~mat).rows() ) {
5250  BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
5251  }
5252 
5253  return TDVecTDMatMultExpr<T1,T2>( ~vec, ~mat );
5254 }
5255 //*************************************************************************************************
5256 
5257 
5258 
5259 
5260 //=================================================================================================
5261 //
5262 // SIZE SPECIALIZATIONS
5263 //
5264 //=================================================================================================
5265 
5266 //*************************************************************************************************
// Size trait specialization for TDVecTDMatMultExpr: it derives from Columns<MT>, so the
// size of the product expression is taken from the column count trait of the matrix operand.
5268 template< typename VT, typename MT >
5269 struct Size< TDVecTDMatMultExpr<VT,MT> > : public Columns<MT>
5270 {};
5272 //*************************************************************************************************
5273 
5274 
5275 
5276 
5277 //=================================================================================================
5278 //
5279 // ISALIGNED SPECIALIZATIONS
5280 //
5281 //=================================================================================================
5282 
5283 //*************************************************************************************************
// IsAligned trait specialization for TDVecTDMatMultExpr: the expression counts as aligned
// only if both the vector operand type VT and the matrix operand type MT are aligned.
5285 template< typename VT, typename MT >
5286 struct IsAligned< TDVecTDMatMultExpr<VT,MT> >
5287  : public IsTrue< And< IsAligned<VT>, IsAligned<MT> >::value >
5288 {};
5290 //*************************************************************************************************
5291 
5292 
5293 
5294 
5295 //=================================================================================================
5296 //
5297 // EXPRESSION TRAIT SPECIALIZATIONS
5298 //
5299 //=================================================================================================
5300 
5301 //*************************************************************************************************
// SubvectorExprTrait specialization for TDVecTDMatMultExpr: the nested Type is the
// MultExprTrait result for the subvector type of the (const) vector operand and the
// submatrix type of the (const) matrix operand, both taken with the same flag AF.
5303 template< typename VT, typename MT, bool AF >
5304 struct SubvectorExprTrait< TDVecTDMatMultExpr<VT,MT>, AF >
5305 {
5306  public:
5307  //**********************************************************************************************
5308  typedef typename MultExprTrait< typename SubvectorExprTrait<const VT,AF>::Type
5309  , typename SubmatrixExprTrait<const MT,AF>::Type >::Type Type;
5310  //**********************************************************************************************
5311 };
5313 //*************************************************************************************************
5314 
5315 } // namespace blaze
5316 
5317 #endif
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:340
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exceptionThis macro encapsulates the default way of...
Definition: Exception.h:187
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:148
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:89
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:7820
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:328
Header file for basic type definitions.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:208
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the IsSame and IsStrictlySame type traits.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix)
Returns the current number of rows of the matrix.
Definition: Matrix.h:308
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:205
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:90
size_t size() const
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:308
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:382
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:204
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
SelectType< evaluateMatrix, const MRT, MCT >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:217
TDVecTDMatMultExpr(const VT &vec, const MT &mat)
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:239
Header file for the VecScalarMultExpr base class.
VT::ResultType VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:124
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
MRT::ElementType MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:127
Header file for the IsComplexDouble type trait.
SelectType< evaluateVector, const VRT, VCT >::Type LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:214
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:261
Compile time type selection.The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:253
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:201
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:79
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDVecTDMatMultExpr.h:203
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exceptionThis macro encapsulates the default way of Bla...
Definition: Exception.h:331
Header file for the Columns type trait.
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:372
Header file for the IsBlasCompatible type trait.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
TDVecTDMatMultExpr< VT, MT > This
Type of this TDVecTDMatMultExpr instance.
Definition: TDVecTDMatMultExpr.h:199
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraint on the data type.
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:211
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
VRT::ElementType VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:126
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
ResultType::ElementType ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:202
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecTDMatMultExpr.h:295
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the SubmatrixExprTrait class template.
Intrinsic characteristics of data types.The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
MT::ResultType MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:125
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Base template for the MultTrait class.
Definition: MultTrait.h:138
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:383
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
Constraint on the data type.
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
Header file for the TVecMatMultExpr base class.
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:362
BLAZE_ALWAYS_INLINE int16_t sum(const simd_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:63
Header file for all intrinsic functionality.
Header file for BLAS general matrix/vector multiplication functions (gemv)
MultTrait< VRT, MRT >::Type ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:200
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:166
VT::CompositeType VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:128
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:79
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
Header file for the IsUpper type trait.
Header file for exception macros.
MT::CompositeType MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:129
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:352
EnableIf< IsDenseVector< VT1 > >::Type smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:189
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
LeftOperand leftOperand() const
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:318
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.