TDVecTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
51 #include <blaze/math/Exception.h>
57 #include <blaze/math/shims/Reset.h>
59 #include <blaze/math/SIMD.h>
83 #include <blaze/system/BLAS.h>
86 #include <blaze/util/Assert.h>
87 #include <blaze/util/Complex.h>
90 #include <blaze/util/DisableIf.h>
91 #include <blaze/util/EnableIf.h>
94 #include <blaze/util/mpl/And.h>
95 #include <blaze/util/mpl/If.h>
96 #include <blaze/util/Types.h>
105 
106 
107 namespace blaze {
108 
109 //=================================================================================================
110 //
111 // CLASS TDVECTDMATMULTEXPR
112 //
113 //=================================================================================================
114 
115 //*************************************************************************************************
122 template< typename VT // Type of the left-hand side dense vector
123  , typename MT > // Type of the right-hand side dense matrix
124 class TDVecTDMatMultExpr : public DenseVector< TDVecTDMatMultExpr<VT,MT>, true >
125  , private TVecMatMultExpr
126  , private Computation
127 {
128  private:
129  //**Type definitions****************************************************************************
136  //**********************************************************************************************
137 
138  //**********************************************************************************************
//! Compile-time flag for the vector operand: set if VT is a computation or if
//! RequiresEvaluation<VT> reports that it has to be evaluated.
140  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
141  //**********************************************************************************************
142 
143  //**********************************************************************************************
//! Compile-time flag for the matrix operand.
// NOTE(review): the tail of this expression is missing from this listing (the embedded
// numbering skips a line after it); restore it from the upstream header.
145  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
147  //**********************************************************************************************
148 
149  //**********************************************************************************************
151 
155  template< typename T1 >
156  struct UseSMPAssign {
157  enum : bool { value = ( evaluateVector || evaluateMatrix ) };
158  };
160  //**********************************************************************************************
161 
162  //**********************************************************************************************
164 
/*!\brief Helper trait deciding whether the BLAS kernel can be used for the given types.
//
// T1 is the target vector type, T2 the vector operand type and T3 the matrix operand type.
// The visible conditions require mutable data access for T1, constant data access for T2 and
// T3, a non-diagonal T3, SIMD-enabled types, BLAS compatible element types and identical
// element types across all three.
//
// NOTE(review): the line that opens the enumeration (and possibly leading conditions) is
// missing from this listing; the embedded numbering skips a line after the struct header.
// Restore it from the upstream header.
*/
167  template< typename T1, typename T2, typename T3 >
168  struct UseBlasKernel {
170  HasMutableDataAccess<T1>::value &&
171  HasConstDataAccess<T2>::value &&
172  HasConstDataAccess<T3>::value &&
173  !IsDiagonal<T3>::value &&
174  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
175  IsBLASCompatible< ElementType_<T1> >::value &&
176  IsBLASCompatible< ElementType_<T2> >::value &&
177  IsBLASCompatible< ElementType_<T3> >::value &&
178  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
179  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
180  };
182  //**********************************************************************************************
183 
184  //**********************************************************************************************
186 
190  template< typename T1, typename T2, typename T3 >
191  struct UseVectorizedDefaultKernel {
192  enum : bool { value = useOptimizedKernels &&
193  !IsDiagonal<T3>::value &&
194  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
195  AreSIMDCombinable< ElementType_<T1>
196  , ElementType_<T2>
197  , ElementType_<T3> >::value &&
198  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
199  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
200  };
202  //**********************************************************************************************
203 
204  public:
205  //**Type definitions****************************************************************************
// NOTE(review): ElementType and ResultType are declared outside the visible part of this
// listing (the embedded numbering starts with a gap here).
//! Type returned by the element access functions: a constant element value.
211  typedef const ElementType ReturnType;
//! Data type for composite expression templates: a constant result vector.
212  typedef const ResultType CompositeType;
213 
//! Storage type of the vector operand: held by value if VT is an expression, otherwise by
//! reference-to-const.
215  typedef If_< IsExpression<VT>, const VT, const VT& > LeftOperand;
216 
//! Storage type of the matrix operand: held by value if MT is an expression, otherwise by
//! reference-to-const.
218  typedef If_< IsExpression<MT>, const MT, const MT& > RightOperand;
219 
222 
225  //**********************************************************************************************
226 
227  //**Compilation flags***************************************************************************
//! Compilation switch for the expression template evaluation strategy (SIMD).
// NOTE(review): the tail of this expression is missing from this listing (the embedded
// numbering skips lines after the second operand line); restore it from the upstream header.
229  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
230  VT::simdEnabled && MT::simdEnabled &&
233 
//! Compilation switch for SMP assignment: set only if neither operand needs evaluation and
//! both operand types are themselves SMP assignable.
235  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
236  !evaluateMatrix && MT::smpAssignable };
237  //**********************************************************************************************
238 
239  //**SIMD properties*****************************************************************************
//! Number of elements per SIMD vector, as reported by SIMDTrait for the element type.
241  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
242  //**********************************************************************************************
243 
244  //**Constructor*********************************************************************************
250  explicit inline TDVecTDMatMultExpr( const VT& vec, const MT& mat ) noexcept
251  : vec_( vec ) // Left-hand side dense vector of the multiplication expression
252  , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
253  {
254  BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
255  }
256  //**********************************************************************************************
257 
258  //**Subscript operator**************************************************************************
/*!\brief Subscript operator for direct access to a single element of the product.
//
// \param index Access index. Asserted (BLAZE_INTERNAL_ASSERT) to be smaller than the number
//              of matrix columns.
// \return The product of the vector operand with column \a index of the matrix operand.
//
// For lower and upper matrices only the part of the column selected below is multiplied,
// by taking matching subvectors of the vector operand and of the matrix column.
*/
264  inline ReturnType operator[]( size_t index ) const {
265  BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
266 
     // NOTE(review): the condition guarding this first branch is missing from this listing
     // (the embedded numbering skips a line). The branch multiplies only the diagonal
     // element, so it presumably tests IsDiagonal<MT> -- confirm against upstream.
268  {
269  return vec_[index] * mat_(index,index);
270  }
     // Lower matrix: use only rows [begin, rows()); a strictly lower matrix also skips the
     // diagonal element. The 8UL cut-off keeps small indices on the general path below
     // (rationale not visible in this file).
271  else if( IsLower<MT>::value && ( index > 8UL ) )
272  {
273  const size_t begin( IsStrictlyLower<MT>::value ? index+1UL : index );
274  const size_t n ( mat_.rows() - begin );
275  return subvector( vec_, begin, n ) * subvector( column( mat_, index ), begin, n );
276  }
     // Upper matrix: use only the first n rows; a strictly upper matrix excludes the
     // diagonal element.
277  else if( IsUpper<MT>::value && ( index + 8UL < mat_.rows() ) )
278  {
279  const size_t n( IsStrictlyUpper<MT>::value ? index : index+1UL );
280  return subvector( vec_, 0UL, n ) * subvector( column( mat_, index ), 0UL, n );
281  }
     // General case: multiply the vector with the complete column.
282  else
283  {
284  return vec_ * column( mat_, index );
285  }
286  }
287  //**********************************************************************************************
288 
289  //**At function*********************************************************************************
296  inline ReturnType at( size_t index ) const {
297  if( index >= mat_.columns() ) {
298  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
299  }
300  return (*this)[index];
301  }
302  //**********************************************************************************************
303 
304  //**Size function*******************************************************************************
309  inline size_t size() const noexcept {
310  return mat_.columns();
311  }
312  //**********************************************************************************************
313 
314  //**Left operand access*************************************************************************
319  inline LeftOperand leftOperand() const noexcept {
320  return vec_;
321  }
322  //**********************************************************************************************
323 
324  //**Right operand access************************************************************************
329  inline RightOperand rightOperand() const noexcept {
330  return mat_;
331  }
332  //**********************************************************************************************
333 
334  //**********************************************************************************************
340  template< typename T >
341  inline bool canAlias( const T* alias ) const noexcept {
342  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
343  }
344  //**********************************************************************************************
345 
346  //**********************************************************************************************
352  template< typename T >
353  inline bool isAliased( const T* alias ) const noexcept {
354  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
355  }
356  //**********************************************************************************************
357 
358  //**********************************************************************************************
363  inline bool isAligned() const noexcept {
364  return vec_.isAligned() && mat_.isAligned();
365  }
366  //**********************************************************************************************
367 
368  //**********************************************************************************************
373  inline bool canSMPAssign() const noexcept {
374  return ( !BLAZE_BLAS_IS_PARALLEL ||
375  ( IsComputation<MT>::value && !evaluateMatrix ) ||
376  ( mat_.rows() * mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
377  ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
378  }
379  //**********************************************************************************************
380 
381  private:
382  //**Member variables****************************************************************************
383  LeftOperand vec_; //!< Left-hand side dense vector of the multiplication expression.
384  RightOperand mat_; //!< Right-hand side dense matrix of the multiplication expression.
385  //**********************************************************************************************
386 
387  //**Assignment to dense vectors*****************************************************************
/*!\brief Assignment of a transpose dense vector-transpose dense matrix multiplication to a
//        transpose dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// Both operands are evaluated serially into LT/RT objects and the result is produced by
// selectAssignKernel(). LT and RT are declared outside the visible part of this listing.
*/
400  template< typename VT1 > // Type of the target dense vector
401  friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
402  {
     // NOTE(review): the embedded numbering skips a line here, so a statement may be
     // missing from this listing -- verify against the upstream header.
404 
405  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
406 
     // A matrix without rows contributes nothing: the target is reset.
     // A matrix without columns means there is nothing to do.
407  if( rhs.mat_.rows() == 0UL ) {
408  reset( ~lhs );
409  return;
410  }
411  else if( rhs.mat_.columns() == 0UL ) {
412  return;
413  }
414 
415  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
416  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
417 
418  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
419  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
420  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
421  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
422 
423  TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
424  }
426  //**********************************************************************************************
427 
428  //**Assignment to dense vectors (kernel selection)**********************************************
439  template< typename VT1 // Type of the left-hand side target vector
440  , typename VT2 // Type of the left-hand side vector operand
441  , typename MT1 > // Type of the right-hand side matrix operand
442  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
443  {
444  if( ( IsDiagonal<MT1>::value ) ||
445  ( IsComputation<MT>::value && !evaluateMatrix ) ||
446  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
447  selectSmallAssignKernel( y, x, A );
448  else
449  selectBlasAssignKernel( y, x, A );
450  }
452  //**********************************************************************************************
453 
454  //**Default assignment to dense vectors*********************************************************
/*!\brief Default assignment of a transpose dense vector-transpose dense matrix
//        multiplication (\f$ \vec{y}^T=\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// The product expression of the two operands is handed to the target's assign() function.
*/
template< typename VT1    // Target vector type
        , typename VT2    // Vector operand type
        , typename MT1 >  // Matrix operand type
static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   y.assign( x * A );
}
476  //**********************************************************************************************
477 
478  //**Default assignment to dense vectors (small matrices)****************************************
492  template< typename VT1 // Type of the left-hand side target vector
493  , typename VT2 // Type of the left-hand side vector operand
494  , typename MT1 > // Type of the right-hand side matrix operand
495  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
496  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
497  {
498  selectDefaultAssignKernel( y, x, A );
499  }
501  //**********************************************************************************************
502 
503  //**Vectorized default assignment to dense vectors (small matrices)*****************************
/*!\brief Vectorized default assignment of a small transpose dense vector-transpose dense
//        matrix multiplication (\f$ \vec{y}^T=\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload is selected when UseVectorizedDefaultKernel is \a true for the given types.
// The columns of A are processed in blocks of 8, 4, 3 and 2 columns, followed by a single
// trailing column. For every block the rows are traversed once with SIMD loads, one SIMD
// accumulator per column is reduced with sum() into y, and a scalar loop adds the rows that
// do not fill a complete SIMD vector.
//
// NOTE(review): the accumulators are default-constructed and then added to, so this code
// assumes that a default-constructed SIMDType holds zeros -- confirm against the SIMD types.
*/
517  template< typename VT1 // Type of the left-hand side target vector
518  , typename VT2 // Type of the left-hand side vector operand
519  , typename MT1 > // Type of the right-hand side matrix operand
520  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
521  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
522  {
523  const size_t M( A.rows() );
524  const size_t N( A.columns() );
525 
     // A scalar remainder loop is required unless both operands are padded.
526  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
527 
528  size_t j( 0UL );
529 
     // Blocks of 8 columns
530  for( ; (j+8UL) <= N; j+=8UL )
531  {
     // Lower matrices: start at the first row that is used in column j, rounded down to a
     // multiple of SIMDSIZE by the bit mask.
532  const size_t ibegin( ( IsLower<MT1>::value )
533  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
534  :( 0UL ) );
     // Upper matrices: stop after the last row that is used in the last column of the block.
535  const size_t iend( ( IsUpper<MT1>::value )
536  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
537  :( M ) );
538  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
539 
     // End of the SIMD-processed row range: iend rounded down to a multiple of SIMDSIZE if a
     // remainder loop is used, iend itself otherwise.
540  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
541  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
542 
543  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
544  size_t i( ibegin );
545 
546  for( ; i<ipos; i+=SIMDSIZE ) {
547  const SIMDType x1( x.load(i) );
548  xmm1 = xmm1 + x1 * A.load(i,j );
549  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
550  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
551  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
552  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
553  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
554  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
555  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
556  }
557 
     // Horizontal reduction of the accumulators; these stores overwrite y[j..j+7].
558  y[j ] = sum( xmm1 );
559  y[j+1UL] = sum( xmm2 );
560  y[j+2UL] = sum( xmm3 );
561  y[j+3UL] = sum( xmm4 );
562  y[j+4UL] = sum( xmm5 );
563  y[j+5UL] = sum( xmm6 );
564  y[j+6UL] = sum( xmm7 );
565  y[j+7UL] = sum( xmm8 );
566 
     // Scalar handling of the rows in [ipos, iend)
567  for( ; remainder && i<iend; ++i ) {
568  y[j ] += x[i] * A(i,j );
569  y[j+1UL] += x[i] * A(i,j+1UL);
570  y[j+2UL] += x[i] * A(i,j+2UL);
571  y[j+3UL] += x[i] * A(i,j+3UL);
572  y[j+4UL] += x[i] * A(i,j+4UL);
573  y[j+5UL] += x[i] * A(i,j+5UL);
574  y[j+6UL] += x[i] * A(i,j+6UL);
575  y[j+7UL] += x[i] * A(i,j+7UL);
576  }
577  }
578 
     // Blocks of 4 columns (same scheme as above)
579  for( ; (j+4UL) <= N; j+=4UL )
580  {
581  const size_t ibegin( ( IsLower<MT1>::value )
582  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
583  :( 0UL ) );
584  const size_t iend( ( IsUpper<MT1>::value )
585  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
586  :( M ) );
587  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
588 
589  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
590  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
591 
592  SIMDType xmm1, xmm2, xmm3, xmm4;
593  size_t i( ibegin );
594 
595  for( ; i<ipos; i+=SIMDSIZE ) {
596  const SIMDType x1( x.load(i) );
597  xmm1 = xmm1 + x1 * A.load(i,j );
598  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
599  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
600  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
601  }
602 
603  y[j ] = sum( xmm1 );
604  y[j+1UL] = sum( xmm2 );
605  y[j+2UL] = sum( xmm3 );
606  y[j+3UL] = sum( xmm4 );
607 
608  for( ; remainder && i<iend; ++i ) {
609  y[j ] += x[i] * A(i,j );
610  y[j+1UL] += x[i] * A(i,j+1UL);
611  y[j+2UL] += x[i] * A(i,j+2UL);
612  y[j+3UL] += x[i] * A(i,j+3UL);
613  }
614  }
615 
     // Blocks of 3 columns
616  for( ; (j+3UL) <= N; j+=3UL )
617  {
618  const size_t ibegin( ( IsLower<MT1>::value )
619  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
620  :( 0UL ) );
621  const size_t iend( ( IsUpper<MT1>::value )
622  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
623  :( M ) );
624  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
625 
626  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
627  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
628 
629  SIMDType xmm1, xmm2, xmm3;
630  size_t i( ibegin );
631 
632  for( ; i<ipos; i+=SIMDSIZE ) {
633  const SIMDType x1( x.load(i) );
634  xmm1 = xmm1 + x1 * A.load(i,j );
635  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
636  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
637  }
638 
639  y[j ] = sum( xmm1 );
640  y[j+1UL] = sum( xmm2 );
641  y[j+2UL] = sum( xmm3 );
642 
643  for( ; remainder && i<iend; ++i ) {
644  y[j ] += x[i] * A(i,j );
645  y[j+1UL] += x[i] * A(i,j+1UL);
646  y[j+2UL] += x[i] * A(i,j+2UL);
647  }
648  }
649 
     // Blocks of 2 columns
650  for( ; (j+2UL) <= N; j+=2UL )
651  {
652  const size_t ibegin( ( IsLower<MT1>::value )
653  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
654  :( 0UL ) );
655  const size_t iend( ( IsUpper<MT1>::value )
656  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
657  :( M ) );
658  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
659 
660  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
661  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
662 
663  SIMDType xmm1, xmm2;
664  size_t i( ibegin );
665 
666  for( ; i<ipos; i+=SIMDSIZE ) {
667  const SIMDType x1( x.load(i) );
668  xmm1 = xmm1 + x1 * A.load(i,j );
669  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
670  }
671 
672  y[j ] = sum( xmm1 );
673  y[j+1UL] = sum( xmm2 );
674 
675  for( ; remainder && i<iend; ++i ) {
676  y[j ] += x[i] * A(i,j );
677  y[j+1UL] += x[i] * A(i,j+1UL);
678  }
679  }
680 
     // Single trailing column
681  if( j < N )
682  {
683  const size_t ibegin( ( IsLower<MT1>::value )
684  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
685  :( 0UL ) );
686  const size_t iend( ( IsUpper<MT1>::value )
687  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
688  :( M ) );
689  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
690 
691  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
692  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
693 
694  SIMDType xmm1;
695  size_t i( ibegin );
696 
697  for( ; i<ipos; i+=SIMDSIZE ) {
698  xmm1 = xmm1 + x.load(i) * A.load(i,j);
699  }
700 
701  y[j] = sum( xmm1 );
702 
703  for( ; remainder && i<iend; ++i ) {
704  y[j] += x[i] * A(i,j);
705  }
706  }
707  }
709  //**********************************************************************************************
710 
711  //**Default assignment to dense vectors (large matrices)****************************************
725  template< typename VT1 // Type of the left-hand side target vector
726  , typename VT2 // Type of the left-hand side vector operand
727  , typename MT1 > // Type of the right-hand side matrix operand
728  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
729  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
730  {
731  selectDefaultAssignKernel( y, x, A );
732  }
734  //**********************************************************************************************
735 
736  //**Vectorized default assignment to dense vectors (large matrices)*****************************
/*!\brief Vectorized default assignment of a large transpose dense vector-transpose dense
//        matrix multiplication (\f$ \vec{y}^T=\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload is selected when UseVectorizedDefaultKernel is \a true for the given types.
// The target is reset first and all contributions are then accumulated into it. The columns
// of A are processed in blocks of 8, 4 and 2 columns, followed by a single trailing column.
// Within a block the rows are traversed in steps of four SIMD vectors, then two, then one,
// and finally element by element for rows that do not fill a complete SIMD vector.
*/
750  template< typename VT1 // Type of the left-hand side target vector
751  , typename VT2 // Type of the left-hand side vector operand
752  , typename MT1 > // Type of the right-hand side matrix operand
753  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
754  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
755  {
756  const size_t M( A.rows() );
757  const size_t N( A.columns() );
758 
     // A scalar remainder loop is required unless both operands are padded.
759  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
760 
     // All loops below accumulate with +=, so the target has to start from its reset state.
761  reset( y );
762 
763  size_t j( 0UL );
764 
     // Blocks of 8 columns
765  for( ; (j+8UL) <= N; j+=8UL )
766  {
     // Lower matrices: start at the first row that is used in column j, rounded down to a
     // multiple of SIMDSIZE by the bit mask.
767  const size_t ibegin( ( IsLower<MT1>::value )
768  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
769  :( 0UL ) );
     // Upper matrices: stop after the last row that is used in the last column of the block.
770  const size_t iend( ( IsUpper<MT1>::value )
771  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
772  :( M ) );
773  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
774 
     // End of the SIMD-processed row range: iend rounded down to a multiple of SIMDSIZE if a
     // remainder loop is used, iend itself otherwise.
775  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
776  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
777 
778  size_t i( ibegin );
779 
     // Four SIMD vectors of rows per iteration
780  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
781  const size_t i1( i+SIMDSIZE );
782  const size_t i2( i+SIMDSIZE*2UL );
783  const size_t i3( i+SIMDSIZE*3UL );
784  const SIMDType x1( x.load(i ) );
785  const SIMDType x2( x.load(i1) );
786  const SIMDType x3( x.load(i2) );
787  const SIMDType x4( x.load(i3) );
788  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
789  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
790  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
791  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
792  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
793  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
794  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
795  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
796  }
797 
     // Two SIMD vectors of rows per iteration
798  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
799  const size_t i1( i+SIMDSIZE );
800  const SIMDType x1( x.load(i ) );
801  const SIMDType x2( x.load(i1) );
802  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
803  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
804  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
805  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
806  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
807  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
808  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
809  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
810  }
811 
     // One SIMD vector of rows per iteration
812  for( ; i<ipos; i+=SIMDSIZE ) {
813  const SIMDType x1( x.load(i) );
814  y[j ] += sum( x1 * A.load(i,j ) );
815  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
816  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
817  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
818  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
819  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
820  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
821  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
822  }
823 
     // Scalar handling of the rows in [ipos, iend)
824  for( ; remainder && i<iend; ++i ) {
825  y[j ] += x[i] * A(i,j );
826  y[j+1UL] += x[i] * A(i,j+1UL);
827  y[j+2UL] += x[i] * A(i,j+2UL);
828  y[j+3UL] += x[i] * A(i,j+3UL);
829  y[j+4UL] += x[i] * A(i,j+4UL);
830  y[j+5UL] += x[i] * A(i,j+5UL);
831  y[j+6UL] += x[i] * A(i,j+6UL);
832  y[j+7UL] += x[i] * A(i,j+7UL);
833  }
834  }
835 
     // Blocks of 4 columns (same scheme as above)
836  for( ; (j+4UL) <= N; j+=4UL )
837  {
838  const size_t ibegin( ( IsLower<MT1>::value )
839  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
840  :( 0UL ) );
841  const size_t iend( ( IsUpper<MT1>::value )
842  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
843  :( M ) );
844  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
845 
846  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
847  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
848 
849  size_t i( ibegin );
850 
851  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
852  const size_t i1( i+SIMDSIZE );
853  const size_t i2( i+SIMDSIZE*2UL );
854  const size_t i3( i+SIMDSIZE*3UL );
855  const SIMDType x1( x.load(i ) );
856  const SIMDType x2( x.load(i1) );
857  const SIMDType x3( x.load(i2) );
858  const SIMDType x4( x.load(i3) );
859  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
860  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
861  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
862  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
863  }
864 
865  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
866  const size_t i1( i+SIMDSIZE );
867  const SIMDType x1( x.load(i ) );
868  const SIMDType x2( x.load(i1) );
869  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
870  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
871  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
872  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
873  }
874 
875  for( ; i<ipos; i+=SIMDSIZE ) {
876  const SIMDType x1( x.load(i) );
877  y[j ] += sum( x1 * A.load(i,j ) );
878  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
879  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
880  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
881  }
882 
883  for( ; remainder && i<iend; ++i ) {
884  y[j ] += x[i] * A(i,j );
885  y[j+1UL] += x[i] * A(i,j+1UL);
886  y[j+2UL] += x[i] * A(i,j+2UL);
887  y[j+3UL] += x[i] * A(i,j+3UL);
888  }
889  }
890 
     // Blocks of 2 columns
891  for( ; (j+2UL) <= N; j+=2UL )
892  {
893  const size_t ibegin( ( IsLower<MT1>::value )
894  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
895  :( 0UL ) );
896  const size_t iend( ( IsUpper<MT1>::value )
897  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
898  :( M ) );
899  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
900 
901  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
902  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
903 
904  size_t i( ibegin );
905 
906  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
907  const size_t i1( i+SIMDSIZE );
908  const size_t i2( i+SIMDSIZE*2UL );
909  const size_t i3( i+SIMDSIZE*3UL );
910  const SIMDType x1( x.load(i ) );
911  const SIMDType x2( x.load(i1) );
912  const SIMDType x3( x.load(i2) );
913  const SIMDType x4( x.load(i3) );
914  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
915  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
916  }
917 
918  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
919  const size_t i1( i+SIMDSIZE );
920  const SIMDType x1( x.load(i ) );
921  const SIMDType x2( x.load(i1) );
922  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
923  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
924  }
925 
926  for( ; i<ipos; i+=SIMDSIZE ) {
927  const SIMDType x1( x.load(i) );
928  y[j ] += sum( x1 * A.load(i,j ) );
929  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
930  }
931 
932  for( ; remainder && i<iend; ++i ) {
933  y[j ] += x[i] * A(i,j );
934  y[j+1UL] += x[i] * A(i,j+1UL);
935  }
936  }
937 
     // Single trailing column
938  if( j < N )
939  {
940  const size_t ibegin( ( IsLower<MT1>::value )
941  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
942  :( 0UL ) );
943  const size_t iend( ( IsUpper<MT1>::value )
944  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
945  :( M ) );
946  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
947 
948  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
949  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
950 
951  size_t i( ibegin );
952 
953  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
954  const size_t i1( i+SIMDSIZE );
955  const size_t i2( i+SIMDSIZE*2UL );
956  const size_t i3( i+SIMDSIZE*3UL );
957  const SIMDType x1( x.load(i ) );
958  const SIMDType x2( x.load(i1) );
959  const SIMDType x3( x.load(i2) );
960  const SIMDType x4( x.load(i3) );
961  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
962  }
963 
964  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
965  const size_t i1( i+SIMDSIZE );
966  const SIMDType x1( x.load(i ) );
967  const SIMDType x2( x.load(i1) );
968  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
969  }
970 
971  for( ; i<ipos; i+=SIMDSIZE ) {
972  const SIMDType x1( x.load(i) );
973  y[j] += sum( x1 * A.load(i,j) );
974  }
975 
976  for( ; remainder && i<iend; ++i ) {
977  y[j] += x[i] * A(i,j);
978  }
979  }
980  }
982  //**********************************************************************************************
983 
984  //**BLAS-based assignment to dense vectors (default)********************************************
998  template< typename VT1 // Type of the left-hand side target vector
999  , typename VT2 // Type of the left-hand side vector operand
1000  , typename MT1 > // Type of the right-hand side matrix operand
1001  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
1002  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1003  {
1004  selectLargeAssignKernel( y, x, A );
1005  }
1007  //**********************************************************************************************
1008 
1009  //**BLAS-based assignment to dense vectors******************************************************
1010 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1011 
1024  template< typename VT1 // Type of the left-hand side target vector
1025  , typename VT2 // Type of the left-hand side vector operand
1026  , typename MT1 > // Type of the right-hand side matrix operand
1027  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
1028  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1029  {
1030  typedef ElementType_<VT1> ET;
1031 
1032  if( IsTriangular<MT1>::value ) {
1033  assign( y, x );
1034  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1035  }
1036  else {
1037  gemv( y, x, A, ET(1), ET(0) );
1038  }
1039  }
1041 #endif
1042  //**********************************************************************************************
1043 
1044  //**Assignment to sparse vectors****************************************************************
/*!\brief Assignment of a transpose dense vector-transpose dense matrix multiplication to a
//        transpose sparse vector.
//
// \param lhs The target left-hand side sparse vector.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// The expression is first evaluated serially into a temporary of type ResultType, which is
// then assigned to the sparse target.
*/
1057  template< typename VT1 > // Type of the target sparse vector
1058  friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1059  {
      // NOTE(review): the embedded numbering skips several lines in this body, so statements
      // may be missing from this listing -- verify against the upstream header.
1061 
1064  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
1065 
1066  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1067 
1068  const ResultType tmp( serial( rhs ) );
1069  assign( ~lhs, tmp );
1070  }
1072  //**********************************************************************************************
1073 
1074  //**Addition assignment to dense vectors********************************************************
1087  template< typename VT1 > // Type of the target dense vector
1088  friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1089  {
1091 
1092  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1093 
1094  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1095  return;
1096  }
1097 
1098  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1099  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1100 
1101  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1102  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1103  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1104  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1105 
1106  TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1107  }
1109  //**********************************************************************************************
1110 
1111  //**Addition assignment to dense vectors (kernel selection)*************************************
1122  template< typename VT1 // Type of the left-hand side target vector
1123  , typename VT2 // Type of the left-hand side vector operand
1124  , typename MT1 > // Type of the right-hand side matrix operand
1125  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1126  {
1127  if( ( IsDiagonal<MT1>::value ) ||
1128  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1129  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1130  selectSmallAddAssignKernel( y, x, A );
1131  else
1132  selectBlasAddAssignKernel( y, x, A );
1133  }
1135  //**********************************************************************************************
1136 
1137  //**Default addition assignment to dense vectors************************************************
/*!\brief Default (non-vectorized) addition assignment kernel.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// Builds the product expression of the already evaluated operands and passes it to the
// \c addAssign() member function of the target vector.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1 >  // Type of the right-hand side matrix operand
static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   y.addAssign( x * A );
}
1159  //**********************************************************************************************
1160 
1161  //**Default addition assignment to dense vectors (small matrices)*******************************
1175  template< typename VT1 // Type of the left-hand side target vector
1176  , typename VT2 // Type of the left-hand side vector operand
1177  , typename MT1 > // Type of the right-hand side matrix operand
1178  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1179  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1180  {
1181  selectDefaultAddAssignKernel( y, x, A );
1182  }
1184  //**********************************************************************************************
1185 
1186  //**Vectorized default addition assignment to dense vectors (small matrices)********************
1201  template< typename VT1 // Type of the left-hand side target vector
1202  , typename VT2 // Type of the left-hand side vector operand
1203  , typename MT1 > // Type of the right-hand side matrix operand
1204  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1205  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1206  {
1207  const size_t M( A.rows() );
1208  const size_t N( A.columns() );
1209 
1210  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1211 
1212  size_t j( 0UL );
1213 
1214  for( ; (j+8UL) <= N; j+=8UL )
1215  {
1216  const size_t ibegin( ( IsLower<MT1>::value )
1217  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1218  :( 0UL ) );
1219  const size_t iend( ( IsUpper<MT1>::value )
1220  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1221  :( M ) );
1222  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1223 
1224  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1225  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1226 
1227  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1228  size_t i( ibegin );
1229 
1230  for( ; i<ipos; i+=SIMDSIZE ) {
1231  const SIMDType x1( x.load(i) );
1232  xmm1 = xmm1 + x1 * A.load(i,j );
1233  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1234  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1235  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1236  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1237  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1238  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1239  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
1240  }
1241 
1242  y[j ] += sum( xmm1 );
1243  y[j+1UL] += sum( xmm2 );
1244  y[j+2UL] += sum( xmm3 );
1245  y[j+3UL] += sum( xmm4 );
1246  y[j+4UL] += sum( xmm5 );
1247  y[j+5UL] += sum( xmm6 );
1248  y[j+6UL] += sum( xmm7 );
1249  y[j+7UL] += sum( xmm8 );
1250 
1251  for( ; remainder && i<iend; ++i ) {
1252  y[j ] += x[i] * A(i,j );
1253  y[j+1UL] += x[i] * A(i,j+1UL);
1254  y[j+2UL] += x[i] * A(i,j+2UL);
1255  y[j+3UL] += x[i] * A(i,j+3UL);
1256  y[j+4UL] += x[i] * A(i,j+4UL);
1257  y[j+5UL] += x[i] * A(i,j+5UL);
1258  y[j+6UL] += x[i] * A(i,j+6UL);
1259  y[j+7UL] += x[i] * A(i,j+7UL);
1260  }
1261  }
1262 
1263  for( ; (j+4UL) <= N; j+=4UL )
1264  {
1265  const size_t ibegin( ( IsLower<MT1>::value )
1266  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1267  :( 0UL ) );
1268  const size_t iend( ( IsUpper<MT1>::value )
1269  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1270  :( M ) );
1271  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1272 
1273  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1274  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1275 
1276  SIMDType xmm1, xmm2, xmm3, xmm4;
1277  size_t i( ibegin );
1278 
1279  for( ; i<ipos; i+=SIMDSIZE ) {
1280  const SIMDType x1( x.load(i) );
1281  xmm1 = xmm1 + x1 * A.load(i,j );
1282  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1283  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1284  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1285  }
1286 
1287  y[j ] += sum( xmm1 );
1288  y[j+1UL] += sum( xmm2 );
1289  y[j+2UL] += sum( xmm3 );
1290  y[j+3UL] += sum( xmm4 );
1291 
1292  for( ; remainder && i<iend; ++i ) {
1293  y[j ] += x[i] * A(i,j );
1294  y[j+1UL] += x[i] * A(i,j+1UL);
1295  y[j+2UL] += x[i] * A(i,j+2UL);
1296  y[j+3UL] += x[i] * A(i,j+3UL);
1297  }
1298  }
1299 
1300  for( ; (j+3UL) <= N; j+=3UL )
1301  {
1302  const size_t ibegin( ( IsLower<MT1>::value )
1303  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1304  :( 0UL ) );
1305  const size_t iend( ( IsUpper<MT1>::value )
1306  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1307  :( M ) );
1308  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1309 
1310  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1311  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1312 
1313  SIMDType xmm1, xmm2, xmm3;
1314  size_t i( ibegin );
1315 
1316  for( ; i<ipos; i+=SIMDSIZE ) {
1317  const SIMDType x1( x.load(i) );
1318  xmm1 = xmm1 + x1 * A.load(i,j );
1319  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1320  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1321  }
1322 
1323  y[j ] += sum( xmm1 );
1324  y[j+1UL] += sum( xmm2 );
1325  y[j+2UL] += sum( xmm3 );
1326 
1327  for( ; remainder && i<iend; ++i ) {
1328  y[j ] += x[i] * A(i,j );
1329  y[j+1UL] += x[i] * A(i,j+1UL);
1330  y[j+2UL] += x[i] * A(i,j+2UL);
1331  }
1332  }
1333 
1334  for( ; (j+2UL) <= N; j+=2UL )
1335  {
1336  const size_t ibegin( ( IsLower<MT1>::value )
1337  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1338  :( 0UL ) );
1339  const size_t iend( ( IsUpper<MT1>::value )
1340  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1341  :( M ) );
1342  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1343 
1344  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1345  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1346 
1347  SIMDType xmm1, xmm2;
1348  size_t i( ibegin );
1349 
1350  for( ; i<ipos; i+=SIMDSIZE ) {
1351  const SIMDType x1( x.load(i) );
1352  xmm1 = xmm1 + x1 * A.load(i,j );
1353  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1354  }
1355 
1356  y[j ] += sum( xmm1 );
1357  y[j+1UL] += sum( xmm2 );
1358 
1359  for( ; remainder && i<iend; ++i ) {
1360  y[j ] += x[i] * A(i,j );
1361  y[j+1UL] += x[i] * A(i,j+1UL);
1362  }
1363  }
1364 
1365  if( j < N )
1366  {
1367  const size_t ibegin( ( IsLower<MT1>::value )
1368  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1369  :( 0UL ) );
1370  const size_t iend( ( IsUpper<MT1>::value )
1371  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1372  :( M ) );
1373  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1374 
1375  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1376  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1377 
1378  SIMDType xmm1;
1379  size_t i( ibegin );
1380 
1381  for( ; i<ipos; i+=SIMDSIZE ) {
1382  xmm1 = xmm1 + A.load(i,j) * x.load(i);
1383  }
1384 
1385  y[j] += sum( xmm1 );
1386 
1387  for( ; remainder && i<iend; ++i ) {
1388  y[j] += x[i] * A(i,j);
1389  }
1390  }
1391  }
1393  //**********************************************************************************************
1394 
1395  //**Default addition assignment to dense vectors (large matrices)*******************************
1409  template< typename VT1 // Type of the left-hand side target vector
1410  , typename VT2 // Type of the left-hand side vector operand
1411  , typename MT1 > // Type of the right-hand side matrix operand
1412  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1413  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1414  {
1415  selectDefaultAddAssignKernel( y, x, A );
1416  }
1418  //**********************************************************************************************
1419 
1420  //**Vectorized default addition assignment to dense vectors (large matrices)********************
1435  template< typename VT1 // Type of the left-hand side target vector
1436  , typename VT2 // Type of the left-hand side vector operand
1437  , typename MT1 > // Type of the right-hand side matrix operand
1438  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1439  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1440  {
1441  const size_t M( A.rows() );
1442  const size_t N( A.columns() );
1443 
1444  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1445 
1446  size_t j( 0UL );
1447 
1448  for( ; (j+8UL) <= N; j+=8UL )
1449  {
1450  const size_t ibegin( ( IsLower<MT1>::value )
1451  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1452  :( 0UL ) );
1453  const size_t iend( ( IsUpper<MT1>::value )
1454  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1455  :( M ) );
1456  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1457 
1458  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1459  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1460 
1461  size_t i( ibegin );
1462 
1463  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1464  const size_t i1( i+SIMDSIZE );
1465  const size_t i2( i+SIMDSIZE*2UL );
1466  const size_t i3( i+SIMDSIZE*3UL );
1467  const SIMDType x1( x.load(i ) );
1468  const SIMDType x2( x.load(i1) );
1469  const SIMDType x3( x.load(i2) );
1470  const SIMDType x4( x.load(i3) );
1471  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1472  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1473  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1474  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1475  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1476  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1477  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1478  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
1479  }
1480 
1481  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1482  const size_t i1( i+SIMDSIZE );
1483  const SIMDType x1( x.load(i ) );
1484  const SIMDType x2( x.load(i1) );
1485  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1486  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1487  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1488  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1489  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1490  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1491  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1492  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
1493  }
1494 
1495  for( ; i<ipos; i+=SIMDSIZE ) {
1496  const SIMDType x1( x.load(i) );
1497  y[j ] += sum( x1 * A.load(i,j ) );
1498  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1499  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1500  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1501  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
1502  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
1503  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
1504  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
1505  }
1506 
1507  for( ; remainder && i<iend; ++i ) {
1508  y[j ] += x[i] * A(i,j );
1509  y[j+1UL] += x[i] * A(i,j+1UL);
1510  y[j+2UL] += x[i] * A(i,j+2UL);
1511  y[j+3UL] += x[i] * A(i,j+3UL);
1512  y[j+4UL] += x[i] * A(i,j+4UL);
1513  y[j+5UL] += x[i] * A(i,j+5UL);
1514  y[j+6UL] += x[i] * A(i,j+6UL);
1515  y[j+7UL] += x[i] * A(i,j+7UL);
1516  }
1517  }
1518 
1519  for( ; (j+4UL) <= N; j+=4UL )
1520  {
1521  const size_t ibegin( ( IsLower<MT1>::value )
1522  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1523  :( 0UL ) );
1524  const size_t iend( ( IsUpper<MT1>::value )
1525  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1526  :( M ) );
1527  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1528 
1529  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1530  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1531 
1532  size_t i( ibegin );
1533 
1534  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1535  const size_t i1( i+SIMDSIZE );
1536  const size_t i2( i+SIMDSIZE*2UL );
1537  const size_t i3( i+SIMDSIZE*3UL );
1538  const SIMDType x1( x.load(i ) );
1539  const SIMDType x2( x.load(i1) );
1540  const SIMDType x3( x.load(i2) );
1541  const SIMDType x4( x.load(i3) );
1542  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1543  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1544  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1545  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1546  }
1547 
1548  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1549  const size_t i1( i+SIMDSIZE );
1550  const SIMDType x1( x.load(i ) );
1551  const SIMDType x2( x.load(i1) );
1552  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1553  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1554  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1555  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1556  }
1557 
1558  for( ; i<ipos; i+=SIMDSIZE ) {
1559  const SIMDType x1( x.load(i) );
1560  y[j ] += sum( x1 * A.load(i,j ) );
1561  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1562  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1563  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1564  }
1565 
1566  for( ; remainder && i<iend; ++i ) {
1567  y[j ] += x[i] * A(i,j );
1568  y[j+1UL] += x[i] * A(i,j+1UL);
1569  y[j+2UL] += x[i] * A(i,j+2UL);
1570  y[j+3UL] += x[i] * A(i,j+3UL);
1571  }
1572  }
1573 
1574  for( ; (j+2UL) <= N; j+=2UL )
1575  {
1576  const size_t ibegin( ( IsLower<MT1>::value )
1577  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1578  :( 0UL ) );
1579  const size_t iend( ( IsUpper<MT1>::value )
1580  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1581  :( M ) );
1582  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1583 
1584  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1585  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1586 
1587  size_t i( ibegin );
1588 
1589  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1590  const size_t i1( i+SIMDSIZE );
1591  const size_t i2( i+SIMDSIZE*2UL );
1592  const size_t i3( i+SIMDSIZE*3UL );
1593  const SIMDType x1( x.load(i ) );
1594  const SIMDType x2( x.load(i1) );
1595  const SIMDType x3( x.load(i2) );
1596  const SIMDType x4( x.load(i3) );
1597  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1598  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1599  }
1600 
1601  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1602  const size_t i1( i+SIMDSIZE );
1603  const SIMDType x1( x.load(i ) );
1604  const SIMDType x2( x.load(i1) );
1605  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1606  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1607  }
1608 
1609  for( ; i<ipos; i+=SIMDSIZE ) {
1610  const SIMDType x1( x.load(i) );
1611  y[j ] += sum( x1 * A.load(i,j ) );
1612  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1613  }
1614 
1615  for( ; remainder && i<iend; ++i ) {
1616  y[j ] += x[i] * A(i,j );
1617  y[j+1UL] += x[i] * A(i,j+1UL);
1618  }
1619  }
1620 
1621  if( j < N )
1622  {
1623  const size_t ibegin( ( IsLower<MT1>::value )
1624  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1625  :( 0UL ) );
1626  const size_t iend( ( IsUpper<MT1>::value )
1627  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1628  :( M ) );
1629  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1630 
1631  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1632  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1633 
1634  size_t i( ibegin );
1635 
1636  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1637  const size_t i1( i+SIMDSIZE );
1638  const size_t i2( i+SIMDSIZE*2UL );
1639  const size_t i3( i+SIMDSIZE*3UL );
1640  const SIMDType x1( x.load(i ) );
1641  const SIMDType x2( x.load(i1) );
1642  const SIMDType x3( x.load(i2) );
1643  const SIMDType x4( x.load(i3) );
1644  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1645  }
1646 
1647  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1648  const size_t i1( i+SIMDSIZE );
1649  const SIMDType x1( x.load(i ) );
1650  const SIMDType x2( x.load(i1) );
1651  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1652  }
1653 
1654  for( ; i<ipos; i+=SIMDSIZE ) {
1655  const SIMDType x1( x.load(i) );
1656  y[j] += sum( x1 * A.load(i,j) );
1657  }
1658 
1659  for( ; remainder && i<iend; ++i ) {
1660  y[j] += x[i] * A(i,j);
1661  }
1662  }
1663  }
1665  //**********************************************************************************************
1666 
1667  //**BLAS-based addition assignment to dense vectors (default)***********************************
1681  template< typename VT1 // Type of the left-hand side target vector
1682  , typename VT2 // Type of the left-hand side vector operand
1683  , typename MT1 > // Type of the right-hand side matrix operand
1684  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
1685  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1686  {
1687  selectLargeAddAssignKernel( y, x, A );
1688  }
1690  //**********************************************************************************************
1691 
1692  //**BLAS-based addition assignment to dense vectors*********************************************
1693 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1694 
1707  template< typename VT1 // Type of the left-hand side target vector
1708  , typename VT2 // Type of the left-hand side vector operand
1709  , typename MT1 > // Type of the right-hand side matrix operand
1710  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
1711  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1712  {
1713  typedef ElementType_<VT1> ET;
1714 
1715  if( IsTriangular<MT1>::value ) {
1716  ResultType_<VT1> tmp( serial( x ) );
1717  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1718  addAssign( y, tmp );
1719  }
1720  else {
1721  gemv( y, x, A, ET(1), ET(1) );
1722  }
1723  }
1725 #endif
1726  //**********************************************************************************************
1727 
1728  //**Addition assignment to sparse vectors*******************************************************
1729  // No special implementation for the addition assignment to sparse vectors.
1730  //**********************************************************************************************
1731 
1732  //**Subtraction assignment to dense vectors*****************************************************
1745  template< typename VT1 > // Type of the target dense vector
1746  friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1747  {
1749 
1750  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1751 
1752  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1753  return;
1754  }
1755 
1756  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1757  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1758 
1759  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1760  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1761  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1762  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1763 
1764  TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1765  }
1767  //**********************************************************************************************
1768 
1769  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1780  template< typename VT1 // Type of the left-hand side target vector
1781  , typename VT2 // Type of the left-hand side vector operand
1782  , typename MT1 > // Type of the right-hand side matrix operand
1783  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1784  {
1785  if( ( IsDiagonal<MT1>::value ) ||
1786  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1787  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1788  selectSmallSubAssignKernel( y, x, A );
1789  else
1790  selectBlasSubAssignKernel( y, x, A );
1791  }
1793  //**********************************************************************************************
1794 
1795  //**Default subtraction assignment to dense vectors*********************************************
/*!\brief Default (non-vectorized) subtraction assignment kernel.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// Builds the product expression of the already evaluated operands and passes it to the
// \c subAssign() member function of the target vector.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1 >  // Type of the right-hand side matrix operand
static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   y.subAssign( x * A );
}
1817  //**********************************************************************************************
1818 
1819  //**Default subtraction assignment to dense vectors (small matrices)****************************
1833  template< typename VT1 // Type of the left-hand side target vector
1834  , typename VT2 // Type of the left-hand side vector operand
1835  , typename MT1 > // Type of the right-hand side matrix operand
1836  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1837  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1838  {
1839  selectDefaultSubAssignKernel( y, x, A );
1840  }
1842  //**********************************************************************************************
1843 
1844  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
1859  template< typename VT1 // Type of the left-hand side target vector
1860  , typename VT2 // Type of the left-hand side vector operand
1861  , typename MT1 > // Type of the right-hand side matrix operand
1862  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1863  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1864  {
1865  const size_t M( A.rows() );
1866  const size_t N( A.columns() );
1867 
1868  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1869 
1870  size_t j( 0UL );
1871 
1872  for( ; (j+8UL) <= N; j+=8UL )
1873  {
1874  const size_t ibegin( ( IsLower<MT1>::value )
1875  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1876  :( 0UL ) );
1877  const size_t iend( ( IsUpper<MT1>::value )
1878  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1879  :( M ) );
1880  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1881 
1882  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1883  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1884 
1885  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1886  size_t i( ibegin );
1887 
1888  for( ; i<ipos; i+=SIMDSIZE ) {
1889  const SIMDType x1( x.load(i) );
1890  xmm1 = xmm1 + x1 * A.load(i,j );
1891  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1892  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1893  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1894  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1895  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1896  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1897  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
1898  }
1899 
1900  y[j ] -= sum( xmm1 );
1901  y[j+1UL] -= sum( xmm2 );
1902  y[j+2UL] -= sum( xmm3 );
1903  y[j+3UL] -= sum( xmm4 );
1904  y[j+4UL] -= sum( xmm5 );
1905  y[j+5UL] -= sum( xmm6 );
1906  y[j+6UL] -= sum( xmm7 );
1907  y[j+7UL] -= sum( xmm8 );
1908 
1909  for( ; remainder && i<iend; ++i ) {
1910  y[j ] -= x[i] * A(i,j );
1911  y[j+1UL] -= x[i] * A(i,j+1UL);
1912  y[j+2UL] -= x[i] * A(i,j+2UL);
1913  y[j+3UL] -= x[i] * A(i,j+3UL);
1914  y[j+4UL] -= x[i] * A(i,j+4UL);
1915  y[j+5UL] -= x[i] * A(i,j+5UL);
1916  y[j+6UL] -= x[i] * A(i,j+6UL);
1917  y[j+7UL] -= x[i] * A(i,j+7UL);
1918  }
1919  }
1920 
1921  for( ; (j+4UL) <= N; j+=4UL )
1922  {
1923  const size_t ibegin( ( IsLower<MT1>::value )
1924  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1925  :( 0UL ) );
1926  const size_t iend( ( IsUpper<MT1>::value )
1927  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1928  :( M ) );
1929  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1930 
1931  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1932  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1933 
1934  SIMDType xmm1, xmm2, xmm3, xmm4;
1935  size_t i( ibegin );
1936 
1937  for( ; i<ipos; i+=SIMDSIZE ) {
1938  const SIMDType x1( x.load(i) );
1939  xmm1 = xmm1 + x1 * A.load(i,j );
1940  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1941  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1942  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1943  }
1944 
1945  y[j ] -= sum( xmm1 );
1946  y[j+1UL] -= sum( xmm2 );
1947  y[j+2UL] -= sum( xmm3 );
1948  y[j+3UL] -= sum( xmm4 );
1949 
1950  for( ; remainder && i<iend; ++i ) {
1951  y[j ] -= x[i] * A(i,j );
1952  y[j+1UL] -= x[i] * A(i,j+1UL);
1953  y[j+2UL] -= x[i] * A(i,j+2UL);
1954  y[j+3UL] -= x[i] * A(i,j+3UL);
1955  }
1956  }
1957 
1958  for( ; (j+3UL) <= N; j+=3UL )
1959  {
1960  const size_t ibegin( ( IsLower<MT1>::value )
1961  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1962  :( 0UL ) );
1963  const size_t iend( ( IsUpper<MT1>::value )
1964  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1965  :( M ) );
1966  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1967 
1968  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1969  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1970 
1971  SIMDType xmm1, xmm2, xmm3;
1972  size_t i( ibegin );
1973 
1974  for( ; i<ipos; i+=SIMDSIZE ) {
1975  const SIMDType x1( x.load(i) );
1976  xmm1 = xmm1 + x1 * A.load(i,j );
1977  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1978  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1979  }
1980 
1981  y[j ] -= sum( xmm1 );
1982  y[j+1UL] -= sum( xmm2 );
1983  y[j+2UL] -= sum( xmm3 );
1984 
1985  for( ; remainder && i<iend; ++i ) {
1986  y[j ] -= x[i] * A(i,j );
1987  y[j+1UL] -= x[i] * A(i,j+1UL);
1988  y[j+2UL] -= x[i] * A(i,j+2UL);
1989  }
1990  }
1991 
1992  for( ; (j+2UL) <= N; j+=2UL )
1993  {
1994  const size_t ibegin( ( IsLower<MT1>::value )
1995  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1996  :( 0UL ) );
1997  const size_t iend( ( IsUpper<MT1>::value )
1998  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1999  :( M ) );
2000  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2001 
2002  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2003  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2004 
2005  SIMDType xmm1, xmm2;
2006  size_t i( ibegin );
2007 
2008  for( ; i<ipos; i+=SIMDSIZE ) {
2009  const SIMDType x1( x.load(i) );
2010  xmm1 = xmm1 + x1 * A.load(i,j );
2011  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2012  }
2013 
2014  y[j ] -= sum( xmm1 );
2015  y[j+1UL] -= sum( xmm2 );
2016 
2017  for( ; remainder && i<iend; ++i ) {
2018  y[j ] -= x[i] * A(i,j );
2019  y[j+1UL] -= x[i] * A(i,j+1UL);
2020  }
2021  }
2022 
2023  if( j < N )
2024  {
2025  const size_t ibegin( ( IsLower<MT1>::value )
2026  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2027  :( 0UL ) );
2028  const size_t iend( ( IsUpper<MT1>::value )
2029  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2030  :( M ) );
2031  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2032 
2033  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2034  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2035 
2036  SIMDType xmm1;
2037  size_t i( ibegin );
2038 
2039  for( ; i<ipos; i+=SIMDSIZE ) {
2040  xmm1 = xmm1 + A.load(i,j) * x.load(i);
2041  }
2042 
2043  y[j] -= sum( xmm1 );
2044 
2045  for( ; remainder && i<iend; ++i ) {
2046  y[j] -= x[i] * A(i,j);
2047  }
2048  }
2049  }
2051  //**********************************************************************************************
2052 
2053  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Large-matrix subtraction-assignment kernel, non-vectorized overload.
// Selected (via DisableIf_) when UseVectorizedDefaultKernel<VT1,VT2,MT1> is
// not satisfied; it simply forwards to selectDefaultSubAssignKernel().
//
// y : target vector that is updated in place
// x : left-hand side vector operand
// A : right-hand side matrix operand
2067  template< typename VT1 // Type of the left-hand side target vector
2068  , typename VT2 // Type of the left-hand side vector operand
2069  , typename MT1 > // Type of the right-hand side matrix operand
2070  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
2071  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2072  {
2073  selectDefaultSubAssignKernel( y, x, A );
2074  }
2076  //**********************************************************************************************
2077 
2078  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Large-matrix subtraction-assignment kernel, vectorized overload.
// Selected (via EnableIf_) when UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
// For every column j of A it subtracts the dot product of x with that column
// from y[j].
//
// Structure of the kernel:
//  - Columns are processed in blocks of 8, then 4, then 2, then a final single
//    column.
//  - Within each column block the row range [ibegin,iend) is narrowed for
//    lower/upper (and strictly lower/upper) matrices.
//  - Rows are consumed with SIMD loads, unrolled by 4 and 2 SIMD vectors and
//    then one SIMD vector at a time; a scalar loop handles the remaining rows
//    when 'remainder' is set.
//
// y : target vector that is updated in place
// x : left-hand side vector operand
// A : right-hand side matrix operand
2093  template< typename VT1 // Type of the left-hand side target vector
2094  , typename VT2 // Type of the left-hand side vector operand
2095  , typename MT1 > // Type of the right-hand side matrix operand
2096  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
2097  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2098  {
2099  const size_t M( A.rows() );
2100  const size_t N( A.columns() );
2101 
// A scalar tail loop is required unless both the vector and the matrix are padded.
2102  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
2103 
2104  size_t j( 0UL );
2105 
// --- Blocks of 8 columns ---
2106  for( ; (j+8UL) <= N; j+=8UL )
2107  {
// First row to visit: for lower matrices start at the (strict) diagonal of the
// first column of the block, masked with size_t(-SIMDSIZE) (rounds down to a
// multiple of SIMDSIZE, assuming SIMDSIZE is a power of two).
2108  const size_t ibegin( ( IsLower<MT1>::value )
2109  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2110  :( 0UL ) );
// One-past-last row to visit: for upper matrices stop at the (strict) diagonal
// of the last column of the block.
2111  const size_t iend( ( IsUpper<MT1>::value )
2112  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
2113  :( M ) );
2114  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2115 
// End of the SIMD-processed range; truncated to a multiple of SIMDSIZE when a
// scalar remainder loop is needed (checked by the assertion below).
2116  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2117  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2118 
2119  size_t i( ibegin );
2120 
// Four SIMD vectors of rows per iteration.
2121  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2122  const size_t i1( i+SIMDSIZE );
2123  const size_t i2( i+SIMDSIZE*2UL );
2124  const size_t i3( i+SIMDSIZE*3UL );
2125  const SIMDType x1( x.load(i ) );
2126  const SIMDType x2( x.load(i1) );
2127  const SIMDType x3( x.load(i2) );
2128  const SIMDType x4( x.load(i3) );
2129  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2130  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2131  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2132  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2133  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2134  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2135  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2136  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
2137  }
2138 
// Two SIMD vectors of rows per iteration.
2139  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2140  const size_t i1( i+SIMDSIZE );
2141  const SIMDType x1( x.load(i ) );
2142  const SIMDType x2( x.load(i1) );
2143  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2144  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2145  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2146  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2147  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2148  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2149  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2150  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
2151  }
2152 
// One SIMD vector of rows per iteration.
2153  for( ; i<ipos; i+=SIMDSIZE ) {
2154  const SIMDType x1( x.load(i) );
2155  y[j ] -= sum( x1 * A.load(i,j ) );
2156  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2157  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2158  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2159  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) );
2160  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) );
2161  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) );
2162  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) );
2163  }
2164 
// Scalar tail for the rows in [ipos,iend); only runs when 'remainder' is set.
2165  for( ; remainder && i<iend; ++i ) {
2166  y[j ] -= x[i] * A(i,j );
2167  y[j+1UL] -= x[i] * A(i,j+1UL);
2168  y[j+2UL] -= x[i] * A(i,j+2UL);
2169  y[j+3UL] -= x[i] * A(i,j+3UL);
2170  y[j+4UL] -= x[i] * A(i,j+4UL);
2171  y[j+5UL] -= x[i] * A(i,j+5UL);
2172  y[j+6UL] -= x[i] * A(i,j+6UL);
2173  y[j+7UL] -= x[i] * A(i,j+7UL);
2174  }
2175  }
2176 
// --- Blocks of 4 columns (same scheme as above) ---
2177  for( ; (j+4UL) <= N; j+=4UL )
2178  {
2179  const size_t ibegin( ( IsLower<MT1>::value )
2180  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2181  :( 0UL ) );
2182  const size_t iend( ( IsUpper<MT1>::value )
2183  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
2184  :( M ) );
2185  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2186 
2187  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2188  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2189 
2190  size_t i( ibegin );
2191 
2192  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2193  const size_t i1( i+SIMDSIZE );
2194  const size_t i2( i+SIMDSIZE*2UL );
2195  const size_t i3( i+SIMDSIZE*3UL );
2196  const SIMDType x1( x.load(i ) );
2197  const SIMDType x2( x.load(i1) );
2198  const SIMDType x3( x.load(i2) );
2199  const SIMDType x4( x.load(i3) );
2200  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2201  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2202  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2203  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2204  }
2205 
2206  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2207  const size_t i1( i+SIMDSIZE );
2208  const SIMDType x1( x.load(i ) );
2209  const SIMDType x2( x.load(i1) );
2210  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2211  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2212  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2213  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2214  }
2215 
2216  for( ; i<ipos; i+=SIMDSIZE ) {
2217  const SIMDType x1( x.load(i) );
2218  y[j ] -= sum( x1 * A.load(i,j ) );
2219  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2220  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2221  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2222  }
2223 
2224  for( ; remainder && i<iend; ++i ) {
2225  y[j ] -= x[i] * A(i,j );
2226  y[j+1UL] -= x[i] * A(i,j+1UL);
2227  y[j+2UL] -= x[i] * A(i,j+2UL);
2228  y[j+3UL] -= x[i] * A(i,j+3UL);
2229  }
2230  }
2231 
// --- Blocks of 2 columns (same scheme as above) ---
2232  for( ; (j+2UL) <= N; j+=2UL )
2233  {
2234  const size_t ibegin( ( IsLower<MT1>::value )
2235  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2236  :( 0UL ) );
2237  const size_t iend( ( IsUpper<MT1>::value )
2238  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2239  :( M ) );
2240  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2241 
2242  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2243  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2244 
2245  size_t i( ibegin );
2246 
2247  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2248  const size_t i1( i+SIMDSIZE );
2249  const size_t i2( i+SIMDSIZE*2UL );
2250  const size_t i3( i+SIMDSIZE*3UL );
2251  const SIMDType x1( x.load(i ) );
2252  const SIMDType x2( x.load(i1) );
2253  const SIMDType x3( x.load(i2) );
2254  const SIMDType x4( x.load(i3) );
2255  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2256  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2257  }
2258 
2259  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2260  const size_t i1( i+SIMDSIZE );
2261  const SIMDType x1( x.load(i ) );
2262  const SIMDType x2( x.load(i1) );
2263  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2264  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2265  }
2266 
2267  for( ; i<ipos; i+=SIMDSIZE ) {
2268  const SIMDType x1( x.load(i) );
2269  y[j ] -= sum( x1 * A.load(i,j ) );
2270  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2271  }
2272 
2273  for( ; remainder && i<iend; ++i ) {
2274  y[j ] -= x[i] * A(i,j );
2275  y[j+1UL] -= x[i] * A(i,j+1UL);
2276  }
2277  }
2278 
// --- Final single column, if one is left over ---
2279  if( j < N )
2280  {
2281  const size_t ibegin( ( IsLower<MT1>::value )
2282  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2283  :( 0UL ) );
2284  const size_t iend( ( IsUpper<MT1>::value )
2285  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2286  :( M ) );
2287  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2288 
2289  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2290  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2291 
2292  size_t i( ibegin );
2293 
2294  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2295  const size_t i1( i+SIMDSIZE );
2296  const size_t i2( i+SIMDSIZE*2UL );
2297  const size_t i3( i+SIMDSIZE*3UL );
2298  const SIMDType x1( x.load(i ) );
2299  const SIMDType x2( x.load(i1) );
2300  const SIMDType x3( x.load(i2) );
2301  const SIMDType x4( x.load(i3) );
2302  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
2303  }
2304 
2305  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2306  const size_t i1( i+SIMDSIZE );
2307  const SIMDType x1( x.load(i ) );
2308  const SIMDType x2( x.load(i1) );
2309  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
2310  }
2311 
2312  for( ; i<ipos; i+=SIMDSIZE ) {
2313  const SIMDType x1( x.load(i) );
2314  y[j] -= sum( x1 * A.load(i,j) );
2315  }
2316 
2317  for( ; remainder && i<iend; ++i ) {
2318  y[j] -= x[i] * A(i,j);
2319  }
2320  }
2321  }
2323  //**********************************************************************************************
2324 
2325  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// BLAS subtraction-assignment kernel, fallback overload.
// Selected (via DisableIf_) when UseBlasKernel<VT1,VT2,MT1> is not satisfied;
// it forwards to selectLargeSubAssignKernel().
//
// y : target vector that is updated in place
// x : left-hand side vector operand
// A : right-hand side matrix operand
2339  template< typename VT1 // Type of the left-hand side target vector
2340  , typename VT2 // Type of the left-hand side vector operand
2341  , typename MT1 > // Type of the right-hand side matrix operand
2342  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
2343  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2344  {
2345  selectLargeSubAssignKernel( y, x, A );
2346  }
2348  //**********************************************************************************************
2349 
2350  //**BLAS-based subtraction assignment to dense vectors******************************************
2351 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2352 
2365  template< typename VT1 // Type of the left-hand side target vector
2366  , typename VT2 // Type of the left-hand side vector operand
2367  , typename MT1 > // Type of the right-hand side matrix operand
2368  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
2369  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2370  {
2371  typedef ElementType_<VT1> ET;
2372 
2373  if( IsTriangular<MT1>::value ) {
2374  ResultType_<VT1> tmp( serial( x ) );
2375  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2376  subAssign( y, tmp );
2377  }
2378  else {
2379  gemv( y, x, A, ET(-1), ET(1) );
2380  }
2381  }
2383 #endif
2384  //**********************************************************************************************
2385 
2386  //**Subtraction assignment to sparse vectors****************************************************
2387  // No special implementation for the subtraction assignment to sparse vectors.
2388  //**********************************************************************************************
2389 
2390  //**Multiplication assignment to dense vectors**************************************************
2403  template< typename VT1 > // Type of the target dense vector
2404  friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2405  {
2407 
2410  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2411 
2412  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2413 
2414  const ResultType tmp( serial( rhs ) );
2415  multAssign( ~lhs, tmp );
2416  }
2418  //**********************************************************************************************
2419 
2420  //**Multiplication assignment to sparse vectors*************************************************
2421  // No special implementation for the multiplication assignment to sparse vectors.
2422  //**********************************************************************************************
2423 
2424  //**Division assignment to dense vectors********************************************************
2437  template< typename VT1 > // Type of the target dense vector
2438  friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2439  {
2441 
2444  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2445 
2446  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2447 
2448  const ResultType tmp( serial( rhs ) );
2449  divAssign( ~lhs, tmp );
2450  }
2452  //**********************************************************************************************
2453 
2454  //**Division assignment to sparse vectors*******************************************************
2455  // No special implementation for the division assignment to sparse vectors.
2456  //**********************************************************************************************
2457 
2458  //**SMP assignment to dense vectors*************************************************************
2473  template< typename VT1 > // Type of the target dense vector
2474  friend inline EnableIf_< UseSMPAssign<VT1> >
2475  smpAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2476  {
2478 
2479  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2480 
2481  if( rhs.mat_.rows() == 0UL ) {
2482  reset( ~lhs );
2483  return;
2484  }
2485  else if( rhs.mat_.columns() == 0UL ) {
2486  return;
2487  }
2488 
2489  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2490  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2491 
2492  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2493  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2494  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2495  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2496 
2497  smpAssign( ~lhs, x * A );
2498  }
2500  //**********************************************************************************************
2501 
2502  //**SMP assignment to sparse vectors************************************************************
2517  template< typename VT1 > // Type of the target sparse vector
2518  friend inline EnableIf_< UseSMPAssign<VT1> >
2519  smpAssign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2520  {
2522 
2525  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2526 
2527  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2528 
2529  const ResultType tmp( rhs );
2530  smpAssign( ~lhs, tmp );
2531  }
2533  //**********************************************************************************************
2534 
2535  //**SMP addition assignment to dense vectors****************************************************
2550  template< typename VT1 > // Type of the target dense vector
2551  friend inline EnableIf_< UseSMPAssign<VT1> >
2552  smpAddAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2553  {
2555 
2556  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2557 
2558  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2559  return;
2560  }
2561 
2562  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2563  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2564 
2565  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2566  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2567  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2568  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2569 
2570  smpAddAssign( ~lhs, x * A );
2571  }
2573  //**********************************************************************************************
2574 
2575  //**SMP addition assignment to sparse vectors***************************************************
2576  // No special implementation for the SMP addition assignment to sparse vectors.
2577  //**********************************************************************************************
2578 
2579  //**SMP subtraction assignment to dense vectors*************************************************
2594  template< typename VT1 > // Type of the target dense vector
2595  friend inline EnableIf_< UseSMPAssign<VT1> >
2596  smpSubAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2597  {
2599 
2600  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2601 
2602  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2603  return;
2604  }
2605 
2606  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2607  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2608 
2609  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2610  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2611  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2612  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2613 
2614  smpSubAssign( ~lhs, x * A );
2615  }
2617  //**********************************************************************************************
2618 
2619  //**SMP subtraction assignment to sparse vectors************************************************
2620  // No special implementation for the SMP subtraction assignment to sparse vectors.
2621  //**********************************************************************************************
2622 
2623  //**SMP multiplication assignment to dense vectors**********************************************
2638  template< typename VT1 > // Type of the target dense vector
2639  friend inline EnableIf_< UseSMPAssign<VT1> >
2640  smpMultAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2641  {
2643 
2646  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2647 
2648  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2649 
2650  const ResultType tmp( rhs );
2651  smpMultAssign( ~lhs, tmp );
2652  }
2654  //**********************************************************************************************
2655 
2656  //**SMP multiplication assignment to sparse vectors*********************************************
2657  // No special implementation for the SMP multiplication assignment to sparse vectors.
2658  //**********************************************************************************************
2659 
2660  //**SMP division assignment to dense vectors****************************************************
2675  template< typename VT1 > // Type of the target dense vector
2676  friend inline EnableIf_< UseSMPAssign<VT1> >
2677  smpDivAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2678  {
2680 
2683  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2684 
2685  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2686 
2687  const ResultType tmp( rhs );
2688  smpDivAssign( ~lhs, tmp );
2689  }
2691  //**********************************************************************************************
2692 
2693  //**SMP division assignment to sparse vectors***************************************************
2694  // No special implementation for the SMP division assignment to sparse vectors.
2695  //**********************************************************************************************
2696 
2697  //**Compile time checks*************************************************************************
2705  //**********************************************************************************************
2706 };
2707 //*************************************************************************************************
2708 
2709 
2710 
2711 
2712 //=================================================================================================
2713 //
2714 // DVECSCALARMULTEXPR SPECIALIZATION
2715 //
2716 //=================================================================================================
2717 
2718 //*************************************************************************************************
2726 template< typename VT // Type of the left-hand side dense vector
2727  , typename MT // Type of the right-hand side dense matrix
2728  , typename ST > // Type of the side scalar value
2729 class DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >
2730  : public DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true >
2731  , private VecScalarMultExpr
2732  , private Computation
2733 {
2734  private:
2735  //**Type definitions****************************************************************************
// Private shorthand types used throughout this specialization:
//   VMM - the wrapped vector/matrix multiplication expression
//   RES - result type of that expression
//   VRT / MRT - result types of the vector (VT) and matrix (MT) operands
//   VET / MET - element types of VRT and MRT
//   VCT / MCT - composite types of VT and MT
2736  typedef TDVecTDMatMultExpr<VT,MT> VMM;
2737  typedef ResultType_<VMM> RES;
2738  typedef ResultType_<VT> VRT;
2739  typedef ResultType_<MT> MRT;
2740  typedef ElementType_<VRT> VET;
2741  typedef ElementType_<MRT> MET;
2742  typedef CompositeType_<VT> VCT;
2743  typedef CompositeType_<MT> MCT;
2744  //**********************************************************************************************
2745 
2746  //**********************************************************************************************
// Compile-time switch: true if the vector operand is itself a computation or
// requires an intermediate evaluation. Used below to select the type LT.
2748  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2749  //**********************************************************************************************
2750 
2751  //**********************************************************************************************
// Compile-time switch: true if the matrix operand requires an intermediate
// evaluation, or if it is a computation whose element type equals the vector's
// element type and is BLAS compatible. Used below to select the type RT.
2753  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2754  IsBLASCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
2755  //**********************************************************************************************
2756 
2757  //**********************************************************************************************
2759 
// Helper trait: 'value' is true if the target type T1 is SMP-assignable and
// at least one of the two operands has to be evaluated first.
2762  template< typename T1 >
2763  struct UseSMPAssign {
2764  enum : bool { value = T1::smpAssignable && ( evaluateVector || evaluateMatrix ) };
2765  };
2766  //**********************************************************************************************
2767 
2768  //**********************************************************************************************
2770 
2772  template< typename T1, typename T2, typename T3, typename T4 >
2773  struct UseBlasKernel {
2775  HasMutableDataAccess<T1>::value &&
2776  HasConstDataAccess<T2>::value &&
2777  HasConstDataAccess<T3>::value &&
2778  !IsDiagonal<T3>::value &&
2779  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2780  IsBLASCompatible< ElementType_<T1> >::value &&
2781  IsBLASCompatible< ElementType_<T2> >::value &&
2782  IsBLASCompatible< ElementType_<T3> >::value &&
2783  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
2784  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2785  !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
2786  };
2787  //**********************************************************************************************
2788 
2789  //**********************************************************************************************
2791 
// Helper trait deciding whether the vectorized default kernels can be used.
// T1 is the target vector type, T2 the vector operand type, T3 the matrix
// operand type and T4 the scalar type. 'value' is true only if optimized
// kernels are switched on, the matrix is not diagonal, all three types are
// SIMD enabled, the element types and the scalar are SIMD combinable, and SIMD
// addition and multiplication exist for the operand element types.
2794  template< typename T1, typename T2, typename T3, typename T4 >
2795  struct UseVectorizedDefaultKernel {
2796  enum : bool { value = useOptimizedKernels &&
2797  !IsDiagonal<T3>::value &&
2798  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2799  AreSIMDCombinable< ElementType_<T1>
2800  , ElementType_<T2>
2801  , ElementType_<T3>
2802  , T4 >::value &&
2803  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2804  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
2805  };
2806  //**********************************************************************************************
2807 
2808  public:
2809  //**Type definitions****************************************************************************
// Public type interface of the scaled vector/matrix multiplication expression.
// This          - type of this expression specialization
// ResultType    - MultTrait_ of the product's result type and the scalar type
// TransposeType - transpose of ResultType
// ElementType   - element type of ResultType
// SIMDType      - SIMD trait type of ElementType
// ReturnType    - type returned by element access (const ElementType)
// CompositeType - type used when this expression is an operand (const ResultType)
2810  typedef DVecScalarMultExpr<VMM,ST,true> This;
2811  typedef MultTrait_<RES,ST> ResultType;
2812  typedef TransposeType_<ResultType> TransposeType;
2813  typedef ElementType_<ResultType> ElementType;
2814  typedef SIMDTrait_<ElementType> SIMDType;
2815  typedef const ElementType ReturnType;
2816  typedef const ResultType CompositeType;
2817 
// Type of the left-hand side operand: the vector/matrix multiplication expression.
2819  typedef const TDVecTDMatMultExpr<VT,MT> LeftOperand;
2820 
// Type of the right-hand side operand: the scalar.
2822  typedef ST RightOperand;
2823 
// Type used for the vector operand inside the kernels: an evaluated result
// (const VRT) if evaluateVector is set, otherwise the composite type VCT.
2825  typedef IfTrue_< evaluateVector, const VRT, VCT > LT;
2826 
// Type used for the matrix operand inside the kernels: an evaluated result
// (const MRT) if evaluateMatrix is set, otherwise the composite type MCT.
2828  typedef IfTrue_< evaluateMatrix, const MRT, MCT > RT;
2829  //**********************************************************************************************
2830 
2831  //**Compilation flags***************************************************************************
// Compilation flag for SIMD optimization: set if the matrix is not diagonal,
// both operand types are SIMD enabled, the element types and the scalar type
// are SIMD combinable, and SIMD addition and multiplication exist for the
// operand element types.
2833  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
2834  VT::simdEnabled && MT::simdEnabled &&
2835  AreSIMDCombinable<VET,MET,ST>::value &&
2836  HasSIMDAdd<VET,MET>::value &&
2837  HasSIMDMult<VET,MET>::value };
2838 
// Compilation flag for SMP assignments: set only if neither operand needs an
// intermediate evaluation and both operand types are SMP-assignable.
2840  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2841  !evaluateMatrix && MT::smpAssignable };
2842  //**********************************************************************************************
2843 
2844  //**SIMD properties*****************************************************************************
// Number of elements held by one SIMD vector of ElementType, taken from SIMDTrait.
2846  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
2847  //**********************************************************************************************
2848 
2849  //**Constructor*********************************************************************************
// Constructor: stores the vector/matrix multiplication expression and the
// scalar that together form this scaled expression.
//
// vector : the left-hand side vector/matrix multiplication expression
// scalar : the right-hand side scalar
2855  explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )
2856  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2857  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2858  {}
2859  //**********************************************************************************************
2860 
2861  //**Subscript operator**************************************************************************
// Subscript operator: returns element 'index' of the wrapped expression
// multiplied by the scalar. The index is only checked by an internal
// assertion; see at() for the variant that reports an out-of-range index.
2867  inline ReturnType operator[]( size_t index ) const {
2868  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2869  return vector_[index] * scalar_;
2870  }
2871  //**********************************************************************************************
2872 
2873  //**At function*********************************************************************************
2880  inline ReturnType at( size_t index ) const {
2881  if( index >= vector_.size() ) {
2882  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2883  }
2884  return (*this)[index];
2885  }
2886  //**********************************************************************************************
2887 
2888  //**Size function*******************************************************************************
// Returns the size of the expression, i.e. the size of the wrapped
// vector/matrix multiplication expression.
2893  inline size_t size() const {
2894  return vector_.size();
2895  }
2896  //**********************************************************************************************
2897 
2898  //**Left operand access*************************************************************************
// Returns the left-hand side operand: the vector/matrix multiplication expression.
2903  inline LeftOperand leftOperand() const {
2904  return vector_;
2905  }
2906  //**********************************************************************************************
2907 
2908  //**Right operand access************************************************************************
// Returns the right-hand side operand: the scalar.
2913  inline RightOperand rightOperand() const {
2914  return scalar_;
2915  }
2916  //**********************************************************************************************
2917 
2918  //**********************************************************************************************
// Reports whether the expression can alias with the given address; the query
// is forwarded unchanged to the wrapped multiplication expression.
//
// alias : the address to be checked
2924  template< typename T >
2925  inline bool canAlias( const T* alias ) const {
2926  return vector_.canAlias( alias );
2927  }
2928  //**********************************************************************************************
2929 
2930  //**********************************************************************************************
// Reports whether the expression is aliased with the given address; the query
// is forwarded unchanged to the wrapped multiplication expression.
//
// alias : the address to be checked
2936  template< typename T >
2937  inline bool isAliased( const T* alias ) const {
2938  return vector_.isAliased( alias );
2939  }
2940  //**********************************************************************************************
2941 
2942  //**********************************************************************************************
// Reports whether the operands are properly aligned in memory; the query is
// forwarded unchanged to the wrapped multiplication expression.
2947  inline bool isAligned() const {
2948  return vector_.isAligned();
2949  }
2950  //**********************************************************************************************
2951 
2952  //**********************************************************************************************
2957  inline bool canSMPAssign() const noexcept {
2958  RightOperand_<VMM> A( vector_.rightOperand() );
2959  return ( !BLAZE_BLAS_IS_PARALLEL ||
2960  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2961  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
2962  ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
2963  }
2964  //**********************************************************************************************
2965 
2966  private:
2967  //**Member variables****************************************************************************
// The wrapped vector/matrix multiplication expression (left-hand side operand).
2968  LeftOperand vector_;
// The scalar the product is multiplied with (right-hand side operand).
2969  RightOperand scalar_;
2970  //**********************************************************************************************
2971 
2972  //**Assignment to dense vectors*****************************************************************
2984  template< typename VT1 // Type of the target dense vector
2985  , bool TF > // Transpose flag of the target dense vector
2986  friend inline void assign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
2987  {
2989 
2990  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2991 
2992  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
2993  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
2994 
2995  if( right.rows() == 0UL ) {
2996  reset( ~lhs );
2997  return;
2998  }
2999  else if( right.columns() == 0UL ) {
3000  return;
3001  }
3002 
3003  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3004  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3005 
3006  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3007  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3008  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3009  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3010 
3011  DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
3012  }
3013  //**********************************************************************************************
3014 
3015  //**Assignment to dense vectors (kernel selection)**********************************************
3026  template< typename VT1 // Type of the left-hand side target vector
3027  , typename VT2 // Type of the left-hand side vector operand
3028  , typename MT1 // Type of the right-hand side matrix operand
3029  , typename ST2 > // Type of the scalar value
3030  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3031  {
3032  if( ( IsDiagonal<MT1>::value ) ||
3033  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3034  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3035  selectSmallAssignKernel( y, x, A, scalar );
3036  else
3037  selectBlasAssignKernel( y, x, A, scalar );
3038  }
3039  //**********************************************************************************************
3040 
3041  //**Default assignment to dense vectors*********************************************************
/*!\brief Default assignment kernel for a scaled vector/matrix product.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// Assigns the expression x * A * scalar to y via y.assign(). The non-vectorized small and
// large assignment kernels forward to this function.
*/
3055  template< typename VT1 // Type of the left-hand side target vector
3056  , typename VT2 // Type of the left-hand side vector operand
3057  , typename MT1 // Type of the right-hand side matrix operand
3058  , typename ST2 > // Type of the scalar value
3059  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3060  {
3061  y.assign( x * A * scalar );
3062  }
3063  //**********************************************************************************************
3064 
3065  //**Default assignment to dense vectors (small matrices)****************************************
/*!\brief Non-vectorized assignment kernel for small matrices.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// This overload participates only if UseVectorizedDefaultKernel is not fulfilled for the
// given types (DisableIf_). It forwards all arguments to selectDefaultAssignKernel().
*/
3079  template< typename VT1 // Type of the left-hand side target vector
3080  , typename VT2 // Type of the left-hand side vector operand
3081  , typename MT1 // Type of the right-hand side matrix operand
3082  , typename ST2 > // Type of the scalar value
3083  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3084  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3085  {
3086  selectDefaultAssignKernel( y, x, A, scalar );
3087  }
3088  //**********************************************************************************************
3089 
3090  //**Vectorized default assignment to dense vectors (small matrices)*****************************
/*!\brief Vectorized assignment kernel for small matrices.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// This overload participates only if UseVectorizedDefaultKernel is fulfilled for the given
// types (EnableIf_). The columns of A are processed in blocks of 8, 4, 3 and 2 columns,
// followed by a final single column. For every column j of a block the products x[i]*A(i,j)
// are accumulated over the rows [ibegin,iend) in SIMD registers, reduced with sum(), scaled
// with \a scalar and stored to y[j], overwriting the previous value. If x or A is not padded,
// the rows in [ipos,iend) are handled by a scalar loop that adds to y[j].
//
// For lower matrices the row range starts at the block's first column (plus one for strictly
// lower matrices), masked with size_t(-SIMDSIZE); for upper matrices it ends at the block's
// last column (minus one for strictly upper matrices). Otherwise all M rows are traversed.
//
// NOTE(review): the accumulators are only default-constructed before being added to; this
// presumably relies on SIMDType default-constructing to zero - confirm.
// NOTE(review): the mask size_t(-SIMDSIZE) presumably rounds down to a multiple of SIMDSIZE,
// which requires SIMDSIZE to be a power of two - confirm.
*/
3105  template< typename VT1 // Type of the left-hand side target vector
3106  , typename VT2 // Type of the left-hand side vector operand
3107  , typename MT1 // Type of the right-hand side matrix operand
3108  , typename ST2 > // Type of the scalar value
3109  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3110  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3111  {
3112  const size_t M( A.rows() );
3113  const size_t N( A.columns() );
3114 
      // A scalar remainder loop is required as soon as one operand is not padded
3115  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3116 
3117  size_t j( 0UL );
3118 
      // Blocks of 8 columns
3119  for( ; (j+8UL) <= N; j+=8UL )
3120  {
3121  const size_t ibegin( ( IsLower<MT1>::value )
3122  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3123  :( 0UL ) );
3124  const size_t iend( ( IsUpper<MT1>::value )
3125  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3126  :( M ) );
3127  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3128 
      // End of the SIMD part of the row range
3129  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3130  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3131 
3132  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3133  size_t i( ibegin );
3134 
3135  for( ; i<ipos; i+=SIMDSIZE ) {
3136  const SIMDType x1( x.load(i) );
3137  xmm1 = xmm1 + x1 * A.load(i,j );
3138  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3139  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3140  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3141  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3142  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3143  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3144  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
3145  }
3146 
      // Reduce, scale and store (overwrites y)
3147  y[j ] = sum( xmm1 ) * scalar;
3148  y[j+1UL] = sum( xmm2 ) * scalar;
3149  y[j+2UL] = sum( xmm3 ) * scalar;
3150  y[j+3UL] = sum( xmm4 ) * scalar;
3151  y[j+4UL] = sum( xmm5 ) * scalar;
3152  y[j+5UL] = sum( xmm6 ) * scalar;
3153  y[j+6UL] = sum( xmm7 ) * scalar;
3154  y[j+7UL] = sum( xmm8 ) * scalar;
3155 
      // Scalar handling of the remaining rows
3156  for( ; remainder && i<iend; ++i ) {
3157  y[j ] += x[i] * A(i,j ) * scalar;
3158  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3159  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3160  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3161  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3162  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3163  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3164  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3165  }
3166  }
3167 
      // Blocks of 4 columns
3168  for( ; (j+4UL) <= N; j+=4UL )
3169  {
3170  const size_t ibegin( ( IsLower<MT1>::value )
3171  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3172  :( 0UL ) );
3173  const size_t iend( ( IsUpper<MT1>::value )
3174  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3175  :( M ) );
3176  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3177 
3178  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3179  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3180 
3181  SIMDType xmm1, xmm2, xmm3, xmm4;
3182  size_t i( ibegin );
3183 
3184  for( ; i<ipos; i+=SIMDSIZE ) {
3185  const SIMDType x1( x.load(i) );
3186  xmm1 = xmm1 + x1 * A.load(i,j );
3187  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3188  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3189  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3190  }
3191 
3192  y[j ] = sum( xmm1 ) * scalar;
3193  y[j+1UL] = sum( xmm2 ) * scalar;
3194  y[j+2UL] = sum( xmm3 ) * scalar;
3195  y[j+3UL] = sum( xmm4 ) * scalar;
3196 
3197  for( ; remainder && i<iend; ++i ) {
3198  y[j ] += x[i] * A(i,j ) * scalar;
3199  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3200  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3201  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3202  }
3203  }
3204 
      // Blocks of 3 columns
3205  for( ; (j+3UL) <= N; j+=3UL )
3206  {
3207  const size_t ibegin( ( IsLower<MT1>::value )
3208  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3209  :( 0UL ) );
3210  const size_t iend( ( IsUpper<MT1>::value )
3211  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3212  :( M ) );
3213  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3214 
3215  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3216  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3217 
3218  SIMDType xmm1, xmm2, xmm3;
3219  size_t i( ibegin );
3220 
3221  for( ; i<ipos; i+=SIMDSIZE ) {
3222  const SIMDType x1( x.load(i) );
3223  xmm1 = xmm1 + x1 * A.load(i,j );
3224  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3225  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3226  }
3227 
3228  y[j ] = sum( xmm1 ) * scalar;
3229  y[j+1UL] = sum( xmm2 ) * scalar;
3230  y[j+2UL] = sum( xmm3 ) * scalar;
3231 
3232  for( ; remainder && i<iend; ++i ) {
3233  y[j ] += x[i] * A(i,j ) * scalar;
3234  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3235  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3236  }
3237  }
3238 
      // Blocks of 2 columns
3239  for( ; (j+2UL) <= N; j+=2UL )
3240  {
3241  const size_t ibegin( ( IsLower<MT1>::value )
3242  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3243  :( 0UL ) );
3244  const size_t iend( ( IsUpper<MT1>::value )
3245  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3246  :( M ) );
3247  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3248 
3249  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3250  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3251 
3252  SIMDType xmm1, xmm2;
3253  size_t i( ibegin );
3254 
3255  for( ; i<ipos; i+=SIMDSIZE ) {
3256  const SIMDType x1( x.load(i) );
3257  xmm1 = xmm1 + x1 * A.load(i,j );
3258  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3259  }
3260 
3261  y[j ] = sum( xmm1 ) * scalar;
3262  y[j+1UL] = sum( xmm2 ) * scalar;
3263 
3264  for( ; remainder && i<iend; ++i ) {
3265  y[j ] += x[i] * A(i,j ) * scalar;
3266  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3267  }
3268  }
3269 
      // Final single column
3270  if( j < N )
3271  {
3272  const size_t ibegin( ( IsLower<MT1>::value )
3273  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3274  :( 0UL ) );
3275  const size_t iend( ( IsUpper<MT1>::value )
3276  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3277  :( M ) );
3278  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3279 
3280  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3281  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3282 
3283  SIMDType xmm1;
3284  size_t i( ibegin );
3285 
3286  for( ; i<ipos; i+=SIMDSIZE ) {
3287  xmm1 = xmm1 + A.load(i,j) * x.load(i);
3288  }
3289 
3290  y[j] = sum( xmm1 ) * scalar;
3291 
3292  for( ; remainder && i<iend; ++i ) {
3293  y[j] += x[i] * A(i,j) * scalar;
3294  }
3295  }
3296  }
3297  //**********************************************************************************************
3298 
3299  //**Default assignment to dense vectors (large matrices)****************************************
/*!\brief Non-vectorized assignment kernel for large matrices.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// This overload participates only if UseVectorizedDefaultKernel is not fulfilled for the
// given types (DisableIf_). It forwards all arguments to selectDefaultAssignKernel().
*/
3313  template< typename VT1 // Type of the left-hand side target vector
3314  , typename VT2 // Type of the left-hand side vector operand
3315  , typename MT1 // Type of the right-hand side matrix operand
3316  , typename ST2 > // Type of the scalar value
3317  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3318  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3319  {
3320  selectDefaultAssignKernel( y, x, A, scalar );
3321  }
3322  //**********************************************************************************************
3323 
3324  //**Vectorized default assignment to dense vectors (large matrices)*****************************
/*!\brief Vectorized assignment kernel for large matrices.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// This overload participates only if UseVectorizedDefaultKernel is fulfilled for the given
// types (EnableIf_). The target y is reset first. The columns of A are then processed in
// blocks of 8, 4 and 2 columns, followed by a final single column. Within a block the rows
// [ibegin,ipos) are traversed in steps of four, two and one SIMD vectors, and each partial
// result is reduced with sum() and added directly to y[j]. If x or A is not padded, the rows
// in [ipos,iend) are handled by a scalar loop. Once all rows of a block are accumulated, the
// affected elements of y are multiplied with \a scalar.
//
// For lower matrices the row range starts at the block's first column (plus one for strictly
// lower matrices), masked with size_t(-SIMDSIZE); for upper matrices it ends at the block's
// last column (minus one for strictly upper matrices). Otherwise all M rows are traversed.
*/
3339  template< typename VT1 // Type of the left-hand side target vector
3340  , typename VT2 // Type of the left-hand side vector operand
3341  , typename MT1 // Type of the right-hand side matrix operand
3342  , typename ST2 > // Type of the scalar value
3343  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3344  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3345  {
3346  const size_t M( A.rows() );
3347  const size_t N( A.columns() );
3348 
      // A scalar remainder loop is required as soon as one operand is not padded
3349  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3350 
      // All following updates of y are accumulations, so y has to start from its reset state
3351  reset( y );
3352 
3353  size_t j( 0UL );
3354 
      // Blocks of 8 columns
3355  for( ; (j+8UL) <= N; j+=8UL )
3356  {
3357  const size_t ibegin( ( IsLower<MT1>::value )
3358  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3359  :( 0UL ) );
3360  const size_t iend( ( IsUpper<MT1>::value )
3361  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3362  :( M ) );
3363  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3364 
      // End of the SIMD part of the row range
3365  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3366  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3367 
3368  size_t i( ibegin );
3369 
      // Four SIMD vectors of rows per iteration
3370  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3371  const size_t i1( i+SIMDSIZE );
3372  const size_t i2( i+SIMDSIZE*2UL );
3373  const size_t i3( i+SIMDSIZE*3UL );
3374  const SIMDType x1( x.load(i ) );
3375  const SIMDType x2( x.load(i1) );
3376  const SIMDType x3( x.load(i2) );
3377  const SIMDType x4( x.load(i3) );
3378  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3379  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3380  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3381  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3382  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3383  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3384  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3385  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
3386  }
3387 
      // Two SIMD vectors of rows per iteration
3388  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3389  const size_t i1( i+SIMDSIZE );
3390  const SIMDType x1( x.load(i ) );
3391  const SIMDType x2( x.load(i1) );
3392  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3393  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3394  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3395  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3396  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3397  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3398  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3399  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
3400  }
3401 
      // One SIMD vector of rows per iteration
3402  for( ; i<ipos; i+=SIMDSIZE ) {
3403  const SIMDType x1( x.load(i) );
3404  y[j ] += sum( x1 * A.load(i,j ) );
3405  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3406  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3407  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3408  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
3409  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
3410  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
3411  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
3412  }
3413 
      // Scalar handling of the remaining rows
3414  for( ; remainder && i<iend; ++i ) {
3415  y[j ] += x[i] * A(i,j );
3416  y[j+1UL] += x[i] * A(i,j+1UL);
3417  y[j+2UL] += x[i] * A(i,j+2UL);
3418  y[j+3UL] += x[i] * A(i,j+3UL);
3419  y[j+4UL] += x[i] * A(i,j+4UL);
3420  y[j+5UL] += x[i] * A(i,j+5UL);
3421  y[j+6UL] += x[i] * A(i,j+6UL);
3422  y[j+7UL] += x[i] * A(i,j+7UL);
3423  }
3424 
      // Scaling of the accumulated results
3425  y[j ] *= scalar;
3426  y[j+1UL] *= scalar;
3427  y[j+2UL] *= scalar;
3428  y[j+3UL] *= scalar;
3429  y[j+4UL] *= scalar;
3430  y[j+5UL] *= scalar;
3431  y[j+6UL] *= scalar;
3432  y[j+7UL] *= scalar;
3433  }
3434 
      // Blocks of 4 columns
3435  for( ; (j+4UL) <= N; j+=4UL )
3436  {
3437  const size_t ibegin( ( IsLower<MT1>::value )
3438  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3439  :( 0UL ) );
3440  const size_t iend( ( IsUpper<MT1>::value )
3441  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3442  :( M ) );
3443  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3444 
3445  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3446  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3447 
3448  size_t i( ibegin );
3449 
3450  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3451  const size_t i1( i+SIMDSIZE );
3452  const size_t i2( i+SIMDSIZE*2UL );
3453  const size_t i3( i+SIMDSIZE*3UL );
3454  const SIMDType x1( x.load(i ) );
3455  const SIMDType x2( x.load(i1) );
3456  const SIMDType x3( x.load(i2) );
3457  const SIMDType x4( x.load(i3) );
3458  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3459  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3460  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3461  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3462  }
3463 
3464  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3465  const size_t i1( i+SIMDSIZE );
3466  const SIMDType x1( x.load(i ) );
3467  const SIMDType x2( x.load(i1) );
3468  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3469  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3470  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3471  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3472  }
3473 
3474  for( ; i<ipos; i+=SIMDSIZE ) {
3475  const SIMDType x1( x.load(i) );
3476  y[j ] += sum( x1 * A.load(i,j ) );
3477  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3478  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3479  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3480  }
3481 
3482  for( ; remainder && i<iend; ++i ) {
3483  y[j ] += x[i] * A(i,j );
3484  y[j+1UL] += x[i] * A(i,j+1UL);
3485  y[j+2UL] += x[i] * A(i,j+2UL);
3486  y[j+3UL] += x[i] * A(i,j+3UL);
3487  }
3488 
3489  y[j ] *= scalar;
3490  y[j+1UL] *= scalar;
3491  y[j+2UL] *= scalar;
3492  y[j+3UL] *= scalar;
3493  }
3494 
      // Blocks of 2 columns
3495  for( ; (j+2UL) <= N; j+=2UL )
3496  {
3497  const size_t ibegin( ( IsLower<MT1>::value )
3498  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3499  :( 0UL ) );
3500  const size_t iend( ( IsUpper<MT1>::value )
3501  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3502  :( M ) );
3503  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3504 
3505  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3506  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3507 
3508  size_t i( ibegin );
3509 
3510  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3511  const size_t i1( i+SIMDSIZE );
3512  const size_t i2( i+SIMDSIZE*2UL );
3513  const size_t i3( i+SIMDSIZE*3UL );
3514  const SIMDType x1( x.load(i ) );
3515  const SIMDType x2( x.load(i1) );
3516  const SIMDType x3( x.load(i2) );
3517  const SIMDType x4( x.load(i3) );
3518  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3519  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3520  }
3521 
3522  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3523  const size_t i1( i+SIMDSIZE );
3524  const SIMDType x1( x.load(i ) );
3525  const SIMDType x2( x.load(i1) );
3526  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3527  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3528  }
3529 
3530  for( ; i<ipos; i+=SIMDSIZE ) {
3531  const SIMDType x1( x.load(i) );
3532  y[j ] += sum( x1 * A.load(i,j ) );
3533  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3534  }
3535 
3536  for( ; remainder && i<iend; ++i ) {
3537  y[j ] += x[i] * A(i,j );
3538  y[j+1UL] += x[i] * A(i,j+1UL);
3539  }
3540 
3541  y[j ] *= scalar;
3542  y[j+1UL] *= scalar;
3543  }
3544 
      // Final single column
3545  if( j < N )
3546  {
3547  const size_t ibegin( ( IsLower<MT1>::value )
3548  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3549  :( 0UL ) );
3550  const size_t iend( ( IsUpper<MT1>::value )
3551  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3552  :( M ) );
3553  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3554 
3555  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3556  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3557 
3558  size_t i( ibegin );
3559 
3560  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3561  const size_t i1( i+SIMDSIZE );
3562  const size_t i2( i+SIMDSIZE*2UL );
3563  const size_t i3( i+SIMDSIZE*3UL );
3564  const SIMDType x1( x.load(i ) );
3565  const SIMDType x2( x.load(i1) );
3566  const SIMDType x3( x.load(i2) );
3567  const SIMDType x4( x.load(i3) );
3568  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
3569  }
3570 
3571  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3572  const size_t i1( i+SIMDSIZE );
3573  const SIMDType x1( x.load(i ) );
3574  const SIMDType x2( x.load(i1) );
3575  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
3576  }
3577 
3578  for( ; i<ipos; i+=SIMDSIZE ) {
3579  const SIMDType x1( x.load(i) );
3580  y[j] += sum( x1 * A.load(i,j) );
3581  }
3582 
3583  for( ; remainder && i<iend; ++i ) {
3584  y[j] += x[i] * A(i,j);
3585  }
3586 
3587  y[j] *= scalar;
3588  }
3589  }
3590  //**********************************************************************************************
3591 
3592  //**BLAS-based assignment to dense vectors (default)********************************************
/*!\brief Fallback of the BLAS-based assignment kernel.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// This overload participates only if UseBlasKernel is not fulfilled for the given types
// (DisableIf_). It forwards all arguments to selectLargeAssignKernel().
*/
3605  template< typename VT1 // Type of the left-hand side target vector
3606  , typename VT2 // Type of the left-hand side vector operand
3607  , typename MT1 // Type of the right-hand side matrix operand
3608  , typename ST2 > // Type of the scalar value
3609  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3610  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3611  {
3612  selectLargeAssignKernel( y, x, A, scalar );
3613  }
3614  //**********************************************************************************************
3615 
3616  //**BLAS-based assignment to dense vectors******************************************************
3617 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3618 
3631  template< typename VT1 // Type of the left-hand side target vector
3632  , typename VT2 // Type of the left-hand side vector operand
3633  , typename MT1 // Type of the right-hand side matrix operand
3634  , typename ST2 > // Type of the scalar value
3635  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3636  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3637  {
3638  typedef ElementType_<VT1> ET;
3639 
3640  if( IsTriangular<MT1>::value ) {
3641  assign( y, scalar * x );
3642  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3643  }
3644  else {
3645  gemv( y, x, A, ET(scalar), ET(0) );
3646  }
3647  }
3648 #endif
3649  //**********************************************************************************************
3650 
3651  //**Assignment to sparse vectors****************************************************************
3663  template< typename VT1 // Type of the target sparse vector
3664  , bool TF > // Transpose flag of the target sparse vector
3665  friend inline void assign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3666  {
3668 
3671  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
3672 
3673  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3674 
3675  const ResultType tmp( serial( rhs ) );
3676  assign( ~lhs, tmp );
3677  }
3678  //**********************************************************************************************
3679 
3680  //**Addition assignment to dense vectors********************************************************
3692  template< typename VT1 // Type of the target dense vector
3693  , bool TF > // Transpose flag of the target dense vector
3694  friend inline void addAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3695  {
3697 
3698  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3699 
3700  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3701  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3702 
3703  if( right.rows() == 0UL || right.columns() == 0UL ) {
3704  return;
3705  }
3706 
3707  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3708  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3709 
3710  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3711  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3712  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3713  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3714 
3715  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3716  }
3717  //**********************************************************************************************
3718 
3719  //**Addition assignment to dense vectors (kernel selection)*************************************
3730  template< typename VT1 // Type of the left-hand side target vector
3731  , typename VT2 // Type of the left-hand side vector operand
3732  , typename MT1 // Type of the right-hand side matrix operand
3733  , typename ST2 > // Type of the scalar value
3734  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3735  {
3736  if( ( IsDiagonal<MT1>::value ) ||
3737  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3738  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3739  selectSmallAddAssignKernel( y, x, A, scalar );
3740  else
3741  selectBlasAddAssignKernel( y, x, A, scalar );
3742  }
3743  //**********************************************************************************************
3744 
3745  //**Default addition assignment to dense vectors************************************************
/*!\brief Default addition assignment kernel for a scaled vector/matrix product.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// Adds the expression x * A * scalar to y via y.addAssign().
*/
3759  template< typename VT1 // Type of the left-hand side target vector
3760  , typename VT2 // Type of the left-hand side vector operand
3761  , typename MT1 // Type of the right-hand side matrix operand
3762  , typename ST2 > // Type of the scalar value
3763  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3764  {
3765  y.addAssign( x * A * scalar );
3766  }
3767  //**********************************************************************************************
3768 
3769  //**Default addition assignment to dense vectors (small matrices)*******************************
/*!\brief Non-vectorized addition assignment kernel for small matrices.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// This overload participates only if UseVectorizedDefaultKernel is not fulfilled for the
// given types (DisableIf_). It forwards all arguments to selectDefaultAddAssignKernel().
*/
3783  template< typename VT1 // Type of the left-hand side target vector
3784  , typename VT2 // Type of the left-hand side vector operand
3785  , typename MT1 // Type of the right-hand side matrix operand
3786  , typename ST2 > // Type of the scalar value
3787  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3788  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3789  {
3790  selectDefaultAddAssignKernel( y, x, A, scalar );
3791  }
3792  //**********************************************************************************************
3793 
3794  //**Vectorized default addition assignment to dense vectors (small matrices)********************
3809  template< typename VT1 // Type of the left-hand side target vector
3810  , typename VT2 // Type of the left-hand side vector operand
3811  , typename MT1 // Type of the right-hand side matrix operand
3812  , typename ST2 > // Type of the scalar value
3813  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3814  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3815  {
3816  const size_t M( A.rows() );
3817  const size_t N( A.columns() );
3818 
3819  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3820 
3821  size_t j( 0UL );
3822 
3823  for( ; (j+8UL) <= N; j+=8UL )
3824  {
3825  const size_t ibegin( ( IsLower<MT1>::value )
3826  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3827  :( 0UL ) );
3828  const size_t iend( ( IsUpper<MT1>::value )
3829  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3830  :( M ) );
3831  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3832 
3833  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3834  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3835 
3836  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3837  size_t i( ibegin );
3838 
3839  for( ; i<ipos; i+=SIMDSIZE ) {
3840  const SIMDType x1( x.load(i) );
3841  xmm1 = xmm1 + x1 * A.load(i,j );
3842  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3843  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3844  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3845  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3846  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3847  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3848  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
3849  }
3850 
3851  y[j ] += sum( xmm1 ) * scalar;
3852  y[j+1UL] += sum( xmm2 ) * scalar;
3853  y[j+2UL] += sum( xmm3 ) * scalar;
3854  y[j+3UL] += sum( xmm4 ) * scalar;
3855  y[j+4UL] += sum( xmm5 ) * scalar;
3856  y[j+5UL] += sum( xmm6 ) * scalar;
3857  y[j+6UL] += sum( xmm7 ) * scalar;
3858  y[j+7UL] += sum( xmm8 ) * scalar;
3859 
3860  for( ; remainder && i<iend; ++i ) {
3861  y[j ] += x[i] * A(i,j ) * scalar;
3862  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3863  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3864  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3865  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3866  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3867  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3868  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3869  }
3870  }
3871 
3872  for( ; (j+4UL) <= N; j+=4UL )
3873  {
3874  const size_t ibegin( ( IsLower<MT1>::value )
3875  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3876  :( 0UL ) );
3877  const size_t iend( ( IsUpper<MT1>::value )
3878  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3879  :( M ) );
3880  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3881 
3882  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3883  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3884 
3885  SIMDType xmm1, xmm2, xmm3, xmm4;
3886  size_t i( ibegin );
3887 
3888  for( ; i<ipos; i+=SIMDSIZE ) {
3889  const SIMDType x1( x.load(i) );
3890  xmm1 = xmm1 + x1 * A.load(i,j );
3891  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3892  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3893  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3894  }
3895 
3896  y[j ] += sum( xmm1 ) * scalar;
3897  y[j+1UL] += sum( xmm2 ) * scalar;
3898  y[j+2UL] += sum( xmm3 ) * scalar;
3899  y[j+3UL] += sum( xmm4 ) * scalar;
3900 
3901  for( ; remainder && i<iend; ++i ) {
3902  y[j ] += x[i] * A(i,j ) * scalar;
3903  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3904  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3905  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3906  }
3907  }
3908 
3909  for( ; (j+3UL) <= N; j+=3UL )
3910  {
3911  const size_t ibegin( ( IsLower<MT1>::value )
3912  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3913  :( 0UL ) );
3914  const size_t iend( ( IsUpper<MT1>::value )
3915  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3916  :( M ) );
3917  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3918 
3919  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3920  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3921 
3922  SIMDType xmm1, xmm2, xmm3;
3923  size_t i( ibegin );
3924 
3925  for( ; i<ipos; i+=SIMDSIZE ) {
3926  const SIMDType x1( x.load(i) );
3927  xmm1 = xmm1 + x1 * A.load(i,j );
3928  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3929  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3930  }
3931 
3932  y[j ] += sum( xmm1 ) * scalar;
3933  y[j+1UL] += sum( xmm2 ) * scalar;
3934  y[j+2UL] += sum( xmm3 ) * scalar;
3935 
3936  for( ; remainder && i<iend; ++i ) {
3937  y[j ] += x[i] * A(i,j ) * scalar;
3938  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3939  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3940  }
3941  }
3942 
3943  for( ; (j+2UL) <= N; j+=2UL )
3944  {
3945  const size_t ibegin( ( IsLower<MT1>::value )
3946  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3947  :( 0UL ) );
3948  const size_t iend( ( IsUpper<MT1>::value )
3949  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3950  :( M ) );
3951  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3952 
3953  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3954  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3955 
3956  SIMDType xmm1, xmm2;
3957  size_t i( ibegin );
3958 
3959  for( ; i<ipos; i+=SIMDSIZE ) {
3960  const SIMDType x1( x.load(i) );
3961  xmm1 = xmm1 + x1 * A.load(i,j );
3962  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3963  }
3964 
3965  y[j ] += sum( xmm1 ) * scalar;
3966  y[j+1UL] += sum( xmm2 ) * scalar;
3967 
3968  for( ; remainder && i<iend; ++i ) {
3969  y[j ] += x[i] * A(i,j ) * scalar;
3970  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3971  }
3972  }
3973 
3974  if( j < N )
3975  {
3976  const size_t ibegin( ( IsLower<MT1>::value )
3977  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3978  :( 0UL ) );
3979  const size_t iend( ( IsUpper<MT1>::value )
3980  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3981  :( M ) );
3982  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3983 
3984  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3985  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3986 
3987  SIMDType xmm1;
3988  size_t i( ibegin );
3989 
3990  for( ; i<ipos; i+=SIMDSIZE ) {
3991  xmm1 = xmm1 + A.load(i,j) * x.load(i);
3992  }
3993 
3994  y[j] += sum( xmm1 ) * scalar;
3995 
3996  for( ; remainder && i<iend; ++i ) {
3997  y[j] += x[i] * A(i,j) * scalar;
3998  }
3999  }
4000  }
4001  //**********************************************************************************************
4002 
4003  //**Default addition assignment to dense vectors (large matrices)*******************************
4017  template< typename VT1 // Type of the left-hand side target vector
4018  , typename VT2 // Type of the left-hand side vector operand
4019  , typename MT1 // Type of the right-hand side matrix operand
4020  , typename ST2 > // Type of the scalar value
4021  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4022  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4023  {
4024  selectDefaultAddAssignKernel( y, x, A, scalar );
4025  }
4026  //**********************************************************************************************
4027 
4028  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized large-matrix kernel for the scaled addition assignment: every element y[j] is
// increased by the sum over i of x[i] * A(i,j) * scalar.
//
//   y      - target vector, updated in place
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to every contribution
//
// The columns of A are processed in blocks of 8, 4, 2 and finally a single column. Within each
// block the rows are traversed with SIMD loads in steps of 4*SIMDSIZE, 2*SIMDSIZE and SIMDSIZE.
// Each SIMD step is reduced with sum(), scaled and added to y right away (no accumulator
// registers are kept across iterations). Rows not covered by the SIMD loops are handled by a
// scalar loop when 'remainder' is set.
4043  template< typename VT1 // Type of the left-hand side target vector
4044  , typename VT2 // Type of the left-hand side vector operand
4045  , typename MT1 // Type of the right-hand side matrix operand
4046  , typename ST2 > // Type of the scalar value
4047  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4048  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4049  {
4050  const size_t M( A.rows() );
4051  const size_t N( A.columns() );
4052 
// A scalar clean-up loop is only required if at least one of the two operands is not padded.
4053  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4054 
4055  size_t j( 0UL );
4056 
// Blocks of eight columns.
4057  for( ; (j+8UL) <= N; j+=8UL )
4058  {
// Lower matrices: start at row j (j+1 if strictly lower), masked with size_t(-SIMDSIZE),
// i.e. the same round-down-to-SIMDSIZE mask that is asserted for 'ipos' below.
4059  const size_t ibegin( ( IsLower<MT1>::value )
4060  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4061  :( 0UL ) );
// Upper matrices: stop after the last column index of this block (one less if strictly upper).
4062  const size_t iend( ( IsUpper<MT1>::value )
4063  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4064  :( M ) );
4065  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4066 
// End of the SIMD range: iend rounded down to a multiple of SIMDSIZE when a remainder loop
// exists, otherwise iend itself.
4067  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4068  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4069 
4070  size_t i( ibegin );
4071 
// Four SIMD vectors of rows per iteration.
4072  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4073  const size_t i1( i+SIMDSIZE );
4074  const size_t i2( i+SIMDSIZE*2UL );
4075  const size_t i3( i+SIMDSIZE*3UL );
4076  const SIMDType x1( x.load(i ) );
4077  const SIMDType x2( x.load(i1) );
4078  const SIMDType x3( x.load(i2) );
4079  const SIMDType x4( x.load(i3) );
4080  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4081  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4082  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4083  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4084  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4085  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4086  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4087  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4088  }
4089 
// Two SIMD vectors of rows per iteration.
4090  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4091  const size_t i1( i+SIMDSIZE );
4092  const SIMDType x1( x.load(i ) );
4093  const SIMDType x2( x.load(i1) );
4094  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4095  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4096  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4097  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4098  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4099  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4100  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4101  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4102  }
4103 
// One SIMD vector of rows per iteration.
4104  for( ; i<ipos; i+=SIMDSIZE ) {
4105  const SIMDType x1( x.load(i) );
4106  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4107  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4108  y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4109  y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4110  y[j+4UL] += sum( x1 * A.load(i,j+4UL) ) * scalar;
4111  y[j+5UL] += sum( x1 * A.load(i,j+5UL) ) * scalar;
4112  y[j+6UL] += sum( x1 * A.load(i,j+6UL) ) * scalar;
4113  y[j+7UL] += sum( x1 * A.load(i,j+7UL) ) * scalar;
4114  }
4115 
// Element-wise clean-up of the rows in [ipos, iend).
4116  for( ; remainder && i<iend; ++i ) {
4117  y[j ] += x[i] * A(i,j ) * scalar;
4118  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4119  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4120  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4121  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4122  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4123  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4124  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4125  }
4126  }
4127 
// Blocks of four columns; same row traversal scheme as above.
4128  for( ; (j+4UL) <= N; j+=4UL )
4129  {
4130  const size_t ibegin( ( IsLower<MT1>::value )
4131  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4132  :( 0UL ) );
4133  const size_t iend( ( IsUpper<MT1>::value )
4134  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4135  :( M ) );
4136  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4137 
4138  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4139  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4140 
4141  size_t i( ibegin );
4142 
4143  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4144  const size_t i1( i+SIMDSIZE );
4145  const size_t i2( i+SIMDSIZE*2UL );
4146  const size_t i3( i+SIMDSIZE*3UL );
4147  const SIMDType x1( x.load(i ) );
4148  const SIMDType x2( x.load(i1) );
4149  const SIMDType x3( x.load(i2) );
4150  const SIMDType x4( x.load(i3) );
4151  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4152  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4153  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4154  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4155  }
4156 
4157  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4158  const size_t i1( i+SIMDSIZE );
4159  const SIMDType x1( x.load(i ) );
4160  const SIMDType x2( x.load(i1) );
4161  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4162  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4163  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4164  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4165  }
4166 
4167  for( ; i<ipos; i+=SIMDSIZE ) {
4168  const SIMDType x1( x.load(i) );
4169  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4170  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4171  y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4172  y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4173  }
4174 
4175  for( ; remainder && i<iend; ++i ) {
4176  y[j ] += x[i] * A(i,j ) * scalar;
4177  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4178  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4179  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4180  }
4181  }
4182 
// Blocks of two columns.
4183  for( ; (j+2UL) <= N; j+=2UL )
4184  {
4185  const size_t ibegin( ( IsLower<MT1>::value )
4186  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4187  :( 0UL ) );
4188  const size_t iend( ( IsUpper<MT1>::value )
4189  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4190  :( M ) );
4191  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4192 
4193  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4194  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4195 
4196  size_t i( ibegin );
4197 
4198  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4199  const size_t i1( i+SIMDSIZE );
4200  const size_t i2( i+SIMDSIZE*2UL );
4201  const size_t i3( i+SIMDSIZE*3UL );
4202  const SIMDType x1( x.load(i ) );
4203  const SIMDType x2( x.load(i1) );
4204  const SIMDType x3( x.load(i2) );
4205  const SIMDType x4( x.load(i3) );
4206  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4207  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4208  }
4209 
4210  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4211  const size_t i1( i+SIMDSIZE );
4212  const SIMDType x1( x.load(i ) );
4213  const SIMDType x2( x.load(i1) );
4214  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4215  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4216  }
4217 
4218  for( ; i<ipos; i+=SIMDSIZE ) {
4219  const SIMDType x1( x.load(i) );
4220  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4221  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4222  }
4223 
4224  for( ; remainder && i<iend; ++i ) {
4225  y[j ] += x[i] * A(i,j ) * scalar;
4226  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4227  }
4228  }
4229 
// At most one column is left at this point.
4230  if( j < N )
4231  {
4232  const size_t ibegin( ( IsLower<MT1>::value )
4233  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4234  :( 0UL ) );
4235  const size_t iend( ( IsUpper<MT1>::value )
4236  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4237  :( M ) );
4238  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4239 
4240  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4241  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4242 
4243  size_t i( ibegin );
4244 
4245  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4246  const size_t i1( i+SIMDSIZE );
4247  const size_t i2( i+SIMDSIZE*2UL );
4248  const size_t i3( i+SIMDSIZE*3UL );
4249  const SIMDType x1( x.load(i ) );
4250  const SIMDType x2( x.load(i1) );
4251  const SIMDType x3( x.load(i2) );
4252  const SIMDType x4( x.load(i3) );
4253  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4254  }
4255 
4256  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4257  const size_t i1( i+SIMDSIZE );
4258  const SIMDType x1( x.load(i ) );
4259  const SIMDType x2( x.load(i1) );
4260  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4261  }
4262 
4263  for( ; i<ipos; i+=SIMDSIZE ) {
4264  const SIMDType x1( x.load(i) );
4265  y[j] += sum( x1 * A.load(i,j) ) * scalar;
4266  }
4267 
4268  for( ; remainder && i<iend; ++i ) {
4269  y[j] += x[i] * A(i,j) * scalar;
4270  }
4271  }
4272  }
4273  //**********************************************************************************************
4274 
4275  //**BLAS-based addition assignment to dense vectors (default)***********************************
4290  template< typename VT1 // Type of the left-hand side target vector
4291  , typename VT2 // Type of the left-hand side vector operand
4292  , typename MT1 // Type of the right-hand side matrix operand
4293  , typename ST2 > // Type of the scalar value
4294  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4295  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4296  {
4297  selectLargeAddAssignKernel( y, x, A, scalar );
4298  }
4299  //**********************************************************************************************
4300 
4301  //**BLAS-based addition assignment to dense vectors*********************************************
4302 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4303 
4316  template< typename VT1 // Type of the left-hand side target vector
4317  , typename VT2 // Type of the left-hand side vector operand
4318  , typename MT1 // Type of the right-hand side matrix operand
4319  , typename ST2 > // Type of the scalar value
4320  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4321  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4322  {
4323  typedef ElementType_<VT1> ET;
4324 
4325  if( IsTriangular<MT1>::value ) {
4326  ResultType_<VT1> tmp( serial( scalar * x ) );
4327  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4328  addAssign( y, tmp );
4329  }
4330  else {
4331  gemv( y, x, A, ET(scalar), ET(1) );
4332  }
4333  }
4334 #endif
4335  //**********************************************************************************************
4336 
4337  //**Addition assignment to sparse vectors*******************************************************
4338  // No special implementation for the addition assignment to sparse vectors.
4339  //**********************************************************************************************
4340 
4341  //**Subtraction assignment to dense vectors*****************************************************
4353  template< typename VT1 // Type of the target dense vector
4354  , bool TF > // Transpose flag of the target dense vector
4355  friend inline void subAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
4356  {
4358 
4359  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4360 
4361  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4362  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4363 
4364  if( right.rows() == 0UL || right.columns() == 0UL ) {
4365  return;
4366  }
4367 
4368  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
4369  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4370 
4371  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4372  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4373  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4374  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4375 
4376  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
4377  }
4378  //**********************************************************************************************
4379 
4380  //**Subtraction assignment to dense vectors (kernel selection)**********************************
4391  template< typename VT1 // Type of the left-hand side target vector
4392  , typename VT2 // Type of the left-hand side vector operand
4393  , typename MT1 // Type of the right-hand side matrix operand
4394  , typename ST2 > // Type of the scalar value
4395  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4396  {
4397  if( ( IsDiagonal<MT1>::value ) ||
4398  ( IsComputation<MT>::value && !evaluateMatrix ) ||
4399  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
4400  selectSmallSubAssignKernel( y, x, A, scalar );
4401  else
4402  selectBlasSubAssignKernel( y, x, A, scalar );
4403  }
4404  //**********************************************************************************************
4405 
4406  //**Default subtraction assignment to dense vectors*********************************************
/*!\brief Default subtraction assignment of the scaled vector/matrix product.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// The product expression is rebuilt from the given operands and passed to the subAssign()
// member function of the target vector.
*/
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   // The expression is deliberately created within the call; it is never stored in a variable.
   y.subAssign( x * A * scalar );
}
4428  //**********************************************************************************************
4429 
4430  //**Default subtraction assignment to dense vectors (small matrices)****************************
4444  template< typename VT1 // Type of the left-hand side target vector
4445  , typename VT2 // Type of the left-hand side vector operand
4446  , typename MT1 // Type of the right-hand side matrix operand
4447  , typename ST2 > // Type of the scalar value
4448  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4449  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4450  {
4451  selectDefaultSubAssignKernel( y, x, A, scalar );
4452  }
4453  //**********************************************************************************************
4454 
4455  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized small-matrix kernel for the scaled subtraction assignment: every element y[j] is
// decreased by the sum over i of x[i] * A(i,j) * scalar.
//
//   y      - target vector, updated in place
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to every contribution
//
// The columns of A are processed in blocks of 8, 4, 3, 2 and finally a single column. For each
// block one SIMD accumulator per column is filled in a single pass over the rows; afterwards
// every accumulator is reduced with sum(), scaled and subtracted from y. Rows not covered by
// the SIMD loop are handled by a scalar loop when 'remainder' is set.
// NOTE(review): the accumulators are default-constructed and used without explicit reset, so
// this relies on SIMDType's default constructor yielding a zero vector - confirm.
4470  template< typename VT1 // Type of the left-hand side target vector
4471  , typename VT2 // Type of the left-hand side vector operand
4472  , typename MT1 // Type of the right-hand side matrix operand
4473  , typename ST2 > // Type of the scalar value
4474  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4475  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4476  {
4477  const size_t M( A.rows() );
4478  const size_t N( A.columns() );
4479 
// A scalar clean-up loop is only required if at least one of the two operands is not padded.
4480  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4481 
4482  size_t j( 0UL );
4483 
// Blocks of eight columns.
4484  for( ; (j+8UL) <= N; j+=8UL )
4485  {
// Lower matrices: start at row j (j+1 if strictly lower), masked with size_t(-SIMDSIZE),
// i.e. the same round-down-to-SIMDSIZE mask that is asserted for 'ipos' below.
4486  const size_t ibegin( ( IsLower<MT1>::value )
4487  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4488  :( 0UL ) );
// Upper matrices: stop after the last column index of this block (one less if strictly upper).
4489  const size_t iend( ( IsUpper<MT1>::value )
4490  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4491  :( M ) );
4492  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4493 
// End of the SIMD range: iend rounded down to a multiple of SIMDSIZE when a remainder loop
// exists, otherwise iend itself.
4494  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4495  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4496 
4497  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4498  size_t i( ibegin );
4499 
// One load of x per step is shared by all eight column accumulators.
4500  for( ; i<ipos; i+=SIMDSIZE ) {
4501  const SIMDType x1( x.load(i) );
4502  xmm1 = xmm1 + x1 * A.load(i,j );
4503  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4504  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4505  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
4506  xmm5 = xmm5 + x1 * A.load(i,j+4UL);
4507  xmm6 = xmm6 + x1 * A.load(i,j+5UL);
4508  xmm7 = xmm7 + x1 * A.load(i,j+6UL);
4509  xmm8 = xmm8 + x1 * A.load(i,j+7UL);
4510  }
4511 
// Horizontal reduction of each accumulator, scaled and subtracted from the target.
4512  y[j ] -= sum( xmm1 ) * scalar;
4513  y[j+1UL] -= sum( xmm2 ) * scalar;
4514  y[j+2UL] -= sum( xmm3 ) * scalar;
4515  y[j+3UL] -= sum( xmm4 ) * scalar;
4516  y[j+4UL] -= sum( xmm5 ) * scalar;
4517  y[j+5UL] -= sum( xmm6 ) * scalar;
4518  y[j+6UL] -= sum( xmm7 ) * scalar;
4519  y[j+7UL] -= sum( xmm8 ) * scalar;
4520 
// Element-wise clean-up of the rows in [ipos, iend).
4521  for( ; remainder && i<iend; ++i ) {
4522  y[j ] -= x[i] * A(i,j ) * scalar;
4523  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4524  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4525  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4526  y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4527  y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4528  y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4529  y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4530  }
4531  }
4532 
// Blocks of four columns; same scheme as above.
4533  for( ; (j+4UL) <= N; j+=4UL )
4534  {
4535  const size_t ibegin( ( IsLower<MT1>::value )
4536  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4537  :( 0UL ) );
4538  const size_t iend( ( IsUpper<MT1>::value )
4539  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4540  :( M ) );
4541  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4542 
4543  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4544  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4545 
4546  SIMDType xmm1, xmm2, xmm3, xmm4;
4547  size_t i( ibegin );
4548 
4549  for( ; i<ipos; i+=SIMDSIZE ) {
4550  const SIMDType x1( x.load(i) );
4551  xmm1 = xmm1 + x1 * A.load(i,j );
4552  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4553  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4554  xmm4 = xmm4 + x1 * A.load(i,j+3UL);
4555  }
4556 
4557  y[j ] -= sum( xmm1 ) * scalar;
4558  y[j+1UL] -= sum( xmm2 ) * scalar;
4559  y[j+2UL] -= sum( xmm3 ) * scalar;
4560  y[j+3UL] -= sum( xmm4 ) * scalar;
4561 
4562  for( ; remainder && i<iend; ++i ) {
4563  y[j ] -= x[i] * A(i,j ) * scalar;
4564  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4565  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4566  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4567  }
4568  }
4569 
// Blocks of three columns.
4570  for( ; (j+3UL) <= N; j+=3UL )
4571  {
4572  const size_t ibegin( ( IsLower<MT1>::value )
4573  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4574  :( 0UL ) );
4575  const size_t iend( ( IsUpper<MT1>::value )
4576  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
4577  :( M ) );
4578  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4579 
4580  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4581  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4582 
4583  SIMDType xmm1, xmm2, xmm3;
4584  size_t i( ibegin );
4585 
4586  for( ; i<ipos; i+=SIMDSIZE ) {
4587  const SIMDType x1( x.load(i) );
4588  xmm1 = xmm1 + x1 * A.load(i,j );
4589  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4590  xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4591  }
4592 
4593  y[j ] -= sum( xmm1 ) * scalar;
4594  y[j+1UL] -= sum( xmm2 ) * scalar;
4595  y[j+2UL] -= sum( xmm3 ) * scalar;
4596 
4597  for( ; remainder && i<iend; ++i ) {
4598  y[j ] -= x[i] * A(i,j ) * scalar;
4599  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4600  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4601  }
4602  }
4603 
// Blocks of two columns.
4604  for( ; (j+2UL) <= N; j+=2UL )
4605  {
4606  const size_t ibegin( ( IsLower<MT1>::value )
4607  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4608  :( 0UL ) );
4609  const size_t iend( ( IsUpper<MT1>::value )
4610  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4611  :( M ) );
4612  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4613 
4614  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4615  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4616 
4617  SIMDType xmm1, xmm2;
4618  size_t i( ibegin );
4619 
4620  for( ; i<ipos; i+=SIMDSIZE ) {
4621  const SIMDType x1( x.load(i) );
4622  xmm1 = xmm1 + x1 * A.load(i,j );
4623  xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4624  }
4625 
4626  y[j ] -= sum( xmm1 ) * scalar;
4627  y[j+1UL] -= sum( xmm2 ) * scalar;
4628 
4629  for( ; remainder && i<iend; ++i ) {
4630  y[j ] -= x[i] * A(i,j ) * scalar;
4631  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4632  }
4633  }
4634 
// At most one column is left at this point.
4635  if( j < N )
4636  {
4637  const size_t ibegin( ( IsLower<MT1>::value )
4638  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4639  :( 0UL ) );
4640  const size_t iend( ( IsUpper<MT1>::value )
4641  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4642  :( M ) );
4643  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4644 
4645  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4646  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4647 
4648  SIMDType xmm1;
4649  size_t i( ibegin );
4650 
4651  for( ; i<ipos; i+=SIMDSIZE ) {
4652  xmm1 = xmm1 + A.load(i,j) * x.load(i);
4653  }
4654 
4655  y[j] -= sum( xmm1 ) * scalar;
4656 
4657  for( ; remainder && i<iend; ++i ) {
4658  y[j] -= x[i] * A(i,j) * scalar;
4659  }
4660  }
4661  }
4662  //**********************************************************************************************
4663 
4664  //**Default subtraction assignment to dense vectors (large matrices)****************************
4678  template< typename VT1 // Type of the left-hand side target vector
4679  , typename VT2 // Type of the left-hand side vector operand
4680  , typename MT1 // Type of the right-hand side matrix operand
4681  , typename ST2 > // Type of the scalar value
4682  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4683  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4684  {
4685  selectDefaultSubAssignKernel( y, x, A, scalar );
4686  }
4687  //**********************************************************************************************
4688 
4689  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
4704  template< typename VT1 // Type of the left-hand side target vector
4705  , typename VT2 // Type of the left-hand side vector operand
4706  , typename MT1 // Type of the right-hand side matrix operand
4707  , typename ST2 > // Type of the scalar value
4708  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4709  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4710  {
4711  const size_t M( A.rows() );
4712  const size_t N( A.columns() );
4713 
4714  const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4715 
4716  size_t j( 0UL );
4717 
4718  for( ; (j+8UL) <= N; j+=8UL )
4719  {
4720  const size_t ibegin( ( IsLower<MT1>::value )
4721  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4722  :( 0UL ) );
4723  const size_t iend( ( IsUpper<MT1>::value )
4724  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4725  :( M ) );
4726  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4727 
4728  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4729  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4730 
4731  size_t i( ibegin );
4732 
4733  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4734  const size_t i1( i+SIMDSIZE );
4735  const size_t i2( i+SIMDSIZE*2UL );
4736  const size_t i3( i+SIMDSIZE*3UL );
4737  const SIMDType x1( x.load(i ) );
4738  const SIMDType x2( x.load(i1) );
4739  const SIMDType x3( x.load(i2) );
4740  const SIMDType x4( x.load(i3) );
4741  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4742  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4743  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4744  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4745  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4746  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4747  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4748  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4749  }
4750 
4751  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4752  const size_t i1( i+SIMDSIZE );
4753  const SIMDType x1( x.load(i ) );
4754  const SIMDType x2( x.load(i1) );
4755  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4756  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4757  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4758  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4759  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4760  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4761  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4762  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4763  }
4764 
4765  for( ; i<ipos; i+=SIMDSIZE ) {
4766  const SIMDType x1( x.load(i) );
4767  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4768  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4769  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
4770  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
4771  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) ) * scalar;
4772  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) ) * scalar;
4773  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) ) * scalar;
4774  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) ) * scalar;
4775  }
4776 
4777  for( ; remainder && i<iend; ++i ) {
4778  y[j ] -= x[i] * A(i,j ) * scalar;
4779  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4780  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4781  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4782  y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4783  y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4784  y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4785  y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4786  }
4787  }
4788 
4789  for( ; (j+4UL) <= N; j+=4UL )
4790  {
4791  const size_t ibegin( ( IsLower<MT1>::value )
4792  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4793  :( 0UL ) );
4794  const size_t iend( ( IsUpper<MT1>::value )
4795  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4796  :( M ) );
4797  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4798 
4799  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4800  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4801 
4802  size_t i( ibegin );
4803 
4804  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4805  const size_t i1( i+SIMDSIZE );
4806  const size_t i2( i+SIMDSIZE*2UL );
4807  const size_t i3( i+SIMDSIZE*3UL );
4808  const SIMDType x1( x.load(i ) );
4809  const SIMDType x2( x.load(i1) );
4810  const SIMDType x3( x.load(i2) );
4811  const SIMDType x4( x.load(i3) );
4812  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4813  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4814  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4815  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4816  }
4817 
4818  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4819  const size_t i1( i+SIMDSIZE );
4820  const SIMDType x1( x.load(i ) );
4821  const SIMDType x2( x.load(i1) );
4822  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4823  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4824  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4825  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4826  }
4827 
4828  for( ; i<ipos; i+=SIMDSIZE ) {
4829  const SIMDType x1( x.load(i) );
4830  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4831  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4832  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
4833  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
4834  }
4835 
4836  for( ; remainder && i<iend; ++i ) {
4837  y[j ] -= x[i] * A(i,j ) * scalar;
4838  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4839  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4840  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4841  }
4842  }
4843 
4844  for( ; (j+2UL) <= N; j+=2UL )
4845  {
4846  const size_t ibegin( ( IsLower<MT1>::value )
4847  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4848  :( 0UL ) );
4849  const size_t iend( ( IsUpper<MT1>::value )
4850  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4851  :( M ) );
4852  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4853 
4854  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4855  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4856 
4857  size_t i( ibegin );
4858 
4859  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4860  const size_t i1( i+SIMDSIZE );
4861  const size_t i2( i+SIMDSIZE*2UL );
4862  const size_t i3( i+SIMDSIZE*3UL );
4863  const SIMDType x1( x.load(i ) );
4864  const SIMDType x2( x.load(i1) );
4865  const SIMDType x3( x.load(i2) );
4866  const SIMDType x4( x.load(i3) );
4867  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4868  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4869  }
4870 
4871  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4872  const size_t i1( i+SIMDSIZE );
4873  const SIMDType x1( x.load(i ) );
4874  const SIMDType x2( x.load(i1) );
4875  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4876  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4877  }
4878 
4879  for( ; i<ipos; i+=SIMDSIZE ) {
4880  const SIMDType x1( x.load(i) );
4881  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4882  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4883  }
4884 
4885  for( ; remainder && i<iend; ++i ) {
4886  y[j ] -= x[i] * A(i,j ) * scalar;
4887  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4888  }
4889  }
4890 
4891  if( j < N )
4892  {
4893  const size_t ibegin( ( IsLower<MT1>::value )
4894  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4895  :( 0UL ) );
4896  const size_t iend( ( IsUpper<MT1>::value )
4897  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4898  :( M ) );
4899  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4900 
4901  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4902  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4903 
4904  size_t i( ibegin );
4905 
4906  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4907  const size_t i1( i+SIMDSIZE );
4908  const size_t i2( i+SIMDSIZE*2UL );
4909  const size_t i3( i+SIMDSIZE*3UL );
4910  const SIMDType x1( x.load(i ) );
4911  const SIMDType x2( x.load(i1) );
4912  const SIMDType x3( x.load(i2) );
4913  const SIMDType x4( x.load(i3) );
4914  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4915  }
4916 
4917  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4918  const size_t i1( i+SIMDSIZE );
4919  const SIMDType x1( x.load(i ) );
4920  const SIMDType x2( x.load(i1) );
4921  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4922  }
4923 
4924  for( ; i<ipos; i+=SIMDSIZE ) {
4925  const SIMDType x1( x.load(i) );
4926  y[j] -= sum( x1 * A.load(i,j) ) * scalar;
4927  }
4928 
4929  for( ; remainder && i<iend; ++i ) {
4930  y[j] -= x[i] * A(i,j) * scalar;
4931  }
4932  }
4933  }
4934  //**********************************************************************************************
4935 
4936  //**BLAS-based subtraction assignment to dense vectors (default)********************************
 // Fallback overload of the BLAS-based subtraction assignment kernel.
 // It is selected through DisableIf_ whenever UseBlasKernel<VT1,VT2,MT1,ST2> does not
 // apply, and forwards all four arguments unchanged to selectLargeSubAssignKernel().
 //
 // y      : target vector of the subtraction assignment
 // x      : left-hand side vector operand
 // A      : right-hand side matrix operand
 // scalar : scaling factor passed through to the large kernel
4951  template< typename VT1 // Type of the left-hand side target vector
4952  , typename VT2 // Type of the left-hand side vector operand
4953  , typename MT1 // Type of the right-hand side matrix operand
4954  , typename ST2 > // Type of the scalar value
4955  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4956  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4957  {
 // No BLAS kernel usable for this type combination: delegate to the large kernel.
4958  selectLargeSubAssignKernel( y, x, A, scalar );
4959  }
4960  //**********************************************************************************************
4961 
4962  //**BLAS-based subtraction assignment to dense vectors******************************************
4963 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4964 
 // BLAS-backed overload of the subtraction assignment kernel, enabled through EnableIf_
 // when UseBlasKernel<VT1,VT2,MT1,ST2> applies (and only compiled when the surrounding
 // BLAZE_BLAS_MODE / BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION switch is active).
 //
 // y      : target vector of the subtraction assignment
 // x      : left-hand side vector operand
 // A      : right-hand side matrix operand
 // scalar : scaling factor of the product
4977  template< typename VT1 // Type of the left-hand side target vector
4978  , typename VT2 // Type of the left-hand side vector operand
4979  , typename MT1 // Type of the right-hand side matrix operand
4980  , typename ST2 > // Type of the scalar value
4981  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4982  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4983  {
 // Element type of the target vector; used to convert the scaling factors for gemv().
4984  typedef ElementType_<VT1> ET;
4985 
4986  if( IsTriangular<MT1>::value ) {
 // Triangular matrix: the scaled vector is first evaluated serially into a temporary,
 // the temporary is handed to trmv() together with A and the lower/upper flag chosen
 // from IsLower<MT1>, and the result is subtracted from the target.
4987  ResultType_<VT1> tmp( serial( scalar * x ) );
4988  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4989  subAssign( y, tmp );
4990  }
4991  else {
 // General matrix: a single gemv() call with the factors ET(-scalar) and ET(1).
 // NOTE(review): presumably these are the BLAS alpha/beta factors, i.e. the call
 // accumulates -scalar*(x*A) onto y -- confirm against the gemv wrapper's signature.
4992  gemv( y, x, A, ET(-scalar), ET(1) );
4993  }
4994  }
4995 #endif
4996  //**********************************************************************************************
4997 
4998  //**Subtraction assignment to sparse vectors****************************************************
4999  // No special implementation for the subtraction assignment to sparse vectors.
5000  //**********************************************************************************************
5001 
5002  //**Multiplication assignment to dense vectors**************************************************
 // Multiplication assignment of the scaled vector/matrix product expression to a dense vector.
 // The expression is evaluated into a temporary of type ResultType, which is then passed
 // to multAssign() together with the target.
 //
 // lhs : target dense vector
 // rhs : scaled multiplication expression to be multiplied into the target
 //
 // NOTE(review): the listing numbers skip inside this body (5017 -> 5019 -> 5022), so
 // statements of the original header may be missing from this extract.
5014  template< typename VT1 // Type of the target dense vector
5015  , bool TF > // Transpose flag of the target dense vector
5016  friend inline void multAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5017  {
5019 
5022  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
5023 
5024  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5025 
 // Evaluate the expression serially into a temporary before combining it with the target.
5026  const ResultType tmp( serial( rhs ) );
5027  multAssign( ~lhs, tmp );
5028  }
5029  //**********************************************************************************************
5030 
5031  //**Multiplication assignment to sparse vectors*************************************************
5032  // No special implementation for the multiplication assignment to sparse vectors.
5033  //**********************************************************************************************
5034 
5035  //**Division assignment to dense vectors********************************************************
 // Division assignment of the scaled vector/matrix product expression to a dense vector.
 // The expression is evaluated into a temporary of type ResultType, which is then passed
 // to divAssign() together with the target.
 //
 // lhs : target dense vector
 // rhs : scaled multiplication expression acting as divisor
 //
 // NOTE(review): the listing numbers skip inside this body (5050 -> 5052 -> 5055), so
 // statements of the original header may be missing from this extract.
5047  template< typename VT1 // Type of the target dense vector
5048  , bool TF > // Transpose flag of the target dense vector
5049  friend inline void divAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5050  {
5052 
5055  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
5056 
5057  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5058 
 // Evaluate the expression serially into a temporary before combining it with the target.
5059  const ResultType tmp( serial( rhs ) );
5060  divAssign( ~lhs, tmp );
5061  }
5062  //**********************************************************************************************
5063 
5064  //**Division assignment to sparse vectors*******************************************************
5065  // No special implementation for the division assignment to sparse vectors.
5066  //**********************************************************************************************
5067 
5068  //**SMP assignment to dense vectors*************************************************************
 // SMP assignment of the scaled vector/matrix product expression to a dense vector.
 // Only participates in overload resolution when UseSMPAssign<VT1> applies. Both operands
 // of the wrapped multiplication are evaluated first (as LT / RT), and the re-assembled
 // expression x * A * scalar is then handed to smpAssign().
 //
 // lhs : target dense vector
 // rhs : scaled multiplication expression to be assigned
5082  template< typename VT1 // Type of the target dense vector
5083  , bool TF > // Transpose flag of the target dense vector
5084  friend inline EnableIf_< UseSMPAssign<VT1> >
5085  smpAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5086  {
5088 
5089  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5090 
5091  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5092  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5093 
 // A matrix without rows contributes nothing to any element: the target is reset.
5094  if( right.rows() == 0UL ) {
5095  reset( ~lhs );
5096  return;
5097  }
 // A matrix without columns leaves nothing to assign.
5098  else if( right.columns() == 0UL ) {
5099  return;
5100  }
5101 
5102  LT x( left ); // Evaluation of the left-hand side dense vector operand
5103  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5104 
5105  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5106  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5107  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5108  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5109 
 // Rebuild the scaled product from the evaluated operands and assign it.
5110  smpAssign( ~lhs, x * A * rhs.scalar_ );
5111  }
5112  //**********************************************************************************************
5113 
5114  //**SMP assignment to sparse vectors************************************************************
 // SMP assignment of the scaled vector/matrix product expression to a sparse vector.
 // Only participates in overload resolution when UseSMPAssign<VT1> applies. The expression
 // is evaluated into a temporary of type ResultType, which is then passed to smpAssign().
 //
 // lhs : target sparse vector
 // rhs : scaled multiplication expression to be assigned
5128  template< typename VT1 // Type of the target sparse vector
5129  , bool TF > // Transpose flag of the target sparse vector
5130  friend inline EnableIf_< UseSMPAssign<VT1> >
5131  smpAssign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5132  {
5134 
5137  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
5138 
5139  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5140 
 // Unlike the non-SMP multAssign/divAssign variants, the temporary is built from rhs
 // directly, without the serial() wrapper.
5141  const ResultType tmp( rhs );
5142  smpAssign( ~lhs, tmp );
5143  }
5144  //**********************************************************************************************
5145 
5146  //**SMP addition assignment to dense vectors****************************************************
 // SMP addition assignment of the scaled vector/matrix product expression to a dense vector.
 // Only participates in overload resolution when UseSMPAssign<VT1> applies. Both operands
 // of the wrapped multiplication are evaluated first (as LT / RT), and the re-assembled
 // expression x * A * scalar is then handed to smpAddAssign().
 //
 // lhs : target dense vector
 // rhs : scaled multiplication expression to be added
5160  template< typename VT1 // Type of the target dense vector
5161  , bool TF > // Transpose flag of the target dense vector
5162  friend inline EnableIf_< UseSMPAssign<VT1> >
5163  smpAddAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5164  {
5166 
5167  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5168 
5169  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5170  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5171 
 // An empty matrix (no rows or no columns) adds nothing: leave the target untouched.
5172  if( right.rows() == 0UL || right.columns() == 0UL ) {
5173  return;
5174  }
5175 
5176  LT x( left ); // Evaluation of the left-hand side dense vector operand
5177  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5178 
5179  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5180  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5181  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5182  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5183 
 // Rebuild the scaled product from the evaluated operands and add it to the target.
5184  smpAddAssign( ~lhs, x * A * rhs.scalar_ );
5185  }
5186  //**********************************************************************************************
5187 
5188  //**SMP addition assignment to sparse vectors***************************************************
5189  // No special implementation for the SMP addition assignment to sparse vectors.
5190  //**********************************************************************************************
5191 
5192  //**SMP subtraction assignment to dense vectors*************************************************
 // SMP subtraction assignment of the scaled vector/matrix product expression to a dense vector.
 // Only participates in overload resolution when UseSMPAssign<VT1> applies. Both operands
 // of the wrapped multiplication are evaluated first (as LT / RT), and the re-assembled
 // expression x * A * scalar is then handed to smpSubAssign().
 //
 // lhs : target dense vector
 // rhs : scaled multiplication expression to be subtracted
5206  template< typename VT1 // Type of the target dense vector
5207  , bool TF > // Transpose flag of the target dense vector
5208  friend inline EnableIf_< UseSMPAssign<VT1> >
5209  smpSubAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5210  {
5212 
5213  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5214 
5215  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5216  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5217 
 // An empty matrix (no rows or no columns) subtracts nothing: leave the target untouched.
5218  if( right.rows() == 0UL || right.columns() == 0UL ) {
5219  return;
5220  }
5221 
5222  LT x( left ); // Evaluation of the left-hand side dense vector operand
5223  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5224 
5225  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5226  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5227  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5228  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5229 
 // Rebuild the scaled product from the evaluated operands and subtract it from the target.
5230  smpSubAssign( ~lhs, x * A * rhs.scalar_ );
5231  }
5232  //**********************************************************************************************
5233 
5234  //**SMP subtraction assignment to sparse vectors************************************************
5235  // No special implementation for the SMP subtraction assignment to sparse vectors.
5236  //**********************************************************************************************
5237 
5238  //**SMP multiplication assignment to dense vectors**********************************************
 // SMP multiplication assignment of the scaled vector/matrix product expression to a dense vector.
 // Only participates in overload resolution when UseSMPAssign<VT1> applies. The expression
 // is evaluated into a temporary of type ResultType, which is then passed to smpMultAssign().
 //
 // lhs : target dense vector
 // rhs : scaled multiplication expression to be multiplied into the target
5252  template< typename VT1 // Type of the target dense vector
5253  , bool TF > // Transpose flag of the target dense vector
5254  friend inline EnableIf_< UseSMPAssign<VT1> >
5255  smpMultAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5256  {
5258 
5261  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
5262 
5263  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5264 
 // The temporary is built from rhs directly (no serial() wrapper in the SMP variant).
5265  const ResultType tmp( rhs );
5266  smpMultAssign( ~lhs, tmp );
5267  }
5268  //**********************************************************************************************
5269 
5270  //**SMP multiplication assignment to sparse vectors*********************************************
5271  // No special implementation for the SMP multiplication assignment to sparse vectors.
5272  //**********************************************************************************************
5273 
5274  //**SMP division assignment to dense vectors****************************************************
 // SMP division assignment of the scaled vector/matrix product expression to a dense vector.
 // Only participates in overload resolution when UseSMPAssign<VT1> applies. The expression
 // is evaluated into a temporary of type ResultType, which is then passed to smpDivAssign().
 //
 // lhs : target dense vector
 // rhs : scaled multiplication expression acting as divisor
5288  template< typename VT1 // Type of the target dense vector
5289  , bool TF > // Transpose flag of the target dense vector
5290  friend inline EnableIf_< UseSMPAssign<VT1> >
5291  smpDivAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5292  {
5294 
5297  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
5298 
5299  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5300 
 // The temporary is built from rhs directly (no serial() wrapper in the SMP variant).
5301  const ResultType tmp( rhs );
5302  smpDivAssign( ~lhs, tmp );
5303  }
5304  //**********************************************************************************************
5305 
5306  //**SMP division assignment to sparse vectors***************************************************
5307  // No special implementation for the SMP division assignment to sparse vectors.
5308  //**********************************************************************************************
5309 
5310  //**Compile time checks*************************************************************************
5318  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
5319  //**********************************************************************************************
5320 };
5322 //*************************************************************************************************
5323 
5324 
5325 
5326 
5327 //=================================================================================================
5328 //
5329 // GLOBAL BINARY ARITHMETIC OPERATORS
5330 //
5331 //=================================================================================================
5332 
5333 //*************************************************************************************************
// Multiplication operator for a transpose dense vector and a column-major dense matrix.
// Returns a TDVecTDMatMultExpr expression object wrapping both operands. The overload is
// disabled (DisableIf_) when T2 is itself a matrix/matrix multiplication expression.
//
// vec : left-hand side transpose dense vector
// mat : right-hand side column-major dense matrix
// Throws std::invalid_argument (via BLAZE_THROW_INVALID_ARGUMENT) when the size of the
// vector does not match the number of rows of the matrix.
//
// The function declarator (listing line 5367) was missing between the return type and the
// opening brace, leaving `vec` and `mat` undeclared; it is restored here.
5364 template< typename T1 // Type of the left-hand side dense vector
5365  , typename T2 > // Type of the right-hand side dense matrix
5366 inline const DisableIf_< IsMatMatMultExpr<T2>, TDVecTDMatMultExpr<T1,T2> >
5367  operator*( const DenseVector<T1,true>& vec, const DenseMatrix<T2,true>& mat )
5368 {
5370 
 // The operand dimensions must agree before the expression object is created.
5371  if( (~vec).size() != (~mat).rows() ) {
5372  BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
5373  }
5374 
5375  return TDVecTDMatMultExpr<T1,T2>( ~vec, ~mat );
5376 }
5377 //*************************************************************************************************
5378 
5379 
5380 
5381 
5382 //=================================================================================================
5383 //
5384 // SIZE SPECIALIZATIONS
5385 //
5386 //=================================================================================================
5387 
5388 //*************************************************************************************************
// Size trait specialization for TDVecTDMatMultExpr: defined by deriving from Columns<MT>,
// i.e. the trait of the expression is taken from the column trait of the matrix operand.
5390 template< typename VT, typename MT >
5391 struct Size< TDVecTDMatMultExpr<VT,MT> > : public Columns<MT>
5392 {};
5394 //*************************************************************************************************
5395 
5396 
5397 
5398 
5399 //=================================================================================================
5400 //
5401 // ISALIGNED SPECIALIZATIONS
5402 //
5403 //=================================================================================================
5404 
5405 //*************************************************************************************************
// IsAligned trait specialization for TDVecTDMatMultExpr: the resulting BoolConstant is
// built from IsAligned<VT> and IsAligned<MT>, combined through the And<> class template.
5407 template< typename VT, typename MT >
5408 struct IsAligned< TDVecTDMatMultExpr<VT,MT> >
5409  : public BoolConstant< And< IsAligned<VT>, IsAligned<MT> >::value >
5410 {};
5412 //*************************************************************************************************
5413 
5414 
5415 
5416 
5417 //=================================================================================================
5418 //
5419 // EXPRESSION TRAIT SPECIALIZATIONS
5420 //
5421 //=================================================================================================
5422 
5423 //*************************************************************************************************
// SubvectorExprTrait specialization for TDVecTDMatMultExpr.
// The nested Type is the multiplication expression trait of the subvector trait of the
// (const) vector operand and the submatrix trait of the (const) matrix operand, both
// instantiated with the same alignment flag AF.
5425 template< typename VT, typename MT, bool AF >
5426 struct SubvectorExprTrait< TDVecTDMatMultExpr<VT,MT>, AF >
5427 {
5428  public:
5429  //**********************************************************************************************
5430  using Type = MultExprTrait_< SubvectorExprTrait_<const VT,AF>
5431  , SubmatrixExprTrait_<const MT,AF> >;
5432  //**********************************************************************************************
5433 };
5435 //*************************************************************************************************
5436 
5437 } // namespace blaze
5438 
5439 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:353
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:341
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:138
Header file for auxiliary alias declarations.
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( ).
Definition: DMatDMatMultExpr.h:7800
Header file for basic type definitions.
BLAZE_ALWAYS_INLINE const complex< int8_t > sum(const SIMDcint8 &a) noexcept
Returns the sum of all elements in the 8-bit integral complex SIMD vector.
Definition: Reduction.h:63
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:130
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
TDVecTDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:250
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:212
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:383
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:211
If_< IsExpression< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:215
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:363
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:264
Header file for the If class template.
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:221
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:135
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the Columns type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
TDVecTDMatMultExpr< VT, MT > This
Type of this TDVecTDMatMultExpr instance.
Definition: TDVecTDMatMultExpr.h:206
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraint on the data type.
Header file for the exception macros of the math module.
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:207
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:133
If_< IsExpression< MT >, const MT, const MT & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:218
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecTDMatMultExpr.h:296
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:131
Header file for the SubmatrixExprTrait class template.
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:384
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecTDMatMultExpr.h:210
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:132
Header file for the TVecMatMultExpr base class.
Constraints on the storage order of matrix types.
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:74
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:319
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:314
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:110
Header file for the AreSIMDCombinable type trait.
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:134
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:208
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:309
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:209
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:373
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:329
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.