TDMatDVecMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
51 #include <blaze/math/Exception.h>
57 #include <blaze/math/Functions.h>
58 #include <blaze/math/shims/Reset.h>
60 #include <blaze/math/SIMD.h>
84 #include <blaze/system/BLAS.h>
87 #include <blaze/util/Assert.h>
88 #include <blaze/util/Complex.h>
91 #include <blaze/util/DisableIf.h>
92 #include <blaze/util/EnableIf.h>
95 #include <blaze/util/mpl/And.h>
96 #include <blaze/util/mpl/If.h>
97 #include <blaze/util/Types.h>
106 
107 
108 namespace blaze {
109 
110 //=================================================================================================
111 //
112 // CLASS TDMATDVECMULTEXPR
113 //
114 //=================================================================================================
115 
116 //*************************************************************************************************
123 template< typename MT // Type of the left-hand side dense matrix
124  , typename VT > // Type of the right-hand side dense vector
125 class TDMatDVecMultExpr : public DenseVector< TDMatDVecMultExpr<MT,VT>, false >
126  , private MatVecMultExpr
127  , private Computation
128 {
129  private:
130  //**Type definitions****************************************************************************
137  //**********************************************************************************************
138 
139  //**********************************************************************************************
// Compilation switch derived from the matrix operand type MT.
// NOTE(review): the expression below is truncated in this listing (the doxygen line
// numbering jumps from 141 to 143), so the closing part of the condition is not visible
// here. Only the visible part is known: IsComputation<MT> combined with IsSame<MET,VET>.
141  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
143  //**********************************************************************************************
144 
145  //**********************************************************************************************
147  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
148  //**********************************************************************************************
149 
150  //**********************************************************************************************
152 
156  template< typename T1 >
157  struct UseSMPAssign {
158  enum : bool { value = ( evaluateMatrix || evaluateVector ) };
159  };
161  //**********************************************************************************************
162 
163  //**********************************************************************************************
165 
// Helper structure used as the condition of the EnableIf_/DisableIf_ overloads of the
// selectBlas...Kernel functions.
// NOTE(review): this listing omits source lines inside the structure (the doxygen line
// numbering jumps from 169 to 175 and from 175 to 180). The opening of the nested enum
// and parts of the condition are therefore not visible here; only the SIMD requirement
// on all three types and the element type comparison of T1 and T3 can be seen.
168  template< typename T1, typename T2, typename T3 >
169  struct UseBlasKernel {
175  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
180  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
181  };
183  //**********************************************************************************************
184 
185  //**********************************************************************************************
187 
// Helper structure used as the condition of the EnableIf_/DisableIf_ overloads of the
// selectSmall...Kernel and selectLarge...Kernel functions.
// NOTE(review): this listing omits source lines inside the structure (the doxygen line
// numbering jumps 193 -> 195 -> 198 -> 201), so the condition is only partially visible:
// it requires useOptimizedKernels and SIMD support of all three types; the remaining
// terms cannot be read from here.
191  template< typename T1, typename T2, typename T3 >
192  struct UseVectorizedDefaultKernel {
193  enum : bool { value = useOptimizedKernels &&
195  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
198  , ElementType_<T3> >::value &&
201  };
203  //**********************************************************************************************
204 
205  public:
206  //**Type definitions****************************************************************************
212  typedef const ElementType ReturnType;
213  typedef const ResultType CompositeType;
214 
216  typedef If_< IsExpression<MT>, const MT, const MT& > LeftOperand;
217 
219  typedef If_< IsExpression<VT>, const VT, const VT& > RightOperand;
220 
223 
226  //**********************************************************************************************
227 
228  //**Compilation flags***************************************************************************
// Compilation switch for SIMD evaluation of the expression.
// NOTE(review): the expression is truncated in this listing (the doxygen line numbering
// jumps from 231 to 234). Visible part: never enabled for diagonal matrices and requires
// SIMD support of both operand types; the remaining terms cannot be read from here.
230  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
231  MT::simdEnabled && VT::simdEnabled &&
234 
236  enum : bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
237  !evaluateVector && VT::smpAssignable };
238  //**********************************************************************************************
239 
240  //**SIMD properties*****************************************************************************
// Value of SIMDTrait<ElementType>::size; used by the vectorized kernels as the step
// width of the row loops and as the offset between consecutive SIMD loads/stores.
242  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
243  //**********************************************************************************************
244 
245  //**Constructor*********************************************************************************
251  explicit inline TDMatDVecMultExpr( const MT& mat, const VT& vec ) noexcept
252  : mat_( mat ) // Left-hand side dense matrix of the multiplication expression
253  , vec_( vec ) // Right-hand side dense vector of the multiplication expression
254  {
255  BLAZE_INTERNAL_ASSERT( mat_.columns() == vec_.size(), "Invalid matrix and vector sizes" );
256  }
257  //**********************************************************************************************
258 
259  //**Subscript operator**************************************************************************
265  inline ReturnType operator[]( size_t index ) const {
266  BLAZE_INTERNAL_ASSERT( index < mat_.rows(), "Invalid vector access index" );
267 
269  {
270  return mat_(index,index) * vec_[index];
271  }
272  else if( IsLower<MT>::value && ( index + 8UL < mat_.rows() ) )
273  {
274  const size_t n( IsStrictlyLower<MT>::value ? index : index+1UL );
275  return subvector( row( mat_, index ), 0UL, n ) * subvector( vec_, 0UL, n );
276  }
277  else if( IsUpper<MT>::value && ( index > 8UL ) )
278  {
279  const size_t begin( IsStrictlyUpper<MT>::value ? index+1UL : index );
280  const size_t n ( mat_.columns() - begin );
281  return subvector( row( mat_, index ), begin, n ) * subvector( vec_, begin, n );
282  }
283  else
284  {
285  return row( mat_, index ) * vec_;
286  }
287  }
288  //**********************************************************************************************
289 
290  //**At function*********************************************************************************
297  inline ReturnType at( size_t index ) const {
298  if( index >= mat_.rows() ) {
299  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
300  }
301  return (*this)[index];
302  }
303  //**********************************************************************************************
304 
305  //**Size function*******************************************************************************
310  inline size_t size() const noexcept {
311  return mat_.rows();
312  }
313  //**********************************************************************************************
314 
315  //**Left operand access*************************************************************************
320  inline LeftOperand leftOperand() const noexcept {
321  return mat_;
322  }
323  //**********************************************************************************************
324 
325  //**Right operand access************************************************************************
330  inline RightOperand rightOperand() const noexcept {
331  return vec_;
332  }
333  //**********************************************************************************************
334 
335  //**********************************************************************************************
341  template< typename T >
342  inline bool canAlias( const T* alias ) const noexcept {
343  return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
344  }
345  //**********************************************************************************************
346 
347  //**********************************************************************************************
353  template< typename T >
354  inline bool isAliased( const T* alias ) const noexcept {
355  return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
356  }
357  //**********************************************************************************************
358 
359  //**********************************************************************************************
364  inline bool isAligned() const noexcept {
365  return mat_.isAligned() && vec_.isAligned();
366  }
367  //**********************************************************************************************
368 
369  //**********************************************************************************************
374  inline bool canSMPAssign() const noexcept {
375  return ( !BLAZE_BLAS_IS_PARALLEL ||
376  ( IsComputation<MT>::value && !evaluateMatrix ) ||
377  ( mat_.rows() * mat_.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
378  ( size() > SMP_TDMATDVECMULT_THRESHOLD );
379  }
380  //**********************************************************************************************
381 
382  private:
383  //**Member variables****************************************************************************
// Left-hand side dense matrix of the multiplication expression.
384  LeftOperand mat_;
// Right-hand side dense vector of the multiplication expression.
385  RightOperand vec_;
386  //**********************************************************************************************
387 
388  //**Assignment to dense vectors*****************************************************************
401  template< typename VT1 > // Type of the target dense vector
402  friend inline void assign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
403  {
405 
406  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
407 
408  if( rhs.mat_.rows() == 0UL ) {
409  return;
410  }
411  else if( rhs.mat_.columns() == 0UL ) {
412  reset( ~lhs );
413  return;
414  }
415 
416  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
417  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
418 
419  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
420  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
421  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
422  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
423 
424  TDMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
425  }
427  //**********************************************************************************************
428 
429  //**Assignment to dense vectors (kernel selection)**********************************************
440  template< typename VT1 // Type of the left-hand side target vector
441  , typename MT1 // Type of the left-hand side matrix operand
442  , typename VT2 > // Type of the right-hand side vector operand
443  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
444  {
445  if( ( IsDiagonal<MT1>::value ) ||
446  ( IsComputation<MT>::value && !evaluateMatrix ) ||
447  ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
448  selectSmallAssignKernel( y, A, x );
449  else
450  selectBlasAssignKernel( y, A, x );
451  }
453  //**********************************************************************************************
454 
455  //**Default assignment to dense vectors*********************************************************
469  template< typename VT1 // Type of the left-hand side target vector
470  , typename MT1 // Type of the left-hand side matrix operand
471  , typename VT2 > // Type of the right-hand side vector operand
472  static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
473  {
474  const size_t M( A.rows() );
475  const size_t N( A.columns() );
476 
477  if( IsStrictlyLower<MT1>::value ) {
478  reset( y[0] );
479  }
480 
481  if( !IsUpper<MT1>::value )
482  {
483  for( size_t i=( IsStrictlyLower<MT1>::value ? 1UL : 0UL ); i<M; ++i ) {
484  y[i] = A(i,0UL) * x[0UL];
485  }
486  }
487 
488  for( size_t j=( IsUpper<MT1>::value && !IsStrictlyUpper<MT1>::value ? 0UL : 1UL ); j<N; ++j )
489  {
490  if( IsDiagonal<MT1>::value )
491  {
492  y[j] = A(j,j) * x[j];
493  }
494  else
495  {
496  const size_t ibegin( ( IsLower<MT1>::value )
497  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
498  :( 0UL ) );
499  const size_t iend( ( IsUpper<MT1>::value )
500  ?( IsStrictlyUpper<MT1>::value ? j-1UL : j )
501  :( M ) );
502  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
503 
504  const size_t inum( iend - ibegin );
505  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
506 
507  for( size_t i=ibegin; i<ipos; i+=2UL ) {
508  y[i ] += A(i ,j) * x[j];
509  y[i+1UL] += A(i+1UL,j) * x[j];
510  }
511  if( ipos < iend ) {
512  y[ipos] += A(ipos,j) * x[j];
513  }
514  if( IsUpper<MT1>::value ) {
515  y[iend] = A(iend,j) * x[j];
516  }
517  }
518  }
519 
520  if( IsStrictlyUpper<MT1>::value ) {
521  reset( y[M-1UL] );
522  }
523  }
525  //**********************************************************************************************
526 
527  //**Default assignment to dense vectors (small matrices)****************************************
541  template< typename VT1 // Type of the left-hand side target vector
542  , typename MT1 // Type of the left-hand side matrix operand
543  , typename VT2 > // Type of the right-hand side vector operand
544  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
545  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
546  {
547  selectDefaultAssignKernel( y, A, x );
548  }
550  //**********************************************************************************************
551 
552  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Vectorized assignment kernel for small matrices (y = A * x). Selected whenever
// UseVectorizedDefaultKernel holds for the given types.
//
// The rows of A are processed in blocks of 8, 4, 3, 2 and finally 1 SIMD vector(s) of
// SIMDSIZE elements each. For every block the products A(:,j) * x[j] are summed up over
// the relevant columns j in local SIMD accumulators, which are then stored to y. The
// target y is only written, never read.
//
// y: target left-hand side dense vector
// A: left-hand side dense matrix operand
// x: right-hand side dense vector operand
566  template< typename VT1 // Type of the left-hand side target vector
567  , typename MT1 // Type of the left-hand side matrix operand
568  , typename VT2 > // Type of the right-hand side vector operand
569  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
570  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x )
571  {
572  const size_t M( A.rows() );
573  const size_t N( A.columns() );
574 
// A scalar remainder loop is required unless both the matrix and the target are padded
575  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
576 
// ipos: end of the row range handled by the SIMD loops. With a remainder, M is rounded
// down to a multiple of SIMDSIZE (see the assertion below); otherwise all M rows are
// handled by the SIMD loops.
577  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
578  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
579 
// Row index shared by all following loops; each loop continues where the previous stopped
580  size_t i( 0UL );
581 
// Blocks of 8 SIMD vectors
582  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
583  {
// Column range [jbegin,jend) for this row block: for upper matrices the columns left of
// the block's first row are skipped, for lower matrices the columns right of the block's
// last row (strictly upper/lower additionally exclude one more column).
584  const size_t jbegin( ( IsUpper<MT1>::value )
585  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
586  :( 0UL ) );
587  const size_t jend( ( IsLower<MT1>::value )
588  ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
589  :( N ) );
590  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
591 
// NOTE(review): the accumulators are used without explicit initialization; this
// presumably relies on SIMDType's default constructor producing zero - confirm.
592  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
593 
594  for( size_t j=jbegin; j<jend; ++j ) {
// x[j] is passed through set() once per column and reused for all 8 row vectors
595  const SIMDType x1( set( x[j] ) );
596  xmm1 = xmm1 + A.load(i ,j) * x1;
597  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
598  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
599  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
600  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
601  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
602  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
603  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
604  }
605 
606  y.store( i , xmm1 );
607  y.store( i+SIMDSIZE , xmm2 );
608  y.store( i+SIMDSIZE*2UL, xmm3 );
609  y.store( i+SIMDSIZE*3UL, xmm4 );
610  y.store( i+SIMDSIZE*4UL, xmm5 );
611  y.store( i+SIMDSIZE*5UL, xmm6 );
612  y.store( i+SIMDSIZE*6UL, xmm7 );
613  y.store( i+SIMDSIZE*7UL, xmm8 );
614  }
615 
// Blocks of 4 SIMD vectors (same scheme as above)
616  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
617  {
618  const size_t jbegin( ( IsUpper<MT1>::value )
619  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
620  :( 0UL ) );
621  const size_t jend( ( IsLower<MT1>::value )
622  ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
623  :( N ) );
624  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
625 
626  SIMDType xmm1, xmm2, xmm3, xmm4;
627 
628  for( size_t j=jbegin; j<jend; ++j ) {
629  const SIMDType x1( set( x[j] ) );
630  xmm1 = xmm1 + A.load(i ,j) * x1;
631  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
632  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
633  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
634  }
635 
636  y.store( i , xmm1 );
637  y.store( i+SIMDSIZE , xmm2 );
638  y.store( i+SIMDSIZE*2UL, xmm3 );
639  y.store( i+SIMDSIZE*3UL, xmm4 );
640  }
641 
// Blocks of 3 SIMD vectors
642  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
643  {
644  const size_t jbegin( ( IsUpper<MT1>::value )
645  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
646  :( 0UL ) );
647  const size_t jend( ( IsLower<MT1>::value )
648  ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
649  :( N ) );
650  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
651 
652  SIMDType xmm1, xmm2, xmm3;
653 
654  for( size_t j=jbegin; j<jend; ++j ) {
655  const SIMDType x1( set( x[j] ) );
656  xmm1 = xmm1 + A.load(i ,j) * x1;
657  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
658  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
659  }
660 
661  y.store( i , xmm1 );
662  y.store( i+SIMDSIZE , xmm2 );
663  y.store( i+SIMDSIZE*2UL, xmm3 );
664  }
665 
// Blocks of 2 SIMD vectors
666  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
667  {
668  const size_t jbegin( ( IsUpper<MT1>::value )
669  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
670  :( 0UL ) );
671  const size_t jend( ( IsLower<MT1>::value )
672  ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
673  :( N ) );
674  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
675 
676  SIMDType xmm1, xmm2;
677 
678  for( size_t j=jbegin; j<jend; ++j ) {
679  const SIMDType x1( set( x[j] ) );
680  xmm1 = xmm1 + A.load(i ,j) * x1;
681  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
682  }
683 
684  y.store( i , xmm1 );
685  y.store( i+SIMDSIZE, xmm2 );
686  }
687 
// Single SIMD vectors
688  for( ; i<ipos; i+=SIMDSIZE )
689  {
690  const size_t jbegin( ( IsUpper<MT1>::value )
691  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
692  :( 0UL ) );
693  const size_t jend( ( IsLower<MT1>::value )
694  ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
695  :( N ) );
696  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
697 
698  SIMDType xmm1;
699 
700  for( size_t j=jbegin; j<jend; ++j ) {
701  xmm1 = xmm1 + A.load(i,j) * set( x[j] );
702  }
703 
704  y.store( i, xmm1 );
705  }
706 
// Scalar processing of the rows [ipos,M) that were not covered by the SIMD loops
707  for( ; remainder && i<M; ++i )
708  {
709  const size_t jbegin( ( IsUpper<MT1>::value )
710  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
711  :( 0UL ) );
712  const size_t jend( ( IsLower<MT1>::value )
713  ?( min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
714  :( N ) );
715  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
716 
717  ElementType value = ElementType();
718 
719  for( size_t j=jbegin; j<jend; ++j ) {
720  value += A(i,j) * x[j];
721  }
722 
723  y[i] = value;
724  }
725  }
727  //**********************************************************************************************
728 
729  //**Default assignment to dense vectors (large matrices)****************************************
743  template< typename VT1 // Type of the left-hand side target vector
744  , typename MT1 // Type of the left-hand side matrix operand
745  , typename VT2 > // Type of the right-hand side vector operand
746  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
747  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
748  {
749  selectDefaultAssignKernel( y, A, x );
750  }
752  //**********************************************************************************************
753 
754  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Vectorized assignment kernel for large matrices (y = A * x). Selected whenever
// UseVectorizedDefaultKernel holds for the given types.
//
// The target y is reset first. The matrix is then traversed in tiles of iblock rows and
// jblock columns; within each tile the rows are processed in blocks of 8, 4, 3, 2 and 1
// SIMD vector(s), and the partial sums of the tile are added to the current content of y.
//
// y: target left-hand side dense vector
// A: left-hand side dense matrix operand
// x: right-hand side dense vector operand
768  template< typename VT1 // Type of the left-hand side target vector
769  , typename MT1 // Type of the left-hand side matrix operand
770  , typename VT2 > // Type of the right-hand side vector operand
771  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
772  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x )
773  {
774  const size_t M( A.rows() );
775  const size_t N( A.columns() );
776 
// A scalar remainder loop is required unless both the matrix and the target are padded
777  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
778 
// Tile sizes: iblock rows correspond to 32768 bytes of ElementType values; jblock is 8
// columns if the matrix has fewer than iblock columns and 4 columns otherwise.
// NOTE(review): 32768 looks like a cache-size based tuning constant - confirm.
779  const size_t iblock( 32768UL / sizeof( ElementType ) );
780  const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
781 
782  BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
783 
// All tiles below accumulate into y, hence y has to start from its reset state
784  reset( y );
785 
786  for( size_t ii=0U; ii<M; ii+=iblock ) {
787  for( size_t jj=0UL; jj<N; jj+=jblock )
788  {
// [jj,jend): columns of this tile; itmp: unrestricted row end of this tile
789  const size_t jend( min( jj+jblock, N ) );
790  const size_t itmp( min( ii+iblock, M ) );
// For upper matrices the row range is additionally cut off at the tile's last column
791  const size_t iend( ( IsUpper<MT1>::value )
792  ?( min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
793  :( itmp ) );
794 
// ipos: end of the row range handled by the SIMD loops (iend rounded down to a multiple
// of SIMDSIZE if a remainder loop is required, see the assertion below)
795  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
796  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
797 
// First row of the tile; for lower matrices the rows above the tile's first column are
// skipped, with the start index rounded down to a multiple of SIMDSIZE
798  size_t i( ( IsLower<MT1>::value )
799  ?( max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) & size_t(-SIMDSIZE) ) )
800  :( ii ) );
801 
// Blocks of 8 SIMD vectors
802  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
803  {
// NOTE(review): the accumulators are used without explicit initialization; this
// presumably relies on SIMDType's default constructor producing zero - confirm.
804  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
805 
806  for( size_t j=jj; j<jend; ++j ) {
807  const SIMDType x1( set( x[j] ) );
808  xmm1 = xmm1 + A.load(i ,j) * x1;
809  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
810  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
811  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
812  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
813  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
814  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
815  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
816  }
817 
// The partial sums of this tile are added to the values already stored in y
818  y.store( i , y.load(i ) + xmm1 );
819  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
820  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
821  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
822  y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
823  y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
824  y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
825  y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
826  }
827 
// Blocks of 4 SIMD vectors
828  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
829  {
830  SIMDType xmm1, xmm2, xmm3, xmm4;
831 
832  for( size_t j=jj; j<jend; ++j ) {
833  const SIMDType x1( set( x[j] ) );
834  xmm1 = xmm1 + A.load(i ,j) * x1;
835  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
836  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
837  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
838  }
839 
840  y.store( i , y.load(i ) + xmm1 );
841  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
842  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
843  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
844  }
845 
// Blocks of 3 SIMD vectors
846  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
847  {
848  SIMDType xmm1, xmm2, xmm3;
849 
850  for( size_t j=jj; j<jend; ++j ) {
851  const SIMDType x1( set( x[j] ) );
852  xmm1 = xmm1 + A.load(i ,j) * x1;
853  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
854  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
855  }
856 
857  y.store( i , y.load(i ) + xmm1 );
858  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
859  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
860  }
861 
// Blocks of 2 SIMD vectors
862  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
863  {
864  SIMDType xmm1, xmm2;
865 
866  for( size_t j=jj; j<jend; ++j ) {
867  const SIMDType x1( set( x[j] ) );
868  xmm1 = xmm1 + A.load(i ,j) * x1;
869  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
870  }
871 
872  y.store( i , y.load(i ) + xmm1 );
873  y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
874  }
875 
// Single SIMD vectors
876  for( ; i<ipos; i+=SIMDSIZE )
877  {
878  SIMDType xmm1;
879 
880  for( size_t j=jj; j<jend; ++j ) {
881  xmm1 = xmm1 + A.load(i,j) * set( x[j] );
882  }
883 
884  y.store( i, y.load(i) + xmm1 );
885  }
886 
// Scalar processing of the rows [ipos,iend) that were not covered by the SIMD loops
887  for( ; remainder && i<iend; ++i )
888  {
889  ElementType value = ElementType();
890 
891  for( size_t j=jj; j<jend; ++j ) {
892  value += A(i,j) * x[j];
893  }
894 
895  y[i] += value;
896  }
897  }
898  }
899  }
901  //**********************************************************************************************
902 
903  //**BLAS-based assignment to dense vectors (default)********************************************
917  template< typename VT1 // Type of the left-hand side target vector
918  , typename MT1 // Type of the left-hand side matrix operand
919  , typename VT2 > // Type of the right-hand side vector operand
920  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
921  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
922  {
923  selectLargeAssignKernel( y, A, x );
924  }
926  //**********************************************************************************************
927 
928  //**BLAS-based assignment to dense vectors******************************************************
929 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
930 
943  template< typename VT1 // Type of the left-hand side target vector
944  , typename MT1 // Type of the left-hand side matrix operand
945  , typename VT2 > // Type of the right-hand side vector operand
946  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
947  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
948  {
949  typedef ElementType_<VT1> ET;
950 
951  if( IsTriangular<MT1>::value ) {
952  assign( y, x );
953  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
954  }
955  else {
956  gemv( y, A, x, ET(1), ET(0) );
957  }
958  }
960 #endif
961  //**********************************************************************************************
962 
963  //**Assignment to sparse vectors****************************************************************
976  template< typename VT1 > // Type of the target sparse vector
977  friend inline void assign( SparseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
978  {
980 
983  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
984 
985  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
986 
987  const ResultType tmp( serial( rhs ) );
988  assign( ~lhs, tmp );
989  }
991  //**********************************************************************************************
992 
993  //**Addition assignment to dense vectors********************************************************
1006  template< typename VT1 > // Type of the target dense vector
1007  friend inline void addAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
1008  {
1010 
1011  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1012 
1013  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1014  return;
1015  }
1016 
1017  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1018  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1019 
1020  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1021  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1022  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1023  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1024 
1025  TDMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
1026  }
1028  //**********************************************************************************************
1029 
1030  //**Addition assignment to dense vectors (kernel selection)*************************************
1041  template< typename VT1 // Type of the left-hand side target vector
1042  , typename MT1 // Type of the left-hand side matrix operand
1043  , typename VT2 > // Type of the right-hand side vector operand
1044  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1045  {
1046  if( ( IsDiagonal<MT1>::value ) ||
1047  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1048  ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1049  selectSmallAddAssignKernel( y, A, x );
1050  else
1051  selectBlasAddAssignKernel( y, A, x );
1052  }
1054  //**********************************************************************************************
1055 
1056  //**Default addition assignment to dense vectors************************************************
1070  template< typename VT1 // Type of the left-hand side target vector
1071  , typename MT1 // Type of the left-hand side matrix operand
1072  , typename VT2 > // Type of the right-hand side vector operand
1073  static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1074  {
1075  const size_t M( A.rows() );
1076  const size_t N( A.columns() );
1077 
1078  for( size_t j=0UL; j<N; ++j )
1079  {
1080  if( IsDiagonal<MT1>::value )
1081  {
1082  y[j] += A(j,j) * x[j];
1083  }
1084  else
1085  {
1086  const size_t ibegin( ( IsLower<MT1>::value )
1087  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1088  :( 0UL ) );
1089  const size_t iend( ( IsUpper<MT1>::value )
1090  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1091  :( M ) );
1092  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1093 
1094  const size_t inum( iend - ibegin );
1095  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1096 
1097  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1098  y[i ] += A(i ,j) * x[j];
1099  y[i+1UL] += A(i+1UL,j) * x[j];
1100  }
1101  if( ipos < iend ) {
1102  y[ipos] += A(ipos,j) * x[j];
1103  }
1104  }
1105  }
1106  }
1108  //**********************************************************************************************
1109 
1110  //**Default addition assignment to dense vectors (small matrices)*******************************
1124  template< typename VT1 // Type of the left-hand side target vector
1125  , typename MT1 // Type of the left-hand side matrix operand
1126  , typename VT2 > // Type of the right-hand side vector operand
1127  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1128  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1129  {
1130  selectDefaultAddAssignKernel( y, A, x );
1131  }
1133  //**********************************************************************************************
1134 
1135  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized default addition assignment kernel for small matrices: y += A * x.
//
// Takes part in overload resolution only if UseVectorizedDefaultKernel<VT1,MT1,VT2> holds
// (EnableIf_).
//
//  y  Target vector; read via y.load()/y[i] and updated in place via y.store()/y[i].
//  A  Matrix operand; read via A.load(i,j) in the SIMD loops and A(i,j) in the scalar loop.
//  x  Vector operand; read element-wise via x[j].
//
// The rows are processed in chunks of 8, 4, 3, 2 and 1 SIMD vectors. Every chunk loads its
// part of y into accumulators, adds A.load(i,j) * set(x[j]) for each column j in
// [jbegin,jend) and stores the accumulators back to y. Rows that are left over are handled
// by a scalar loop at the end.
1149  template< typename VT1 // Type of the left-hand side target vector
1150  , typename MT1 // Type of the left-hand side matrix operand
1151  , typename VT2 > // Type of the right-hand side vector operand
1152  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1153  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1154  {
1155  const size_t M( A.rows() );
1156  const size_t N( A.columns() );
1157 
      // A scalar remainder loop is required unless both the matrix and the target vector
      // types are padded.
1158  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
1159 
      // End of the SIMD-processed row range: with a remainder loop this is M rounded down to
      // a multiple of SIMDSIZE (checked by the assertion below), otherwise all M rows.
1160  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
1161  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1162 
1163  size_t i( 0UL );
1164 
      // Chunks of 8 SIMD vectors (rows i .. i+SIMDSIZE*8-1)
1165  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1166  {
      // Column range for this chunk. For upper matrices the columns start at i (i+1 if
      // strictly upper); for lower matrices they end at the chunk's last row index + 1,
      // clamped to N (one less if strictly lower). Otherwise all N columns are visited.
1167  const size_t jbegin( ( IsUpper<MT1>::value )
1168  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1169  :( 0UL ) );
1170  const size_t jend( ( IsLower<MT1>::value )
1171  ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1172  :( N ) );
1173  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1174 
      // The accumulators start with the current content of y (addition assignment)
1175  SIMDType xmm1( y.load(i ) );
1176  SIMDType xmm2( y.load(i+SIMDSIZE ) );
1177  SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1178  SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1179  SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
1180  SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
1181  SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
1182  SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );
1183 
1184  for( size_t j=jbegin; j<jend; ++j ) {
      // x[j] is expanded to a SIMD value once per column and reused for all eight row vectors
1185  const SIMDType x1( set( x[j] ) );
1186  xmm1 = xmm1 + A.load(i ,j) * x1;
1187  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1188  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1189  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
1190  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
1191  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
1192  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
1193  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
1194  }
1195 
1196  y.store( i , xmm1 );
1197  y.store( i+SIMDSIZE , xmm2 );
1198  y.store( i+SIMDSIZE*2UL, xmm3 );
1199  y.store( i+SIMDSIZE*3UL, xmm4 );
1200  y.store( i+SIMDSIZE*4UL, xmm5 );
1201  y.store( i+SIMDSIZE*5UL, xmm6 );
1202  y.store( i+SIMDSIZE*6UL, xmm7 );
1203  y.store( i+SIMDSIZE*7UL, xmm8 );
1204  }
1205 
      // Chunks of 4 SIMD vectors; same scheme as above with a 4-vector wide column bound
1206  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1207  {
1208  const size_t jbegin( ( IsUpper<MT1>::value )
1209  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1210  :( 0UL ) );
1211  const size_t jend( ( IsLower<MT1>::value )
1212  ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1213  :( N ) );
1214  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1215 
1216  SIMDType xmm1( y.load(i ) );
1217  SIMDType xmm2( y.load(i+SIMDSIZE ) );
1218  SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1219  SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1220 
1221  for( size_t j=jbegin; j<jend; ++j ) {
1222  const SIMDType x1( set( x[j] ) );
1223  xmm1 = xmm1 + A.load(i ,j) * x1;
1224  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1225  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1226  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
1227  }
1228 
1229  y.store( i , xmm1 );
1230  y.store( i+SIMDSIZE , xmm2 );
1231  y.store( i+SIMDSIZE*2UL, xmm3 );
1232  y.store( i+SIMDSIZE*3UL, xmm4 );
1233  }
1234 
      // Chunks of 3 SIMD vectors
1235  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1236  {
1237  const size_t jbegin( ( IsUpper<MT1>::value )
1238  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1239  :( 0UL ) );
1240  const size_t jend( ( IsLower<MT1>::value )
1241  ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1242  :( N ) );
1243  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1244 
1245  SIMDType xmm1( y.load(i ) );
1246  SIMDType xmm2( y.load(i+SIMDSIZE ) );
1247  SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1248 
1249  for( size_t j=jbegin; j<jend; ++j ) {
1250  const SIMDType x1( set( x[j] ) );
1251  xmm1 = xmm1 + A.load(i ,j) * x1;
1252  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1253  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1254  }
1255 
1256  y.store( i , xmm1 );
1257  y.store( i+SIMDSIZE , xmm2 );
1258  y.store( i+SIMDSIZE*2UL, xmm3 );
1259  }
1260 
      // Chunks of 2 SIMD vectors
1261  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1262  {
1263  const size_t jbegin( ( IsUpper<MT1>::value )
1264  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1265  :( 0UL ) );
1266  const size_t jend( ( IsLower<MT1>::value )
1267  ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1268  :( N ) );
1269  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1270 
1271  SIMDType xmm1( y.load(i ) );
1272  SIMDType xmm2( y.load(i+SIMDSIZE) );
1273 
1274  for( size_t j=jbegin; j<jend; ++j ) {
1275  const SIMDType x1( set( x[j] ) );
1276  xmm1 = xmm1 + A.load(i ,j) * x1;
1277  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
1278  }
1279 
1280  y.store( i , xmm1 );
1281  y.store( i+SIMDSIZE, xmm2 );
1282  }
1283 
      // Single SIMD vectors up to ipos
1284  for( ; i<ipos; i+=SIMDSIZE )
1285  {
1286  const size_t jbegin( ( IsUpper<MT1>::value )
1287  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1288  :( 0UL ) );
1289  const size_t jend( ( IsLower<MT1>::value )
1290  ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1291  :( N ) );
1292  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1293 
1294  SIMDType xmm1( y.load(i) );
1295 
1296  for( size_t j=jbegin; j<jend; ++j ) {
1297  xmm1 = xmm1 + A.load(i,j) * set( x[j] );
1298  }
1299 
1300  y.store( i, xmm1 );
1301  }
1302 
      // Scalar remainder: rows ipos .. M-1, executed only if 'remainder' is set. Each row's
      // dot product is summed into a local value first and then added to y[i].
1303  for( ; remainder && i<M; ++i )
1304  {
1305  const size_t jbegin( ( IsUpper<MT1>::value )
1306  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1307  :( 0UL ) );
1308  const size_t jend( ( IsLower<MT1>::value )
1309  ?( min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1310  :( N ) );
1311  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1312 
1313  ElementType value = ElementType();
1314 
1315  for( size_t j=jbegin; j<jend; ++j ) {
1316  value += A(i,j) * x[j];
1317  }
1318 
1319  y[i] += value;
1320  }
1321  }
1323  //**********************************************************************************************
1324 
1325  //**Default addition assignment to dense vectors (large matrices)*******************************
1339  template< typename VT1 // Type of the left-hand side target vector
1340  , typename MT1 // Type of the left-hand side matrix operand
1341  , typename VT2 > // Type of the right-hand side vector operand
1342  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1343  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1344  {
1345  selectDefaultAddAssignKernel( y, A, x );
1346  }
1348  //**********************************************************************************************
1349 
1350  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized default addition assignment kernel for large matrices: y += A * x.
//
// Takes part in overload resolution only if UseVectorizedDefaultKernel<VT1,MT1,VT2> holds
// (EnableIf_).
//
//  y  Target vector; read via y.load()/y[i] and updated in place via y.store()/y[i].
//  A  Matrix operand; read via A.load(i,j) in the SIMD loops and A(i,j) in the scalar loop.
//  x  Vector operand; read element-wise via x[j].
//
// The iteration space is tiled: rows in blocks of 'iblock', columns in blocks of 'jblock'.
// For every tile, the rows are processed in chunks of 8, 4, 3, 2 and 1 SIMD vectors. Each
// chunk sums A.load(i,j) * set(x[j]) over the tile's columns into accumulators, which are
// then added to the corresponding part of y. Left-over rows are handled by a scalar loop.
1364  template< typename VT1 // Type of the left-hand side target vector
1365  , typename MT1 // Type of the left-hand side matrix operand
1366  , typename VT2 > // Type of the right-hand side vector operand
1367  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1368  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1369  {
1370  const size_t M( A.rows() );
1371  const size_t N( A.columns() );
1372 
      // A scalar remainder loop is required unless both the matrix and the target vector
      // types are padded.
1373  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
1374 
      // Tile sizes: a row block spans 32768 bytes worth of elements; a column block spans
      // 8 columns if N < iblock and 4 columns otherwise.
      // NOTE(review): 32768 presumably targets a cache size - confirm.
1375  const size_t iblock( 32768UL / sizeof( ElementType ) );
1376  const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1377 
1378  BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1379 
1380  for( size_t ii=0U; ii<M; ii+=iblock ) {
1381  for( size_t jj=0UL; jj<N; jj+=jblock )
1382  {
      // Bounds of the current tile. For upper matrices the row range is additionally cut
      // off at jend (jend-1 if strictly upper).
1383  const size_t jend( min( jj+jblock, N ) );
1384  const size_t itmp( min( ii+iblock, M ) );
1385  const size_t iend( ( IsUpper<MT1>::value )
1386  ?( min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
1387  :( itmp ) );
1388 
      // End of the SIMD-processed rows: with a remainder loop this is iend rounded down to a
      // multiple of SIMDSIZE (checked by the assertion below), otherwise iend itself.
1389  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1390  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1391 
      // First row of the tile. For lower matrices the start is moved forward to jj (jj+1 if
      // strictly lower) masked with size_t(-SIMDSIZE), but never before ii.
1392  size_t i( ( IsLower<MT1>::value )
1393  ?( max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) & size_t(-SIMDSIZE) ) )
1394  :( ii ) );
1395 
      // Chunks of 8 SIMD vectors
1396  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1397  {
      // NOTE(review): the accumulators are default-constructed and used directly as running
      // sums; this presumably relies on SIMDType's default constructor zero-initializing
      // its value - confirm.
1398  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1399 
1400  for( size_t j=jj; j<jend; ++j ) {
      // x[j] is expanded to a SIMD value once per column and reused for all eight row vectors
1401  const SIMDType x1( set( x[j] ) );
1402  xmm1 = xmm1 + A.load(i ,j) * x1;
1403  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1404  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1405  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
1406  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
1407  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
1408  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
1409  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
1410  }
1411 
      // Add the tile's partial sums to the current content of y
1412  y.store( i , y.load(i ) + xmm1 );
1413  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1414  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1415  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1416  y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
1417  y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
1418  y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
1419  y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
1420  }
1421 
      // Chunks of 4 SIMD vectors
1422  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1423  {
1424  SIMDType xmm1, xmm2, xmm3, xmm4;
1425 
1426  for( size_t j=jj; j<jend; ++j ) {
1427  const SIMDType x1( set( x[j] ) );
1428  xmm1 = xmm1 + A.load(i ,j) * x1;
1429  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1430  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1431  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
1432  }
1433 
1434  y.store( i , y.load(i ) + xmm1 );
1435  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1436  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1437  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1438  }
1439 
      // Chunks of 3 SIMD vectors
1440  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1441  {
1442  SIMDType xmm1, xmm2, xmm3;
1443 
1444  for( size_t j=jj; j<jend; ++j ) {
1445  const SIMDType x1( set( x[j] ) );
1446  xmm1 = xmm1 + A.load(i ,j) * x1;
1447  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1448  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1449  }
1450 
1451  y.store( i , y.load(i ) + xmm1 );
1452  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1453  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1454  }
1455 
      // Chunks of 2 SIMD vectors
1456  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1457  {
1458  SIMDType xmm1, xmm2;
1459 
1460  for( size_t j=jj; j<jend; ++j ) {
1461  const SIMDType x1( set( x[j] ) );
1462  xmm1 = xmm1 + A.load(i ,j) * x1;
1463  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
1464  }
1465 
1466  y.store( i , y.load(i ) + xmm1 );
1467  y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
1468  }
1469 
      // Single SIMD vectors up to ipos
1470  for( ; i<ipos; i+=SIMDSIZE )
1471  {
1472  SIMDType xmm1;
1473 
1474  for( size_t j=jj; j<jend; ++j ) {
1475  xmm1 = xmm1 + A.load(i,j) * set( x[j] );
1476  }
1477 
1478  y.store( i, y.load(i) + xmm1 );
1479  }
1480 
      // Scalar remainder: rows up to iend, executed only if 'remainder' is set
1481  for( ; remainder && i<iend; ++i )
1482  {
1483  ElementType value = ElementType();
1484 
1485  for( size_t j=jj; j<jend; ++j ) {
1486  value += A(i,j) * x[j];
1487  }
1488 
1489  y[i] += value;
1490  }
1491  }
1492  }
1493  }
1495  //**********************************************************************************************
1496 
1497  //**BLAS-based addition assignment to dense vectors (default)***********************************
1511  template< typename VT1 // Type of the left-hand side target vector
1512  , typename MT1 // Type of the left-hand side matrix operand
1513  , typename VT2 > // Type of the right-hand side vector operand
1514  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
1515  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1516  {
1517  selectLargeAddAssignKernel( y, A, x );
1518  }
1520  //**********************************************************************************************
1521 
1522  //**BLAS-based addition assignment to dense vectors*********************************************
1523 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1524 
1537  template< typename VT1 // Type of the left-hand side target vector
1538  , typename MT1 // Type of the left-hand side matrix operand
1539  , typename VT2 > // Type of the right-hand side vector operand
1540  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
1541  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
1542  {
1543  typedef ElementType_<VT1> ET;
1544 
1545  if( IsTriangular<MT1>::value ) {
1546  ResultType_<VT1> tmp( serial( x ) );
1547  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1548  addAssign( y, tmp );
1549  }
1550  else {
1551  gemv( y, A, x, ET(1), ET(1) );
1552  }
1553  }
1555 #endif
1556  //**********************************************************************************************
1557 
1558  //**Addition assignment to sparse vectors*******************************************************
1559  // No special implementation for the addition assignment to sparse vectors.
1560  //**********************************************************************************************
1561 
1562  //**Subtraction assignment to dense vectors*****************************************************
1575  template< typename VT1 > // Type of the target dense vector
1576  friend inline void subAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
1577  {
1579 
1580  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1581 
1582  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1583  return;
1584  }
1585 
1586  LT A( serial( rhs.mat_ ) ); // Evaluation of the left-hand side dense matrix operand
1587  RT x( serial( rhs.vec_ ) ); // Evaluation of the right-hand side dense vector operand
1588 
1589  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1590  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1591  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1592  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
1593 
1594  TDMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1595  }
1597  //**********************************************************************************************
1598 
1599  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1610  template< typename VT1 // Type of the left-hand side target vector
1611  , typename MT1 // Type of the left-hand side matrix operand
1612  , typename VT2 > // Type of the right-hand side vector operand
1613  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1614  {
1615  if( ( IsDiagonal<MT1>::value ) ||
1616  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1617  ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1618  selectSmallSubAssignKernel( y, A, x );
1619  else
1620  selectBlasSubAssignKernel( y, A, x );
1621  }
1623  //**********************************************************************************************
1624 
1625  //**Default subtraction assignment to dense vectors*********************************************
1639  template< typename VT1 // Type of the left-hand side target vector
1640  , typename MT1 // Type of the left-hand side matrix operand
1641  , typename VT2 > // Type of the right-hand side vector operand
1642  static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1643  {
1644  const size_t M( A.rows() );
1645  const size_t N( A.columns() );
1646 
1647  for( size_t j=0UL; j<N; ++j )
1648  {
1649  if( IsDiagonal<MT1>::value )
1650  {
1651  y[j] -= A(j,j) * x[j];
1652  }
1653  else
1654  {
1655  const size_t ibegin( ( IsLower<MT1>::value )
1656  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1657  :( 0UL ) );
1658  const size_t iend( ( IsUpper<MT1>::value )
1659  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1660  :( M ) );
1661  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1662 
1663  const size_t inum( iend - ibegin );
1664  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1665 
1666  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1667  y[i ] -= A(i ,j) * x[j];
1668  y[i+1UL] -= A(i+1UL,j) * x[j];
1669  }
1670  if( ipos < iend ) {
1671  y[ipos] -= A(ipos,j) * x[j];
1672  }
1673  }
1674  }
1675  }
1677  //**********************************************************************************************
1678 
1679  //**Default subtraction assignment to dense vectors (small matrices)****************************
1693  template< typename VT1 // Type of the left-hand side target vector
1694  , typename MT1 // Type of the left-hand side matrix operand
1695  , typename VT2 > // Type of the right-hand side vector operand
1696  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1697  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1698  {
1699  selectDefaultSubAssignKernel( y, A, x );
1700  }
1702  //**********************************************************************************************
1703 
1704  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized default subtraction assignment kernel for small matrices: y -= A * x.
//
// Takes part in overload resolution only if UseVectorizedDefaultKernel<VT1,MT1,VT2> holds
// (EnableIf_).
//
//  y  Target vector; read via y.load()/y[i] and updated in place via y.store()/y[i].
//  A  Matrix operand; read via A.load(i,j) in the SIMD loops and A(i,j) in the scalar loop.
//  x  Vector operand; read element-wise via x[j].
//
// The rows are processed in chunks of 8, 4, 3, 2 and 1 SIMD vectors. Every chunk loads its
// part of y into accumulators, subtracts A.load(i,j) * set(x[j]) for each column j in
// [jbegin,jend) and stores the accumulators back to y. Rows that are left over are handled
// by a scalar loop at the end.
1719  template< typename VT1 // Type of the left-hand side target vector
1720  , typename MT1 // Type of the left-hand side matrix operand
1721  , typename VT2 > // Type of the right-hand side vector operand
1722  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1723  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1724  {
1725  const size_t M( A.rows() );
1726  const size_t N( A.columns() );
1727 
      // A scalar remainder loop is required unless both the matrix and the target vector
      // types are padded.
1728  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
1729 
      // End of the SIMD-processed row range: with a remainder loop this is M rounded down to
      // a multiple of SIMDSIZE (checked by the assertion below), otherwise all M rows.
1730  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
1731  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1732 
1733  size_t i( 0UL );
1734 
      // Chunks of 8 SIMD vectors (rows i .. i+SIMDSIZE*8-1)
1735  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1736  {
      // Column range for this chunk. For upper matrices the columns start at i (i+1 if
      // strictly upper); for lower matrices they end at the chunk's last row index + 1,
      // clamped to N (one less if strictly lower). Otherwise all N columns are visited.
1737  const size_t jbegin( ( IsUpper<MT1>::value )
1738  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1739  :( 0UL ) );
1740  const size_t jend( ( IsLower<MT1>::value )
1741  ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1742  :( N ) );
1743  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1744 
      // The accumulators start with the current content of y (subtraction assignment)
1745  SIMDType xmm1( y.load(i ) );
1746  SIMDType xmm2( y.load(i+SIMDSIZE ) );
1747  SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1748  SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1749  SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
1750  SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
1751  SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
1752  SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );
1753 
1754  for( size_t j=jbegin; j<jend; ++j ) {
      // x[j] is expanded to a SIMD value once per column and reused for all eight row vectors
1755  const SIMDType x1( set( x[j] ) );
1756  xmm1 = xmm1 - A.load(i ,j) * x1;
1757  xmm2 = xmm2 - A.load(i+SIMDSIZE ,j) * x1;
1758  xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,j) * x1;
1759  xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,j) * x1;
1760  xmm5 = xmm5 - A.load(i+SIMDSIZE*4UL,j) * x1;
1761  xmm6 = xmm6 - A.load(i+SIMDSIZE*5UL,j) * x1;
1762  xmm7 = xmm7 - A.load(i+SIMDSIZE*6UL,j) * x1;
1763  xmm8 = xmm8 - A.load(i+SIMDSIZE*7UL,j) * x1;
1764  }
1765 
1766  y.store( i , xmm1 );
1767  y.store( i+SIMDSIZE , xmm2 );
1768  y.store( i+SIMDSIZE*2UL, xmm3 );
1769  y.store( i+SIMDSIZE*3UL, xmm4 );
1770  y.store( i+SIMDSIZE*4UL, xmm5 );
1771  y.store( i+SIMDSIZE*5UL, xmm6 );
1772  y.store( i+SIMDSIZE*6UL, xmm7 );
1773  y.store( i+SIMDSIZE*7UL, xmm8 );
1774  }
1775 
      // Chunks of 4 SIMD vectors; same scheme as above with a 4-vector wide column bound
1776  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1777  {
1778  const size_t jbegin( ( IsUpper<MT1>::value )
1779  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1780  :( 0UL ) );
1781  const size_t jend( ( IsLower<MT1>::value )
1782  ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1783  :( N ) );
1784  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1785 
1786  SIMDType xmm1( y.load(i ) );
1787  SIMDType xmm2( y.load(i+SIMDSIZE ) );
1788  SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1789  SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1790 
1791  for( size_t j=jbegin; j<jend; ++j ) {
1792  const SIMDType x1( set( x[j] ) );
1793  xmm1 = xmm1 - A.load(i ,j) * x1;
1794  xmm2 = xmm2 - A.load(i+SIMDSIZE ,j) * x1;
1795  xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,j) * x1;
1796  xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,j) * x1;
1797  }
1798 
1799  y.store( i , xmm1 );
1800  y.store( i+SIMDSIZE , xmm2 );
1801  y.store( i+SIMDSIZE*2UL, xmm3 );
1802  y.store( i+SIMDSIZE*3UL, xmm4 );
1803  }
1804 
      // Chunks of 3 SIMD vectors
1805  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1806  {
1807  const size_t jbegin( ( IsUpper<MT1>::value )
1808  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1809  :( 0UL ) );
1810  const size_t jend( ( IsLower<MT1>::value )
1811  ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1812  :( N ) );
1813  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1814 
1815  SIMDType xmm1( y.load(i ) );
1816  SIMDType xmm2( y.load(i+SIMDSIZE ) );
1817  SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1818 
1819  for( size_t j=jbegin; j<jend; ++j ) {
1820  const SIMDType x1( set( x[j] ) );
1821  xmm1 = xmm1 - A.load(i ,j) * x1;
1822  xmm2 = xmm2 - A.load(i+SIMDSIZE ,j) * x1;
1823  xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,j) * x1;
1824  }
1825 
1826  y.store( i , xmm1 );
1827  y.store( i+SIMDSIZE , xmm2 );
1828  y.store( i+SIMDSIZE*2UL, xmm3 );
1829  }
1830 
      // Chunks of 2 SIMD vectors
1831  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1832  {
1833  const size_t jbegin( ( IsUpper<MT1>::value )
1834  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1835  :( 0UL ) );
1836  const size_t jend( ( IsLower<MT1>::value )
1837  ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1838  :( N ) );
1839  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1840 
1841  SIMDType xmm1( y.load(i ) );
1842  SIMDType xmm2( y.load(i+SIMDSIZE) );
1843 
1844  for( size_t j=jbegin; j<jend; ++j ) {
1845  const SIMDType x1( set( x[j] ) );
1846  xmm1 = xmm1 - A.load(i ,j) * x1;
1847  xmm2 = xmm2 - A.load(i+SIMDSIZE,j) * x1;
1848  }
1849 
1850  y.store( i , xmm1 );
1851  y.store( i+SIMDSIZE, xmm2 );
1852  }
1853 
      // Single SIMD vectors up to ipos
1854  for( ; i<ipos; i+=SIMDSIZE )
1855  {
1856  const size_t jbegin( ( IsUpper<MT1>::value )
1857  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1858  :( 0UL ) );
1859  const size_t jend( ( IsLower<MT1>::value )
1860  ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1861  :( N ) );
1862  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1863 
1864  SIMDType xmm1( y.load(i) );
1865 
1866  for( size_t j=jbegin; j<jend; ++j ) {
1867  xmm1 = xmm1 - A.load(i,j) * set( x[j] );
1868  }
1869 
1870  y.store( i, xmm1 );
1871  }
1872 
      // Scalar remainder: rows ipos .. M-1, executed only if 'remainder' is set. Each row's
      // dot product is summed into a local value first and then subtracted from y[i].
1873  for( ; remainder && i<M; ++i )
1874  {
1875  const size_t jbegin( ( IsUpper<MT1>::value )
1876  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1877  :( 0UL ) );
1878  const size_t jend( ( IsLower<MT1>::value )
1879  ?( min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1880  :( N ) );
1881  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1882 
1883  ElementType value = ElementType();
1884 
1885  for( size_t j=jbegin; j<jend; ++j ) {
1886  value += A(i,j) * x[j];
1887  }
1888 
1889  y[i] -= value;
1890  }
1891  }
1893  //**********************************************************************************************
1894 
1895  //**Default subtraction assignment to dense vectors (large matrices)****************************
1909  template< typename VT1 // Type of the left-hand side target vector
1910  , typename MT1 // Type of the left-hand side matrix operand
1911  , typename VT2 > // Type of the right-hand side vector operand
1912  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1913  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1914  {
1915  selectDefaultSubAssignKernel( y, A, x );
1916  }
1918  //**********************************************************************************************
1919 
1920  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized default subtraction assignment kernel for large matrices: y -= A * x.
//
// Takes part in overload resolution only if UseVectorizedDefaultKernel<VT1,MT1,VT2> holds
// (EnableIf_).
//
//  y  Target vector; read via y.load()/y[i] and updated in place via y.store()/y[i].
//  A  Matrix operand; read via A.load(i,j) in the SIMD loops and A(i,j) in the scalar loop.
//  x  Vector operand; read element-wise via x[j].
//
// The iteration space is tiled: rows in blocks of 'iblock', columns in blocks of 'jblock'.
// For every tile, the rows are processed in chunks of 8, 4, 3, 2 and 1 SIMD vectors. Each
// chunk sums A.load(i,j) * set(x[j]) over the tile's columns into accumulators, which are
// then subtracted from the corresponding part of y. Left-over rows are handled by a scalar
// loop.
1935  template< typename VT1 // Type of the left-hand side target vector
1936  , typename MT1 // Type of the left-hand side matrix operand
1937  , typename VT2 > // Type of the right-hand side vector operand
1938  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1939  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
1940  {
1941  const size_t M( A.rows() );
1942  const size_t N( A.columns() );
1943 
      // A scalar remainder loop is required unless both the matrix and the target vector
      // types are padded.
1944  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
1945 
      // Tile sizes: a row block spans 32768 bytes worth of elements; a column block spans
      // 8 columns if N < iblock and 4 columns otherwise.
      // NOTE(review): 32768 presumably targets a cache size - confirm.
1946  const size_t iblock( 32768UL / sizeof( ElementType ) );
1947  const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1948 
1949  BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1950 
1951  for( size_t ii=0U; ii<M; ii+=iblock ) {
1952  for( size_t jj=0UL; jj<N; jj+=jblock )
1953  {
      // Bounds of the current tile. For upper matrices the row range is additionally cut
      // off at jend (jend-1 if strictly upper).
1954  const size_t jend( min( jj+jblock, N ) );
1955  const size_t itmp( min( ii+iblock, M ) );
1956  const size_t iend( ( IsUpper<MT1>::value )
1957  ?( min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
1958  :( itmp ) );
1959 
      // End of the SIMD-processed rows: with a remainder loop this is iend rounded down to a
      // multiple of SIMDSIZE (checked by the assertion below), otherwise iend itself.
1960  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1961  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1962 
      // First row of the tile. For lower matrices the start is moved forward to jj (jj+1 if
      // strictly lower) masked with size_t(-SIMDSIZE), but never before ii.
1963  size_t i( ( IsLower<MT1>::value )
1964  ?( max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) & size_t(-SIMDSIZE) ) )
1965  :( ii ) );
1966 
      // Chunks of 8 SIMD vectors
1967  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1968  {
      // NOTE(review): the accumulators are default-constructed and used directly as running
      // sums; this presumably relies on SIMDType's default constructor zero-initializing
      // its value - confirm.
1969  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1970 
1971  for( size_t j=jj; j<jend; ++j ) {
      // The products are summed up (with '+'); the subtraction happens at the store below
1972  const SIMDType x1( set( x[j] ) );
1973  xmm1 = xmm1 + A.load(i ,j) * x1;
1974  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1975  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1976  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
1977  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
1978  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
1979  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
1980  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
1981  }
1982 
      // Subtract the tile's partial sums from the current content of y
1983  y.store( i , y.load(i ) - xmm1 );
1984  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
1985  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
1986  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
1987  y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5 );
1988  y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6 );
1989  y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7 );
1990  y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8 );
1991  }
1992 
      // Chunks of 4 SIMD vectors
1993  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1994  {
1995  SIMDType xmm1, xmm2, xmm3, xmm4;
1996 
1997  for( size_t j=jj; j<jend; ++j ) {
1998  const SIMDType x1( set( x[j] ) );
1999  xmm1 = xmm1 + A.load(i ,j) * x1;
2000  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
2001  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
2002  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
2003  }
2004 
2005  y.store( i , y.load(i ) - xmm1 );
2006  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2007  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2008  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
2009  }
2010 
      // Chunks of 3 SIMD vectors
2011  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2012  {
2013  SIMDType xmm1, xmm2, xmm3;
2014 
2015  for( size_t j=jj; j<jend; ++j ) {
2016  const SIMDType x1( set( x[j] ) );
2017  xmm1 = xmm1 + A.load(i ,j) * x1;
2018  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
2019  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
2020  }
2021 
2022  y.store( i , y.load(i ) - xmm1 );
2023  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2024  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2025  }
2026 
      // Chunks of 2 SIMD vectors
2027  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2028  {
2029  SIMDType xmm1, xmm2;
2030 
2031  for( size_t j=jj; j<jend; ++j ) {
2032  const SIMDType x1( set( x[j] ) );
2033  xmm1 = xmm1 + A.load(i ,j) * x1;
2034  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
2035  }
2036 
2037  y.store( i , y.load(i ) - xmm1 );
2038  y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2 );
2039  }
2040 
      // Single SIMD vectors up to ipos
2041  for( ; i<ipos; i+=SIMDSIZE )
2042  {
2043  SIMDType xmm1;
2044 
2045  for( size_t j=jj; j<jend; ++j ) {
2046  xmm1 = xmm1 + A.load(i,j) * set( x[j] );
2047  }
2048 
2049  y.store( i, y.load(i) - xmm1 );
2050  }
2051 
      // Scalar remainder: rows up to iend, executed only if 'remainder' is set
2052  for( ; remainder && i<iend; ++i )
2053  {
2054  ElementType value = ElementType();
2055 
2056  for( size_t j=jj; j<jend; ++j ) {
2057  value += A(i,j) * x[j];
2058  }
2059 
2060  y[i] -= value;
2061  }
2062  }
2063  }
2064  }
2066  //**********************************************************************************************
2067 
2068  //**BLAS-based subtraction assignment to dense vectors (default)********************************
2082  template< typename VT1 // Type of the left-hand side target vector
2083  , typename MT1 // Type of the left-hand side matrix operand
2084  , typename VT2 > // Type of the right-hand side vector operand
2085  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
2086  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2087  {
2088  selectLargeSubAssignKernel( y, A, x );
2089  }
2091  //**********************************************************************************************
2092 
2093  //**BLAS-based subtraction assignment to dense vectors******************************************
2094 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2095 
2108  template< typename VT1 // Type of the left-hand side target vector
2109  , typename MT1 // Type of the left-hand side matrix operand
2110  , typename VT2 > // Type of the right-hand side vector operand
2111  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
2112  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
2113  {
2114  typedef ElementType_<VT1> ET;
2115 
2116  if( IsTriangular<MT1>::value ) {
2117  ResultType_<VT1> tmp( serial( x ) );
2118  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2119  subAssign( y, tmp );
2120  }
2121  else {
2122  gemv( y, A, x, ET(-1), ET(1) );
2123  }
2124  }
2126 #endif
2127  //**********************************************************************************************
2128 
2129  //**Subtraction assignment to sparse vectors****************************************************
2130  // No special implementation for the subtraction assignment to sparse vectors.
2131  //**********************************************************************************************
2132 
2133  //**Multiplication assignment to dense vectors**************************************************
2146  template< typename VT1 > // Type of the target dense vector
2147  friend inline void multAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
2148  {
2150 
2153  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2154 
2155  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2156 
2157  const ResultType tmp( serial( rhs ) );
2158  multAssign( ~lhs, tmp );
2159  }
2161  //**********************************************************************************************
2162 
2163  //**Multiplication assignment to sparse vectors*************************************************
2164  // No special implementation for the multiplication assignment to sparse vectors.
2165  //**********************************************************************************************
2166 
2167  //**Division assignment to dense vectors********************************************************
2180  template< typename VT1 > // Type of the target dense vector
2181  friend inline void divAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
2182  {
2184 
2187  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2188 
2189  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2190 
2191  const ResultType tmp( serial( rhs ) );
2192  divAssign( ~lhs, tmp );
2193  }
2195  //**********************************************************************************************
2196 
2197  //**Division assignment to sparse vectors*******************************************************
2198  // No special implementation for the division assignment to sparse vectors.
2199  //**********************************************************************************************
2200 
2201  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of a transpose dense matrix-dense vector multiplication expression to
// a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// The overload is constrained through EnableIf_ on UseSMPAssign<VT1>. Both operands of
// the expression are evaluated into the local objects A and x (types LT and RT) and
// the product A * x is then handed to smpAssign() again.
//
// Special cases:
//  - a matrix without rows leaves lhs untouched;
//  - a matrix without columns resets lhs.
2216  template< typename VT1 > // Type of the target dense vector
2217  friend inline EnableIf_< UseSMPAssign<VT1> >
2218  smpAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
2219  {
2221
2222  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2223
// Empty result: nothing to assign.
2224  if( rhs.mat_.rows() == 0UL ) {
2225  return;
2226  }
// No columns to sum over: the target is reset.
2227  else if( rhs.mat_.columns() == 0UL ) {
2228  reset( ~lhs );
2229  return;
2230  }
2231
2232  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2233  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2234
2235  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2236  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2237  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2238  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2239
// Re-dispatch on the product of the evaluated operands.
2240  smpAssign( ~lhs, A * x );
2241  }
2243  //**********************************************************************************************
2244 
2245  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of a transpose dense matrix-dense vector multiplication expression to
// a sparse vector.
//
// \param lhs The target left-hand side sparse vector.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// The overload is constrained through EnableIf_ on UseSMPAssign<VT1>. The expression is
// evaluated into a temporary of type ResultType, which is then passed to smpAssign().
// Unlike the serial multAssign()/divAssign() functions, the temporary is constructed
// directly from rhs (no serial() wrapper).
2260  template< typename VT1 > // Type of the target sparse vector
2261  friend inline EnableIf_< UseSMPAssign<VT1> >
2262  smpAssign( SparseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
2263  {
2265
2268  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2269
2270  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2271
2272  const ResultType tmp( rhs );
2273  smpAssign( ~lhs, tmp );
2274  }
2276  //**********************************************************************************************
2277 
2278  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of a transpose dense matrix-dense vector multiplication
// expression to a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression to be added.
//
// The overload is constrained through EnableIf_ on UseSMPAssign<VT1>. Both operands are
// evaluated into the local objects A and x (types LT and RT); the product A * x is then
// handed to smpAddAssign() again. If the matrix has no rows or no columns the function
// returns without modifying lhs.
2293  template< typename VT1 > // Type of the target dense vector
2294  friend inline EnableIf_< UseSMPAssign<VT1> >
2295  smpAddAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
2296  {
2298
2299  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2300
// An empty matrix contributes nothing to the sum.
2301  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2302  return;
2303  }
2304
2305  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2306  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2307
2308  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2309  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2310  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2311  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2312
// Re-dispatch on the product of the evaluated operands.
2313  smpAddAssign( ~lhs, A * x );
2314  }
2316  //**********************************************************************************************
2317 
2318  //**SMP addition assignment to sparse vectors***************************************************
2319  // No special implementation for the SMP addition assignment to sparse vectors.
2320  //**********************************************************************************************
2321 
2322  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of a transpose dense matrix-dense vector multiplication
// expression to a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression to be subtracted.
//
// The overload is constrained through EnableIf_ on UseSMPAssign<VT1>. Both operands are
// evaluated into the local objects A and x (types LT and RT); the product A * x is then
// handed to smpSubAssign() again. If the matrix has no rows or no columns the function
// returns without modifying lhs.
2337  template< typename VT1 > // Type of the target dense vector
2338  friend inline EnableIf_< UseSMPAssign<VT1> >
2339  smpSubAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
2340  {
2342
2343  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2344
// An empty matrix contributes nothing to the difference.
2345  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2346  return;
2347  }
2348
2349  LT A( rhs.mat_ ); // Evaluation of the left-hand side dense matrix operand
2350  RT x( rhs.vec_ ); // Evaluation of the right-hand side dense vector operand
2351
2352  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2353  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2354  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2355  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2356
// Re-dispatch on the product of the evaluated operands.
2357  smpSubAssign( ~lhs, A * x );
2358  }
2360  //**********************************************************************************************
2361 
2362  //**SMP subtraction assignment to sparse vectors************************************************
2363  // No special implementation for the SMP subtraction assignment to sparse vectors.
2364  //**********************************************************************************************
2365 
2366  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of a transpose dense matrix-dense vector multiplication
// expression to a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression.
//
// The overload is constrained through EnableIf_ on UseSMPAssign<VT1>. The expression is
// evaluated into a temporary of type ResultType, which is then passed to the
// smpMultAssign() overload selected for (~lhs, tmp).
2381  template< typename VT1 > // Type of the target dense vector
2382  friend inline EnableIf_< UseSMPAssign<VT1> >
2383  smpMultAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
2384  {
2386
2389  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2390
2391  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2392
2393  const ResultType tmp( rhs );
2394  smpMultAssign( ~lhs, tmp );
2395  }
2397  //**********************************************************************************************
2398 
2399  //**SMP multiplication assignment to sparse vectors*********************************************
2400  // No special implementation for the SMP multiplication assignment to sparse vectors.
2401  //**********************************************************************************************
2402 
2403  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of a transpose dense matrix-dense vector multiplication
// expression to a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression acting as divisor.
//
// The overload is constrained through EnableIf_ on UseSMPAssign<VT1>. The expression is
// evaluated into a temporary of type ResultType, which is then passed to the
// smpDivAssign() overload selected for (~lhs, tmp).
2418  template< typename VT1 > // Type of the target dense vector
2419  friend inline EnableIf_< UseSMPAssign<VT1> >
2420  smpDivAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
2421  {
2423
2426  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2427
2428  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2429
2430  const ResultType tmp( rhs );
2431  smpDivAssign( ~lhs, tmp );
2432  }
2434  //**********************************************************************************************
2435 
2436  //**SMP division assignment to sparse vectors***************************************************
2437  // No special implementation for the SMP division assignment to sparse vectors.
2438  //**********************************************************************************************
2439 
2440  //**Compile time checks*************************************************************************
2448  //**********************************************************************************************
2449 };
2450 //*************************************************************************************************
2451 
2452 
2453 
2454 
2455 //=================================================================================================
2456 //
2457 // DVECSCALARMULTEXPR SPECIALIZATION
2458 //
2459 //=================================================================================================
2460 
2461 //*************************************************************************************************
2470 template< typename MT // Type of the left-hand side dense matrix
2471  , typename VT // Type of the right-hand side dense vector
2472  , typename ST > // Type of the right-hand side scalar value
2473 class DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >
2474  : public DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false >
2475  , private VecScalarMultExpr
2476  , private Computation
2477 {
2478  private:
2479  //**Type definitions****************************************************************************
// Private shorthand types derived from the template parameters MT (matrix) and VT (vector).
2480  typedef TDMatDVecMultExpr<MT,VT> MVM;  //!< Type of the wrapped matrix-vector multiplication expression.
2481  typedef ResultType_<MVM> RES;  //!< Result type of the matrix-vector multiplication expression.
2482  typedef ResultType_<MT> MRT;  //!< Result type of the matrix operand.
2483  typedef ResultType_<VT> VRT;  //!< Result type of the vector operand.
2484  typedef ElementType_<MRT> MET;  //!< Element type of the matrix operand's result type.
2485  typedef ElementType_<VRT> VET;  //!< Element type of the vector operand's result type.
2486  typedef CompositeType_<MT> MCT;  //!< Composite type of the matrix operand.
2487  typedef CompositeType_<VT> VCT;  //!< Composite type of the vector operand.
2488  //**********************************************************************************************
2489 
2490  //**********************************************************************************************
// Compilation switch for the evaluation of the matrix operand. It is set if the matrix
// type requires an intermediate evaluation, or if it is a computation whose element type
// equals the vector's element type and is BLAS compatible. It selects the type LT used
// to hold the matrix inside the assignment functions.
2492  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2493  IsBLASCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
2494  //**********************************************************************************************
2495 
2496  //**********************************************************************************************
// Compilation switch for the evaluation of the vector operand. It is set if the vector
// type is a computation or requires an intermediate evaluation. It selects the type RT
// used to hold the vector inside the assignment functions.
2498  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2499  //**********************************************************************************************
2500 
2501  //**********************************************************************************************
2503 
// Helper structure for SFINAE-based selection of the SMP assignment functions. The
// nested value is true if either operand has to be evaluated (evaluateMatrix or
// evaluateVector). The template parameter T1 does not take part in the computation of
// the value; it only makes the structure dependent on the target vector type.
2506  template< typename T1 >
2507  struct UseSMPAssign {
2508  enum : bool { value = ( evaluateMatrix || evaluateVector ) };
2509  };
2510  //**********************************************************************************************
2511 
2512  //**********************************************************************************************
2514 
// Helper structure for SFINAE-based selection of the BLAS kernels.
//
// Template parameters: T1 is the target vector type, T2 the matrix operand type, T3 the
// vector operand type and T4 the scalar type. The nested value is true only if
//  - BLAS mode is enabled (BLAZE_BLAS_MODE),
//  - the target offers mutable and both operands offer constant data access,
//  - the matrix is not diagonal,
//  - all three types are SIMD enabled,
//  - all three element types are BLAS compatible and identical, and
//  - the combination "builtin element type with complex scalar" is not present.
//
// Listing line 2518, which opens the enumeration, was missing from this listing, leaving
// the boolean expression without a declaration to belong to. It is restored here following
// the form of the sibling UseVectorizedDefaultKernel structure.
// NOTE(review): the leading BLAZE_BLAS_MODE term is reconstructed from the upstream Blaze
// sources (the macro comes from <blaze/system/BLAS.h>) - confirm against the original header.
2516  template< typename T1, typename T2, typename T3, typename T4 >
2517  struct UseBlasKernel {
2518  enum : bool { value = BLAZE_BLAS_MODE &&
2519  HasMutableDataAccess<T1>::value &&
2520  HasConstDataAccess<T2>::value &&
2521  HasConstDataAccess<T3>::value &&
2522  !IsDiagonal<T2>::value &&
2523  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2524  IsBLASCompatible< ElementType_<T1> >::value &&
2525  IsBLASCompatible< ElementType_<T2> >::value &&
2526  IsBLASCompatible< ElementType_<T3> >::value &&
2527  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
2528  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2529  !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
2530  };
2531  //**********************************************************************************************
2532 
2533  //**********************************************************************************************
2535 
// Helper structure for SFINAE-based selection of the vectorized default kernels.
//
// Template parameters: T1 is the target vector type, T2 the matrix operand type, T3 the
// vector operand type and T4 the scalar type. The nested value is true only if
//  - optimized kernels are enabled (useOptimizedKernels),
//  - the matrix is not diagonal,
//  - all three types are SIMD enabled,
//  - the three element types and the scalar type are SIMD combinable, and
//  - SIMD addition and multiplication exist for the matrix/vector element types.
2538  template< typename T1, typename T2, typename T3, typename T4 >
2539  struct UseVectorizedDefaultKernel {
2540  enum : bool { value = useOptimizedKernels &&
2541  !IsDiagonal<T2>::value &&
2542  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2543  AreSIMDCombinable< ElementType_<T1>
2544  , ElementType_<T2>
2545  , ElementType_<T3>
2546  , T4 >::value &&
2547  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2548  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
2549  };
2550  //**********************************************************************************************
2551 
2552  public:
2553  //**Type definitions****************************************************************************
// Public type definitions of the scaled matrix-vector multiplication expression.
2554  typedef DVecScalarMultExpr<MVM,ST,false> This;  //!< Type of this DVecScalarMultExpr specialization.
2555  typedef MultTrait_<RES,ST> ResultType;  //!< Result type: multiplication trait of RES and the scalar type.
2556  typedef TransposeType_<ResultType> TransposeType;  //!< Transpose type of the result type.
2557  typedef ElementType_<ResultType> ElementType;  //!< Element type of the result type.
2558  typedef SIMDTrait_<ElementType> SIMDType;  //!< SIMD type associated with the element type.
2559  typedef const ElementType ReturnType;  //!< Return type of the element access functions.
2560  typedef const ResultType CompositeType;  //!< Type used when this expression is part of a composite expression.
2561
// Type of the left-hand side operand: the wrapped matrix-vector multiplication expression.
2563  typedef const TDMatDVecMultExpr<MT,VT> LeftOperand;
2564
// Type of the right-hand side operand: the scalar value.
2566  typedef ST RightOperand;
2567
// Type holding the matrix operand during assignment: the evaluated result type if
// evaluateMatrix is set, the composite type of the matrix otherwise.
2569  typedef IfTrue_< evaluateMatrix, const MRT, MCT > LT;
2570
// Type holding the vector operand during assignment: the evaluated result type if
// evaluateVector is set, the composite type of the vector otherwise.
2572  typedef IfTrue_< evaluateVector, const VRT, VCT > RT;
2573  //**********************************************************************************************
2574 
2575  //**Compilation flags***************************************************************************
// Compilation switch for SIMD evaluation of the expression. It is set if the matrix is
// not diagonal, both operand types are SIMD enabled, the element types and the scalar
// type are SIMD combinable, and SIMD addition and multiplication are available for the
// matrix/vector element types.
2577  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
2578  MT::simdEnabled && VT::simdEnabled &&
2579  AreSIMDCombinable<MET,VET,ST>::value &&
2580  HasSIMDAdd<MET,VET>::value &&
2581  HasSIMDMult<MET,VET>::value };
2582
// Compilation switch for SMP assignment. It is set only if neither operand needs an
// intermediate evaluation and both operand types are themselves SMP assignable.
2584  enum : bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2585  !evaluateVector && VT::smpAssignable };
2586  //**********************************************************************************************
2587 
2588  //**SIMD properties*****************************************************************************
// Number of elements per SIMD vector, as reported by SIMDTrait for ElementType. Used as
// the stride of the vectorized kernels.
2590  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
2591  //**********************************************************************************************
2592 
2593  //**Constructor*********************************************************************************
// Constructor for the scaled matrix-vector multiplication expression.
//
// \param vector The left-hand side matrix-vector multiplication expression.
// \param scalar The right-hand side scalar of the multiplication expression.
//
// Both arguments are stored in the members vector_ and scalar_.
2599  explicit inline DVecScalarMultExpr( const MVM& vector, ST scalar )
2600  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2601  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2602  {}
2603  //**********************************************************************************************
2604 
2605  //**Subscript operator**************************************************************************
// Subscript operator for direct access to a single element of the result.
//
// \param index Access index; must be smaller than size() (internal assertion only).
// \return The element of the wrapped expression at \a index multiplied by the scalar.
2611  inline ReturnType operator[]( size_t index ) const {
2612  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2613  return vector_[index] * scalar_;
2614  }
2615  //**********************************************************************************************
2616 
2617  //**At function*********************************************************************************
// Checked access to a single element of the result.
//
// \param index Access index.
// \return The same value as operator[] for \a index.
//
// If \a index is not smaller than the size of the wrapped expression, the error is
// reported through BLAZE_THROW_OUT_OF_RANGE with the message "Invalid vector access index".
2624  inline ReturnType at( size_t index ) const {
2625  if( index >= vector_.size() ) {
2626  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2627  }
2628  return (*this)[index];
2629  }
2630  //**********************************************************************************************
2631 
2632  //**Size function*******************************************************************************
// Returns the current size of the result vector, i.e. the size of the wrapped
// matrix-vector multiplication expression.
2637  inline size_t size() const {
2638  return vector_.size();
2639  }
2640  //**********************************************************************************************
2641 
2642  //**Left operand access*************************************************************************
// Returns the left-hand side operand: the wrapped matrix-vector multiplication expression.
2647  inline LeftOperand leftOperand() const {
2648  return vector_;
2649  }
2650  //**********************************************************************************************
2651 
2652  //**Right operand access************************************************************************
// Returns the right-hand side operand: the scalar factor.
2657  inline RightOperand rightOperand() const {
2658  return scalar_;
2659  }
2660  //**********************************************************************************************
2661 
2662  //**********************************************************************************************
// Returns whether the expression can alias with the given address.
//
// \param alias The address to be checked.
// \return The result of the canAlias() query on the wrapped multiplication expression.
//
// The scalar operand is not considered.
2668  template< typename T >
2669  inline bool canAlias( const T* alias ) const {
2670  return vector_.canAlias( alias );
2671  }
2672  //**********************************************************************************************
2673 
2674  //**********************************************************************************************
// Returns whether the expression is aliased with the given address.
//
// \param alias The address to be checked.
// \return The result of the isAliased() query on the wrapped multiplication expression.
//
// The scalar operand is not considered.
2680  template< typename T >
2681  inline bool isAliased( const T* alias ) const {
2682  return vector_.isAliased( alias );
2683  }
2684  //**********************************************************************************************
2685 
2686  //**********************************************************************************************
// Returns whether the operands of the expression are properly aligned in memory; the
// query is forwarded to the wrapped multiplication expression.
2691  inline bool isAligned() const {
2692  return vector_.isAligned();
2693  }
2694  //**********************************************************************************************
2695 
2696  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments.
//
// The result is true only if the result vector is larger than
// SMP_TDMATDVECMULT_THRESHOLD and at least one of the following holds:
//  - the BLAS library is not parallel (BLAZE_BLAS_IS_PARALLEL is false),
//  - the matrix operand is a computation that is not evaluated up front, or
//  - the matrix has fewer than TDMATDVECMULT_THRESHOLD elements.
// The latter two conditions mirror the conditions under which selectAssignKernel()
// picks the small kernel instead of the BLAS kernel.
2701  inline bool canSMPAssign() const noexcept {
2702  LeftOperand_<MVM> A( vector_.leftOperand() );
2703  return ( !BLAZE_BLAS_IS_PARALLEL ||
2704  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2705  ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
2706  ( size() > SMP_TDMATDVECMULT_THRESHOLD );
2707  }
2708  //**********************************************************************************************
2709 
2710  private:
2711  //**Member variables****************************************************************************
2712  LeftOperand vector_;  //!< Left-hand side operand: the matrix-vector multiplication expression.
2713  RightOperand scalar_;  //!< Right-hand side operand: the scalar factor.
2714  //**********************************************************************************************
2715 
2716  //**Assignment to dense vectors*****************************************************************
// Assignment of a scaled transpose dense matrix-dense vector multiplication expression
// to a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// The matrix and vector operands of the wrapped expression are evaluated into the local
// objects A and x (types LT and RT); the actual computation is delegated to
// selectAssignKernel() together with the scalar factor.
//
// Special cases:
//  - a matrix without rows leaves lhs untouched;
//  - a matrix without columns resets lhs.
2728  template< typename VT1 > // Type of the target dense vector
2729  friend inline void assign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
2730  {
2732
2733  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2734
// Access the matrix and vector operands of the wrapped multiplication expression.
2735  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
2736  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
2737
// Empty result: nothing to assign.
2738  if( left.rows() == 0UL ) {
2739  return;
2740  }
// No columns to sum over: the target is reset.
2741  else if( left.columns() == 0UL ) {
2742  reset( ~lhs );
2743  return;
2744  }
2745
// NOTE(review): serial() presumably forces a non-SMP evaluation of the operands - confirm.
2746  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
2747  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
2748
2749  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
2750  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
2751  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
2752  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
2753
2754  DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
2755  }
2756  //**********************************************************************************************
2757 
2758  //**Assignment to dense vectors (kernel selection)**********************************************
// Selection of the kernel for the assignment y = scalar * ( A * x ).
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// The small-matrix kernel is used if the matrix type is diagonal, if the original
// matrix operand is a computation that was not evaluated, or if the matrix has fewer
// than TDMATDVECMULT_THRESHOLD elements. Otherwise the BLAS kernel selection is used.
2769  template< typename VT1 // Type of the left-hand side target vector
2770  , typename MT1 // Type of the left-hand side matrix operand
2771  , typename VT2 // Type of the right-hand side vector operand
2772  , typename ST2 > // Type of the scalar value
2773  static inline void selectAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
2774  {
2775  if( ( IsDiagonal<MT1>::value ) ||
2776  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2777  ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
2778  selectSmallAssignKernel( y, A, x, scalar );
2779  else
2780  selectBlasAssignKernel( y, A, x, scalar );
2781  }
2782  //**********************************************************************************************
2783 
2784  //**Default assignment to dense vectors*********************************************************
// Default (non-vectorized) kernel for the assignment y = scalar * ( A * x ).
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// The product is accumulated column by column: each column j of A adds A(i,j) * x[j]
// to the affected elements y[i]. The structural traits of the matrix type (lower, upper,
// strictly lower/upper, diagonal) restrict the traversed row range per column. Except
// for diagonal matrices, where the scalar is applied directly, the result is scaled in
// a final pass.
// NOTE(review): the function accesses y[0], x[0] and y[M-1] without checking for empty
// operands; the visible caller assign() returns early for empty matrices.
2798  template< typename VT1 // Type of the left-hand side target vector
2799  , typename MT1 // Type of the left-hand side matrix operand
2800  , typename VT2 // Type of the right-hand side vector operand
2801  , typename ST2 > // Type of the scalar value
2802  static inline void selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
2803  {
2804  const size_t M( A.rows() );
2805  const size_t N( A.columns() );
2806
// Strictly lower matrix: the first row does not contribute, so y[0] is reset.
2807  if( IsStrictlyLower<MT1>::value ) {
2808  reset( y[0] );
2809  }
2810
// Non-upper matrix: initialize y with the contribution of the first column.
2811  if( !IsUpper<MT1>::value )
2812  {
2813  for( size_t i=( IsStrictlyLower<MT1>::value ? 1UL : 0UL ); i<M; ++i ) {
2814  y[i] = A(i,0UL) * x[0UL];
2815  }
2816  }
2817
// Remaining columns. Column 0 is only revisited for upper (but not strictly upper)
// matrices, since the initialization above is skipped for upper matrices.
2818  for( size_t j=( IsUpper<MT1>::value && !IsStrictlyUpper<MT1>::value ? 0UL : 1UL ); j<N; ++j )
2819  {
2820  if( IsDiagonal<MT1>::value )
2821  {
// Diagonal matrix: a single product per element, scaled immediately.
2822  y[j] = A(j,j) * x[j] * scalar;
2823  }
2824  else
2825  {
// Row range [ibegin,iend) of column j that is accumulated into already initialized elements.
2826  const size_t ibegin( ( IsLower<MT1>::value )
2827  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2828  :( 0UL ) );
2829  const size_t iend( ( IsUpper<MT1>::value )
2830  ?( IsStrictlyUpper<MT1>::value ? j-1UL : j )
2831  :( M ) );
2832  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2833
// ipos marks the end of the part that is processed two rows at a time
// (inum rounded down to an even count).
2834  const size_t inum( iend - ibegin );
2835  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2836
2837  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2838  y[i ] += A(i ,j) * x[j];
2839  y[i+1UL] += A(i+1UL,j) * x[j];
2840  }
// Leftover single row in case of an odd number of rows.
2841  if( ipos < iend ) {
2842  y[ipos] += A(ipos,j) * x[j];
2843  }
// Upper matrix: element iend is touched for the first time by column j, so it is
// assigned instead of accumulated.
2844  if( IsUpper<MT1>::value ) {
2845  y[iend] = A(iend,j) * x[j];
2846  }
2847  }
2848  }
2849
// Strictly upper matrix: the last row does not contribute, so y[M-1] is reset.
2850  if( IsStrictlyUpper<MT1>::value ) {
2851  reset( y[M-1UL] );
2852  }
2853
// Final scaling pass (diagonal matrices were already scaled above); the elements that
// were reset for strictly triangular matrices are skipped.
2854  if( !IsDiagonal<MT1>::value )
2855  {
2856  const size_t iend( IsStrictlyUpper<MT1>::value ? M-1UL : M );
2857  for( size_t i=( IsStrictlyLower<MT1>::value ? 1UL : 0UL ); i<iend; ++i ) {
2858  y[i] *= scalar;
2859  }
2860  }
2861  }
2862  //**********************************************************************************************
2863 
2864  //**Default assignment to dense vectors (small matrices)****************************************
// Small-matrix kernel for y = scalar * ( A * x ), non-vectorized variant.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// This overload is constrained through DisableIf_ on UseVectorizedDefaultKernel, i.e.
// it is the counterpart of the vectorized overload, and simply forwards to
// selectDefaultAssignKernel().
2878  template< typename VT1 // Type of the left-hand side target vector
2879  , typename MT1 // Type of the left-hand side matrix operand
2880  , typename VT2 // Type of the right-hand side vector operand
2881  , typename ST2 > // Type of the scalar value
2882  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
2883  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
2884  {
2885  selectDefaultAssignKernel( y, A, x, scalar );
2886  }
2887  //**********************************************************************************************
2888 
2889  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Small-matrix kernel for y = scalar * ( A * x ), vectorized variant.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// This overload is constrained through EnableIf_ on UseVectorizedDefaultKernel. The rows
// of the result are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors (SIMDSIZE
// elements each). For every group the columns [jbegin,jend) are traversed, x[j] is
// broadcast via set() and multiplied with the loaded part of column j; the sums are
// scaled by the broadcast scalar and stored to y. Rows that do not fill a SIMD vector
// are handled by a scalar loop if a remainder has to be expected.
// For triangular matrix types the column range is narrowed to the structurally
// non-zero part.
// NOTE(review): the SIMD accumulators (xmm1, ...) are only default-constructed before
// they are summed into; this assumes default construction yields zero - confirm.
2903  template< typename VT1 // Type of the left-hand side target vector
2904  , typename MT1 // Type of the left-hand side matrix operand
2905  , typename VT2 // Type of the right-hand side vector operand
2906  , typename ST2 > // Type of the scalar value
2907  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
2908  selectSmallAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
2909  {
2910  const size_t M( A.rows() );
2911  const size_t N( A.columns() );
2912
// A scalar remainder loop is needed unless both the matrix and the target are padded.
2913  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
2914
// End of the SIMD-processed row range: M rounded down to a multiple of SIMDSIZE if a
// remainder is handled separately, M otherwise.
2915  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
2916  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
2917
// Scalar broadcast to a SIMD vector.
2918  const SIMDType factor( set( scalar ) );
2919
2920  size_t i( 0UL );
2921
// Groups of eight SIMD vectors.
2922  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
2923  {
2924  const size_t jbegin( ( IsUpper<MT1>::value )
2925  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
2926  :( 0UL ) );
2927  const size_t jend( ( IsLower<MT1>::value )
2928  ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
2929  :( N ) );
2930  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2931
2932  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2933
2934  for( size_t j=jbegin; j<jend; ++j ) {
2935  const SIMDType x1( set( x[j] ) );
2936  xmm1 = xmm1 + A.load(i ,j) * x1;
2937  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
2938  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
2939  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
2940  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
2941  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
2942  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
2943  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
2944  }
2945
2946  y.store( i , xmm1*factor );
2947  y.store( i+SIMDSIZE , xmm2*factor );
2948  y.store( i+SIMDSIZE*2UL, xmm3*factor );
2949  y.store( i+SIMDSIZE*3UL, xmm4*factor );
2950  y.store( i+SIMDSIZE*4UL, xmm5*factor );
2951  y.store( i+SIMDSIZE*5UL, xmm6*factor );
2952  y.store( i+SIMDSIZE*6UL, xmm7*factor );
2953  y.store( i+SIMDSIZE*7UL, xmm8*factor );
2954  }
2955
// Groups of four SIMD vectors.
2956  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2957  {
2958  const size_t jbegin( ( IsUpper<MT1>::value )
2959  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
2960  :( 0UL ) );
2961  const size_t jend( ( IsLower<MT1>::value )
2962  ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
2963  :( N ) );
2964  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2965
2966  SIMDType xmm1, xmm2, xmm3, xmm4;
2967
2968  for( size_t j=jbegin; j<jend; ++j ) {
2969  const SIMDType x1( set( x[j] ) );
2970  xmm1 = xmm1 + A.load(i ,j) * x1;
2971  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
2972  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
2973  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
2974  }
2975
2976  y.store( i , xmm1*factor );
2977  y.store( i+SIMDSIZE , xmm2*factor );
2978  y.store( i+SIMDSIZE*2UL, xmm3*factor );
2979  y.store( i+SIMDSIZE*3UL, xmm4*factor );
2980  }
2981
// Groups of three SIMD vectors.
2982  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2983  {
2984  const size_t jbegin( ( IsUpper<MT1>::value )
2985  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
2986  :( 0UL ) );
2987  const size_t jend( ( IsLower<MT1>::value )
2988  ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
2989  :( N ) );
2990  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2991
2992  SIMDType xmm1, xmm2, xmm3;
2993
2994  for( size_t j=jbegin; j<jend; ++j ) {
2995  const SIMDType x1( set( x[j] ) );
2996  xmm1 = xmm1 + A.load(i ,j) * x1;
2997  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
2998  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
2999  }
3000
3001  y.store( i , xmm1*factor );
3002  y.store( i+SIMDSIZE , xmm2*factor );
3003  y.store( i+SIMDSIZE*2UL, xmm3*factor );
3004  }
3005
// Groups of two SIMD vectors.
3006  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3007  {
3008  const size_t jbegin( ( IsUpper<MT1>::value )
3009  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3010  :( 0UL ) );
3011  const size_t jend( ( IsLower<MT1>::value )
3012  ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3013  :( N ) );
3014  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3015
3016  SIMDType xmm1, xmm2;
3017
3018  for( size_t j=jbegin; j<jend; ++j ) {
3019  const SIMDType x1( set( x[j] ) );
3020  xmm1 = xmm1 + A.load(i ,j) * x1;
3021  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
3022  }
3023
3024  y.store( i , xmm1*factor );
3025  y.store( i+SIMDSIZE, xmm2*factor );
3026  }
3027
// Single SIMD vectors.
3028  for( ; i<ipos; i+=SIMDSIZE )
3029  {
3030  const size_t jbegin( ( IsUpper<MT1>::value )
3031  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3032  :( 0UL ) );
3033  const size_t jend( ( IsLower<MT1>::value )
3034  ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3035  :( N ) );
3036  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3037
3038  SIMDType xmm1;
3039
3040  for( size_t j=jbegin; j<jend; ++j ) {
3041  const SIMDType x1( set( x[j] ) );
3042  xmm1 = xmm1 + A.load(i,j) * x1;
3043  }
3044
3045  y.store( i, xmm1*factor );
3046  }
3047
// Scalar handling of the rows that do not fill a complete SIMD vector.
3048  for( ; remainder && i<M; ++i )
3049  {
3050  const size_t jbegin( ( IsUpper<MT1>::value )
3051  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3052  :( 0UL ) );
3053  const size_t jend( ( IsLower<MT1>::value )
3054  ?( min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3055  :( N ) );
3056  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3057
3058  ElementType value = ElementType();
3059
3060  for( size_t j=jbegin; j<jend; ++j ) {
3061  value += A(i,j) * x[j];
3062  }
3063
3064  y[i] = value * scalar;
3065  }
3066  }
3067  //**********************************************************************************************
3068 
3069  //**Default assignment to dense vectors (large matrices)****************************************
// Large-matrix kernel for y = scalar * ( A * x ), non-vectorized variant.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// This overload is constrained through DisableIf_ on UseVectorizedDefaultKernel, i.e.
// it is the counterpart of the vectorized overload, and simply forwards to
// selectDefaultAssignKernel().
3083  template< typename VT1 // Type of the left-hand side target vector
3084  , typename MT1 // Type of the left-hand side matrix operand
3085  , typename VT2 // Type of the right-hand side vector operand
3086  , typename ST2 > // Type of the scalar value
3087  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3088  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3089  {
3090  selectDefaultAssignKernel( y, A, x, scalar );
3091  }
3092  //**********************************************************************************************
3093 
3094  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Large-matrix kernel for y = scalar * ( A * x ), vectorized variant.
//
// \param y The target left-hand side dense vector.
// \param A The left-hand side dense matrix operand.
// \param x The right-hand side dense vector operand.
// \param scalar The scaling factor.
//
// This overload is constrained through EnableIf_ on UseVectorizedDefaultKernel. The
// target is reset first; afterwards the matrix is traversed in blocks of iblock rows by
// jblock columns. Within a block the rows are processed in groups of 8, 4, 3, 2 and 1
// SIMD vectors: the partial sums over the block's columns are scaled by the broadcast
// scalar and added to the values already stored in y. Rows that do not fill a SIMD
// vector are handled by a scalar loop if a remainder has to be expected.
// For triangular matrix types the row range of each block is narrowed to the
// structurally non-zero part.
// NOTE(review): the SIMD accumulators (xmm1, ...) are only default-constructed before
// they are summed into; this assumes default construction yields zero - confirm.
3108  template< typename VT1 // Type of the left-hand side target vector
3109  , typename MT1 // Type of the left-hand side matrix operand
3110  , typename VT2 // Type of the right-hand side vector operand
3111  , typename ST2 > // Type of the scalar value
3112  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3113  selectLargeAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3114  {
3115  const size_t M( A.rows() );
3116  const size_t N( A.columns() );
3117
// A scalar remainder loop is needed unless both the matrix and the target are padded.
3118  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
3119
// Block sizes: iblock rows correspond to 32768 bytes of ElementType values
// (NOTE(review): presumably chosen to match a cache size - confirm); jblock is 8
// columns if N is smaller than iblock and 4 columns otherwise.
3120  const size_t iblock( 32768UL / sizeof( ElementType ) );
3121  const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3122
3123  BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3124
// Scalar broadcast to a SIMD vector.
3125  const SIMDType factor( set( scalar ) );
3126
// All block contributions are accumulated into y, so it has to start from zero.
3127  reset( y );
3128
3129  for( size_t ii=0U; ii<M; ii+=iblock ) {
3130  for( size_t jj=0UL; jj<N; jj+=jblock )
3131  {
// Column range [jj,jend) and row range [i,iend) of the current block.
3132  const size_t jend( min( jj+jblock, N ) );
3133  const size_t itmp( min( ii+iblock, M ) );
3134  const size_t iend( ( IsUpper<MT1>::value )
3135  ?( min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
3136  :( itmp ) );
3137
// End of the SIMD-processed rows: iend rounded down to a multiple of SIMDSIZE if a
// remainder is handled separately, iend otherwise.
3138  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3139  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3140
// First row of the block; for lower matrices the start is moved down to the first
// structurally relevant row, rounded down to a multiple of SIMDSIZE.
3141  size_t i( ( IsLower<MT1>::value )
3142  ?( max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) & size_t(-SIMDSIZE) ) )
3143  :( ii ) );
3144
// Groups of eight SIMD vectors.
3145  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3146  {
3147  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3148
3149  for( size_t j=jj; j<jend; ++j ) {
3150  const SIMDType x1( set( x[j] ) );
3151  xmm1 = xmm1 + A.load(i ,j) * x1;
3152  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3153  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3154  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3155  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
3156  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
3157  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
3158  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
3159  }
3160
3161  y.store( i , y.load(i ) + xmm1*factor );
3162  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3163  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3164  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3165  y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3166  y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3167  y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3168  y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3169  }
3170
// Groups of four SIMD vectors.
3171  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3172  {
3173  SIMDType xmm1, xmm2, xmm3, xmm4;
3174
3175  for( size_t j=jj; j<jend; ++j ) {
3176  const SIMDType x1( set( x[j] ) );
3177  xmm1 = xmm1 + A.load(i ,j) * x1;
3178  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3179  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3180  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3181  }
3182
3183  y.store( i , y.load(i ) + xmm1*factor );
3184  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3185  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3186  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3187  }
3188
// Groups of three SIMD vectors.
3189  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3190  {
3191  SIMDType xmm1, xmm2, xmm3;
3192
3193  for( size_t j=jj; j<jend; ++j ) {
3194  const SIMDType x1( set( x[j] ) );
3195  xmm1 = xmm1 + A.load(i ,j) * x1;
3196  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3197  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3198  }
3199
3200  y.store( i , y.load(i ) + xmm1*factor );
3201  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3202  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3203  }
3204
// Groups of two SIMD vectors.
3205  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3206  {
3207  SIMDType xmm1, xmm2;
3208
3209  for( size_t j=jj; j<jend; ++j ) {
3210  const SIMDType x1( set( x[j] ) );
3211  xmm1 = xmm1 + A.load(i ,j) * x1;
3212  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
3213  }
3214
3215  y.store( i , y.load(i ) + xmm1*factor );
3216  y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3217  }
3218
// Single SIMD vectors.
3219  for( ; i<ipos; i+=SIMDSIZE )
3220  {
3221  SIMDType xmm1;
3222
3223  for( size_t j=jj; j<jend; ++j ) {
3224  xmm1 = xmm1 + A.load(i,j) * set( x[j] );
3225  }
3226
3227  y.store( i, y.load(i) + xmm1*factor );
3228  }
3229
// Scalar handling of the rows that do not fill a complete SIMD vector.
3230  for( ; remainder && i<iend; ++i )
3231  {
3232  ElementType value = ElementType();
3233
3234  for( size_t j=jj; j<jend; ++j ) {
3235  value += A(i,j) * x[j];
3236  }
3237
3238  y[i] += value * scalar;
3239  }
3240  }
3241  }
3242  }
3243  //**********************************************************************************************
3244 
3245  //**BLAS-based assignment to dense vectors (default)********************************************
3259  template< typename VT1 // Type of the left-hand side target vector
3260  , typename MT1 // Type of the left-hand side matrix operand
3261  , typename VT2 // Type of the right-hand side vector operand
3262  , typename ST2 > // Type of the scalar value
3263  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3264  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3265  {
3266  selectLargeAssignKernel( y, A, x, scalar );
3267  }
3268  //**********************************************************************************************
3269 
3270  //**BLAS-based assignment to dense vectors******************************************************
3271 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3272 
3285  template< typename VT1 // Type of the left-hand side target vector
3286  , typename MT1 // Type of the left-hand side matrix operand
3287  , typename VT2 // Type of the right-hand side vector operand
3288  , typename ST2 > // Type of the scalar value
3289  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3290  selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3291  {
3292  typedef ElementType_<VT1> ET;
3293 
3294  if( IsTriangular<MT1>::value ) {
3295  assign( y, scalar * x );
3296  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3297  }
3298  else {
3299  gemv( y, A, x, ET(scalar), ET(0) );
3300  }
3301  }
3302 #endif
3303  //**********************************************************************************************
3304 
3305  //**Assignment to sparse vectors****************************************************************
3317  template< typename VT1 > // Type of the target sparse vector
3318  friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3319  {
3321 
3324  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
3325 
3326  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3327 
3328  const ResultType tmp( serial( rhs ) );
3329  assign( ~lhs, tmp );
3330  }
3331  //**********************************************************************************************
3332 
3333  //**Addition assignment to dense vectors********************************************************
3345  template< typename VT1 > // Type of the target dense vector
3346  friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3347  {
3349 
3350  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3351 
3352  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
3353  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
3354 
3355  if( left.rows() == 0UL || left.columns() == 0UL ) {
3356  return;
3357  }
3358 
3359  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3360  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3361 
3362  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3363  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3364  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3365  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3366 
3367  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
3368  }
3369  //**********************************************************************************************
3370 
3371  //**Addition assignment to dense vectors (kernel selection)*************************************
3382  template< typename VT1 // Type of the left-hand side target vector
3383  , typename MT1 // Type of the left-hand side matrix operand
3384  , typename VT2 // Type of the right-hand side vector operand
3385  , typename ST2 > // Type of the scalar value
3386  static inline void selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3387  {
3388  if( ( IsDiagonal<MT1>::value ) ||
3389  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3390  ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3391  selectSmallAddAssignKernel( y, A, x, scalar );
3392  else
3393  selectBlasAddAssignKernel( y, A, x, scalar );
3394  }
3395  //**********************************************************************************************
3396 
3397  //**Default addition assignment to dense vectors************************************************
/*!\brief Default (non-vectorized) addition assignment kernel.
//
// \tparam VT1 Type of the left-hand side target vector.
// \tparam MT1 Type of the left-hand side matrix operand.
// \tparam VT2 Type of the right-hand side vector operand.
// \tparam ST2 Type of the scalar value.
// \param y The target vector.
// \param A The matrix operand.
// \param x The vector operand.
// \param scalar The scaling factor.
//
// Forms the expression A * x * scalar and hands it to the addAssign() member function of
// the target vector.
*/
template< typename VT1, typename MT1, typename VT2, typename ST2 >
static inline void selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
{
   y.addAssign( A * x * scalar );
}
3419  //**********************************************************************************************
3420 
3421  //**Default addition assignment to dense vectors (small matrices)*******************************
3435  template< typename VT1 // Type of the left-hand side target vector
3436  , typename MT1 // Type of the left-hand side matrix operand
3437  , typename VT2 // Type of the right-hand side vector operand
3438  , typename ST2 > // Type of the scalar value
3439  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3440  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3441  {
3442  selectDefaultAddAssignKernel( y, A, x, scalar );
3443  }
3444  //**********************************************************************************************
3445 
3446  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized addition assignment kernel (small-matrix variant).
//
// Adds the scaled matrix/vector product to the target: y[i] += scalar * sum_j A(i,j) * x[j]
// (this is exactly what the scalar remainder loop at the end computes per row; the SIMD
// loops do the same for SIMDSIZE rows at a time). The overload takes part in overload
// resolution only when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is satisfied.
//
//   y      - target vector, accessed via load()/store() and operator[]
//   A      - matrix operand, accessed via load(i,j) and operator()(i,j)
//   x      - vector operand, read element-wise
//   scalar - scaling factor applied to each accumulated row sum
//
// The rows are processed in chunks of 8, 4, 3, 2 and finally 1 SIMD vector(s). For every
// chunk the column range [jbegin,jend) is narrowed for upper matrices (start at column i,
// or i+1 if strictly upper) and for lower matrices (end at the last row of the chunk, one
// less if strictly lower).
//
// NOTE(review): the xmm accumulators are default-constructed and immediately summed into;
// presumably the SIMDType default constructor zero-initializes -- confirm in the SIMD headers.
3461  template< typename VT1 // Type of the left-hand side target vector
3462  , typename MT1 // Type of the left-hand side matrix operand
3463  , typename VT2 // Type of the right-hand side vector operand
3464  , typename ST2 > // Type of the scalar value
3465  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3466  selectSmallAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3467  {
3468  const size_t M( A.rows() );
3469  const size_t N( A.columns() );
3470 
// A scalar remainder loop is required unless both the matrix and the target vector are padded.
3471  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
3472 
// End of the SIMD-processed row range; with a remainder loop the assert below states the
// intended value M - ( M % SIMDSIZE ).
3473  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3474  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3475 
// SIMD value created from the scaling factor via set().
3476  const SIMDType factor( set( scalar ) );
3477 
3478  size_t i( 0UL );
3479 
// Chunks of 8 SIMD vectors of rows.
3480  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3481  {
3482  const size_t jbegin( ( IsUpper<MT1>::value )
3483  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3484  :( 0UL ) );
3485  const size_t jend( ( IsLower<MT1>::value )
3486  ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3487  :( N ) );
3488  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3489 
3490  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3491 
3492  for( size_t j=jbegin; j<jend; ++j ) {
3493  const SIMDType x1( set( x[j] ) );
3494  xmm1 = xmm1 + A.load(i ,j) * x1;
3495  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3496  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3497  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3498  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
3499  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
3500  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
3501  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
3502  }
3503 
// Scale the accumulated sums and add them to the current content of y.
3504  y.store( i , y.load(i ) + xmm1*factor );
3505  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3506  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3507  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3508  y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3509  y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3510  y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3511  y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3512  }
3513 
// Chunks of 4 SIMD vectors of rows.
3514  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3515  {
3516  const size_t jbegin( ( IsUpper<MT1>::value )
3517  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3518  :( 0UL ) );
3519  const size_t jend( ( IsLower<MT1>::value )
3520  ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3521  :( N ) );
3522  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3523 
3524  SIMDType xmm1, xmm2, xmm3, xmm4;
3525 
3526  for( size_t j=jbegin; j<jend; ++j ) {
3527  const SIMDType x1( set( x[j] ) );
3528  xmm1 = xmm1 + A.load(i ,j) * x1;
3529  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3530  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3531  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3532  }
3533 
3534  y.store( i , y.load(i ) + xmm1*factor );
3535  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3536  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3537  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3538  }
3539 
// Chunks of 3 SIMD vectors of rows.
3540  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3541  {
3542  const size_t jbegin( ( IsUpper<MT1>::value )
3543  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3544  :( 0UL ) );
3545  const size_t jend( ( IsLower<MT1>::value )
3546  ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3547  :( N ) );
3548  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3549 
3550  SIMDType xmm1, xmm2, xmm3;
3551 
3552  for( size_t j=jbegin; j<jend; ++j ) {
3553  const SIMDType x1( set( x[j] ) );
3554  xmm1 = xmm1 + A.load(i ,j) * x1;
3555  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3556  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3557  }
3558 
3559  y.store( i , y.load(i ) + xmm1*factor );
3560  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3561  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3562  }
3563 
// Chunks of 2 SIMD vectors of rows.
3564  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3565  {
3566  const size_t jbegin( ( IsUpper<MT1>::value )
3567  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3568  :( 0UL ) );
3569  const size_t jend( ( IsLower<MT1>::value )
3570  ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3571  :( N ) );
3572  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3573 
3574  SIMDType xmm1, xmm2;
3575 
3576  for( size_t j=jbegin; j<jend; ++j ) {
3577  const SIMDType x1( set( x[j] ) );
3578  xmm1 = xmm1 + A.load(i ,j) * x1;
3579  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
3580  }
3581 
3582  y.store( i , y.load(i ) + xmm1*factor );
3583  y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3584  }
3585 
// Single SIMD vectors of rows up to ipos.
3586  for( ; i<ipos; i+=SIMDSIZE )
3587  {
3588  const size_t jbegin( ( IsUpper<MT1>::value )
3589  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3590  :( 0UL ) );
3591  const size_t jend( ( IsLower<MT1>::value )
3592  ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3593  :( N ) );
3594  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3595 
3596  SIMDType xmm1;
3597 
3598  for( size_t j=jbegin; j<jend; ++j ) {
3599  xmm1 = xmm1 + A.load(i,j) * set( x[j] );
3600  }
3601 
3602  y.store( i, y.load(i) + xmm1*factor );
3603  }
3604 
// Scalar tail: remaining rows in [ipos,M), processed one element at a time.
3605  for( ; remainder && i<M; ++i )
3606  {
3607  const size_t jbegin( ( IsUpper<MT1>::value )
3608  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3609  :( 0UL ) );
3610  const size_t jend( ( IsLower<MT1>::value )
3611  ?( min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3612  :( N ) );
3613  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3614 
3615  ElementType value = ElementType();
3616 
3617  for( size_t j=jbegin; j<jend; ++j ) {
3618  value += A(i,j) * x[j];
3619  }
3620 
3621  y[i] += value * scalar;
3622  }
3623  }
3624  //**********************************************************************************************
3625 
3626  //**Default addition assignment to dense vectors (large matrices)*******************************
3640  template< typename VT1 // Type of the left-hand side target vector
3641  , typename MT1 // Type of the left-hand side matrix operand
3642  , typename VT2 // Type of the right-hand side vector operand
3643  , typename ST2 > // Type of the scalar value
3644  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3645  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3646  {
3647  selectDefaultAddAssignKernel( y, A, x, scalar );
3648  }
3649  //**********************************************************************************************
3650 
3651  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized addition assignment kernel (large-matrix variant, blocked).
//
// Adds the scaled matrix/vector product to the target vector. The matrix is traversed in
// row blocks of iblock rows and column blocks of jblock columns; for each (row block,
// column block) pair the partial sums over the columns [jj,jend) are scaled and added to y,
// so every row of y is updated once per column block. The overload takes part in overload
// resolution only when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is satisfied.
//
//   y      - target vector, accessed via load()/store() and operator[]
//   A      - matrix operand, accessed via load(i,j) and operator()(i,j)
//   x      - vector operand, read element-wise
//   scalar - scaling factor applied to each partial row sum
//
// NOTE(review): the xmm accumulators are default-constructed and immediately summed into;
// presumably the SIMDType default constructor zero-initializes -- confirm in the SIMD headers.
3666  template< typename VT1 // Type of the left-hand side target vector
3667  , typename MT1 // Type of the left-hand side matrix operand
3668  , typename VT2 // Type of the right-hand side vector operand
3669  , typename ST2 > // Type of the scalar value
3670  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3671  selectLargeAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3672  {
3673  const size_t M( A.rows() );
3674  const size_t N( A.columns() );
3675 
// A scalar remainder loop is required unless both the matrix and the target vector are padded.
3676  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
3677 
// Row-block size: the number of ElementType values that fit into 32768 bytes.
3678  const size_t iblock( 32768UL / sizeof( ElementType ) );
// Column-block width: 8 columns if N < iblock, otherwise 4.
3679  const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3680 
3681  BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3682 
// SIMD value created from the scaling factor via set().
3683  const SIMDType factor( set( scalar ) );
3684 
3685  for( size_t ii=0U; ii<M; ii+=iblock ) {
3686  for( size_t jj=0UL; jj<N; jj+=jblock )
3687  {
3688  const size_t jend( min( jj+jblock, N ) );
3689  const size_t itmp( min( ii+iblock, M ) );
// For upper matrices the row range of this column block is clipped to jend
// (jend-1 if strictly upper).
3690  const size_t iend( ( IsUpper<MT1>::value )
3691  ?( min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
3692  :( itmp ) );
3693 
// End of the SIMD-processed rows of this block; with a remainder loop the assert below
// states the intended value iend - ( iend % SIMDSIZE ).
3694  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3695  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3696 
// For lower matrices the first row is moved forward to jj (jj+1 if strictly lower) masked
// with size_t(-SIMDSIZE), but never before ii. NOTE(review): the mask presumably rounds down
// to a multiple of SIMDSIZE, which holds if SIMDSIZE is a power of two -- confirm.
3697  size_t i( ( IsLower<MT1>::value )
3698  ?( max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) & size_t(-SIMDSIZE) ) )
3699  :( ii ) );
3700 
// Chunks of 8 SIMD vectors of rows.
3701  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3702  {
3703  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3704 
3705  for( size_t j=jj; j<jend; ++j ) {
3706  const SIMDType x1( set( x[j] ) );
3707  xmm1 = xmm1 + A.load(i ,j) * x1;
3708  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3709  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3710  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3711  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
3712  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
3713  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
3714  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
3715  }
3716 
// Scale the partial sums of this column block and add them to the current content of y.
3717  y.store( i , y.load(i ) + xmm1*factor );
3718  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3719  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3720  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3721  y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3722  y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3723  y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3724  y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
3725  }
3726 
// Chunks of 4 SIMD vectors of rows.
3727  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3728  {
3729  SIMDType xmm1, xmm2, xmm3, xmm4;
3730 
3731  for( size_t j=jj; j<jend; ++j ) {
3732  const SIMDType x1( set( x[j] ) );
3733  xmm1 = xmm1 + A.load(i ,j) * x1;
3734  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3735  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3736  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3737  }
3738 
3739  y.store( i , y.load(i ) + xmm1*factor );
3740  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3741  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3742  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3743  }
3744 
// Chunks of 3 SIMD vectors of rows.
3745  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3746  {
3747  SIMDType xmm1, xmm2, xmm3;
3748 
3749  for( size_t j=jj; j<jend; ++j ) {
3750  const SIMDType x1( set( x[j] ) );
3751  xmm1 = xmm1 + A.load(i ,j) * x1;
3752  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3753  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3754  }
3755 
3756  y.store( i , y.load(i ) + xmm1*factor );
3757  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3758  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3759  }
3760 
// Chunks of 2 SIMD vectors of rows.
3761  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3762  {
3763  SIMDType xmm1, xmm2;
3764 
3765  for( size_t j=jj; j<jend; ++j ) {
3766  const SIMDType x1( set( x[j] ) );
3767  xmm1 = xmm1 + A.load(i ,j) * x1;
3768  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
3769  }
3770 
3771  y.store( i , y.load(i ) + xmm1*factor );
3772  y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
3773  }
3774 
// Single SIMD vectors of rows up to ipos.
3775  for( ; i<ipos; i+=SIMDSIZE )
3776  {
3777  SIMDType xmm1;
3778 
3779  for( size_t j=jj; j<jend; ++j ) {
3780  xmm1 = xmm1 + A.load(i,j) * set( x[j] );
3781  }
3782 
3783  y.store( i, y.load(i) + xmm1*factor );
3784  }
3785 
// Scalar tail: remaining rows in [ipos,iend) of this block, one element at a time.
3786  for( ; remainder && i<iend; ++i )
3787  {
3788  ElementType value = ElementType();
3789 
3790  for( size_t j=jj; j<jend; ++j ) {
3791  value += A(i,j) * x[j];
3792  }
3793 
3794  y[i] += value * scalar;
3795  }
3796  }
3797  }
3798  }
3799  //**********************************************************************************************
3800 
3801  //**BLAS-based addition assignment to dense vectors (default)***********************************
3815  template< typename VT1 // Type of the left-hand side target vector
3816  , typename MT1 // Type of the left-hand side matrix operand
3817  , typename VT2 // Type of the right-hand side vector operand
3818  , typename ST2 > // Type of the scalar value
3819  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3820  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3821  {
3822  selectLargeAddAssignKernel( y, A, x, scalar );
3823  }
3824  //**********************************************************************************************
3825 
3826  //**BLAS-based addition assignment to dense vectors*********************************************
3827 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3828 
3841  template< typename VT1 // Type of the left-hand side target vector
3842  , typename MT1 // Type of the left-hand side matrix operand
3843  , typename VT2 // Type of the right-hand side vector operand
3844  , typename ST2 > // Type of the scalar value
3845  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3846  selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3847  {
3848  typedef ElementType_<VT1> ET;
3849 
3850  if( IsTriangular<MT1>::value ) {
3851  ResultType_<VT1> tmp( serial( scalar * x ) );
3852  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3853  addAssign( y, tmp );
3854  }
3855  else {
3856  gemv( y, A, x, ET(scalar), ET(1) );
3857  }
3858  }
3859 #endif
3860  //**********************************************************************************************
3861 
3862  //**Addition assignment to sparse vectors*******************************************************
3863  // No special implementation for the addition assignment to sparse vectors.
3864  //**********************************************************************************************
3865 
3866  //**Subtraction assignment to dense vectors*****************************************************
3878  template< typename VT1 > // Type of the target dense vector
3879  friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
3880  {
3882 
3883  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3884 
3885  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
3886  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
3887 
3888  if( left.rows() == 0UL || left.columns() == 0UL ) {
3889  return;
3890  }
3891 
3892  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
3893  RT x( serial( right ) ); // Evaluation of the right-hand side dense vector operand
3894 
3895  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
3896  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
3897  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
3898  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
3899 
3900  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
3901  }
3902  //**********************************************************************************************
3903 
3904  //**Subtraction assignment to dense vectors (kernel selection)**********************************
3915  template< typename VT1 // Type of the left-hand side target vector
3916  , typename MT1 // Type of the left-hand side matrix operand
3917  , typename VT2 // Type of the right-hand side vector operand
3918  , typename ST2 > // Type of the scalar value
3919  static inline void selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3920  {
3921  if( ( IsDiagonal<MT1>::value ) ||
3922  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3923  ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3924  selectSmallSubAssignKernel( y, A, x, scalar );
3925  else
3926  selectBlasSubAssignKernel( y, A, x, scalar );
3927  }
3928  //**********************************************************************************************
3929 
3930  //**Default subtraction assignment to dense vectors*********************************************
/*!\brief Default (non-vectorized) subtraction assignment kernel.
//
// \tparam VT1 Type of the left-hand side target vector.
// \tparam MT1 Type of the left-hand side matrix operand.
// \tparam VT2 Type of the right-hand side vector operand.
// \tparam ST2 Type of the scalar value.
// \param y The target vector.
// \param A The matrix operand.
// \param x The vector operand.
// \param scalar The scaling factor.
//
// Forms the expression A * x * scalar and hands it to the subAssign() member function of
// the target vector.
*/
template< typename VT1, typename MT1, typename VT2, typename ST2 >
static inline void selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
{
   y.subAssign( A * x * scalar );
}
3952  //**********************************************************************************************
3953 
3954  //**Default subtraction assignment to dense vectors (small matrices)****************************
3968  template< typename VT1 // Type of the left-hand side target vector
3969  , typename MT1 // Type of the left-hand side matrix operand
3970  , typename VT2 // Type of the right-hand side vector operand
3971  , typename ST2 > // Type of the scalar value
3972  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3973  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
3974  {
3975  selectDefaultSubAssignKernel( y, A, x, scalar );
3976  }
3977  //**********************************************************************************************
3978 
3979  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized subtraction assignment kernel (small-matrix variant).
//
// Subtracts the scaled matrix/vector product from the target:
// y[i] -= scalar * sum_j A(i,j) * x[j] (this is exactly what the scalar remainder loop at
// the end computes per row; the SIMD loops do the same for SIMDSIZE rows at a time). The
// overload takes part in overload resolution only when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is satisfied.
//
//   y      - target vector, accessed via load()/store() and operator[]
//   A      - matrix operand, accessed via load(i,j) and operator()(i,j)
//   x      - vector operand, read element-wise
//   scalar - scaling factor applied to each accumulated row sum
//
// The rows are processed in chunks of 8, 4, 3, 2 and finally 1 SIMD vector(s). For every
// chunk the column range [jbegin,jend) is narrowed for upper matrices (start at column i,
// or i+1 if strictly upper) and for lower matrices (end at the last row of the chunk, one
// less if strictly lower).
//
// NOTE(review): the xmm accumulators are default-constructed and immediately summed into;
// presumably the SIMDType default constructor zero-initializes -- confirm in the SIMD headers.
3994  template< typename VT1 // Type of the left-hand side target vector
3995  , typename MT1 // Type of the left-hand side matrix operand
3996  , typename VT2 // Type of the right-hand side vector operand
3997  , typename ST2 > // Type of the scalar value
3998  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3999  selectSmallSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4000  {
4001  const size_t M( A.rows() );
4002  const size_t N( A.columns() );
4003 
// A scalar remainder loop is required unless both the matrix and the target vector are padded.
4004  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
4005 
// End of the SIMD-processed row range; with a remainder loop the assert below states the
// intended value M - ( M % SIMDSIZE ).
4006  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
4007  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
4008 
// SIMD value created from the scaling factor via set().
4009  const SIMDType factor( set( scalar ) );
4010 
4011  size_t i( 0UL );
4012 
// Chunks of 8 SIMD vectors of rows.
4013  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4014  {
4015  const size_t jbegin( ( IsUpper<MT1>::value )
4016  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4017  :( 0UL ) );
4018  const size_t jend( ( IsLower<MT1>::value )
4019  ?( min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4020  :( N ) );
4021  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4022 
4023  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4024 
4025  for( size_t j=jbegin; j<jend; ++j ) {
4026  const SIMDType x1( set( x[j] ) );
4027  xmm1 = xmm1 + A.load(i ,j) * x1;
4028  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4029  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4030  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
4031  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
4032  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
4033  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
4034  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
4035  }
4036 
// Scale the accumulated sums and subtract them from the current content of y.
4037  y.store( i , y.load(i ) - xmm1*factor );
4038  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4039  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4040  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4041  y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4042  y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4043  y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4044  y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
4045  }
4046 
// Chunks of 4 SIMD vectors of rows.
4047  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4048  {
4049  const size_t jbegin( ( IsUpper<MT1>::value )
4050  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4051  :( 0UL ) );
4052  const size_t jend( ( IsLower<MT1>::value )
4053  ?( min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4054  :( N ) );
4055  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4056 
4057  SIMDType xmm1, xmm2, xmm3, xmm4;
4058 
4059  for( size_t j=jbegin; j<jend; ++j ) {
4060  const SIMDType x1( set( x[j] ) );
4061  xmm1 = xmm1 + A.load(i ,j) * x1;
4062  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4063  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4064  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
4065  }
4066 
4067  y.store( i , y.load(i ) - xmm1*factor );
4068  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4069  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4070  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4071  }
4072 
// Chunks of 3 SIMD vectors of rows.
4073  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4074  {
4075  const size_t jbegin( ( IsUpper<MT1>::value )
4076  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4077  :( 0UL ) );
4078  const size_t jend( ( IsLower<MT1>::value )
4079  ?( min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4080  :( N ) );
4081  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4082 
4083  SIMDType xmm1, xmm2, xmm3;
4084 
4085  for( size_t j=jbegin; j<jend; ++j ) {
4086  const SIMDType x1( set( x[j] ) );
4087  xmm1 = xmm1 + A.load(i ,j) * x1;
4088  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4089  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4090  }
4091 
4092  y.store( i , y.load(i ) - xmm1*factor );
4093  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4094  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4095  }
4096 
// Chunks of 2 SIMD vectors of rows.
4097  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4098  {
4099  const size_t jbegin( ( IsUpper<MT1>::value )
4100  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4101  :( 0UL ) );
4102  const size_t jend( ( IsLower<MT1>::value )
4103  ?( min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4104  :( N ) );
4105  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4106 
4107  SIMDType xmm1, xmm2;
4108 
4109  for( size_t j=jbegin; j<jend; ++j ) {
4110  const SIMDType x1( set( x[j] ) );
4111  xmm1 = xmm1 + A.load(i ,j) * x1;
4112  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
4113  }
4114 
4115  y.store( i , y.load(i ) - xmm1*factor );
4116  y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
4117  }
4118 
// Single SIMD vectors of rows up to ipos.
4119  for( ; i<ipos; i+=SIMDSIZE )
4120  {
4121  const size_t jbegin( ( IsUpper<MT1>::value )
4122  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4123  :( 0UL ) );
4124  const size_t jend( ( IsLower<MT1>::value )
4125  ?( min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4126  :( N ) );
4127  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4128 
4129  SIMDType xmm1;
4130 
4131  for( size_t j=jbegin; j<jend; ++j ) {
4132  xmm1 = xmm1 + A.load(i,j) * set( x[j] );
4133  }
4134 
4135  y.store( i, y.load(i) - xmm1*factor );
4136  }
4137 
// Scalar tail: remaining rows in [ipos,M), processed one element at a time.
4138  for( ; remainder && i<M; ++i )
4139  {
4140  const size_t jbegin( ( IsUpper<MT1>::value )
4141  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4142  :( 0UL ) );
4143  const size_t jend( ( IsLower<MT1>::value )
4144  ?( min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4145  :( N ) );
4146  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4147 
4148  ElementType value = ElementType();
4149 
4150  for( size_t j=jbegin; j<jend; ++j ) {
4151  value += A(i,j) * x[j];
4152  }
4153 
4154  y[i] -= value * scalar;
4155  }
4156  }
4157  //**********************************************************************************************
4158 
4159  //**Default subtraction assignment to dense vectors (large matrices)****************************
4173  template< typename VT1 // Type of the left-hand side target vector
4174  , typename MT1 // Type of the left-hand side matrix operand
4175  , typename VT2 // Type of the right-hand side vector operand
4176  , typename ST2 > // Type of the scalar value
4177  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4178  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4179  {
4180  selectDefaultSubAssignKernel( y, A, x, scalar );
4181  }
4182  //**********************************************************************************************
4183 
4184  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized default subtraction assignment kernel for large matrices.
// Selected when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is true. For every row i
// it accumulates sum_j A(i,j)*x[j] over a block of columns and subtracts the sum,
// multiplied by 'scalar', from y[i]. The matrix is traversed in blocks of 'iblock'
// rows and 'jblock' columns; inside a block the rows are processed 8, 4, 3, 2 and 1
// SIMD vectors at a time, followed by an element-wise loop for leftover rows.
//   y      : target vector, updated in place
//   A      : left-hand side matrix operand (loaded SIMDSIZE consecutive rows at a time)
//   x      : right-hand side vector operand
//   scalar : scaling factor applied to the accumulated products
// NOTE(review): the accumulators xmm1..xmm8 are only default-constructed before being
// added to; this presumably relies on SIMDType's default constructor zero-initializing
// its value -- confirm against the SIMD type definition.
4199  template< typename VT1 // Type of the left-hand side target vector
4200  , typename MT1 // Type of the left-hand side matrix operand
4201  , typename VT2 // Type of the right-hand side vector operand
4202  , typename ST2 > // Type of the scalar value
4203  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4204  selectLargeSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4205  {
4206  const size_t M( A.rows() );
4207  const size_t N( A.columns() );
4208 
   // If either the matrix or the target vector is not padded, rows beyond the last
   // full SIMD vector must be handled by the scalar loop at the end of each block.
4209  const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
4210 
   // Row block size: the number of elements that fit into 32768 bytes.
   // Column block size: 8 columns if the matrix has fewer columns than iblock, else 4.
4211  const size_t iblock( 32768UL / sizeof( ElementType ) );
4212  const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
4213 
4214  BLAZE_INTERNAL_ASSERT( ( iblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
4215 
   // Broadcast of the scalar into a SIMD vector, used when updating y.
4216  const SIMDType factor( set( scalar ) );
4217 
4218  for( size_t ii=0U; ii<M; ii+=iblock ) {
4219  for( size_t jj=0UL; jj<N; jj+=jblock )
4220  {
   // Column range [jj,jend) and row range [i,iend) of the current block. For upper
   // matrices the row range is cut off at jend (jend-1 for strictly upper matrices).
4221  const size_t jend( min( jj+jblock, N ) );
4222  const size_t itmp( min( ii+iblock, M ) );
4223  const size_t iend( ( IsUpper<MT1>::value )
4224  ?( min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
4225  :( itmp ) );
4226 
   // End of the vectorized row range: iend rounded down to a multiple of SIMDSIZE
   // when a scalar remainder loop is required, otherwise iend itself.
4227  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4228  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
4229 
   // First row of the block. For lower matrices the rows above jj (jj+1 for strictly
   // lower matrices) are skipped; the start is rounded down to a multiple of SIMDSIZE
   // and never lies before ii.
4230  size_t i( ( IsLower<MT1>::value )
4231  ?( max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) & size_t(-SIMDSIZE) ) )
4232  :( ii ) );
4233 
   // Eight SIMD vectors of rows per iteration.
4234  for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4235  {
4236  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4237 
4238  for( size_t j=jj; j<jend; ++j ) {
4239  const SIMDType x1( set( x[j] ) );
4240  xmm1 = xmm1 + A.load(i ,j) * x1;
4241  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4242  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4243  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
4244  xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
4245  xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
4246  xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
4247  xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
4248  }
4249 
4250  y.store( i , y.load(i ) - xmm1*factor );
4251  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4252  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4253  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4254  y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4255  y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4256  y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4257  y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
4258  }
4259 
   // Four SIMD vectors of rows per iteration.
4260  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4261  {
4262  SIMDType xmm1, xmm2, xmm3, xmm4;
4263 
4264  for( size_t j=jj; j<jend; ++j ) {
4265  const SIMDType x1( set( x[j] ) );
4266  xmm1 = xmm1 + A.load(i ,j) * x1;
4267  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4268  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4269  xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
4270  }
4271 
4272  y.store( i , y.load(i ) - xmm1*factor );
4273  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4274  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4275  y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4276  }
4277 
   // Three SIMD vectors of rows per iteration.
4278  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4279  {
4280  SIMDType xmm1, xmm2, xmm3;
4281 
4282  for( size_t j=jj; j<jend; ++j ) {
4283  const SIMDType x1( set( x[j] ) );
4284  xmm1 = xmm1 + A.load(i ,j) * x1;
4285  xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4286  xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4287  }
4288 
4289  y.store( i , y.load(i ) - xmm1*factor );
4290  y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4291  y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4292  }
4293 
   // Two SIMD vectors of rows per iteration.
4294  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4295  {
4296  SIMDType xmm1, xmm2;
4297 
4298  for( size_t j=jj; j<jend; ++j ) {
4299  const SIMDType x1( set( x[j] ) );
4300  xmm1 = xmm1 + A.load(i ,j) * x1;
4301  xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
4302  }
4303 
4304  y.store( i , y.load(i ) - xmm1*factor );
4305  y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
4306  }
4307 
   // One SIMD vector of rows per iteration.
4308  for( ; i<ipos; i+=SIMDSIZE )
4309  {
4310  SIMDType xmm1;
4311 
4312  for( size_t j=jj; j<jend; ++j ) {
4313  xmm1 = xmm1 + A.load(i,j) * set( x[j] );
4314  }
4315 
4316  y.store( i, y.load(i) - xmm1*factor );
4317  }
4318 
   // Element-wise handling of the rows between ipos and iend (only when the operands
   // are not padded).
4319  for( ; remainder && i<iend; ++i )
4320  {
4321  ElementType value = ElementType();
4322 
4323  for( size_t j=jj; j<jend; ++j ) {
4324  value += A(i,j) * x[j];
4325  }
4326 
4327  y[i] -= value * scalar;
4328  }
4329  }
4330  }
4331  }
4332  //**********************************************************************************************
4333 
4334  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback of the BLAS-based subtraction assignment kernel.
// This overload is selected when UseBlasKernel<VT1,MT1,VT2,ST2> is false. It does not
// call any BLAS routine and forwards all arguments unchanged to
// selectLargeSubAssignKernel().
//   y      : target vector, passed on by mutable reference
//   A      : left-hand side matrix operand
//   x      : right-hand side vector operand
//   scalar : scaling factor
4348  template< typename VT1 // Type of the left-hand side target vector
4349  , typename MT1 // Type of the left-hand side matrix operand
4350  , typename VT2 // Type of the right-hand side vector operand
4351  , typename ST2 > // Type of the scalar value
4352  static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4353  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4354  {
4355  selectLargeSubAssignKernel( y, A, x, scalar );
4356  }
4357  //**********************************************************************************************
4358 
4359  //**BLAS-based subtraction assignment to dense vectors******************************************
4360 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4361 
// BLAS-based subtraction assignment kernel.
// Selected when UseBlasKernel<VT1,MT1,VT2,ST2> is true; only compiled if both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
//   y      : target vector, updated in place
//   A      : left-hand side matrix operand
//   x      : right-hand side vector operand
//   scalar : scaling factor
4374  template< typename VT1 // Type of the left-hand side target vector
4375  , typename MT1 // Type of the left-hand side matrix operand
4376  , typename VT2 // Type of the right-hand side vector operand
4377  , typename ST2 > // Type of the scalar value
4378  static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4379  selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
4380  {
4381  typedef ElementType_<VT1> ET;
4382 
   // Triangular matrices: the scaled vector is evaluated serially into a temporary,
   // passed to trmv() together with A and the matching lower/upper flag, and the
   // temporary is then subtracted from y.
   // NOTE(review): trmv() presumably overwrites tmp with A*tmp -- confirm against the
   // trmv wrapper.
4383  if( IsTriangular<MT1>::value ) {
4384  ResultType_<VT1> tmp( serial( scalar * x ) );
4385  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4386  subAssign( y, tmp );
4387  }
   // All other matrices: a single gemv() call with the negated scalar and the constant 1,
   // both converted to the element type of the target vector.
   // NOTE(review): presumably evaluates y = (-scalar)*A*x + 1*y -- confirm the argument
   // order of the gemv wrapper.
4388  else {
4389  gemv( y, A, x, ET(-scalar), ET(1) );
4390  }
4391  }
4392 #endif
4393  //**********************************************************************************************
4394 
4395  //**Subtraction assignment to sparse vectors****************************************************
4396  // No special implementation for the subtraction assignment to sparse vectors.
4397  //**********************************************************************************************
4398 
4399  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of a scaled matrix-vector product to a dense vector.
// The expression 'rhs' is first evaluated serially into a temporary of type ResultType;
// the temporary is then passed to multAssign() together with the target vector.
//   lhs : target dense vector; its size must equal rhs.size() (checked by an internal
//         assertion)
//   rhs : the scaled multiplication expression
4411  template< typename VT1 > // Type of the target dense vector
4412  friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4413  {
4415 
4418  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
4419 
4420  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4421 
4422  const ResultType tmp( serial( rhs ) );
4423  multAssign( ~lhs, tmp );
4424  }
4425  //**********************************************************************************************
4426 
4427  //**Multiplication assignment to sparse vectors*************************************************
4428  // No special implementation for the multiplication assignment to sparse vectors.
4429  //**********************************************************************************************
4430 
4431  //**Division assignment to dense vectors********************************************************
// Division assignment of a scaled matrix-vector product to a dense vector.
// The expression 'rhs' is first evaluated serially into a temporary of type ResultType;
// the temporary is then passed to divAssign() together with the target vector.
//   lhs : target dense vector; its size must equal rhs.size() (checked by an internal
//         assertion)
//   rhs : the scaled multiplication expression
4443  template< typename VT1 > // Type of the target dense vector
4444  friend inline void divAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4445  {
4447 
4450  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
4451 
4452  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4453 
4454  const ResultType tmp( serial( rhs ) );
4455  divAssign( ~lhs, tmp );
4456  }
4457  //**********************************************************************************************
4458 
4459  //**Division assignment to sparse vectors*******************************************************
4460  // No special implementation for the division assignment to sparse vectors.
4461  //**********************************************************************************************
4462 
4463  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of a scaled matrix-vector product to a dense vector.
// Only available if UseSMPAssign<VT1> is true. The matrix and vector operands of
// rhs.vector_ are evaluated into the local objects A and x, and the expression
// A * x * rhs.scalar_ is handed to smpAssign() for the target vector.
//   lhs : target dense vector; its size must equal rhs.size() (checked by an internal
//         assertion)
//   rhs : the scaled multiplication expression
4477  template< typename VT1 > // Type of the target dense vector
4478  friend inline EnableIf_< UseSMPAssign<VT1> >
4479  smpAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4480  {
4482 
4483  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4484 
4485  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
4486  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
4487 
   // A matrix without rows leaves nothing to assign; a matrix without columns yields
   // an empty sum for every element, so the target vector is reset instead.
4488  if( left.rows() == 0UL ) {
4489  return;
4490  }
4491  else if( left.columns() == 0UL ) {
4492  reset( ~lhs );
4493  return;
4494  }
4495 
4496  LT A( left ); // Evaluation of the left-hand side dense matrix operand
4497  RT x( right ); // Evaluation of the right-hand side dense vector operand
4498 
4499  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4500  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4501  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4502  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
4503 
4504  smpAssign( ~lhs, A * x * rhs.scalar_ );
4505  }
4506  //**********************************************************************************************
4507 
4508  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of a scaled matrix-vector product to a sparse vector.
// Only available if UseSMPAssign<VT1> is true. The expression 'rhs' is evaluated into
// a temporary of type ResultType, which is then passed to smpAssign() together with
// the target vector.
//   lhs : target sparse vector; its size must equal rhs.size() (checked by an internal
//         assertion)
//   rhs : the scaled multiplication expression
4522  template< typename VT1 > // Type of the target sparse vector
4523  friend inline EnableIf_< UseSMPAssign<VT1> >
4524  smpAssign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4525  {
4527 
4530  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
4531 
4532  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4533 
4534  const ResultType tmp( rhs );
4535  smpAssign( ~lhs, tmp );
4536  }
4537  //**********************************************************************************************
4538 
4539  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of a scaled matrix-vector product to a dense vector.
// Only available if UseSMPAssign<VT1> is true. The matrix and vector operands of
// rhs.vector_ are evaluated into the local objects A and x, and the expression
// A * x * rhs.scalar_ is handed to smpAddAssign() for the target vector.
//   lhs : target dense vector; its size must equal rhs.size() (checked by an internal
//         assertion)
//   rhs : the scaled multiplication expression
4553  template< typename VT1 > // Type of the target dense vector
4554  friend inline EnableIf_< UseSMPAssign<VT1> >
4555  smpAddAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4556  {
4558 
4559  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4560 
4561  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
4562  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
4563 
   // With an empty matrix there is nothing to add; the target vector stays untouched.
4564  if( left.rows() == 0UL || left.columns() == 0UL ) {
4565  return;
4566  }
4567 
4568  LT A( left ); // Evaluation of the left-hand side dense matrix operand
4569  RT x( right ); // Evaluation of the right-hand side dense vector operand
4570 
4571  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4572  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4573  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4574  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
4575 
4576  smpAddAssign( ~lhs, A * x * rhs.scalar_ );
4577  }
4578  //**********************************************************************************************
4579 
4580  //**SMP addition assignment to sparse vectors***************************************************
4581  // No special implementation for the SMP addition assignment to sparse vectors.
4582  //**********************************************************************************************
4583 
4584  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of a scaled matrix-vector product to a dense vector.
// Only available if UseSMPAssign<VT1> is true. The matrix and vector operands of
// rhs.vector_ are evaluated into the local objects A and x, and the expression
// A * x * rhs.scalar_ is handed to smpSubAssign() for the target vector.
//   lhs : target dense vector; its size must equal rhs.size() (checked by an internal
//         assertion)
//   rhs : the scaled multiplication expression
4598  template< typename VT1 > // Type of the target dense vector
4599  friend inline EnableIf_< UseSMPAssign<VT1> >
4600  smpSubAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4601  {
4603 
4604  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4605 
4606  LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
4607  RightOperand_<MVM> right( rhs.vector_.rightOperand() );
4608 
   // With an empty matrix there is nothing to subtract; the target vector stays untouched.
4609  if( left.rows() == 0UL || left.columns() == 0UL ) {
4610  return;
4611  }
4612 
4613  LT A( left ); // Evaluation of the left-hand side dense matrix operand
4614  RT x( right ); // Evaluation of the right-hand side dense vector operand
4615 
4616  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4617  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns(), "Invalid number of columns" );
4618  BLAZE_INTERNAL_ASSERT( x.size() == right.size() , "Invalid vector size" );
4619  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).size() , "Invalid vector size" );
4620 
4621  smpSubAssign( ~lhs, A * x * rhs.scalar_ );
4622  }
4623  //**********************************************************************************************
4624 
4625  //**SMP subtraction assignment to sparse vectors************************************************
4626  // No special implementation for the SMP subtraction assignment to sparse vectors.
4627  //**********************************************************************************************
4628 
4629  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of a scaled matrix-vector product to a dense vector.
// Only available if UseSMPAssign<VT1> is true. The expression 'rhs' is evaluated into
// a temporary of type ResultType, which is then passed to smpMultAssign() together
// with the target vector.
//   lhs : target dense vector; its size must equal rhs.size() (checked by an internal
//         assertion)
//   rhs : the scaled multiplication expression
4644  template< typename VT1 > // Type of the target dense vector
4645  friend inline EnableIf_< UseSMPAssign<VT1> >
4646  smpMultAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4647  {
4649 
4652  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
4653 
4654  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4655 
4656  const ResultType tmp( rhs );
4657  smpMultAssign( ~lhs, tmp );
4658  }
4659  //**********************************************************************************************
4660 
4661  //**SMP multiplication assignment to sparse vectors*********************************************
4662  // No special implementation for the SMP multiplication assignment to sparse vectors.
4663  //**********************************************************************************************
4664 
4665  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of a scaled matrix-vector product to a dense vector.
// Only available if UseSMPAssign<VT1> is true. The expression 'rhs' is evaluated into
// a temporary of type ResultType, which is then passed to smpDivAssign() together
// with the target vector.
//   lhs : target dense vector; its size must equal rhs.size() (checked by an internal
//         assertion)
//   rhs : the scaled multiplication expression
4679  template< typename VT1 > // Type of the target dense vector
4680  friend inline EnableIf_< UseSMPAssign<VT1> >
4681  smpDivAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
4682  {
4684 
4687  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
4688 
4689  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4690 
4691  const ResultType tmp( rhs );
4692  smpDivAssign( ~lhs, tmp );
4693  }
4694  //**********************************************************************************************
4695 
4696  //**SMP division assignment to sparse vectors***************************************************
4697  // No special implementation for the SMP division assignment to sparse vectors.
4698  //**********************************************************************************************
4699 
4700  //**Compile time checks*************************************************************************
4708  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
4709  //**********************************************************************************************
4710 };
4712 //*************************************************************************************************
4713 
4714 
4715 
4716 
4717 //=================================================================================================
4718 //
4719 // GLOBAL BINARY ARITHMETIC OPERATORS
4720 //
4721 //=================================================================================================
4722 
4723 //*************************************************************************************************
// Multiplication operator for a column-major dense matrix and a dense column vector.
// Returns a TDMatDVecMultExpr expression object that represents the product mat * vec.
// The operator is disabled if T1 is a matrix/matrix multiplication expression
// (IsMatMatMultExpr<T1>).
//   mat : left-hand side dense matrix
//   vec : right-hand side dense vector
// Throws an invalid-argument exception (via BLAZE_THROW_INVALID_ARGUMENT) if the
// number of columns of 'mat' differs from the size of 'vec'.
// NOTE(review): listing line 4757 (the function declarator) was missing from this
// listing and has been restored; the parameter names follow the body, the storage
// order / transpose flags follow the column-major kernels and the DenseVector<...,false>
// base of TDMatDVecMultExpr -- confirm against the upstream header.
4754 template< typename T1 // Type of the left-hand side dense matrix
4755  , typename T2 > // Type of the right-hand side dense vector
4756 inline const DisableIf_< IsMatMatMultExpr<T1>, TDMatDVecMultExpr<T1,T2> >
4757  operator*( const DenseMatrix<T1,true>& mat, const DenseVector<T2,false>& vec )
4758 {
4760 
4761  if( (~mat).columns() != (~vec).size() ) {
4762  BLAZE_THROW_INVALID_ARGUMENT( "Matrix and vector sizes do not match" );
4763  }
4764 
4765  return TDMatDVecMultExpr<T1,T2>( ~mat, ~vec );
4766 }
4767 //*************************************************************************************************
4768 
4769 
4770 
4771 
4772 //=================================================================================================
4773 //
4774 // SIZE SPECIALIZATIONS
4775 //
4776 //=================================================================================================
4777 
4778 //*************************************************************************************************
// Size specialization for TDMatDVecMultExpr: the specialization derives from Rows<MT>,
// i.e. the compile-time size of the product vector is taken from the number of rows of
// the matrix operand type MT.
4780 template< typename MT, typename VT >
4781 struct Size< TDMatDVecMultExpr<MT,VT> > : public Rows<MT>
4782 {};
4784 //*************************************************************************************************
4785 
4786 
4787 
4788 
4789 //=================================================================================================
4790 //
4791 // ISALIGNED SPECIALIZATIONS
4792 //
4793 //=================================================================================================
4794 
4795 //*************************************************************************************************
// IsAligned specialization for TDMatDVecMultExpr: the expression is reported as aligned
// only if both the matrix operand type MT and the vector operand type VT are aligned.
4797 template< typename MT, typename VT >
4798 struct IsAligned< TDMatDVecMultExpr<MT,VT> >
4799  : public BoolConstant< And< IsAligned<MT>, IsAligned<VT> >::value >
4800 {};
4802 //*************************************************************************************************
4803 
4804 
4805 
4806 
4807 //=================================================================================================
4808 //
4809 // EXPRESSION TRAIT SPECIALIZATIONS
4810 //
4811 //=================================================================================================
4812 
4813 //*************************************************************************************************
// SubvectorExprTrait specialization for TDMatDVecMultExpr: the type of a subvector of
// the product (with alignment flag AF) is defined as the multiplication expression type
// of a submatrix of 'const MT' and a subvector of 'const VT', both using the same
// alignment flag.
4815 template< typename MT, typename VT, bool AF >
4816 struct SubvectorExprTrait< TDMatDVecMultExpr<MT,VT>, AF >
4817 {
4818  public:
4819  //**********************************************************************************************
4820  using Type = MultExprTrait_< SubmatrixExprTrait_<const MT,AF>
4821  , SubvectorExprTrait_<const VT,AF> >;
4822  //**********************************************************************************************
4823 };
4825 //*************************************************************************************************
4826 
4827 } // namespace blaze
4828 
4829 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:131
Header file for auxiliary alias declarations.
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:7800
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:208
Header file for basic type definitions.
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:135
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDVecMultExpr.h:374
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
TDMatDVecMultExpr< MT, VT > This
Type of this TDMatDVecMultExpr instance.
Definition: TDMatDVecMultExpr.h:207
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
If_< IsExpression< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:216
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Expression object for transpose dense matrix-dense vector multiplications.The TDMatDVecMultExpr class...
Definition: Forward.h:129
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:209
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDVecMultExpr.h:364
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: AreSIMDCombinable.h:121
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDVecMultExpr.h:210
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:132
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:330
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:110
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:134
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDVecMultExpr.h:222
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:133
Header file for all forward declarations for expression class templates.
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDMatDVecMultExpr.h:310
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDVecMultExpr.h:211
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDMatDVecMultExpr.h:265
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE const EnableIf_< And< IsIntegral< T >, HasSize< T, 1UL > >, If_< IsSigned< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:76
Header file for the HasConstDataAccess type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
System settings for the BLAS mode.
Header file for the SubmatrixExprTrait class template.
Header file for the HasSIMDMult type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDVecMultExpr.h:320
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDVecMultExpr.h:354
Header file for run time assertion macros.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraints on the storage order of matrix types.
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:74
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
Header file for the AreSIMDCombinable type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDMatDVecMultExpr.h:297
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Constraint on the data type.
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDVecMultExpr.h:384
Header file for the MatVecMultExpr base class.
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:136
TDMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the TDMatDVecMultExpr class.
Definition: TDMatDVecMultExpr.h:251
Header file for the Size type trait.
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: TDMatDVecMultExpr.h:385
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDVecMultExpr.h:213
If_< IsExpression< VT >, const VT, const VT & > RightOperand
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:219
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:212
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDVecMultExpr.h:342
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:330
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:225