TDVecDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/Exception.h>
58 #include <blaze/math/Functions.h>
59 #include <blaze/math/shims/Reset.h>
61 #include <blaze/math/SIMD.h>
85 #include <blaze/system/BLAS.h>
88 #include <blaze/util/Assert.h>
89 #include <blaze/util/Complex.h>
92 #include <blaze/util/DisableIf.h>
93 #include <blaze/util/EnableIf.h>
96 #include <blaze/util/mpl/And.h>
97 #include <blaze/util/mpl/If.h>
98 #include <blaze/util/Types.h>
107 
108 
109 namespace blaze {
110 
111 //=================================================================================================
112 //
113 // CLASS TDVECDMATMULTEXPR
114 //
115 //=================================================================================================
116 
117 //*************************************************************************************************
124 template< typename VT // Type of the left-hand side dense vector
125  , typename MT > // Type of the right-hand side dense matrix
126 class TDVecDMatMultExpr : public DenseVector< TDVecDMatMultExpr<VT,MT>, true >
127  , private TVecMatMultExpr
128  , private Computation
129 {
130  private:
131  //**Type definitions****************************************************************************
138  //**********************************************************************************************
139 
140  //**********************************************************************************************
142  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
143  //**********************************************************************************************
144 
145  //**********************************************************************************************
147  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
149  //**********************************************************************************************
150 
151  //**********************************************************************************************
153 
157  template< typename T1 >
158  struct UseSMPAssign {
159  enum : bool { value = ( evaluateVector || evaluateMatrix ) };
160  };
162  //**********************************************************************************************
163 
164  //**********************************************************************************************
166 
169  template< typename T1, typename T2, typename T3 >
170  struct UseBlasKernel {
172  HasMutableDataAccess<T1>::value &&
173  HasConstDataAccess<T2>::value &&
174  HasConstDataAccess<T3>::value &&
175  !IsDiagonal<T3>::value &&
176  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
177  IsBLASCompatible< ElementType_<T1> >::value &&
178  IsBLASCompatible< ElementType_<T2> >::value &&
179  IsBLASCompatible< ElementType_<T3> >::value &&
180  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
181  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
182  };
184  //**********************************************************************************************
185 
186  //**********************************************************************************************
188 
192  template< typename T1, typename T2, typename T3 >
193  struct UseVectorizedDefaultKernel {
194  enum : bool { value = useOptimizedKernels &&
195  !IsDiagonal<T3>::value &&
196  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
197  AreSIMDCombinable< ElementType_<T1>
198  , ElementType_<T2>
199  , ElementType_<T3> >::value &&
200  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
201  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
202  };
204  //**********************************************************************************************
205 
206  public:
207  //**Type definitions****************************************************************************
213  typedef const ElementType ReturnType;
214  typedef const ResultType CompositeType;
215 
217  typedef If_< IsExpression<VT>, const VT, const VT& > LeftOperand;
218 
220  typedef If_< IsExpression<MT>, const MT, const MT& > RightOperand;
221 
224 
227  //**********************************************************************************************
228 
229  //**Compilation flags***************************************************************************
231  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
232  VT::simdEnabled && MT::simdEnabled &&
235 
237  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
238  !evaluateMatrix && MT::smpAssignable };
239  //**********************************************************************************************
240 
241  //**SIMD properties*****************************************************************************
243  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
244  //**********************************************************************************************
245 
246  //**Constructor*********************************************************************************
252  explicit inline TDVecDMatMultExpr( const VT& vec, const MT& mat ) noexcept
253  : vec_( vec ) // Left-hand side dense vector of the multiplication expression
254  , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
255  {
256  BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
257  }
258  //**********************************************************************************************
259 
260  //**Subscript operator**************************************************************************
266  inline ReturnType operator[]( size_t index ) const {
267  BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
268 
270  {
271  return vec_[index] * mat_(index,index);
272  }
273  else if( IsLower<MT>::value && ( index > 8UL ) )
274  {
275  const size_t begin( IsStrictlyLower<MT>::value ? index+1UL : index );
276  const size_t n ( mat_.rows() - begin );
277  return subvector( vec_, begin, n ) * subvector( column( mat_, index ), begin, n );
278  }
279  else if( IsUpper<MT>::value && ( index + 8UL < mat_.rows() ) )
280  {
281  const size_t n( IsStrictlyUpper<MT>::value ? index : index+1UL );
282  return subvector( vec_, 0UL, n ) * subvector( column( mat_, index ), 0UL, n );
283  }
284  else
285  {
286  return vec_ * column( mat_, index );
287  }
288  }
289  //**********************************************************************************************
290 
291  //**At function*********************************************************************************
298  inline ReturnType at( size_t index ) const {
299  if( index >= mat_.columns() ) {
300  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
301  }
302  return (*this)[index];
303  }
304  //**********************************************************************************************
305 
306  //**Size function*******************************************************************************
311  inline size_t size() const noexcept {
312  return mat_.columns();
313  }
314  //**********************************************************************************************
315 
316  //**Left operand access*************************************************************************
321  inline LeftOperand leftOperand() const noexcept {
322  return vec_;
323  }
324  //**********************************************************************************************
325 
326  //**Right operand access************************************************************************
331  inline RightOperand rightOperand() const noexcept {
332  return mat_;
333  }
334  //**********************************************************************************************
335 
336  //**********************************************************************************************
342  template< typename T >
343  inline bool canAlias( const T* alias ) const noexcept {
344  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
345  }
346  //**********************************************************************************************
347 
348  //**********************************************************************************************
354  template< typename T >
355  inline bool isAliased( const T* alias ) const noexcept {
356  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
357  }
358  //**********************************************************************************************
359 
360  //**********************************************************************************************
365  inline bool isAligned() const noexcept {
366  return vec_.isAligned() && mat_.isAligned();
367  }
368  //**********************************************************************************************
369 
370  //**********************************************************************************************
375  inline bool canSMPAssign() const noexcept {
376  return ( !BLAZE_BLAS_IS_PARALLEL ||
377  ( IsComputation<MT>::value && !evaluateMatrix ) ||
378  ( mat_.rows() * mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
379  ( size() > SMP_TDVECDMATMULT_THRESHOLD );
380  }
381  //**********************************************************************************************
382 
383  private:
384  //**Member variables****************************************************************************
385  LeftOperand vec_;
386  RightOperand mat_;
387  //**********************************************************************************************
388 
389  //**Assignment to dense vectors*****************************************************************
402  template< typename VT1 > // Type of the target dense vector
403  friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
404  {
406 
407  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
408 
409  if( rhs.mat_.rows() == 0UL ) {
410  reset( ~lhs );
411  return;
412  }
413  else if( rhs.mat_.columns() == 0UL ) {
414  return;
415  }
416 
417  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
418  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
419 
420  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
421  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
422  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
423  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
424 
425  TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
426  }
428  //**********************************************************************************************
429 
430  //**Assignment to dense vectors (kernel selection)**********************************************
441  template< typename VT1 // Type of the left-hand side target vector
442  , typename VT2 // Type of the left-hand side vector operand
443  , typename MT1 > // Type of the right-hand side matrix operand
444  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
445  {
446  if( ( IsDiagonal<MT1>::value ) ||
447  ( IsComputation<MT>::value && !evaluateMatrix ) ||
448  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
449  selectSmallAssignKernel( y, x, A );
450  else
451  selectBlasAssignKernel( y, x, A );
452  }
454  //**********************************************************************************************
455 
456  //**Default assignment to dense vectors*********************************************************
470  template< typename VT1 // Type of the left-hand side target vector
471  , typename VT2 // Type of the left-hand side vector operand
472  , typename MT1 > // Type of the right-hand side matrix operand
473  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
474  {
475  const size_t M( A.rows() );
476  const size_t N( A.columns() );
477 
478  if( IsStrictlyUpper<MT1>::value ) {
479  reset( y[0] );
480  }
481 
482  if( !IsLower<MT1>::value )
483  {
484  const size_t jbegin( IsStrictlyUpper<MT1>::value ? 1UL : 0UL );
485  for( size_t j=jbegin; j<N; ++j ) {
486  y[j] = x[0UL] * A(0UL,j);
487  }
488  }
489 
490  for( size_t i=( IsLower<MT1>::value && !IsStrictlyLower<MT1>::value ? 0UL : 1UL ); i<M; ++i )
491  {
492  if( IsDiagonal<MT1>::value )
493  {
494  y[i] = x[i] * A(i,i);
495  }
496  else
497  {
498  const size_t jbegin( ( IsUpper<MT1>::value )
499  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
500  :( 0UL ) );
501  const size_t jend( ( IsLower<MT1>::value )
502  ?( IsStrictlyLower<MT1>::value ? i-1UL : i )
503  :( N ) );
504  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
505 
506  const size_t jnum( jend - jbegin );
507  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
508 
509  for( size_t j=jbegin; j<jpos; j+=2UL ) {
510  y[j ] += x[i] * A(i,j );
511  y[j+1UL] += x[i] * A(i,j+1UL);
512  }
513  if( jpos < jend ) {
514  y[jpos] += x[i] * A(i,jpos);
515  }
516  if( IsLower<MT1>::value ) {
517  y[jend] = x[i] * A(i,jend);
518  }
519  }
520  }
521 
522  if( IsStrictlyLower<MT1>::value ) {
523  reset( y[N-1UL] );
524  }
525  }
527  //**********************************************************************************************
528 
529  //**Default assignment to dense vectors (small matrices)****************************************
543  template< typename VT1 // Type of the left-hand side target vector
544  , typename VT2 // Type of the left-hand side vector operand
545  , typename MT1 > // Type of the right-hand side matrix operand
546  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
547  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
548  {
549  selectDefaultAssignKernel( y, x, A );
550  }
552  //**********************************************************************************************
553 
554  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Vectorized small-matrix kernel for the assignment y = x * A; enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> is true.
//
//   y : the target vector
//   x : the left-hand side vector operand
//   A : the right-hand side matrix operand
//
// The columns of A are processed in chunks of 8, 4, 3, 2 and finally 1 SIMD vector(s). For each
// chunk the products set(x[i]) * A.load(i,j) are accumulated over the rows i in local SIMD
// accumulators, which are then stored to y. Columns left over after the SIMD loops are handled
// by a scalar loop.
568  template< typename VT1 // Type of the left-hand side target vector
569  , typename VT2 // Type of the left-hand side vector operand
570  , typename MT1 > // Type of the right-hand side matrix operand
571  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
572  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
573  {
574  const size_t M( A.rows() );
575  const size_t N( A.columns() );
576 
   // A scalar clean-up loop is required unless both the target vector and the matrix are padded
577  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
578 
   // End of the column range covered by the SIMD loops: N rounded down to a multiple of
   // SIMDSIZE if a remainder loop is needed (see the assertion), otherwise N itself
579  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
580  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
581 
582  size_t j( 0UL );
583 
   // Chunks of 8 SIMD vectors
584  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
585  {
   // Row range [ibegin,iend) contributing to this chunk; restricted for lower/upper matrices
586  const size_t ibegin( ( IsLower<MT1>::value )
587  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
588  :( 0UL ) );
589  const size_t iend( ( IsUpper<MT1>::value )
590  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
591  :( M ) );
592  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
593 
   // NOTE(review): the accumulators are not initialized explicitly; presumably a
   // default-constructed SIMDType is zero - confirm.
594  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
595 
596  for( size_t i=ibegin; i<iend; ++i ) {
597  const SIMDType x1( set( x[i] ) );
598  xmm1 = xmm1 + x1 * A.load(i,j );
599  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
600  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
601  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
602  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
603  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
604  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
605  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
606  }
607 
608  y.store( j , xmm1 );
609  y.store( j+SIMDSIZE , xmm2 );
610  y.store( j+SIMDSIZE*2UL, xmm3 );
611  y.store( j+SIMDSIZE*3UL, xmm4 );
612  y.store( j+SIMDSIZE*4UL, xmm5 );
613  y.store( j+SIMDSIZE*5UL, xmm6 );
614  y.store( j+SIMDSIZE*6UL, xmm7 );
615  y.store( j+SIMDSIZE*7UL, xmm8 );
616  }
617 
   // Chunks of 4 SIMD vectors
618  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
619  {
620  const size_t ibegin( ( IsLower<MT1>::value )
621  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
622  :( 0UL ) );
623  const size_t iend( ( IsUpper<MT1>::value )
624  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
625  :( M ) );
626  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
627 
628  SIMDType xmm1, xmm2, xmm3, xmm4;
629 
630  for( size_t i=ibegin; i<iend; ++i ) {
631  const SIMDType x1( set( x[i] ) );
632  xmm1 = xmm1 + x1 * A.load(i,j );
633  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
634  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
635  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
636  }
637 
638  y.store( j , xmm1 );
639  y.store( j+SIMDSIZE , xmm2 );
640  y.store( j+SIMDSIZE*2UL, xmm3 );
641  y.store( j+SIMDSIZE*3UL, xmm4 );
642  }
643 
   // Chunks of 3 SIMD vectors
644  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
645  {
646  const size_t ibegin( ( IsLower<MT1>::value )
647  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
648  :( 0UL ) );
649  const size_t iend( ( IsUpper<MT1>::value )
650  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
651  :( M ) );
652  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
653 
654  SIMDType xmm1, xmm2, xmm3;
655 
656  for( size_t i=ibegin; i<iend; ++i ) {
657  const SIMDType x1( set( x[i] ) );
658  xmm1 = xmm1 + x1 * A.load(i,j );
659  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
660  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
661  }
662 
663  y.store( j , xmm1 );
664  y.store( j+SIMDSIZE , xmm2 );
665  y.store( j+SIMDSIZE*2UL, xmm3 );
666  }
667 
   // Chunks of 2 SIMD vectors
668  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
669  {
670  const size_t ibegin( ( IsLower<MT1>::value )
671  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
672  :( 0UL ) );
673  const size_t iend( ( IsUpper<MT1>::value )
674  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
675  :( M ) );
676  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
677 
678  SIMDType xmm1, xmm2;
679 
680  for( size_t i=ibegin; i<iend; ++i ) {
681  const SIMDType x1( set( x[i] ) );
682  xmm1 = xmm1 + x1 * A.load(i,j );
683  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
684  }
685 
686  y.store( j , xmm1 );
687  y.store( j+SIMDSIZE, xmm2 );
688  }
689 
   // Single SIMD vectors
690  for( ; j<jpos; j+=SIMDSIZE )
691  {
692  const size_t ibegin( ( IsLower<MT1>::value )
693  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
694  :( 0UL ) );
695  const size_t iend( ( IsUpper<MT1>::value )
696  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
697  :( M ) );
698  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
699 
700  SIMDType xmm1;
701 
702  for( size_t i=ibegin; i<iend; ++i ) {
703  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
704  }
705 
706  y.store( j, xmm1 );
707  }
708 
   // Scalar clean-up for the columns in [jpos,N); only executed if a remainder is possible
709  for( ; remainder && j<N; ++j )
710  {
711  const size_t ibegin( ( IsLower<MT1>::value )
712  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
713  :( 0UL ) );
714  const size_t iend( ( IsUpper<MT1>::value )
715  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
716  :( M ) );
717  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
718 
719  ElementType value = ElementType();
720 
721  for( size_t i=ibegin; i<iend; ++i ) {
722  value += x[i] * A(i,j);
723  }
724 
725  y[j] = value;
726  }
727  }
729  //**********************************************************************************************
730 
731  //**Default assignment to dense vectors (large matrices)****************************************
745  template< typename VT1 // Type of the left-hand side target vector
746  , typename VT2 // Type of the left-hand side vector operand
747  , typename MT1 > // Type of the right-hand side matrix operand
748  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
749  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
750  {
751  selectDefaultAssignKernel( y, x, A );
752  }
754  //**********************************************************************************************
755 
756  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Vectorized large-matrix kernel for the assignment y = x * A; enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> is true.
//
//   y : the target vector
//   x : the left-hand side vector operand
//   A : the right-hand side matrix operand
//
// The target vector is reset first. The matrix is then traversed in blocks of jblock columns
// and iblock rows; for every block the partial sums over the rows of the block are accumulated
// in SIMD accumulators and added to the values already stored in y.
770  template< typename VT1 // Type of the left-hand side target vector
771  , typename VT2 // Type of the left-hand side vector operand
772  , typename MT1 > // Type of the right-hand side matrix operand
773  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
774  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
775  {
776  const size_t M( A.rows() );
777  const size_t N( A.columns() );
778 
   // A scalar clean-up loop is required unless both the target vector and the matrix are padded
779  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
780 
   // Column block: number of elements fitting into 32768 bytes (presumably derived from a cache
   // size - TODO confirm). Row block: 8 rows if all columns fit into one column block, else 4.
781  const size_t jblock( 32768UL / sizeof( ElementType ) );
782  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
783 
784  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
785 
   // All blocks below add to y, therefore y has to start at zero
786  reset( y );
787 
788  for( size_t jj=0U; jj<N; jj+=jblock ) {
789  for( size_t ii=0UL; ii<M; ii+=iblock )
790  {
   // Row range [ii,iend) and column range [j,jend) of the current block; the column range is
   // cut at the diagonal for lower matrices
791  const size_t iend( min( ii+iblock, M ) );
792  const size_t jtmp( min( jj+jblock, N ) );
793  const size_t jend( ( IsLower<MT1>::value )
794  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
795  :( jtmp ) );
796 
   // End of the part of the column range covered by the SIMD loops (see the assertion)
797  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
798  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
799 
   // First column of the block; for upper matrices the start is moved towards the diagonal,
   // rounded down to a multiple of SIMDSIZE
800  size_t j( ( IsUpper<MT1>::value )
801  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
802  :( jj ) );
803 
   // Chunks of 8 SIMD vectors
   // NOTE(review): the accumulators are not initialized explicitly; presumably a
   // default-constructed SIMDType is zero - confirm.
804  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
805  {
806  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
807 
808  for( size_t i=ii; i<iend; ++i ) {
809  const SIMDType x1( set( x[i] ) );
810  xmm1 = xmm1 + x1 * A.load(i,j );
811  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
812  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
813  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
814  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
815  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
816  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
817  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
818  }
819 
820  y.store( j , y.load(j ) + xmm1 );
821  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
822  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
823  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
824  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
825  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
826  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
827  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
828  }
829 
   // Chunks of 4 SIMD vectors
830  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
831  {
832  SIMDType xmm1, xmm2, xmm3, xmm4;
833 
834  for( size_t i=ii; i<iend; ++i ) {
835  const SIMDType x1( set( x[i] ) );
836  xmm1 = xmm1 + x1 * A.load(i,j );
837  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
838  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
839  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
840  }
841 
842  y.store( j , y.load(j ) + xmm1 );
843  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
844  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
845  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
846  }
847 
   // Chunks of 3 SIMD vectors
848  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
849  {
850  SIMDType xmm1, xmm2, xmm3;
851 
852  for( size_t i=ii; i<iend; ++i ) {
853  const SIMDType x1( set( x[i] ) );
854  xmm1 = xmm1 + x1 * A.load(i,j );
855  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
856  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
857  }
858 
859  y.store( j , y.load(j ) + xmm1 );
860  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
861  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
862  }
863 
   // Chunks of 2 SIMD vectors
864  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
865  {
866  SIMDType xmm1, xmm2;
867 
868  for( size_t i=ii; i<iend; ++i ) {
869  const SIMDType x1( set( x[i] ) );
870  xmm1 = xmm1 + x1 * A.load(i,j );
871  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
872  }
873 
874  y.store( j , y.load(j ) + xmm1 );
875  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
876  }
877 
   // Single SIMD vectors
878  for( ; j<jpos; j+=SIMDSIZE )
879  {
880  SIMDType xmm1;
881 
882  for( size_t i=ii; i<iend; ++i ) {
883  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
884  }
885 
886  y.store( j, y.load(j) + xmm1 );
887  }
888 
   // Scalar clean-up for the columns in [jpos,jend); only executed if a remainder is possible
889  for( ; remainder && j<jend; ++j )
890  {
891  ElementType value = ElementType();
892 
893  for( size_t i=ii; i<iend; ++i ) {
894  value += x[i] * A(i,j);
895  }
896 
897  y[j] += value;
898  }
899  }
900  }
901  }
903  //**********************************************************************************************
904 
905  //**BLAS-based assignment to dense vectors (default)********************************************
919  template< typename VT1 // Type of the left-hand side target vector
920  , typename VT2 // Type of the left-hand side vector operand
921  , typename MT1 > // Type of the right-hand side matrix operand
922  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
923  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
924  {
925  selectLargeAssignKernel( y, x, A );
926  }
928  //**********************************************************************************************
929 
930  //**BLAS-based assignment to dense vectors******************************************************
931 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
932 
945  template< typename VT1 // Type of the left-hand side target vector
946  , typename VT2 // Type of the left-hand side vector operand
947  , typename MT1 > // Type of the right-hand side matrix operand
948  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
949  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
950  {
951  typedef ElementType_<VT1> ET;
952 
953  if( IsTriangular<MT1>::value ) {
954  assign( y, x );
955  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
956  }
957  else {
958  gemv( y, x, A, ET(1), ET(0) );
959  }
960  }
962 #endif
963  //**********************************************************************************************
964 
965  //**Assignment to sparse vectors****************************************************************
978  template< typename VT1 > // Type of the target sparse vector
979  friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
980  {
982 
985  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
986 
987  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
988 
989  const ResultType tmp( serial( rhs ) );
990  assign( ~lhs, tmp );
991  }
993  //**********************************************************************************************
994 
995  //**Addition assignment to dense vectors********************************************************
1008  template< typename VT1 > // Type of the target dense vector
1009  friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
1010  {
1012 
1013  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1014 
1015  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1016  return;
1017  }
1018 
1019  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1020  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1021 
1022  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1023  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1024  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1025  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1026 
1027  TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1028  }
1030  //**********************************************************************************************
1031 
1032  //**Addition assignment to dense vectors (kernel selection)*************************************
1043  template< typename VT1 // Type of the left-hand side target vector
1044  , typename VT2 // Type of the left-hand side vector operand
1045  , typename MT1 > // Type of the right-hand side matrix operand
1046  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1047  {
1048  if( ( IsDiagonal<MT1>::value ) ||
1049  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1050  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1051  selectSmallAddAssignKernel( y, x, A );
1052  else
1053  selectBlasAddAssignKernel( y, x, A );
1054  }
1056  //**********************************************************************************************
1057 
1058  //**Default addition assignment to dense vectors************************************************
1072  template< typename VT1 // Type of the left-hand side target vector
1073  , typename VT2 // Type of the left-hand side vector operand
1074  , typename MT1 > // Type of the right-hand side matrix operand
1075  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1076  {
1077  const size_t M( A.rows() );
1078  const size_t N( A.columns() );
1079 
1080  for( size_t i=0UL; i<M; ++i )
1081  {
1082  if( IsDiagonal<MT1>::value )
1083  {
1084  y[i] += x[i] * A(i,i);
1085  }
1086  else
1087  {
1088  const size_t jbegin( ( IsUpper<MT1>::value )
1089  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1090  :( 0UL ) );
1091  const size_t jend( ( IsLower<MT1>::value )
1092  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1093  :( N ) );
1094  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1095 
1096  const size_t jnum( jend - jbegin );
1097  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1098 
1099  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1100  y[j ] += x[i] * A(i,j );
1101  y[j+1UL] += x[i] * A(i,j+1UL);
1102  }
1103  if( jpos < jend ) {
1104  y[jpos] += x[i] * A(i,jpos);
1105  }
1106  }
1107  }
1108  }
1110  //**********************************************************************************************
1111 
1112  //**Default addition assignment to dense vectors (small matrices)*******************************
1126  template< typename VT1 // Type of the left-hand side target vector
1127  , typename VT2 // Type of the left-hand side vector operand
1128  , typename MT1 > // Type of the right-hand side matrix operand
1129  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1130  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1131  {
1132  selectDefaultAddAssignKernel( y, x, A );
1133  }
1135  //**********************************************************************************************
1136 
1137  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized addition assignment kernel for small matrices: accumulates the product of the
// vector operand x and the matrix operand A onto the target vector y. This overload is enabled
// only when UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
//
//   y  target vector, read and updated in place
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
//
// The columns of A are consumed in chunks of 8, 4, 3, 2 and finally 1 SIMD vector(s). For each
// chunk the accumulators start from the current content of y, every row i in [ibegin,iend) adds
// set( x[i] ) * A.load(i,...), and the accumulators are stored back into y.
// NOTE(review): set() presumably broadcasts the scalar x[i] into all SIMD lanes -- confirm.
1151  template< typename VT1 // Type of the left-hand side target vector
1152  , typename VT2 // Type of the left-hand side vector operand
1153  , typename MT1 > // Type of the right-hand side matrix operand
1154  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1155  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1156  {
1157  const size_t M( A.rows() );
1158  const size_t N( A.columns() );
1159 
// A scalar tail loop is needed only if the target vector or the matrix is not padded.
1160  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1161 
// With a tail loop the SIMD loops stop at N rounded down to a multiple of SIMDSIZE;
// otherwise they run up to N.
1162  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1163  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1164 
1165  size_t j( 0UL );
1166 
// Chunks of 8 SIMD vectors.
1167  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1168  {
// For lower/upper matrices the row range is narrowed to [ibegin,iend) for this column chunk.
// NOTE(review): presumably the skipped rows only hold structural zeros in these columns -- confirm.
1169  const size_t ibegin( ( IsLower<MT1>::value )
1170  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1171  :( 0UL ) );
1172  const size_t iend( ( IsUpper<MT1>::value )
1173  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1174  :( M ) );
1175  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1176 
// Seed the accumulators with the current values of y so that the final store is y + x*A.
1177  SIMDType xmm1( y.load(j ) );
1178  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1179  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1180  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1181  SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1182  SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1183  SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1184  SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1185 
1186  for( size_t i=ibegin; i<iend; ++i ) {
1187  const SIMDType x1( set( x[i] ) );
1188  xmm1 = xmm1 + x1 * A.load(i,j );
1189  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1190  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1191  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
1192  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
1193  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
1194  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
1195  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
1196  }
1197 
1198  y.store( j , xmm1 );
1199  y.store( j+SIMDSIZE , xmm2 );
1200  y.store( j+SIMDSIZE*2UL, xmm3 );
1201  y.store( j+SIMDSIZE*3UL, xmm4 );
1202  y.store( j+SIMDSIZE*4UL, xmm5 );
1203  y.store( j+SIMDSIZE*5UL, xmm6 );
1204  y.store( j+SIMDSIZE*6UL, xmm7 );
1205  y.store( j+SIMDSIZE*7UL, xmm8 );
1206  }
1207 
// Chunks of 4 SIMD vectors.
1208  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1209  {
1210  const size_t ibegin( ( IsLower<MT1>::value )
1211  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1212  :( 0UL ) );
1213  const size_t iend( ( IsUpper<MT1>::value )
1214  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1215  :( M ) );
1216  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1217 
1218  SIMDType xmm1( y.load(j ) );
1219  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1220  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1221  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1222 
1223  for( size_t i=ibegin; i<iend; ++i ) {
1224  const SIMDType x1( set( x[i] ) );
1225  xmm1 = xmm1 + x1 * A.load(i,j );
1226  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1227  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1228  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
1229  }
1230 
1231  y.store( j , xmm1 );
1232  y.store( j+SIMDSIZE , xmm2 );
1233  y.store( j+SIMDSIZE*2UL, xmm3 );
1234  y.store( j+SIMDSIZE*3UL, xmm4 );
1235  }
1236 
// Chunks of 3 SIMD vectors.
1237  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1238  {
1239  const size_t ibegin( ( IsLower<MT1>::value )
1240  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1241  :( 0UL ) );
1242  const size_t iend( ( IsUpper<MT1>::value )
1243  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1244  :( M ) );
1245  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1246 
1247  SIMDType xmm1( y.load(j ) );
1248  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1249  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1250 
1251  for( size_t i=ibegin; i<iend; ++i ) {
1252  const SIMDType x1( set( x[i] ) );
1253  xmm1 = xmm1 + x1 * A.load(i,j );
1254  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1255  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1256  }
1257 
1258  y.store( j , xmm1 );
1259  y.store( j+SIMDSIZE , xmm2 );
1260  y.store( j+SIMDSIZE*2UL, xmm3 );
1261  }
1262 
// Chunks of 2 SIMD vectors.
1263  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1264  {
1265  const size_t ibegin( ( IsLower<MT1>::value )
1266  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1267  :( 0UL ) );
1268  const size_t iend( ( IsUpper<MT1>::value )
1269  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1270  :( M ) );
1271  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1272 
1273  SIMDType xmm1( y.load(j ) );
1274  SIMDType xmm2( y.load(j+SIMDSIZE) );
1275 
1276  for( size_t i=ibegin; i<iend; ++i ) {
1277  const SIMDType x1( set( x[i] ) );
1278  xmm1 = xmm1 + x1 * A.load(i,j );
1279  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
1280  }
1281 
1282  y.store( j , xmm1 );
1283  y.store( j+SIMDSIZE, xmm2 );
1284  }
1285 
// Single SIMD vectors.
1286  for( ; j<jpos; j+=SIMDSIZE )
1287  {
1288  const size_t ibegin( ( IsLower<MT1>::value )
1289  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1290  :( 0UL ) );
1291  const size_t iend( ( IsUpper<MT1>::value )
1292  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1293  :( M ) );
1294  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1295 
1296  SIMDType xmm1( y.load(j) );
1297 
1298  for( size_t i=ibegin; i<iend; ++i ) {
1299  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
1300  }
1301 
1302  y.store( j, xmm1 );
1303  }
1304 
// Scalar tail: the remaining columns [j,N) are handled one element at a time. The column sum is
// built in a local value first and added to y[j] once.
1305  for( ; remainder && j<N; ++j )
1306  {
1307  const size_t ibegin( ( IsLower<MT1>::value )
1308  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1309  :( 0UL ) );
1310  const size_t iend( ( IsUpper<MT1>::value )
1311  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1312  :( M ) );
1313  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1314 
1315  ElementType value = ElementType();
1316 
1317  for( size_t i=ibegin; i<iend; ++i ) {
1318  value += x[i] * A(i,j);
1319  }
1320 
1321  y[j] += value;
1322  }
1323  }
1325  //**********************************************************************************************
1326 
1327  //**Default addition assignment to dense vectors (large matrices)*******************************
1341  template< typename VT1 // Type of the left-hand side target vector
1342  , typename VT2 // Type of the left-hand side vector operand
1343  , typename MT1 > // Type of the right-hand side matrix operand
1344  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1345  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1346  {
1347  selectDefaultAddAssignKernel( y, x, A );
1348  }
1350  //**********************************************************************************************
1351 
1352  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized, blocked addition assignment kernel for large matrices: accumulates the product of
// the vector operand x and the matrix operand A onto the target vector y. This overload is
// enabled only when UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
//
//   y  target vector, read and updated in place
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
//
// The matrix is traversed in tiles of jblock columns by iblock rows. Within a tile the columns
// are consumed in chunks of 8, 4, 3, 2 and 1 SIMD vector(s); each chunk sums the contributions
// of the tile's rows in local accumulators and then adds that partial sum to y.
// NOTE(review): the accumulators are default-constructed and accumulated into directly; this
// relies on SIMDType's default constructor producing zero -- confirm.
1366  template< typename VT1 // Type of the left-hand side target vector
1367  , typename VT2 // Type of the left-hand side vector operand
1368  , typename MT1 > // Type of the right-hand side matrix operand
1369  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1370  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1371  {
1372  const size_t M( A.rows() );
1373  const size_t N( A.columns() );
1374 
// A scalar tail loop is needed only if the target vector or the matrix is not padded.
1375  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1376 
// Tile sizes: 32768 bytes worth of elements per column block; 8 rows per row block if all
// columns fit into one column block, 4 rows otherwise.
1377  const size_t jblock( 32768UL / sizeof( ElementType ) );
1378  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1379 
1380  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1381 
1382  for( size_t jj=0U; jj<N; jj+=jblock ) {
1383  for( size_t ii=0UL; ii<M; ii+=iblock )
1384  {
1385  const size_t iend( min( ii+iblock, M ) );
1386  const size_t jtmp( min( jj+jblock, N ) );
// For lower matrices the column range of the tile is cut off depending on the last row of the tile.
1387  const size_t jend( ( IsLower<MT1>::value )
1388  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
1389  :( jtmp ) );
1390 
// With a tail loop the SIMD loops stop at jend rounded down to a multiple of SIMDSIZE.
1391  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1392  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1393 
// For upper matrices the first column is moved forward depending on the first row of the tile,
// rounded down to a multiple of SIMDSIZE, but never before the start of the column block.
1394  size_t j( ( IsUpper<MT1>::value )
1395  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
1396  :( jj ) );
1397 
// Chunks of 8 SIMD vectors.
1398  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1399  {
1400  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1401 
1402  for( size_t i=ii; i<iend; ++i ) {
1403  const SIMDType x1( set( x[i] ) );
1404  xmm1 = xmm1 + x1 * A.load(i,j );
1405  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1406  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1407  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
1408  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
1409  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
1410  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
1411  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
1412  }
1413 
// Add the tile's partial sums onto the current content of y.
1414  y.store( j , y.load(j ) + xmm1 );
1415  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1416  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1417  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1418  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
1419  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
1420  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
1421  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
1422  }
1423 
// Chunks of 4 SIMD vectors.
1424  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1425  {
1426  SIMDType xmm1, xmm2, xmm3, xmm4;
1427 
1428  for( size_t i=ii; i<iend; ++i ) {
1429  const SIMDType x1( set( x[i] ) );
1430  xmm1 = xmm1 + x1 * A.load(i,j );
1431  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1432  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1433  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
1434  }
1435 
1436  y.store( j , y.load(j ) + xmm1 );
1437  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1438  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1439  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1440  }
1441 
// Chunks of 3 SIMD vectors.
1442  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1443  {
1444  SIMDType xmm1, xmm2, xmm3;
1445 
1446  for( size_t i=ii; i<iend; ++i ) {
1447  const SIMDType x1( set( x[i] ) );
1448  xmm1 = xmm1 + x1 * A.load(i,j );
1449  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1450  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1451  }
1452 
1453  y.store( j , y.load(j ) + xmm1 );
1454  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1455  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1456  }
1457 
// Chunks of 2 SIMD vectors.
1458  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1459  {
1460  SIMDType xmm1, xmm2;
1461 
1462  for( size_t i=ii; i<iend; ++i ) {
1463  const SIMDType x1( set( x[i] ) );
1464  xmm1 = xmm1 + x1 * A.load(i,j );
1465  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
1466  }
1467 
1468  y.store( j , y.load(j ) + xmm1 );
1469  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
1470  }
1471 
// Single SIMD vectors.
1472  for( ; j<jpos; j+=SIMDSIZE )
1473  {
1474  SIMDType xmm1;
1475 
1476  for( size_t i=ii; i<iend; ++i ) {
1477  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
1478  }
1479 
1480  y.store( j, y.load(j) + xmm1 );
1481  }
1482 
// Scalar tail: the remaining columns [j,jend) of the tile are handled one element at a time.
1483  for( ; remainder && j<jend; ++j )
1484  {
1485  ElementType value = ElementType();
1486 
1487  for( size_t i=ii; i<iend; ++i ) {
1488  value += x[i] * A(i,j);
1489  }
1490 
1491  y[j] += value;
1492  }
1493  }
1494  }
1495  }
1497  //**********************************************************************************************
1498 
1499  //**BLAS-based addition assignment to dense vectors (default)***********************************
1513  template< typename VT1 // Type of the left-hand side target vector
1514  , typename VT2 // Type of the left-hand side vector operand
1515  , typename MT1 > // Type of the right-hand side matrix operand
1516  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
1517  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1518  {
1519  selectLargeAddAssignKernel( y, x, A );
1520  }
1522  //**********************************************************************************************
1523 
1524  //**BLAS-based addition assignment to dense vectors*********************************************
1525 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1526 
1539  template< typename VT1 // Type of the left-hand side target vector
1540  , typename VT2 // Type of the left-hand side vector operand
1541  , typename MT1 > // Type of the right-hand side matrix operand
1542  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
1543  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1544  {
1545  typedef ElementType_<VT1> ET;
1546 
1547  if( IsTriangular<MT1>::value ) {
1548  ResultType_<VT1> tmp( serial( x ) );
1549  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1550  addAssign( y, tmp );
1551  }
1552  else {
1553  gemv( y, x, A, ET(1), ET(1) );
1554  }
1555  }
1557 #endif
1558  //**********************************************************************************************
1559 
1560  //**Addition assignment to sparse vectors*******************************************************
1561  // No special implementation for the addition assignment to sparse vectors.
1562  //**********************************************************************************************
1563 
1564  //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of the vector/matrix multiplication expression rhs to the dense vector
// lhs. Both operands of the expression are evaluated serially and the actual update of lhs is
// delegated to selectSubAssignKernel().
//
//   lhs  target dense vector, updated in place
//   rhs  multiplication expression to be subtracted
//
// NOTE(review): the embedded listing numbers jump from 1579 to 1581, so a statement of the
// original header may be missing from this listing right after the opening brace -- confirm
// against the upstream file.
1577  template< typename VT1 > // Type of the target dense vector
1578  friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
1579  {
1581 
1582  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1583 
// Nothing to subtract if the matrix operand has no rows or no columns.
1584  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1585  return;
1586  }
1587 
1588  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1589  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1590 
// The evaluated operands must have the same dimensions as the original operands.
1591  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1592  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1593  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1594  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1595 
1596  TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1597  }
1599  //**********************************************************************************************
1600 
1601  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1612  template< typename VT1 // Type of the left-hand side target vector
1613  , typename VT2 // Type of the left-hand side vector operand
1614  , typename MT1 > // Type of the right-hand side matrix operand
1615  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1616  {
1617  if( ( IsDiagonal<MT1>::value ) ||
1618  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1619  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1620  selectSmallSubAssignKernel( y, x, A );
1621  else
1622  selectBlasSubAssignKernel( y, x, A );
1623  }
1625  //**********************************************************************************************
1626 
1627  //**Default subtraction assignment to dense vectors*********************************************
1641  template< typename VT1 // Type of the left-hand side target vector
1642  , typename VT2 // Type of the left-hand side vector operand
1643  , typename MT1 > // Type of the right-hand side matrix operand
1644  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1645  {
1646  const size_t M( A.rows() );
1647  const size_t N( A.columns() );
1648 
1649  for( size_t i=0UL; i<M; ++i )
1650  {
1651  if( IsDiagonal<MT1>::value )
1652  {
1653  y[i] -= x[i] * A(i,i);
1654  }
1655  else
1656  {
1657  const size_t jbegin( ( IsUpper<MT1>::value )
1658  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1659  :( 0UL ) );
1660  const size_t jend( ( IsLower<MT1>::value )
1661  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1662  :( N ) );
1663  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1664 
1665  const size_t jnum( jend - jbegin );
1666  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1667 
1668  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1669  y[j ] -= x[i] * A(i,j );
1670  y[j+1UL] -= x[i] * A(i,j+1UL);
1671  }
1672  if( jpos < jend ) {
1673  y[jpos] -= x[i] * A(i,jpos);
1674  }
1675  }
1676  }
1677  }
1679  //**********************************************************************************************
1680 
1681  //**Default subtraction assignment to dense vectors (small matrices)****************************
1695  template< typename VT1 // Type of the left-hand side target vector
1696  , typename VT2 // Type of the left-hand side vector operand
1697  , typename MT1 > // Type of the right-hand side matrix operand
1698  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1699  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1700  {
1701  selectDefaultSubAssignKernel( y, x, A );
1702  }
1704  //**********************************************************************************************
1705 
1706  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized subtraction assignment kernel for small matrices: subtracts the product of the
// vector operand x and the matrix operand A from the target vector y. This overload is enabled
// only when UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
//
//   y  target vector, read and updated in place
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
//
// The columns of A are consumed in chunks of 8, 4, 3, 2 and finally 1 SIMD vector(s). For each
// chunk the accumulators start from the current content of y, every row i in [ibegin,iend)
// subtracts set( x[i] ) * A.load(i,...), and the accumulators are stored back into y.
// NOTE(review): set() presumably broadcasts the scalar x[i] into all SIMD lanes -- confirm.
1721  template< typename VT1 // Type of the left-hand side target vector
1722  , typename VT2 // Type of the left-hand side vector operand
1723  , typename MT1 > // Type of the right-hand side matrix operand
1724  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1725  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1726  {
1727  const size_t M( A.rows() );
1728  const size_t N( A.columns() );
1729 
// A scalar tail loop is needed only if the target vector or the matrix is not padded.
1730  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1731 
// With a tail loop the SIMD loops stop at N rounded down to a multiple of SIMDSIZE;
// otherwise they run up to N.
1732  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1733  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1734 
1735  size_t j( 0UL );
1736 
// Chunks of 8 SIMD vectors.
1737  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1738  {
// For lower/upper matrices the row range is narrowed to [ibegin,iend) for this column chunk.
// NOTE(review): presumably the skipped rows only hold structural zeros in these columns -- confirm.
1739  const size_t ibegin( ( IsLower<MT1>::value )
1740  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1741  :( 0UL ) );
1742  const size_t iend( ( IsUpper<MT1>::value )
1743  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1744  :( M ) );
1745  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1746 
// Seed the accumulators with the current values of y so that the final store is y - x*A.
1747  SIMDType xmm1( y.load(j ) );
1748  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1749  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1750  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1751  SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1752  SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1753  SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1754  SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1755 
1756  for( size_t i=ibegin; i<iend; ++i ) {
1757  const SIMDType x1( set( x[i] ) );
1758  xmm1 = xmm1 - x1 * A.load(i,j );
1759  xmm2 = xmm2 - x1 * A.load(i,j+SIMDSIZE );
1760  xmm3 = xmm3 - x1 * A.load(i,j+SIMDSIZE*2UL);
1761  xmm4 = xmm4 - x1 * A.load(i,j+SIMDSIZE*3UL);
1762  xmm5 = xmm5 - x1 * A.load(i,j+SIMDSIZE*4UL);
1763  xmm6 = xmm6 - x1 * A.load(i,j+SIMDSIZE*5UL);
1764  xmm7 = xmm7 - x1 * A.load(i,j+SIMDSIZE*6UL);
1765  xmm8 = xmm8 - x1 * A.load(i,j+SIMDSIZE*7UL);
1766  }
1767 
1768  y.store( j , xmm1 );
1769  y.store( j+SIMDSIZE , xmm2 );
1770  y.store( j+SIMDSIZE*2UL, xmm3 );
1771  y.store( j+SIMDSIZE*3UL, xmm4 );
1772  y.store( j+SIMDSIZE*4UL, xmm5 );
1773  y.store( j+SIMDSIZE*5UL, xmm6 );
1774  y.store( j+SIMDSIZE*6UL, xmm7 );
1775  y.store( j+SIMDSIZE*7UL, xmm8 );
1776  }
1777 
// Chunks of 4 SIMD vectors.
1778  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1779  {
1780  const size_t ibegin( ( IsLower<MT1>::value )
1781  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1782  :( 0UL ) );
1783  const size_t iend( ( IsUpper<MT1>::value )
1784  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1785  :( M ) );
1786  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1787 
1788  SIMDType xmm1( y.load(j ) );
1789  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1790  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1791  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1792 
1793  for( size_t i=ibegin; i<iend; ++i ) {
1794  const SIMDType x1( set( x[i] ) );
1795  xmm1 = xmm1 - x1 * A.load(i,j );
1796  xmm2 = xmm2 - x1 * A.load(i,j+SIMDSIZE );
1797  xmm3 = xmm3 - x1 * A.load(i,j+SIMDSIZE*2UL);
1798  xmm4 = xmm4 - x1 * A.load(i,j+SIMDSIZE*3UL);
1799  }
1800 
1801  y.store( j , xmm1 );
1802  y.store( j+SIMDSIZE , xmm2 );
1803  y.store( j+SIMDSIZE*2UL, xmm3 );
1804  y.store( j+SIMDSIZE*3UL, xmm4 );
1805  }
1806 
// Chunks of 3 SIMD vectors.
1807  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1808  {
1809  const size_t ibegin( ( IsLower<MT1>::value )
1810  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1811  :( 0UL ) );
1812  const size_t iend( ( IsUpper<MT1>::value )
1813  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1814  :( M ) );
1815  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1816 
1817  SIMDType xmm1( y.load(j ) );
1818  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1819  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1820 
1821  for( size_t i=ibegin; i<iend; ++i ) {
1822  const SIMDType x1( set( x[i] ) );
1823  xmm1 = xmm1 - x1 * A.load(i,j );
1824  xmm2 = xmm2 - x1 * A.load(i,j+SIMDSIZE );
1825  xmm3 = xmm3 - x1 * A.load(i,j+SIMDSIZE*2UL);
1826  }
1827 
1828  y.store( j , xmm1 );
1829  y.store( j+SIMDSIZE , xmm2 );
1830  y.store( j+SIMDSIZE*2UL, xmm3 );
1831  }
1832 
// Chunks of 2 SIMD vectors.
1833  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1834  {
1835  const size_t ibegin( ( IsLower<MT1>::value )
1836  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1837  :( 0UL ) );
1838  const size_t iend( ( IsUpper<MT1>::value )
1839  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1840  :( M ) );
1841  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1842 
1843  SIMDType xmm1( y.load(j ) );
1844  SIMDType xmm2( y.load(j+SIMDSIZE) );
1845 
1846  for( size_t i=ibegin; i<iend; ++i ) {
1847  const SIMDType x1( set( x[i] ) );
1848  xmm1 = xmm1 - x1 * A.load(i,j );
1849  xmm2 = xmm2 - x1 * A.load(i,j+SIMDSIZE);
1850  }
1851 
1852  y.store( j , xmm1 );
1853  y.store( j+SIMDSIZE, xmm2 );
1854  }
1855 
// Single SIMD vectors.
1856  for( ; j<jpos; j+=SIMDSIZE )
1857  {
1858  const size_t ibegin( ( IsLower<MT1>::value )
1859  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1860  :( 0UL ) );
1861  const size_t iend( ( IsUpper<MT1>::value )
1862  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1863  :( M ) );
1864  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1865 
1866  SIMDType xmm1( y.load(j) );
1867 
1868  for( size_t i=ibegin; i<iend; ++i ) {
1869  xmm1 = xmm1 - set( x[i] ) * A.load(i,j);
1870  }
1871 
1872  y.store( j, xmm1 );
1873  }
1874 
// Scalar tail: the remaining columns [j,N) are handled one element at a time. The column sum is
// built in a local value first and subtracted from y[j] once.
1875  for( ; remainder && j<N; ++j )
1876  {
1877  const size_t ibegin( ( IsLower<MT1>::value )
1878  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1879  :( 0UL ) );
1880  const size_t iend( ( IsUpper<MT1>::value )
1881  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1882  :( M ) );
1883  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1884 
1885  ElementType value = ElementType();
1886 
1887  for( size_t i=ibegin; i<iend; ++i ) {
1888  value += x[i] * A(i,j);
1889  }
1890 
1891  y[j] -= value;
1892  }
1893  }
1895  //**********************************************************************************************
1896 
1897  //**Default subtraction assignment to dense vectors (large matrices)****************************
1911  template< typename VT1 // Type of the left-hand side target vector
1912  , typename VT2 // Type of the left-hand side vector operand
1913  , typename MT1 > // Type of the right-hand side matrix operand
1914  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1915  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1916  {
1917  selectDefaultSubAssignKernel( y, x, A );
1918  }
1920  //**********************************************************************************************
1921 
1922  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized, blocked subtraction assignment kernel for large matrices: subtracts the product of
// the vector operand x and the matrix operand A from the target vector y. This overload is
// enabled only when UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
//
//   y  target vector, read and updated in place
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
//
// The matrix is traversed in tiles of jblock columns by iblock rows. Within a tile the columns
// are consumed in chunks of 8, 4, 3, 2 and 1 SIMD vector(s); each chunk sums the contributions
// of the tile's rows in local accumulators (using addition) and then subtracts that partial sum
// from y.
// NOTE(review): the accumulators are default-constructed and accumulated into directly; this
// relies on SIMDType's default constructor producing zero -- confirm.
1937  template< typename VT1 // Type of the left-hand side target vector
1938  , typename VT2 // Type of the left-hand side vector operand
1939  , typename MT1 > // Type of the right-hand side matrix operand
1940  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1941  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1942  {
1943  const size_t M( A.rows() );
1944  const size_t N( A.columns() );
1945 
// A scalar tail loop is needed only if the target vector or the matrix is not padded.
1946  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1947 
// Tile sizes: 32768 bytes worth of elements per column block; 8 rows per row block if all
// columns fit into one column block, 4 rows otherwise.
1948  const size_t jblock( 32768UL / sizeof( ElementType ) );
1949  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1950 
1951  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1952 
1953  for( size_t jj=0U; jj<N; jj+=jblock ) {
1954  for( size_t ii=0UL; ii<M; ii+=iblock )
1955  {
1956  const size_t iend( min( ii+iblock, M ) );
1957  const size_t jtmp( min( jj+jblock, N ) );
// For lower matrices the column range of the tile is cut off depending on the last row of the tile.
1958  const size_t jend( ( IsLower<MT1>::value )
1959  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
1960  :( jtmp ) );
1961 
// With a tail loop the SIMD loops stop at jend rounded down to a multiple of SIMDSIZE.
1962  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1963  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1964 
// For upper matrices the first column is moved forward depending on the first row of the tile,
// rounded down to a multiple of SIMDSIZE, but never before the start of the column block.
1965  size_t j( ( IsUpper<MT1>::value )
1966  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
1967  :( jj ) );
1968 
// Chunks of 8 SIMD vectors.
1969  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1970  {
1971  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1972 
1973  for( size_t i=ii; i<iend; ++i ) {
1974  const SIMDType x1( set( x[i] ) );
1975  xmm1 = xmm1 + x1 * A.load(i,j );
1976  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1977  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1978  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
1979  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
1980  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
1981  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
1982  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
1983  }
1984 
// Subtract the tile's partial sums from the current content of y.
1985  y.store( j , y.load(j ) - xmm1 );
1986  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
1987  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
1988  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
1989  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5 );
1990  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6 );
1991  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7 );
1992  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8 );
1993  }
1994 
// Chunks of 4 SIMD vectors.
1995  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1996  {
1997  SIMDType xmm1, xmm2, xmm3, xmm4;
1998 
1999  for( size_t i=ii; i<iend; ++i ) {
2000  const SIMDType x1( set( x[i] ) );
2001  xmm1 = xmm1 + x1 * A.load(i,j );
2002  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
2003  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
2004  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
2005  }
2006 
2007  y.store( j , y.load(j ) - xmm1 );
2008  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2009  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2010  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
2011  }
2012 
// Chunks of 3 SIMD vectors.
2013  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2014  {
2015  SIMDType xmm1, xmm2, xmm3;
2016 
2017  for( size_t i=ii; i<iend; ++i ) {
2018  const SIMDType x1( set( x[i] ) );
2019  xmm1 = xmm1 + x1 * A.load(i,j );
2020  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
2021  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
2022  }
2023 
2024  y.store( j , y.load(j ) - xmm1 );
2025  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2026  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2027  }
2028 
// Chunks of 2 SIMD vectors.
2029  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2030  {
2031  SIMDType xmm1, xmm2;
2032 
2033  for( size_t i=ii; i<iend; ++i ) {
2034  const SIMDType x1( set( x[i] ) );
2035  xmm1 = xmm1 + x1 * A.load(i,j );
2036  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
2037  }
2038 
2039  y.store( j , y.load(j ) - xmm1 );
2040  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2 );
2041  }
2042 
// Single SIMD vectors.
2043  for( ; j<jpos; j+=SIMDSIZE )
2044  {
2045  SIMDType xmm1;
2046 
2047  for( size_t i=ii; i<iend; ++i ) {
2048  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
2049  }
2050 
2051  y.store( j, y.load(j) - xmm1 );
2052  }
2053 
// Scalar tail: the remaining columns [j,jend) of the tile are handled one element at a time.
2054  for( ; remainder && j<jend; ++j )
2055  {
2056  ElementType value = ElementType();
2057 
2058  for( size_t i=ii; i<iend; ++i ) {
2059  value += x[i] * A(i,j);
2060  }
2061 
2062  y[j] -= value;
2063  }
2064  }
2065  }
2066  }
2068  //**********************************************************************************************
2069 
2070  //**BLAS-based subtraction assignment to dense vectors (default)********************************
2084  template< typename VT1 // Type of the left-hand side target vector
2085  , typename VT2 // Type of the left-hand side vector operand
2086  , typename MT1 > // Type of the right-hand side matrix operand
2087  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
2088  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2089  {
2090  selectLargeSubAssignKernel( y, x, A );
2091  }
2093  //**********************************************************************************************
2094 
2095  //**BLAS-based subtraction assignment to dense vectors******************************************
2096 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2097 
2110  template< typename VT1 // Type of the left-hand side target vector
2111  , typename VT2 // Type of the left-hand side vector operand
2112  , typename MT1 > // Type of the right-hand side matrix operand
2113  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
2114  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2115  {
2116  typedef ElementType_<VT1> ET;
2117 
2118  if( IsTriangular<MT1>::value ) {
2119  ResultType_<VT1> tmp( serial( x ) );
2120  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2121  subAssign( y, tmp );
2122  }
2123  else {
2124  gemv( y, x, A, ET(-1), ET(1) );
2125  }
2126  }
2128 #endif
2129  //**********************************************************************************************
2130 
2131  //**Subtraction assignment to sparse vectors****************************************************
2132  // No special implementation for the subtraction assignment to sparse vectors.
2133  //**********************************************************************************************
2134 
2135  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of the vector/matrix multiplication expression rhs to the dense
// vector lhs. The expression is first evaluated serially into a temporary of type ResultType,
// which is then passed to the multAssign() overload for that temporary.
//
//   lhs  target dense vector, updated in place
//   rhs  multiplication expression to be multiplied
//
// NOTE(review): the embedded listing numbers skip 2151, 2153 and 2154, so statements of the
// original header may be missing from this listing at those positions -- confirm against the
// upstream file.
2148  template< typename VT1 > // Type of the target dense vector
2149  friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2150  {
2152 
2155  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2156 
2157  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2158 
// Evaluate the complete product once, then apply the multiplication assignment with the result.
2159  const ResultType tmp( serial( rhs ) );
2160  multAssign( ~lhs, tmp );
2161  }
2163  //**********************************************************************************************
2164 
2165  //**Multiplication assignment to sparse vectors*************************************************
2166  // No special implementation for the multiplication assignment to sparse vectors.
2167  //**********************************************************************************************
2168 
2169  //**Division assignment to dense vectors********************************************************
// Division assignment of the vector/matrix multiplication expression rhs to the dense vector
// lhs. The expression is first evaluated serially into a temporary of type ResultType, which is
// then passed to the divAssign() overload for that temporary.
//
//   lhs  target dense vector, updated in place
//   rhs  multiplication expression acting as divisor
//
// NOTE(review): the embedded listing numbers skip 2185, 2187 and 2188, so statements of the
// original header may be missing from this listing at those positions -- confirm against the
// upstream file.
2182  template< typename VT1 > // Type of the target dense vector
2183  friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2184  {
2186 
2189  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2190 
2191  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2192 
// Evaluate the complete product once, then apply the division assignment with the result.
2193  const ResultType tmp( serial( rhs ) );
2194  divAssign( ~lhs, tmp );
2195  }
2197  //**********************************************************************************************
2198 
2199  //**Division assignment to sparse vectors*******************************************************
2200  // No special implementation for the division assignment to sparse vectors.
2201  //**********************************************************************************************
2202 
2203  //**SMP assignment to dense vectors*************************************************************
2218  template< typename VT1 > // Type of the target dense vector
2219  friend inline EnableIf_< UseSMPAssign<VT1> >
2220  smpAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2221  {
2223 
2224  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2225 
2226  if( rhs.mat_.rows() == 0UL ) {
2227  reset( ~lhs );
2228  return;
2229  }
2230  else if( rhs.mat_.columns() == 0UL ) {
2231  return;
2232  }
2233 
2234  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2235  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2236 
2237  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2238  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2239  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2240  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2241 
2242  smpAssign( ~lhs, x * A );
2243  }
2245  //**********************************************************************************************
2246 
2247  //**SMP assignment to sparse vectors************************************************************
2262  template< typename VT1 > // Type of the target sparse vector
2263  friend inline EnableIf_< UseSMPAssign<VT1> >
2264  smpAssign( SparseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2265  {
2267 
2270  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2271 
2272  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2273 
2274  const ResultType tmp( rhs );
2275  smpAssign( ~lhs, tmp );
2276  }
2278  //**********************************************************************************************
2279 
2280  //**SMP addition assignment to dense vectors****************************************************
2295  template< typename VT1 > // Type of the target dense vector
2296  friend inline EnableIf_< UseSMPAssign<VT1> >
2297  smpAddAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2298  {
2300 
2301  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2302 
2303  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2304  return;
2305  }
2306 
2307  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2308  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2309 
2310  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2311  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2312  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2313  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2314 
2315  smpAddAssign( ~lhs, x * A );
2316  }
2318  //**********************************************************************************************
2319 
2320  //**SMP addition assignment to sparse vectors***************************************************
2321  // No special implementation for the SMP addition assignment to sparse vectors.
2322  //**********************************************************************************************
2323 
2324  //**SMP subtraction assignment to dense vectors*************************************************
2339  template< typename VT1 > // Type of the target dense vector
2340  friend inline EnableIf_< UseSMPAssign<VT1> >
2341  smpSubAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2342  {
2344 
2345  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2346 
2347  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2348  return;
2349  }
2350 
2351  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2352  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2353 
2354  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2355  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2356  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2357  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2358 
2359  smpSubAssign( ~lhs, x * A );
2360  }
2362  //**********************************************************************************************
2363 
2364  //**SMP subtraction assignment to sparse vectors************************************************
2365  // No special implementation for the SMP subtraction assignment to sparse vectors.
2366  //**********************************************************************************************
2367 
2368  //**SMP multiplication assignment to dense vectors**********************************************
2383  template< typename VT1 > // Type of the target dense vector
2384  friend inline EnableIf_< UseSMPAssign<VT1> >
2385  smpMultAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2386  {
2388 
2391  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2392 
2393  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2394 
2395  const ResultType tmp( rhs );
2396  smpMultAssign( ~lhs, tmp );
2397  }
2399  //**********************************************************************************************
2400 
2401  //**SMP multiplication assignment to sparse vectors*********************************************
2402  // No special implementation for the SMP multiplication assignment to sparse vectors.
2403  //**********************************************************************************************
2404 
2405  //**SMP division assignment to dense vectors****************************************************
2420  template< typename VT1 > // Type of the target dense vector
2421  friend inline EnableIf_< UseSMPAssign<VT1> >
2422  smpDivAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2423  {
2425 
2428  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
2429 
2430  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2431 
2432  const ResultType tmp( rhs );
2433  smpDivAssign( ~lhs, tmp );
2434  }
2436  //**********************************************************************************************
2437 
2438  //**SMP division assignment to sparse vectors***************************************************
2439  // No special implementation for the SMP division assignment to sparse vectors.
2440  //**********************************************************************************************
2441 
2442  //**Compile time checks*************************************************************************
2450  //**********************************************************************************************
2451 };
2452 //*************************************************************************************************
2453 
2454 
2455 
2456 
2457 //=================================================================================================
2458 //
2459 // DVECSCALARMULTEXPR SPECIALIZATION
2460 //
2461 //=================================================================================================
2462 
2463 //*************************************************************************************************
2471 template< typename VT // Type of the left-hand side dense vector
2472  , typename MT // Type of the right-hand side dense matrix
2473  , typename ST > // Type of the side scalar value
2474 class DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >
2475  : public DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true >
2476  , private VecScalarMultExpr
2477  , private Computation
2478 {
2479  private:
2480  //**Type definitions****************************************************************************
2481  typedef TDVecDMatMultExpr<VT,MT> VMM;
2482  typedef ResultType_<VMM> RES;
2483  typedef ResultType_<VT> VRT;
2484  typedef ResultType_<MT> MRT;
2485  typedef ElementType_<VRT> VET;
2486  typedef ElementType_<MRT> MET;
2487  typedef CompositeType_<VT> VCT;
2488  typedef CompositeType_<MT> MCT;
2489  //**********************************************************************************************
2490 
2491  //**********************************************************************************************
2493  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2494  //**********************************************************************************************
2495 
2496  //**********************************************************************************************
2498  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2499  IsBLASCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
2500  //**********************************************************************************************
2501 
2502  //**********************************************************************************************
2504 
2507  template< typename T1 >
2508  struct UseSMPAssign {
2509  enum : bool { value = ( evaluateVector || evaluateMatrix ) };
2510  };
2511  //**********************************************************************************************
2512 
2513  //**********************************************************************************************
2515 
2517  template< typename T1, typename T2, typename T3, typename T4 >
2518  struct UseBlasKernel {
2520  HasMutableDataAccess<T1>::value &&
2521  HasConstDataAccess<T2>::value &&
2522  HasConstDataAccess<T3>::value &&
2523  !IsDiagonal<T3>::value &&
2524  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2525  IsBLASCompatible< ElementType_<T1> >::value &&
2526  IsBLASCompatible< ElementType_<T2> >::value &&
2527  IsBLASCompatible< ElementType_<T3> >::value &&
2528  IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
2529  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2530  !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
2531  };
2532  //**********************************************************************************************
2533 
2534  //**********************************************************************************************
2536 
2539  template< typename T1, typename T2, typename T3, typename T4 >
2540  struct UseVectorizedDefaultKernel {
2541  enum : bool { value = useOptimizedKernels &&
2542  !IsDiagonal<T3>::value &&
2543  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2544  AreSIMDCombinable< ElementType_<T1>
2545  , ElementType_<T2>
2546  , ElementType_<T3>
2547  , T4 >::value &&
2548  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2549  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
2550  };
2551  //**********************************************************************************************
2552 
2553  public:
2554  //**Type definitions****************************************************************************
2555  typedef DVecScalarMultExpr<VMM,ST,true> This;
2556  typedef MultTrait_<RES,ST> ResultType;
2557  typedef TransposeType_<ResultType> TransposeType;
2558  typedef ElementType_<ResultType> ElementType;
2559  typedef SIMDTrait_<ElementType> SIMDType;
2560  typedef const ElementType ReturnType;
2561  typedef const ResultType CompositeType;
2562 
2564  typedef const TDVecDMatMultExpr<VT,MT> LeftOperand;
2565 
2567  typedef ST RightOperand;
2568 
2570  typedef IfTrue_< evaluateVector, const VRT, VCT > LT;
2571 
2573  typedef IfTrue_< evaluateMatrix, const MRT, MCT > RT;
2574  //**********************************************************************************************
2575 
2576  //**Compilation flags***************************************************************************
2578  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
2579  VT::simdEnabled && MT::simdEnabled &&
2580  AreSIMDCombinable<VET,MET,ST>::value &&
2581  HasSIMDAdd<VET,MET>::value &&
2582  HasSIMDMult<VET,MET>::value };
2583 
2585  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2586  !evaluateMatrix && MT::smpAssignable };
2587  //**********************************************************************************************
2588 
2589  //**SIMD properties*****************************************************************************
//! SIMD width taken from SIMDTrait<ElementType>::size; the vectorized kernels of this class
//! advance their column index in multiples of this value.
2591  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
2592  //**********************************************************************************************
2593 
2594  //**Constructor*********************************************************************************
2600  explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )
2601  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2602  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2603  {}
2604  //**********************************************************************************************
2605 
2606  //**Subscript operator**************************************************************************
2612  inline ReturnType operator[]( size_t index ) const {
2613  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2614  return vector_[index] * scalar_;
2615  }
2616  //**********************************************************************************************
2617 
2618  //**At function*********************************************************************************
2625  inline ReturnType at( size_t index ) const {
2626  if( index >= vector_.size() ) {
2627  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2628  }
2629  return (*this)[index];
2630  }
2631  //**********************************************************************************************
2632 
2633  //**Size function*******************************************************************************
2638  inline size_t size() const {
2639  return vector_.size();
2640  }
2641  //**********************************************************************************************
2642 
2643  //**Left operand access*************************************************************************
2648  inline LeftOperand leftOperand() const {
2649  return vector_;
2650  }
2651  //**********************************************************************************************
2652 
2653  //**Right operand access************************************************************************
2658  inline RightOperand rightOperand() const {
2659  return scalar_;
2660  }
2661  //**********************************************************************************************
2662 
2663  //**********************************************************************************************
2669  template< typename T >
2670  inline bool canAlias( const T* alias ) const {
2671  return vector_.canAlias( alias );
2672  }
2673  //**********************************************************************************************
2674 
2675  //**********************************************************************************************
2681  template< typename T >
2682  inline bool isAliased( const T* alias ) const {
2683  return vector_.isAliased( alias );
2684  }
2685  //**********************************************************************************************
2686 
2687  //**********************************************************************************************
2692  inline bool isAligned() const {
2693  return vector_.isAligned();
2694  }
2695  //**********************************************************************************************
2696 
2697  //**********************************************************************************************
2702  inline bool canSMPAssign() const noexcept {
2703  RightOperand_<VMM> A( vector_.rightOperand() );
2704  return ( !BLAZE_BLAS_IS_PARALLEL ||
2705  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2706  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2707  ( size() > SMP_TDVECDMATMULT_THRESHOLD );
2708  }
2709  //**********************************************************************************************
2710 
2711  private:
2712  //**Member variables****************************************************************************
2713  LeftOperand vector_;  //!< Left-hand side dense vector of the multiplication expression.
2714  RightOperand scalar_;  //!< Right-hand side scalar of the multiplication expression.
2715  //**********************************************************************************************
2716 
2717  //**Assignment to dense vectors*****************************************************************
2729  template< typename VT1 > // Type of the target dense vector
2730  friend inline void assign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
2731  {
2733 
2734  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2735 
2736  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
2737  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
2738 
2739  if( right.rows() == 0UL ) {
2740  reset( ~lhs );
2741  return;
2742  }
2743  else if( right.columns() == 0UL ) {
2744  return;
2745  }
2746 
2747  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
2748  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
2749 
2750  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
2751  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
2752  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
2753  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2754 
2755  DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
2756  }
2757  //**********************************************************************************************
2758 
2759  //**Assignment to dense vectors (kernel selection)**********************************************
2770  template< typename VT1 // Type of the left-hand side target vector
2771  , typename VT2 // Type of the left-hand side vector operand
2772  , typename MT1 // Type of the right-hand side matrix operand
2773  , typename ST2 > // Type of the scalar value
2774  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2775  {
2776  if( ( IsDiagonal<MT1>::value ) ||
2777  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2778  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2779  selectSmallAssignKernel( y, x, A, scalar );
2780  else
2781  selectBlasAssignKernel( y, x, A, scalar );
2782  }
2783  //**********************************************************************************************
2784 
2785  //**Default assignment to dense vectors*********************************************************
2799  template< typename VT1 // Type of the left-hand side target vector
2800  , typename VT2 // Type of the left-hand side vector operand
2801  , typename MT1 // Type of the right-hand side matrix operand
2802  , typename ST2 > // Type of the scalar value
2803  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2804  {
2805  const size_t M( A.rows() );
2806  const size_t N( A.columns() );
2807 
2808  if( IsStrictlyUpper<MT1>::value ) {
2809  reset( y[0] );
2810  }
2811 
2812  if( !IsLower<MT1>::value )
2813  {
2814  for( size_t j=( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ); j<N; ++j ) {
2815  y[j] = x[0UL] * A(0UL,j);
2816  }
2817  }
2818 
2819  for( size_t i=( IsLower<MT1>::value && !IsStrictlyLower<MT1>::value ? 0UL : 1UL ); i<M; ++i )
2820  {
2821  if( IsDiagonal<MT1>::value )
2822  {
2823  y[i] = x[i] * A(i,i) * scalar;
2824  }
2825  else
2826  {
2827  const size_t jbegin( ( IsUpper<MT1>::value )
2828  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
2829  :( 0UL ) );
2830  const size_t jend( ( IsLower<MT1>::value )
2831  ?( IsStrictlyLower<MT1>::value ? i-1UL : i )
2832  :( N ) );
2833  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2834 
2835  const size_t jnum( jend - jbegin );
2836  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2837 
2838  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2839  y[j ] += x[i] * A(i,j );
2840  y[j+1UL] += x[i] * A(i,j+1UL);
2841  }
2842  if( jpos < jend ) {
2843  y[jpos] += x[i] * A(i,jpos);
2844  }
2845  if( IsLower<MT1>::value ) {
2846  y[jend] = x[i] * A(i,jend);
2847  }
2848  }
2849  }
2850 
2851  if( IsStrictlyLower<MT1>::value ) {
2852  reset( y[N-1UL] );
2853  }
2854 
2855  if( !IsDiagonal<MT1>::value )
2856  {
2857  const size_t iend( IsStrictlyLower<MT1>::value ? N-1UL : N );
2858  for( size_t j=( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ); j<iend; ++j ) {
2859  y[j] *= scalar;
2860  }
2861  }
2862  }
2863  //**********************************************************************************************
2864 
2865  //**Default assignment to dense vectors (small matrices)****************************************
2879  template< typename VT1 // Type of the left-hand side target vector
2880  , typename VT2 // Type of the left-hand side vector operand
2881  , typename MT1 // Type of the right-hand side matrix operand
2882  , typename ST2 > // Type of the scalar value
2883  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
2884  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2885  {
2886  selectDefaultAssignKernel( y, x, A, scalar );
2887  }
2888  //**********************************************************************************************
2889 
2890  //**Vectorized default assignment to dense vectors (small matrices)*****************************
/*!\brief Vectorized default assignment of a small scaled transpose dense vector-dense matrix
//        multiplication (\f$ \vec{y}^T=s*\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// Active whenever \a UseVectorizedDefaultKernel holds for the given types. The columns of \a A
// are processed in chunks of 8, 4, 3, 2 and finally 1 SIMD vectors. For every chunk the products
// x[i]*A(i,j..) are accumulated over the rows admitted by the structure of \a MT1, multiplied by
// the broadcast scalar and stored to \a y. Columns beyond the last full SIMD vector are handled
// by a scalar loop if the operands are not both padded.
*/
2904  template< typename VT1 // Type of the left-hand side target vector
2905  , typename VT2 // Type of the left-hand side vector operand
2906  , typename MT1 // Type of the right-hand side matrix operand
2907  , typename ST2 > // Type of the scalar value
2908  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
2909  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2910  {
2911  const size_t M( A.rows() );
2912  const size_t N( A.columns() );
2913 
      // A scalar remainder loop is needed unless both the target vector and the matrix are padded
2914  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
2915 
      // End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE when a
      // remainder loop is required (the mask assumes SIMDSIZE is a power of two - TODO confirm)
2916  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2917  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2918 
      // Scalar broadcast to a SIMD vector, applied once per chunk right before the store
2919  const SIMDType factor( set( scalar ) );
2920 
2921  size_t j( 0UL );
2922 
      // Chunks of 8 SIMD vectors
2923  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
2924  {
      // Row range that can contribute to this chunk, narrowed for lower/upper matrices
2925  const size_t ibegin( ( IsLower<MT1>::value )
2926  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2927  :( 0UL ) );
2928  const size_t iend( ( IsUpper<MT1>::value )
2929  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2930  :( M ) );
2931  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2932 
      // NOTE(review): the accumulators are used without explicit initialization; this presumably
      // relies on SIMDType's default constructor zero-initializing the value - confirm
2933  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2934 
2935  for( size_t i=ibegin; i<iend; ++i ) {
2936  const SIMDType x1( set( x[i] ) );
2937  xmm1 = xmm1 + x1 * A.load(i,j );
2938  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
2939  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
2940  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
2941  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
2942  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
2943  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
2944  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
2945  }
2946 
2947  y.store( j , xmm1*factor );
2948  y.store( j+SIMDSIZE , xmm2*factor );
2949  y.store( j+SIMDSIZE*2UL, xmm3*factor );
2950  y.store( j+SIMDSIZE*3UL, xmm4*factor );
2951  y.store( j+SIMDSIZE*4UL, xmm5*factor );
2952  y.store( j+SIMDSIZE*5UL, xmm6*factor );
2953  y.store( j+SIMDSIZE*6UL, xmm7*factor );
2954  y.store( j+SIMDSIZE*7UL, xmm8*factor );
2955  }
2956 
      // Chunks of 4 SIMD vectors
2957  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2958  {
2959  const size_t ibegin( ( IsLower<MT1>::value )
2960  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2961  :( 0UL ) );
2962  const size_t iend( ( IsUpper<MT1>::value )
2963  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2964  :( M ) );
2965  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2966 
2967  SIMDType xmm1, xmm2, xmm3, xmm4;
2968 
2969  for( size_t i=ibegin; i<iend; ++i ) {
2970  const SIMDType x1( set( x[i] ) );
2971  xmm1 = xmm1 + x1 * A.load(i,j );
2972  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
2973  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
2974  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
2975  }
2976 
2977  y.store( j , xmm1*factor );
2978  y.store( j+SIMDSIZE , xmm2*factor );
2979  y.store( j+SIMDSIZE*2UL, xmm3*factor );
2980  y.store( j+SIMDSIZE*3UL, xmm4*factor );
2981  }
2982 
      // Chunks of 3 SIMD vectors
2983  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2984  {
2985  const size_t ibegin( ( IsLower<MT1>::value )
2986  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2987  :( 0UL ) );
2988  const size_t iend( ( IsUpper<MT1>::value )
2989  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2990  :( M ) );
2991  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2992 
2993  SIMDType xmm1, xmm2, xmm3;
2994 
2995  for( size_t i=ibegin; i<iend; ++i ) {
2996  const SIMDType x1( set( x[i] ) );
2997  xmm1 = xmm1 + x1 * A.load(i,j );
2998  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
2999  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3000  }
3001 
3002  y.store( j , xmm1*factor );
3003  y.store( j+SIMDSIZE , xmm2*factor );
3004  y.store( j+SIMDSIZE*2UL, xmm3*factor );
3005  }
3006 
      // Chunks of 2 SIMD vectors
3007  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3008  {
3009  const size_t ibegin( ( IsLower<MT1>::value )
3010  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3011  :( 0UL ) );
3012  const size_t iend( ( IsUpper<MT1>::value )
3013  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3014  :( M ) );
3015  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3016 
3017  SIMDType xmm1, xmm2;
3018 
3019  for( size_t i=ibegin; i<iend; ++i ) {
3020  const SIMDType x1( set( x[i] ) );
3021  xmm1 = xmm1 + x1 * A.load(i,j );
3022  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
3023  }
3024 
3025  y.store( j , xmm1*factor );
3026  y.store( j+SIMDSIZE, xmm2*factor );
3027  }
3028 
      // Remaining single SIMD vectors up to jpos
3029  for( ; j<jpos; j+=SIMDSIZE )
3030  {
3031  const size_t ibegin( ( IsLower<MT1>::value )
3032  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3033  :( 0UL ) );
3034  const size_t iend( ( IsUpper<MT1>::value )
3035  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3036  :( M ) );
3037  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3038 
3039  SIMDType xmm1;
3040 
3041  for( size_t i=ibegin; i<iend; ++i ) {
3042  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
3043  }
3044 
3045  y.store( j, xmm1*factor );
3046  }
3047 
      // Scalar loop for the columns between jpos and N (only entered if remainder is set)
3048  for( ; remainder && j<N; ++j )
3049  {
3050  const size_t ibegin( ( IsLower<MT1>::value )
3051  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3052  :( 0UL ) );
3053  const size_t iend( ( IsUpper<MT1>::value )
3054  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3055  :( M ) );
3056  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3057 
3058  ElementType value = ElementType();
3059 
3060  for( size_t i=ibegin; i<iend; ++i ) {
3061  value += x[i] * A(i,j);
3062  }
3063 
3064  y[j] = value * scalar;
3065  }
3066  }
3067  //**********************************************************************************************
3068 
3069  //**Default assignment to dense vectors (large matrices)****************************************
3083  template< typename VT1 // Type of the left-hand side target vector
3084  , typename VT2 // Type of the left-hand side vector operand
3085  , typename MT1 // Type of the right-hand side matrix operand
3086  , typename ST2 > // Type of the scalar value
3087  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3088  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3089  {
3090  selectDefaultAssignKernel( y, x, A, scalar );
3091  }
3092  //**********************************************************************************************
3093 
3094  //**Vectorized default assignment to dense vectors (large matrices)*****************************
3108  template< typename VT1 // Type of the left-hand side target vector
3109  , typename VT2 // Type of the left-hand side vector operand
3110  , typename MT1 // Type of the right-hand side matrix operand
3111  , typename ST2 > // Type of the scalar value
3112  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3113  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3114  {
3115  const size_t M( A.rows() );
3116  const size_t N( A.columns() );
3117 
3118  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
3119 
3120  const size_t jblock( 32768UL / sizeof( ElementType ) );
3121  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3122 
3123  const SIMDType factor( set( scalar ) );
3124 
3125  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3126 
3127  reset( y );
3128 
3129  for( size_t jj=0U; jj<N; jj+=jblock ) {
3130  for( size_t ii=0UL; ii<M; ii+=iblock )
3131  {
3132  const size_t iend( min( ii+iblock, M ) );
3133  const size_t jtmp( min( jj+jblock, N ) );
3134  const size_t jend( ( IsLower<MT1>::value )
3135  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
3136  :( jtmp ) );
3137 
3138  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3139  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3140 
3141  size_t j( ( IsUpper<MT1>::value )
3142  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
3143  :( jj ) );
3144 
3145  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3146  {
3147  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3148 
3149  for( size_t i=ii; i<iend; ++i ) {
3150  const SIMDType x1( set( x[i] ) );
3151  xmm1 = xmm1 + x1 * A.load(i,j );
3152  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3153  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3154  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3155  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
3156  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
3157  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
3158  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
3159  }
3160 
3161  y.store( j , y.load(j ) + xmm1*factor );
3162  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3163  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3164  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3165  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3166  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3167  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3168  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3169  }
3170 
3171  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3172  {
3173  SIMDType xmm1, xmm2, xmm3, xmm4;
3174 
3175  for( size_t i=ii; i<iend; ++i ) {
3176  const SIMDType x1( set( x[i] ) );
3177  xmm1 = xmm1 + x1 * A.load(i,j );
3178  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3179  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3180  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3181  }
3182 
3183  y.store( j , y.load(j ) + xmm1*factor );
3184  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3185  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3186  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3187  }
3188 
3189  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3190  {
3191  SIMDType xmm1, xmm2, xmm3;
3192 
3193  for( size_t i=ii; i<iend; ++i ) {
3194  const SIMDType x1( set( x[i] ) );
3195  xmm1 = xmm1 + x1 * A.load(i,j );
3196  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3197  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3198  }
3199 
3200  y.store( j , y.load(j ) + xmm1*factor );
3201  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3202  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3203  }
3204 
3205  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3206  {
3207  SIMDType xmm1, xmm2;
3208 
3209  for( size_t i=ii; i<iend; ++i ) {
3210  const SIMDType x1( set( x[i] ) );
3211  xmm1 = xmm1 + x1 * A.load(i,j );
3212  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
3213  }
3214 
3215  y.store( j , y.load(j ) + xmm1*factor );
3216  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3217  }
3218 
3219  for( ; j<jpos; j+=SIMDSIZE )
3220  {
3221  SIMDType xmm1;
3222 
3223  for( size_t i=ii; i<iend; ++i ) {
3224  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
3225  }
3226 
3227  y.store( j, y.load(j) + xmm1*factor );
3228  }
3229 
3230  for( ; remainder && j<jend; ++j )
3231  {
3232  ElementType value = ElementType();
3233 
3234  for( size_t i=ii; i<iend; ++i ) {
3235  value += x[i] * A(i,j);
3236  }
3237 
3238  y[j] += value * scalar;
3239  }
3240  }
3241  }
3242  }
3243  //**********************************************************************************************
3244 
3245  //**BLAS-based assignment to dense vectors (default)********************************************
3258  template< typename VT1 // Type of the left-hand side target vector
3259  , typename VT2 // Type of the left-hand side vector operand
3260  , typename MT1 // Type of the right-hand side matrix operand
3261  , typename ST2 > // Type of the scalar value
3262  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3263  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3264  {
3265  selectLargeAssignKernel( y, x, A, scalar );
3266  }
3267  //**********************************************************************************************
3268 
3269  //**BLAS-based assignment to dense vectors******************************************************
3270 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3271 
3284  template< typename VT1 // Type of the left-hand side target vector
3285  , typename VT2 // Type of the left-hand side vector operand
3286  , typename MT1 // Type of the right-hand side matrix operand
3287  , typename ST2 > // Type of the scalar value
3288  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3289  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3290  {
3291  typedef ElementType_<VT1> ET;
3292 
3293  if( IsTriangular<MT1>::value ) {
3294  assign( y, scalar * x );
3295  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3296  }
3297  else {
3298  gemv( y, x, A, ET(scalar), ET(0) );
3299  }
3300  }
3301 #endif
3302  //**********************************************************************************************
3303 
3304  //**Assignment to sparse vectors****************************************************************
3316  template< typename VT1 > // Type of the target sparse vector
3317  friend inline void assign( SparseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3318  {
3320 
3323  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
3324 
3325  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3326 
3327  const ResultType tmp( serial( rhs ) );
3328  assign( ~lhs, tmp );
3329  }
3330  //**********************************************************************************************
3331 
3332  //**Addition assignment to dense vectors********************************************************
3344  template< typename VT1 > // Type of the target dense vector
3345  friend inline void addAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3346  {
3348 
3349  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3350 
3351  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3352  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3353 
3354  if( right.rows() == 0UL || right.columns() == 0UL ) {
3355  return;
3356  }
3357 
3358  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3359  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3360 
3361  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3362  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3363  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3364  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3365 
3366  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3367  }
3368  //**********************************************************************************************
3369 
3370  //**Addition assignment to dense vectors (kernel selection)*************************************
3381  template< typename VT1 // Type of the left-hand side target vector
3382  , typename VT2 // Type of the left-hand side vector operand
3383  , typename MT1 // Type of the right-hand side matrix operand
3384  , typename ST2 > // Type of the scalar value
3385  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3386  {
3387  if( ( IsDiagonal<MT1>::value ) ||
3388  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3389  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3390  selectSmallAddAssignKernel( y, x, A, scalar );
3391  else
3392  selectBlasAddAssignKernel( y, x, A, scalar );
3393  }
3394  //**********************************************************************************************
3395 
3396  //**Default addition assignment to dense vectors************************************************
/*!\brief Default addition assignment of a scaled transpose dense vector-dense matrix
//        multiplication.
//
// \tparam VT1 Type of the left-hand side target vector.
// \tparam VT2 Type of the left-hand side vector operand.
// \tparam MT1 Type of the right-hand side matrix operand.
// \tparam ST2 Type of the scalar value.
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// The kernel rebuilds the scaled product expression from its parts and passes it to the
// addAssign() member function of the target vector.
*/
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.addAssign( ( x * A ) * scalar );
}
3418  //**********************************************************************************************
3419 
3420  //**Default addition assignment to dense vectors (small matrices)*******************************
3434  template< typename VT1 // Type of the left-hand side target vector
3435  , typename VT2 // Type of the left-hand side vector operand
3436  , typename MT1 // Type of the right-hand side matrix operand
3437  , typename ST2 > // Type of the scalar value
3438  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3439  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3440  {
3441  selectDefaultAddAssignKernel( y, x, A, scalar );
3442  }
3443  //**********************************************************************************************
3444 
3445  //**Vectorized default addition assignment to dense vectors (small matrices)********************
/*!\brief Vectorized addition assignment kernel for small matrices:
//        y[j] += scalar * sum_i( x[i] * A(i,j) ).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is enabled if UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> applies. The columns
// of \a A are processed in chunks of 8, 4, 3, 2 and 1 SIMD vectors, followed by an element-wise
// loop for the columns beyond the last complete SIMD vector. For every chunk the traversed row
// range [ibegin,iend) is narrowed according to the IsLower/IsStrictlyLower and
// IsUpper/IsStrictlyUpper traits of \a MT1.
*/
3460  template< typename VT1 // Type of the left-hand side target vector
3461  , typename VT2 // Type of the left-hand side vector operand
3462  , typename MT1 // Type of the right-hand side matrix operand
3463  , typename ST2 > // Type of the scalar value
3464  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3465  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3466  {
3467  const size_t M( A.rows() );
3468  const size_t N( A.columns() );
3469 
      // An element-wise tail loop is only needed if the target vector or the matrix is not padded
3470  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
3471 
      // End of the SIMD-processed column range: with a tail loop, N is rounded down to a multiple
      // of SIMDSIZE via a bit mask (the assertion below cross-checks the result against N % SIMDSIZE)
3472  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3473  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3474 
      // Scaling factor passed through set() for use in the SIMD updates of y
3475  const SIMDType factor( set( scalar ) );
3476 
3477  size_t j( 0UL );
3478 
      // Chunks of 8 SIMD vectors (columns j .. j+SIMDSIZE*8-1)
3479  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3480  {
      // Lower matrices: start at row j (j+1 if strictly lower)
3481  const size_t ibegin( ( IsLower<MT1>::value )
3482  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3483  :( 0UL ) );
      // Upper matrices: stop at the end of the current column chunk, clamped to M
      // (one row less if strictly upper)
3484  const size_t iend( ( IsUpper<MT1>::value )
3485  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3486  :( M ) );
3487  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3488 
      // NOTE(review): the accumulators are default-constructed and immediately summed into;
      // this presumably relies on SIMDType's default constructor yielding zeros - confirm
3489  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3490 
3491  for( size_t i=ibegin; i<iend; ++i ) {
3492  const SIMDType x1( set( x[i] ) );
3493  xmm1 = xmm1 + x1 * A.load(i,j );
3494  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3495  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3496  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3497  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
3498  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
3499  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
3500  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
3501  }
3502 
      // Add the scaled column sums to the current content of y
3503  y.store( j , y.load(j ) + xmm1*factor );
3504  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3505  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3506  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3507  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3508  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3509  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3510  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3511  }
3512 
      // Chunks of 4 SIMD vectors
3513  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3514  {
3515  const size_t ibegin( ( IsLower<MT1>::value )
3516  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3517  :( 0UL ) );
3518  const size_t iend( ( IsUpper<MT1>::value )
3519  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3520  :( M ) );
3521  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3522 
3523  SIMDType xmm1, xmm2, xmm3, xmm4;
3524 
3525  for( size_t i=ibegin; i<iend; ++i ) {
3526  const SIMDType x1( set( x[i] ) );
3527  xmm1 = xmm1 + x1 * A.load(i,j );
3528  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3529  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3530  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3531  }
3532 
3533  y.store( j , y.load(j ) + xmm1*factor );
3534  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3535  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3536  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3537  }
3538 
      // Chunks of 3 SIMD vectors
3539  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3540  {
3541  const size_t ibegin( ( IsLower<MT1>::value )
3542  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3543  :( 0UL ) );
3544  const size_t iend( ( IsUpper<MT1>::value )
3545  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3546  :( M ) );
3547  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3548 
3549  SIMDType xmm1, xmm2, xmm3;
3550 
3551  for( size_t i=ibegin; i<iend; ++i ) {
3552  const SIMDType x1( set( x[i] ) );
3553  xmm1 = xmm1 + x1 * A.load(i,j );
3554  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3555  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3556  }
3557 
3558  y.store( j , y.load(j ) + xmm1*factor );
3559  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3560  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3561  }
3562 
      // Chunks of 2 SIMD vectors
3563  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3564  {
3565  const size_t ibegin( ( IsLower<MT1>::value )
3566  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3567  :( 0UL ) );
3568  const size_t iend( ( IsUpper<MT1>::value )
3569  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3570  :( M ) );
3571  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3572 
3573  SIMDType xmm1, xmm2;
3574 
3575  for( size_t i=ibegin; i<iend; ++i ) {
3576  const SIMDType x1( set( x[i] ) );
3577  xmm1 = xmm1 + x1 * A.load(i,j );
3578  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
3579  }
3580 
3581  y.store( j , y.load(j ) + xmm1*factor );
3582  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3583  }
3584 
      // Single SIMD vectors
3585  for( ; j<jpos; j+=SIMDSIZE )
3586  {
3587  const size_t ibegin( ( IsLower<MT1>::value )
3588  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3589  :( 0UL ) );
3590  const size_t iend( ( IsUpper<MT1>::value )
3591  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3592  :( M ) );
3593  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3594 
3595  SIMDType xmm1;
3596 
3597  for( size_t i=ibegin; i<iend; ++i ) {
3598  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
3599  }
3600 
3601  y.store( j, y.load(j) + xmm1*factor );
3602  }
3603 
      // Element-wise tail loop for the columns in [jpos,N); skipped entirely if no remainder
      // handling is required
3604  for( ; remainder && j<N; ++j )
3605  {
3606  const size_t ibegin( ( IsLower<MT1>::value )
3607  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3608  :( 0UL ) );
3609  const size_t iend( ( IsUpper<MT1>::value )
3610  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3611  :( M ) );
3612  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3613 
3614  ElementType value = ElementType();
3615 
3616  for( size_t i=ibegin; i<iend; ++i ) {
3617  value += x[i] * A(i,j);
3618  }
3619 
3620  y[j] += value * scalar;
3621  }
3622  }
3623  //**********************************************************************************************
3624 
3625  //**Default addition assignment to dense vectors (large matrices)*******************************
3639  template< typename VT1 // Type of the left-hand side target vector
3640  , typename VT2 // Type of the left-hand side vector operand
3641  , typename MT1 // Type of the right-hand side matrix operand
3642  , typename ST2 > // Type of the scalar value
3643  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3644  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3645  {
3646  selectDefaultAddAssignKernel( y, x, A, scalar );
3647  }
3648  //**********************************************************************************************
3649 
3650  //**Vectorized default addition assignment to dense vectors (large matrices)********************
/*!\brief Vectorized, blocked addition assignment kernel for large matrices:
//        y[j] += scalar * sum_i( x[i] * A(i,j) ).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is enabled if UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> applies. The matrix is
// traversed in blocks of \a jblock columns and \a iblock rows. For every block the partial sums
// over the rows [ii,iend) are accumulated per column and added to \a y, so each element of \a y
// is updated once per row block. Within a block the columns are processed in chunks of 8, 4, 3,
// 2 and 1 SIMD vectors, followed by an element-wise loop for the remaining columns.
*/
3665  template< typename VT1 // Type of the left-hand side target vector
3666  , typename VT2 // Type of the left-hand side vector operand
3667  , typename MT1 // Type of the right-hand side matrix operand
3668  , typename ST2 > // Type of the scalar value
3669  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3670  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3671  {
3672  const size_t M( A.rows() );
3673  const size_t N( A.columns() );
3674 
      // An element-wise tail loop is only needed if the target vector or the matrix is not padded
3675  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
3676 
      // Column block: as many elements as fit into 32768 bytes (presumably chosen with a cache
      // size in mind - confirm); row block: 8 rows if all columns fit into one block, else 4
3677  const size_t jblock( 32768UL / sizeof( ElementType ) );
3678  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3679 
      // Scaling factor passed through set() for use in the SIMD updates of y
3680  const SIMDType factor( set( scalar ) );
3681 
3682  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3683 
3684  for( size_t jj=0U; jj<N; jj+=jblock ) {
3685  for( size_t ii=0UL; ii<M; ii+=iblock )
3686  {
3687  const size_t iend( min( ii+iblock, M ) );
3688  const size_t jtmp( min( jj+jblock, N ) );
      // Lower matrices: the column range additionally ends at the last row index of the
      // row block (one column earlier if strictly lower)
3689  const size_t jend( ( IsLower<MT1>::value )
3690  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
3691  :( jtmp ) );
3692 
      // End of the SIMD-processed columns: jend rounded down to a multiple of SIMDSIZE when a
      // tail loop is needed (cross-checked by the assertion below)
3693  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3694  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3695 
      // Upper matrices: start at the first row index of the row block (plus one if strictly
      // upper), rounded down to a multiple of SIMDSIZE, but never before jj
3696  size_t j( ( IsUpper<MT1>::value )
3697  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
3698  :( jj ) );
3699 
      // Chunks of 8 SIMD vectors
3700  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3701  {
      // NOTE(review): the accumulators are default-constructed and immediately summed into;
      // this presumably relies on SIMDType's default constructor yielding zeros - confirm
3702  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3703 
3704  for( size_t i=ii; i<iend; ++i ) {
3705  const SIMDType x1( set( x[i] ) );
3706  xmm1 = xmm1 + x1 * A.load(i,j );
3707  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3708  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3709  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3710  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
3711  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
3712  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
3713  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
3714  }
3715 
      // Add the scaled partial sums of this row block to the current content of y
3716  y.store( j , y.load(j ) + xmm1*factor );
3717  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3718  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3719  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3720  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3721  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3722  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3723  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3724  }
3725 
      // Chunks of 4 SIMD vectors
3726  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3727  {
3728  SIMDType xmm1, xmm2, xmm3, xmm4;
3729 
3730  for( size_t i=ii; i<iend; ++i ) {
3731  const SIMDType x1( set( x[i] ) );
3732  xmm1 = xmm1 + x1 * A.load(i,j );
3733  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3734  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3735  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3736  }
3737 
3738  y.store( j , y.load(j ) + xmm1*factor );
3739  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3740  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3741  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3742  }
3743 
      // Chunks of 3 SIMD vectors
3744  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3745  {
3746  SIMDType xmm1, xmm2, xmm3;
3747 
3748  for( size_t i=ii; i<iend; ++i ) {
3749  const SIMDType x1( set( x[i] ) );
3750  xmm1 = xmm1 + x1 * A.load(i,j );
3751  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3752  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3753  }
3754 
3755  y.store( j , y.load(j ) + xmm1*factor );
3756  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3757  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3758  }
3759 
      // Chunks of 2 SIMD vectors
3760  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3761  {
3762  SIMDType xmm1, xmm2;
3763 
3764  for( size_t i=ii; i<iend; ++i ) {
3765  const SIMDType x1( set( x[i] ) );
3766  xmm1 = xmm1 + x1 * A.load(i,j );
3767  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
3768  }
3769 
3770  y.store( j , y.load(j ) + xmm1*factor );
3771  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3772  }
3773 
      // Single SIMD vectors
3774  for( ; j<jpos; j+=SIMDSIZE )
3775  {
3776  SIMDType xmm1;
3777 
3778  for( size_t i=ii; i<iend; ++i ) {
3779  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
3780  }
3781 
3782  y.store( j, y.load(j) + xmm1*factor );
3783  }
3784 
      // Element-wise tail loop for the columns in [jpos,jend); skipped entirely if no
      // remainder handling is required
3785  for( ; remainder && j<jend; ++j )
3786  {
3787  ElementType value = ElementType();
3788 
3789  for( size_t i=ii; i<iend; ++i ) {
3790  value += x[i] * A(i,j);
3791  }
3792 
3793  y[j] += value * scalar;
3794  }
3795  }
3796  }
3797  }
3798  //**********************************************************************************************
3799 
3800  //**BLAS-based addition assignment to dense vectors (default)***********************************
3814  template< typename VT1 // Type of the left-hand side target vector
3815  , typename VT2 // Type of the left-hand side vector operand
3816  , typename MT1 // Type of the right-hand side matrix operand
3817  , typename ST2 > // Type of the scalar value
3818  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3819  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3820  {
3821  selectLargeAddAssignKernel( y, x, A, scalar );
3822  }
3823  //**********************************************************************************************
3824 
3825  //**BLAS-based addition assignment to dense vectors*********************************************
3826 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3827 
3840  template< typename VT1 // Type of the left-hand side target vector
3841  , typename VT2 // Type of the left-hand side vector operand
3842  , typename MT1 // Type of the right-hand side matrix operand
3843  , typename ST2 > // Type of the scalar value
3844  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3845  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3846  {
3847  typedef ElementType_<VT1> ET;
3848 
3849  if( IsTriangular<MT1>::value ) {
3850  ResultType_<VT1> tmp( serial( scalar * x ) );
3851  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3852  addAssign( y, tmp );
3853  }
3854  else {
3855  gemv( y, x, A, ET(scalar), ET(1) );
3856  }
3857  }
3858 #endif
3859  //**********************************************************************************************
3860 
3861  //**Addition assignment to sparse vectors*******************************************************
3862  // No special implementation for the addition assignment to sparse vectors.
3863  //**********************************************************************************************
3864 
3865  //**Subtraction assignment to dense vectors*****************************************************
3877  template< typename VT1 > // Type of the target dense vector
3878  friend inline void subAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3879  {
3881 
3882  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3883 
3884  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3885  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3886 
3887  if( right.rows() == 0UL || right.columns() == 0UL ) {
3888  return;
3889  }
3890 
3891  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3892  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3893 
3894  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3895  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3896  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3897  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3898 
3899  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
3900  }
3901  //**********************************************************************************************
3902 
3903  //**Subtraction assignment to dense vectors (kernel selection)**********************************
3914  template< typename VT1 // Type of the left-hand side target vector
3915  , typename VT2 // Type of the left-hand side vector operand
3916  , typename MT1 // Type of the right-hand side matrix operand
3917  , typename ST2 > // Type of the scalar value
3918  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3919  {
3920  if( ( IsDiagonal<MT1>::value ) ||
3921  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3922  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3923  selectSmallSubAssignKernel( y, x, A, scalar );
3924  else
3925  selectBlasSubAssignKernel( y, x, A, scalar );
3926  }
3927  //**********************************************************************************************
3928 
3929  //**Default subtraction assignment to dense vectors*********************************************
/*!\brief Default subtraction assignment of a scaled transpose dense vector-dense matrix
//        multiplication.
//
// \tparam VT1 Type of the left-hand side target vector.
// \tparam VT2 Type of the left-hand side vector operand.
// \tparam MT1 Type of the right-hand side matrix operand.
// \tparam ST2 Type of the scalar value.
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// The kernel rebuilds the scaled product expression from its parts and passes it to the
// subAssign() member function of the target vector.
*/
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.subAssign( ( x * A ) * scalar );
}
3951  //**********************************************************************************************
3952 
3953  //**Default subtraction assignment to dense vectors (small matrices)****************************
3967  template< typename VT1 // Type of the left-hand side target vector
3968  , typename VT2 // Type of the left-hand side vector operand
3969  , typename MT1 // Type of the right-hand side matrix operand
3970  , typename ST2 > // Type of the scalar value
3971  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3972  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3973  {
3974  selectDefaultSubAssignKernel( y, x, A, scalar );
3975  }
3976  //**********************************************************************************************
3977 
3978  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
/*!\brief Vectorized subtraction assignment kernel for small matrices:
//        y[j] -= scalar * sum_i( x[i] * A(i,j) ).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is enabled if UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> applies. The columns
// of \a A are processed in chunks of 8, 4, 3, 2 and 1 SIMD vectors, followed by an element-wise
// loop for the columns beyond the last complete SIMD vector. For every chunk the traversed row
// range [ibegin,iend) is narrowed according to the IsLower/IsStrictlyLower and
// IsUpper/IsStrictlyUpper traits of \a MT1.
*/
3993  template< typename VT1 // Type of the left-hand side target vector
3994  , typename VT2 // Type of the left-hand side vector operand
3995  , typename MT1 // Type of the right-hand side matrix operand
3996  , typename ST2 > // Type of the scalar value
3997  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3998  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3999  {
4000  const size_t M( A.rows() );
4001  const size_t N( A.columns() );
4002 
      // An element-wise tail loop is only needed if the target vector or the matrix is not padded
4003  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
4004 
      // End of the SIMD-processed column range: with a tail loop, N is rounded down to a multiple
      // of SIMDSIZE via a bit mask (the assertion below cross-checks the result against N % SIMDSIZE)
4005  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
4006  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4007 
      // Scaling factor passed through set() for use in the SIMD updates of y
4008  const SIMDType factor( set( scalar ) );
4009 
4010  size_t j( 0UL );
4011 
      // Chunks of 8 SIMD vectors (columns j .. j+SIMDSIZE*8-1)
4012  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4013  {
      // Lower matrices: start at row j (j+1 if strictly lower)
4014  const size_t ibegin( ( IsLower<MT1>::value )
4015  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4016  :( 0UL ) );
      // Upper matrices: stop at the end of the current column chunk, clamped to M
      // (one row less if strictly upper)
4017  const size_t iend( ( IsUpper<MT1>::value )
4018  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4019  :( M ) );
4020  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4021 
      // NOTE(review): the accumulators are default-constructed and immediately summed into;
      // this presumably relies on SIMDType's default constructor yielding zeros - confirm
4022  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4023 
4024  for( size_t i=ibegin; i<iend; ++i ) {
4025  const SIMDType x1( set( x[i] ) );
4026  xmm1 = xmm1 + x1 * A.load(i,j );
4027  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4028  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4029  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
4030  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
4031  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
4032  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
4033  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
4034  }
4035 
      // Subtract the scaled column sums from the current content of y
4036  y.store( j , y.load(j ) - xmm1*factor );
4037  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4038  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4039  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4040  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4041  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4042  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4043  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4044  }
4045 
      // Chunks of 4 SIMD vectors
4046  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4047  {
4048  const size_t ibegin( ( IsLower<MT1>::value )
4049  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4050  :( 0UL ) );
4051  const size_t iend( ( IsUpper<MT1>::value )
4052  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4053  :( M ) );
4054  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4055 
4056  SIMDType xmm1, xmm2, xmm3, xmm4;
4057 
4058  for( size_t i=ibegin; i<iend; ++i ) {
4059  const SIMDType x1( set( x[i] ) );
4060  xmm1 = xmm1 + x1 * A.load(i,j );
4061  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4062  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4063  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
4064  }
4065 
4066  y.store( j , y.load(j ) - xmm1*factor );
4067  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4068  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4069  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4070  }
4071 
      // Chunks of 3 SIMD vectors
4072  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4073  {
4074  const size_t ibegin( ( IsLower<MT1>::value )
4075  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4076  :( 0UL ) );
4077  const size_t iend( ( IsUpper<MT1>::value )
4078  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4079  :( M ) );
4080  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4081 
4082  SIMDType xmm1, xmm2, xmm3;
4083 
4084  for( size_t i=ibegin; i<iend; ++i ) {
4085  const SIMDType x1( set( x[i] ) );
4086  xmm1 = xmm1 + x1 * A.load(i,j );
4087  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4088  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4089  }
4090 
4091  y.store( j , y.load(j ) - xmm1*factor );
4092  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4093  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4094  }
4095 
      // Chunks of 2 SIMD vectors
4096  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4097  {
4098  const size_t ibegin( ( IsLower<MT1>::value )
4099  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4100  :( 0UL ) );
4101  const size_t iend( ( IsUpper<MT1>::value )
4102  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4103  :( M ) );
4104  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4105 
4106  SIMDType xmm1, xmm2;
4107 
4108  for( size_t i=ibegin; i<iend; ++i ) {
4109  const SIMDType x1( set( x[i] ) );
4110  xmm1 = xmm1 + x1 * A.load(i,j );
4111  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
4112  }
4113 
4114  y.store( j , y.load(j ) - xmm1*factor );
4115  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4116  }
4117 
      // Single SIMD vectors
4118  for( ; j<jpos; j+=SIMDSIZE )
4119  {
4120  const size_t ibegin( ( IsLower<MT1>::value )
4121  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4122  :( 0UL ) );
4123  const size_t iend( ( IsUpper<MT1>::value )
4124  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4125  :( M ) );
4126  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4127 
4128  SIMDType xmm1;
4129 
4130  for( size_t i=ibegin; i<iend; ++i ) {
4131  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
4132  }
4133 
4134  y.store( j, y.load(j) - xmm1*factor );
4135  }
4136 
      // Element-wise tail loop for the columns in [jpos,N); skipped entirely if no remainder
      // handling is required
4137  for( ; remainder && j<N; ++j )
4138  {
4139  const size_t ibegin( ( IsLower<MT1>::value )
4140  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4141  :( 0UL ) );
4142  const size_t iend( ( IsUpper<MT1>::value )
4143  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4144  :( M ) );
4145  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4146 
4147  ElementType value = ElementType();
4148 
4149  for( size_t i=ibegin; i<iend; ++i ) {
4150  value += x[i] * A(i,j);
4151  }
4152 
4153  y[j] -= value * scalar;
4154  }
4155  }
4156  //**********************************************************************************************
4157 
4158  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Non-vectorized "large matrix" kernel for the scaled subtraction assignment.
// The DisableIf_ guard makes this overload the counterpart of the EnableIf_-guarded
// overload of the same name: it is only viable when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does not apply. It performs no work of its
// own and forwards all arguments unchanged to selectDefaultSubAssignKernel().
//
//   y      : target vector that is updated
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scaling factor
4172  template< typename VT1 // Type of the left-hand side target vector
4173  , typename VT2 // Type of the left-hand side vector operand
4174  , typename MT1 // Type of the right-hand side matrix operand
4175  , typename ST2 > // Type of the scalar value
4176  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4177  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4178  {
4179  selectDefaultSubAssignKernel( y, x, A, scalar );
4180  }
4181  //**********************************************************************************************
4182 
4183  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized, blocked kernel for the scaled subtraction assignment: for every column j it
// accumulates sum_i x[i]*A(i,j) and then performs y[j] -= sum * scalar. Only viable when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> applies (EnableIf_ guard).
//
// The columns are traversed in chunks of jblock elements and the rows in chunks of iblock
// rows. Inside one (row chunk, column chunk) tile the columns are processed with unrolled
// SIMD loops (8, 4, 3, 2 and 1 SIMD vectors per iteration), followed by an optional scalar
// clean-up loop.
//
//   y      : target vector that is updated
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scaling factor
4198  template< typename VT1 // Type of the left-hand side target vector
4199  , typename VT2 // Type of the left-hand side vector operand
4200  , typename MT1 // Type of the right-hand side matrix operand
4201  , typename ST2 > // Type of the scalar value
4202  static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4203  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4204  {
4205  const size_t M( A.rows() );
4206  const size_t N( A.columns() );
4207
      // A scalar clean-up loop is required unless both the target vector and the matrix
      // are padded.
4208  const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
4209
      // Column chunk width: the number of elements that fit into 32768 bytes.
      // NOTE(review): presumably chosen to match a cache size - confirm.
4210  const size_t jblock( 32768UL / sizeof( ElementType ) );
      // Row chunk height: 8 rows if all columns fit into a single column chunk, else 4.
4211  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4212
      // The scaling factor broadcast to all lanes of a SIMD vector.
4213  const SIMDType factor( set( scalar ) );
4214
4215  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
4216
4217  for( size_t jj=0U; jj<N; jj+=jblock ) {
4218  for( size_t ii=0UL; ii<M; ii+=iblock )
4219  {
      // [ii,iend) is the row range and [jj,jtmp) the unrestricted column range of the tile.
4220  const size_t iend( min( ii+iblock, M ) );
4221  const size_t jtmp( min( jj+jblock, N ) );
      // For lower matrices the column range is additionally cut off at iend (iend-1 for
      // strictly lower matrices).
      // NOTE(review): presumably because all elements right of the diagonal are zero - confirm.
4222  const size_t jend( ( IsLower<MT1>::value )
4223  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
4224  :( jtmp ) );
4225
      // End of the range handled by the SIMD loops: jend rounded down to a multiple of
      // SIMDSIZE if a clean-up loop exists, jend itself otherwise. The assertion below
      // cross-checks the bit-mask rounding against the modulo formulation.
4226  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4227  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4228
      // First column of the tile. For upper matrices this is the first row index of the
      // tile (plus one for strictly upper matrices) rounded down to a multiple of SIMDSIZE,
      // but never smaller than jj.
4229  size_t j( ( IsUpper<MT1>::value )
4230  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
4231  :( jj ) );
4232
      // Eight SIMD vectors (8*SIMDSIZE columns) per iteration.
      // NOTE(review): the accumulators in this and the following loops are default-constructed
      // and read immediately; this relies on SIMDType's default constructor producing zeros -
      // confirm.
4233  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4234  {
4235  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4236
4237  for( size_t i=ii; i<iend; ++i ) {
4238  const SIMDType x1( set( x[i] ) );
4239  xmm1 = xmm1 + x1 * A.load(i,j );
4240  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4241  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4242  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
4243  xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
4244  xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
4245  xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
4246  xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
4247  }
4248
      // Subtract the scaled partial sums of this row chunk from the target vector.
4249  y.store( j , y.load(j ) - xmm1*factor );
4250  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4251  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4252  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4253  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4254  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4255  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4256  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4257  }
4258
      // Four SIMD vectors per iteration.
4259  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4260  {
4261  SIMDType xmm1, xmm2, xmm3, xmm4;
4262
4263  for( size_t i=ii; i<iend; ++i ) {
4264  const SIMDType x1( set( x[i] ) );
4265  xmm1 = xmm1 + x1 * A.load(i,j );
4266  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4267  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4268  xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
4269  }
4270
4271  y.store( j , y.load(j ) - xmm1*factor );
4272  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4273  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4274  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4275  }
4276
      // Three SIMD vectors per iteration.
4277  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4278  {
4279  SIMDType xmm1, xmm2, xmm3;
4280
4281  for( size_t i=ii; i<iend; ++i ) {
4282  const SIMDType x1( set( x[i] ) );
4283  xmm1 = xmm1 + x1 * A.load(i,j );
4284  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4285  xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4286  }
4287
4288  y.store( j , y.load(j ) - xmm1*factor );
4289  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4290  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4291  }
4292
      // Two SIMD vectors per iteration.
4293  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4294  {
4295  SIMDType xmm1, xmm2;
4296
4297  for( size_t i=ii; i<iend; ++i ) {
4298  const SIMDType x1( set( x[i] ) );
4299  xmm1 = xmm1 + x1 * A.load(i,j );
4300  xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
4301  }
4302
4303  y.store( j , y.load(j ) - xmm1*factor );
4304  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4305  }
4306
      // One SIMD vector per iteration.
4307  for( ; j<jpos; j+=SIMDSIZE )
4308  {
4309  SIMDType xmm1;
4310
4311  for( size_t i=ii; i<iend; ++i ) {
4312  xmm1 = xmm1 + set( x[i] ) * A.load(i,j);
4313  }
4314
4315  y.store( j, y.load(j) - xmm1*factor );
4316  }
4317
      // Scalar clean-up for the columns in [jpos,jend); skipped entirely when both
      // operands are padded (remainder == false).
4318  for( ; remainder && j<jend; ++j )
4319  {
4320  ElementType value = ElementType();
4321
4322  for( size_t i=ii; i<iend; ++i ) {
4323  value += x[i] * A(i,j);
4324  }
4325
4326  y[j] -= value * scalar;
4327  }
4328  }
4329  }
4330  }
4331  //**********************************************************************************************
4332 
4333  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback for the BLAS-based scaled subtraction assignment. The DisableIf_ guard makes
// this overload viable only when UseBlasKernel<VT1,VT2,MT1,ST2> does not apply; in that
// case all arguments are forwarded unchanged to selectLargeSubAssignKernel().
//
//   y      : target vector that is updated
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scaling factor
4347  template< typename VT1 // Type of the left-hand side target vector
4348  , typename VT2 // Type of the left-hand side vector operand
4349  , typename MT1 // Type of the right-hand side matrix operand
4350  , typename ST2 > // Type of the scalar value
4351  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4352  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4353  {
4354  selectLargeSubAssignKernel( y, x, A, scalar );
4355  }
4356  //**********************************************************************************************
4357 
4358  //**BLAS-based subtraction assignment to dense vectors******************************************
4359 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4360 
// BLAS-based kernel for the scaled subtraction assignment. Only viable when
// UseBlasKernel<VT1,VT2,MT1,ST2> applies (EnableIf_ guard); the surrounding preprocessor
// condition additionally requires BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION.
//
//   y      : target vector that is updated
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scaling factor
4373  template< typename VT1 // Type of the left-hand side target vector
4374  , typename VT2 // Type of the left-hand side vector operand
4375  , typename MT1 // Type of the right-hand side matrix operand
4376  , typename ST2 > // Type of the scalar value
4377  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4378  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4379  {
4380  typedef ElementType_<VT1> ET;
4381
      // Triangular matrices: the scaled vector is first evaluated (serially) into a
      // temporary, handed to trmv() together with the lower/upper flag, and the temporary
      // is then subtracted from y.
      // NOTE(review): presumably trmv() overwrites tmp with the product tmp*A - confirm
      // against the trmv wrapper.
4382  if( IsTriangular<MT1>::value ) {
4383  ResultType_<VT1> tmp( serial( scalar * x ) );
4384  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4385  subAssign( y, tmp );
4386  }
      // General matrices: a single gemv() call with the negated scalar and a factor of 1.
      // NOTE(review): presumably these are the usual alpha/beta parameters, i.e.
      // y = (-scalar)*x*A + 1*y - confirm against the gemv wrapper.
4387  else {
4388  gemv( y, x, A, ET(-scalar), ET(1) );
4389  }
4390  }
4391 #endif
4392  //**********************************************************************************************
4393 
4394  //**Subtraction assignment to sparse vectors****************************************************
4395  // No special implementation for the subtraction assignment to sparse vectors.
4396  //**********************************************************************************************
4397 
4398  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of the scaled vector/matrix multiplication expression to a
// dense (transpose) vector. The expression is evaluated via serial() into a temporary of
// type ResultType, and the actual multiplication assignment is delegated to the
// multAssign() overload for that temporary.
//
//   lhs : target dense vector
//   rhs : right-hand side scaled multiplication expression
//
// NOTE(review): the listing numbers skip 4413 and 4415-4416 inside the body, so
// statements of the original function may be missing from this extract.
4410  template< typename VT1 > // Type of the target dense vector
4411  friend inline void multAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4412  {
4414
4417  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
4418
4419  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4420
4421  const ResultType tmp( serial( rhs ) );
4422  multAssign( ~lhs, tmp );
4423  }
4424  //**********************************************************************************************
4425 
4426  //**Multiplication assignment to sparse vectors*************************************************
4427  // No special implementation for the multiplication assignment to sparse vectors.
4428  //**********************************************************************************************
4429 
4430  //**Division assignment to dense vectors********************************************************
// Division assignment of the scaled vector/matrix multiplication expression to a dense
// (transpose) vector. The expression is evaluated via serial() into a temporary of type
// ResultType, and the actual division assignment is delegated to the divAssign() overload
// for that temporary.
//
//   lhs : target dense vector
//   rhs : right-hand side scaled multiplication expression
//
// NOTE(review): the listing numbers skip 4445 and 4447-4448 inside the body, so
// statements of the original function may be missing from this extract.
4442  template< typename VT1 > // Type of the target dense vector
4443  friend inline void divAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4444  {
4446
4449  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
4450
4451  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4452
4453  const ResultType tmp( rhs );
4454  divAssign( ~lhs, tmp );
4455  }
4456  //**********************************************************************************************
4457 
4458  //**Division assignment to sparse vectors*******************************************************
4459  // No special implementation for the division assignment to sparse vectors.
4460  //**********************************************************************************************
4461 
4462  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of the scaled vector/matrix multiplication expression to a dense
// (transpose) vector. Only viable when UseSMPAssign<VT1> applies (EnableIf_ guard).
//
// The two operands of the wrapped multiplication are evaluated into x and A first; the
// assignment itself is then delegated to smpAssign() with the rebuilt expression
// x * A * scalar.
//
//   lhs : target dense vector
//   rhs : right-hand side scaled multiplication expression
//
// NOTE(review): listing line 4480 is absent from this extract.
4476  template< typename VT1 > // Type of the target dense vector
4477  friend inline EnableIf_< UseSMPAssign<VT1> >
4478  smpAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4479  {
4481
4482  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4483
4484  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4485  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4486
      // A matrix without rows yields an all-default result: the target is reset.
      // A matrix without columns leaves nothing to assign.
4487  if( right.rows() == 0UL ) {
4488  reset( ~lhs );
4489  return;
4490  }
4491  else if( right.columns() == 0UL ) {
4492  return;
4493  }
4494
4495  LT x( left ); // Evaluation of the left-hand side dense vector operand
4496  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4497
4498  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4499  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4500  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4501  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4502
4503  smpAssign( ~lhs, x * A * rhs.scalar_ );
4504  }
4505  //**********************************************************************************************
4506 
4507  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of the scaled vector/matrix multiplication expression to a sparse
// (transpose) vector. Only viable when UseSMPAssign<VT1> applies (EnableIf_ guard).
// The expression is evaluated into a temporary of type ResultType, which is then passed
// on to smpAssign().
//
//   lhs : target sparse vector
//   rhs : right-hand side scaled multiplication expression
//
// NOTE(review): the listing numbers skip 4525 and 4527-4528 inside the body, so
// statements of the original function may be missing from this extract.
4521  template< typename VT1 > // Type of the target sparse vector
4522  friend inline EnableIf_< UseSMPAssign<VT1> >
4523  smpAssign( SparseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4524  {
4526
4529  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
4530
4531  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4532
4533  const ResultType tmp( rhs );
4534  smpAssign( ~lhs, tmp );
4535  }
4536  //**********************************************************************************************
4537 
4538  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of the scaled vector/matrix multiplication expression to a
// dense (transpose) vector. Only viable when UseSMPAssign<VT1> applies (EnableIf_ guard).
//
// The two operands of the wrapped multiplication are evaluated into x and A first; the
// addition assignment is then delegated to smpAddAssign() with the rebuilt expression
// x * A * scalar.
//
//   lhs : target dense vector
//   rhs : right-hand side scaled multiplication expression
//
// NOTE(review): listing line 4556 is absent from this extract.
4552  template< typename VT1 > // Type of the target dense vector
4553  friend inline EnableIf_< UseSMPAssign<VT1> >
4554  smpAddAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4555  {
4557
4558  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4559
4560  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4561  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4562
      // An empty matrix contributes nothing, so the target is left untouched.
4563  if( right.rows() == 0UL || right.columns() == 0UL ) {
4564  return;
4565  }
4566
4567  LT x( left ); // Evaluation of the left-hand side dense vector operand
4568  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4569
4570  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4571  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4572  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4573  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4574
4575  smpAddAssign( ~lhs, x * A * rhs.scalar_ );
4576  }
4577  //**********************************************************************************************
4578 
4579  //**SMP addition assignment to sparse vectors***************************************************
4580  // No special implementation for the SMP addition assignment to sparse vectors.
4581  //**********************************************************************************************
4582 
4583  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of the scaled vector/matrix multiplication expression to a
// dense (transpose) vector. Only viable when UseSMPAssign<VT1> applies (EnableIf_ guard).
//
// The two operands of the wrapped multiplication are evaluated into x and A first; the
// subtraction assignment is then delegated to smpSubAssign() with the rebuilt expression
// x * A * scalar.
//
//   lhs : target dense vector
//   rhs : right-hand side scaled multiplication expression
//
// NOTE(review): listing line 4601 is absent from this extract.
4597  template< typename VT1 > // Type of the target dense vector
4598  friend inline EnableIf_< UseSMPAssign<VT1> >
4599  smpSubAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4600  {
4602
4603  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4604
4605  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4606  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4607
      // An empty matrix contributes nothing, so the target is left untouched.
4608  if( right.rows() == 0UL || right.columns() == 0UL ) {
4609  return;
4610  }
4611
4612  LT x( left ); // Evaluation of the left-hand side dense vector operand
4613  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4614
4615  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4616  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4617  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4618  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4619
4620  smpSubAssign( ~lhs, x * A * rhs.scalar_ );
4621  }
4622  //**********************************************************************************************
4623 
4624  //**SMP subtraction assignment to sparse vectors************************************************
4625  // No special implementation for the SMP subtraction assignment to sparse vectors.
4626  //**********************************************************************************************
4627 
4628  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of the scaled vector/matrix multiplication expression to
// a dense (transpose) vector. Only viable when UseSMPAssign<VT1> applies (EnableIf_
// guard). The expression is evaluated into a temporary of type ResultType, which is then
// passed on to smpMultAssign().
//
//   lhs : target dense vector
//   rhs : right-hand side scaled multiplication expression
//
// NOTE(review): the listing numbers skip 4647 and 4649-4650 inside the body, so
// statements of the original function may be missing from this extract.
4643  template< typename VT1 > // Type of the target dense vector
4644  friend inline EnableIf_< UseSMPAssign<VT1> >
4645  smpMultAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4646  {
4648
4651  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
4652
4653  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4654
4655  const ResultType tmp( rhs );
4656  smpMultAssign( ~lhs, tmp );
4657  }
4658  //**********************************************************************************************
4659 
4660  //**SMP multiplication assignment to sparse vectors*********************************************
4661  // No special implementation for the SMP multiplication assignment to sparse vectors.
4662  //**********************************************************************************************
4663 
4664  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of the scaled vector/matrix multiplication expression to a
// dense (transpose) vector. Only viable when UseSMPAssign<VT1> applies (EnableIf_ guard).
// The expression is evaluated into a temporary of type ResultType, which is then passed
// on to smpDivAssign().
//
//   lhs : target dense vector
//   rhs : right-hand side scaled multiplication expression
//
// NOTE(review): the listing numbers skip 4682 and 4684-4685 inside the body, so
// statements of the original function may be missing from this extract.
4678  template< typename VT1 > // Type of the target dense vector
4679  friend inline EnableIf_< UseSMPAssign<VT1> >
4680  smpDivAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4681  {
4683
4686  BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE( CompositeType_<ResultType> );
4687
4688  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4689
4690  const ResultType tmp( rhs );
4691  smpDivAssign( ~lhs, tmp );
4692  }
4693  //**********************************************************************************************
4694 
4695  //**SMP division assignment to sparse vectors***************************************************
4696  // No special implementation for the SMP division assignment to sparse vectors.
4697  //**********************************************************************************************
4698 
4699  //**Compile time checks*************************************************************************
// Compile-time check: the scalar type ST and the RightOperand type of this expression
// must be the same type.
// NOTE(review): listing lines 4700-4706 are absent from this extract; further
// constraints may precede this one in the original file.
4707  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
4708  //**********************************************************************************************
4709 };
4711 //*************************************************************************************************
4712 
4713 
4714 
4715 
4716 //=================================================================================================
4717 //
4718 // GLOBAL BINARY ARITHMETIC OPERATORS
4719 //
4720 //=================================================================================================
4721 
4722 //*************************************************************************************************
// Multiplication of a dense vector and a dense matrix. The DisableIf_ guard excludes
// right-hand side operands that are themselves matrix/matrix multiplication expressions.
// The function checks that the vector size equals the number of matrix rows, raises an
// invalid-argument exception via BLAZE_THROW_INVALID_ARGUMENT otherwise, and returns a
// TDVecDMatMultExpr<T1,T2> built from both operands.
//
// NOTE(review): the line carrying the function name and parameter list (listing line
// 4756) is absent from this extract, as is line 4758; the body refers to the parameters
// as `vec` and `mat`. Presumably this is operator*() - confirm against the original header.
4753 template< typename T1 // Type of the left-hand side dense vector
4754  , typename T2 > // Type of the right-hand side dense matrix
4755 inline const DisableIf_< IsMatMatMultExpr<T2>, TDVecDMatMultExpr<T1,T2> >
4757 {
4759
4760  if( (~vec).size() != (~mat).rows() ) {
4761  BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
4762  }
4763
4764  return TDVecDMatMultExpr<T1,T2>( ~vec, ~mat );
4765 }
4766 //*************************************************************************************************
4767 
4768 
4769 
4770 
4771 //=================================================================================================
4772 //
4773 // GLOBAL RESTRUCTURING BINARY ARITHMETIC OPERATORS
4774 //
4775 //=================================================================================================
4776 
4777 //*************************************************************************************************
// Restructuring overload for the case that the right-hand side operand is a
// matrix/matrix multiplication expression (EnableIf_ guard on IsMatMatMultExpr<T2>).
// Instead of multiplying the vector with the matrix product, the vector is first
// multiplied with the left operand of that product and the result with the right
// operand, i.e. v * ( A * B ) is rewritten as ( v * A ) * B.
//
// NOTE(review): the line carrying the function name and parameter list (listing line
// 4794) is absent from this extract, as are lines 4796 and 4798; the body refers to the
// parameters as `vec` and `mat`. Presumably this is operator*() - confirm against the
// original header.
4790 template< typename T1 // Type of the left-hand side dense vector
4791  , typename T2 // Type of the right-hand side dense matrix
4792  , bool SO > // Storage order of the right-hand side dense matrix
4793 inline const EnableIf_< IsMatMatMultExpr<T2>, MultExprTrait_<T1,T2> >
4795 {
4797
4799
4800  return ( vec * (~mat).leftOperand() ) * (~mat).rightOperand();
4801 }
4802 //*************************************************************************************************
4803 
4804 
4805 
4806 
4807 //=================================================================================================
4808 //
4809 // SIZE SPECIALIZATIONS
4810 //
4811 //=================================================================================================
4812 
4813 //*************************************************************************************************
// Size trait specialization: the size of a TDVecDMatMultExpr is taken from the
// Columns trait of its matrix operand type MT.
4815 template< typename VT, typename MT >
4816 struct Size< TDVecDMatMultExpr<VT,MT> > : public Columns<MT>
4817 {};
4819 //*************************************************************************************************
4820 
4821 
4822 
4823 
4824 //=================================================================================================
4825 //
4826 // ISALIGNED SPECIALIZATIONS
4827 //
4828 //=================================================================================================
4829 
4830 //*************************************************************************************************
// IsAligned trait specialization: a TDVecDMatMultExpr counts as aligned exactly when
// both its vector operand type VT and its matrix operand type MT are aligned.
4832 template< typename VT, typename MT >
4833 struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4834  : public BoolConstant< And< IsAligned<VT>, IsAligned<MT> >::value >
4835 {};
4837 //*************************************************************************************************
4838 
4839 
4840 
4841 
4842 //=================================================================================================
4843 //
4844 // EXPRESSION TRAIT SPECIALIZATIONS
4845 //
4846 //=================================================================================================
4847 
4848 //*************************************************************************************************
// SubvectorExprTrait specialization: the type of a subvector of a TDVecDMatMultExpr is
// defined as the multiplication expression type (MultExprTrait_) of a subvector of the
// vector operand and a submatrix of the matrix operand, both taken from the
// const-qualified operand types with the same flag AF.
4850 template< typename VT, typename MT, bool AF >
4851 struct SubvectorExprTrait< TDVecDMatMultExpr<VT,MT>, AF >
4852 {
4853  public:
4854  //**********************************************************************************************
4855  using Type = MultExprTrait_< SubvectorExprTrait_<const VT,AF>
4856  , SubmatrixExprTrait_<const MT,AF> >;
4857  //**********************************************************************************************
4858 };
4860 //*************************************************************************************************
4861 
4862 } // namespace blaze
4863 
4864 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:213
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:7800
Header file for basic type definitions.
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:211
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:385
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:331
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:133
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:214
Header file for the IsComplexDouble type trait.
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:136
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:266
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:135
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
If_< IsExpression< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:217
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
If_< IsExpression< MT >, const MT, const MT & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:220
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the Columns type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:355
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:137
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraint on the data type.
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:134
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:386
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:252
TDVecDMatMultExpr< VT, MT > This
Type of this TDVecDMatMultExpr instance.
Definition: TDVecDMatMultExpr.h:208
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE const EnableIf_< And< IsIntegral< T >, HasSize< T, 1UL > >, If_< IsSigned< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:76
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:212
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:321
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:160
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:226
Header file for the TVecMatMultExpr base class.
Constraint on the data type.
Expression object for transpose dense vector-dense matrix multiplications.The TDVecDMatMultExpr class...
Definition: Forward.h:136
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:74
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:298
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:314
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:110
Header file for the AreSIMDCombinable type trait.
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:209
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:223
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:132
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:210
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:311
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:343
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:375
Header file for the FunctionTrace class.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:365