TDVecTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/Exception.h>
59 #include <blaze/math/shims/Reset.h>
61 #include <blaze/math/SIMD.h>
82 #include <blaze/math/views/Check.h>
83 #include <blaze/system/BLAS.h>
86 #include <blaze/util/Assert.h>
87 #include <blaze/util/Complex.h>
89 #include <blaze/util/DisableIf.h>
90 #include <blaze/util/EnableIf.h>
92 #include <blaze/util/mpl/And.h>
93 #include <blaze/util/mpl/If.h>
94 #include <blaze/util/Types.h>
102 
103 
104 namespace blaze {
105 
106 //=================================================================================================
107 //
108 // CLASS TDVECTDMATMULTEXPR
109 //
110 //=================================================================================================
111 
112 //*************************************************************************************************
119 template< typename VT // Type of the left-hand side dense vector
120  , typename MT > // Type of the right-hand side dense matrix
121 class TDVecTDMatMultExpr
122  : public TVecMatMultExpr< DenseVector< TDVecTDMatMultExpr<VT,MT>, true > >
123  , private Computation
124 {
125  private:
126  //**Type definitions****************************************************************************
133  //**********************************************************************************************
134 
135  //**********************************************************************************************
/*!\brief Compilation switch for the evaluation of the left-hand side dense vector operand.
//
// Set if \a VT is a computation (IsComputation) or requires an intermediate evaluation
// (RequiresEvaluation).
*/
137  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
138  //**********************************************************************************************
139 
140  //**********************************************************************************************
// Compilation switch for the evaluation of the right-hand side dense matrix operand.
// NOTE(review): the initializer is cut off in this listing (listing line 143 is absent); the
// closing parenthesis and brace and any further terms of the condition are not visible here.
142  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
144  //**********************************************************************************************
145 
146  //**********************************************************************************************
148 
/*!\brief Helper structure exposing a compile time flag for the SMP assignment strategy.
//
// The nested \a value is \a true whenever at least one of the two operands has to be evaluated
// (evaluateVector or evaluateMatrix). The template parameter \a T1 does not enter the condition.
*/
152  template< typename T1 >
153  struct UseSMPAssign {
154  enum : bool { value = ( evaluateVector || evaluateMatrix ) };
155  };
157  //**********************************************************************************************
158 
159  //**********************************************************************************************
161 
// Compile time predicate over three types \a T1, \a T2 and \a T3.
// NOTE(review): listing lines 166-170 and 172-175 are absent, so the opening of the nested enum
// and most of the condition are not visible. The terms that are visible require all three types
// to be SIMD enabled and the element types of \a T1 and \a T3 to be identical.
164  template< typename T1, typename T2, typename T3 >
165  struct UseBlasKernel {
171  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
176  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
177  };
179  //**********************************************************************************************
180 
181  //**********************************************************************************************
183 
// Compile time predicate over three types \a T1, \a T2 and \a T3.
// NOTE(review): listing lines 190, 192-193 and 195-196 are absent, so the condition is only
// partially visible and the enum is not closed in this listing. The visible terms require
// useOptimizedKernels and SIMD support in all three types.
187  template< typename T1, typename T2, typename T3 >
188  struct UseVectorizedDefaultKernel {
189  enum : bool { value = useOptimizedKernels &&
191  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
194  , ElementType_<T3> >::value &&
197  };
199  //**********************************************************************************************
200 
201  public:
202  //**Type definitions****************************************************************************
// NOTE(review): ElementType and ResultType are used below, but their declarations (listing
// lines 203-207) are absent from this listing.
// Return type of the element access functions (operator[] and at()).
208  using ReturnType = const ElementType;
// Type used when this expression is stored as an operand: a constant ResultType.
209  using CompositeType = const ResultType;
210 
// Storage type of the left-hand side operand, selected via If_ on IsExpression<VT> between
// 'const VT' and 'const VT&'.
212  using LeftOperand = If_< IsExpression<VT>, const VT, const VT& >;
213 
// Storage type of the right-hand side operand, selected via If_ on IsExpression<MT> between
// 'const MT' and 'const MT&'.
215  using RightOperand = If_< IsExpression<MT>, const MT, const MT& >;
216 
219 
222  //**********************************************************************************************
223 
224  //**Compilation flags***************************************************************************
// Compilation switch for SIMD evaluation of the expression.
// NOTE(review): the initializer is cut off in this listing (listing lines 228-229 are absent).
// The visible terms exclude diagonal matrices and require both operand types to be SIMD enabled.
226  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
227  VT::simdEnabled && MT::simdEnabled &&
230 
// Compilation switch for SMP assignments: set only if neither operand needs an intermediate
// evaluation and both operand types are themselves SMP assignable.
232  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
233  !evaluateMatrix && MT::smpAssignable };
234  //**********************************************************************************************
235 
236  //**SIMD properties*****************************************************************************
// SIMD width for ElementType as reported by SIMDTrait; used as the row step of the vectorized
// kernels of this class.
238  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
239  //**********************************************************************************************
240 
241  //**Constructor*********************************************************************************
/*!\brief Constructor for the TDVecTDMatMultExpr class.
//
// \param vec The left-hand side vector operand of the multiplication expression.
// \param mat The right-hand side matrix operand of the multiplication expression.
//
// Stores both operands. BLAZE_INTERNAL_ASSERT checks that the size of the vector matches the
// number of rows of the matrix.
*/
247  explicit inline TDVecTDMatMultExpr( const VT& vec, const MT& mat ) noexcept
248  : vec_( vec ) // Left-hand side dense vector of the multiplication expression
249  , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
250  {
251  BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
252  }
253  //**********************************************************************************************
254 
255  //**Subscript operator**************************************************************************
/*!\brief Subscript operator for the direct access to the vector elements.
//
// \param index Access index. Has to be smaller than the number of columns of the matrix.
// \return The element \a index of the vector/matrix product.
//
// The element is computed as the product of the vector with column \a index of the matrix. For
// diagonal, lower and upper matrices the product is restricted to the range of the column that
// the respective branch selects. The index is only checked via BLAZE_INTERNAL_ASSERT; at()
// performs an explicit range check.
*/
261  inline ReturnType operator[]( size_t index ) const {
262  BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
263 
     // Diagonal matrix: only the diagonal element of the column is used.
264  if( IsDiagonal<MT>::value )
265  {
266  return vec_[index] * mat_(index,index);
267  }
     // Lower matrix: use rows [begin, rows()), starting at the diagonal (one below it for
     // strictly lower matrices). Only taken for index > 8.
268  else if( IsLower<MT>::value && ( index > 8UL ) )
269  {
270  const size_t begin( IsStrictlyLower<MT>::value ? index+1UL : index );
271  const size_t n ( mat_.rows() - begin );
272  return subvector( vec_, begin, n, unchecked ) *
273  subvector( column( mat_, index, unchecked ), begin, n, unchecked );
274  }
     // Upper matrix: use rows [0, n), ending at the diagonal (one above it for strictly upper
     // matrices). Only taken if at least 8 rows follow the given index.
275  else if( IsUpper<MT>::value && ( index + 8UL < mat_.rows() ) )
276  {
277  const size_t n( IsStrictlyUpper<MT>::value ? index : index+1UL );
278  return subvector( vec_, 0UL, n, unchecked ) *
279  subvector( column( mat_, index, unchecked ), 0UL, n, unchecked );
280  }
     // General case: product of the vector with the complete column.
281  else
282  {
283  return vec_ * column( mat_, index, unchecked );
284  }
285  }
286  //**********************************************************************************************
287 
288  //**At function*********************************************************************************
/*!\brief Checked access to the vector elements.
//
// \param index Access index. Has to be smaller than the number of columns of the matrix.
// \return The element \a index of the vector/matrix product.
//
// In contrast to the subscript operator the index is always checked: for an index that is not
// smaller than the number of matrix columns BLAZE_THROW_OUT_OF_RANGE is invoked. Valid indices
// are forwarded to the subscript operator.
*/
295  inline ReturnType at( size_t index ) const {
296  if( index >= mat_.columns() ) {
297  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
298  }
299  return (*this)[index];
300  }
301  //**********************************************************************************************
302 
303  //**Size function*******************************************************************************
/*!\brief Returns the current size/dimension of the vector.
//
// \return The size of the resulting vector, i.e. the number of columns of the matrix operand.
*/
308  inline size_t size() const noexcept {
309  return mat_.columns();
310  }
311  //**********************************************************************************************
312 
313  //**Left operand access*************************************************************************
/*!\brief Returns the left-hand side dense vector operand.
//
// \return The stored left-hand side operand (vec_).
*/
318  inline LeftOperand leftOperand() const noexcept {
319  return vec_;
320  }
321  //**********************************************************************************************
322 
323  //**Right operand access************************************************************************
/*!\brief Returns the right-hand side transpose dense matrix operand.
//
// \return The stored right-hand side operand (mat_).
*/
328  inline RightOperand rightOperand() const noexcept {
329  return mat_;
330  }
331  //**********************************************************************************************
332 
333  //**********************************************************************************************
/*!\brief Returns whether the expression can alias with the given address \a alias.
//
// \param alias The alias to be checked.
// \return \a true if either operand reports being aliased with \a alias, \a false if not.
//
// The check is delegated to the isAliased() functions of both operands.
*/
339  template< typename T >
340  inline bool canAlias( const T* alias ) const noexcept {
341  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
342  }
343  //**********************************************************************************************
344 
345  //**********************************************************************************************
/*!\brief Returns whether the expression is aliased with the given address \a alias.
//
// \param alias The alias to be checked.
// \return \a true if either operand reports being aliased with \a alias, \a false if not.
*/
351  template< typename T >
352  inline bool isAliased( const T* alias ) const noexcept {
353  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
354  }
355  //**********************************************************************************************
356 
357  //**********************************************************************************************
/*!\brief Returns whether the operands of the expression are properly aligned in memory.
//
// \return \a true only if both the vector and the matrix operand report being aligned.
*/
362  inline bool isAligned() const noexcept {
363  return vec_.isAligned() && mat_.isAligned();
364  }
365  //**********************************************************************************************
366 
367  //**********************************************************************************************
/*!\brief Returns whether the expression can be used in SMP assignments.
//
// \return \a true in case the expression can be used in SMP assignments, \a false if not.
//
// Two conditions have to hold. First, the size of the result has to exceed
// SMP_TDVECTDMATMULT_THRESHOLD. Second, at least one of the following has to be true: BLAS mode
// is off, BLAS is not used for matrix/vector multiplications, the BLAS library is not parallel,
// the matrix operand is a computation that is not evaluated, or the matrix has fewer than
// TDVECTDMATMULT_THRESHOLD elements. The last two terms are the same conditions under which the
// kernel selection of this class chooses the non-BLAS kernels.
*/
372  inline bool canSMPAssign() const noexcept {
373  return ( !BLAZE_BLAS_MODE ||
374  !BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION ||
375  !BLAZE_BLAS_IS_PARALLEL ||
376  ( IsComputation<MT>::value && !evaluateMatrix ) ||
377  ( mat_.rows() * mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
378  ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
379  }
380  //**********************************************************************************************
381 
382  private:
383  //**Member variables****************************************************************************
384  LeftOperand vec_; // Left-hand side dense vector of the multiplication expression, set by the constructor
385  RightOperand mat_; // Right-hand side dense matrix of the multiplication expression, set by the constructor
386  //**********************************************************************************************
387 
388  //**Assignment to dense vectors*****************************************************************
/*!\brief Assignment of a transpose dense vector-transpose dense matrix multiplication to a
//        transpose dense vector (\f$ \vec{y}^T=\vec{x}^T*A \f$).
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression to be assigned.
// \return void
//
// Handles the two empty cases directly, evaluates both operands serially and forwards the
// evaluated operands to selectAssignKernel().
// NOTE(review): listing line 404 is absent, and the types LT and RT used below are declared
// outside this listing.
*/
401  template< typename VT1 > // Type of the target dense vector
402  friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
403  {
405 
406  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
407 
     // A matrix without rows yields an all-default result: reset the target and stop.
408  if( rhs.mat_.rows() == 0UL ) {
409  reset( ~lhs );
410  return;
411  }
     // A matrix without columns yields an empty result: nothing to assign.
412  else if( rhs.mat_.columns() == 0UL ) {
413  return;
414  }
415 
416  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
417  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
418 
419  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
420  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
421  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
422  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
423 
424  TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
425  }
427  //**********************************************************************************************
428 
429  //**Assignment to dense vectors (kernel selection)**********************************************
/*!\brief Selection of the kernel for an assignment of a transpose dense vector-transpose dense
//        matrix multiplication to a dense vector (\f$ \vec{y}^T=\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// The small-matrix kernel is chosen if \a MT1 is diagonal, if the original matrix operand is a
// computation that is not evaluated, or if \a A has fewer than TDVECTDMATMULT_THRESHOLD elements.
// In all other cases the call is forwarded to selectBlasAssignKernel().
*/
440  template< typename VT1 // Type of the left-hand side target vector
441  , typename VT2 // Type of the left-hand side vector operand
442  , typename MT1 > // Type of the right-hand side matrix operand
443  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
444  {
445  if( ( IsDiagonal<MT1>::value ) ||
446  ( IsComputation<MT>::value && !evaluateMatrix ) ||
447  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
448  selectSmallAssignKernel( y, x, A );
449  else
450  selectBlasAssignKernel( y, x, A );
451  }
453  //**********************************************************************************************
454 
455  //**Default assignment to dense vectors*********************************************************
/*!\brief Default assignment of a transpose dense vector-transpose dense matrix multiplication
//        (\f$ \vec{y}^T=\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// Forms the product expression of the given operands and passes it to the assign() member
// function of the target vector.
*/
469  template< typename VT1 // Type of the left-hand side target vector
470  , typename VT2 // Type of the left-hand side vector operand
471  , typename MT1 > // Type of the right-hand side matrix operand
472  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
473  {
474  y.assign( x * A );
475  }
477  //**********************************************************************************************
478 
479  //**Default assignment to dense vectors (small matrices)****************************************
/*!\brief Default assignment of a small transpose dense vector-transpose dense matrix
//        multiplication (\f$ \vec{y}^T=\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// This overload is the non-vectorized counterpart of the vectorized small-matrix kernel: it is
// only available if UseVectorizedDefaultKernel is \a false for the given types and forwards to
// selectDefaultAssignKernel().
*/
493  template< typename VT1 // Type of the left-hand side target vector
494  , typename VT2 // Type of the left-hand side vector operand
495  , typename MT1 > // Type of the right-hand side matrix operand
496  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
497  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
498  {
499  selectDefaultAssignKernel( y, x, A );
500  }
502  //**********************************************************************************************
503 
504  //**Vectorized default assignment to dense vectors (small matrices)*****************************
518  template< typename VT1 // Type of the left-hand side target vector
519  , typename VT2 // Type of the left-hand side vector operand
520  , typename MT1 > // Type of the right-hand side matrix operand
522  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
523  {
524  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
525 
526  const size_t M( A.rows() );
527  const size_t N( A.columns() );
528 
529  size_t j( 0UL );
530 
531  for( ; (j+8UL) <= N; j+=8UL )
532  {
533  const size_t ibegin( ( IsLower<MT1>::value )
534  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
535  :( 0UL ) );
536  const size_t iend( ( IsUpper<MT1>::value )
537  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
538  :( M ) );
539  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
540 
541  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
542  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
543 
544  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
545  size_t i( ibegin );
546 
547  for( ; i<ipos; i+=SIMDSIZE ) {
548  const SIMDType x1( x.load(i) );
549  xmm1 += x1 * A.load(i,j );
550  xmm2 += x1 * A.load(i,j+1UL);
551  xmm3 += x1 * A.load(i,j+2UL);
552  xmm4 += x1 * A.load(i,j+3UL);
553  xmm5 += x1 * A.load(i,j+4UL);
554  xmm6 += x1 * A.load(i,j+5UL);
555  xmm7 += x1 * A.load(i,j+6UL);
556  xmm8 += x1 * A.load(i,j+7UL);
557  }
558 
559  y[j ] = sum( xmm1 );
560  y[j+1UL] = sum( xmm2 );
561  y[j+2UL] = sum( xmm3 );
562  y[j+3UL] = sum( xmm4 );
563  y[j+4UL] = sum( xmm5 );
564  y[j+5UL] = sum( xmm6 );
565  y[j+6UL] = sum( xmm7 );
566  y[j+7UL] = sum( xmm8 );
567 
568  for( ; remainder && i<iend; ++i ) {
569  y[j ] += x[i] * A(i,j );
570  y[j+1UL] += x[i] * A(i,j+1UL);
571  y[j+2UL] += x[i] * A(i,j+2UL);
572  y[j+3UL] += x[i] * A(i,j+3UL);
573  y[j+4UL] += x[i] * A(i,j+4UL);
574  y[j+5UL] += x[i] * A(i,j+5UL);
575  y[j+6UL] += x[i] * A(i,j+6UL);
576  y[j+7UL] += x[i] * A(i,j+7UL);
577  }
578  }
579 
580  for( ; (j+4UL) <= N; j+=4UL )
581  {
582  const size_t ibegin( ( IsLower<MT1>::value )
583  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
584  :( 0UL ) );
585  const size_t iend( ( IsUpper<MT1>::value )
586  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
587  :( M ) );
588  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
589 
590  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
591  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
592 
593  SIMDType xmm1, xmm2, xmm3, xmm4;
594  size_t i( ibegin );
595 
596  for( ; i<ipos; i+=SIMDSIZE ) {
597  const SIMDType x1( x.load(i) );
598  xmm1 += x1 * A.load(i,j );
599  xmm2 += x1 * A.load(i,j+1UL);
600  xmm3 += x1 * A.load(i,j+2UL);
601  xmm4 += x1 * A.load(i,j+3UL);
602  }
603 
604  y[j ] = sum( xmm1 );
605  y[j+1UL] = sum( xmm2 );
606  y[j+2UL] = sum( xmm3 );
607  y[j+3UL] = sum( xmm4 );
608 
609  for( ; remainder && i<iend; ++i ) {
610  y[j ] += x[i] * A(i,j );
611  y[j+1UL] += x[i] * A(i,j+1UL);
612  y[j+2UL] += x[i] * A(i,j+2UL);
613  y[j+3UL] += x[i] * A(i,j+3UL);
614  }
615  }
616 
617  for( ; (j+3UL) <= N; j+=3UL )
618  {
619  const size_t ibegin( ( IsLower<MT1>::value )
620  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
621  :( 0UL ) );
622  const size_t iend( ( IsUpper<MT1>::value )
623  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
624  :( M ) );
625  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
626 
627  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
628  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
629 
630  SIMDType xmm1, xmm2, xmm3;
631  size_t i( ibegin );
632 
633  for( ; i<ipos; i+=SIMDSIZE ) {
634  const SIMDType x1( x.load(i) );
635  xmm1 += x1 * A.load(i,j );
636  xmm2 += x1 * A.load(i,j+1UL);
637  xmm3 += x1 * A.load(i,j+2UL);
638  }
639 
640  y[j ] = sum( xmm1 );
641  y[j+1UL] = sum( xmm2 );
642  y[j+2UL] = sum( xmm3 );
643 
644  for( ; remainder && i<iend; ++i ) {
645  y[j ] += x[i] * A(i,j );
646  y[j+1UL] += x[i] * A(i,j+1UL);
647  y[j+2UL] += x[i] * A(i,j+2UL);
648  }
649  }
650 
651  for( ; (j+2UL) <= N; j+=2UL )
652  {
653  const size_t ibegin( ( IsLower<MT1>::value )
654  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
655  :( 0UL ) );
656  const size_t iend( ( IsUpper<MT1>::value )
657  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
658  :( M ) );
659  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
660 
661  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
662  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
663 
664  SIMDType xmm1, xmm2;
665  size_t i( ibegin );
666 
667  for( ; i<ipos; i+=SIMDSIZE ) {
668  const SIMDType x1( x.load(i) );
669  xmm1 += x1 * A.load(i,j );
670  xmm2 += x1 * A.load(i,j+1UL);
671  }
672 
673  y[j ] = sum( xmm1 );
674  y[j+1UL] = sum( xmm2 );
675 
676  for( ; remainder && i<iend; ++i ) {
677  y[j ] += x[i] * A(i,j );
678  y[j+1UL] += x[i] * A(i,j+1UL);
679  }
680  }
681 
682  if( j < N )
683  {
684  const size_t ibegin( ( IsLower<MT1>::value )
685  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
686  :( 0UL ) );
687  const size_t iend( ( IsUpper<MT1>::value )
688  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
689  :( M ) );
690  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
691 
692  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
693  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
694 
695  SIMDType xmm1;
696  size_t i( ibegin );
697 
698  for( ; i<ipos; i+=SIMDSIZE ) {
699  xmm1 += x.load(i) * A.load(i,j);
700  }
701 
702  y[j] = sum( xmm1 );
703 
704  for( ; remainder && i<iend; ++i ) {
705  y[j] += x[i] * A(i,j);
706  }
707  }
708  }
710  //**********************************************************************************************
711 
712  //**Default assignment to dense vectors (large matrices)****************************************
/*!\brief Default assignment of a large transpose dense vector-transpose dense matrix
//        multiplication (\f$ \vec{y}^T=\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// This overload is the non-vectorized counterpart of the vectorized large-matrix kernel: it is
// only available if UseVectorizedDefaultKernel is \a false for the given types and forwards to
// selectDefaultAssignKernel().
*/
726  template< typename VT1 // Type of the left-hand side target vector
727  , typename VT2 // Type of the left-hand side vector operand
728  , typename MT1 > // Type of the right-hand side matrix operand
729  static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
730  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
731  {
732  selectDefaultAssignKernel( y, x, A );
733  }
735  //**********************************************************************************************
736 
737  //**Vectorized default assignment to dense vectors (large matrices)*****************************
751  template< typename VT1 // Type of the left-hand side target vector
752  , typename VT2 // Type of the left-hand side vector operand
753  , typename MT1 > // Type of the right-hand side matrix operand
755  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
756  {
757  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
758 
759  const size_t M( A.rows() );
760  const size_t N( A.columns() );
761 
762  reset( y );
763 
764  size_t j( 0UL );
765 
766  for( ; (j+8UL) <= N; j+=8UL )
767  {
768  const size_t ibegin( ( IsLower<MT1>::value )
769  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
770  :( 0UL ) );
771  const size_t iend( ( IsUpper<MT1>::value )
772  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
773  :( M ) );
774  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
775 
776  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
777  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
778 
779  size_t i( ibegin );
780 
781  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
782  const size_t i1( i+SIMDSIZE );
783  const size_t i2( i+SIMDSIZE*2UL );
784  const size_t i3( i+SIMDSIZE*3UL );
785  const SIMDType x1( x.load(i ) );
786  const SIMDType x2( x.load(i1) );
787  const SIMDType x3( x.load(i2) );
788  const SIMDType x4( x.load(i3) );
789  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
790  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
791  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
792  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
793  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
794  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
795  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
796  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
797  }
798 
799  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
800  const size_t i1( i+SIMDSIZE );
801  const SIMDType x1( x.load(i ) );
802  const SIMDType x2( x.load(i1) );
803  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
804  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
805  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
806  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
807  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
808  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
809  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
810  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
811  }
812 
813  for( ; i<ipos; i+=SIMDSIZE ) {
814  const SIMDType x1( x.load(i) );
815  y[j ] += sum( x1 * A.load(i,j ) );
816  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
817  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
818  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
819  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
820  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
821  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
822  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
823  }
824 
825  for( ; remainder && i<iend; ++i ) {
826  y[j ] += x[i] * A(i,j );
827  y[j+1UL] += x[i] * A(i,j+1UL);
828  y[j+2UL] += x[i] * A(i,j+2UL);
829  y[j+3UL] += x[i] * A(i,j+3UL);
830  y[j+4UL] += x[i] * A(i,j+4UL);
831  y[j+5UL] += x[i] * A(i,j+5UL);
832  y[j+6UL] += x[i] * A(i,j+6UL);
833  y[j+7UL] += x[i] * A(i,j+7UL);
834  }
835  }
836 
837  for( ; (j+4UL) <= N; j+=4UL )
838  {
839  const size_t ibegin( ( IsLower<MT1>::value )
840  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
841  :( 0UL ) );
842  const size_t iend( ( IsUpper<MT1>::value )
843  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
844  :( M ) );
845  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
846 
847  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
848  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
849 
850  size_t i( ibegin );
851 
852  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
853  const size_t i1( i+SIMDSIZE );
854  const size_t i2( i+SIMDSIZE*2UL );
855  const size_t i3( i+SIMDSIZE*3UL );
856  const SIMDType x1( x.load(i ) );
857  const SIMDType x2( x.load(i1) );
858  const SIMDType x3( x.load(i2) );
859  const SIMDType x4( x.load(i3) );
860  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
861  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
862  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
863  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
864  }
865 
866  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
867  const size_t i1( i+SIMDSIZE );
868  const SIMDType x1( x.load(i ) );
869  const SIMDType x2( x.load(i1) );
870  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
871  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
872  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
873  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
874  }
875 
876  for( ; i<ipos; i+=SIMDSIZE ) {
877  const SIMDType x1( x.load(i) );
878  y[j ] += sum( x1 * A.load(i,j ) );
879  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
880  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
881  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
882  }
883 
884  for( ; remainder && i<iend; ++i ) {
885  y[j ] += x[i] * A(i,j );
886  y[j+1UL] += x[i] * A(i,j+1UL);
887  y[j+2UL] += x[i] * A(i,j+2UL);
888  y[j+3UL] += x[i] * A(i,j+3UL);
889  }
890  }
891 
892  for( ; (j+2UL) <= N; j+=2UL )
893  {
894  const size_t ibegin( ( IsLower<MT1>::value )
895  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
896  :( 0UL ) );
897  const size_t iend( ( IsUpper<MT1>::value )
898  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
899  :( M ) );
900  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
901 
902  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
903  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
904 
905  size_t i( ibegin );
906 
907  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
908  const size_t i1( i+SIMDSIZE );
909  const size_t i2( i+SIMDSIZE*2UL );
910  const size_t i3( i+SIMDSIZE*3UL );
911  const SIMDType x1( x.load(i ) );
912  const SIMDType x2( x.load(i1) );
913  const SIMDType x3( x.load(i2) );
914  const SIMDType x4( x.load(i3) );
915  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
916  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
917  }
918 
919  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
920  const size_t i1( i+SIMDSIZE );
921  const SIMDType x1( x.load(i ) );
922  const SIMDType x2( x.load(i1) );
923  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
924  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
925  }
926 
927  for( ; i<ipos; i+=SIMDSIZE ) {
928  const SIMDType x1( x.load(i) );
929  y[j ] += sum( x1 * A.load(i,j ) );
930  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
931  }
932 
933  for( ; remainder && i<iend; ++i ) {
934  y[j ] += x[i] * A(i,j );
935  y[j+1UL] += x[i] * A(i,j+1UL);
936  }
937  }
938 
939  if( j < N )
940  {
941  const size_t ibegin( ( IsLower<MT1>::value )
942  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
943  :( 0UL ) );
944  const size_t iend( ( IsUpper<MT1>::value )
945  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
946  :( M ) );
947  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
948 
949  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
950  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
951 
952  size_t i( ibegin );
953 
954  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
955  const size_t i1( i+SIMDSIZE );
956  const size_t i2( i+SIMDSIZE*2UL );
957  const size_t i3( i+SIMDSIZE*3UL );
958  const SIMDType x1( x.load(i ) );
959  const SIMDType x2( x.load(i1) );
960  const SIMDType x3( x.load(i2) );
961  const SIMDType x4( x.load(i3) );
962  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
963  }
964 
965  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
966  const size_t i1( i+SIMDSIZE );
967  const SIMDType x1( x.load(i ) );
968  const SIMDType x2( x.load(i1) );
969  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
970  }
971 
972  for( ; i<ipos; i+=SIMDSIZE ) {
973  const SIMDType x1( x.load(i) );
974  y[j] += sum( x1 * A.load(i,j) );
975  }
976 
977  for( ; remainder && i<iend; ++i ) {
978  y[j] += x[i] * A(i,j);
979  }
980  }
981  }
983  //**********************************************************************************************
984 
985  //**BLAS-based assignment to dense vectors (default)********************************************
/*!\brief Default assignment of a transpose dense vector-transpose dense matrix multiplication
//        (\f$ \vec{y}^T=\vec{x}^T*A \f$) for types that cannot use the BLAS kernel.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// This overload is only available if UseBlasKernel is \a false for the given types and forwards
// to selectLargeAssignKernel().
*/
999  template< typename VT1 // Type of the left-hand side target vector
1000  , typename VT2 // Type of the left-hand side vector operand
1001  , typename MT1 > // Type of the right-hand side matrix operand
1002  static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
1003  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1004  {
1005  selectLargeAssignKernel( y, x, A );
1006  }
1008  //**********************************************************************************************
1009 
1010  //**BLAS-based assignment to dense vectors******************************************************
1011 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1012 
/*!\brief BLAS-based assignment of a transpose dense vector-transpose dense matrix multiplication
//        (\f$ \vec{y}^T=\vec{x}^T*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \return void
//
// This overload is only available if UseBlasKernel is \a true for the given types; it is only
// compiled if both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set.
// Triangular matrices are handled by copying \a x into \a y and calling trmv() on \a y; all
// other matrices are handled by gemv() with the scalar arguments ET(1) and ET(0).
*/
1025  template< typename VT1 // Type of the left-hand side target vector
1026  , typename VT2 // Type of the left-hand side vector operand
1027  , typename MT1 > // Type of the right-hand side matrix operand
1028  static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
1029  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1030  {
1031  using ET = ElementType_<VT1>;
1032 
1033  if( IsTriangular<MT1>::value ) {
      // trmv() works on the target vector, so the operand is copied into the target first.
1034  assign( y, x );
1035  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1036  }
1037  else {
1038  gemv( y, x, A, ET(1), ET(0) );
1039  }
1040  }
1042 #endif
1043  //**********************************************************************************************
1044 
1045  //**Assignment to sparse vectors****************************************************************
/*!\brief Assignment of a transpose dense vector-transpose dense matrix multiplication to a
//        transpose sparse vector.
//
// \param lhs The target left-hand side sparse vector.
// \param rhs The right-hand side multiplication expression to be assigned.
// \return void
//
// The expression is first evaluated serially into a temporary of type ResultType, which is then
// assigned to the sparse target.
*/
1058  template< typename VT1 > // Type of the target sparse vector
1059  friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1060  {
1062 
1066 
1067  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1068 
1069  const ResultType tmp( serial( rhs ) );
1070  assign( ~lhs, tmp );
1071  }
1073  //**********************************************************************************************
1074 
1075  //**Addition assignment to dense vectors********************************************************
/*!\brief Addition assignment of a transpose dense vector-transpose dense matrix multiplication
//        to a transpose dense vector (\f$ \vec{y}^T+=\vec{x}^T*A \f$).
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression to be added.
// \return void
//
// Returns immediately if the matrix has no rows or no columns. Otherwise both operands are
// evaluated serially and forwarded to selectAddAssignKernel().
// NOTE(review): listing line 1091 is absent, and the types LT and RT used below are declared
// outside this listing.
*/
1088  template< typename VT1 > // Type of the target dense vector
1089  friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1090  {
1092 
1093  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1094 
      // An empty matrix contributes nothing to the target.
1095  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1096  return;
1097  }
1098 
1099  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1100  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1101 
1102  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1103  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1104  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1105  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1106 
1107  TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1108  }
1110  //**********************************************************************************************
1111 
1112  //**Addition assignment to dense vectors (kernel selection)*************************************
1123  template< typename VT1 // Type of the left-hand side target vector
1124  , typename VT2 // Type of the left-hand side vector operand
1125  , typename MT1 > // Type of the right-hand side matrix operand
1126  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1127  {
1128  if( ( IsDiagonal<MT1>::value ) ||
1129  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1130  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1131  selectSmallAddAssignKernel( y, x, A );
1132  else
1133  selectBlasAddAssignKernel( y, x, A );
1134  }
1136  //**********************************************************************************************
1137 
1138  //**Default addition assignment to dense vectors************************************************
// Default (non-specialized) kernel for the addition assignment: the product expression of
// x and A is formed and passed to the addAssign() member of the target vector y.
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1 >  // Type of the right-hand side matrix operand
static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   // Bound by const reference: nothing is evaluated or copied before addAssign() consumes it.
   const auto& product( x * A );
   y.addAssign( product );
}
1160  //**********************************************************************************************
1161 
1162  //**Default addition assignment to dense vectors (small matrices)*******************************
// Small-matrix addition assignment kernel, non-vectorized variant: simply forwards to
// selectDefaultAddAssignKernel().
// NOTE(review): embedded listing line 1179 is absent, so the return type / enabling condition
// of this overload is not visible here. Another overload with the same visible parameter list
// exists in this class, so the missing line presumably selects between the two -- confirm.
1176  template< typename VT1 // Type of the left-hand side target vector
1177  , typename VT2 // Type of the left-hand side vector operand
1178  , typename MT1 > // Type of the right-hand side matrix operand
1180  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1181  {
1182  selectDefaultAddAssignKernel( y, x, A );
1183  }
1185  //**********************************************************************************************
1186 
1187  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Small-matrix addition assignment kernel, SIMD variant: for every column j of A the dot
// product of x with that column is added to y[j].
// Columns are processed in blocks of 8, 4, 3 and 2, followed by a single leftover column.
// Within a block the rows are traversed SIMDSIZE elements at a time; the per-column partial
// products are collected in SIMD accumulators that are reduced with sum() after the row loop.
// NOTE(review): embedded listing line 1205 is absent, so the return type / enabling condition
// of this overload is not visible here.
1202  template< typename VT1 // Type of the left-hand side target vector
1203  , typename VT2 // Type of the left-hand side vector operand
1204  , typename MT1 > // Type of the right-hand side matrix operand
1206  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1207  {
// A scalar tail loop is needed unless both the vector operand and the matrix operand are padded.
1208  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1209 
1210  const size_t M( A.rows() );
1211  const size_t N( A.columns() );
1212 
1213  size_t j( 0UL );
1214 
// --- Blocks of 8 columns ---
1215  for( ; (j+8UL) <= N; j+=8UL )
1216  {
// Row range [ibegin,iend) for this column block:
//  - lower matrices start at row j (j+1 if strictly lower), rounded down to a multiple of SIMDSIZE
//  - upper matrices stop after the last column index of the block (one row earlier if strictly upper)
//  - otherwise the full range [0,M) is used
1217  const size_t ibegin( ( IsLower<MT1>::value )
1218  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1219  :( 0UL ) );
1220  const size_t iend( ( IsUpper<MT1>::value )
1221  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1222  :( M ) );
1223  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1224 
// ipos is the end of the SIMD part: iend rounded down to a multiple of SIMDSIZE when a
// remainder loop exists, iend itself otherwise.
1225  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1226  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1227 
// One accumulator per column; no explicit initializer is given
// (presumably SIMDType default-constructs to zero -- TODO confirm).
1228  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1229  size_t i( ibegin );
1230 
1231  for( ; i<ipos; i+=SIMDSIZE ) {
1232  const SIMDType x1( x.load(i) );
1233  xmm1 += x1 * A.load(i,j );
1234  xmm2 += x1 * A.load(i,j+1UL);
1235  xmm3 += x1 * A.load(i,j+2UL);
1236  xmm4 += x1 * A.load(i,j+3UL);
1237  xmm5 += x1 * A.load(i,j+4UL);
1238  xmm6 += x1 * A.load(i,j+5UL);
1239  xmm7 += x1 * A.load(i,j+6UL);
1240  xmm8 += x1 * A.load(i,j+7UL);
1241  }
1242 
// Horizontal reduction of each accumulator, added onto the existing target elements.
1243  y[j ] += sum( xmm1 );
1244  y[j+1UL] += sum( xmm2 );
1245  y[j+2UL] += sum( xmm3 );
1246  y[j+3UL] += sum( xmm4 );
1247  y[j+4UL] += sum( xmm5 );
1248  y[j+5UL] += sum( xmm6 );
1249  y[j+6UL] += sum( xmm7 );
1250  y[j+7UL] += sum( xmm8 );
1251 
// Scalar tail for the rows in [ipos,iend).
1252  for( ; remainder && i<iend; ++i ) {
1253  y[j ] += x[i] * A(i,j );
1254  y[j+1UL] += x[i] * A(i,j+1UL);
1255  y[j+2UL] += x[i] * A(i,j+2UL);
1256  y[j+3UL] += x[i] * A(i,j+3UL);
1257  y[j+4UL] += x[i] * A(i,j+4UL);
1258  y[j+5UL] += x[i] * A(i,j+5UL);
1259  y[j+6UL] += x[i] * A(i,j+6UL);
1260  y[j+7UL] += x[i] * A(i,j+7UL);
1261  }
1262  }
1263 
// --- Blocks of 4 columns (same scheme as above) ---
1264  for( ; (j+4UL) <= N; j+=4UL )
1265  {
1266  const size_t ibegin( ( IsLower<MT1>::value )
1267  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1268  :( 0UL ) );
1269  const size_t iend( ( IsUpper<MT1>::value )
1270  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1271  :( M ) );
1272  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1273 
1274  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1275  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1276 
1277  SIMDType xmm1, xmm2, xmm3, xmm4;
1278  size_t i( ibegin );
1279 
1280  for( ; i<ipos; i+=SIMDSIZE ) {
1281  const SIMDType x1( x.load(i) );
1282  xmm1 += x1 * A.load(i,j );
1283  xmm2 += x1 * A.load(i,j+1UL);
1284  xmm3 += x1 * A.load(i,j+2UL);
1285  xmm4 += x1 * A.load(i,j+3UL);
1286  }
1287 
1288  y[j ] += sum( xmm1 );
1289  y[j+1UL] += sum( xmm2 );
1290  y[j+2UL] += sum( xmm3 );
1291  y[j+3UL] += sum( xmm4 );
1292 
1293  for( ; remainder && i<iend; ++i ) {
1294  y[j ] += x[i] * A(i,j );
1295  y[j+1UL] += x[i] * A(i,j+1UL);
1296  y[j+2UL] += x[i] * A(i,j+2UL);
1297  y[j+3UL] += x[i] * A(i,j+3UL);
1298  }
1299  }
1300 
// --- Blocks of 3 columns ---
1301  for( ; (j+3UL) <= N; j+=3UL )
1302  {
1303  const size_t ibegin( ( IsLower<MT1>::value )
1304  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1305  :( 0UL ) );
1306  const size_t iend( ( IsUpper<MT1>::value )
1307  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1308  :( M ) );
1309  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1310 
1311  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1312  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1313 
1314  SIMDType xmm1, xmm2, xmm3;
1315  size_t i( ibegin );
1316 
1317  for( ; i<ipos; i+=SIMDSIZE ) {
1318  const SIMDType x1( x.load(i) );
1319  xmm1 += x1 * A.load(i,j );
1320  xmm2 += x1 * A.load(i,j+1UL);
1321  xmm3 += x1 * A.load(i,j+2UL);
1322  }
1323 
1324  y[j ] += sum( xmm1 );
1325  y[j+1UL] += sum( xmm2 );
1326  y[j+2UL] += sum( xmm3 );
1327 
1328  for( ; remainder && i<iend; ++i ) {
1329  y[j ] += x[i] * A(i,j );
1330  y[j+1UL] += x[i] * A(i,j+1UL);
1331  y[j+2UL] += x[i] * A(i,j+2UL);
1332  }
1333  }
1334 
// --- Blocks of 2 columns ---
1335  for( ; (j+2UL) <= N; j+=2UL )
1336  {
1337  const size_t ibegin( ( IsLower<MT1>::value )
1338  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1339  :( 0UL ) );
1340  const size_t iend( ( IsUpper<MT1>::value )
1341  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1342  :( M ) );
1343  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1344 
1345  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1346  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1347 
1348  SIMDType xmm1, xmm2;
1349  size_t i( ibegin );
1350 
1351  for( ; i<ipos; i+=SIMDSIZE ) {
1352  const SIMDType x1( x.load(i) );
1353  xmm1 += x1 * A.load(i,j );
1354  xmm2 += x1 * A.load(i,j+1UL);
1355  }
1356 
1357  y[j ] += sum( xmm1 );
1358  y[j+1UL] += sum( xmm2 );
1359 
1360  for( ; remainder && i<iend; ++i ) {
1361  y[j ] += x[i] * A(i,j );
1362  y[j+1UL] += x[i] * A(i,j+1UL);
1363  }
1364  }
1365 
// --- Single leftover column ---
1366  if( j < N )
1367  {
1368  const size_t ibegin( ( IsLower<MT1>::value )
1369  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1370  :( 0UL ) );
1371  const size_t iend( ( IsUpper<MT1>::value )
1372  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1373  :( M ) );
1374  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1375 
1376  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1377  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1378 
1379  SIMDType xmm1;
1380  size_t i( ibegin );
1381 
1382  for( ; i<ipos; i+=SIMDSIZE ) {
1383  xmm1 += A.load(i,j) * x.load(i);
1384  }
1385 
1386  y[j] += sum( xmm1 );
1387 
1388  for( ; remainder && i<iend; ++i ) {
1389  y[j] += x[i] * A(i,j);
1390  }
1391  }
1392  }
1394  //**********************************************************************************************
1395 
1396  //**Default addition assignment to dense vectors (large matrices)*******************************
// Large-matrix addition assignment kernel, non-vectorized variant: simply forwards to
// selectDefaultAddAssignKernel().
// NOTE(review): embedded listing line 1413 is absent, so the return type / enabling condition
// of this overload is not visible here. Another overload with the same visible parameter list
// exists in this class, so the missing line presumably selects between the two -- confirm.
1410  template< typename VT1 // Type of the left-hand side target vector
1411  , typename VT2 // Type of the left-hand side vector operand
1412  , typename MT1 > // Type of the right-hand side matrix operand
1414  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1415  {
1416  selectDefaultAddAssignKernel( y, x, A );
1417  }
1419  //**********************************************************************************************
1420 
1421  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Large-matrix addition assignment kernel, SIMD variant: for every column j of A the dot
// product of x with that column is added to y[j].
// Columns are processed in blocks of 8, 4 and 2, followed by a single leftover column.
// Within a block the rows are traversed in steps of 4*SIMDSIZE, then 2*SIMDSIZE, then SIMDSIZE;
// every step reduces its SIMD products with sum() and adds the result directly to y.
// NOTE(review): embedded listing line 1439 is absent, so the return type / enabling condition
// of this overload is not visible here.
1436  template< typename VT1 // Type of the left-hand side target vector
1437  , typename VT2 // Type of the left-hand side vector operand
1438  , typename MT1 > // Type of the right-hand side matrix operand
1440  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1441  {
// A scalar tail loop is needed unless both the vector operand and the matrix operand are padded.
1442  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1443 
1444  const size_t M( A.rows() );
1445  const size_t N( A.columns() );
1446 
1447  size_t j( 0UL );
1448 
// --- Blocks of 8 columns ---
1449  for( ; (j+8UL) <= N; j+=8UL )
1450  {
// Row range [ibegin,iend) for this column block:
//  - lower matrices start at row j (j+1 if strictly lower), rounded down to a multiple of SIMDSIZE
//  - upper matrices stop after the last column index of the block (one row earlier if strictly upper)
//  - otherwise the full range [0,M) is used
1451  const size_t ibegin( ( IsLower<MT1>::value )
1452  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1453  :( 0UL ) );
1454  const size_t iend( ( IsUpper<MT1>::value )
1455  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1456  :( M ) );
1457  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1458 
// ipos is the end of the SIMD part: iend rounded down to a multiple of SIMDSIZE when a
// remainder loop exists, iend itself otherwise.
1459  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1460  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1461 
1462  size_t i( ibegin );
1463 
// Rows in steps of four SIMD vectors.
1464  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1465  const size_t i1( i+SIMDSIZE );
1466  const size_t i2( i+SIMDSIZE*2UL );
1467  const size_t i3( i+SIMDSIZE*3UL );
1468  const SIMDType x1( x.load(i ) );
1469  const SIMDType x2( x.load(i1) );
1470  const SIMDType x3( x.load(i2) );
1471  const SIMDType x4( x.load(i3) );
1472  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1473  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1474  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1475  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1476  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1477  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1478  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1479  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
1480  }
1481 
// Rows in steps of two SIMD vectors.
1482  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1483  const size_t i1( i+SIMDSIZE );
1484  const SIMDType x1( x.load(i ) );
1485  const SIMDType x2( x.load(i1) );
1486  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1487  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1488  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1489  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1490  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1491  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1492  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1493  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
1494  }
1495 
// Rows one SIMD vector at a time.
1496  for( ; i<ipos; i+=SIMDSIZE ) {
1497  const SIMDType x1( x.load(i) );
1498  y[j ] += sum( x1 * A.load(i,j ) );
1499  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1500  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1501  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1502  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
1503  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
1504  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
1505  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
1506  }
1507 
// Scalar tail for the rows in [ipos,iend).
1508  for( ; remainder && i<iend; ++i ) {
1509  y[j ] += x[i] * A(i,j );
1510  y[j+1UL] += x[i] * A(i,j+1UL);
1511  y[j+2UL] += x[i] * A(i,j+2UL);
1512  y[j+3UL] += x[i] * A(i,j+3UL);
1513  y[j+4UL] += x[i] * A(i,j+4UL);
1514  y[j+5UL] += x[i] * A(i,j+5UL);
1515  y[j+6UL] += x[i] * A(i,j+6UL);
1516  y[j+7UL] += x[i] * A(i,j+7UL);
1517  }
1518  }
1519 
// --- Blocks of 4 columns (same scheme as above) ---
1520  for( ; (j+4UL) <= N; j+=4UL )
1521  {
1522  const size_t ibegin( ( IsLower<MT1>::value )
1523  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1524  :( 0UL ) );
1525  const size_t iend( ( IsUpper<MT1>::value )
1526  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1527  :( M ) );
1528  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1529 
1530  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1531  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1532 
1533  size_t i( ibegin );
1534 
1535  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1536  const size_t i1( i+SIMDSIZE );
1537  const size_t i2( i+SIMDSIZE*2UL );
1538  const size_t i3( i+SIMDSIZE*3UL );
1539  const SIMDType x1( x.load(i ) );
1540  const SIMDType x2( x.load(i1) );
1541  const SIMDType x3( x.load(i2) );
1542  const SIMDType x4( x.load(i3) );
1543  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1544  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1545  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1546  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1547  }
1548 
1549  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1550  const size_t i1( i+SIMDSIZE );
1551  const SIMDType x1( x.load(i ) );
1552  const SIMDType x2( x.load(i1) );
1553  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1554  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1555  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1556  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1557  }
1558 
1559  for( ; i<ipos; i+=SIMDSIZE ) {
1560  const SIMDType x1( x.load(i) );
1561  y[j ] += sum( x1 * A.load(i,j ) );
1562  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1563  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1564  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1565  }
1566 
1567  for( ; remainder && i<iend; ++i ) {
1568  y[j ] += x[i] * A(i,j );
1569  y[j+1UL] += x[i] * A(i,j+1UL);
1570  y[j+2UL] += x[i] * A(i,j+2UL);
1571  y[j+3UL] += x[i] * A(i,j+3UL);
1572  }
1573  }
1574 
// --- Blocks of 2 columns ---
1575  for( ; (j+2UL) <= N; j+=2UL )
1576  {
1577  const size_t ibegin( ( IsLower<MT1>::value )
1578  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1579  :( 0UL ) );
1580  const size_t iend( ( IsUpper<MT1>::value )
1581  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1582  :( M ) );
1583  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1584 
1585  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1586  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1587 
1588  size_t i( ibegin );
1589 
1590  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1591  const size_t i1( i+SIMDSIZE );
1592  const size_t i2( i+SIMDSIZE*2UL );
1593  const size_t i3( i+SIMDSIZE*3UL );
1594  const SIMDType x1( x.load(i ) );
1595  const SIMDType x2( x.load(i1) );
1596  const SIMDType x3( x.load(i2) );
1597  const SIMDType x4( x.load(i3) );
1598  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1599  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1600  }
1601 
1602  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1603  const size_t i1( i+SIMDSIZE );
1604  const SIMDType x1( x.load(i ) );
1605  const SIMDType x2( x.load(i1) );
1606  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1607  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1608  }
1609 
1610  for( ; i<ipos; i+=SIMDSIZE ) {
1611  const SIMDType x1( x.load(i) );
1612  y[j ] += sum( x1 * A.load(i,j ) );
1613  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1614  }
1615 
1616  for( ; remainder && i<iend; ++i ) {
1617  y[j ] += x[i] * A(i,j );
1618  y[j+1UL] += x[i] * A(i,j+1UL);
1619  }
1620  }
1621 
// --- Single leftover column ---
1622  if( j < N )
1623  {
1624  const size_t ibegin( ( IsLower<MT1>::value )
1625  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1626  :( 0UL ) );
1627  const size_t iend( ( IsUpper<MT1>::value )
1628  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1629  :( M ) );
1630  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1631 
1632  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1633  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1634 
1635  size_t i( ibegin );
1636 
1637  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1638  const size_t i1( i+SIMDSIZE );
1639  const size_t i2( i+SIMDSIZE*2UL );
1640  const size_t i3( i+SIMDSIZE*3UL );
1641  const SIMDType x1( x.load(i ) );
1642  const SIMDType x2( x.load(i1) );
1643  const SIMDType x3( x.load(i2) );
1644  const SIMDType x4( x.load(i3) );
1645  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1646  }
1647 
1648  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1649  const size_t i1( i+SIMDSIZE );
1650  const SIMDType x1( x.load(i ) );
1651  const SIMDType x2( x.load(i1) );
1652  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1653  }
1654 
1655  for( ; i<ipos; i+=SIMDSIZE ) {
1656  const SIMDType x1( x.load(i) );
1657  y[j] += sum( x1 * A.load(i,j) );
1658  }
1659 
1660  for( ; remainder && i<iend; ++i ) {
1661  y[j] += x[i] * A(i,j);
1662  }
1663  }
1664  }
1666  //**********************************************************************************************
1667 
1668  //**BLAS-based addition assignment to dense vectors (default)***********************************
// BLAS addition assignment kernel, fallback variant: no BLAS call is made here, the work is
// forwarded to selectLargeAddAssignKernel().
// NOTE(review): embedded listing line 1685 is absent, so the return type / enabling condition
// of this overload is not visible here. A second overload with the same visible parameter list
// follows inside the BLAZE_BLAS_MODE preprocessor guard.
1682  template< typename VT1 // Type of the left-hand side target vector
1683  , typename VT2 // Type of the left-hand side vector operand
1684  , typename MT1 > // Type of the right-hand side matrix operand
1686  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1687  {
1688  selectLargeAddAssignKernel( y, x, A );
1689  }
1691  //**********************************************************************************************
1692 
1693  //**BLAS-based addition assignment to dense vectors*********************************************
1694 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1695 
// BLAS-backed kernel for the addition assignment of the product of x and A to y.
// NOTE(review): embedded listing line 1711 is absent, so the return type / enabling condition
// of this overload is not visible here.
1708  template< typename VT1 // Type of the left-hand side target vector
1709  , typename VT2 // Type of the left-hand side vector operand
1710  , typename MT1 > // Type of the right-hand side matrix operand
1712  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1713  {
1714  using ET = ElementType_<VT1>;
1715 
// Triangular A: trmv() works in place, so the product is built in a temporary copy of x
// (flagged lower or upper) and then added to y; y itself is never handed to trmv().
1716  if( IsTriangular<MT1>::value ) {
1717  ResultType_<VT1> tmp( serial( x ) );
1718  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1719  addAssign( y, tmp );
1720  }
// Any other A: gemv() is called with the scalars ET(1) and ET(1)
// (presumably alpha = 1 and beta = 1, i.e. the product is accumulated into y -- confirm against blas/gemv.h).
1721  else {
1722  gemv( y, x, A, ET(1), ET(1) );
1723  }
1724  }
1726 #endif
1727  //**********************************************************************************************
1728 
1729  //**Addition assignment to sparse vectors*******************************************************
1730  // No special implementation for the addition assignment to sparse vectors.
1731  //**********************************************************************************************
1732 
1733  //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of the vector/matrix product expression rhs to the dense row vector lhs.
// NOTE(review): embedded listing line 1749 is absent from this listing, so a statement at the
// top of the body may not be visible here.
1746  template< typename VT1 > // Type of the target dense vector
1747  friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1748  {
1750 
1751  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1752 
// Nothing to subtract when the matrix operand has no rows or no columns.
1753  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1754  return;
1755  }
1756 
1757  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1758  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1759 
// Sanity checks: the evaluated operands must keep the operand sizes and match the target size.
1760  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1761  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1762  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1763  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1764 
// Hand over to the kernel dispatcher, which picks the actual implementation.
1765  TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1766  }
1768  //**********************************************************************************************
1769 
1770  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1781  template< typename VT1 // Type of the left-hand side target vector
1782  , typename VT2 // Type of the left-hand side vector operand
1783  , typename MT1 > // Type of the right-hand side matrix operand
1784  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1785  {
1786  if( ( IsDiagonal<MT1>::value ) ||
1787  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1788  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1789  selectSmallSubAssignKernel( y, x, A );
1790  else
1791  selectBlasSubAssignKernel( y, x, A );
1792  }
1794  //**********************************************************************************************
1795 
1796  //**Default subtraction assignment to dense vectors*********************************************
// Default (non-specialized) kernel for the subtraction assignment: the product expression of
// x and A is formed and passed to the subAssign() member of the target vector y.
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1 >  // Type of the right-hand side matrix operand
static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   // Bound by const reference: nothing is evaluated or copied before subAssign() consumes it.
   const auto& product( x * A );
   y.subAssign( product );
}
1818  //**********************************************************************************************
1819 
1820  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Small-matrix subtraction assignment kernel, non-vectorized variant: simply forwards to
// selectDefaultSubAssignKernel().
// NOTE(review): embedded listing line 1837 is absent, so the return type / enabling condition
// of this overload is not visible here. Another overload with the same visible parameter list
// exists in this class, so the missing line presumably selects between the two -- confirm.
1834  template< typename VT1 // Type of the left-hand side target vector
1835  , typename VT2 // Type of the left-hand side vector operand
1836  , typename MT1 > // Type of the right-hand side matrix operand
1838  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1839  {
1840  selectDefaultSubAssignKernel( y, x, A );
1841  }
1843  //**********************************************************************************************
1844 
1845  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Small-matrix subtraction assignment kernel, SIMD variant: for every column j of A the dot
// product of x with that column is subtracted from y[j].
// Columns are processed in blocks of 8, 4, 3 and 2, followed by a single leftover column.
// Within a block the rows are traversed SIMDSIZE elements at a time; the per-column partial
// products are collected in SIMD accumulators that are reduced with sum() after the row loop.
// NOTE(review): embedded listing line 1863 is absent, so the return type / enabling condition
// of this overload is not visible here.
1860  template< typename VT1 // Type of the left-hand side target vector
1861  , typename VT2 // Type of the left-hand side vector operand
1862  , typename MT1 > // Type of the right-hand side matrix operand
1864  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1865  {
// A scalar tail loop is needed unless both the vector operand and the matrix operand are padded.
1866  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1867 
1868  const size_t M( A.rows() );
1869  const size_t N( A.columns() );
1870 
1871  size_t j( 0UL );
1872 
// --- Blocks of 8 columns ---
1873  for( ; (j+8UL) <= N; j+=8UL )
1874  {
// Row range [ibegin,iend) for this column block:
//  - lower matrices start at row j (j+1 if strictly lower), rounded down to a multiple of SIMDSIZE
//  - upper matrices stop after the last column index of the block (one row earlier if strictly upper)
//  - otherwise the full range [0,M) is used
1875  const size_t ibegin( ( IsLower<MT1>::value )
1876  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1877  :( 0UL ) );
1878  const size_t iend( ( IsUpper<MT1>::value )
1879  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1880  :( M ) );
1881  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1882 
// ipos is the end of the SIMD part: iend rounded down to a multiple of SIMDSIZE when a
// remainder loop exists, iend itself otherwise.
1883  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1884  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1885 
// One accumulator per column; no explicit initializer is given
// (presumably SIMDType default-constructs to zero -- TODO confirm).
1886  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1887  size_t i( ibegin );
1888 
1889  for( ; i<ipos; i+=SIMDSIZE ) {
1890  const SIMDType x1( x.load(i) );
1891  xmm1 += x1 * A.load(i,j );
1892  xmm2 += x1 * A.load(i,j+1UL);
1893  xmm3 += x1 * A.load(i,j+2UL);
1894  xmm4 += x1 * A.load(i,j+3UL);
1895  xmm5 += x1 * A.load(i,j+4UL);
1896  xmm6 += x1 * A.load(i,j+5UL);
1897  xmm7 += x1 * A.load(i,j+6UL);
1898  xmm8 += x1 * A.load(i,j+7UL);
1899  }
1900 
// Horizontal reduction of each accumulator, subtracted from the existing target elements.
1901  y[j ] -= sum( xmm1 );
1902  y[j+1UL] -= sum( xmm2 );
1903  y[j+2UL] -= sum( xmm3 );
1904  y[j+3UL] -= sum( xmm4 );
1905  y[j+4UL] -= sum( xmm5 );
1906  y[j+5UL] -= sum( xmm6 );
1907  y[j+6UL] -= sum( xmm7 );
1908  y[j+7UL] -= sum( xmm8 );
1909 
// Scalar tail for the rows in [ipos,iend).
1910  for( ; remainder && i<iend; ++i ) {
1911  y[j ] -= x[i] * A(i,j );
1912  y[j+1UL] -= x[i] * A(i,j+1UL);
1913  y[j+2UL] -= x[i] * A(i,j+2UL);
1914  y[j+3UL] -= x[i] * A(i,j+3UL);
1915  y[j+4UL] -= x[i] * A(i,j+4UL);
1916  y[j+5UL] -= x[i] * A(i,j+5UL);
1917  y[j+6UL] -= x[i] * A(i,j+6UL);
1918  y[j+7UL] -= x[i] * A(i,j+7UL);
1919  }
1920  }
1921 
// --- Blocks of 4 columns (same scheme as above) ---
1922  for( ; (j+4UL) <= N; j+=4UL )
1923  {
1924  const size_t ibegin( ( IsLower<MT1>::value )
1925  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1926  :( 0UL ) );
1927  const size_t iend( ( IsUpper<MT1>::value )
1928  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1929  :( M ) );
1930  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1931 
1932  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1933  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1934 
1935  SIMDType xmm1, xmm2, xmm3, xmm4;
1936  size_t i( ibegin );
1937 
1938  for( ; i<ipos; i+=SIMDSIZE ) {
1939  const SIMDType x1( x.load(i) );
1940  xmm1 += x1 * A.load(i,j );
1941  xmm2 += x1 * A.load(i,j+1UL);
1942  xmm3 += x1 * A.load(i,j+2UL);
1943  xmm4 += x1 * A.load(i,j+3UL);
1944  }
1945 
1946  y[j ] -= sum( xmm1 );
1947  y[j+1UL] -= sum( xmm2 );
1948  y[j+2UL] -= sum( xmm3 );
1949  y[j+3UL] -= sum( xmm4 );
1950 
1951  for( ; remainder && i<iend; ++i ) {
1952  y[j ] -= x[i] * A(i,j );
1953  y[j+1UL] -= x[i] * A(i,j+1UL);
1954  y[j+2UL] -= x[i] * A(i,j+2UL);
1955  y[j+3UL] -= x[i] * A(i,j+3UL);
1956  }
1957  }
1958 
// --- Blocks of 3 columns ---
1959  for( ; (j+3UL) <= N; j+=3UL )
1960  {
1961  const size_t ibegin( ( IsLower<MT1>::value )
1962  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1963  :( 0UL ) );
1964  const size_t iend( ( IsUpper<MT1>::value )
1965  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1966  :( M ) );
1967  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1968 
1969  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1970  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1971 
1972  SIMDType xmm1, xmm2, xmm3;
1973  size_t i( ibegin );
1974 
1975  for( ; i<ipos; i+=SIMDSIZE ) {
1976  const SIMDType x1( x.load(i) );
1977  xmm1 += x1 * A.load(i,j );
1978  xmm2 += x1 * A.load(i,j+1UL);
1979  xmm3 += x1 * A.load(i,j+2UL);
1980  }
1981 
1982  y[j ] -= sum( xmm1 );
1983  y[j+1UL] -= sum( xmm2 );
1984  y[j+2UL] -= sum( xmm3 );
1985 
1986  for( ; remainder && i<iend; ++i ) {
1987  y[j ] -= x[i] * A(i,j );
1988  y[j+1UL] -= x[i] * A(i,j+1UL);
1989  y[j+2UL] -= x[i] * A(i,j+2UL);
1990  }
1991  }
1992 
// --- Blocks of 2 columns ---
1993  for( ; (j+2UL) <= N; j+=2UL )
1994  {
1995  const size_t ibegin( ( IsLower<MT1>::value )
1996  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1997  :( 0UL ) );
1998  const size_t iend( ( IsUpper<MT1>::value )
1999  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2000  :( M ) );
2001  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2002 
2003  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2004  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2005 
2006  SIMDType xmm1, xmm2;
2007  size_t i( ibegin );
2008 
2009  for( ; i<ipos; i+=SIMDSIZE ) {
2010  const SIMDType x1( x.load(i) );
2011  xmm1 += x1 * A.load(i,j );
2012  xmm2 += x1 * A.load(i,j+1UL);
2013  }
2014 
2015  y[j ] -= sum( xmm1 );
2016  y[j+1UL] -= sum( xmm2 );
2017 
2018  for( ; remainder && i<iend; ++i ) {
2019  y[j ] -= x[i] * A(i,j );
2020  y[j+1UL] -= x[i] * A(i,j+1UL);
2021  }
2022  }
2023 
// --- Single leftover column ---
2024  if( j < N )
2025  {
2026  const size_t ibegin( ( IsLower<MT1>::value )
2027  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2028  :( 0UL ) );
2029  const size_t iend( ( IsUpper<MT1>::value )
2030  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2031  :( M ) );
2032  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2033 
2034  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2035  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2036 
2037  SIMDType xmm1;
2038  size_t i( ibegin );
2039 
2040  for( ; i<ipos; i+=SIMDSIZE ) {
2041  xmm1 += A.load(i,j) * x.load(i);
2042  }
2043 
2044  y[j] -= sum( xmm1 );
2045 
2046  for( ; remainder && i<iend; ++i ) {
2047  y[j] -= x[i] * A(i,j);
2048  }
2049  }
2050  }
2052  //**********************************************************************************************
2053 
2054  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Large-matrix subtraction assignment kernel, non-vectorized variant: simply forwards to
// selectDefaultSubAssignKernel().
// NOTE(review): embedded listing line 2071 is absent, so the return type / enabling condition
// of this overload is not visible here. Another overload with the same visible parameter list
// exists in this class, so the missing line presumably selects between the two -- confirm.
2068  template< typename VT1 // Type of the left-hand side target vector
2069  , typename VT2 // Type of the left-hand side vector operand
2070  , typename MT1 > // Type of the right-hand side matrix operand
2072  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2073  {
2074  selectDefaultSubAssignKernel( y, x, A );
2075  }
2077  //**********************************************************************************************
2078 
2079  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
/*!\brief Vectorized subtraction assignment kernel for large matrices (\f$ y-=x*A \f$).
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// For every column j of A the kernel subtracts the sum over i of x[i]*A(i,j) from y[j]. The
// columns are processed in blocks of eight, four, two and finally one. Within each block the
// rows are traversed with SIMD loads unrolled four times, twice and once; every SIMD
// iteration is reduced with sum() and subtracted from y immediately. If one of the operand
// types is not padded, the rows that do not fill a complete SIMD vector are handled by a
// scalar loop.
// NOTE(review): the declaration line carrying the return type (and any overload-selection
// condition) is not shown in this listing -- confirm it against the original header.
*/
2094  template< typename VT1 // Type of the left-hand side target vector
2095  , typename VT2 // Type of the left-hand side vector operand
2096  , typename MT1 > // Type of the right-hand side matrix operand
2098  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2099  {
      // A scalar clean-up loop is only needed if at least one of the operand types is unpadded.
2100  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
2101 
2102  const size_t M( A.rows() );
2103  const size_t N( A.columns() );
2104 
2105  size_t j( 0UL );
2106 
      // Column blocks of width eight: updates y[j] .. y[j+7] per outer iteration.
2107  for( ; (j+8UL) <= N; j+=8UL )
2108  {
      // First row to visit: for lower matrices start at the block's first column index (one
      // further for strictly lower), masked with size_t(-SIMDSIZE); otherwise start at row 0.
2109  const size_t ibegin( ( IsLower<MT1>::value )
2110  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2111  :( 0UL ) );
      // One past the last row to visit: for upper matrices limited to the end of the column
      // block (one less for strictly upper); otherwise all M rows.
2112  const size_t iend( ( IsUpper<MT1>::value )
2113  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
2114  :( M ) );
2115  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2116 
      // End of the SIMD-processed row range: iend rounded down to a multiple of SIMDSIZE if a
      // remainder loop exists (checked by the assertion below), iend itself otherwise.
2117  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2118  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2119 
2120  size_t i( ibegin );
2121 
      // Four SIMD vectors of x per iteration, reused for all eight columns.
2122  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2123  const size_t i1( i+SIMDSIZE );
2124  const size_t i2( i+SIMDSIZE*2UL );
2125  const size_t i3( i+SIMDSIZE*3UL );
2126  const SIMDType x1( x.load(i ) );
2127  const SIMDType x2( x.load(i1) );
2128  const SIMDType x3( x.load(i2) );
2129  const SIMDType x4( x.load(i3) );
2130  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2131  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2132  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2133  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2134  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2135  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2136  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2137  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
2138  }
2139 
      // Two SIMD vectors of x per iteration.
2140  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2141  const size_t i1( i+SIMDSIZE );
2142  const SIMDType x1( x.load(i ) );
2143  const SIMDType x2( x.load(i1) );
2144  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2145  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2146  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2147  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2148  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2149  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2150  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2151  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
2152  }
2153 
      // One SIMD vector of x per iteration.
2154  for( ; i<ipos; i+=SIMDSIZE ) {
2155  const SIMDType x1( x.load(i) );
2156  y[j ] -= sum( x1 * A.load(i,j ) );
2157  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2158  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2159  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2160  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) );
2161  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) );
2162  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) );
2163  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) );
2164  }
2165 
      // Scalar clean-up for the remaining rows up to iend (compiled in only if 'remainder').
2166  for( ; remainder && i<iend; ++i ) {
2167  y[j ] -= x[i] * A(i,j );
2168  y[j+1UL] -= x[i] * A(i,j+1UL);
2169  y[j+2UL] -= x[i] * A(i,j+2UL);
2170  y[j+3UL] -= x[i] * A(i,j+3UL);
2171  y[j+4UL] -= x[i] * A(i,j+4UL);
2172  y[j+5UL] -= x[i] * A(i,j+5UL);
2173  y[j+6UL] -= x[i] * A(i,j+6UL);
2174  y[j+7UL] -= x[i] * A(i,j+7UL);
2175  }
2176  }
2177 
      // Column blocks of width four: same scheme as above for y[j] .. y[j+3].
2178  for( ; (j+4UL) <= N; j+=4UL )
2179  {
2180  const size_t ibegin( ( IsLower<MT1>::value )
2181  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2182  :( 0UL ) );
2183  const size_t iend( ( IsUpper<MT1>::value )
2184  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
2185  :( M ) );
2186  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2187 
2188  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2189  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2190 
2191  size_t i( ibegin );
2192 
      // Four SIMD vectors of x per iteration.
2193  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2194  const size_t i1( i+SIMDSIZE );
2195  const size_t i2( i+SIMDSIZE*2UL );
2196  const size_t i3( i+SIMDSIZE*3UL );
2197  const SIMDType x1( x.load(i ) );
2198  const SIMDType x2( x.load(i1) );
2199  const SIMDType x3( x.load(i2) );
2200  const SIMDType x4( x.load(i3) );
2201  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2202  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2203  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2204  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2205  }
2206 
      // Two SIMD vectors of x per iteration.
2207  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2208  const size_t i1( i+SIMDSIZE );
2209  const SIMDType x1( x.load(i ) );
2210  const SIMDType x2( x.load(i1) );
2211  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2212  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2213  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2214  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2215  }
2216 
      // One SIMD vector of x per iteration.
2217  for( ; i<ipos; i+=SIMDSIZE ) {
2218  const SIMDType x1( x.load(i) );
2219  y[j ] -= sum( x1 * A.load(i,j ) );
2220  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2221  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2222  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2223  }
2224 
      // Scalar clean-up for the remaining rows up to iend.
2225  for( ; remainder && i<iend; ++i ) {
2226  y[j ] -= x[i] * A(i,j );
2227  y[j+1UL] -= x[i] * A(i,j+1UL);
2228  y[j+2UL] -= x[i] * A(i,j+2UL);
2229  y[j+3UL] -= x[i] * A(i,j+3UL);
2230  }
2231  }
2232 
      // Column blocks of width two: same scheme for y[j] and y[j+1].
2233  for( ; (j+2UL) <= N; j+=2UL )
2234  {
2235  const size_t ibegin( ( IsLower<MT1>::value )
2236  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2237  :( 0UL ) );
2238  const size_t iend( ( IsUpper<MT1>::value )
2239  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2240  :( M ) );
2241  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2242 
2243  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2244  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2245 
2246  size_t i( ibegin );
2247 
      // Four SIMD vectors of x per iteration.
2248  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2249  const size_t i1( i+SIMDSIZE );
2250  const size_t i2( i+SIMDSIZE*2UL );
2251  const size_t i3( i+SIMDSIZE*3UL );
2252  const SIMDType x1( x.load(i ) );
2253  const SIMDType x2( x.load(i1) );
2254  const SIMDType x3( x.load(i2) );
2255  const SIMDType x4( x.load(i3) );
2256  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2257  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2258  }
2259 
      // Two SIMD vectors of x per iteration.
2260  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2261  const size_t i1( i+SIMDSIZE );
2262  const SIMDType x1( x.load(i ) );
2263  const SIMDType x2( x.load(i1) );
2264  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2265  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2266  }
2267 
      // One SIMD vector of x per iteration.
2268  for( ; i<ipos; i+=SIMDSIZE ) {
2269  const SIMDType x1( x.load(i) );
2270  y[j ] -= sum( x1 * A.load(i,j ) );
2271  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2272  }
2273 
      // Scalar clean-up for the remaining rows up to iend.
2274  for( ; remainder && i<iend; ++i ) {
2275  y[j ] -= x[i] * A(i,j );
2276  y[j+1UL] -= x[i] * A(i,j+1UL);
2277  }
2278  }
2279 
      // At most one column is left after the blocked loops; handle it on its own.
2280  if( j < N )
2281  {
2282  const size_t ibegin( ( IsLower<MT1>::value )
2283  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2284  :( 0UL ) );
2285  const size_t iend( ( IsUpper<MT1>::value )
2286  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2287  :( M ) );
2288  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2289 
2290  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2291  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2292 
2293  size_t i( ibegin );
2294 
      // Four SIMD vectors of x per iteration.
2295  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2296  const size_t i1( i+SIMDSIZE );
2297  const size_t i2( i+SIMDSIZE*2UL );
2298  const size_t i3( i+SIMDSIZE*3UL );
2299  const SIMDType x1( x.load(i ) );
2300  const SIMDType x2( x.load(i1) );
2301  const SIMDType x3( x.load(i2) );
2302  const SIMDType x4( x.load(i3) );
2303  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
2304  }
2305 
      // Two SIMD vectors of x per iteration.
2306  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2307  const size_t i1( i+SIMDSIZE );
2308  const SIMDType x1( x.load(i ) );
2309  const SIMDType x2( x.load(i1) );
2310  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
2311  }
2312 
      // One SIMD vector of x per iteration.
2313  for( ; i<ipos; i+=SIMDSIZE ) {
2314  const SIMDType x1( x.load(i) );
2315  y[j] -= sum( x1 * A.load(i,j) );
2316  }
2317 
      // Scalar clean-up for the remaining rows up to iend.
2318  for( ; remainder && i<iend; ++i ) {
2319  y[j] -= x[i] * A(i,j);
2320  }
2321  }
2322  }
2324  //**********************************************************************************************
2325 
2326  //**BLAS-based subtraction assignment to dense vectors (default)********************************
/*!\brief Fallback overload of the BLAS-based subtraction assignment kernel.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// This overload does not call into BLAS; it passes all three arguments on to
// selectLargeSubAssignKernel().
// NOTE(review): the declaration line carrying the return type (and any overload-selection
// condition) is not shown in this listing -- confirm it against the original header.
*/
2340  template< typename VT1 // Type of the left-hand side target vector
2341  , typename VT2 // Type of the left-hand side vector operand
2342  , typename MT1 > // Type of the right-hand side matrix operand
2344  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2345  {
      // Delegate to the large-matrix kernel.
2346  selectLargeSubAssignKernel( y, x, A );
2347  }
2349  //**********************************************************************************************
2350 
2351  //**BLAS-based subtraction assignment to dense vectors******************************************
2352 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2353 
/*!\brief BLAS-based subtraction assignment kernel.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// For triangular matrix types the vector operand is evaluated serially into a temporary of
// the target's result type, trmv() is applied to that temporary (passing CblasLower for lower
// matrices and CblasUpper otherwise), and the temporary is subtracted from \a y via
// subAssign(). For all other matrix types a single gemv() call is issued with the scalars
// ET(-1) and ET(1).
// NOTE(review): presumably the two scalars scale the product and the existing content of
// \a y, respectively -- confirm against the gemv() wrapper.
// NOTE(review): the declaration line carrying the return type (and any overload-selection
// condition) is not shown in this listing -- confirm it against the original header.
*/
2366  template< typename VT1 // Type of the left-hand side target vector
2367  , typename VT2 // Type of the left-hand side vector operand
2368  , typename MT1 > // Type of the right-hand side matrix operand
2370  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2371  {
2372  using ET = ElementType_<VT1>;
2373 
2374  if( IsTriangular<MT1>::value ) {
      // Work on a copy of x so that y is only touched by the final subAssign().
2375  ResultType_<VT1> tmp( serial( x ) );
2376  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2377  subAssign( y, tmp );
2378  }
2379  else {
2380  gemv( y, x, A, ET(-1), ET(1) );
2381  }
2382  }
2384 #endif
2385  //**********************************************************************************************
2386 
2387  //**Subtraction assignment to sparse vectors****************************************************
2388  // No special implementation for the subtraction assignment to sparse vectors.
2389  //**********************************************************************************************
2390 
2391  //**Multiplication assignment to dense vectors**************************************************
/*!\brief Multiplication assignment of the vector/matrix multiplication expression to a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression.
// \return void
//
// The expression is first evaluated serially into a temporary of type ResultType; the
// temporary is then handed to multAssign() together with the target vector.
// NOTE(review): several lines at the top of the body are elided in this listing.
*/
2404  template< typename VT1 > // Type of the target dense vector
2405  friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2406  {
2408 
2412 
2413  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2414 
      // Evaluate the expression once, then apply the element-wise operation to the result.
2415  const ResultType tmp( serial( rhs ) );
2416  multAssign( ~lhs, tmp );
2417  }
2419  //**********************************************************************************************
2420 
2421  //**Multiplication assignment to sparse vectors*************************************************
2422  // No special implementation for the multiplication assignment to sparse vectors.
2423  //**********************************************************************************************
2424 
2425  //**Division assignment to dense vectors********************************************************
/*!\brief Division assignment of the vector/matrix multiplication expression to a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression acting as divisor.
// \return void
//
// The expression is first evaluated serially into a temporary of type ResultType; the
// temporary is then handed to divAssign() together with the target vector.
// NOTE(review): several lines at the top of the body are elided in this listing.
*/
2438  template< typename VT1 > // Type of the target dense vector
2439  friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2440  {
2442 
2446 
2447  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2448 
      // Evaluate the expression once, then apply the element-wise operation to the result.
2449  const ResultType tmp( serial( rhs ) );
2450  divAssign( ~lhs, tmp );
2451  }
2453  //**********************************************************************************************
2454 
2455  //**Division assignment to sparse vectors*******************************************************
2456  // No special implementation for the division assignment to sparse vectors.
2457  //**********************************************************************************************
2458 
2459  //**SMP assignment to dense vectors*************************************************************
/*!\brief SMP assignment of the vector/matrix multiplication expression to a dense vector.
//
// The function is only enabled if UseSMPAssign<VT1> holds. If the matrix operand has no rows,
// the target vector is reset; if it has no columns, nothing is done. Otherwise both operands
// are evaluated into objects of type LT and RT and the product of the evaluated operands is
// passed to smpAssign().
// NOTE(review): the line with the function name and parameter list is not shown in this
// listing; the body refers to the parameters as \a lhs (target) and \a rhs (expression).
*/
2474  template< typename VT1 > // Type of the target dense vector
2475  friend inline EnableIf_< UseSMPAssign<VT1> >
2477  {
2479 
2480  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2481 
      // Degenerate shapes: no rows -> reset the target; no columns -> nothing to do.
2482  if( rhs.mat_.rows() == 0UL ) {
2483  reset( ~lhs );
2484  return;
2485  }
2486  else if( rhs.mat_.columns() == 0UL ) {
2487  return;
2488  }
2489 
2490  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2491  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2492 
2493  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2494  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2495  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2496  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2497 
      // Re-dispatch on the product of the evaluated operands.
2498  smpAssign( ~lhs, x * A );
2499  }
2501  //**********************************************************************************************
2502 
2503  //**SMP assignment to sparse vectors************************************************************
/*!\brief SMP assignment of the vector/matrix multiplication expression to a sparse vector.
//
// The function is only enabled if UseSMPAssign<VT1> holds. The expression is evaluated into a
// temporary of type ResultType, which is then passed to smpAssign() together with the target.
// NOTE(review): the line with the function name and parameter list is not shown in this
// listing; the body refers to the parameters as \a lhs (target) and \a rhs (expression).
*/
2518  template< typename VT1 > // Type of the target sparse vector
2519  friend inline EnableIf_< UseSMPAssign<VT1> >
2521  {
2523 
2527 
2528  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2529 
      // Evaluate the expression into a temporary before assigning it to the target.
2530  const ResultType tmp( rhs );
2531  smpAssign( ~lhs, tmp );
2532  }
2534  //**********************************************************************************************
2535 
2536  //**SMP addition assignment to dense vectors****************************************************
/*!\brief SMP addition assignment of the vector/matrix multiplication expression to a dense vector.
//
// The function is only enabled if UseSMPAssign<VT1> holds. If the matrix operand has no rows
// or no columns, the target is left untouched. Otherwise both operands are evaluated into
// objects of type LT and RT and the product of the evaluated operands is passed to
// smpAddAssign().
// NOTE(review): the line with the function name and parameter list is not shown in this
// listing; the body refers to the parameters as \a lhs (target) and \a rhs (expression).
*/
2551  template< typename VT1 > // Type of the target dense vector
2552  friend inline EnableIf_< UseSMPAssign<VT1> >
2554  {
2556 
2557  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2558 
      // Nothing to add for an empty matrix operand.
2559  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2560  return;
2561  }
2562 
2563  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2564  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2565 
2566  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2567  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2568  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2569  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2570 
      // Re-dispatch on the product of the evaluated operands.
2571  smpAddAssign( ~lhs, x * A );
2572  }
2574  //**********************************************************************************************
2575 
2576  //**SMP addition assignment to sparse vectors***************************************************
2577  // No special implementation for the SMP addition assignment to sparse vectors.
2578  //**********************************************************************************************
2579 
2580  //**SMP subtraction assignment to dense vectors*************************************************
/*!\brief SMP subtraction assignment of the vector/matrix multiplication expression to a dense vector.
//
// The function is only enabled if UseSMPAssign<VT1> holds. If the matrix operand has no rows
// or no columns, the target is left untouched. Otherwise both operands are evaluated into
// objects of type LT and RT and the product of the evaluated operands is passed to
// smpSubAssign().
// NOTE(review): the line with the function name and parameter list is not shown in this
// listing; the body refers to the parameters as \a lhs (target) and \a rhs (expression).
*/
2595  template< typename VT1 > // Type of the target dense vector
2596  friend inline EnableIf_< UseSMPAssign<VT1> >
2598  {
2600 
2601  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2602 
      // Nothing to subtract for an empty matrix operand.
2603  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2604  return;
2605  }
2606 
2607  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2608  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2609 
2610  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2611  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2612  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2613  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2614 
      // Re-dispatch on the product of the evaluated operands.
2615  smpSubAssign( ~lhs, x * A );
2616  }
2618  //**********************************************************************************************
2619 
2620  //**SMP subtraction assignment to sparse vectors************************************************
2621  // No special implementation for the SMP subtraction assignment to sparse vectors.
2622  //**********************************************************************************************
2623 
2624  //**SMP multiplication assignment to dense vectors**********************************************
/*!\brief SMP multiplication assignment of the vector/matrix multiplication expression to a dense vector.
//
// The function is only enabled if UseSMPAssign<VT1> holds. The expression is evaluated into a
// temporary of type ResultType, which is then passed to smpMultAssign() together with the
// target.
// NOTE(review): the line with the function name and parameter list is not shown in this
// listing; the body refers to the parameters as \a lhs (target) and \a rhs (expression).
*/
2639  template< typename VT1 > // Type of the target dense vector
2640  friend inline EnableIf_< UseSMPAssign<VT1> >
2642  {
2644 
2648 
2649  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2650 
      // Evaluate the expression once, then apply the element-wise operation to the result.
2651  const ResultType tmp( rhs );
2652  smpMultAssign( ~lhs, tmp );
2653  }
2655  //**********************************************************************************************
2656 
2657  //**SMP multiplication assignment to sparse vectors*********************************************
2658  // No special implementation for the SMP multiplication assignment to sparse vectors.
2659  //**********************************************************************************************
2660 
2661  //**SMP division assignment to dense vectors****************************************************
/*!\brief SMP division assignment of the vector/matrix multiplication expression to a dense vector.
//
// The function is only enabled if UseSMPAssign<VT1> holds. The expression is evaluated into a
// temporary of type ResultType, which is then passed to smpDivAssign() together with the
// target.
// NOTE(review): the line with the function name and parameter list is not shown in this
// listing; the body refers to the parameters as \a lhs (target) and \a rhs (expression).
*/
2676  template< typename VT1 > // Type of the target dense vector
2677  friend inline EnableIf_< UseSMPAssign<VT1> >
2679  {
2681 
2685 
2686  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2687 
      // Evaluate the expression once, then apply the element-wise operation to the result.
2688  const ResultType tmp( rhs );
2689  smpDivAssign( ~lhs, tmp );
2690  }
2692  //**********************************************************************************************
2693 
2694  //**SMP division assignment to sparse vectors***************************************************
2695  // No special implementation for the SMP division assignment to sparse vectors.
2696  //**********************************************************************************************
2697 
2698  //**Compile time checks*************************************************************************
2706  //**********************************************************************************************
2707 };
2708 //*************************************************************************************************
2709 
2710 
2711 
2712 
2713 //=================================================================================================
2714 //
2715 // DVECSCALARMULTEXPR SPECIALIZATION
2716 //
2717 //=================================================================================================
2718 
2719 //*************************************************************************************************
2727 template< typename VT // Type of the left-hand side dense vector
2728  , typename MT // Type of the right-hand side dense matrix
2729  , typename ST > // Type of the right-hand side scalar value
2730 class DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >
2731  : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true > >
2732  , private Computation
2733 {
2734  private:
2735  //**Type definitions****************************************************************************
2736  using VMM = TDVecTDMatMultExpr<VT,MT>; //!< Type of the vector/matrix multiplication expression.
2737  using RES = ResultType_<VMM>; //!< Result type of the vector/matrix multiplication expression.
2738  using VRT = ResultType_<VT>; //!< Result type of the left-hand side dense vector expression.
2739  using MRT = ResultType_<MT>; //!< Result type of the right-hand side dense matrix expression.
2740  using VET = ElementType_<VRT>; //!< Element type of the vector result type.
2741  using MET = ElementType_<MRT>; //!< Element type of the matrix result type.
2742  using VCT = CompositeType_<VT>; //!< Composite type of the left-hand side dense vector expression.
2743  using MCT = CompositeType_<MT>; //!< Composite type of the right-hand side dense matrix expression.
2744  //**********************************************************************************************
2745 
2746  //**********************************************************************************************
      //! Compile-time flag: set if the vector type VT is a computation or requires evaluation.
2748  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2749  //**********************************************************************************************
2750 
2751  //**********************************************************************************************
      //! Compile-time flag for the matrix operand. The visible part of the condition requires MT
      //! to be a computation and the matrix and vector element types to be identical.
      //! NOTE(review): the remainder of the expression is not shown in this listing.
2753  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2755  //**********************************************************************************************
2756 
2757  //**********************************************************************************************
2759 
      //! Helper for selecting the SMP assignment overloads: \a value is set if the target type T1
      //! is SMP-assignable and at least one of the two operands has to be evaluated.
2762  template< typename T1 >
2763  struct UseSMPAssign {
2764  enum : bool { value = T1::smpAssignable && ( evaluateVector || evaluateMatrix ) };
2765  };
2766  //**********************************************************************************************
2767 
2768  //**********************************************************************************************
2770 
      //! Helper for selecting the BLAS kernels. The visible part of the condition requires the
      //! first three types to be SIMD-enabled and the element types of T1 and T3 to be identical.
      //! NOTE(review): several lines of this definition are not shown in this listing.
2772  template< typename T1, typename T2, typename T3, typename T4 >
2773  struct UseBlasKernel {
2779  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2784  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2786  };
2787  //**********************************************************************************************
2788 
2789  //**********************************************************************************************
2791 
      //! Helper for selecting the vectorized default kernels. The visible part of the condition
      //! requires useOptimizedKernels, SIMD-enabled T1/T2/T3, and SIMD addition and
      //! multiplication support for the element types of T2 and T3.
      //! NOTE(review): several lines of this definition are not shown in this listing.
2794  template< typename T1, typename T2, typename T3, typename T4 >
2795  struct UseVectorizedDefaultKernel {
2796  enum : bool { value = useOptimizedKernels &&
2798  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2802  , T4 >::value &&
2803  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2804  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
2805  };
2806  //**********************************************************************************************
2807 
2808  public:
2809  //**Type definitions****************************************************************************
      // NOTE(review): further aliases of this section (e.g. ElementType, used below) are not
      // shown in this listing.
2811  using ResultType = MultTrait_<RES,ST>; //!< Result type: multiplication trait of RES and the scalar type.
2815  using ReturnType = const ElementType; //!< Return type of the element access functions.
2816  using CompositeType = const ResultType; //!< Composite type: a constant ResultType.
2817 
2819  using LeftOperand = const TDVecTDMatMultExpr<VT,MT>; //!< Type of the left-hand side vector/matrix multiplication expression.
2820 
2822  using RightOperand = ST; //!< Type of the right-hand side scalar.
2823 
2826 
2829  //**********************************************************************************************
2830 
2831  //**Compilation flags***************************************************************************
      //! SIMD flag. The visible part of the condition requires a non-diagonal matrix type and
      //! SIMD-enabled operand types.
      //! NOTE(review): the remainder of this expression is not shown in this listing.
2833  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
2834  VT::simdEnabled && MT::simdEnabled &&
2838 
      //! SMP flag: set only if neither operand has to be evaluated and both operand types are
      //! SMP-assignable.
2840  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2841  !evaluateMatrix && MT::smpAssignable };
2842  //**********************************************************************************************
2843 
2844  //**SIMD properties*****************************************************************************
      //! Value of SIMDTrait<ElementType>::size; used as the step width of the SIMD loops.
2846  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
2847  //**********************************************************************************************
2848 
2849  //**Constructor*********************************************************************************
/*!\brief Constructor for the scaled vector/matrix multiplication expression.
//
// \param vector The left-hand side vector/matrix multiplication expression.
// \param scalar The right-hand side scalar of the multiplication expression.
*/
2855  explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )
2856  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2857  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2858  {}
2859  //**********************************************************************************************
2860 
2861  //**Subscript operator**************************************************************************
/*!\brief Subscript operator for direct access to the vector elements.
//
// \param index Access index. Checked against the vector size only by an internal assertion.
// \return The element of the wrapped expression at \a index multiplied by the scalar.
*/
2867  inline ReturnType operator[]( size_t index ) const {
2868  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2869  return vector_[index] * scalar_;
2870  }
2871  //**********************************************************************************************
2872 
2873  //**At function*********************************************************************************
/*!\brief Checked access to the vector elements.
//
// \param index Access index.
// \return The same value as the subscript operator for \a index.
//
// If \a index is not smaller than the vector size, BLAZE_THROW_OUT_OF_RANGE is invoked with
// the message "Invalid vector access index"; otherwise the call is forwarded to operator[].
*/
2880  inline ReturnType at( size_t index ) const {
2881  if( index >= vector_.size() ) {
2882  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2883  }
2884  return (*this)[index];
2885  }
2886  //**********************************************************************************************
2887 
2888  //**Size function*******************************************************************************
/*!\brief Returns the current size of the vector.
//
// \return The size reported by the wrapped vector/matrix multiplication expression.
*/
2893  inline size_t size() const {
2894  return vector_.size();
2895  }
2896  //**********************************************************************************************
2897 
2898  //**Left operand access*************************************************************************
/*!\brief Returns the left-hand side operand.
//
// \return The stored vector/matrix multiplication expression.
*/
2903  inline LeftOperand leftOperand() const {
2904  return vector_;
2905  }
2906  //**********************************************************************************************
2907 
2908  //**Right operand access************************************************************************
/*!\brief Returns the right-hand side operand.
//
// \return The stored scalar.
*/
2913  inline RightOperand rightOperand() const {
2914  return scalar_;
2915  }
2916  //**********************************************************************************************
2917 
2918  //**********************************************************************************************
/*!\brief Returns whether the expression can alias with the given address \a alias.
//
// \param alias The alias to be checked.
// \return The result of canAlias() of the wrapped vector/matrix multiplication expression.
*/
2924  template< typename T >
2925  inline bool canAlias( const T* alias ) const {
2926  return vector_.canAlias( alias );
2927  }
2928  //**********************************************************************************************
2929 
2930  //**********************************************************************************************
/*!\brief Returns whether the expression is aliased with the given address \a alias.
//
// \param alias The alias to be checked.
// \return The result of isAliased() of the wrapped vector/matrix multiplication expression.
*/
2936  template< typename T >
2937  inline bool isAliased( const T* alias ) const {
2938  return vector_.isAliased( alias );
2939  }
2940  //**********************************************************************************************
2941 
2942  //**********************************************************************************************
/*!\brief Returns whether the operands of the expression are properly aligned in memory.
//
// \return The result of isAligned() of the wrapped vector/matrix multiplication expression.
*/
2947  inline bool isAligned() const {
2948  return vector_.isAligned();
2949  }
2950  //**********************************************************************************************
2951 
2952  //**********************************************************************************************
/*!\brief Returns whether the expression can be used in SMP assignments.
//
// \return \a true if the size of the expression exceeds SMP_TDVECTDMATMULT_THRESHOLD and at
//         least one of the alternatives of the first condition holds, \a false otherwise.
//
// Among the visible alternatives are: BLAS mode is disabled, the matrix operand is a
// computation that is not evaluated, or the number of matrix elements is below
// TDVECTDMATMULT_THRESHOLD.
// NOTE(review): two lines of the condition are not shown in this listing.
*/
2957  inline bool canSMPAssign() const noexcept {
2958  RightOperand_<VMM> A( vector_.rightOperand() );
2959  return ( !BLAZE_BLAS_MODE ||
2962  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2963  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
2964  ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
2965  }
2966  //**********************************************************************************************
2967 
2968  private:
2969  //**Member variables****************************************************************************
2970  LeftOperand vector_; //!< Left-hand side vector/matrix multiplication expression.
2971  RightOperand scalar_; //!< Right-hand side scalar of the multiplication expression.
2972  //**********************************************************************************************
2973 
2974  //**Assignment to dense vectors*****************************************************************
/*!\brief Assignment of a scaled vector/matrix multiplication expression to a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void
//
// If the matrix operand has no rows, the target vector is reset; if it has no columns,
// nothing is done. Otherwise both operands of the wrapped multiplication are evaluated
// serially into objects of type LT and RT and passed, together with the scalar, to
// selectAssignKernel().
// NOTE(review): one line at the top of the body is elided in this listing.
*/
2986  template< typename VT1 // Type of the target dense vector
2987  , bool TF > // Transpose flag of the target dense vector
2988  friend inline void assign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
2989  {
2991 
2992  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2993 
      // Operands of the wrapped vector/matrix multiplication.
2994  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
2995  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
2996 
      // Degenerate shapes: no rows -> reset the target; no columns -> nothing to do.
2997  if( right.rows() == 0UL ) {
2998  reset( ~lhs );
2999  return;
3000  }
3001  else if( right.columns() == 0UL ) {
3002  return;
3003  }
3004 
3005  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3006  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3007 
3008  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3009  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3010  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3011  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3012 
3013  DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
3014  }
3015  //**********************************************************************************************
3016 
3017  //**Assignment to dense vectors (kernel selection)**********************************************
/*!\brief Selection of the kernel for an assignment of a scaled vector/matrix multiplication.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// The small-matrix kernel is chosen if the matrix type is diagonal, if the matrix operand MT
// is a computation that is not evaluated, or if the number of matrix elements is below
// TDVECTDMATMULT_THRESHOLD. In all other cases the BLAS kernel selection is used.
*/
3028  template< typename VT1 // Type of the left-hand side target vector
3029  , typename VT2 // Type of the left-hand side vector operand
3030  , typename MT1 // Type of the right-hand side matrix operand
3031  , typename ST2 > // Type of the scalar value
3032  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3033  {
3034  if( ( IsDiagonal<MT1>::value ) ||
3035  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3036  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3037  selectSmallAssignKernel( y, x, A, scalar );
3038  else
3039  selectBlasAssignKernel( y, x, A, scalar );
3040  }
3041  //**********************************************************************************************
3042 
3043  //**Default assignment to dense vectors*********************************************************
/*!\brief Default assignment kernel for a scaled vector/matrix multiplication.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// Builds the expression x * A * scalar and passes it to the assign() member function of the
// target vector.
*/
3057  template< typename VT1 // Type of the left-hand side target vector
3058  , typename VT2 // Type of the left-hand side vector operand
3059  , typename MT1 // Type of the right-hand side matrix operand
3060  , typename ST2 > // Type of the scalar value
3061  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3062  {
3063  y.assign( x * A * scalar );
3064  }
3065  //**********************************************************************************************
3066 
3067  //**Default assignment to dense vectors (small matrices)****************************************
/*!\brief Fallback overload of the small-matrix assignment kernel for a scaled multiplication.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
//
// This overload performs no computation of its own: it passes all four arguments on to
// selectDefaultAssignKernel().
// NOTE(review): the declaration line carrying the return type (and any overload-selection
// condition) is not shown in this listing -- confirm it against the original header.
*/
3081  template< typename VT1 // Type of the left-hand side target vector
3082  , typename VT2 // Type of the left-hand side vector operand
3083  , typename MT1 // Type of the right-hand side matrix operand
3084  , typename ST2 > // Type of the scalar value
3086  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3087  {
      // Delegate to the default assignment kernel.
3088  selectDefaultAssignKernel( y, x, A, scalar );
3089  }
3090  //**********************************************************************************************
3091 
3092  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Vectorized assignment kernel for small matrices.
//
// Writes y[j] = ( sum over i of x[i] * A(i,j) ) * scalar for every column j of A.
// The columns are processed in blocks of 8, then 4, then 3, then 2, and finally a single
// leftover column. Within each block the rows are traversed once: a SIMD loop accumulates
// into one SIMD register per column, the horizontal sums are scaled and stored into y, and
// (only if 'remainder' is set) a scalar loop adds the rows the SIMD loop did not cover.
//
// Parameters:
//   y      - target vector; every element in [0,N) is overwritten
//   x      - left-hand side vector operand, read via load(i) and operator[]
//   A      - right-hand side matrix operand, read via load(i,j) and operator()
//   scalar - scaling factor applied to every result element
//
// NOTE(review): the line carrying the return type (listing line 3111) is not part of this
// listing; presumably it holds the SFINAE guard enabling this overload -- confirm.
// NOTE(review): the SIMD accumulators are declared without an initializer and then used
// with +=, so this relies on SIMDType's default constructor yielding zero -- confirm.
3107  template< typename VT1 // Type of the left-hand side target vector
3108  , typename VT2 // Type of the left-hand side vector operand
3109  , typename MT1 // Type of the right-hand side matrix operand
3110  , typename ST2 > // Type of the scalar value
3112  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3113  {
// A scalar clean-up loop is needed unless both the vector and the matrix type are padded.
3114  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3115 
3116  const size_t M( A.rows() );
3117  const size_t N( A.columns() );
3118 
3119  size_t j( 0UL );
3120 
// Blocks of eight columns.
3121  for( ; (j+8UL) <= N; j+=8UL )
3122  {
// Row range [ibegin,iend) visited for this column block. For lower matrices the start is
// row j (j+1 if strictly lower) masked with size_t(-SIMDSIZE); for upper matrices the end
// is limited to the last column of the block (one less if strictly upper); otherwise the
// full range [0,M) is used.
3123  const size_t ibegin( ( IsLower<MT1>::value )
3124  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3125  :( 0UL ) );
3126  const size_t iend( ( IsUpper<MT1>::value )
3127  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3128  :( M ) );
3129  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3130 
// End of the SIMD loop: iend rounded down to a multiple of SIMDSIZE when a remainder loop
// follows (verified by the assertion), otherwise iend itself.
3131  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3132  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3133 
3134  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3135  size_t i( ibegin );
3136 
// One load of x is shared by all eight column accumulators.
3137  for( ; i<ipos; i+=SIMDSIZE ) {
3138  const SIMDType x1( x.load(i) );
3139  xmm1 += x1 * A.load(i,j );
3140  xmm2 += x1 * A.load(i,j+1UL);
3141  xmm3 += x1 * A.load(i,j+2UL);
3142  xmm4 += x1 * A.load(i,j+3UL);
3143  xmm5 += x1 * A.load(i,j+4UL);
3144  xmm6 += x1 * A.load(i,j+5UL);
3145  xmm7 += x1 * A.load(i,j+6UL);
3146  xmm8 += x1 * A.load(i,j+7UL);
3147  }
3148 
// Plain assignment: previous contents of y are discarded here.
3149  y[j ] = sum( xmm1 ) * scalar;
3150  y[j+1UL] = sum( xmm2 ) * scalar;
3151  y[j+2UL] = sum( xmm3 ) * scalar;
3152  y[j+3UL] = sum( xmm4 ) * scalar;
3153  y[j+4UL] = sum( xmm5 ) * scalar;
3154  y[j+5UL] = sum( xmm6 ) * scalar;
3155  y[j+6UL] = sum( xmm7 ) * scalar;
3156  y[j+7UL] = sum( xmm8 ) * scalar;
3157 
// Scalar clean-up for rows [ipos,iend); each contribution is scaled individually.
3158  for( ; remainder && i<iend; ++i ) {
3159  y[j ] += x[i] * A(i,j ) * scalar;
3160  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3161  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3162  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3163  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3164  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3165  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3166  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3167  }
3168  }
3169 
// Blocks of four columns (same scheme as above).
3170  for( ; (j+4UL) <= N; j+=4UL )
3171  {
3172  const size_t ibegin( ( IsLower<MT1>::value )
3173  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3174  :( 0UL ) );
3175  const size_t iend( ( IsUpper<MT1>::value )
3176  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3177  :( M ) );
3178  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3179 
3180  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3181  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3182 
3183  SIMDType xmm1, xmm2, xmm3, xmm4;
3184  size_t i( ibegin );
3185 
3186  for( ; i<ipos; i+=SIMDSIZE ) {
3187  const SIMDType x1( x.load(i) );
3188  xmm1 += x1 * A.load(i,j );
3189  xmm2 += x1 * A.load(i,j+1UL);
3190  xmm3 += x1 * A.load(i,j+2UL);
3191  xmm4 += x1 * A.load(i,j+3UL);
3192  }
3193 
3194  y[j ] = sum( xmm1 ) * scalar;
3195  y[j+1UL] = sum( xmm2 ) * scalar;
3196  y[j+2UL] = sum( xmm3 ) * scalar;
3197  y[j+3UL] = sum( xmm4 ) * scalar;
3198 
3199  for( ; remainder && i<iend; ++i ) {
3200  y[j ] += x[i] * A(i,j ) * scalar;
3201  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3202  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3203  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3204  }
3205  }
3206 
// Blocks of three columns.
3207  for( ; (j+3UL) <= N; j+=3UL )
3208  {
3209  const size_t ibegin( ( IsLower<MT1>::value )
3210  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3211  :( 0UL ) );
3212  const size_t iend( ( IsUpper<MT1>::value )
3213  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3214  :( M ) );
3215  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3216 
3217  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3218  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3219 
3220  SIMDType xmm1, xmm2, xmm3;
3221  size_t i( ibegin );
3222 
3223  for( ; i<ipos; i+=SIMDSIZE ) {
3224  const SIMDType x1( x.load(i) );
3225  xmm1 += x1 * A.load(i,j );
3226  xmm2 += x1 * A.load(i,j+1UL);
3227  xmm3 += x1 * A.load(i,j+2UL);
3228  }
3229 
3230  y[j ] = sum( xmm1 ) * scalar;
3231  y[j+1UL] = sum( xmm2 ) * scalar;
3232  y[j+2UL] = sum( xmm3 ) * scalar;
3233 
3234  for( ; remainder && i<iend; ++i ) {
3235  y[j ] += x[i] * A(i,j ) * scalar;
3236  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3237  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3238  }
3239  }
3240 
// Blocks of two columns.
3241  for( ; (j+2UL) <= N; j+=2UL )
3242  {
3243  const size_t ibegin( ( IsLower<MT1>::value )
3244  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3245  :( 0UL ) );
3246  const size_t iend( ( IsUpper<MT1>::value )
3247  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3248  :( M ) );
3249  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3250 
3251  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3252  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3253 
3254  SIMDType xmm1, xmm2;
3255  size_t i( ibegin );
3256 
3257  for( ; i<ipos; i+=SIMDSIZE ) {
3258  const SIMDType x1( x.load(i) );
3259  xmm1 += x1 * A.load(i,j );
3260  xmm2 += x1 * A.load(i,j+1UL);
3261  }
3262 
3263  y[j ] = sum( xmm1 ) * scalar;
3264  y[j+1UL] = sum( xmm2 ) * scalar;
3265 
3266  for( ; remainder && i<iend; ++i ) {
3267  y[j ] += x[i] * A(i,j ) * scalar;
3268  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3269  }
3270  }
3271 
// At most one column is left at this point.
3272  if( j < N )
3273  {
3274  const size_t ibegin( ( IsLower<MT1>::value )
3275  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3276  :( 0UL ) );
3277  const size_t iend( ( IsUpper<MT1>::value )
3278  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3279  :( M ) );
3280  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3281 
3282  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3283  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3284 
3285  SIMDType xmm1;
3286  size_t i( ibegin );
3287 
3288  for( ; i<ipos; i+=SIMDSIZE ) {
3289  xmm1 += A.load(i,j) * x.load(i);
3290  }
3291 
3292  y[j] = sum( xmm1 ) * scalar;
3293 
3294  for( ; remainder && i<iend; ++i ) {
3295  y[j] += x[i] * A(i,j) * scalar;
3296  }
3297  }
3298  }
3299  //**********************************************************************************************
3300 
3301  //**Default assignment to dense vectors (large matrices)****************************************
// Fallback assignment kernel for large matrices: it has no implementation of its own and
// simply forwards y, x, A and scalar unchanged to selectDefaultAssignKernel().
// NOTE(review): the line carrying the return type (listing line 3319) is not part of this
// listing; presumably it holds the SFINAE guard that separates this overload from the
// vectorized overload of the same name -- confirm against the original header.
3315  template< typename VT1 // Type of the left-hand side target vector
3316  , typename VT2 // Type of the left-hand side vector operand
3317  , typename MT1 // Type of the right-hand side matrix operand
3318  , typename ST2 > // Type of the scalar value
3320  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3321  {
3322  selectDefaultAssignKernel( y, x, A, scalar );
3323  }
3324  //**********************************************************************************************
3325 
3326  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Vectorized assignment kernel for large matrices.
//
// Computes y[j] = ( sum over i of x[i] * A(i,j) ) * scalar for every column j of A.
// y is reset first and then used directly as the accumulator. The columns are processed in
// blocks of 8, then 4, then 2, and finally a single leftover column. Within each block the
// rows are traversed with the SIMD loop unrolled four times, then twice, then once; a scalar
// loop (only if 'remainder' is set) handles the rows the SIMD loops did not cover. The
// scaling by 'scalar' is applied once per element after all rows have been accumulated.
//
// Parameters:
//   y      - target vector; reset and then overwritten with the result
//   x      - left-hand side vector operand, read via load(i) and operator[]
//   A      - right-hand side matrix operand, read via load(i,j) and operator()
//   scalar - scaling factor applied to every result element
//
// NOTE(review): the line carrying the return type (listing line 3345) is not part of this
// listing; presumably it holds the SFINAE guard enabling this overload -- confirm.
3341  template< typename VT1 // Type of the left-hand side target vector
3342  , typename VT2 // Type of the left-hand side vector operand
3343  , typename MT1 // Type of the right-hand side matrix operand
3344  , typename ST2 > // Type of the scalar value
3346  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3347  {
// A scalar clean-up loop is needed unless both the vector and the matrix type are padded.
3348  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3349 
3350  const size_t M( A.rows() );
3351  const size_t N( A.columns() );
3352 
// All loops below accumulate with +=, so y is reset before anything is added.
3353  reset( y );
3354 
3355  size_t j( 0UL );
3356 
// Blocks of eight columns.
3357  for( ; (j+8UL) <= N; j+=8UL )
3358  {
// Row range [ibegin,iend) visited for this column block. For lower matrices the start is
// row j (j+1 if strictly lower) masked with size_t(-SIMDSIZE); for upper matrices the end
// is limited to the last column of the block (one less if strictly upper); otherwise the
// full range [0,M) is used.
3359  const size_t ibegin( ( IsLower<MT1>::value )
3360  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3361  :( 0UL ) );
3362  const size_t iend( ( IsUpper<MT1>::value )
3363  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3364  :( M ) );
3365  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3366 
// End of the SIMD loops: iend rounded down to a multiple of SIMDSIZE when a remainder loop
// follows (verified by the assertion), otherwise iend itself.
3367  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3368  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3369 
3370  size_t i( ibegin );
3371 
// Four SIMD vectors of rows per iteration.
3372  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3373  const size_t i1( i+SIMDSIZE );
3374  const size_t i2( i+SIMDSIZE*2UL );
3375  const size_t i3( i+SIMDSIZE*3UL );
3376  const SIMDType x1( x.load(i ) );
3377  const SIMDType x2( x.load(i1) );
3378  const SIMDType x3( x.load(i2) );
3379  const SIMDType x4( x.load(i3) );
3380  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3381  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3382  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3383  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3384  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3385  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3386  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3387  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
3388  }
3389 
// Two SIMD vectors of rows per iteration.
3390  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3391  const size_t i1( i+SIMDSIZE );
3392  const SIMDType x1( x.load(i ) );
3393  const SIMDType x2( x.load(i1) );
3394  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3395  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3396  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3397  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3398  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3399  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3400  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3401  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
3402  }
3403 
// One SIMD vector of rows per iteration.
3404  for( ; i<ipos; i+=SIMDSIZE ) {
3405  const SIMDType x1( x.load(i) );
3406  y[j ] += sum( x1 * A.load(i,j ) );
3407  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3408  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3409  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3410  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
3411  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
3412  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
3413  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
3414  }
3415 
// Scalar clean-up for rows [ipos,iend).
3416  for( ; remainder && i<iend; ++i ) {
3417  y[j ] += x[i] * A(i,j );
3418  y[j+1UL] += x[i] * A(i,j+1UL);
3419  y[j+2UL] += x[i] * A(i,j+2UL);
3420  y[j+3UL] += x[i] * A(i,j+3UL);
3421  y[j+4UL] += x[i] * A(i,j+4UL);
3422  y[j+5UL] += x[i] * A(i,j+5UL);
3423  y[j+6UL] += x[i] * A(i,j+6UL);
3424  y[j+7UL] += x[i] * A(i,j+7UL);
3425  }
3426 
// The scaling is applied once, after the complete dot products have been accumulated.
3427  y[j ] *= scalar;
3428  y[j+1UL] *= scalar;
3429  y[j+2UL] *= scalar;
3430  y[j+3UL] *= scalar;
3431  y[j+4UL] *= scalar;
3432  y[j+5UL] *= scalar;
3433  y[j+6UL] *= scalar;
3434  y[j+7UL] *= scalar;
3435  }
3436 
// Blocks of four columns (same scheme as above).
3437  for( ; (j+4UL) <= N; j+=4UL )
3438  {
3439  const size_t ibegin( ( IsLower<MT1>::value )
3440  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3441  :( 0UL ) );
3442  const size_t iend( ( IsUpper<MT1>::value )
3443  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3444  :( M ) );
3445  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3446 
3447  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3448  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3449 
3450  size_t i( ibegin );
3451 
3452  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3453  const size_t i1( i+SIMDSIZE );
3454  const size_t i2( i+SIMDSIZE*2UL );
3455  const size_t i3( i+SIMDSIZE*3UL );
3456  const SIMDType x1( x.load(i ) );
3457  const SIMDType x2( x.load(i1) );
3458  const SIMDType x3( x.load(i2) );
3459  const SIMDType x4( x.load(i3) );
3460  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3461  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3462  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3463  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3464  }
3465 
3466  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3467  const size_t i1( i+SIMDSIZE );
3468  const SIMDType x1( x.load(i ) );
3469  const SIMDType x2( x.load(i1) );
3470  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3471  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3472  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3473  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3474  }
3475 
3476  for( ; i<ipos; i+=SIMDSIZE ) {
3477  const SIMDType x1( x.load(i) );
3478  y[j ] += sum( x1 * A.load(i,j ) );
3479  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3480  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3481  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3482  }
3483 
3484  for( ; remainder && i<iend; ++i ) {
3485  y[j ] += x[i] * A(i,j );
3486  y[j+1UL] += x[i] * A(i,j+1UL);
3487  y[j+2UL] += x[i] * A(i,j+2UL);
3488  y[j+3UL] += x[i] * A(i,j+3UL);
3489  }
3490 
3491  y[j ] *= scalar;
3492  y[j+1UL] *= scalar;
3493  y[j+2UL] *= scalar;
3494  y[j+3UL] *= scalar;
3495  }
3496 
// Blocks of two columns.
3497  for( ; (j+2UL) <= N; j+=2UL )
3498  {
3499  const size_t ibegin( ( IsLower<MT1>::value )
3500  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3501  :( 0UL ) );
3502  const size_t iend( ( IsUpper<MT1>::value )
3503  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3504  :( M ) );
3505  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3506 
3507  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3508  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3509 
3510  size_t i( ibegin );
3511 
3512  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3513  const size_t i1( i+SIMDSIZE );
3514  const size_t i2( i+SIMDSIZE*2UL );
3515  const size_t i3( i+SIMDSIZE*3UL );
3516  const SIMDType x1( x.load(i ) );
3517  const SIMDType x2( x.load(i1) );
3518  const SIMDType x3( x.load(i2) );
3519  const SIMDType x4( x.load(i3) );
3520  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3521  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3522  }
3523 
3524  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3525  const size_t i1( i+SIMDSIZE );
3526  const SIMDType x1( x.load(i ) );
3527  const SIMDType x2( x.load(i1) );
3528  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3529  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3530  }
3531 
3532  for( ; i<ipos; i+=SIMDSIZE ) {
3533  const SIMDType x1( x.load(i) );
3534  y[j ] += sum( x1 * A.load(i,j ) );
3535  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3536  }
3537 
3538  for( ; remainder && i<iend; ++i ) {
3539  y[j ] += x[i] * A(i,j );
3540  y[j+1UL] += x[i] * A(i,j+1UL);
3541  }
3542 
3543  y[j ] *= scalar;
3544  y[j+1UL] *= scalar;
3545  }
3546 
// At most one column is left at this point.
3547  if( j < N )
3548  {
3549  const size_t ibegin( ( IsLower<MT1>::value )
3550  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3551  :( 0UL ) );
3552  const size_t iend( ( IsUpper<MT1>::value )
3553  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3554  :( M ) );
3555  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3556 
3557  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3558  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3559 
3560  size_t i( ibegin );
3561 
3562  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3563  const size_t i1( i+SIMDSIZE );
3564  const size_t i2( i+SIMDSIZE*2UL );
3565  const size_t i3( i+SIMDSIZE*3UL );
3566  const SIMDType x1( x.load(i ) );
3567  const SIMDType x2( x.load(i1) );
3568  const SIMDType x3( x.load(i2) );
3569  const SIMDType x4( x.load(i3) );
3570  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
3571  }
3572 
3573  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3574  const size_t i1( i+SIMDSIZE );
3575  const SIMDType x1( x.load(i ) );
3576  const SIMDType x2( x.load(i1) );
3577  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
3578  }
3579 
3580  for( ; i<ipos; i+=SIMDSIZE ) {
3581  const SIMDType x1( x.load(i) );
3582  y[j] += sum( x1 * A.load(i,j) );
3583  }
3584 
3585  for( ; remainder && i<iend; ++i ) {
3586  y[j] += x[i] * A(i,j);
3587  }
3588 
3589  y[j] *= scalar;
3590  }
3591  }
3592  //**********************************************************************************************
3593 
3594  //**BLAS-based assignment to dense vectors (default)********************************************
// Default "BLAS" assignment kernel: performs no BLAS call itself and simply forwards
// y, x, A and scalar unchanged to selectLargeAssignKernel().
// NOTE(review): the line carrying the return type (listing line 3611) is not part of this
// listing; presumably it holds the SFINAE guard that separates this overload from the
// BLAS-backed overload of the same name -- confirm against the original header.
3607  template< typename VT1 // Type of the left-hand side target vector
3608  , typename VT2 // Type of the left-hand side vector operand
3609  , typename MT1 // Type of the right-hand side matrix operand
3610  , typename ST2 > // Type of the scalar value
3612  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3613  {
3614  selectLargeAssignKernel( y, x, A, scalar );
3615  }
3616  //**********************************************************************************************
3617 
3618  //**BLAS-based assignment to dense vectors******************************************************
3619 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3620 
// BLAS-backed assignment kernel (only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are both enabled).
//
// For triangular matrix types the scaled vector (scalar * x) is first assigned to y and
// trmv() is then called on y with A and the matching CblasLower/CblasUpper flag. For all
// other matrix types gemv() is called with the scalar converted to the target element
// type and a second factor of ET(0).
//
// NOTE(review): presumably trmv() multiplies y by A in place and gemv() computes
// y = ET(scalar) * x * A + ET(0) * y -- confirm against the Blaze BLAS wrapper docs.
// NOTE(review): the line carrying the return type (listing line 3637) is not part of this
// listing; presumably it holds the SFINAE guard enabling this overload -- confirm.
3633  template< typename VT1 // Type of the left-hand side target vector
3634  , typename VT2 // Type of the left-hand side vector operand
3635  , typename MT1 // Type of the right-hand side matrix operand
3636  , typename ST2 > // Type of the scalar value
3638  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3639  {
// Element type of the target vector; both gemv() factors are converted to it.
3640  using ET = ElementType_<VT1>;
3641 
3642  if( IsTriangular<MT1>::value ) {
3643  assign( y, scalar * x );
3644  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3645  }
3646  else {
3647  gemv( y, x, A, ET(scalar), ET(0) );
3648  }
3649  }
3650 #endif
3651  //**********************************************************************************************
3652 
3653  //**Assignment to sparse vectors****************************************************************
// Assignment of the scaled vector/matrix multiplication to a sparse vector.
//
// The expression is first evaluated serially into a temporary of type ResultType, which
// is then assigned to the sparse target.
//
// Parameters:
//   lhs - target sparse vector; its size must equal rhs.size() (checked by an internal
//         assertion only)
//   rhs - the scaled multiplication expression to be assigned
3665  template< typename VT1 // Type of the target sparse vector
3666  , bool TF > // Transpose flag of the target sparse vector
3667  friend inline void assign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3668  {
3670 
3674 
3675  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3676 
// serial() is applied to the expression before it is evaluated into the temporary.
3677  const ResultType tmp( serial( rhs ) );
3678  assign( ~lhs, tmp );
3679  }
3680  //**********************************************************************************************
3681 
3682  //**Addition assignment to dense vectors********************************************************
// Addition assignment of the scaled vector/matrix multiplication to a dense vector.
//
// Fetches the two operands of the nested multiplication (rhs.vector_), returns without
// touching lhs if the matrix has no rows or no columns, evaluates both operands serially
// into x and A, and then hands over to selectAddAssignKernel() together with the scalar
// stored in rhs.scalar_.
//
// Parameters:
//   lhs - target dense vector; its size must equal rhs.size() (checked by an internal
//         assertion only)
//   rhs - the scaled multiplication expression to be added
3694  template< typename VT1 // Type of the target dense vector
3695  , bool TF > // Transpose flag of the target dense vector
3696  friend inline void addAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3697  {
3699 
3700  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3701 
3702  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3703  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3704 
// Nothing is added for a matrix without rows or columns.
3705  if( right.rows() == 0UL || right.columns() == 0UL ) {
3706  return;
3707  }
3708 
3709  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3710  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3711 
// The evaluated operands must have the same dimensions as the unevaluated ones.
3712  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3713  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3714  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3715  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3716 
3717  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3718  }
3719  //**********************************************************************************************
3720 
3721  //**Addition assignment to dense vectors (kernel selection)*************************************
// Kernel selection for the addition assignment of a scaled vector/matrix multiplication.
//
// The small-matrix kernel is chosen if any of the following holds:
//   - the matrix operand type MT1 is diagonal,
//   - the original matrix type MT is a computation and evaluateMatrix is false,
//   - the number of matrix elements (rows * columns) is below the threshold.
// Otherwise the BLAS kernel selection is used.
//
// Parameters:
//   y      - target vector the result is added to
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor of the multiplication
//
// NOTE(review): the threshold used here is TDVECDMATMULT_THRESHOLD, whereas this class is
// TDVecTDMatMultExpr; check whether TDVECTDMATMULT_THRESHOLD was intended and whether the
// other kernel selection functions of this class use the same constant -- confirm.
3732  template< typename VT1 // Type of the left-hand side target vector
3733  , typename VT2 // Type of the left-hand side vector operand
3734  , typename MT1 // Type of the right-hand side matrix operand
3735  , typename ST2 > // Type of the scalar value
3736  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3737  {
3738  if( ( IsDiagonal<MT1>::value ) ||
3739  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3740  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3741  selectSmallAddAssignKernel( y, x, A, scalar );
3742  else
3743  selectBlasAddAssignKernel( y, x, A, scalar );
3744  }
3745  //**********************************************************************************************
3746 
3747  //**Default addition assignment to dense vectors************************************************
// Default addition assignment kernel: builds the expression x * A * scalar and passes it
// to the addAssign() member function of the target vector y.
//
// Parameters:
//   y      - target vector the result is added to
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor of the multiplication
3761  template< typename VT1 // Type of the left-hand side target vector
3762  , typename VT2 // Type of the left-hand side vector operand
3763  , typename MT1 // Type of the right-hand side matrix operand
3764  , typename ST2 > // Type of the scalar value
3765  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3766  {
3767  y.addAssign( x * A * scalar );
3768  }
3769  //**********************************************************************************************
3770 
3771  //**Default addition assignment to dense vectors (small matrices)*******************************
// Fallback addition assignment kernel for small matrices: it has no implementation of its
// own and simply forwards y, x, A and scalar unchanged to selectDefaultAddAssignKernel().
// NOTE(review): the line carrying the return type (listing line 3789) is not part of this
// listing; presumably it holds the SFINAE guard that separates this overload from the
// vectorized overload of the same name -- confirm against the original header.
3785  template< typename VT1 // Type of the left-hand side target vector
3786  , typename VT2 // Type of the left-hand side vector operand
3787  , typename MT1 // Type of the right-hand side matrix operand
3788  , typename ST2 > // Type of the scalar value
3790  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3791  {
3792  selectDefaultAddAssignKernel( y, x, A, scalar );
3793  }
3794  //**********************************************************************************************
3795 
3796  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized addition assignment kernel for small matrices.
//
// Adds ( sum over i of x[i] * A(i,j) ) * scalar to y[j] for every column j of A.
// The columns are processed in blocks of 8, then 4, then 3, then 2, and finally a single
// leftover column. Within each block the rows are traversed once: a SIMD loop accumulates
// into one SIMD register per column, the scaled horizontal sums are added to y, and (only
// if 'remainder' is set) a scalar loop adds the rows the SIMD loop did not cover.
//
// Parameters:
//   y      - target vector; the result is added to its current contents
//   x      - left-hand side vector operand, read via load(i) and operator[]
//   A      - right-hand side matrix operand, read via load(i,j) and operator()
//   scalar - scaling factor applied to every contribution
//
// NOTE(review): the line carrying the return type (listing line 3815) is not part of this
// listing; presumably it holds the SFINAE guard enabling this overload -- confirm.
// NOTE(review): the SIMD accumulators are declared without an initializer and then used
// with +=, so this relies on SIMDType's default constructor yielding zero -- confirm.
3811  template< typename VT1 // Type of the left-hand side target vector
3812  , typename VT2 // Type of the left-hand side vector operand
3813  , typename MT1 // Type of the right-hand side matrix operand
3814  , typename ST2 > // Type of the scalar value
3816  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3817  {
// A scalar clean-up loop is needed unless both the vector and the matrix type are padded.
3818  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3819 
3820  const size_t M( A.rows() );
3821  const size_t N( A.columns() );
3822 
3823  size_t j( 0UL );
3824 
// Blocks of eight columns.
3825  for( ; (j+8UL) <= N; j+=8UL )
3826  {
// Row range [ibegin,iend) visited for this column block. For lower matrices the start is
// row j (j+1 if strictly lower) masked with size_t(-SIMDSIZE); for upper matrices the end
// is limited to the last column of the block (one less if strictly upper); otherwise the
// full range [0,M) is used.
3827  const size_t ibegin( ( IsLower<MT1>::value )
3828  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3829  :( 0UL ) );
3830  const size_t iend( ( IsUpper<MT1>::value )
3831  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3832  :( M ) );
3833  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3834 
// End of the SIMD loop: iend rounded down to a multiple of SIMDSIZE when a remainder loop
// follows (verified by the assertion), otherwise iend itself.
3835  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3836  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3837 
3838  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3839  size_t i( ibegin );
3840 
// One load of x is shared by all eight column accumulators.
3841  for( ; i<ipos; i+=SIMDSIZE ) {
3842  const SIMDType x1( x.load(i) );
3843  xmm1 += x1 * A.load(i,j );
3844  xmm2 += x1 * A.load(i,j+1UL);
3845  xmm3 += x1 * A.load(i,j+2UL);
3846  xmm4 += x1 * A.load(i,j+3UL);
3847  xmm5 += x1 * A.load(i,j+4UL);
3848  xmm6 += x1 * A.load(i,j+5UL);
3849  xmm7 += x1 * A.load(i,j+6UL);
3850  xmm8 += x1 * A.load(i,j+7UL);
3851  }
3852 
// Addition assignment: the scaled sums are added to the existing contents of y.
3853  y[j ] += sum( xmm1 ) * scalar;
3854  y[j+1UL] += sum( xmm2 ) * scalar;
3855  y[j+2UL] += sum( xmm3 ) * scalar;
3856  y[j+3UL] += sum( xmm4 ) * scalar;
3857  y[j+4UL] += sum( xmm5 ) * scalar;
3858  y[j+5UL] += sum( xmm6 ) * scalar;
3859  y[j+6UL] += sum( xmm7 ) * scalar;
3860  y[j+7UL] += sum( xmm8 ) * scalar;
3861 
// Scalar clean-up for rows [ipos,iend); each contribution is scaled individually.
3862  for( ; remainder && i<iend; ++i ) {
3863  y[j ] += x[i] * A(i,j ) * scalar;
3864  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3865  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3866  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3867  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3868  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3869  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3870  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3871  }
3872  }
3873 
// Blocks of four columns (same scheme as above).
3874  for( ; (j+4UL) <= N; j+=4UL )
3875  {
3876  const size_t ibegin( ( IsLower<MT1>::value )
3877  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3878  :( 0UL ) );
3879  const size_t iend( ( IsUpper<MT1>::value )
3880  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3881  :( M ) );
3882  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3883 
3884  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3885  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3886 
3887  SIMDType xmm1, xmm2, xmm3, xmm4;
3888  size_t i( ibegin );
3889 
3890  for( ; i<ipos; i+=SIMDSIZE ) {
3891  const SIMDType x1( x.load(i) );
3892  xmm1 += x1 * A.load(i,j );
3893  xmm2 += x1 * A.load(i,j+1UL);
3894  xmm3 += x1 * A.load(i,j+2UL);
3895  xmm4 += x1 * A.load(i,j+3UL);
3896  }
3897 
3898  y[j ] += sum( xmm1 ) * scalar;
3899  y[j+1UL] += sum( xmm2 ) * scalar;
3900  y[j+2UL] += sum( xmm3 ) * scalar;
3901  y[j+3UL] += sum( xmm4 ) * scalar;
3902 
3903  for( ; remainder && i<iend; ++i ) {
3904  y[j ] += x[i] * A(i,j ) * scalar;
3905  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3906  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3907  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3908  }
3909  }
3910 
// Blocks of three columns.
3911  for( ; (j+3UL) <= N; j+=3UL )
3912  {
3913  const size_t ibegin( ( IsLower<MT1>::value )
3914  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3915  :( 0UL ) );
3916  const size_t iend( ( IsUpper<MT1>::value )
3917  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3918  :( M ) );
3919  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3920 
3921  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3922  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3923 
3924  SIMDType xmm1, xmm2, xmm3;
3925  size_t i( ibegin );
3926 
3927  for( ; i<ipos; i+=SIMDSIZE ) {
3928  const SIMDType x1( x.load(i) );
3929  xmm1 += x1 * A.load(i,j );
3930  xmm2 += x1 * A.load(i,j+1UL);
3931  xmm3 += x1 * A.load(i,j+2UL);
3932  }
3933 
3934  y[j ] += sum( xmm1 ) * scalar;
3935  y[j+1UL] += sum( xmm2 ) * scalar;
3936  y[j+2UL] += sum( xmm3 ) * scalar;
3937 
3938  for( ; remainder && i<iend; ++i ) {
3939  y[j ] += x[i] * A(i,j ) * scalar;
3940  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3941  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3942  }
3943  }
3944 
// Blocks of two columns.
3945  for( ; (j+2UL) <= N; j+=2UL )
3946  {
3947  const size_t ibegin( ( IsLower<MT1>::value )
3948  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3949  :( 0UL ) );
3950  const size_t iend( ( IsUpper<MT1>::value )
3951  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3952  :( M ) );
3953  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3954 
3955  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3956  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3957 
3958  SIMDType xmm1, xmm2;
3959  size_t i( ibegin );
3960 
3961  for( ; i<ipos; i+=SIMDSIZE ) {
3962  const SIMDType x1( x.load(i) );
3963  xmm1 += x1 * A.load(i,j );
3964  xmm2 += x1 * A.load(i,j+1UL);
3965  }
3966 
3967  y[j ] += sum( xmm1 ) * scalar;
3968  y[j+1UL] += sum( xmm2 ) * scalar;
3969 
3970  for( ; remainder && i<iend; ++i ) {
3971  y[j ] += x[i] * A(i,j ) * scalar;
3972  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3973  }
3974  }
3975 
// At most one column is left at this point.
3976  if( j < N )
3977  {
3978  const size_t ibegin( ( IsLower<MT1>::value )
3979  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3980  :( 0UL ) );
3981  const size_t iend( ( IsUpper<MT1>::value )
3982  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3983  :( M ) );
3984  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3985 
3986  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3987  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3988 
3989  SIMDType xmm1;
3990  size_t i( ibegin );
3991 
3992  for( ; i<ipos; i+=SIMDSIZE ) {
3993  xmm1 += A.load(i,j) * x.load(i);
3994  }
3995 
3996  y[j] += sum( xmm1 ) * scalar;
3997 
3998  for( ; remainder && i<iend; ++i ) {
3999  y[j] += x[i] * A(i,j) * scalar;
4000  }
4001  }
4002  }
4003  //**********************************************************************************************
4004 
4005  //**Default addition assignment to dense vectors (large matrices)*******************************
// Fallback addition assignment kernel for large matrices: it has no implementation of its
// own and simply forwards y, x, A and scalar unchanged to selectDefaultAddAssignKernel().
// NOTE(review): the line carrying the return type (listing line 4023) is not part of this
// listing; presumably it holds the SFINAE guard that separates this overload from the
// vectorized overload of the same name -- confirm against the original header.
4019  template< typename VT1 // Type of the left-hand side target vector
4020  , typename VT2 // Type of the left-hand side vector operand
4021  , typename MT1 // Type of the right-hand side matrix operand
4022  , typename ST2 > // Type of the scalar value
4024  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4025  {
4026  selectDefaultAddAssignKernel( y, x, A, scalar );
4027  }
4028  //**********************************************************************************************
4029 
4030  //**Vectorized default addition assignment to dense vectors (large matrices)********************
4045  template< typename VT1 // Type of the left-hand side target vector
4046  , typename VT2 // Type of the left-hand side vector operand
4047  , typename MT1 // Type of the right-hand side matrix operand
4048  , typename ST2 > // Type of the scalar value
4050  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4051  {
4052  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4053 
4054  const size_t M( A.rows() );
4055  const size_t N( A.columns() );
4056 
4057  size_t j( 0UL );
4058 
4059  for( ; (j+8UL) <= N; j+=8UL )
4060  {
4061  const size_t ibegin( ( IsLower<MT1>::value )
4062  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4063  :( 0UL ) );
4064  const size_t iend( ( IsUpper<MT1>::value )
4065  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4066  :( M ) );
4067  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4068 
4069  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4070  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4071 
4072  size_t i( ibegin );
4073 
4074  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4075  const size_t i1( i+SIMDSIZE );
4076  const size_t i2( i+SIMDSIZE*2UL );
4077  const size_t i3( i+SIMDSIZE*3UL );
4078  const SIMDType x1( x.load(i ) );
4079  const SIMDType x2( x.load(i1) );
4080  const SIMDType x3( x.load(i2) );
4081  const SIMDType x4( x.load(i3) );
4082  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4083  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4084  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4085  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4086  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4087  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4088  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4089  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4090  }
4091 
4092  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4093  const size_t i1( i+SIMDSIZE );
4094  const SIMDType x1( x.load(i ) );
4095  const SIMDType x2( x.load(i1) );
4096  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4097  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4098  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4099  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4100  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4101  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4102  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4103  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4104  }
4105 
4106  for( ; i<ipos; i+=SIMDSIZE ) {
4107  const SIMDType x1( x.load(i) );
4108  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4109  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4110  y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4111  y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4112  y[j+4UL] += sum( x1 * A.load(i,j+4UL) ) * scalar;
4113  y[j+5UL] += sum( x1 * A.load(i,j+5UL) ) * scalar;
4114  y[j+6UL] += sum( x1 * A.load(i,j+6UL) ) * scalar;
4115  y[j+7UL] += sum( x1 * A.load(i,j+7UL) ) * scalar;
4116  }
4117 
4118  for( ; remainder && i<iend; ++i ) {
4119  y[j ] += x[i] * A(i,j ) * scalar;
4120  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4121  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4122  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4123  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4124  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4125  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4126  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4127  }
4128  }
4129 
4130  for( ; (j+4UL) <= N; j+=4UL )
4131  {
4132  const size_t ibegin( ( IsLower<MT1>::value )
4133  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4134  :( 0UL ) );
4135  const size_t iend( ( IsUpper<MT1>::value )
4136  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4137  :( M ) );
4138  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4139 
4140  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4141  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4142 
4143  size_t i( ibegin );
4144 
4145  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4146  const size_t i1( i+SIMDSIZE );
4147  const size_t i2( i+SIMDSIZE*2UL );
4148  const size_t i3( i+SIMDSIZE*3UL );
4149  const SIMDType x1( x.load(i ) );
4150  const SIMDType x2( x.load(i1) );
4151  const SIMDType x3( x.load(i2) );
4152  const SIMDType x4( x.load(i3) );
4153  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4154  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4155  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4156  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4157  }
4158 
4159  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4160  const size_t i1( i+SIMDSIZE );
4161  const SIMDType x1( x.load(i ) );
4162  const SIMDType x2( x.load(i1) );
4163  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4164  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4165  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4166  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4167  }
4168 
4169  for( ; i<ipos; i+=SIMDSIZE ) {
4170  const SIMDType x1( x.load(i) );
4171  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4172  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4173  y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4174  y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4175  }
4176 
4177  for( ; remainder && i<iend; ++i ) {
4178  y[j ] += x[i] * A(i,j ) * scalar;
4179  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4180  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4181  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4182  }
4183  }
4184 
4185  for( ; (j+2UL) <= N; j+=2UL )
4186  {
4187  const size_t ibegin( ( IsLower<MT1>::value )
4188  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4189  :( 0UL ) );
4190  const size_t iend( ( IsUpper<MT1>::value )
4191  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4192  :( M ) );
4193  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4194 
4195  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4196  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4197 
4198  size_t i( ibegin );
4199 
4200  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4201  const size_t i1( i+SIMDSIZE );
4202  const size_t i2( i+SIMDSIZE*2UL );
4203  const size_t i3( i+SIMDSIZE*3UL );
4204  const SIMDType x1( x.load(i ) );
4205  const SIMDType x2( x.load(i1) );
4206  const SIMDType x3( x.load(i2) );
4207  const SIMDType x4( x.load(i3) );
4208  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4209  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4210  }
4211 
4212  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4213  const size_t i1( i+SIMDSIZE );
4214  const SIMDType x1( x.load(i ) );
4215  const SIMDType x2( x.load(i1) );
4216  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4217  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4218  }
4219 
4220  for( ; i<ipos; i+=SIMDSIZE ) {
4221  const SIMDType x1( x.load(i) );
4222  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4223  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4224  }
4225 
4226  for( ; remainder && i<iend; ++i ) {
4227  y[j ] += x[i] * A(i,j ) * scalar;
4228  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4229  }
4230  }
4231 
4232  if( j < N )
4233  {
4234  const size_t ibegin( ( IsLower<MT1>::value )
4235  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4236  :( 0UL ) );
4237  const size_t iend( ( IsUpper<MT1>::value )
4238  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4239  :( M ) );
4240  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4241 
4242  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4243  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4244 
4245  size_t i( ibegin );
4246 
4247  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4248  const size_t i1( i+SIMDSIZE );
4249  const size_t i2( i+SIMDSIZE*2UL );
4250  const size_t i3( i+SIMDSIZE*3UL );
4251  const SIMDType x1( x.load(i ) );
4252  const SIMDType x2( x.load(i1) );
4253  const SIMDType x3( x.load(i2) );
4254  const SIMDType x4( x.load(i3) );
4255  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4256  }
4257 
4258  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4259  const size_t i1( i+SIMDSIZE );
4260  const SIMDType x1( x.load(i ) );
4261  const SIMDType x2( x.load(i1) );
4262  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4263  }
4264 
4265  for( ; i<ipos; i+=SIMDSIZE ) {
4266  const SIMDType x1( x.load(i) );
4267  y[j] += sum( x1 * A.load(i,j) ) * scalar;
4268  }
4269 
4270  for( ; remainder && i<iend; ++i ) {
4271  y[j] += x[i] * A(i,j) * scalar;
4272  }
4273  }
4274  }
4275  //**********************************************************************************************
4276 
4277  //**BLAS-based addition assignment to dense vectors (default)***********************************
// Fallback overload of the BLAS-based scaled addition-assignment kernel: it performs no BLAS call
// itself and simply forwards all arguments to selectLargeAddAssignKernel().
// NOTE(review): the line carrying the return type / overload-selection condition is not part of
// this listing; presumably this overload is chosen when the BLAS overload is not applicable - confirm.
4292  template< typename VT1 // Type of the left-hand side target vector
4293  , typename VT2 // Type of the left-hand side vector operand
4294  , typename MT1 // Type of the right-hand side matrix operand
4295  , typename ST2 > // Type of the scalar value
4297  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4298  {
// Delegate to the kernel for large matrices.
4299  selectLargeAddAssignKernel( y, x, A, scalar );
4300  }
4301  //**********************************************************************************************
4302 
4303  //**BLAS-based addition assignment to dense vectors*********************************************
4304 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4305 
// BLAS-based scaled addition assignment (only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are both enabled, see the surrounding #if).
//   y      : target vector that is updated
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scaling factor
// NOTE(review): the line carrying the return type / overload-selection condition is not part of
// this listing - confirm against the original header.
4318  template< typename VT1 // Type of the left-hand side target vector
4319  , typename VT2 // Type of the left-hand side vector operand
4320  , typename MT1 // Type of the right-hand side matrix operand
4321  , typename ST2 > // Type of the scalar value
4323  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4324  {
// Element type of the target vector; the gemv() scaling factors are converted to it.
4325  using ET = ElementType_<VT1>;
4326 
// Triangular matrices: build the scaled vector in a temporary, pass it together with A and the
// lower/upper flag to trmv() (presumably an in-place triangular product - confirm against
// blaze/math/blas/trmv.h), then add the temporary to y.
4327  if( IsTriangular<MT1>::value ) {
4328  ResultType_<VT1> tmp( serial( scalar * x ) );
4329  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4330  addAssign( y, tmp );
4331  }
// All other matrices: a single gemv() call with factors ET(scalar) and ET(1)
// (presumably y = scalar*(x*A) + 1*y - confirm against blaze/math/blas/gemv.h).
4332  else {
4333  gemv( y, x, A, ET(scalar), ET(1) );
4334  }
4335  }
4336 #endif
4337  //**********************************************************************************************
4338 
4339  //**Addition assignment to sparse vectors*******************************************************
4340  // No special implementation for the addition assignment to sparse vectors.
4341  //**********************************************************************************************
4342 
4343  //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of a scaled vector/matrix multiplication to a dense vector.
//   lhs : target dense vector
//   rhs : scaled multiplication expression to be subtracted
// The operands of the wrapped multiplication are evaluated serially and the actual work is
// delegated to selectSubAssignKernel(). Nothing is done if the matrix operand has no rows or
// no columns.
4355  template< typename VT1 // Type of the target dense vector
4356  , bool TF > // Transpose flag of the target dense vector
4357  friend inline void subAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
4358  {
// NOTE(review): this listing skips a line at this position; presumably a trace macro - confirm.
4360 
4361  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4362 
// Fetch the two operands of the multiplication expression wrapped by the scaling expression.
4363  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4364  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4365 
// An empty matrix contributes nothing: leave the target untouched.
4366  if( right.rows() == 0UL || right.columns() == 0UL ) {
4367  return;
4368  }
4369 
4370  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
4371  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4372 
// Sanity checks: the evaluated operands must keep the sizes of the original operands and
// the number of matrix columns must match the size of the target vector.
4373  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4374  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4375  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4376  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4377 
// Hand over to the kernel selection together with the scaling factor of the expression.
4378  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
4379  }
4380  //**********************************************************************************************
4381 
4382  //**Subtraction assignment to dense vectors (kernel selection)**********************************
// Kernel selection for the scaled subtraction assignment y -= ( x * A ) * scalar.
//   y      : target vector that is updated
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scaling factor
// The small-matrix kernel is used if the matrix type is diagonal, if the matrix operand of the
// enclosing expression is a computation that is not evaluated up front, or if the number of
// matrix elements is below the threshold; otherwise the BLAS-based kernel is selected.
// The size comparison uses TDVECTDMATMULT_THRESHOLD, the threshold named after this expression
// (TDVecTDMatMultExpr), instead of TDVECDMATMULT_THRESHOLD, which is named after the
// TDVecDMatMultExpr expression (presumably both constants come from blaze/system/Thresholds.h -
// confirm).
4393  template< typename VT1 // Type of the left-hand side target vector
4394  , typename VT2 // Type of the left-hand side vector operand
4395  , typename MT1 // Type of the right-hand side matrix operand
4396  , typename ST2 > // Type of the scalar value
4397  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4398  {
4399  if( ( IsDiagonal<MT1>::value ) ||
4400  ( IsComputation<MT>::value && !evaluateMatrix ) ||
4401  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
4402  selectSmallSubAssignKernel( y, x, A, scalar );
4403  else
4404  selectBlasSubAssignKernel( y, x, A, scalar );
4405  }
4406  //**********************************************************************************************
4407 
4408  //**Default subtraction assignment to dense vectors*********************************************
// Default (non-vectorized) kernel for the scaled subtraction assignment.
//   y      : target vector that is updated
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scaling factor
// Builds the expression x * A * scalar and passes it to the subAssign() member of the target.
4422  template< typename VT1 // Type of the left-hand side target vector
4423  , typename VT2 // Type of the left-hand side vector operand
4424  , typename MT1 // Type of the right-hand side matrix operand
4425  , typename ST2 > // Type of the scalar value
4426  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4427  {
4428  y.subAssign( x * A * scalar );
4429  }
4430  //**********************************************************************************************
4431 
4432  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Fallback overload of the small-matrix scaled subtraction-assignment kernel: forwards all
// arguments unchanged to selectDefaultSubAssignKernel().
// NOTE(review): the line carrying the return type / overload-selection condition is not part of
// this listing; presumably this overload is chosen when the vectorized kernel cannot be used - confirm.
4446  template< typename VT1 // Type of the left-hand side target vector
4447  , typename VT2 // Type of the left-hand side vector operand
4448  , typename MT1 // Type of the right-hand side matrix operand
4449  , typename ST2 > // Type of the scalar value
4451  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4452  {
4453  selectDefaultSubAssignKernel( y, x, A, scalar );
4454  }
4455  //**********************************************************************************************
4456 
4457  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized small-matrix kernel for the scaled subtraction assignment y -= ( x * A ) * scalar.
//   y      : target vector that is updated
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scaling factor
// The columns of A are processed in blocks of 8, 4, 3, 2 and finally 1 column. For each block
// the products x[i]*A(i,j) are accumulated over the rows i in SIMD registers, reduced with
// sum(), scaled and subtracted from the matching elements of y. Rows that do not fit into the
// vectorized range are handled element by element.
// NOTE(review): the line carrying the return type / overload-selection condition is not part of
// this listing - confirm against the original header.
4472  template< typename VT1 // Type of the left-hand side target vector
4473  , typename VT2 // Type of the left-hand side vector operand
4474  , typename MT1 // Type of the right-hand side matrix operand
4475  , typename ST2 > // Type of the scalar value
4477  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4478  {
// A scalar remainder loop is only required if at least one operand is not padded.
4479  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4480 
4481  const size_t M( A.rows() );
4482  const size_t N( A.columns() );
4483 
4484  size_t j( 0UL );
4485 
// Blocks of 8 columns.
4486  for( ; (j+8UL) <= N; j+=8UL )
4487  {
// Row range [ibegin,iend) of this column block. For lower matrices the start is derived from
// the first column of the block and masked with size_t(-SIMDSIZE) (rounds down to a multiple
// of SIMDSIZE, assuming SIMDSIZE is a power of two); for upper matrices the end is derived
// from the last column of the block. Otherwise all M rows are traversed.
4488  const size_t ibegin( ( IsLower<MT1>::value )
4489  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4490  :( 0UL ) );
4491  const size_t iend( ( IsUpper<MT1>::value )
4492  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4493  :( M ) );
4494  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4495 
// End of the vectorized range: iend rounded down to a multiple of SIMDSIZE if a remainder
// loop exists, otherwise iend itself.
4496  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4497  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4498 
// One accumulator per column of the block (presumably zero-initialized by the default
// constructor of SIMDType - confirm).
4499  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4500  size_t i( ibegin );
4501 
4502  for( ; i<ipos; i+=SIMDSIZE ) {
4503  const SIMDType x1( x.load(i) );
4504  xmm1 += x1 * A.load(i,j );
4505  xmm2 += x1 * A.load(i,j+1UL);
4506  xmm3 += x1 * A.load(i,j+2UL);
4507  xmm4 += x1 * A.load(i,j+3UL);
4508  xmm5 += x1 * A.load(i,j+4UL);
4509  xmm6 += x1 * A.load(i,j+5UL);
4510  xmm7 += x1 * A.load(i,j+6UL);
4511  xmm8 += x1 * A.load(i,j+7UL);
4512  }
4513 
// Horizontal reduction of each accumulator, scaling and subtraction from the target.
4514  y[j ] -= sum( xmm1 ) * scalar;
4515  y[j+1UL] -= sum( xmm2 ) * scalar;
4516  y[j+2UL] -= sum( xmm3 ) * scalar;
4517  y[j+3UL] -= sum( xmm4 ) * scalar;
4518  y[j+4UL] -= sum( xmm5 ) * scalar;
4519  y[j+5UL] -= sum( xmm6 ) * scalar;
4520  y[j+6UL] -= sum( xmm7 ) * scalar;
4521  y[j+7UL] -= sum( xmm8 ) * scalar;
4522 
// Scalar handling of the rows between ipos and iend.
4523  for( ; remainder && i<iend; ++i ) {
4524  y[j ] -= x[i] * A(i,j ) * scalar;
4525  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4526  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4527  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4528  y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4529  y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4530  y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4531  y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4532  }
4533  }
4534 
// Blocks of 4 columns (same scheme as above).
4535  for( ; (j+4UL) <= N; j+=4UL )
4536  {
4537  const size_t ibegin( ( IsLower<MT1>::value )
4538  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4539  :( 0UL ) );
4540  const size_t iend( ( IsUpper<MT1>::value )
4541  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4542  :( M ) );
4543  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4544 
4545  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4546  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4547 
4548  SIMDType xmm1, xmm2, xmm3, xmm4;
4549  size_t i( ibegin );
4550 
4551  for( ; i<ipos; i+=SIMDSIZE ) {
4552  const SIMDType x1( x.load(i) );
4553  xmm1 += x1 * A.load(i,j );
4554  xmm2 += x1 * A.load(i,j+1UL);
4555  xmm3 += x1 * A.load(i,j+2UL);
4556  xmm4 += x1 * A.load(i,j+3UL);
4557  }
4558 
4559  y[j ] -= sum( xmm1 ) * scalar;
4560  y[j+1UL] -= sum( xmm2 ) * scalar;
4561  y[j+2UL] -= sum( xmm3 ) * scalar;
4562  y[j+3UL] -= sum( xmm4 ) * scalar;
4563 
4564  for( ; remainder && i<iend; ++i ) {
4565  y[j ] -= x[i] * A(i,j ) * scalar;
4566  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4567  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4568  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4569  }
4570  }
4571 
// Blocks of 3 columns (same scheme as above).
4572  for( ; (j+3UL) <= N; j+=3UL )
4573  {
4574  const size_t ibegin( ( IsLower<MT1>::value )
4575  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4576  :( 0UL ) );
4577  const size_t iend( ( IsUpper<MT1>::value )
4578  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
4579  :( M ) );
4580  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4581 
4582  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4583  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4584 
4585  SIMDType xmm1, xmm2, xmm3;
4586  size_t i( ibegin );
4587 
4588  for( ; i<ipos; i+=SIMDSIZE ) {
4589  const SIMDType x1( x.load(i) );
4590  xmm1 += x1 * A.load(i,j );
4591  xmm2 += x1 * A.load(i,j+1UL);
4592  xmm3 += x1 * A.load(i,j+2UL);
4593  }
4594 
4595  y[j ] -= sum( xmm1 ) * scalar;
4596  y[j+1UL] -= sum( xmm2 ) * scalar;
4597  y[j+2UL] -= sum( xmm3 ) * scalar;
4598 
4599  for( ; remainder && i<iend; ++i ) {
4600  y[j ] -= x[i] * A(i,j ) * scalar;
4601  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4602  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4603  }
4604  }
4605 
// Blocks of 2 columns (same scheme as above).
4606  for( ; (j+2UL) <= N; j+=2UL )
4607  {
4608  const size_t ibegin( ( IsLower<MT1>::value )
4609  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4610  :( 0UL ) );
4611  const size_t iend( ( IsUpper<MT1>::value )
4612  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4613  :( M ) );
4614  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4615 
4616  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4617  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4618 
4619  SIMDType xmm1, xmm2;
4620  size_t i( ibegin );
4621 
4622  for( ; i<ipos; i+=SIMDSIZE ) {
4623  const SIMDType x1( x.load(i) );
4624  xmm1 += x1 * A.load(i,j );
4625  xmm2 += x1 * A.load(i,j+1UL);
4626  }
4627 
4628  y[j ] -= sum( xmm1 ) * scalar;
4629  y[j+1UL] -= sum( xmm2 ) * scalar;
4630 
4631  for( ; remainder && i<iend; ++i ) {
4632  y[j ] -= x[i] * A(i,j ) * scalar;
4633  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4634  }
4635  }
4636 
// At most one column is left at this point.
4637  if( j < N )
4638  {
4639  const size_t ibegin( ( IsLower<MT1>::value )
4640  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4641  :( 0UL ) );
4642  const size_t iend( ( IsUpper<MT1>::value )
4643  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4644  :( M ) );
4645  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4646 
4647  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4648  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4649 
4650  SIMDType xmm1;
4651  size_t i( ibegin );
4652 
4653  for( ; i<ipos; i+=SIMDSIZE ) {
4654  xmm1 += A.load(i,j) * x.load(i);
4655  }
4656 
4657  y[j] -= sum( xmm1 ) * scalar;
4658 
4659  for( ; remainder && i<iend; ++i ) {
4660  y[j] -= x[i] * A(i,j) * scalar;
4661  }
4662  }
4663  }
4664  //**********************************************************************************************
4665 
4666  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Fallback overload of the large-matrix scaled subtraction-assignment kernel: forwards all
// arguments unchanged to selectDefaultSubAssignKernel().
// NOTE(review): the line carrying the return type / overload-selection condition is not part of
// this listing; presumably this overload is chosen when the vectorized kernel cannot be used - confirm.
4680  template< typename VT1 // Type of the left-hand side target vector
4681  , typename VT2 // Type of the left-hand side vector operand
4682  , typename MT1 // Type of the right-hand side matrix operand
4683  , typename ST2 > // Type of the scalar value
4685  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4686  {
4687  selectDefaultSubAssignKernel( y, x, A, scalar );
4688  }
4689  //**********************************************************************************************
4690 
4691  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized large-matrix kernel for the scaled subtraction assignment y -= ( x * A ) * scalar.
//   y      : target vector that is updated
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scaling factor
// The columns of A are processed in blocks of 8, 4, 2 and finally 1 column. Within a column
// block the rows are traversed with SIMD steps that are unrolled four times, then twice, then
// once; each step reduces its partial products with sum(), scales the result and subtracts it
// directly from y. Rows outside the vectorized range are handled element by element.
// NOTE(review): the line carrying the return type / overload-selection condition is not part of
// this listing - confirm against the original header.
4706  template< typename VT1 // Type of the left-hand side target vector
4707  , typename VT2 // Type of the left-hand side vector operand
4708  , typename MT1 // Type of the right-hand side matrix operand
4709  , typename ST2 > // Type of the scalar value
4711  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4712  {
// A scalar remainder loop is only required if at least one operand is not padded.
4713  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4714 
4715  const size_t M( A.rows() );
4716  const size_t N( A.columns() );
4717 
4718  size_t j( 0UL );
4719 
// Blocks of 8 columns.
4720  for( ; (j+8UL) <= N; j+=8UL )
4721  {
// Row range [ibegin,iend) of this column block. For lower matrices the start is derived from
// the first column of the block and masked with size_t(-SIMDSIZE) (rounds down to a multiple
// of SIMDSIZE, assuming SIMDSIZE is a power of two); for upper matrices the end is derived
// from the last column of the block. Otherwise all M rows are traversed.
4722  const size_t ibegin( ( IsLower<MT1>::value )
4723  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4724  :( 0UL ) );
4725  const size_t iend( ( IsUpper<MT1>::value )
4726  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4727  :( M ) );
4728  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4729 
// End of the vectorized range: iend rounded down to a multiple of SIMDSIZE if a remainder
// loop exists, otherwise iend itself.
4730  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4731  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4732 
4733  size_t i( ibegin );
4734 
// Four SIMD vectors of rows per iteration.
4735  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4736  const size_t i1( i+SIMDSIZE );
4737  const size_t i2( i+SIMDSIZE*2UL );
4738  const size_t i3( i+SIMDSIZE*3UL );
4739  const SIMDType x1( x.load(i ) );
4740  const SIMDType x2( x.load(i1) );
4741  const SIMDType x3( x.load(i2) );
4742  const SIMDType x4( x.load(i3) );
4743  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4744  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4745  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4746  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4747  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4748  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4749  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4750  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4751  }
4752 
// Two SIMD vectors of rows per iteration.
4753  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4754  const size_t i1( i+SIMDSIZE );
4755  const SIMDType x1( x.load(i ) );
4756  const SIMDType x2( x.load(i1) );
4757  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4758  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4759  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4760  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4761  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4762  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4763  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4764  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4765  }
4766 
// One SIMD vector of rows per iteration.
4767  for( ; i<ipos; i+=SIMDSIZE ) {
4768  const SIMDType x1( x.load(i) );
4769  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4770  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4771  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
4772  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
4773  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) ) * scalar;
4774  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) ) * scalar;
4775  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) ) * scalar;
4776  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) ) * scalar;
4777  }
4778 
// Scalar handling of the rows between ipos and iend.
4779  for( ; remainder && i<iend; ++i ) {
4780  y[j ] -= x[i] * A(i,j ) * scalar;
4781  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4782  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4783  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4784  y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4785  y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4786  y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4787  y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4788  }
4789  }
4790 
// Blocks of 4 columns (same scheme as above).
4791  for( ; (j+4UL) <= N; j+=4UL )
4792  {
4793  const size_t ibegin( ( IsLower<MT1>::value )
4794  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4795  :( 0UL ) );
4796  const size_t iend( ( IsUpper<MT1>::value )
4797  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4798  :( M ) );
4799  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4800 
4801  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4802  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4803 
4804  size_t i( ibegin );
4805 
4806  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4807  const size_t i1( i+SIMDSIZE );
4808  const size_t i2( i+SIMDSIZE*2UL );
4809  const size_t i3( i+SIMDSIZE*3UL );
4810  const SIMDType x1( x.load(i ) );
4811  const SIMDType x2( x.load(i1) );
4812  const SIMDType x3( x.load(i2) );
4813  const SIMDType x4( x.load(i3) );
4814  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4815  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4816  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4817  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4818  }
4819 
4820  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4821  const size_t i1( i+SIMDSIZE );
4822  const SIMDType x1( x.load(i ) );
4823  const SIMDType x2( x.load(i1) );
4824  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4825  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4826  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4827  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4828  }
4829 
4830  for( ; i<ipos; i+=SIMDSIZE ) {
4831  const SIMDType x1( x.load(i) );
4832  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4833  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4834  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
4835  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
4836  }
4837 
4838  for( ; remainder && i<iend; ++i ) {
4839  y[j ] -= x[i] * A(i,j ) * scalar;
4840  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4841  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4842  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4843  }
4844  }
4845 
// Blocks of 2 columns (same scheme as above).
4846  for( ; (j+2UL) <= N; j+=2UL )
4847  {
4848  const size_t ibegin( ( IsLower<MT1>::value )
4849  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4850  :( 0UL ) );
4851  const size_t iend( ( IsUpper<MT1>::value )
4852  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4853  :( M ) );
4854  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4855 
4856  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4857  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4858 
4859  size_t i( ibegin );
4860 
4861  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4862  const size_t i1( i+SIMDSIZE );
4863  const size_t i2( i+SIMDSIZE*2UL );
4864  const size_t i3( i+SIMDSIZE*3UL );
4865  const SIMDType x1( x.load(i ) );
4866  const SIMDType x2( x.load(i1) );
4867  const SIMDType x3( x.load(i2) );
4868  const SIMDType x4( x.load(i3) );
4869  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4870  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4871  }
4872 
4873  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4874  const size_t i1( i+SIMDSIZE );
4875  const SIMDType x1( x.load(i ) );
4876  const SIMDType x2( x.load(i1) );
4877  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4878  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4879  }
4880 
4881  for( ; i<ipos; i+=SIMDSIZE ) {
4882  const SIMDType x1( x.load(i) );
4883  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4884  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4885  }
4886 
4887  for( ; remainder && i<iend; ++i ) {
4888  y[j ] -= x[i] * A(i,j ) * scalar;
4889  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4890  }
4891  }
4892 
// At most one column is left at this point.
4893  if( j < N )
4894  {
4895  const size_t ibegin( ( IsLower<MT1>::value )
4896  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4897  :( 0UL ) );
4898  const size_t iend( ( IsUpper<MT1>::value )
4899  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4900  :( M ) );
4901  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4902 
4903  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4904  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4905 
4906  size_t i( ibegin );
4907 
4908  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4909  const size_t i1( i+SIMDSIZE );
4910  const size_t i2( i+SIMDSIZE*2UL );
4911  const size_t i3( i+SIMDSIZE*3UL );
4912  const SIMDType x1( x.load(i ) );
4913  const SIMDType x2( x.load(i1) );
4914  const SIMDType x3( x.load(i2) );
4915  const SIMDType x4( x.load(i3) );
4916  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4917  }
4918 
4919  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4920  const size_t i1( i+SIMDSIZE );
4921  const SIMDType x1( x.load(i ) );
4922  const SIMDType x2( x.load(i1) );
4923  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4924  }
4925 
4926  for( ; i<ipos; i+=SIMDSIZE ) {
4927  const SIMDType x1( x.load(i) );
4928  y[j] -= sum( x1 * A.load(i,j) ) * scalar;
4929  }
4930 
4931  for( ; remainder && i<iend; ++i ) {
4932  y[j] -= x[i] * A(i,j) * scalar;
4933  }
4934  }
4935  }
4936  //**********************************************************************************************
4937 
4938  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback overload of the BLAS-based scaled subtraction-assignment kernel: it performs no BLAS
// call itself and simply forwards all arguments to selectLargeSubAssignKernel().
// NOTE(review): the line carrying the return type / overload-selection condition is not part of
// this listing; presumably this overload is chosen when the BLAS overload is not applicable - confirm.
4953  template< typename VT1 // Type of the left-hand side target vector
4954  , typename VT2 // Type of the left-hand side vector operand
4955  , typename MT1 // Type of the right-hand side matrix operand
4956  , typename ST2 > // Type of the scalar value
4958  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4959  {
// Delegate to the kernel for large matrices.
4960  selectLargeSubAssignKernel( y, x, A, scalar );
4961  }
4962  //**********************************************************************************************
4963 
4964  //**BLAS-based subtraction assignment to dense vectors******************************************
4965 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4966 
// BLAS-backed subtraction assignment kernel (only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled, see the surrounding #if).
//   y      - target vector that is updated in place
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to the product
// NOTE(review): listing line 4983 (return type / enabling condition) is missing here.
4979  template< typename VT1 // Type of the left-hand side target vector
4980  , typename VT2 // Type of the left-hand side vector operand
4981  , typename MT1 // Type of the right-hand side matrix operand
4982  , typename ST2 > // Type of the scalar value
4984  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4985  {
4986  using ET = ElementType_<VT1>;
4987 
      // Triangular matrices: scale x into a temporary first, let trmv() work on that
      // temporary (presumably an in-place triangular product - confirm against the trmv
      // wrapper), then subtract the temporary from y.
4988  if( IsTriangular<MT1>::value ) {
4989  ResultType_<VT1> tmp( serial( scalar * x ) );
4990  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4991  subAssign( y, tmp );
4992  }
      // General matrices: a single gemv() call with the negated scalar and a factor of 1
      // (presumably alpha = -scalar, beta = 1, i.e. y is kept and the scaled product is
      // subtracted - confirm against the gemv wrapper).
4993  else {
4994  gemv( y, x, A, ET(-scalar), ET(1) );
4995  }
4996  }
4997 #endif
4998  //**********************************************************************************************
4999 
5000  //**Subtraction assignment to sparse vectors****************************************************
5001  // No special implementation for the subtraction assignment to sparse vectors.
5002  //**********************************************************************************************
5003 
5004  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of a scaled vector/matrix product to a dense vector.
//   lhs - target dense vector
//   rhs - scaled multiplication expression to be multiplied into lhs
// There is no fused kernel: rhs is evaluated serially into a ResultType temporary, which is
// then handed to multAssign() on the unwrapped target (~lhs).
// NOTE(review): listing lines 5020-5024 are missing from this listing.
5016  template< typename VT1 // Type of the target dense vector
5017  , bool TF > // Transpose flag of the target dense vector
5018  friend inline void multAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5019  {
5021 
5025 
      // Internal sanity check only: target and expression must already have equal size.
5026  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5027 
5028  const ResultType tmp( serial( rhs ) );
5029  multAssign( ~lhs, tmp );
5030  }
5031  //**********************************************************************************************
5032 
5033  //**Multiplication assignment to sparse vectors*************************************************
5034  // No special implementation for the multiplication assignment to sparse vectors.
5035  //**********************************************************************************************
5036 
5037  //**Division assignment to dense vectors********************************************************
// Division assignment of a scaled vector/matrix product to a dense vector.
//   lhs - target dense vector
//   rhs - scaled multiplication expression that lhs is divided by
// There is no fused kernel: rhs is evaluated serially into a ResultType temporary, which is
// then handed to divAssign() on the unwrapped target (~lhs).
// NOTE(review): listing lines 5053-5057 are missing from this listing.
5049  template< typename VT1 // Type of the target dense vector
5050  , bool TF > // Transpose flag of the target dense vector
5051  friend inline void divAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5052  {
5054 
5058 
      // Internal sanity check only: target and expression must already have equal size.
5059  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5060 
5061  const ResultType tmp( serial( rhs ) );
5062  divAssign( ~lhs, tmp );
5063  }
5064  //**********************************************************************************************
5065 
5066  //**Division assignment to sparse vectors*******************************************************
5067  // No special implementation for the division assignment to sparse vectors.
5068  //**********************************************************************************************
5069 
5070  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of a scaled vector/matrix product to a dense vector; only enabled when
// UseSMPAssign<VT1> holds.
// NOTE(review): listing line 5087 (function name and parameter list) is missing here; the
// body uses 'lhs' and 'rhs' and finally calls smpAssign(), so this is presumably
// smpAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs ) - confirm against
// the header.
5084  template< typename VT1 // Type of the target dense vector
5085  , bool TF > // Transpose flag of the target dense vector
5086  friend inline EnableIf_< UseSMPAssign<VT1> >
5088  {
5090 
5091  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5092 
      // Unpack the two operands of the vector/matrix product wrapped by the scaled expression.
5093  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5094  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5095 
      // A matrix without rows contributes nothing to any element: reset the target and stop.
      // A matrix without columns leaves nothing to assign: return without touching the target.
5096  if( right.rows() == 0UL ) {
5097  reset( ~lhs );
5098  return;
5099  }
5100  else if( right.columns() == 0UL ) {
5101  return;
5102  }
5103 
5104  LT x( left ); // Evaluation of the left-hand side dense vector operand
5105  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5106 
5107  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5108  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5109  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5110  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5111 
      // Re-issue the assignment with the evaluated operands and the stored scalar.
5112  smpAssign( ~lhs, x * A * rhs.scalar_ );
5113  }
5114  //**********************************************************************************************
5115 
5116  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of a scaled vector/matrix product to a sparse vector; only enabled when
// UseSMPAssign<VT1> holds. The expression is first evaluated into a ResultType temporary,
// which is then passed to smpAssign() on the unwrapped target (~lhs).
// NOTE(review): listing line 5133 (function name and parameter list) and lines 5135-5139
// are missing here; presumably the target parameter is a SparseVector<VT1,TF>& named lhs
// and the source a DVecScalarMultExpr named rhs - confirm against the header.
5130  template< typename VT1 // Type of the target sparse vector
5131  , bool TF > // Transpose flag of the target sparse vector
5132  friend inline EnableIf_< UseSMPAssign<VT1> >
5134  {
5136 
5140 
5141  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5142 
5143  const ResultType tmp( rhs );
5144  smpAssign( ~lhs, tmp );
5145  }
5146  //**********************************************************************************************
5147 
5148  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of a scaled vector/matrix product to a dense vector; only enabled
// when UseSMPAssign<VT1> holds.
// NOTE(review): listing line 5165 (function name and parameter list) is missing here; the
// body uses 'lhs' and 'rhs' and finally calls smpAddAssign(), so this is presumably
// smpAddAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs ) - confirm against
// the header.
5162  template< typename VT1 // Type of the target dense vector
5163  , bool TF > // Transpose flag of the target dense vector
5164  friend inline EnableIf_< UseSMPAssign<VT1> >
5166  {
5168 
5169  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5170 
      // Unpack the two operands of the vector/matrix product wrapped by the scaled expression.
5171  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5172  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5173 
      // An empty matrix adds nothing, so the target is left untouched.
5174  if( right.rows() == 0UL || right.columns() == 0UL ) {
5175  return;
5176  }
5177 
5178  LT x( left ); // Evaluation of the left-hand side dense vector operand
5179  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5180 
5181  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5182  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5183  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5184  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5185 
      // Re-issue the addition assignment with the evaluated operands and the stored scalar.
5186  smpAddAssign( ~lhs, x * A * rhs.scalar_ );
5187  }
5188  //**********************************************************************************************
5189 
5190  //**SMP addition assignment to sparse vectors***************************************************
5191  // No special implementation for the SMP addition assignment to sparse vectors.
5192  //**********************************************************************************************
5193 
5194  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of a scaled vector/matrix product to a dense vector; only
// enabled when UseSMPAssign<VT1> holds.
// NOTE(review): listing line 5211 (function name and parameter list) is missing here; the
// body uses 'lhs' and 'rhs' and finally calls smpSubAssign(), so this is presumably
// smpSubAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs ) - confirm against
// the header.
5208  template< typename VT1 // Type of the target dense vector
5209  , bool TF > // Transpose flag of the target dense vector
5210  friend inline EnableIf_< UseSMPAssign<VT1> >
5212  {
5214 
5215  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5216 
      // Unpack the two operands of the vector/matrix product wrapped by the scaled expression.
5217  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5218  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5219 
      // An empty matrix subtracts nothing, so the target is left untouched.
5220  if( right.rows() == 0UL || right.columns() == 0UL ) {
5221  return;
5222  }
5223 
5224  LT x( left ); // Evaluation of the left-hand side dense vector operand
5225  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5226 
5227  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5228  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5229  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5230  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5231 
      // Re-issue the subtraction assignment with the evaluated operands and the stored scalar.
5232  smpSubAssign( ~lhs, x * A * rhs.scalar_ );
5233  }
5234  //**********************************************************************************************
5235 
5236  //**SMP subtraction assignment to sparse vectors************************************************
5237  // No special implementation for the SMP subtraction assignment to sparse vectors.
5238  //**********************************************************************************************
5239 
5240  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of a scaled vector/matrix product to a dense vector; only
// enabled when UseSMPAssign<VT1> holds. The expression is first evaluated into a ResultType
// temporary, which is then passed to smpMultAssign() on the unwrapped target (~lhs).
// NOTE(review): listing line 5257 (function name and parameter list) and lines 5259-5263
// are missing here; presumably smpMultAssign( DenseVector<VT1,TF>& lhs,
// const DVecScalarMultExpr& rhs ) - confirm against the header.
5254  template< typename VT1 // Type of the target dense vector
5255  , bool TF > // Transpose flag of the target dense vector
5256  friend inline EnableIf_< UseSMPAssign<VT1> >
5258  {
5260 
5264 
5265  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5266 
5267  const ResultType tmp( rhs );
5268  smpMultAssign( ~lhs, tmp );
5269  }
5270  //**********************************************************************************************
5271 
5272  //**SMP multiplication assignment to sparse vectors*********************************************
5273  // No special implementation for the SMP multiplication assignment to sparse vectors.
5274  //**********************************************************************************************
5275 
5276  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of a scaled vector/matrix product to a dense vector; only enabled
// when UseSMPAssign<VT1> holds. The expression is first evaluated into a ResultType
// temporary, which is then passed to smpDivAssign() on the unwrapped target (~lhs).
// NOTE(review): listing line 5293 (function name and parameter list) and lines 5295-5299
// are missing here; presumably smpDivAssign( DenseVector<VT1,TF>& lhs,
// const DVecScalarMultExpr& rhs ) - confirm against the header.
5290  template< typename VT1 // Type of the target dense vector
5291  , bool TF > // Transpose flag of the target dense vector
5292  friend inline EnableIf_< UseSMPAssign<VT1> >
5294  {
5296 
5300 
5301  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5302 
5303  const ResultType tmp( rhs );
5304  smpDivAssign( ~lhs, tmp );
5305  }
5306  //**********************************************************************************************
5307 
5308  //**SMP division assignment to sparse vectors***************************************************
5309  // No special implementation for the SMP division assignment to sparse vectors.
5310  //**********************************************************************************************
5311 
5312  //**Compile time checks*************************************************************************
5321  //**********************************************************************************************
5322 };
5324 //*************************************************************************************************
5325 
5326 
5327 
5328 
5329 //=================================================================================================
5330 //
5331 // GLOBAL BINARY ARITHMETIC OPERATORS
5332 //
5333 //=================================================================================================
5334 
5335 //*************************************************************************************************
// Multiplication operator for a transpose dense vector and a column-major dense matrix
// (both base-class template arguments are 'true').
//   vec - left-hand side vector
//   mat - right-hand side matrix
// Returns a const TDVecTDMatMultExpr<VT,MT> built from the two unwrapped operands; the
// operator itself only constructs the expression object.
// Throws via BLAZE_THROW_INVALID_ARGUMENT if the vector size differs from the number of
// matrix rows.
// NOTE(review): listing lines 5371 and 5373 are missing from this listing.
5366 template< typename VT // Type of the left-hand side dense vector
5367  , typename MT > // Type of the right-hand side dense matrix
5368 inline decltype(auto)
5369  operator*( const DenseVector<VT,true>& vec, const DenseMatrix<MT,true>& mat )
5370 {
5372 
5374 
5375  if( (~vec).size() != (~mat).rows() ) {
5376  BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
5377  }
5378 
5379  using ReturnType = const TDVecTDMatMultExpr<VT,MT>;
5380  return ReturnType( ~vec, ~mat );
5381 }
5382 //*************************************************************************************************
5383 
5384 
5385 
5386 
5387 //=================================================================================================
5388 //
5389 // SIZE SPECIALIZATIONS
5390 //
5391 //=================================================================================================
5392 
5393 //*************************************************************************************************
// Size specialization for the multiplication expression: dimension 0 of the expression is
// taken from dimension 1 of the matrix type MT (presumably its number of columns - confirm
// the index convention of Size).
5395 template< typename VT, typename MT >
5396 struct Size< TDVecTDMatMultExpr<VT,MT>, 0UL >
5397  : public Size<MT,1UL>
5398 {};
5400 //*************************************************************************************************
5401 
5402 
5403 
5404 
5405 //=================================================================================================
5406 //
5407 // ISALIGNED SPECIALIZATIONS
5408 //
5409 //=================================================================================================
5410 
5411 //*************************************************************************************************
// IsAligned specialization for the multiplication expression: the expression counts as
// aligned only if both the vector type VT and the matrix type MT are aligned.
5413 template< typename VT, typename MT >
5414 struct IsAligned< TDVecTDMatMultExpr<VT,MT> >
5415  : public And< IsAligned<VT>, IsAligned<MT> >
5416 {};
5418 //*************************************************************************************************
5419 
5420 } // namespace blaze
5421 
5422 #endif
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:158
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:131
Data type constraint.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:71
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecTDMatMultExpr.h:207
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:204
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:221
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:372
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
TDVecTDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:247
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:261
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:519
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:87
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:384
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:328
Header file for the Computation base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:308
Compile time check for the memory layout of data types.This type trait tests whether the given data t...
Definition: IsContiguous.h:86
Header file for the DisableIf class template.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:209
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:128
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
If_< IsExpression< MT >, const MT, const MT &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:215
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:352
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:127
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsTriangular type trait.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:205
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:130
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:318
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:590
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
Header file for the HasSIMDMult type trait.
BLAZE_ALWAYS_INLINE ValueType_< T > sum(const SIMDi8< T > &a) noexcept
Returns the sum of all elements in the 8-bit integral SIMD vector.
Definition: Reduction.h:65
Header file for run time assertion macros.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecTDMatMultExpr.h:295
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:385
Header file for the IsContiguous type trait.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Header file for the TVecMatMultExpr base class.
Constraints on the storage order of matrix types.
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:218
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:490
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:108
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:104
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of vectors and matrices.The Size type trait evaluates the size of...
Definition: Size.h:80
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:206
Header file for the IsComplexFloat type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:131
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation. The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:132
Constraint on the data type.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:340
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:129
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:208
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:362
Header file for the Size type trait.
If_< IsExpression< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:212
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.