TDVecTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/Exception.h>
59 #include <blaze/math/shims/Reset.h>
61 #include <blaze/math/SIMD.h>
81 #include <blaze/system/BLAS.h>
84 #include <blaze/util/Assert.h>
85 #include <blaze/util/Complex.h>
87 #include <blaze/util/DisableIf.h>
88 #include <blaze/util/EnableIf.h>
91 #include <blaze/util/mpl/And.h>
92 #include <blaze/util/mpl/If.h>
93 #include <blaze/util/Types.h>
101 
102 
103 namespace blaze {
104 
105 //=================================================================================================
106 //
107 // CLASS TDVECTDMATMULTEXPR
108 //
109 //=================================================================================================
110 
111 //*************************************************************************************************
118 template< typename VT // Type of the left-hand side dense vector
119  , typename MT > // Type of the right-hand side dense matrix
120 class TDVecTDMatMultExpr
121  : public TVecMatMultExpr< DenseVector< TDVecTDMatMultExpr<VT,MT>, true > >
122  , private Computation
123 {
124  private:
125  //**Type definitions****************************************************************************
132  //**********************************************************************************************
133 
134  //**********************************************************************************************
// Compile-time flag: true if the vector operand type VT is a computation or requires an intermediate evaluation.
136  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
137  //**********************************************************************************************
138 
139  //**********************************************************************************************
// Compile-time flag for the matrix operand, analogous to evaluateVector.
// NOTE(review): the expression is cut off in this listing (listing line 142 is not visible); only the first
// two conditions (MT is a computation, MET and VET are the same type) can be read here.
141  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
143  //**********************************************************************************************
144 
145  //**********************************************************************************************
147 
// Helper trait: its value is true if either operand has to be evaluated (evaluateVector || evaluateMatrix).
// The template parameter T1 does not take part in the visible definition.
151  template< typename T1 >
152  struct UseSMPAssign {
153  enum : bool { value = ( evaluateVector || evaluateMatrix ) };
154  };
156  //**********************************************************************************************
157 
158  //**********************************************************************************************
160 
// Helper trait over three types (T1, T2, T3); presumably used to enable the BLAS-based kernels - TODO confirm.
// NOTE(review): parts of the condition (listing lines 165-169 and 171-174) are not visible here. The visible
// parts require simdEnabled on all three types and identical element types for T1 and T3.
163  template< typename T1, typename T2, typename T3 >
164  struct UseBlasKernel {
170  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
175  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
176  };
178  //**********************************************************************************************
179 
180  //**********************************************************************************************
182 
// Helper trait over three types (T1, T2, T3); presumably used to enable the vectorized default kernels - TODO confirm.
// NOTE(review): parts of the condition (listing lines 189, 191-192 and 194-195) are not visible here. The visible
// parts require useOptimizedKernels and simdEnabled on all three types.
186  template< typename T1, typename T2, typename T3 >
187  struct UseVectorizedDefaultKernel {
188  enum : bool { value = useOptimizedKernels &&
190  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
193  , ElementType_<T3> >::value &&
196  };
198  //**********************************************************************************************
199 
200  public:
201  //**Type definitions****************************************************************************
// Return type of the element access functions (operator[] and at()).
207  using ReturnType = const ElementType;
// Type used when this expression appears as an operand: a constant ResultType.
208  using CompositeType = const ResultType;
209 
// Type of the stored left-hand side operand, chosen via If_ on IsExpression<VT> between const VT and const VT&.
211  using LeftOperand = If_< IsExpression<VT>, const VT, const VT& >;
212 
// Type of the stored right-hand side operand, chosen via If_ on IsExpression<MT> between const MT and const MT&.
214  using RightOperand = If_< IsExpression<MT>, const MT, const MT& >;
215 
218 
221  //**********************************************************************************************
222 
223  //**Compilation flags***************************************************************************
// Compilation flag for SIMD evaluation of the expression.
// NOTE(review): the remainder of this expression (listing lines 227-228) is not visible here. The visible part
// requires a non-diagonal matrix type and simdEnabled on both operand types.
225  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
226  VT::simdEnabled && MT::simdEnabled &&
229 
// Compilation flag for SMP assignment: true only if neither operand needs to be evaluated and both operand
// types are themselves SMP-assignable.
231  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
232  !evaluateMatrix && MT::smpAssignable };
233  //**********************************************************************************************
234 
235  //**SIMD properties*****************************************************************************
// SIMD width used by the vectorized kernels, taken from SIMDTrait<ElementType>::size.
237  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
238  //**********************************************************************************************
239 
240  //**Constructor*********************************************************************************
246  explicit inline TDVecTDMatMultExpr( const VT& vec, const MT& mat ) noexcept
247  : vec_( vec ) // Left-hand side dense vector of the multiplication expression
248  , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
249  {
250  BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
251  }
252  //**********************************************************************************************
253 
254  //**Subscript operator**************************************************************************
// Subscript operator: returns element \a index of the vector/matrix product.
// The index is only checked via an internal assertion against the number of matrix columns; at() is the
// checked alternative. Every branch below restricts the product of vec_ with column \a index of mat_ to the
// element range selected by the structure of MT.
260  inline ReturnType operator[]( size_t index ) const {
261  BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
262 
 // NOTE(review): listing line 263, which presumably holds the condition guarding the following block, is not
 // visible in this listing. The block itself uses only the diagonal element of column \a index.
264  {
265  return vec_[index] * mat_(index,index);
266  }
 // Lower matrices with index > 8: use only the elements from 'begin' (index, or index+1 for strictly lower
 // matrices) to the last row.
267  else if( IsLower<MT>::value && ( index > 8UL ) )
268  {
269  const size_t begin( IsStrictlyLower<MT>::value ? index+1UL : index );
270  const size_t n ( mat_.rows() - begin );
271  return subvector( vec_, begin, n ) * subvector( column( mat_, index ), begin, n );
272  }
 // Upper matrices with index + 8 < rows: use only the first n elements (index, or index+1 if the matrix is
 // not strictly upper).
273  else if( IsUpper<MT>::value && ( index + 8UL < mat_.rows() ) )
274  {
275  const size_t n( IsStrictlyUpper<MT>::value ? index : index+1UL );
276  return subvector( vec_, 0UL, n ) * subvector( column( mat_, index ), 0UL, n );
277  }
 // All other cases: full product of the vector with column \a index.
278  else
279  {
280  return vec_ * column( mat_, index );
281  }
282  }
283  //**********************************************************************************************
284 
285  //**At function*********************************************************************************
292  inline ReturnType at( size_t index ) const {
293  if( index >= mat_.columns() ) {
294  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
295  }
296  return (*this)[index];
297  }
298  //**********************************************************************************************
299 
300  //**Size function*******************************************************************************
305  inline size_t size() const noexcept {
306  return mat_.columns();
307  }
308  //**********************************************************************************************
309 
310  //**Left operand access*************************************************************************
// Returns the stored left-hand side operand vec_ as LeftOperand.
315  inline LeftOperand leftOperand() const noexcept {
316  return vec_;
317  }
318  //**********************************************************************************************
319 
320  //**Right operand access************************************************************************
// Returns the stored right-hand side operand mat_ as RightOperand.
325  inline RightOperand rightOperand() const noexcept {
326  return mat_;
327  }
328  //**********************************************************************************************
329 
330  //**********************************************************************************************
336  template< typename T >
337  inline bool canAlias( const T* alias ) const noexcept {
338  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
339  }
340  //**********************************************************************************************
341 
342  //**********************************************************************************************
348  template< typename T >
349  inline bool isAliased( const T* alias ) const noexcept {
350  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
351  }
352  //**********************************************************************************************
353 
354  //**********************************************************************************************
359  inline bool isAligned() const noexcept {
360  return vec_.isAligned() && mat_.isAligned();
361  }
362  //**********************************************************************************************
363 
364  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments.
// The result requires size() to exceed SMP_TDVECTDMATMULT_THRESHOLD and at least one of the alternatives in
// the first parenthesized group to hold.
// NOTE(review): two alternatives of that group (listing lines 371-372) are not visible in this listing.
369  inline bool canSMPAssign() const noexcept {
370  return ( !BLAZE_BLAS_MODE ||
373  ( IsComputation<MT>::value && !evaluateMatrix ) ||
374  ( mat_.rows() * mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
375  ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
376  }
377  //**********************************************************************************************
378 
379  private:
380  //**Member variables****************************************************************************
383  //**********************************************************************************************
384 
385  //**Assignment to dense vectors*****************************************************************
// Assignment of the vector/matrix product \a rhs to the dense vector \a lhs.
// Handles the two empty-matrix cases first, then evaluates both operands serially and forwards them to the
// kernel selection.
// NOTE(review): listing line 401 is not visible here, and the types LT and RT are declared outside the
// visible part of this listing.
398  template< typename VT1 > // Type of the target dense vector
399  friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
400  {
402 
403  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
404 
 // A matrix without rows contributes nothing to any element: reset the target and stop.
405  if( rhs.mat_.rows() == 0UL ) {
406  reset( ~lhs );
407  return;
408  }
 // A matrix without columns: nothing is written to the target.
409  else if( rhs.mat_.columns() == 0UL ) {
410  return;
411  }
412 
413  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
414  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
415 
416  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
417  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
418  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
419  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
420 
421  TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
422  }
424  //**********************************************************************************************
425 
426  //**Assignment to dense vectors (kernel selection)**********************************************
437  template< typename VT1 // Type of the left-hand side target vector
438  , typename VT2 // Type of the left-hand side vector operand
439  , typename MT1 > // Type of the right-hand side matrix operand
440  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
441  {
442  if( ( IsDiagonal<MT1>::value ) ||
443  ( IsComputation<MT>::value && !evaluateMatrix ) ||
444  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
445  selectSmallAssignKernel( y, x, A );
446  else
447  selectBlasAssignKernel( y, x, A );
448  }
450  //**********************************************************************************************
451 
452  //**Default assignment to dense vectors*********************************************************
// Default assignment kernel: hands the product expression x * A to the assign() member of the target vector y.
466  template< typename VT1 // Type of the left-hand side target vector
467  , typename VT2 // Type of the left-hand side vector operand
468  , typename MT1 > // Type of the right-hand side matrix operand
469  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
470  {
471  y.assign( x * A );
472  }
474  //**********************************************************************************************
475 
476  //**Default assignment to dense vectors (small matrices)****************************************
// Small-matrix assignment kernel (default variant): simply forwards to selectDefaultAssignKernel().
// NOTE(review): the declaration line between the template header and the function name (listing line 493)
// is not visible in this listing.
490  template< typename VT1 // Type of the left-hand side target vector
491  , typename VT2 // Type of the left-hand side vector operand
492  , typename MT1 > // Type of the right-hand side matrix operand
494  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
495  {
496  selectDefaultAssignKernel( y, x, A );
497  }
499  //**********************************************************************************************
500 
501  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Small-matrix assignment kernel (vectorized variant): computes y = x * A column block by column block.
// The columns of A are processed in blocks of 8, 4, 3 and 2 columns, followed by a possible single last
// column. Within each block the products of x with the block's columns are accumulated in SIMD registers
// over the rows [ibegin,ipos), reduced via sum() into y, and completed by a scalar loop over [ipos,iend)
// if a remainder is possible.
// NOTE(review): the declaration line between the template header and the function name (listing line 518)
// is not visible in this listing; SIMDType is declared outside the visible part as well.
515  template< typename VT1 // Type of the left-hand side target vector
516  , typename VT2 // Type of the left-hand side vector operand
517  , typename MT1 > // Type of the right-hand side matrix operand
519  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
520  {
 // A scalar remainder loop is only required if at least one operand type is not padded.
521  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
522 
523  const size_t M( A.rows() );
524  const size_t N( A.columns() );
525 
526  size_t j( 0UL );
527 
 // Blocks of 8 columns.
528  for( ; (j+8UL) <= N; j+=8UL )
529  {
 // Row range of the block: lower matrices start at row j (j+1 if strictly lower), masked with
 // size_t(-SIMDSIZE) (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two);
 // upper matrices end at the last column of the block (one row less if strictly upper).
530  const size_t ibegin( ( IsLower<MT1>::value )
531  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
532  :( 0UL ) );
533  const size_t iend( ( IsUpper<MT1>::value )
534  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
535  :( M ) );
536  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
537 
 // End of the SIMD part: iend rounded down to a multiple of SIMDSIZE if a remainder is possible.
538  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
539  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
540 
 // One accumulator per column of the block; presumably zero after default construction - TODO confirm.
541  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
542  size_t i( ibegin );
543 
544  for( ; i<ipos; i+=SIMDSIZE ) {
545  const SIMDType x1( x.load(i) );
546  xmm1 += x1 * A.load(i,j );
547  xmm2 += x1 * A.load(i,j+1UL);
548  xmm3 += x1 * A.load(i,j+2UL);
549  xmm4 += x1 * A.load(i,j+3UL);
550  xmm5 += x1 * A.load(i,j+4UL);
551  xmm6 += x1 * A.load(i,j+5UL);
552  xmm7 += x1 * A.load(i,j+6UL);
553  xmm8 += x1 * A.load(i,j+7UL);
554  }
555 
 // Reduce the accumulators; this overwrites (not accumulates into) the target elements.
556  y[j ] = sum( xmm1 );
557  y[j+1UL] = sum( xmm2 );
558  y[j+2UL] = sum( xmm3 );
559  y[j+3UL] = sum( xmm4 );
560  y[j+4UL] = sum( xmm5 );
561  y[j+5UL] = sum( xmm6 );
562  y[j+6UL] = sum( xmm7 );
563  y[j+7UL] = sum( xmm8 );
564 
 // Scalar handling of the rows that do not fill a complete SIMD vector.
565  for( ; remainder && i<iend; ++i ) {
566  y[j ] += x[i] * A(i,j );
567  y[j+1UL] += x[i] * A(i,j+1UL);
568  y[j+2UL] += x[i] * A(i,j+2UL);
569  y[j+3UL] += x[i] * A(i,j+3UL);
570  y[j+4UL] += x[i] * A(i,j+4UL);
571  y[j+5UL] += x[i] * A(i,j+5UL);
572  y[j+6UL] += x[i] * A(i,j+6UL);
573  y[j+7UL] += x[i] * A(i,j+7UL);
574  }
575  }
576 
 // Blocks of 4 columns (same scheme as above).
577  for( ; (j+4UL) <= N; j+=4UL )
578  {
579  const size_t ibegin( ( IsLower<MT1>::value )
580  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
581  :( 0UL ) );
582  const size_t iend( ( IsUpper<MT1>::value )
583  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
584  :( M ) );
585  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
586 
587  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
588  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
589 
590  SIMDType xmm1, xmm2, xmm3, xmm4;
591  size_t i( ibegin );
592 
593  for( ; i<ipos; i+=SIMDSIZE ) {
594  const SIMDType x1( x.load(i) );
595  xmm1 += x1 * A.load(i,j );
596  xmm2 += x1 * A.load(i,j+1UL);
597  xmm3 += x1 * A.load(i,j+2UL);
598  xmm4 += x1 * A.load(i,j+3UL);
599  }
600 
601  y[j ] = sum( xmm1 );
602  y[j+1UL] = sum( xmm2 );
603  y[j+2UL] = sum( xmm3 );
604  y[j+3UL] = sum( xmm4 );
605 
606  for( ; remainder && i<iend; ++i ) {
607  y[j ] += x[i] * A(i,j );
608  y[j+1UL] += x[i] * A(i,j+1UL);
609  y[j+2UL] += x[i] * A(i,j+2UL);
610  y[j+3UL] += x[i] * A(i,j+3UL);
611  }
612  }
613 
 // Blocks of 3 columns (same scheme as above).
614  for( ; (j+3UL) <= N; j+=3UL )
615  {
616  const size_t ibegin( ( IsLower<MT1>::value )
617  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
618  :( 0UL ) );
619  const size_t iend( ( IsUpper<MT1>::value )
620  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
621  :( M ) );
622  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
623 
624  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
625  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
626 
627  SIMDType xmm1, xmm2, xmm3;
628  size_t i( ibegin );
629 
630  for( ; i<ipos; i+=SIMDSIZE ) {
631  const SIMDType x1( x.load(i) );
632  xmm1 += x1 * A.load(i,j );
633  xmm2 += x1 * A.load(i,j+1UL);
634  xmm3 += x1 * A.load(i,j+2UL);
635  }
636 
637  y[j ] = sum( xmm1 );
638  y[j+1UL] = sum( xmm2 );
639  y[j+2UL] = sum( xmm3 );
640 
641  for( ; remainder && i<iend; ++i ) {
642  y[j ] += x[i] * A(i,j );
643  y[j+1UL] += x[i] * A(i,j+1UL);
644  y[j+2UL] += x[i] * A(i,j+2UL);
645  }
646  }
647 
 // Blocks of 2 columns (same scheme as above).
648  for( ; (j+2UL) <= N; j+=2UL )
649  {
650  const size_t ibegin( ( IsLower<MT1>::value )
651  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
652  :( 0UL ) );
653  const size_t iend( ( IsUpper<MT1>::value )
654  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
655  :( M ) );
656  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
657 
658  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
659  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
660 
661  SIMDType xmm1, xmm2;
662  size_t i( ibegin );
663 
664  for( ; i<ipos; i+=SIMDSIZE ) {
665  const SIMDType x1( x.load(i) );
666  xmm1 += x1 * A.load(i,j );
667  xmm2 += x1 * A.load(i,j+1UL);
668  }
669 
670  y[j ] = sum( xmm1 );
671  y[j+1UL] = sum( xmm2 );
672 
673  for( ; remainder && i<iend; ++i ) {
674  y[j ] += x[i] * A(i,j );
675  y[j+1UL] += x[i] * A(i,j+1UL);
676  }
677  }
678 
 // A single remaining column, if any.
679  if( j < N )
680  {
681  const size_t ibegin( ( IsLower<MT1>::value )
682  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
683  :( 0UL ) );
684  const size_t iend( ( IsUpper<MT1>::value )
685  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
686  :( M ) );
687  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
688 
689  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
690  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
691 
692  SIMDType xmm1;
693  size_t i( ibegin );
694 
695  for( ; i<ipos; i+=SIMDSIZE ) {
696  xmm1 += x.load(i) * A.load(i,j);
697  }
698 
699  y[j] = sum( xmm1 );
700 
701  for( ; remainder && i<iend; ++i ) {
702  y[j] += x[i] * A(i,j);
703  }
704  }
705  }
707  //**********************************************************************************************
708 
709  //**Default assignment to dense vectors (large matrices)****************************************
// Large-matrix assignment kernel (default variant): simply forwards to selectDefaultAssignKernel().
// NOTE(review): the declaration line between the template header and the function name (listing line 726)
// is not visible in this listing.
723  template< typename VT1 // Type of the left-hand side target vector
724  , typename VT2 // Type of the left-hand side vector operand
725  , typename MT1 > // Type of the right-hand side matrix operand
727  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
728  {
729  selectDefaultAssignKernel( y, x, A );
730  }
732  //**********************************************************************************************
733 
734  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Large-matrix assignment kernel (vectorized variant): computes y = x * A column block by column block.
// The target is reset first and all contributions are accumulated with +=. The columns of A are processed
// in blocks of 8, 4 and 2 columns, followed by a possible single last column. Within each block the rows
// [ibegin,ipos) are traversed in steps of 4*SIMDSIZE, then 2*SIMDSIZE, then SIMDSIZE, and the rows
// [ipos,iend) are handled by a scalar loop if a remainder is possible.
// NOTE(review): the declaration line between the template header and the function name (listing line 751)
// is not visible in this listing; SIMDType is declared outside the visible part as well.
748  template< typename VT1 // Type of the left-hand side target vector
749  , typename VT2 // Type of the left-hand side vector operand
750  , typename MT1 > // Type of the right-hand side matrix operand
752  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
753  {
 // A scalar remainder loop is only required if at least one operand type is not padded.
754  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
755 
756  const size_t M( A.rows() );
757  const size_t N( A.columns() );
758 
 // All loops below accumulate into y, so it is reset before the first contribution.
759  reset( y );
760 
761  size_t j( 0UL );
762 
 // Blocks of 8 columns.
763  for( ; (j+8UL) <= N; j+=8UL )
764  {
 // Row range of the block: lower matrices start at row j (j+1 if strictly lower), masked with
 // size_t(-SIMDSIZE) (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two);
 // upper matrices end at the last column of the block (one row less if strictly upper).
765  const size_t ibegin( ( IsLower<MT1>::value )
766  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
767  :( 0UL ) );
768  const size_t iend( ( IsUpper<MT1>::value )
769  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
770  :( M ) );
771  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
772 
 // End of the SIMD part: iend rounded down to a multiple of SIMDSIZE if a remainder is possible.
773  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
774  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
775 
776  size_t i( ibegin );
777 
 // Four SIMD vectors of rows per iteration.
778  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
779  const size_t i1( i+SIMDSIZE );
780  const size_t i2( i+SIMDSIZE*2UL );
781  const size_t i3( i+SIMDSIZE*3UL );
782  const SIMDType x1( x.load(i ) );
783  const SIMDType x2( x.load(i1) );
784  const SIMDType x3( x.load(i2) );
785  const SIMDType x4( x.load(i3) );
786  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
787  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
788  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
789  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
790  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
791  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
792  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
793  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
794  }
795 
 // Two SIMD vectors of rows per iteration.
796  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
797  const size_t i1( i+SIMDSIZE );
798  const SIMDType x1( x.load(i ) );
799  const SIMDType x2( x.load(i1) );
800  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
801  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
802  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
803  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
804  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
805  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
806  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
807  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
808  }
809 
 // One SIMD vector of rows per iteration.
810  for( ; i<ipos; i+=SIMDSIZE ) {
811  const SIMDType x1( x.load(i) );
812  y[j ] += sum( x1 * A.load(i,j ) );
813  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
814  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
815  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
816  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
817  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
818  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
819  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
820  }
821 
 // Scalar handling of the rows that do not fill a complete SIMD vector.
822  for( ; remainder && i<iend; ++i ) {
823  y[j ] += x[i] * A(i,j );
824  y[j+1UL] += x[i] * A(i,j+1UL);
825  y[j+2UL] += x[i] * A(i,j+2UL);
826  y[j+3UL] += x[i] * A(i,j+3UL);
827  y[j+4UL] += x[i] * A(i,j+4UL);
828  y[j+5UL] += x[i] * A(i,j+5UL);
829  y[j+6UL] += x[i] * A(i,j+6UL);
830  y[j+7UL] += x[i] * A(i,j+7UL);
831  }
832  }
833 
 // Blocks of 4 columns (same scheme as above).
834  for( ; (j+4UL) <= N; j+=4UL )
835  {
836  const size_t ibegin( ( IsLower<MT1>::value )
837  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
838  :( 0UL ) );
839  const size_t iend( ( IsUpper<MT1>::value )
840  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
841  :( M ) );
842  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
843 
844  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
845  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
846 
847  size_t i( ibegin );
848 
849  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
850  const size_t i1( i+SIMDSIZE );
851  const size_t i2( i+SIMDSIZE*2UL );
852  const size_t i3( i+SIMDSIZE*3UL );
853  const SIMDType x1( x.load(i ) );
854  const SIMDType x2( x.load(i1) );
855  const SIMDType x3( x.load(i2) );
856  const SIMDType x4( x.load(i3) );
857  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
858  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
859  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
860  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
861  }
862 
863  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
864  const size_t i1( i+SIMDSIZE );
865  const SIMDType x1( x.load(i ) );
866  const SIMDType x2( x.load(i1) );
867  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
868  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
869  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
870  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
871  }
872 
873  for( ; i<ipos; i+=SIMDSIZE ) {
874  const SIMDType x1( x.load(i) );
875  y[j ] += sum( x1 * A.load(i,j ) );
876  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
877  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
878  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
879  }
880 
881  for( ; remainder && i<iend; ++i ) {
882  y[j ] += x[i] * A(i,j );
883  y[j+1UL] += x[i] * A(i,j+1UL);
884  y[j+2UL] += x[i] * A(i,j+2UL);
885  y[j+3UL] += x[i] * A(i,j+3UL);
886  }
887  }
888 
 // Blocks of 2 columns (same scheme as above).
889  for( ; (j+2UL) <= N; j+=2UL )
890  {
891  const size_t ibegin( ( IsLower<MT1>::value )
892  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
893  :( 0UL ) );
894  const size_t iend( ( IsUpper<MT1>::value )
895  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
896  :( M ) );
897  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
898 
899  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
900  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
901 
902  size_t i( ibegin );
903 
904  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
905  const size_t i1( i+SIMDSIZE );
906  const size_t i2( i+SIMDSIZE*2UL );
907  const size_t i3( i+SIMDSIZE*3UL );
908  const SIMDType x1( x.load(i ) );
909  const SIMDType x2( x.load(i1) );
910  const SIMDType x3( x.load(i2) );
911  const SIMDType x4( x.load(i3) );
912  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
913  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
914  }
915 
916  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
917  const size_t i1( i+SIMDSIZE );
918  const SIMDType x1( x.load(i ) );
919  const SIMDType x2( x.load(i1) );
920  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
921  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
922  }
923 
924  for( ; i<ipos; i+=SIMDSIZE ) {
925  const SIMDType x1( x.load(i) );
926  y[j ] += sum( x1 * A.load(i,j ) );
927  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
928  }
929 
930  for( ; remainder && i<iend; ++i ) {
931  y[j ] += x[i] * A(i,j );
932  y[j+1UL] += x[i] * A(i,j+1UL);
933  }
934  }
935 
 // A single remaining column, if any.
936  if( j < N )
937  {
938  const size_t ibegin( ( IsLower<MT1>::value )
939  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
940  :( 0UL ) );
941  const size_t iend( ( IsUpper<MT1>::value )
942  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
943  :( M ) );
944  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
945 
946  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
947  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
948 
949  size_t i( ibegin );
950 
951  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
952  const size_t i1( i+SIMDSIZE );
953  const size_t i2( i+SIMDSIZE*2UL );
954  const size_t i3( i+SIMDSIZE*3UL );
955  const SIMDType x1( x.load(i ) );
956  const SIMDType x2( x.load(i1) );
957  const SIMDType x3( x.load(i2) );
958  const SIMDType x4( x.load(i3) );
959  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
960  }
961 
962  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
963  const size_t i1( i+SIMDSIZE );
964  const SIMDType x1( x.load(i ) );
965  const SIMDType x2( x.load(i1) );
966  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
967  }
968 
969  for( ; i<ipos; i+=SIMDSIZE ) {
970  const SIMDType x1( x.load(i) );
971  y[j] += sum( x1 * A.load(i,j) );
972  }
973 
974  for( ; remainder && i<iend; ++i ) {
975  y[j] += x[i] * A(i,j);
976  }
977  }
978  }
980  //**********************************************************************************************
981 
982  //**BLAS-based assignment to dense vectors (default)********************************************
// BLAS-based assignment kernel (default variant): simply forwards to selectLargeAssignKernel().
// NOTE(review): the declaration line between the template header and the function name (listing line 999)
// is not visible in this listing.
996  template< typename VT1 // Type of the left-hand side target vector
997  , typename VT2 // Type of the left-hand side vector operand
998  , typename MT1 > // Type of the right-hand side matrix operand
1000  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1001  {
1002  selectLargeAssignKernel( y, x, A );
1003  }
1005  //**********************************************************************************************
1006 
1007  //**BLAS-based assignment to dense vectors******************************************************
// BLAS-based assignment kernel; only compiled if both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION evaluate to true.
// NOTE(review): the declaration line between the template header and the function name (listing line 1025)
// is not visible in this listing.
1008 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1009 
1022  template< typename VT1 // Type of the left-hand side target vector
1023  , typename VT2 // Type of the left-hand side vector operand
1024  , typename MT1 > // Type of the right-hand side matrix operand
1026  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
1027  {
1028  using ET = ElementType_<VT1>;
1029 
 // Triangular matrices: copy x into y, then let trmv() operate on y with the matching triangle flag
 // (presumably an in-place triangular product - TODO confirm against trmv()).
1030  if( IsTriangular<MT1>::value ) {
1031  assign( y, x );
1032  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1033  }
 // All other matrices: gemv() with the scalar arguments ET(1) and ET(0)
 // (presumably y = 1*x*A + 0*y - TODO confirm against gemv()).
1034  else {
1035  gemv( y, x, A, ET(1), ET(0) );
1036  }
1037  }
1039 #endif
1040  //**********************************************************************************************
1041 
1042  //**Assignment to sparse vectors****************************************************************
// Assignment of the vector/matrix product \a rhs to the sparse vector \a lhs.
// The expression is first evaluated serially into a temporary of type ResultType, which is then assigned
// to the target.
// NOTE(review): listing lines 1058 and 1060-1062 are not visible in this listing.
1055  template< typename VT1 > // Type of the target sparse vector
1056  friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1057  {
1059 
1063 
1064  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1065 
1066  const ResultType tmp( serial( rhs ) );
1067  assign( ~lhs, tmp );
1068  }
1070  //**********************************************************************************************
1071 
1072  //**Addition assignment to dense vectors********************************************************
// Addition assignment of the vector/matrix product \a rhs to the dense vector \a lhs.
// Returns immediately for matrices without rows or columns; otherwise evaluates both operands serially
// and forwards them to the kernel selection.
// NOTE(review): listing line 1088 is not visible here, and the types LT and RT are declared outside the
// visible part of this listing.
1085  template< typename VT1 > // Type of the target dense vector
1086  friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1087  {
1089 
1090  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1091 
 // An empty matrix adds nothing, so the target is left untouched.
1092  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1093  return;
1094  }
1095 
1096  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1097  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1098 
1099  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1100  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1101  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1102  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1103 
1104  TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1105  }
1107  //**********************************************************************************************
1108 
1109  //**Addition assignment to dense vectors (kernel selection)*************************************
1120  template< typename VT1 // Type of the left-hand side target vector
1121  , typename VT2 // Type of the left-hand side vector operand
1122  , typename MT1 > // Type of the right-hand side matrix operand
1123  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1124  {
1125  if( ( IsDiagonal<MT1>::value ) ||
1126  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1127  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1128  selectSmallAddAssignKernel( y, x, A );
1129  else
1130  selectBlasAddAssignKernel( y, x, A );
1131  }
1133  //**********************************************************************************************
1134 
1135  //**Default addition assignment to dense vectors************************************************
// Default addition-assignment kernel: builds the product expression x * A and passes it
// to y.addAssign(). The other kernels in this file forward here when they do not provide
// their own implementation.
1149  template< typename VT1 // Type of the left-hand side target vector
1150  , typename VT2 // Type of the left-hand side vector operand
1151  , typename MT1 > // Type of the right-hand side matrix operand
1152  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1153  {
1154  y.addAssign( x * A );
1155  }
1157  //**********************************************************************************************
1158 
1159  //**Default addition assignment to dense vectors (small matrices)*******************************
// Non-vectorized small-matrix addition-assignment kernel: simply forwards to
// selectDefaultAddAssignKernel().
// NOTE(review): the listing numbering jumps from 1175 to 1177 between the template header
// and the function name, and no return type is visible here. The line that distinguishes
// this overload from the vectorized one with the same parameter list is presumably the
// missing one; confirm against the original header.
1173  template< typename VT1 // Type of the left-hand side target vector
1174  , typename VT2 // Type of the left-hand side vector operand
1175  , typename MT1 > // Type of the right-hand side matrix operand
1177  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1178  {
1179  selectDefaultAddAssignKernel( y, x, A );
1180  }
1182  //**********************************************************************************************
1183 
1184  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized small-matrix addition-assignment kernel.
// For every column j of A it adds the sum over the rows i of x[i] * A(i,j) to y[j].
// Columns are processed in groups of 8, 4, 3 and 2, followed by at most one single column.
// Within a group the rows are traversed with SIMD loads in steps of SIMDSIZE, the SIMD
// accumulators are reduced with sum(), and a scalar loop handles the leftover rows.
// NOTE(review): the listing numbering jumps from 1201 to 1203 between the template header
// and the function name, and no return type is visible here; confirm against the header.
1199  template< typename VT1 // Type of the left-hand side target vector
1200  , typename VT2 // Type of the left-hand side vector operand
1201  , typename MT1 > // Type of the right-hand side matrix operand
1203  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1204  {
// A scalar remainder loop is required as soon as one of the two operands is not padded.
1205  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1206 
1207  const size_t M( A.rows() );
1208  const size_t N( A.columns() );
1209 
1210  size_t j( 0UL );
1211 
// --- Groups of 8 columns ---
1212  for( ; (j+8UL) <= N; j+=8UL )
1213  {
// Row range: lower matrices start at row j (j+1 if strictly lower), masked with
// size_t(-SIMDSIZE); presumably this rounds down to a multiple of SIMDSIZE, which
// assumes SIMDSIZE is a power of two (TODO confirm). Upper matrices stop at row
// j+8 (j+7 if strictly upper); all other matrices use the full range [0,M).
1214  const size_t ibegin( ( IsLower<MT1>::value )
1215  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1216  :( 0UL ) );
1217  const size_t iend( ( IsUpper<MT1>::value )
1218  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1219  :( M ) );
1220  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1221 
// ipos is the end of the SIMD loop; with a remainder loop it is iend minus iend % SIMDSIZE
// (checked by the assertion below), otherwise it is iend itself.
1222  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1223  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1224 
// NOTE(review): the accumulators are default-constructed and only ever updated with +=;
// this relies on SIMDType's default constructor producing zeros (confirm).
1225  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1226  size_t i( ibegin );
1227 
1228  for( ; i<ipos; i+=SIMDSIZE ) {
1229  const SIMDType x1( x.load(i) );
1230  xmm1 += x1 * A.load(i,j );
1231  xmm2 += x1 * A.load(i,j+1UL);
1232  xmm3 += x1 * A.load(i,j+2UL);
1233  xmm4 += x1 * A.load(i,j+3UL);
1234  xmm5 += x1 * A.load(i,j+4UL);
1235  xmm6 += x1 * A.load(i,j+5UL);
1236  xmm7 += x1 * A.load(i,j+6UL);
1237  xmm8 += x1 * A.load(i,j+7UL);
1238  }
1239 
// Reduce each SIMD accumulator and add the result to the target element.
1240  y[j ] += sum( xmm1 );
1241  y[j+1UL] += sum( xmm2 );
1242  y[j+2UL] += sum( xmm3 );
1243  y[j+3UL] += sum( xmm4 );
1244  y[j+4UL] += sum( xmm5 );
1245  y[j+5UL] += sum( xmm6 );
1246  y[j+6UL] += sum( xmm7 );
1247  y[j+7UL] += sum( xmm8 );
1248 
// Scalar handling of the rows in [ipos,iend).
1249  for( ; remainder && i<iend; ++i ) {
1250  y[j ] += x[i] * A(i,j );
1251  y[j+1UL] += x[i] * A(i,j+1UL);
1252  y[j+2UL] += x[i] * A(i,j+2UL);
1253  y[j+3UL] += x[i] * A(i,j+3UL);
1254  y[j+4UL] += x[i] * A(i,j+4UL);
1255  y[j+5UL] += x[i] * A(i,j+5UL);
1256  y[j+6UL] += x[i] * A(i,j+6UL);
1257  y[j+7UL] += x[i] * A(i,j+7UL);
1258  }
1259  }
1260 
// --- Groups of 4 columns (same scheme, upper matrices stop at j+4 / j+3) ---
1261  for( ; (j+4UL) <= N; j+=4UL )
1262  {
1263  const size_t ibegin( ( IsLower<MT1>::value )
1264  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1265  :( 0UL ) );
1266  const size_t iend( ( IsUpper<MT1>::value )
1267  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1268  :( M ) );
1269  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1270 
1271  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1272  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1273 
1274  SIMDType xmm1, xmm2, xmm3, xmm4;
1275  size_t i( ibegin );
1276 
1277  for( ; i<ipos; i+=SIMDSIZE ) {
1278  const SIMDType x1( x.load(i) );
1279  xmm1 += x1 * A.load(i,j );
1280  xmm2 += x1 * A.load(i,j+1UL);
1281  xmm3 += x1 * A.load(i,j+2UL);
1282  xmm4 += x1 * A.load(i,j+3UL);
1283  }
1284 
1285  y[j ] += sum( xmm1 );
1286  y[j+1UL] += sum( xmm2 );
1287  y[j+2UL] += sum( xmm3 );
1288  y[j+3UL] += sum( xmm4 );
1289 
1290  for( ; remainder && i<iend; ++i ) {
1291  y[j ] += x[i] * A(i,j );
1292  y[j+1UL] += x[i] * A(i,j+1UL);
1293  y[j+2UL] += x[i] * A(i,j+2UL);
1294  y[j+3UL] += x[i] * A(i,j+3UL);
1295  }
1296  }
1297 
// --- Groups of 3 columns (upper matrices stop at j+3 / j+2) ---
1298  for( ; (j+3UL) <= N; j+=3UL )
1299  {
1300  const size_t ibegin( ( IsLower<MT1>::value )
1301  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1302  :( 0UL ) );
1303  const size_t iend( ( IsUpper<MT1>::value )
1304  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1305  :( M ) );
1306  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1307 
1308  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1309  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1310 
1311  SIMDType xmm1, xmm2, xmm3;
1312  size_t i( ibegin );
1313 
1314  for( ; i<ipos; i+=SIMDSIZE ) {
1315  const SIMDType x1( x.load(i) );
1316  xmm1 += x1 * A.load(i,j );
1317  xmm2 += x1 * A.load(i,j+1UL);
1318  xmm3 += x1 * A.load(i,j+2UL);
1319  }
1320 
1321  y[j ] += sum( xmm1 );
1322  y[j+1UL] += sum( xmm2 );
1323  y[j+2UL] += sum( xmm3 );
1324 
1325  for( ; remainder && i<iend; ++i ) {
1326  y[j ] += x[i] * A(i,j );
1327  y[j+1UL] += x[i] * A(i,j+1UL);
1328  y[j+2UL] += x[i] * A(i,j+2UL);
1329  }
1330  }
1331 
// --- Groups of 2 columns (upper matrices stop at j+2 / j+1) ---
1332  for( ; (j+2UL) <= N; j+=2UL )
1333  {
1334  const size_t ibegin( ( IsLower<MT1>::value )
1335  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1336  :( 0UL ) );
1337  const size_t iend( ( IsUpper<MT1>::value )
1338  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1339  :( M ) );
1340  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1341 
1342  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1343  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1344 
1345  SIMDType xmm1, xmm2;
1346  size_t i( ibegin );
1347 
1348  for( ; i<ipos; i+=SIMDSIZE ) {
1349  const SIMDType x1( x.load(i) );
1350  xmm1 += x1 * A.load(i,j );
1351  xmm2 += x1 * A.load(i,j+1UL);
1352  }
1353 
1354  y[j ] += sum( xmm1 );
1355  y[j+1UL] += sum( xmm2 );
1356 
1357  for( ; remainder && i<iend; ++i ) {
1358  y[j ] += x[i] * A(i,j );
1359  y[j+1UL] += x[i] * A(i,j+1UL);
1360  }
1361  }
1362 
// --- Last single column, if any (upper matrices stop at j+1 / j) ---
1363  if( j < N )
1364  {
1365  const size_t ibegin( ( IsLower<MT1>::value )
1366  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1367  :( 0UL ) );
1368  const size_t iend( ( IsUpper<MT1>::value )
1369  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1370  :( M ) );
1371  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1372 
1373  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1374  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1375 
1376  SIMDType xmm1;
1377  size_t i( ibegin );
1378 
1379  for( ; i<ipos; i+=SIMDSIZE ) {
1380  xmm1 += A.load(i,j) * x.load(i);
1381  }
1382 
1383  y[j] += sum( xmm1 );
1384 
1385  for( ; remainder && i<iend; ++i ) {
1386  y[j] += x[i] * A(i,j);
1387  }
1388  }
1389  }
1391  //**********************************************************************************************
1392 
1393  //**Default addition assignment to dense vectors (large matrices)*******************************
// Non-vectorized large-matrix addition-assignment kernel: simply forwards to
// selectDefaultAddAssignKernel().
// NOTE(review): the listing numbering jumps from 1409 to 1411 between the template header
// and the function name, and no return type is visible here. The line that distinguishes
// this overload from the vectorized one with the same parameter list is presumably the
// missing one; confirm against the original header.
1407  template< typename VT1 // Type of the left-hand side target vector
1408  , typename VT2 // Type of the left-hand side vector operand
1409  , typename MT1 > // Type of the right-hand side matrix operand
1411  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1412  {
1413  selectDefaultAddAssignKernel( y, x, A );
1414  }
1416  //**********************************************************************************************
1417 
1418  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized large-matrix addition-assignment kernel.
// For every column j of A it adds the sum over the rows i of x[i] * A(i,j) to y[j].
// Columns are processed in groups of 8, 4 and 2, followed by at most one single column
// (there is no group of 3 in this kernel). Within a group the row loop is unrolled over
// 4, 2 and finally 1 SIMD vector per iteration, and each iteration adds its reduced
// sum() directly to the target elements; a scalar loop handles the leftover rows.
// NOTE(review): the listing numbering jumps from 1435 to 1437 between the template header
// and the function name, and no return type is visible here; confirm against the header.
1433  template< typename VT1 // Type of the left-hand side target vector
1434  , typename VT2 // Type of the left-hand side vector operand
1435  , typename MT1 > // Type of the right-hand side matrix operand
1437  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1438  {
// A scalar remainder loop is required as soon as one of the two operands is not padded.
1439  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1440 
1441  const size_t M( A.rows() );
1442  const size_t N( A.columns() );
1443 
1444  size_t j( 0UL );
1445 
// --- Groups of 8 columns ---
1446  for( ; (j+8UL) <= N; j+=8UL )
1447  {
// Row range: lower matrices start at row j (j+1 if strictly lower), masked with
// size_t(-SIMDSIZE); presumably this rounds down to a multiple of SIMDSIZE, which
// assumes SIMDSIZE is a power of two (TODO confirm). Upper matrices stop at row
// j+8 (j+7 if strictly upper); all other matrices use the full range [0,M).
1448  const size_t ibegin( ( IsLower<MT1>::value )
1449  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1450  :( 0UL ) );
1451  const size_t iend( ( IsUpper<MT1>::value )
1452  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1453  :( M ) );
1454  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1455 
// ipos is the end of the SIMD loops; with a remainder loop it is iend minus iend % SIMDSIZE
// (checked by the assertion below), otherwise it is iend itself.
1456  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1457  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1458 
1459  size_t i( ibegin );
1460 
// Four SIMD vectors of x per iteration.
1461  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1462  const size_t i1( i+SIMDSIZE );
1463  const size_t i2( i+SIMDSIZE*2UL );
1464  const size_t i3( i+SIMDSIZE*3UL );
1465  const SIMDType x1( x.load(i ) );
1466  const SIMDType x2( x.load(i1) );
1467  const SIMDType x3( x.load(i2) );
1468  const SIMDType x4( x.load(i3) );
1469  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1470  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1471  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1472  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1473  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1474  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1475  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1476  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
1477  }
1478 
// Two SIMD vectors of x per iteration.
1479  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1480  const size_t i1( i+SIMDSIZE );
1481  const SIMDType x1( x.load(i ) );
1482  const SIMDType x2( x.load(i1) );
1483  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1484  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1485  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1486  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1487  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1488  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1489  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1490  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
1491  }
1492 
// One SIMD vector of x per iteration.
1493  for( ; i<ipos; i+=SIMDSIZE ) {
1494  const SIMDType x1( x.load(i) );
1495  y[j ] += sum( x1 * A.load(i,j ) );
1496  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1497  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1498  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1499  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
1500  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
1501  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
1502  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
1503  }
1504 
// Scalar handling of the rows in [ipos,iend).
1505  for( ; remainder && i<iend; ++i ) {
1506  y[j ] += x[i] * A(i,j );
1507  y[j+1UL] += x[i] * A(i,j+1UL);
1508  y[j+2UL] += x[i] * A(i,j+2UL);
1509  y[j+3UL] += x[i] * A(i,j+3UL);
1510  y[j+4UL] += x[i] * A(i,j+4UL);
1511  y[j+5UL] += x[i] * A(i,j+5UL);
1512  y[j+6UL] += x[i] * A(i,j+6UL);
1513  y[j+7UL] += x[i] * A(i,j+7UL);
1514  }
1515  }
1516 
// --- Groups of 4 columns (same scheme, upper matrices stop at j+4 / j+3) ---
1517  for( ; (j+4UL) <= N; j+=4UL )
1518  {
1519  const size_t ibegin( ( IsLower<MT1>::value )
1520  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1521  :( 0UL ) );
1522  const size_t iend( ( IsUpper<MT1>::value )
1523  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1524  :( M ) );
1525  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1526 
1527  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1528  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1529 
1530  size_t i( ibegin );
1531 
1532  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1533  const size_t i1( i+SIMDSIZE );
1534  const size_t i2( i+SIMDSIZE*2UL );
1535  const size_t i3( i+SIMDSIZE*3UL );
1536  const SIMDType x1( x.load(i ) );
1537  const SIMDType x2( x.load(i1) );
1538  const SIMDType x3( x.load(i2) );
1539  const SIMDType x4( x.load(i3) );
1540  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1541  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1542  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1543  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1544  }
1545 
1546  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1547  const size_t i1( i+SIMDSIZE );
1548  const SIMDType x1( x.load(i ) );
1549  const SIMDType x2( x.load(i1) );
1550  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1551  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1552  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1553  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1554  }
1555 
1556  for( ; i<ipos; i+=SIMDSIZE ) {
1557  const SIMDType x1( x.load(i) );
1558  y[j ] += sum( x1 * A.load(i,j ) );
1559  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1560  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
1561  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
1562  }
1563 
1564  for( ; remainder && i<iend; ++i ) {
1565  y[j ] += x[i] * A(i,j );
1566  y[j+1UL] += x[i] * A(i,j+1UL);
1567  y[j+2UL] += x[i] * A(i,j+2UL);
1568  y[j+3UL] += x[i] * A(i,j+3UL);
1569  }
1570  }
1571 
// --- Groups of 2 columns (upper matrices stop at j+2 / j+1) ---
1572  for( ; (j+2UL) <= N; j+=2UL )
1573  {
1574  const size_t ibegin( ( IsLower<MT1>::value )
1575  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1576  :( 0UL ) );
1577  const size_t iend( ( IsUpper<MT1>::value )
1578  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1579  :( M ) );
1580  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1581 
1582  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1583  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1584 
1585  size_t i( ibegin );
1586 
1587  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1588  const size_t i1( i+SIMDSIZE );
1589  const size_t i2( i+SIMDSIZE*2UL );
1590  const size_t i3( i+SIMDSIZE*3UL );
1591  const SIMDType x1( x.load(i ) );
1592  const SIMDType x2( x.load(i1) );
1593  const SIMDType x3( x.load(i2) );
1594  const SIMDType x4( x.load(i3) );
1595  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1596  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1597  }
1598 
1599  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1600  const size_t i1( i+SIMDSIZE );
1601  const SIMDType x1( x.load(i ) );
1602  const SIMDType x2( x.load(i1) );
1603  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1604  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1605  }
1606 
1607  for( ; i<ipos; i+=SIMDSIZE ) {
1608  const SIMDType x1( x.load(i) );
1609  y[j ] += sum( x1 * A.load(i,j ) );
1610  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
1611  }
1612 
1613  for( ; remainder && i<iend; ++i ) {
1614  y[j ] += x[i] * A(i,j );
1615  y[j+1UL] += x[i] * A(i,j+1UL);
1616  }
1617  }
1618 
// --- Last single column, if any (upper matrices stop at j+1 / j) ---
1619  if( j < N )
1620  {
1621  const size_t ibegin( ( IsLower<MT1>::value )
1622  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1623  :( 0UL ) );
1624  const size_t iend( ( IsUpper<MT1>::value )
1625  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1626  :( M ) );
1627  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1628 
1629  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1630  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1631 
1632  size_t i( ibegin );
1633 
1634  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1635  const size_t i1( i+SIMDSIZE );
1636  const size_t i2( i+SIMDSIZE*2UL );
1637  const size_t i3( i+SIMDSIZE*3UL );
1638  const SIMDType x1( x.load(i ) );
1639  const SIMDType x2( x.load(i1) );
1640  const SIMDType x3( x.load(i2) );
1641  const SIMDType x4( x.load(i3) );
1642  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1643  }
1644 
1645  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1646  const size_t i1( i+SIMDSIZE );
1647  const SIMDType x1( x.load(i ) );
1648  const SIMDType x2( x.load(i1) );
1649  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1650  }
1651 
1652  for( ; i<ipos; i+=SIMDSIZE ) {
1653  const SIMDType x1( x.load(i) );
1654  y[j] += sum( x1 * A.load(i,j) );
1655  }
1656 
1657  for( ; remainder && i<iend; ++i ) {
1658  y[j] += x[i] * A(i,j);
1659  }
1660  }
1661  }
1663  //**********************************************************************************************
1664 
1665  //**BLAS-based addition assignment to dense vectors (default)***********************************
// Fallback BLAS addition-assignment kernel: simply forwards to
// selectLargeAddAssignKernel().
// NOTE(review): the listing numbering jumps from 1681 to 1683 between the template header
// and the function name, and no return type is visible here. The line that distinguishes
// this overload from the BLAS-based one with the same parameter list is presumably the
// missing one; confirm against the original header.
1679  template< typename VT1 // Type of the left-hand side target vector
1680  , typename VT2 // Type of the left-hand side vector operand
1681  , typename MT1 > // Type of the right-hand side matrix operand
1683  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1684  {
1685  selectLargeAddAssignKernel( y, x, A );
1686  }
1688  //**********************************************************************************************
1689 
1690  //**BLAS-based addition assignment to dense vectors*********************************************
1691 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1692 
// BLAS-based addition-assignment kernel; only compiled when the surrounding
// BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION guard is enabled.
// Triangular matrices go through trmv() on a temporary, everything else through gemv().
// NOTE(review): the listing numbering jumps from 1707 to 1709 between the template header
// and the function name, and no return type is visible here; confirm against the header.
1705  template< typename VT1 // Type of the left-hand side target vector
1706  , typename VT2 // Type of the left-hand side vector operand
1707  , typename MT1 > // Type of the right-hand side matrix operand
1709  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1710  {
1711  using ET = ElementType_<VT1>;
1712 
1713  if( IsTriangular<MT1>::value ) {
// Copy x into a temporary, let trmv() work on that temporary (presumably it overwrites
// it with the product; confirm against trmv), then add the temporary to y. The triangle
// flag is CblasLower for lower matrices and CblasUpper otherwise.
1714  ResultType_<VT1> tmp( serial( x ) );
1715  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1716  addAssign( y, tmp );
1717  }
1718  else {
// Presumably the two ET(1) arguments are the alpha and beta scaling factors, so that
// gemv accumulates the product into y; confirm against the gemv wrapper signature.
1719  gemv( y, x, A, ET(1), ET(1) );
1720  }
1721  }
1723 #endif
1724  //**********************************************************************************************
1725 
1726  //**Addition assignment to sparse vectors*******************************************************
1727  // No special implementation for the addition assignment to sparse vectors.
1728  //**********************************************************************************************
1729 
1730  //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of the vector/matrix multiplication expression rhs to the dense
// vector lhs. Both operands are evaluated via serial() and the actual work is delegated
// to selectSubAssignKernel().
// NOTE(review): the listing numbering jumps 1745 -> 1747 inside this body, so a statement
// of the original header may be missing from this view; confirm before editing.
1743  template< typename VT1 > // Type of the target dense vector
1744  friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
1745  {
1747 
1748  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1749 
// A matrix operand without rows or without columns leaves lhs untouched.
1750  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1751  return;
1752  }
1753 
// LT and RT are declared elsewhere in the class (not visible in this chunk).
1754  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1755  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1756 
// The evaluated operands must have the same dimensions as the stored operands, and the
// number of matrix columns must match the size of the target vector.
1757  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1758  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1759  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1760  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1761 
1762  TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1763  }
1765  //**********************************************************************************************
1766 
1767  //**Subtraction assignment to dense vectors (kernel selection)**********************************
1778  template< typename VT1 // Type of the left-hand side target vector
1779  , typename VT2 // Type of the left-hand side vector operand
1780  , typename MT1 > // Type of the right-hand side matrix operand
1781  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1782  {
1783  if( ( IsDiagonal<MT1>::value ) ||
1784  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1785  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1786  selectSmallSubAssignKernel( y, x, A );
1787  else
1788  selectBlasSubAssignKernel( y, x, A );
1789  }
1791  //**********************************************************************************************
1792 
1793  //**Default subtraction assignment to dense vectors*********************************************
// Default subtraction-assignment kernel: builds the product expression x * A and passes
// it to y.subAssign(). The other kernels in this file forward here when they do not
// provide their own implementation.
1807  template< typename VT1 // Type of the left-hand side target vector
1808  , typename VT2 // Type of the left-hand side vector operand
1809  , typename MT1 > // Type of the right-hand side matrix operand
1810  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1811  {
1812  y.subAssign( x * A );
1813  }
1815  //**********************************************************************************************
1816 
1817  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Non-vectorized small-matrix subtraction-assignment kernel: simply forwards to
// selectDefaultSubAssignKernel().
// NOTE(review): the listing numbering jumps from 1833 to 1835 between the template header
// and the function name, and no return type is visible here. The line that distinguishes
// this overload from the vectorized one with the same parameter list is presumably the
// missing one; confirm against the original header.
1831  template< typename VT1 // Type of the left-hand side target vector
1832  , typename VT2 // Type of the left-hand side vector operand
1833  , typename MT1 > // Type of the right-hand side matrix operand
1835  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1836  {
1837  selectDefaultSubAssignKernel( y, x, A );
1838  }
1840  //**********************************************************************************************
1841 
1842  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized small-matrix subtraction-assignment kernel.
// For every column j of A it subtracts the sum over the rows i of x[i] * A(i,j) from y[j].
// Columns are processed in groups of 8, 4, 3 and 2, followed by at most one single column.
// Within a group the rows are traversed with SIMD loads in steps of SIMDSIZE, the SIMD
// accumulators are reduced with sum(), and a scalar loop handles the leftover rows.
// NOTE(review): the listing numbering jumps from 1859 to 1861 between the template header
// and the function name, and no return type is visible here; confirm against the header.
1857  template< typename VT1 // Type of the left-hand side target vector
1858  , typename VT2 // Type of the left-hand side vector operand
1859  , typename MT1 > // Type of the right-hand side matrix operand
1861  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1862  {
// A scalar remainder loop is required as soon as one of the two operands is not padded.
1863  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1864 
1865  const size_t M( A.rows() );
1866  const size_t N( A.columns() );
1867 
1868  size_t j( 0UL );
1869 
// --- Groups of 8 columns ---
1870  for( ; (j+8UL) <= N; j+=8UL )
1871  {
// Row range: lower matrices start at row j (j+1 if strictly lower), masked with
// size_t(-SIMDSIZE); presumably this rounds down to a multiple of SIMDSIZE, which
// assumes SIMDSIZE is a power of two (TODO confirm). Upper matrices stop at row
// j+8 (j+7 if strictly upper); all other matrices use the full range [0,M).
1872  const size_t ibegin( ( IsLower<MT1>::value )
1873  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1874  :( 0UL ) );
1875  const size_t iend( ( IsUpper<MT1>::value )
1876  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1877  :( M ) );
1878  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1879 
// ipos is the end of the SIMD loop; with a remainder loop it is iend minus iend % SIMDSIZE
// (checked by the assertion below), otherwise it is iend itself.
1880  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1881  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1882 
// NOTE(review): the accumulators are default-constructed and only ever updated with +=;
// this relies on SIMDType's default constructor producing zeros (confirm).
1883  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1884  size_t i( ibegin );
1885 
1886  for( ; i<ipos; i+=SIMDSIZE ) {
1887  const SIMDType x1( x.load(i) );
1888  xmm1 += x1 * A.load(i,j );
1889  xmm2 += x1 * A.load(i,j+1UL);
1890  xmm3 += x1 * A.load(i,j+2UL);
1891  xmm4 += x1 * A.load(i,j+3UL);
1892  xmm5 += x1 * A.load(i,j+4UL);
1893  xmm6 += x1 * A.load(i,j+5UL);
1894  xmm7 += x1 * A.load(i,j+6UL);
1895  xmm8 += x1 * A.load(i,j+7UL);
1896  }
1897 
// Reduce each SIMD accumulator and subtract the result from the target element.
1898  y[j ] -= sum( xmm1 );
1899  y[j+1UL] -= sum( xmm2 );
1900  y[j+2UL] -= sum( xmm3 );
1901  y[j+3UL] -= sum( xmm4 );
1902  y[j+4UL] -= sum( xmm5 );
1903  y[j+5UL] -= sum( xmm6 );
1904  y[j+6UL] -= sum( xmm7 );
1905  y[j+7UL] -= sum( xmm8 );
1906 
// Scalar handling of the rows in [ipos,iend).
1907  for( ; remainder && i<iend; ++i ) {
1908  y[j ] -= x[i] * A(i,j );
1909  y[j+1UL] -= x[i] * A(i,j+1UL);
1910  y[j+2UL] -= x[i] * A(i,j+2UL);
1911  y[j+3UL] -= x[i] * A(i,j+3UL);
1912  y[j+4UL] -= x[i] * A(i,j+4UL);
1913  y[j+5UL] -= x[i] * A(i,j+5UL);
1914  y[j+6UL] -= x[i] * A(i,j+6UL);
1915  y[j+7UL] -= x[i] * A(i,j+7UL);
1916  }
1917  }
1918 
// --- Groups of 4 columns (same scheme, upper matrices stop at j+4 / j+3) ---
1919  for( ; (j+4UL) <= N; j+=4UL )
1920  {
1921  const size_t ibegin( ( IsLower<MT1>::value )
1922  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1923  :( 0UL ) );
1924  const size_t iend( ( IsUpper<MT1>::value )
1925  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1926  :( M ) );
1927  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1928 
1929  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1930  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1931 
1932  SIMDType xmm1, xmm2, xmm3, xmm4;
1933  size_t i( ibegin );
1934 
1935  for( ; i<ipos; i+=SIMDSIZE ) {
1936  const SIMDType x1( x.load(i) );
1937  xmm1 += x1 * A.load(i,j );
1938  xmm2 += x1 * A.load(i,j+1UL);
1939  xmm3 += x1 * A.load(i,j+2UL);
1940  xmm4 += x1 * A.load(i,j+3UL);
1941  }
1942 
1943  y[j ] -= sum( xmm1 );
1944  y[j+1UL] -= sum( xmm2 );
1945  y[j+2UL] -= sum( xmm3 );
1946  y[j+3UL] -= sum( xmm4 );
1947 
1948  for( ; remainder && i<iend; ++i ) {
1949  y[j ] -= x[i] * A(i,j );
1950  y[j+1UL] -= x[i] * A(i,j+1UL);
1951  y[j+2UL] -= x[i] * A(i,j+2UL);
1952  y[j+3UL] -= x[i] * A(i,j+3UL);
1953  }
1954  }
1955 
// --- Groups of 3 columns (upper matrices stop at j+3 / j+2) ---
1956  for( ; (j+3UL) <= N; j+=3UL )
1957  {
1958  const size_t ibegin( ( IsLower<MT1>::value )
1959  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1960  :( 0UL ) );
1961  const size_t iend( ( IsUpper<MT1>::value )
1962  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1963  :( M ) );
1964  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1965 
1966  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
1967  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
1968 
1969  SIMDType xmm1, xmm2, xmm3;
1970  size_t i( ibegin );
1971 
1972  for( ; i<ipos; i+=SIMDSIZE ) {
1973  const SIMDType x1( x.load(i) );
1974  xmm1 += x1 * A.load(i,j );
1975  xmm2 += x1 * A.load(i,j+1UL);
1976  xmm3 += x1 * A.load(i,j+2UL);
1977  }
1978 
1979  y[j ] -= sum( xmm1 );
1980  y[j+1UL] -= sum( xmm2 );
1981  y[j+2UL] -= sum( xmm3 );
1982 
1983  for( ; remainder && i<iend; ++i ) {
1984  y[j ] -= x[i] * A(i,j );
1985  y[j+1UL] -= x[i] * A(i,j+1UL);
1986  y[j+2UL] -= x[i] * A(i,j+2UL);
1987  }
1988  }
1989 
// --- Groups of 2 columns (upper matrices stop at j+2 / j+1) ---
1990  for( ; (j+2UL) <= N; j+=2UL )
1991  {
1992  const size_t ibegin( ( IsLower<MT1>::value )
1993  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
1994  :( 0UL ) );
1995  const size_t iend( ( IsUpper<MT1>::value )
1996  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1997  :( M ) );
1998  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1999 
2000  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2001  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2002 
2003  SIMDType xmm1, xmm2;
2004  size_t i( ibegin );
2005 
2006  for( ; i<ipos; i+=SIMDSIZE ) {
2007  const SIMDType x1( x.load(i) );
2008  xmm1 += x1 * A.load(i,j );
2009  xmm2 += x1 * A.load(i,j+1UL);
2010  }
2011 
2012  y[j ] -= sum( xmm1 );
2013  y[j+1UL] -= sum( xmm2 );
2014 
2015  for( ; remainder && i<iend; ++i ) {
2016  y[j ] -= x[i] * A(i,j );
2017  y[j+1UL] -= x[i] * A(i,j+1UL);
2018  }
2019  }
2020 
// --- Last single column, if any (upper matrices stop at j+1 / j) ---
2021  if( j < N )
2022  {
2023  const size_t ibegin( ( IsLower<MT1>::value )
2024  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2025  :( 0UL ) );
2026  const size_t iend( ( IsUpper<MT1>::value )
2027  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2028  :( M ) );
2029  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2030 
2031  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2032  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2033 
2034  SIMDType xmm1;
2035  size_t i( ibegin );
2036 
2037  for( ; i<ipos; i+=SIMDSIZE ) {
2038  xmm1 += A.load(i,j) * x.load(i);
2039  }
2040 
2041  y[j] -= sum( xmm1 );
2042 
2043  for( ; remainder && i<iend; ++i ) {
2044  y[j] -= x[i] * A(i,j);
2045  }
2046  }
2047  }
2049  //**********************************************************************************************
2050 
2051  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Non-vectorized large-matrix subtraction-assignment kernel: simply forwards to
// selectDefaultSubAssignKernel().
// NOTE(review): the listing numbering jumps from 2067 to 2069 between the template header
// and the function name, and no return type is visible here. The line that distinguishes
// this overload from the vectorized one with the same parameter list is presumably the
// missing one; confirm against the original header.
2065  template< typename VT1 // Type of the left-hand side target vector
2066  , typename VT2 // Type of the left-hand side vector operand
2067  , typename MT1 > // Type of the right-hand side matrix operand
2069  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2070  {
2071  selectDefaultSubAssignKernel( y, x, A );
2072  }
2074  //**********************************************************************************************
2075 
2076  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized subtraction assignment kernel for large matrices.
// For every column j of A it subtracts the products x[i] * A(i,j) from y[j], reading x and A
// through SIMD loads (x.load / A.load) and reducing each SIMD product with sum().
// Columns are processed in groups of 8, then 4, then 2, and finally one leftover column.
// Inside each group the rows [ibegin,ipos) are walked with 4, 2 and then 1 SIMD vector per
// iteration; rows [ipos,iend) are handled element by element when 'remainder' is true.
// NOTE(review): listing line 2094 (between the template header and the function name) is
// missing from this extract, so the return type / overload condition is not visible here.
2091  template< typename VT1 // Type of the left-hand side target vector
2092  , typename VT2 // Type of the left-hand side vector operand
2093  , typename MT1 > // Type of the right-hand side matrix operand
2095  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2096  {
// An element-wise tail loop is required as soon as x or A is not padded.
2097  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
2098 
2099  const size_t M( A.rows() );
2100  const size_t N( A.columns() );
2101 
2102  size_t j( 0UL );
2103 
// ---- Column groups of width 8: y[j] ... y[j+7] ----
2104  for( ; (j+8UL) <= N; j+=8UL )
2105  {
// Lower matrices start at row j (j+1 if strictly lower), masked with size_t(-SIMDSIZE);
// all other matrices start at row 0.
// NOTE(review): the mask presumably rounds down to a multiple of SIMDSIZE, which assumes
// SIMDSIZE is a power of two -- confirm.
2106  const size_t ibegin( ( IsLower<MT1>::value )
2107  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2108  :( 0UL ) );
// Upper matrices stop at row j+8 (j+7 if strictly upper); all other matrices use all M rows.
2109  const size_t iend( ( IsUpper<MT1>::value )
2110  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
2111  :( M ) );
2112  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2113 
// End of the SIMD part: with a tail loop it is iend with the same mask applied (the assertion
// checks that this equals iend minus iend % SIMDSIZE); without one the SIMD loops run to iend.
2114  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2115  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2116 
2117  size_t i( ibegin );
2118 
// Four SIMD vectors of x per iteration, reused for all eight columns.
2119  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2120  const size_t i1( i+SIMDSIZE );
2121  const size_t i2( i+SIMDSIZE*2UL );
2122  const size_t i3( i+SIMDSIZE*3UL );
2123  const SIMDType x1( x.load(i ) );
2124  const SIMDType x2( x.load(i1) );
2125  const SIMDType x3( x.load(i2) );
2126  const SIMDType x4( x.load(i3) );
2127  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2128  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2129  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2130  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2131  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2132  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2133  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2134  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
2135  }
2136 
// Two SIMD vectors of x per iteration.
2137  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2138  const size_t i1( i+SIMDSIZE );
2139  const SIMDType x1( x.load(i ) );
2140  const SIMDType x2( x.load(i1) );
2141  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2142  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2143  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2144  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2145  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2146  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2147  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2148  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
2149  }
2150 
// One SIMD vector of x per iteration.
2151  for( ; i<ipos; i+=SIMDSIZE ) {
2152  const SIMDType x1( x.load(i) );
2153  y[j ] -= sum( x1 * A.load(i,j ) );
2154  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2155  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2156  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2157  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) );
2158  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) );
2159  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) );
2160  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) );
2161  }
2162 
// Element-wise tail for the rows in [ipos,iend); skipped entirely when 'remainder' is false.
2163  for( ; remainder && i<iend; ++i ) {
2164  y[j ] -= x[i] * A(i,j );
2165  y[j+1UL] -= x[i] * A(i,j+1UL);
2166  y[j+2UL] -= x[i] * A(i,j+2UL);
2167  y[j+3UL] -= x[i] * A(i,j+3UL);
2168  y[j+4UL] -= x[i] * A(i,j+4UL);
2169  y[j+5UL] -= x[i] * A(i,j+5UL);
2170  y[j+6UL] -= x[i] * A(i,j+6UL);
2171  y[j+7UL] -= x[i] * A(i,j+7UL);
2172  }
2173  }
2174 
// ---- Column groups of width 4: same scheme, upper bound j+4 (j+3 if strictly upper) ----
2175  for( ; (j+4UL) <= N; j+=4UL )
2176  {
2177  const size_t ibegin( ( IsLower<MT1>::value )
2178  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2179  :( 0UL ) );
2180  const size_t iend( ( IsUpper<MT1>::value )
2181  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
2182  :( M ) );
2183  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2184 
2185  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2186  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2187 
2188  size_t i( ibegin );
2189 
// Four SIMD vectors of x per iteration.
2190  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2191  const size_t i1( i+SIMDSIZE );
2192  const size_t i2( i+SIMDSIZE*2UL );
2193  const size_t i3( i+SIMDSIZE*3UL );
2194  const SIMDType x1( x.load(i ) );
2195  const SIMDType x2( x.load(i1) );
2196  const SIMDType x3( x.load(i2) );
2197  const SIMDType x4( x.load(i3) );
2198  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2199  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2200  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2201  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2202  }
2203 
// Two SIMD vectors of x per iteration.
2204  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2205  const size_t i1( i+SIMDSIZE );
2206  const SIMDType x1( x.load(i ) );
2207  const SIMDType x2( x.load(i1) );
2208  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2209  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2210  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2211  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2212  }
2213 
// One SIMD vector of x per iteration.
2214  for( ; i<ipos; i+=SIMDSIZE ) {
2215  const SIMDType x1( x.load(i) );
2216  y[j ] -= sum( x1 * A.load(i,j ) );
2217  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2218  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
2219  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
2220  }
2221 
// Element-wise tail.
2222  for( ; remainder && i<iend; ++i ) {
2223  y[j ] -= x[i] * A(i,j );
2224  y[j+1UL] -= x[i] * A(i,j+1UL);
2225  y[j+2UL] -= x[i] * A(i,j+2UL);
2226  y[j+3UL] -= x[i] * A(i,j+3UL);
2227  }
2228  }
2229 
// ---- Column groups of width 2: same scheme, upper bound j+2 (j+1 if strictly upper) ----
2230  for( ; (j+2UL) <= N; j+=2UL )
2231  {
2232  const size_t ibegin( ( IsLower<MT1>::value )
2233  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2234  :( 0UL ) );
2235  const size_t iend( ( IsUpper<MT1>::value )
2236  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2237  :( M ) );
2238  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2239 
2240  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2241  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2242 
2243  size_t i( ibegin );
2244 
// Four SIMD vectors of x per iteration.
2245  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2246  const size_t i1( i+SIMDSIZE );
2247  const size_t i2( i+SIMDSIZE*2UL );
2248  const size_t i3( i+SIMDSIZE*3UL );
2249  const SIMDType x1( x.load(i ) );
2250  const SIMDType x2( x.load(i1) );
2251  const SIMDType x3( x.load(i2) );
2252  const SIMDType x4( x.load(i3) );
2253  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2254  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2255  }
2256 
// Two SIMD vectors of x per iteration.
2257  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2258  const size_t i1( i+SIMDSIZE );
2259  const SIMDType x1( x.load(i ) );
2260  const SIMDType x2( x.load(i1) );
2261  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2262  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2263  }
2264 
// One SIMD vector of x per iteration.
2265  for( ; i<ipos; i+=SIMDSIZE ) {
2266  const SIMDType x1( x.load(i) );
2267  y[j ] -= sum( x1 * A.load(i,j ) );
2268  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
2269  }
2270 
// Element-wise tail.
2271  for( ; remainder && i<iend; ++i ) {
2272  y[j ] -= x[i] * A(i,j );
2273  y[j+1UL] -= x[i] * A(i,j+1UL);
2274  }
2275  }
2276 
// ---- Single leftover column (reached when N is odd): upper bound j+1 (j if strictly upper) ----
2277  if( j < N )
2278  {
2279  const size_t ibegin( ( IsLower<MT1>::value )
2280  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
2281  :( 0UL ) );
2282  const size_t iend( ( IsUpper<MT1>::value )
2283  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2284  :( M ) );
2285  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2286 
2287  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
2288  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
2289 
2290  size_t i( ibegin );
2291 
// Four SIMD vectors of x per iteration.
2292  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2293  const size_t i1( i+SIMDSIZE );
2294  const size_t i2( i+SIMDSIZE*2UL );
2295  const size_t i3( i+SIMDSIZE*3UL );
2296  const SIMDType x1( x.load(i ) );
2297  const SIMDType x2( x.load(i1) );
2298  const SIMDType x3( x.load(i2) );
2299  const SIMDType x4( x.load(i3) );
2300  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
2301  }
2302 
// Two SIMD vectors of x per iteration.
2303  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2304  const size_t i1( i+SIMDSIZE );
2305  const SIMDType x1( x.load(i ) );
2306  const SIMDType x2( x.load(i1) );
2307  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
2308  }
2309 
// One SIMD vector of x per iteration.
2310  for( ; i<ipos; i+=SIMDSIZE ) {
2311  const SIMDType x1( x.load(i) );
2312  y[j] -= sum( x1 * A.load(i,j) );
2313  }
2314 
// Element-wise tail.
2315  for( ; remainder && i<iend; ++i ) {
2316  y[j] -= x[i] * A(i,j);
2317  }
2318  }
2319  }
2321  //**********************************************************************************************
2322 
2323  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// BLAS subtraction assignment kernel, fallback overload: no BLAS call is made here, the
// arguments are forwarded unchanged to selectLargeSubAssignKernel().
// NOTE(review): listing line 2340 (between the template header and the function name) is
// missing from this extract; the return type / overload condition is therefore not visible.
2337  template< typename VT1 // Type of the left-hand side target vector
2338  , typename VT2 // Type of the left-hand side vector operand
2339  , typename MT1 > // Type of the right-hand side matrix operand
2341  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2342  {
2343  selectLargeSubAssignKernel( y, x, A );
2344  }
2346  //**********************************************************************************************
2347 
2348  //**BLAS-based subtraction assignment to dense vectors******************************************
2349 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2350 
// BLAS-backed subtraction assignment kernel. It is only compiled when the enclosing
// '#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION' is true.
// NOTE(review): listing line 2366 (between the template header and the function name) is
// missing from this extract; the return type / overload condition is therefore not visible.
2363  template< typename VT1 // Type of the left-hand side target vector
2364  , typename VT2 // Type of the left-hand side vector operand
2365  , typename MT1 > // Type of the right-hand side matrix operand
2367  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2368  {
2369  using ET = ElementType_<VT1>;
2370 
// Triangular A: x is copied (serially) into a temporary of y's result type, the temporary is
// passed to trmv() together with A and the matching Lower/Upper flag, and the temporary is then
// subtracted from y.
// NOTE(review): trmv() presumably replaces tmp with the triangular product in place -- confirm
// against blaze/math/blas/trmv.h.
2371  if( IsTriangular<MT1>::value ) {
2372  ResultType_<VT1> tmp( serial( x ) );
2373  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2374  subAssign( y, tmp );
2375  }
// General A: a single gemv() call with the factors ET(-1) and ET(1).
// NOTE(review): presumably these are the BLAS alpha/beta scalars, i.e. y = -1*(x*A) + 1*y --
// confirm against blaze/math/blas/gemv.h.
2376  else {
2377  gemv( y, x, A, ET(-1), ET(1) );
2378  }
2379  }
2381 #endif
2382  //**********************************************************************************************
2383 
2384  //**Subtraction assignment to sparse vectors****************************************************
2385  // No special implementation for the subtraction assignment to sparse vectors.
2386  //**********************************************************************************************
2387 
2388  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of the vector/matrix product expression to a dense vector.
// The expression is first evaluated (via serial()) into a temporary of type ResultType, and
// that temporary is then multiplication-assigned to the target.
// \param lhs Target dense vector; its size is asserted to equal rhs.size().
// \param rhs The multiplication expression to evaluate.
// NOTE(review): listing lines 2404 and 2406-2408 are missing from this extract (the numbered
// blank lines below mark the gaps); whatever statements they contain are not visible here.
2401  template< typename VT1 > // Type of the target dense vector
2402  friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2403  {
2405 
2409 
2410  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2411 
2412  const ResultType tmp( serial( rhs ) );
2413  multAssign( ~lhs, tmp );
2414  }
2416  //**********************************************************************************************
2417 
2418  //**Multiplication assignment to sparse vectors*************************************************
2419  // No special implementation for the multiplication assignment to sparse vectors.
2420  //**********************************************************************************************
2421 
2422  //**Division assignment to dense vectors********************************************************
// Division assignment of the vector/matrix product expression to a dense vector.
// The expression is first evaluated (via serial()) into a temporary of type ResultType, and
// that temporary is then division-assigned to the target.
// \param lhs Target dense vector; its size is asserted to equal rhs.size().
// \param rhs The multiplication expression to evaluate.
// NOTE(review): listing lines 2438 and 2440-2442 are missing from this extract (the numbered
// blank lines below mark the gaps); whatever statements they contain are not visible here.
2435  template< typename VT1 > // Type of the target dense vector
2436  friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
2437  {
2439 
2443 
2444  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2445 
2446  const ResultType tmp( serial( rhs ) );
2447  divAssign( ~lhs, tmp );
2448  }
2450  //**********************************************************************************************
2451 
2452  //**Division assignment to sparse vectors*******************************************************
2453  // No special implementation for the division assignment to sparse vectors.
2454  //**********************************************************************************************
2455 
2456  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of the vector/matrix product to a dense vector; the overload is enabled
// through EnableIf_< UseSMPAssign<VT1> >.
// Both operands are evaluated into local objects x and A, then 'x * A' is passed to
// smpAssign() for the target.
// NOTE(review): listing line 2473 (function name and parameter list) is missing from this
// extract; the parameter names 'lhs' and 'rhs' are taken from their use in the body.
2471  template< typename VT1 > // Type of the target dense vector
2472  friend inline EnableIf_< UseSMPAssign<VT1> >
2474  {
2476 
2477  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2478 
// A matrix without rows yields an all-default result: the target is reset. A matrix without
// columns leaves nothing to do, so the target is not touched.
2479  if( rhs.mat_.rows() == 0UL ) {
2480  reset( ~lhs );
2481  return;
2482  }
2483  else if( rhs.mat_.columns() == 0UL ) {
2484  return;
2485  }
2486 
2487  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2488  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2489 
2490  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2491  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2492  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2493  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2494 
2495  smpAssign( ~lhs, x * A );
2496  }
2498  //**********************************************************************************************
2499 
2500  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of the vector/matrix product to a sparse vector; the overload is enabled
// through EnableIf_< UseSMPAssign<VT1> >.
// The expression is evaluated into a temporary of type ResultType, which is then passed to
// smpAssign() for the target.
// NOTE(review): listing line 2517 (function name and parameter list) and lines 2519,
// 2521-2523 are missing from this extract; the parameter names are taken from the body.
2515  template< typename VT1 > // Type of the target sparse vector
2516  friend inline EnableIf_< UseSMPAssign<VT1> >
2518  {
2520 
2524 
2525  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2526 
2527  const ResultType tmp( rhs );
2528  smpAssign( ~lhs, tmp );
2529  }
2531  //**********************************************************************************************
2532 
2533  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of the vector/matrix product to a dense vector; the overload is
// enabled through EnableIf_< UseSMPAssign<VT1> >.
// Both operands are evaluated into local objects x and A, then 'x * A' is passed to
// smpAddAssign() for the target.
// NOTE(review): listing line 2550 (function name and parameter list) is missing from this
// extract; the parameter names 'lhs' and 'rhs' are taken from their use in the body.
2548  template< typename VT1 > // Type of the target dense vector
2549  friend inline EnableIf_< UseSMPAssign<VT1> >
2551  {
2553 
2554  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2555 
// An empty matrix contributes nothing to the sum, so the target is left unchanged.
2556  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2557  return;
2558  }
2559 
2560  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2561  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2562 
2563  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2564  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2565  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2566  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2567 
2568  smpAddAssign( ~lhs, x * A );
2569  }
2571  //**********************************************************************************************
2572 
2573  //**SMP addition assignment to sparse vectors***************************************************
2574  // No special implementation for the SMP addition assignment to sparse vectors.
2575  //**********************************************************************************************
2576 
2577  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of the vector/matrix product to a dense vector; the overload is
// enabled through EnableIf_< UseSMPAssign<VT1> >.
// Both operands are evaluated into local objects x and A, then 'x * A' is passed to
// smpSubAssign() for the target.
// NOTE(review): listing line 2594 (function name and parameter list) is missing from this
// extract; the parameter names 'lhs' and 'rhs' are taken from their use in the body.
2592  template< typename VT1 > // Type of the target dense vector
2593  friend inline EnableIf_< UseSMPAssign<VT1> >
2595  {
2597 
2598  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2599 
// An empty matrix contributes nothing to subtract, so the target is left unchanged.
2600  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2601  return;
2602  }
2603 
2604  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2605  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2606 
2607  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2608  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2609  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2610  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2611 
2612  smpSubAssign( ~lhs, x * A );
2613  }
2615  //**********************************************************************************************
2616 
2617  //**SMP subtraction assignment to sparse vectors************************************************
2618  // No special implementation for the SMP subtraction assignment to sparse vectors.
2619  //**********************************************************************************************
2620 
2621  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of the vector/matrix product to a dense vector; the overload
// is enabled through EnableIf_< UseSMPAssign<VT1> >.
// The expression is evaluated into a temporary of type ResultType, which is then passed to
// smpMultAssign() for the target.
// NOTE(review): listing line 2638 (function name and parameter list) and lines 2640,
// 2642-2644 are missing from this extract; the parameter names are taken from the body.
2636  template< typename VT1 > // Type of the target dense vector
2637  friend inline EnableIf_< UseSMPAssign<VT1> >
2639  {
2641 
2645 
2646  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2647 
2648  const ResultType tmp( rhs );
2649  smpMultAssign( ~lhs, tmp );
2650  }
2652  //**********************************************************************************************
2653 
2654  //**SMP multiplication assignment to sparse vectors*********************************************
2655  // No special implementation for the SMP multiplication assignment to sparse vectors.
2656  //**********************************************************************************************
2657 
2658  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of the vector/matrix product to a dense vector; the overload is
// enabled through EnableIf_< UseSMPAssign<VT1> >.
// The expression is evaluated into a temporary of type ResultType, which is then passed to
// smpDivAssign() for the target.
// NOTE(review): listing line 2675 (function name and parameter list) and lines 2677,
// 2679-2681 are missing from this extract; the parameter names are taken from the body.
2673  template< typename VT1 > // Type of the target dense vector
2674  friend inline EnableIf_< UseSMPAssign<VT1> >
2676  {
2678 
2682 
2683  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2684 
2685  const ResultType tmp( rhs );
2686  smpDivAssign( ~lhs, tmp );
2687  }
2689  //**********************************************************************************************
2690 
2691  //**SMP division assignment to sparse vectors***************************************************
2692  // No special implementation for the SMP division assignment to sparse vectors.
2693  //**********************************************************************************************
2694 
2695  //**Compile time checks*************************************************************************
2703  //**********************************************************************************************
2704 };
2705 //*************************************************************************************************
2706 
2707 
2708 
2709 
2710 //=================================================================================================
2711 //
2712 // DVECSCALARMULTEXPR SPECIALIZATION
2713 //
2714 //=================================================================================================
2715 
2716 //*************************************************************************************************
2724 template< typename VT // Type of the left-hand side dense vector
2725  , typename MT // Type of the right-hand side dense matrix
2726  , typename ST > // Type of the side scalar value
2727 class DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >
2728  : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true > >
2729  , private Computation
2730 {
2731  private:
2732  //**Type definitions****************************************************************************
// Private shorthand types used throughout this specialization:
//   VMM - the wrapped vector/matrix multiplication expression type
//   RES - result type of VMM
//   VRT - result type of the vector operand VT
//   MRT - result type of the matrix operand MT
//   VET - element type of VRT
//   MET - element type of MRT
//   VCT - composite type of the vector operand VT
//   MCT - composite type of the matrix operand MT
2733  using VMM = TDVecTDMatMultExpr<VT,MT>;
2734  using RES = ResultType_<VMM>;
2735  using VRT = ResultType_<VT>;
2736  using MRT = ResultType_<MT>;
2737  using VET = ElementType_<VRT>;
2738  using MET = ElementType_<MRT>;
2739  using VCT = CompositeType_<VT>;
2740  using MCT = CompositeType_<MT>;
2741  //**********************************************************************************************
2742 
2743  //**********************************************************************************************
// Compile-time flag: true when the vector operand type is a computation expression or
// requires an intermediate evaluation.
2745  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2746  //**********************************************************************************************
2747 
2748  //**********************************************************************************************
// Compile-time flag for the matrix operand. The visible part of the condition requires MT to
// be a computation expression whose element type equals the vector's element type.
// NOTE(review): the expression continues on listing line 2751, which is missing from this
// extract; the remaining terms and the closing of the enum cannot be seen here.
2750  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2752  //**********************************************************************************************
2753 
2754  //**********************************************************************************************
2756 
// Helper trait for the target type T1: 'value' is true when T1 is SMP-assignable and at least
// one of the two operands has to be evaluated (evaluateVector or evaluateMatrix).
2759  template< typename T1 >
2760  struct UseSMPAssign {
2761  enum : bool { value = T1::smpAssignable && ( evaluateVector || evaluateMatrix ) };
2762  };
2763  //**********************************************************************************************
2764 
2765  //**********************************************************************************************
2767 
// Helper trait over four types T1..T4. The fragments that survive in this extract require the
// first three types to be SIMD-enabled and T1 and T3 to share the same element type.
// NOTE(review): listing lines 2771-2775, 2777-2780 and 2782 are missing, so the start of the
// 'value' definition and the remaining conditions are not visible -- consult the full header
// before relying on this description.
2769  template< typename T1, typename T2, typename T3, typename T4 >
2770  struct UseBlasKernel {
2776  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2781  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2783  };
2784  //**********************************************************************************************
2785 
2786  //**********************************************************************************************
2788 
// Helper trait over four types T1..T4. From the visible terms, 'value' requires
// useOptimizedKernels, SIMD-enabled T1/T2/T3, a further condition involving T4, and SIMD
// addition and multiplication support for the element types of T2 and T3.
// NOTE(review): listing lines 2794 and 2796-2798 are missing from this extract, so at least one
// condition (the one ending in ', T4 >::value') is only partially visible.
2791  template< typename T1, typename T2, typename T3, typename T4 >
2792  struct UseVectorizedDefaultKernel {
2793  enum : bool { value = useOptimizedKernels &&
2795  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2799  , T4 >::value &&
2800  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2801  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
2802  };
2803  //**********************************************************************************************
2804 
2805  public:
2806  //**Type definitions****************************************************************************
// Public type aliases of the scaled multiplication expression.
// NOTE(review): listing lines 2809-2811 are missing from this extract; 'ElementType', which
// ReturnType refers to, is presumably declared there.
// ResultType: result of multiplying the product's result type RES by the scalar type ST.
2808  using ResultType = MultTrait_<RES,ST>;
// ReturnType: value type returned by element access (a const ElementType).
2812  using ReturnType = const ElementType;
// CompositeType: type used when this expression is an operand of another expression.
2813  using CompositeType = const ResultType;
2814 
// LeftOperand: the wrapped vector/matrix multiplication expression.
2816  using LeftOperand = const TDVecTDMatMultExpr<VT,MT>;
2817 
// RightOperand: the scalar, held by value.
2819  using RightOperand = ST;
2820 
2823 
2826  //**********************************************************************************************
2827 
2828  //**Compilation flags***************************************************************************
// Compilation flag simdEnabled: the visible terms require a non-diagonal matrix type and
// SIMD-enabled vector and matrix operand types.
// NOTE(review): the expression continues on listing lines 2832-2834, which are missing from
// this extract; the remaining terms and the closing of the enum are not visible.
2830  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
2831  VT::simdEnabled && MT::simdEnabled &&
2835 
// Compilation flag smpAssignable: true only when neither operand needs to be evaluated and
// both operand types are themselves SMP-assignable.
2837  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2838  !evaluateMatrix && MT::smpAssignable };
2839  //**********************************************************************************************
2840 
2841  //**SIMD properties*****************************************************************************
// SIMDSIZE is taken from SIMDTrait<ElementType>::size.
// NOTE(review): presumably the number of ElementType values held by one SIMD vector -- confirm.
2843  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
2844  //**********************************************************************************************
2845 
2846  //**Constructor*********************************************************************************
// Constructor: stores the vector/matrix multiplication expression and the scalar factor.
// \param vector The left-hand side multiplication expression.
// \param scalar The right-hand side scalar.
2852  explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )
2853  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2854  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2855  {}
2856  //**********************************************************************************************
2857 
2858  //**Subscript operator**************************************************************************
// Subscript operator: returns element 'index' of the wrapped expression multiplied by the
// scalar. The index is only checked by BLAZE_INTERNAL_ASSERT; use at() for a checked access.
// \param index Access index; must be smaller than size().
2864  inline ReturnType operator[]( size_t index ) const {
2865  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2866  return vector_[index] * scalar_;
2867  }
2868  //**********************************************************************************************
2869 
2870  //**At function*********************************************************************************
// Checked element access: invokes BLAZE_THROW_OUT_OF_RANGE when 'index' is not smaller than
// the size of the wrapped expression, otherwise returns the same value as operator[].
// \param index Access index.
2877  inline ReturnType at( size_t index ) const {
2878  if( index >= vector_.size() ) {
2879  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2880  }
2881  return (*this)[index];
2882  }
2883  //**********************************************************************************************
2884 
2885  //**Size function*******************************************************************************
// Returns the size of the expression, which is the size of the wrapped multiplication
// expression.
2890  inline size_t size() const {
2891  return vector_.size();
2892  }
2893  //**********************************************************************************************
2894 
2895  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the stored vector/matrix multiplication expression.
2900  inline LeftOperand leftOperand() const {
2901  return vector_;
2902  }
2903  //**********************************************************************************************
2904 
2905  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the stored scalar factor.
2910  inline RightOperand rightOperand() const {
2911  return scalar_;
2912  }
2913  //**********************************************************************************************
2914 
2915  //**********************************************************************************************
// Alias query: forwards the question to the wrapped multiplication expression and returns its
// canAlias() result.
// \param alias Address to check.
2921  template< typename T >
2922  inline bool canAlias( const T* alias ) const {
2923  return vector_.canAlias( alias );
2924  }
2925  //**********************************************************************************************
2926 
2927  //**********************************************************************************************
// Alias query: forwards the question to the wrapped multiplication expression and returns its
// isAliased() result.
// \param alias Address to check.
2933  template< typename T >
2934  inline bool isAliased( const T* alias ) const {
2935  return vector_.isAliased( alias );
2936  }
2937  //**********************************************************************************************
2938 
2939  //**********************************************************************************************
// Alignment query: returns the isAligned() result of the wrapped multiplication expression.
2944  inline bool isAligned() const {
2945  return vector_.isAligned();
2946  }
2947  //**********************************************************************************************
2948 
2949  //**********************************************************************************************
// Returns whether the expression can be used in an SMP assignment.
// The result is the conjunction of two parts: a disjunction of kernel-related conditions on
// the matrix operand A (visible terms: BLAS mode disabled, the matrix being an unevaluated
// computation, or rows*columns below TDVECTDMATMULT_THRESHOLD), and the requirement that
// size() exceeds SMP_TDVECTDMATMULT_THRESHOLD.
// NOTE(review): listing lines 2957-2958 are missing from this extract, so at least one term of
// the first disjunction is not visible here.
2954  inline bool canSMPAssign() const noexcept {
2955  RightOperand_<VMM> A( vector_.rightOperand() );
2956  return ( !BLAZE_BLAS_MODE ||
2959  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2960  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
2961  ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
2962  }
2963  //**********************************************************************************************
2964 
2965  private:
2966  //**Member variables****************************************************************************
// vector_: the wrapped vector/matrix multiplication expression (left-hand side operand).
// scalar_: the scalar factor (right-hand side operand).
2967  LeftOperand vector_;
2968  RightOperand scalar_;
2969  //**********************************************************************************************
2970 
2971  //**Assignment to dense vectors*****************************************************************
// Assignment of the scaled vector/matrix product to a dense vector.
// The vector and matrix operands of the wrapped product are extracted, evaluated serially into
// x and A, and handed together with the scalar to selectAssignKernel().
// \param lhs Target dense vector; its size is asserted to equal rhs.size().
// \param rhs The scaled multiplication expression to assign.
// NOTE(review): listing line 2987 is missing from this extract (the numbered blank line below
// marks the gap); the types LT and RT are declared outside the visible part of the class.
2983  template< typename VT1 // Type of the target dense vector
2984  , bool TF > // Transpose flag of the target dense vector
2985  friend inline void assign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
2986  {
2988 
2989  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2990 
2991  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
2992  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
2993 
// A matrix without rows resets the target; a matrix without columns leaves nothing to assign.
2994  if( right.rows() == 0UL ) {
2995  reset( ~lhs );
2996  return;
2997  }
2998  else if( right.columns() == 0UL ) {
2999  return;
3000  }
3001 
3002  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3003  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3004 
3005  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3006  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3007  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3008  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3009 
3010  DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
3011  }
3012  //**********************************************************************************************
3013 
3014  //**Assignment to dense vectors (kernel selection)**********************************************
// Kernel selection for the scaled assignment.
// The small-matrix kernel is chosen when the matrix type is diagonal, when the original matrix
// operand is a computation that is not evaluated, or when rows*columns of A is below
// TDVECTDMATMULT_THRESHOLD; in every other case the BLAS kernel is chosen.
// \param y Target vector.
// \param x Left-hand side vector operand.
// \param A Right-hand side matrix operand.
// \param scalar Scaling factor.
3025  template< typename VT1 // Type of the left-hand side target vector
3026  , typename VT2 // Type of the left-hand side vector operand
3027  , typename MT1 // Type of the right-hand side matrix operand
3028  , typename ST2 > // Type of the scalar value
3029  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3030  {
3031  if( ( IsDiagonal<MT1>::value ) ||
3032  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3033  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3034  selectSmallAssignKernel( y, x, A, scalar );
3035  else
3036  selectBlasAssignKernel( y, x, A, scalar );
3037  }
3038  //**********************************************************************************************
3039 
3040  //**Default assignment to dense vectors*********************************************************
// Default scaled assignment kernel: builds the expression 'x * A * scalar' and passes it to
// the target's assign() member.
// \param y Target vector.
// \param x Left-hand side vector operand.
// \param A Right-hand side matrix operand.
// \param scalar Scaling factor.
3054  template< typename VT1 // Type of the left-hand side target vector
3055  , typename VT2 // Type of the left-hand side vector operand
3056  , typename MT1 // Type of the right-hand side matrix operand
3057  , typename ST2 > // Type of the scalar value
3058  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3059  {
3060  y.assign( x * A * scalar );
3061  }
3062  //**********************************************************************************************
3063 
3064  //**Default assignment to dense vectors (small matrices)****************************************
// Small-matrix scaled assignment kernel, non-vectorized overload: no arithmetic is done here,
// all four arguments are forwarded to selectDefaultAssignKernel().
// NOTE(review): listing line 3082 (between the template header and the function name) is
// missing from this extract; the return type / overload condition is therefore not visible.
3078  template< typename VT1 // Type of the left-hand side target vector
3079  , typename VT2 // Type of the left-hand side vector operand
3080  , typename MT1 // Type of the right-hand side matrix operand
3081  , typename ST2 > // Type of the scalar value
3083  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3084  {
3085  selectDefaultAssignKernel( y, x, A, scalar );
3086  }
3087  //**********************************************************************************************
3088 
3089  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Vectorized "small matrix" kernel for the scaled product y = ( x * A ) * scalar.
// The columns of A are processed in groups of 8, 4, 3, 2 and finally 1. For every column j the
// kernel accumulates x[i] * A(i,j) over the relevant rows i with SIMD loads, reduces the
// accumulator with sum(), scales it and stores (not adds) the result in y[j].
// NOTE(review): the return-type line (listing line 3108) is not part of this excerpt.
//   y      : target vector, overwritten element by element
//   x      : vector operand, read via x.load(i) and x[i]
//   A      : matrix operand, read via A.load(i,j) and A(i,j)
//   scalar : factor applied to every computed element
3104  template< typename VT1 // Type of the left-hand side target vector
3105  , typename VT2 // Type of the left-hand side vector operand
3106  , typename MT1 // Type of the right-hand side matrix operand
3107  , typename ST2 > // Type of the scalar value
3109  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3110  {
   // A scalar tail loop is required unless both x and A are padded.
3111  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3112 
3113  const size_t M( A.rows() );
3114  const size_t N( A.columns() );
3115 
3116  size_t j( 0UL );
3117 
   // Groups of 8 columns.
3118  for( ; (j+8UL) <= N; j+=8UL )
3119  {
   // Lower matrices: start at row j (j+1 if strictly lower), rounded down to a multiple of
   // SIMDSIZE by the mask (assumes SIMDSIZE is a power of two - TODO confirm); otherwise row 0.
3120  const size_t ibegin( ( IsLower<MT1>::value )
3121  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3122  :( 0UL ) );
   // Upper matrices: stop at the row index one past the group's last column (one less if
   // strictly upper); otherwise traverse all M rows.
3123  const size_t iend( ( IsUpper<MT1>::value )
3124  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3125  :( M ) );
3126  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3127 
   // End of the SIMD loop: iend rounded down to a multiple of SIMDSIZE when a tail is needed.
3128  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3129  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3130 
   // NOTE(review): the accumulators are only ever updated with +=; this assumes SIMDType
   // default-constructs to zero - confirm.
3131  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3132  size_t i( ibegin );
3133 
3134  for( ; i<ipos; i+=SIMDSIZE ) {
3135  const SIMDType x1( x.load(i) );
3136  xmm1 += x1 * A.load(i,j );
3137  xmm2 += x1 * A.load(i,j+1UL);
3138  xmm3 += x1 * A.load(i,j+2UL);
3139  xmm4 += x1 * A.load(i,j+3UL);
3140  xmm5 += x1 * A.load(i,j+4UL);
3141  xmm6 += x1 * A.load(i,j+5UL);
3142  xmm7 += x1 * A.load(i,j+6UL);
3143  xmm8 += x1 * A.load(i,j+7UL);
3144  }
3145 
   // Store the scaled reductions; the scalar tail below adds the remaining rows.
3146  y[j ] = sum( xmm1 ) * scalar;
3147  y[j+1UL] = sum( xmm2 ) * scalar;
3148  y[j+2UL] = sum( xmm3 ) * scalar;
3149  y[j+3UL] = sum( xmm4 ) * scalar;
3150  y[j+4UL] = sum( xmm5 ) * scalar;
3151  y[j+5UL] = sum( xmm6 ) * scalar;
3152  y[j+6UL] = sum( xmm7 ) * scalar;
3153  y[j+7UL] = sum( xmm8 ) * scalar;
3154 
3155  for( ; remainder && i<iend; ++i ) {
3156  y[j ] += x[i] * A(i,j ) * scalar;
3157  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3158  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3159  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3160  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3161  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3162  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3163  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3164  }
3165  }
3166 
   // Same scheme for groups of 4 columns.
3167  for( ; (j+4UL) <= N; j+=4UL )
3168  {
3169  const size_t ibegin( ( IsLower<MT1>::value )
3170  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3171  :( 0UL ) );
3172  const size_t iend( ( IsUpper<MT1>::value )
3173  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3174  :( M ) );
3175  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3176 
3177  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3178  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3179 
3180  SIMDType xmm1, xmm2, xmm3, xmm4;
3181  size_t i( ibegin );
3182 
3183  for( ; i<ipos; i+=SIMDSIZE ) {
3184  const SIMDType x1( x.load(i) );
3185  xmm1 += x1 * A.load(i,j );
3186  xmm2 += x1 * A.load(i,j+1UL);
3187  xmm3 += x1 * A.load(i,j+2UL);
3188  xmm4 += x1 * A.load(i,j+3UL);
3189  }
3190 
3191  y[j ] = sum( xmm1 ) * scalar;
3192  y[j+1UL] = sum( xmm2 ) * scalar;
3193  y[j+2UL] = sum( xmm3 ) * scalar;
3194  y[j+3UL] = sum( xmm4 ) * scalar;
3195 
3196  for( ; remainder && i<iend; ++i ) {
3197  y[j ] += x[i] * A(i,j ) * scalar;
3198  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3199  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3200  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3201  }
3202  }
3203 
   // Same scheme for a group of 3 columns.
3204  for( ; (j+3UL) <= N; j+=3UL )
3205  {
3206  const size_t ibegin( ( IsLower<MT1>::value )
3207  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3208  :( 0UL ) );
3209  const size_t iend( ( IsUpper<MT1>::value )
3210  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3211  :( M ) );
3212  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3213 
3214  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3215  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3216 
3217  SIMDType xmm1, xmm2, xmm3;
3218  size_t i( ibegin );
3219 
3220  for( ; i<ipos; i+=SIMDSIZE ) {
3221  const SIMDType x1( x.load(i) );
3222  xmm1 += x1 * A.load(i,j );
3223  xmm2 += x1 * A.load(i,j+1UL);
3224  xmm3 += x1 * A.load(i,j+2UL);
3225  }
3226 
3227  y[j ] = sum( xmm1 ) * scalar;
3228  y[j+1UL] = sum( xmm2 ) * scalar;
3229  y[j+2UL] = sum( xmm3 ) * scalar;
3230 
3231  for( ; remainder && i<iend; ++i ) {
3232  y[j ] += x[i] * A(i,j ) * scalar;
3233  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3234  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3235  }
3236  }
3237 
   // Same scheme for a group of 2 columns.
3238  for( ; (j+2UL) <= N; j+=2UL )
3239  {
3240  const size_t ibegin( ( IsLower<MT1>::value )
3241  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3242  :( 0UL ) );
3243  const size_t iend( ( IsUpper<MT1>::value )
3244  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3245  :( M ) );
3246  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3247 
3248  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3249  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3250 
3251  SIMDType xmm1, xmm2;
3252  size_t i( ibegin );
3253 
3254  for( ; i<ipos; i+=SIMDSIZE ) {
3255  const SIMDType x1( x.load(i) );
3256  xmm1 += x1 * A.load(i,j );
3257  xmm2 += x1 * A.load(i,j+1UL);
3258  }
3259 
3260  y[j ] = sum( xmm1 ) * scalar;
3261  y[j+1UL] = sum( xmm2 ) * scalar;
3262 
3263  for( ; remainder && i<iend; ++i ) {
3264  y[j ] += x[i] * A(i,j ) * scalar;
3265  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3266  }
3267  }
3268 
   // At most one column is left at this point.
3269  if( j < N )
3270  {
3271  const size_t ibegin( ( IsLower<MT1>::value )
3272  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3273  :( 0UL ) );
3274  const size_t iend( ( IsUpper<MT1>::value )
3275  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3276  :( M ) );
3277  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3278 
3279  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3280  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3281 
3282  SIMDType xmm1;
3283  size_t i( ibegin );
3284 
3285  for( ; i<ipos; i+=SIMDSIZE ) {
3286  xmm1 += A.load(i,j) * x.load(i);
3287  }
3288 
3289  y[j] = sum( xmm1 ) * scalar;
3290 
3291  for( ; remainder && i<iend; ++i ) {
3292  y[j] += x[i] * A(i,j) * scalar;
3293  }
3294  }
3295  }
3296  //**********************************************************************************************
3297 
3298  //**Default assignment to dense vectors (large matrices)****************************************
// Fallback "large matrix" assignment kernel: does no work of its own and forwards all four
// arguments unchanged to selectDefaultAssignKernel().
// NOTE(review): the return-type line (listing line 3316) is not part of this excerpt; presumably
// it carries the condition that separates this overload from the vectorized one - confirm.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor
3312  template< typename VT1 // Type of the left-hand side target vector
3313  , typename VT2 // Type of the left-hand side vector operand
3314  , typename MT1 // Type of the right-hand side matrix operand
3315  , typename ST2 > // Type of the scalar value
3317  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3318  {
3319  selectDefaultAssignKernel( y, x, A, scalar );
3320  }
3321  //**********************************************************************************************
3322 
3323  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Vectorized "large matrix" kernel for the scaled product y = ( x * A ) * scalar.
// y is reset first; the columns of A are then handled in groups of 8, 4, 2 and finally 1.
// Within a group the rows are traversed by SIMD loops unrolled 4x, 2x and 1x, each step adding
// a horizontal sum() directly to y[j]. An optional scalar tail follows, and the finished
// elements of the group are multiplied by scalar once at the end.
// NOTE(review): the return-type line (listing line 3342) is not part of this excerpt.
//   y      : target vector, reset and then accumulated into
//   x      : vector operand, read via x.load(i) and x[i]
//   A      : matrix operand, read via A.load(i,j) and A(i,j)
//   scalar : factor applied to every computed element
3338  template< typename VT1 // Type of the left-hand side target vector
3339  , typename VT2 // Type of the left-hand side vector operand
3340  , typename MT1 // Type of the right-hand side matrix operand
3341  , typename ST2 > // Type of the scalar value
3343  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3344  {
   // A scalar tail loop is required unless both x and A are padded.
3345  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3346 
3347  const size_t M( A.rows() );
3348  const size_t N( A.columns() );
3349 
   // All loops below only add to y, so it has to start from its reset state.
3350  reset( y );
3351 
3352  size_t j( 0UL );
3353 
   // Groups of 8 columns.
3354  for( ; (j+8UL) <= N; j+=8UL )
3355  {
   // Lower matrices: start at row j (j+1 if strictly lower), rounded down to a multiple of
   // SIMDSIZE by the mask (assumes SIMDSIZE is a power of two - TODO confirm); otherwise row 0.
3356  const size_t ibegin( ( IsLower<MT1>::value )
3357  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3358  :( 0UL ) );
   // Upper matrices: stop at the row index one past the group's last column (one less if
   // strictly upper); otherwise traverse all M rows.
3359  const size_t iend( ( IsUpper<MT1>::value )
3360  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3361  :( M ) );
3362  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3363 
   // End of the SIMD loops: iend rounded down to a multiple of SIMDSIZE when a tail is needed.
3364  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3365  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3366 
3367  size_t i( ibegin );
3368 
   // Four SIMD vectors of x per iteration.
3369  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3370  const size_t i1( i+SIMDSIZE );
3371  const size_t i2( i+SIMDSIZE*2UL );
3372  const size_t i3( i+SIMDSIZE*3UL );
3373  const SIMDType x1( x.load(i ) );
3374  const SIMDType x2( x.load(i1) );
3375  const SIMDType x3( x.load(i2) );
3376  const SIMDType x4( x.load(i3) );
3377  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3378  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3379  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3380  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3381  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3382  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3383  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3384  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
3385  }
3386 
   // Two SIMD vectors of x per iteration.
3387  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3388  const size_t i1( i+SIMDSIZE );
3389  const SIMDType x1( x.load(i ) );
3390  const SIMDType x2( x.load(i1) );
3391  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3392  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3393  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3394  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3395  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3396  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3397  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3398  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
3399  }
3400 
   // One SIMD vector of x per iteration.
3401  for( ; i<ipos; i+=SIMDSIZE ) {
3402  const SIMDType x1( x.load(i) );
3403  y[j ] += sum( x1 * A.load(i,j ) );
3404  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3405  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3406  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3407  y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
3408  y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
3409  y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
3410  y[j+7UL] += sum( x1 * A.load(i,j+7UL) );
3411  }
3412 
   // Scalar tail for the rows between ipos and iend.
3413  for( ; remainder && i<iend; ++i ) {
3414  y[j ] += x[i] * A(i,j );
3415  y[j+1UL] += x[i] * A(i,j+1UL);
3416  y[j+2UL] += x[i] * A(i,j+2UL);
3417  y[j+3UL] += x[i] * A(i,j+3UL);
3418  y[j+4UL] += x[i] * A(i,j+4UL);
3419  y[j+5UL] += x[i] * A(i,j+5UL);
3420  y[j+6UL] += x[i] * A(i,j+6UL);
3421  y[j+7UL] += x[i] * A(i,j+7UL);
3422  }
3423 
   // The scaling is applied once per element, after all rows have been accumulated.
3424  y[j ] *= scalar;
3425  y[j+1UL] *= scalar;
3426  y[j+2UL] *= scalar;
3427  y[j+3UL] *= scalar;
3428  y[j+4UL] *= scalar;
3429  y[j+5UL] *= scalar;
3430  y[j+6UL] *= scalar;
3431  y[j+7UL] *= scalar;
3432  }
3433 
   // Same scheme for a group of 4 columns.
3434  for( ; (j+4UL) <= N; j+=4UL )
3435  {
3436  const size_t ibegin( ( IsLower<MT1>::value )
3437  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3438  :( 0UL ) );
3439  const size_t iend( ( IsUpper<MT1>::value )
3440  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3441  :( M ) );
3442  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3443 
3444  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3445  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3446 
3447  size_t i( ibegin );
3448 
3449  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3450  const size_t i1( i+SIMDSIZE );
3451  const size_t i2( i+SIMDSIZE*2UL );
3452  const size_t i3( i+SIMDSIZE*3UL );
3453  const SIMDType x1( x.load(i ) );
3454  const SIMDType x2( x.load(i1) );
3455  const SIMDType x3( x.load(i2) );
3456  const SIMDType x4( x.load(i3) );
3457  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3458  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3459  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3460  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3461  }
3462 
3463  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3464  const size_t i1( i+SIMDSIZE );
3465  const SIMDType x1( x.load(i ) );
3466  const SIMDType x2( x.load(i1) );
3467  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3468  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3469  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3470  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3471  }
3472 
3473  for( ; i<ipos; i+=SIMDSIZE ) {
3474  const SIMDType x1( x.load(i) );
3475  y[j ] += sum( x1 * A.load(i,j ) );
3476  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3477  y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
3478  y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
3479  }
3480 
3481  for( ; remainder && i<iend; ++i ) {
3482  y[j ] += x[i] * A(i,j );
3483  y[j+1UL] += x[i] * A(i,j+1UL);
3484  y[j+2UL] += x[i] * A(i,j+2UL);
3485  y[j+3UL] += x[i] * A(i,j+3UL);
3486  }
3487 
3488  y[j ] *= scalar;
3489  y[j+1UL] *= scalar;
3490  y[j+2UL] *= scalar;
3491  y[j+3UL] *= scalar;
3492  }
3493 
   // Same scheme for groups of 2 columns.
3494  for( ; (j+2UL) <= N; j+=2UL )
3495  {
3496  const size_t ibegin( ( IsLower<MT1>::value )
3497  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3498  :( 0UL ) );
3499  const size_t iend( ( IsUpper<MT1>::value )
3500  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3501  :( M ) );
3502  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3503 
3504  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3505  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3506 
3507  size_t i( ibegin );
3508 
3509  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3510  const size_t i1( i+SIMDSIZE );
3511  const size_t i2( i+SIMDSIZE*2UL );
3512  const size_t i3( i+SIMDSIZE*3UL );
3513  const SIMDType x1( x.load(i ) );
3514  const SIMDType x2( x.load(i1) );
3515  const SIMDType x3( x.load(i2) );
3516  const SIMDType x4( x.load(i3) );
3517  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3518  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3519  }
3520 
3521  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3522  const size_t i1( i+SIMDSIZE );
3523  const SIMDType x1( x.load(i ) );
3524  const SIMDType x2( x.load(i1) );
3525  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3526  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3527  }
3528 
3529  for( ; i<ipos; i+=SIMDSIZE ) {
3530  const SIMDType x1( x.load(i) );
3531  y[j ] += sum( x1 * A.load(i,j ) );
3532  y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
3533  }
3534 
3535  for( ; remainder && i<iend; ++i ) {
3536  y[j ] += x[i] * A(i,j );
3537  y[j+1UL] += x[i] * A(i,j+1UL);
3538  }
3539 
3540  y[j ] *= scalar;
3541  y[j+1UL] *= scalar;
3542  }
3543 
   // At most one column is left at this point.
3544  if( j < N )
3545  {
3546  const size_t ibegin( ( IsLower<MT1>::value )
3547  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3548  :( 0UL ) );
3549  const size_t iend( ( IsUpper<MT1>::value )
3550  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3551  :( M ) );
3552  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3553 
3554  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3555  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3556 
3557  size_t i( ibegin );
3558 
3559  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3560  const size_t i1( i+SIMDSIZE );
3561  const size_t i2( i+SIMDSIZE*2UL );
3562  const size_t i3( i+SIMDSIZE*3UL );
3563  const SIMDType x1( x.load(i ) );
3564  const SIMDType x2( x.load(i1) );
3565  const SIMDType x3( x.load(i2) );
3566  const SIMDType x4( x.load(i3) );
3567  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
3568  }
3569 
3570  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3571  const size_t i1( i+SIMDSIZE );
3572  const SIMDType x1( x.load(i ) );
3573  const SIMDType x2( x.load(i1) );
3574  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
3575  }
3576 
3577  for( ; i<ipos; i+=SIMDSIZE ) {
3578  const SIMDType x1( x.load(i) );
3579  y[j] += sum( x1 * A.load(i,j) );
3580  }
3581 
3582  for( ; remainder && i<iend; ++i ) {
3583  y[j] += x[i] * A(i,j);
3584  }
3585 
3586  y[j] *= scalar;
3587  }
3588  }
3589  //**********************************************************************************************
3590 
3591  //**BLAS-based assignment to dense vectors (default)********************************************
// Default "BLAS" assignment kernel: no BLAS routine is called here; all four arguments are
// forwarded unchanged to selectLargeAssignKernel().
// NOTE(review): the return-type line (listing line 3608) is not part of this excerpt; presumably
// it carries the condition that separates this overload from the BLAS-backed one - confirm.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor
3604  template< typename VT1 // Type of the left-hand side target vector
3605  , typename VT2 // Type of the left-hand side vector operand
3606  , typename MT1 // Type of the right-hand side matrix operand
3607  , typename ST2 > // Type of the scalar value
3609  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3610  {
3611  selectLargeAssignKernel( y, x, A, scalar );
3612  }
3613  //**********************************************************************************************
3614 
3615  //**BLAS-based assignment to dense vectors******************************************************
3616 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3617 
// BLAS-backed assignment kernel; the surrounding #if compiles it only when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are both nonzero.
//  - Triangular A: y is first assigned scalar * x and then handed to trmv() together with A and
//    the CblasLower / CblasUpper flag chosen from IsLower<MT1>.
//  - Otherwise: a single gemv() call, with scalar and 0 converted to the element type of y
//    as its fourth and fifth arguments.
// NOTE(review): the return-type line (listing line 3634) is not part of this excerpt.
3630  template< typename VT1 // Type of the left-hand side target vector
3631  , typename VT2 // Type of the left-hand side vector operand
3632  , typename MT1 // Type of the right-hand side matrix operand
3633  , typename ST2 > // Type of the scalar value
3635  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3636  {
3637  using ET = ElementType_<VT1>;
3638 
3639  if( IsTriangular<MT1>::value ) {
   // y must hold the scaled vector before trmv() is invoked on it.
3640  assign( y, scalar * x );
3641  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3642  }
3643  else {
3644  gemv( y, x, A, ET(scalar), ET(0) );
3645  }
3646  }
3647 #endif
3648  //**********************************************************************************************
3649 
3650  //**Assignment to sparse vectors****************************************************************
// Assignment of the scaled vector/matrix product to a sparse vector.
// The expression is evaluated via serial() into a temporary of type ResultType, and that
// temporary is then assigned to the target.
//   lhs : target sparse vector
//   rhs : scaled multiplication expression; its size must equal the size of lhs (internal assert)
3662  template< typename VT1 // Type of the target sparse vector
3663  , bool TF > // Transpose flag of the target sparse vector
3664  friend inline void assign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3665  {
3667 
3671 
3672  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3673 
3674  const ResultType tmp( serial( rhs ) );
3675  assign( ~lhs, tmp );
3676  }
3677  //**********************************************************************************************
3678 
3679  //**Addition assignment to dense vectors********************************************************
// Addition assignment of the scaled vector/matrix product to a dense vector.
// Returns early, leaving lhs untouched, when the matrix operand has zero rows or zero columns.
// Otherwise both operands are evaluated through serial() into objects of type LT / RT and the
// work is handed to selectAddAssignKernel() together with rhs.scalar_.
//   lhs : target dense vector; its size must equal rhs.size() (internal assert)
//   rhs : scaled multiplication expression providing vector_ (the product) and scalar_
3691  template< typename VT1 // Type of the target dense vector
3692  , bool TF > // Transpose flag of the target dense vector
3693  friend inline void addAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
3694  {
3696 
3697  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3698 
3699  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3700  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3701 
   // An empty matrix operand contributes nothing to the target.
3702  if( right.rows() == 0UL || right.columns() == 0UL ) {
3703  return;
3704  }
3705 
3706  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3707  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3708 
   // The evaluated operands must have the same dimensions as the original operands.
3709  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3710  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3711  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3712  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3713 
3714  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3715  }
3716  //**********************************************************************************************
3717 
3718  //**Addition assignment to dense vectors (kernel selection)*************************************
// Chooses the kernel for the addition assignment y += ( x * A ) * scalar.
// The small kernel is used when A is diagonal, when the matrix operand type MT is a computation
// that is not evaluated (evaluateMatrix is false), or when the number of matrix elements is
// below the threshold; in every other case the BLAS kernel is selected.
// The size cut-off is TDVECTDMATMULT_THRESHOLD, the threshold belonging to the transpose dense
// vector / column-major dense matrix product implemented by this expression class
// (TDVECDMATMULT_THRESHOLD is the one for the row-major product).
//   y      : target vector the product is added to
//   x      : evaluated vector operand
//   A      : evaluated matrix operand
//   scalar : scaling factor of the product
3729  template< typename VT1 // Type of the left-hand side target vector
3730  , typename VT2 // Type of the left-hand side vector operand
3731  , typename MT1 // Type of the right-hand side matrix operand
3732  , typename ST2 > // Type of the scalar value
3733  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3734  {
3735  if( ( IsDiagonal<MT1>::value ) ||
3736  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3737  ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3738  selectSmallAddAssignKernel( y, x, A, scalar );
3739  else
3740  selectBlasAddAssignKernel( y, x, A, scalar );
3741  }
3742  //**********************************************************************************************
3743 
3744  //**Default addition assignment to dense vectors************************************************
// Default addition-assignment kernel: builds the expression x * A * scalar and passes it to
// the addAssign() member of the target vector y.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor
3758  template< typename VT1 // Type of the left-hand side target vector
3759  , typename VT2 // Type of the left-hand side vector operand
3760  , typename MT1 // Type of the right-hand side matrix operand
3761  , typename ST2 > // Type of the scalar value
3762  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3763  {
3764  y.addAssign( x * A * scalar );
3765  }
3766  //**********************************************************************************************
3767 
3768  //**Default addition assignment to dense vectors (small matrices)*******************************
// Fallback "small matrix" addition-assignment kernel: does no work of its own and forwards all
// four arguments unchanged to selectDefaultAddAssignKernel().
// NOTE(review): the return-type line (listing line 3786) is not part of this excerpt; presumably
// it carries the condition that separates this overload from the vectorized one - confirm.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor
3782  template< typename VT1 // Type of the left-hand side target vector
3783  , typename VT2 // Type of the left-hand side vector operand
3784  , typename MT1 // Type of the right-hand side matrix operand
3785  , typename ST2 > // Type of the scalar value
3787  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3788  {
3789  selectDefaultAddAssignKernel( y, x, A, scalar );
3790  }
3791  //**********************************************************************************************
3792 
3793  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized "small matrix" kernel for the scaled addition assignment y += ( x * A ) * scalar.
// The columns of A are processed in groups of 8, 4, 3, 2 and finally 1. For every column j the
// kernel accumulates x[i] * A(i,j) over the relevant rows i with SIMD loads, reduces the
// accumulator with sum(), scales it and adds the result to the existing value of y[j].
// NOTE(review): the return-type line (listing line 3812) is not part of this excerpt.
//   y      : target vector, updated element by element
//   x      : vector operand, read via x.load(i) and x[i]
//   A      : matrix operand, read via A.load(i,j) and A(i,j)
//   scalar : factor applied to every computed contribution
3808  template< typename VT1 // Type of the left-hand side target vector
3809  , typename VT2 // Type of the left-hand side vector operand
3810  , typename MT1 // Type of the right-hand side matrix operand
3811  , typename ST2 > // Type of the scalar value
3813  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3814  {
   // A scalar tail loop is required unless both x and A are padded.
3815  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
3816 
3817  const size_t M( A.rows() );
3818  const size_t N( A.columns() );
3819 
3820  size_t j( 0UL );
3821 
   // Groups of 8 columns.
3822  for( ; (j+8UL) <= N; j+=8UL )
3823  {
   // Lower matrices: start at row j (j+1 if strictly lower), rounded down to a multiple of
   // SIMDSIZE by the mask (assumes SIMDSIZE is a power of two - TODO confirm); otherwise row 0.
3824  const size_t ibegin( ( IsLower<MT1>::value )
3825  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3826  :( 0UL ) );
   // Upper matrices: stop at the row index one past the group's last column (one less if
   // strictly upper); otherwise traverse all M rows.
3827  const size_t iend( ( IsUpper<MT1>::value )
3828  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
3829  :( M ) );
3830  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3831 
   // End of the SIMD loop: iend rounded down to a multiple of SIMDSIZE when a tail is needed.
3832  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3833  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3834 
   // NOTE(review): the accumulators are only ever updated with +=; this assumes SIMDType
   // default-constructs to zero - confirm.
3835  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3836  size_t i( ibegin );
3837 
3838  for( ; i<ipos; i+=SIMDSIZE ) {
3839  const SIMDType x1( x.load(i) );
3840  xmm1 += x1 * A.load(i,j );
3841  xmm2 += x1 * A.load(i,j+1UL);
3842  xmm3 += x1 * A.load(i,j+2UL);
3843  xmm4 += x1 * A.load(i,j+3UL);
3844  xmm5 += x1 * A.load(i,j+4UL);
3845  xmm6 += x1 * A.load(i,j+5UL);
3846  xmm7 += x1 * A.load(i,j+6UL);
3847  xmm8 += x1 * A.load(i,j+7UL);
3848  }
3849 
   // Add the scaled reductions to y; the scalar tail below adds the remaining rows.
3850  y[j ] += sum( xmm1 ) * scalar;
3851  y[j+1UL] += sum( xmm2 ) * scalar;
3852  y[j+2UL] += sum( xmm3 ) * scalar;
3853  y[j+3UL] += sum( xmm4 ) * scalar;
3854  y[j+4UL] += sum( xmm5 ) * scalar;
3855  y[j+5UL] += sum( xmm6 ) * scalar;
3856  y[j+6UL] += sum( xmm7 ) * scalar;
3857  y[j+7UL] += sum( xmm8 ) * scalar;
3858 
3859  for( ; remainder && i<iend; ++i ) {
3860  y[j ] += x[i] * A(i,j ) * scalar;
3861  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3862  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3863  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3864  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3865  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3866  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3867  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3868  }
3869  }
3870 
   // Same scheme for groups of 4 columns.
3871  for( ; (j+4UL) <= N; j+=4UL )
3872  {
3873  const size_t ibegin( ( IsLower<MT1>::value )
3874  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3875  :( 0UL ) );
3876  const size_t iend( ( IsUpper<MT1>::value )
3877  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3878  :( M ) );
3879  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3880 
3881  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3882  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3883 
3884  SIMDType xmm1, xmm2, xmm3, xmm4;
3885  size_t i( ibegin );
3886 
3887  for( ; i<ipos; i+=SIMDSIZE ) {
3888  const SIMDType x1( x.load(i) );
3889  xmm1 += x1 * A.load(i,j );
3890  xmm2 += x1 * A.load(i,j+1UL);
3891  xmm3 += x1 * A.load(i,j+2UL);
3892  xmm4 += x1 * A.load(i,j+3UL);
3893  }
3894 
3895  y[j ] += sum( xmm1 ) * scalar;
3896  y[j+1UL] += sum( xmm2 ) * scalar;
3897  y[j+2UL] += sum( xmm3 ) * scalar;
3898  y[j+3UL] += sum( xmm4 ) * scalar;
3899 
3900  for( ; remainder && i<iend; ++i ) {
3901  y[j ] += x[i] * A(i,j ) * scalar;
3902  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3903  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3904  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3905  }
3906  }
3907 
   // Same scheme for a group of 3 columns.
3908  for( ; (j+3UL) <= N; j+=3UL )
3909  {
3910  const size_t ibegin( ( IsLower<MT1>::value )
3911  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3912  :( 0UL ) );
3913  const size_t iend( ( IsUpper<MT1>::value )
3914  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3915  :( M ) );
3916  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3917 
3918  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3919  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3920 
3921  SIMDType xmm1, xmm2, xmm3;
3922  size_t i( ibegin );
3923 
3924  for( ; i<ipos; i+=SIMDSIZE ) {
3925  const SIMDType x1( x.load(i) );
3926  xmm1 += x1 * A.load(i,j );
3927  xmm2 += x1 * A.load(i,j+1UL);
3928  xmm3 += x1 * A.load(i,j+2UL);
3929  }
3930 
3931  y[j ] += sum( xmm1 ) * scalar;
3932  y[j+1UL] += sum( xmm2 ) * scalar;
3933  y[j+2UL] += sum( xmm3 ) * scalar;
3934 
3935  for( ; remainder && i<iend; ++i ) {
3936  y[j ] += x[i] * A(i,j ) * scalar;
3937  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3938  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3939  }
3940  }
3941 
   // Same scheme for a group of 2 columns.
3942  for( ; (j+2UL) <= N; j+=2UL )
3943  {
3944  const size_t ibegin( ( IsLower<MT1>::value )
3945  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3946  :( 0UL ) );
3947  const size_t iend( ( IsUpper<MT1>::value )
3948  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3949  :( M ) );
3950  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3951 
3952  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3953  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3954 
3955  SIMDType xmm1, xmm2;
3956  size_t i( ibegin );
3957 
3958  for( ; i<ipos; i+=SIMDSIZE ) {
3959  const SIMDType x1( x.load(i) );
3960  xmm1 += x1 * A.load(i,j );
3961  xmm2 += x1 * A.load(i,j+1UL);
3962  }
3963 
3964  y[j ] += sum( xmm1 ) * scalar;
3965  y[j+1UL] += sum( xmm2 ) * scalar;
3966 
3967  for( ; remainder && i<iend; ++i ) {
3968  y[j ] += x[i] * A(i,j ) * scalar;
3969  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3970  }
3971  }
3972 
   // At most one column is left at this point.
3973  if( j < N )
3974  {
3975  const size_t ibegin( ( IsLower<MT1>::value )
3976  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
3977  :( 0UL ) );
3978  const size_t iend( ( IsUpper<MT1>::value )
3979  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3980  :( M ) );
3981  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3982 
3983  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
3984  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
3985 
3986  SIMDType xmm1;
3987  size_t i( ibegin );
3988 
3989  for( ; i<ipos; i+=SIMDSIZE ) {
3990  xmm1 += A.load(i,j) * x.load(i);
3991  }
3992 
3993  y[j] += sum( xmm1 ) * scalar;
3994 
3995  for( ; remainder && i<iend; ++i ) {
3996  y[j] += x[i] * A(i,j) * scalar;
3997  }
3998  }
3999  }
4000  //**********************************************************************************************
4001 
4002  //**Default addition assignment to dense vectors (large matrices)*******************************
// Fallback "large matrix" addition-assignment kernel: does no work of its own and forwards all
// four arguments unchanged to selectDefaultAddAssignKernel().
// NOTE(review): the return-type line (listing line 4020) is not part of this excerpt; presumably
// it carries the condition that separates this overload from the vectorized one - confirm.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor
4016  template< typename VT1 // Type of the left-hand side target vector
4017  , typename VT2 // Type of the left-hand side vector operand
4018  , typename MT1 // Type of the right-hand side matrix operand
4019  , typename ST2 > // Type of the scalar value
4021  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4022  {
4023  selectDefaultAddAssignKernel( y, x, A, scalar );
4024  }
4025  //**********************************************************************************************
4026 
4027  //**Vectorized default addition assignment to dense vectors (large matrices)********************
4042  template< typename VT1 // Type of the left-hand side target vector
4043  , typename VT2 // Type of the left-hand side vector operand
4044  , typename MT1 // Type of the right-hand side matrix operand
4045  , typename ST2 > // Type of the scalar value
4047  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4048  {
4049  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4050 
4051  const size_t M( A.rows() );
4052  const size_t N( A.columns() );
4053 
4054  size_t j( 0UL );
4055 
4056  for( ; (j+8UL) <= N; j+=8UL )
4057  {
4058  const size_t ibegin( ( IsLower<MT1>::value )
4059  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4060  :( 0UL ) );
4061  const size_t iend( ( IsUpper<MT1>::value )
4062  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4063  :( M ) );
4064  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4065 
4066  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4067  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4068 
4069  size_t i( ibegin );
4070 
4071  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4072  const size_t i1( i+SIMDSIZE );
4073  const size_t i2( i+SIMDSIZE*2UL );
4074  const size_t i3( i+SIMDSIZE*3UL );
4075  const SIMDType x1( x.load(i ) );
4076  const SIMDType x2( x.load(i1) );
4077  const SIMDType x3( x.load(i2) );
4078  const SIMDType x4( x.load(i3) );
4079  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4080  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4081  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4082  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4083  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4084  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4085  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4086  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4087  }
4088 
4089  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4090  const size_t i1( i+SIMDSIZE );
4091  const SIMDType x1( x.load(i ) );
4092  const SIMDType x2( x.load(i1) );
4093  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4094  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4095  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4096  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4097  y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4098  y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4099  y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4100  y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4101  }
4102 
4103  for( ; i<ipos; i+=SIMDSIZE ) {
4104  const SIMDType x1( x.load(i) );
4105  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4106  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4107  y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4108  y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4109  y[j+4UL] += sum( x1 * A.load(i,j+4UL) ) * scalar;
4110  y[j+5UL] += sum( x1 * A.load(i,j+5UL) ) * scalar;
4111  y[j+6UL] += sum( x1 * A.load(i,j+6UL) ) * scalar;
4112  y[j+7UL] += sum( x1 * A.load(i,j+7UL) ) * scalar;
4113  }
4114 
4115  for( ; remainder && i<iend; ++i ) {
4116  y[j ] += x[i] * A(i,j ) * scalar;
4117  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4118  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4119  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4120  y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4121  y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4122  y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4123  y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4124  }
4125  }
4126 
4127  for( ; (j+4UL) <= N; j+=4UL )
4128  {
4129  const size_t ibegin( ( IsLower<MT1>::value )
4130  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4131  :( 0UL ) );
4132  const size_t iend( ( IsUpper<MT1>::value )
4133  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4134  :( M ) );
4135  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4136 
4137  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4138  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4139 
4140  size_t i( ibegin );
4141 
4142  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4143  const size_t i1( i+SIMDSIZE );
4144  const size_t i2( i+SIMDSIZE*2UL );
4145  const size_t i3( i+SIMDSIZE*3UL );
4146  const SIMDType x1( x.load(i ) );
4147  const SIMDType x2( x.load(i1) );
4148  const SIMDType x3( x.load(i2) );
4149  const SIMDType x4( x.load(i3) );
4150  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4151  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4152  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4153  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4154  }
4155 
4156  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4157  const size_t i1( i+SIMDSIZE );
4158  const SIMDType x1( x.load(i ) );
4159  const SIMDType x2( x.load(i1) );
4160  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4161  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4162  y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4163  y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4164  }
4165 
4166  for( ; i<ipos; i+=SIMDSIZE ) {
4167  const SIMDType x1( x.load(i) );
4168  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4169  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4170  y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
4171  y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
4172  }
4173 
4174  for( ; remainder && i<iend; ++i ) {
4175  y[j ] += x[i] * A(i,j ) * scalar;
4176  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4177  y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4178  y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4179  }
4180  }
4181 
4182  for( ; (j+2UL) <= N; j+=2UL )
4183  {
4184  const size_t ibegin( ( IsLower<MT1>::value )
4185  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4186  :( 0UL ) );
4187  const size_t iend( ( IsUpper<MT1>::value )
4188  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4189  :( M ) );
4190  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4191 
4192  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4193  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4194 
4195  size_t i( ibegin );
4196 
4197  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4198  const size_t i1( i+SIMDSIZE );
4199  const size_t i2( i+SIMDSIZE*2UL );
4200  const size_t i3( i+SIMDSIZE*3UL );
4201  const SIMDType x1( x.load(i ) );
4202  const SIMDType x2( x.load(i1) );
4203  const SIMDType x3( x.load(i2) );
4204  const SIMDType x4( x.load(i3) );
4205  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4206  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4207  }
4208 
4209  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4210  const size_t i1( i+SIMDSIZE );
4211  const SIMDType x1( x.load(i ) );
4212  const SIMDType x2( x.load(i1) );
4213  y[j ] += sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4214  y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4215  }
4216 
4217  for( ; i<ipos; i+=SIMDSIZE ) {
4218  const SIMDType x1( x.load(i) );
4219  y[j ] += sum( x1 * A.load(i,j ) ) * scalar;
4220  y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
4221  }
4222 
4223  for( ; remainder && i<iend; ++i ) {
4224  y[j ] += x[i] * A(i,j ) * scalar;
4225  y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4226  }
4227  }
4228 
4229  if( j < N )
4230  {
4231  const size_t ibegin( ( IsLower<MT1>::value )
4232  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4233  :( 0UL ) );
4234  const size_t iend( ( IsUpper<MT1>::value )
4235  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4236  :( M ) );
4237  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4238 
4239  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4240  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4241 
4242  size_t i( ibegin );
4243 
4244  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4245  const size_t i1( i+SIMDSIZE );
4246  const size_t i2( i+SIMDSIZE*2UL );
4247  const size_t i3( i+SIMDSIZE*3UL );
4248  const SIMDType x1( x.load(i ) );
4249  const SIMDType x2( x.load(i1) );
4250  const SIMDType x3( x.load(i2) );
4251  const SIMDType x4( x.load(i3) );
4252  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4253  }
4254 
4255  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4256  const size_t i1( i+SIMDSIZE );
4257  const SIMDType x1( x.load(i ) );
4258  const SIMDType x2( x.load(i1) );
4259  y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4260  }
4261 
4262  for( ; i<ipos; i+=SIMDSIZE ) {
4263  const SIMDType x1( x.load(i) );
4264  y[j] += sum( x1 * A.load(i,j) ) * scalar;
4265  }
4266 
4267  for( ; remainder && i<iend; ++i ) {
4268  y[j] += x[i] * A(i,j) * scalar;
4269  }
4270  }
4271  }
4272  //**********************************************************************************************
4273 
4274  //**BLAS-based addition assignment to dense vectors (default)***********************************
// Fallback overload of the BLAS addition-assignment kernel (y += scalar * x * A).
// It performs no BLAS call itself; it forwards all four arguments unchanged to
// selectLargeAddAssignKernel().
// NOTE(review): the declaration line carrying the return type (and whatever condition
// separates this overload from the BLAS-backed one) is not visible in this listing --
// confirm against the full header.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
4289  template< typename VT1 // Type of the left-hand side target vector
4290  , typename VT2 // Type of the left-hand side vector operand
4291  , typename MT1 // Type of the right-hand side matrix operand
4292  , typename ST2 > // Type of the scalar value
4294  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4295  {
4296  selectLargeAddAssignKernel( y, x, A, scalar );
4297  }
4298  //**********************************************************************************************
4299 
4300  //**BLAS-based addition assignment to dense vectors*********************************************
4301 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4302 
// BLAS-backed overload of the addition-assignment kernel (y += scalar * x * A).
// Only compiled when BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// are both enabled (see the surrounding #if).
// NOTE(review): the declaration line carrying the return type / enabling condition is
// not visible in this listing -- confirm against the full header.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
4315  template< typename VT1 // Type of the left-hand side target vector
4316  , typename VT2 // Type of the left-hand side vector operand
4317  , typename MT1 // Type of the right-hand side matrix operand
4318  , typename ST2 > // Type of the scalar value
4320  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4321  {
4322  using ET = ElementType_<VT1>;
4323 
      // Triangular A: build the scaled vector in a temporary, hand it to trmv() together
      // with the lower/upper flag, then add the temporary to y.
      // NOTE(review): presumably trmv() overwrites tmp with tmp * A -- confirm.
4324  if( IsTriangular<MT1>::value ) {
4325  ResultType_<VT1> tmp( serial( scalar * x ) );
4326  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4327  addAssign( y, tmp );
4328  }
      // General A: single gemv() call with the factors scalar and 1, both converted to the
      // element type of y.
      // NOTE(review): presumably y = scalar * x * A + 1 * y -- confirm the argument order.
4329  else {
4330  gemv( y, x, A, ET(scalar), ET(1) );
4331  }
4332  }
4333 #endif
4334  //**********************************************************************************************
4335 
4336  //**Addition assignment to sparse vectors*******************************************************
4337  // No special implementation for the addition assignment to sparse vectors.
4338  //**********************************************************************************************
4339 
4340  //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of the scaled vector/matrix product to a dense vector
// (lhs -= scalar * x * A).
// The operands are taken from rhs.vector_, evaluated serially into the types LT and RT
// (declared outside this view), and passed together with rhs.scalar_ to
// selectSubAssignKernel(). Nothing is done when the matrix operand has zero rows or
// zero columns.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
4352  template< typename VT1 // Type of the target dense vector
4353  , bool TF > // Transpose flag of the target dense vector
4354  friend inline void subAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
4355  {
4357 
4358  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4359 
4360  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4361  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4362 
      // An empty matrix operand contributes nothing, so lhs is left untouched.
4363  if( right.rows() == 0UL || right.columns() == 0UL ) {
4364  return;
4365  }
4366 
4367  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
4368  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4369 
      // The evaluated operands must keep the dimensions of the unevaluated ones, and the
      // number of columns of A must equal the size of the target vector.
4370  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4371  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4372  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4373  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4374 
4375  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
4376  }
4377  //**********************************************************************************************
4378 
4379  //**Subtraction assignment to dense vectors (kernel selection)**********************************
// Chooses the kernel for the subtraction assignment y -= scalar * x * A.
// The small kernel is used when MT1 is diagonal, when the class-level matrix type MT is
// a computation that is not evaluated (evaluateMatrix is declared outside this view),
// or when A has fewer elements than the threshold; otherwise the BLAS kernel is used.
// NOTE(review): the threshold constant is spelled TDVECDMATMULT_THRESHOLD although this
// file is TDVecTDMatMultExpr.h -- confirm it is the intended constant and not
// TDVECTDMATMULT_THRESHOLD.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
4390  template< typename VT1 // Type of the left-hand side target vector
4391  , typename VT2 // Type of the left-hand side vector operand
4392  , typename MT1 // Type of the right-hand side matrix operand
4393  , typename ST2 > // Type of the scalar value
4394  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4395  {
4396  if( ( IsDiagonal<MT1>::value ) ||
4397  ( IsComputation<MT>::value && !evaluateMatrix ) ||
4398  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
4399  selectSmallSubAssignKernel( y, x, A, scalar );
4400  else
4401  selectBlasSubAssignKernel( y, x, A, scalar );
4402  }
4403  //**********************************************************************************************
4404 
4405  //**Default subtraction assignment to dense vectors*********************************************
// Default (non-vectorized) kernel for the subtraction assignment y -= scalar * x * A.
// It builds the expression x * A * scalar and passes it to the subAssign() member of
// the target vector; no hand-written loops are involved here.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
4419  template< typename VT1 // Type of the left-hand side target vector
4420  , typename VT2 // Type of the left-hand side vector operand
4421  , typename MT1 // Type of the right-hand side matrix operand
4422  , typename ST2 > // Type of the scalar value
4423  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4424  {
4425  y.subAssign( x * A * scalar );
4426  }
4427  //**********************************************************************************************
4428 
4429  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Non-vectorized overload of the small-matrix subtraction-assignment kernel
// (y -= scalar * x * A). It forwards all four arguments unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): the declaration line carrying the return type (and whatever condition
// separates this overload from the vectorized one) is not visible in this listing --
// confirm against the full header.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
4443  template< typename VT1 // Type of the left-hand side target vector
4444  , typename VT2 // Type of the left-hand side vector operand
4445  , typename MT1 // Type of the right-hand side matrix operand
4446  , typename ST2 > // Type of the scalar value
4448  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4449  {
4450  selectDefaultSubAssignKernel( y, x, A, scalar );
4451  }
4452  //**********************************************************************************************
4453 
4454  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized small-matrix kernel for the subtraction assignment y -= scalar * x * A.
// The columns of A are processed in groups of 8, 4, 3, 2 and finally a single column.
// For each group, SIMD products of x and the group's columns are accumulated over the
// rows in one register per column; each register is then reduced with sum(), multiplied
// by scalar and subtracted from the matching element of y. Rows that remain after the
// last full SIMD step are handled by a scalar loop.
// NOTE(review): the declaration line carrying the return type / enabling condition is
// not visible in this listing -- confirm against the full header.
//
// \param y The target left-hand side dense vector (indexed by the columns of A).
// \param x The left-hand side dense vector operand (indexed by the rows of A).
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
4469  template< typename VT1 // Type of the left-hand side target vector
4470  , typename VT2 // Type of the left-hand side vector operand
4471  , typename MT1 // Type of the right-hand side matrix operand
4472  , typename ST2 > // Type of the scalar value
4474  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4475  {
      // A scalar tail loop is required unless both x and A are padded.
4476  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4477 
4478  const size_t M( A.rows() );
4479  const size_t N( A.columns() );
4480 
4481  size_t j( 0UL );
4482 
      // Groups of eight columns.
4483  for( ; (j+8UL) <= N; j+=8UL )
4484  {
      // Lower matrices: start at row j (j+1 if strictly lower), rounded down with the
      // mask size_t(-SIMDSIZE) (presumes SIMDSIZE is a power of two); otherwise row 0.
4485  const size_t ibegin( ( IsLower<MT1>::value )
4486  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4487  :( 0UL ) );
      // Upper matrices: only the first j+8 rows (j+7 if strictly upper) are visited;
      // otherwise all M rows.
4488  const size_t iend( ( IsUpper<MT1>::value )
4489  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4490  :( M ) );
4491  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4492 
      // End of the SIMD range: iend rounded down to a multiple of SIMDSIZE when a tail
      // loop exists, iend itself otherwise.
4493  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4494  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4495 
      // One accumulator per column of the group.
      // NOTE(review): presumably zero-initialized by SIMDType's default constructor -- confirm.
4496  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4497  size_t i( ibegin );
4498 
4499  for( ; i<ipos; i+=SIMDSIZE ) {
4500  const SIMDType x1( x.load(i) );
4501  xmm1 += x1 * A.load(i,j );
4502  xmm2 += x1 * A.load(i,j+1UL);
4503  xmm3 += x1 * A.load(i,j+2UL);
4504  xmm4 += x1 * A.load(i,j+3UL);
4505  xmm5 += x1 * A.load(i,j+4UL);
4506  xmm6 += x1 * A.load(i,j+5UL);
4507  xmm7 += x1 * A.load(i,j+6UL);
4508  xmm8 += x1 * A.load(i,j+7UL);
4509  }
4510 
      // Reduce each accumulator, scale it and subtract it from y.
4511  y[j ] -= sum( xmm1 ) * scalar;
4512  y[j+1UL] -= sum( xmm2 ) * scalar;
4513  y[j+2UL] -= sum( xmm3 ) * scalar;
4514  y[j+3UL] -= sum( xmm4 ) * scalar;
4515  y[j+4UL] -= sum( xmm5 ) * scalar;
4516  y[j+5UL] -= sum( xmm6 ) * scalar;
4517  y[j+6UL] -= sum( xmm7 ) * scalar;
4518  y[j+7UL] -= sum( xmm8 ) * scalar;
4519 
      // Scalar tail for the rows in [ipos, iend); never entered when remainder is false.
4520  for( ; remainder && i<iend; ++i ) {
4521  y[j ] -= x[i] * A(i,j ) * scalar;
4522  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4523  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4524  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4525  y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4526  y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4527  y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4528  y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4529  }
4530  }
4531 
      // Groups of four columns; same scheme as above.
4532  for( ; (j+4UL) <= N; j+=4UL )
4533  {
4534  const size_t ibegin( ( IsLower<MT1>::value )
4535  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4536  :( 0UL ) );
4537  const size_t iend( ( IsUpper<MT1>::value )
4538  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4539  :( M ) );
4540  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4541 
4542  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4543  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4544 
4545  SIMDType xmm1, xmm2, xmm3, xmm4;
4546  size_t i( ibegin );
4547 
4548  for( ; i<ipos; i+=SIMDSIZE ) {
4549  const SIMDType x1( x.load(i) );
4550  xmm1 += x1 * A.load(i,j );
4551  xmm2 += x1 * A.load(i,j+1UL);
4552  xmm3 += x1 * A.load(i,j+2UL);
4553  xmm4 += x1 * A.load(i,j+3UL);
4554  }
4555 
4556  y[j ] -= sum( xmm1 ) * scalar;
4557  y[j+1UL] -= sum( xmm2 ) * scalar;
4558  y[j+2UL] -= sum( xmm3 ) * scalar;
4559  y[j+3UL] -= sum( xmm4 ) * scalar;
4560 
4561  for( ; remainder && i<iend; ++i ) {
4562  y[j ] -= x[i] * A(i,j ) * scalar;
4563  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4564  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4565  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4566  }
4567  }
4568 
      // Groups of three columns.
4569  for( ; (j+3UL) <= N; j+=3UL )
4570  {
4571  const size_t ibegin( ( IsLower<MT1>::value )
4572  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4573  :( 0UL ) );
4574  const size_t iend( ( IsUpper<MT1>::value )
4575  ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
4576  :( M ) );
4577  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4578 
4579  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4580  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4581 
4582  SIMDType xmm1, xmm2, xmm3;
4583  size_t i( ibegin );
4584 
4585  for( ; i<ipos; i+=SIMDSIZE ) {
4586  const SIMDType x1( x.load(i) );
4587  xmm1 += x1 * A.load(i,j );
4588  xmm2 += x1 * A.load(i,j+1UL);
4589  xmm3 += x1 * A.load(i,j+2UL);
4590  }
4591 
4592  y[j ] -= sum( xmm1 ) * scalar;
4593  y[j+1UL] -= sum( xmm2 ) * scalar;
4594  y[j+2UL] -= sum( xmm3 ) * scalar;
4595 
4596  for( ; remainder && i<iend; ++i ) {
4597  y[j ] -= x[i] * A(i,j ) * scalar;
4598  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4599  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4600  }
4601  }
4602 
      // Groups of two columns.
4603  for( ; (j+2UL) <= N; j+=2UL )
4604  {
4605  const size_t ibegin( ( IsLower<MT1>::value )
4606  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4607  :( 0UL ) );
4608  const size_t iend( ( IsUpper<MT1>::value )
4609  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4610  :( M ) );
4611  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4612 
4613  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4614  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4615 
4616  SIMDType xmm1, xmm2;
4617  size_t i( ibegin );
4618 
4619  for( ; i<ipos; i+=SIMDSIZE ) {
4620  const SIMDType x1( x.load(i) );
4621  xmm1 += x1 * A.load(i,j );
4622  xmm2 += x1 * A.load(i,j+1UL);
4623  }
4624 
4625  y[j ] -= sum( xmm1 ) * scalar;
4626  y[j+1UL] -= sum( xmm2 ) * scalar;
4627 
4628  for( ; remainder && i<iend; ++i ) {
4629  y[j ] -= x[i] * A(i,j ) * scalar;
4630  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4631  }
4632  }
4633 
      // At most one column is left at this point.
4634  if( j < N )
4635  {
4636  const size_t ibegin( ( IsLower<MT1>::value )
4637  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4638  :( 0UL ) );
4639  const size_t iend( ( IsUpper<MT1>::value )
4640  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4641  :( M ) );
4642  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4643 
4644  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4645  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4646 
4647  SIMDType xmm1;
4648  size_t i( ibegin );
4649 
4650  for( ; i<ipos; i+=SIMDSIZE ) {
4651  xmm1 += A.load(i,j) * x.load(i);
4652  }
4653 
4654  y[j] -= sum( xmm1 ) * scalar;
4655 
4656  for( ; remainder && i<iend; ++i ) {
4657  y[j] -= x[i] * A(i,j) * scalar;
4658  }
4659  }
4660  }
4661  //**********************************************************************************************
4662 
4663  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Non-vectorized overload of the large-matrix subtraction-assignment kernel
// (y -= scalar * x * A). It forwards all four arguments unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): the declaration line carrying the return type (and whatever condition
// separates this overload from the vectorized one) is not visible in this listing --
// confirm against the full header.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
4677  template< typename VT1 // Type of the left-hand side target vector
4678  , typename VT2 // Type of the left-hand side vector operand
4679  , typename MT1 // Type of the right-hand side matrix operand
4680  , typename ST2 > // Type of the scalar value
4682  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4683  {
4684  selectDefaultSubAssignKernel( y, x, A, scalar );
4685  }
4686  //**********************************************************************************************
4687 
4688  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized large-matrix kernel for the subtraction assignment y -= scalar * x * A.
// The columns of A are processed in groups of 8, 4, 2 and finally a single column.
// Within a group the rows are traversed in steps of 4*SIMDSIZE, then 2*SIMDSIZE, then
// SIMDSIZE; after every step the SIMD products of the step are added up, reduced with
// sum(), multiplied by scalar and subtracted from y right away (no accumulator is kept
// across steps). Rows that remain after the last full SIMD step are handled by a scalar
// loop.
// NOTE(review): the declaration line carrying the return type / enabling condition is
// not visible in this listing -- confirm against the full header.
//
// \param y The target left-hand side dense vector (indexed by the columns of A).
// \param x The left-hand side dense vector operand (indexed by the rows of A).
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
4703  template< typename VT1 // Type of the left-hand side target vector
4704  , typename VT2 // Type of the left-hand side vector operand
4705  , typename MT1 // Type of the right-hand side matrix operand
4706  , typename ST2 > // Type of the scalar value
4708  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4709  {
      // A scalar tail loop is required unless both x and A are padded.
4710  constexpr bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4711 
4712  const size_t M( A.rows() );
4713  const size_t N( A.columns() );
4714 
4715  size_t j( 0UL );
4716 
      // Groups of eight columns.
4717  for( ; (j+8UL) <= N; j+=8UL )
4718  {
      // Lower matrices: start at row j (j+1 if strictly lower), rounded down with the
      // mask size_t(-SIMDSIZE) (presumes SIMDSIZE is a power of two); otherwise row 0.
4719  const size_t ibegin( ( IsLower<MT1>::value )
4720  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4721  :( 0UL ) );
      // Upper matrices: only the first j+8 rows (j+7 if strictly upper) are visited;
      // otherwise all M rows.
4722  const size_t iend( ( IsUpper<MT1>::value )
4723  ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4724  :( M ) );
4725  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4726 
      // End of the SIMD range: iend rounded down to a multiple of SIMDSIZE when a tail
      // loop exists, iend itself otherwise.
4727  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4728  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4729 
4730  size_t i( ibegin );
4731 
      // Four SIMD vectors of x per step.
4732  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4733  const size_t i1( i+SIMDSIZE );
4734  const size_t i2( i+SIMDSIZE*2UL );
4735  const size_t i3( i+SIMDSIZE*3UL );
4736  const SIMDType x1( x.load(i ) );
4737  const SIMDType x2( x.load(i1) );
4738  const SIMDType x3( x.load(i2) );
4739  const SIMDType x4( x.load(i3) );
4740  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4741  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4742  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4743  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4744  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4745  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4746  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4747  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4748  }
4749 
      // Two SIMD vectors of x per step.
4750  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4751  const size_t i1( i+SIMDSIZE );
4752  const SIMDType x1( x.load(i ) );
4753  const SIMDType x2( x.load(i1) );
4754  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4755  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4756  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4757  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4758  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4759  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4760  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4761  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4762  }
4763 
      // One SIMD vector of x per step.
4764  for( ; i<ipos; i+=SIMDSIZE ) {
4765  const SIMDType x1( x.load(i) );
4766  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4767  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4768  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
4769  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
4770  y[j+4UL] -= sum( x1 * A.load(i,j+4UL) ) * scalar;
4771  y[j+5UL] -= sum( x1 * A.load(i,j+5UL) ) * scalar;
4772  y[j+6UL] -= sum( x1 * A.load(i,j+6UL) ) * scalar;
4773  y[j+7UL] -= sum( x1 * A.load(i,j+7UL) ) * scalar;
4774  }
4775 
      // Scalar tail for the rows in [ipos, iend); never entered when remainder is false.
4776  for( ; remainder && i<iend; ++i ) {
4777  y[j ] -= x[i] * A(i,j ) * scalar;
4778  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4779  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4780  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4781  y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4782  y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4783  y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4784  y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4785  }
4786  }
4787 
      // Groups of four columns; same row traversal as above.
4788  for( ; (j+4UL) <= N; j+=4UL )
4789  {
4790  const size_t ibegin( ( IsLower<MT1>::value )
4791  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4792  :( 0UL ) );
4793  const size_t iend( ( IsUpper<MT1>::value )
4794  ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4795  :( M ) );
4796  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4797 
4798  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4799  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4800 
4801  size_t i( ibegin );
4802 
4803  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4804  const size_t i1( i+SIMDSIZE );
4805  const size_t i2( i+SIMDSIZE*2UL );
4806  const size_t i3( i+SIMDSIZE*3UL );
4807  const SIMDType x1( x.load(i ) );
4808  const SIMDType x2( x.load(i1) );
4809  const SIMDType x3( x.load(i2) );
4810  const SIMDType x4( x.load(i3) );
4811  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4812  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4813  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4814  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4815  }
4816 
4817  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4818  const size_t i1( i+SIMDSIZE );
4819  const SIMDType x1( x.load(i ) );
4820  const SIMDType x2( x.load(i1) );
4821  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4822  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4823  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4824  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4825  }
4826 
4827  for( ; i<ipos; i+=SIMDSIZE ) {
4828  const SIMDType x1( x.load(i) );
4829  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4830  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4831  y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
4832  y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
4833  }
4834 
4835  for( ; remainder && i<iend; ++i ) {
4836  y[j ] -= x[i] * A(i,j ) * scalar;
4837  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4838  y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4839  y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4840  }
4841  }
4842 
      // Groups of two columns.
4843  for( ; (j+2UL) <= N; j+=2UL )
4844  {
4845  const size_t ibegin( ( IsLower<MT1>::value )
4846  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4847  :( 0UL ) );
4848  const size_t iend( ( IsUpper<MT1>::value )
4849  ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4850  :( M ) );
4851  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4852 
4853  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4854  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4855 
4856  size_t i( ibegin );
4857 
4858  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4859  const size_t i1( i+SIMDSIZE );
4860  const size_t i2( i+SIMDSIZE*2UL );
4861  const size_t i3( i+SIMDSIZE*3UL );
4862  const SIMDType x1( x.load(i ) );
4863  const SIMDType x2( x.load(i1) );
4864  const SIMDType x3( x.load(i2) );
4865  const SIMDType x4( x.load(i3) );
4866  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4867  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4868  }
4869 
4870  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4871  const size_t i1( i+SIMDSIZE );
4872  const SIMDType x1( x.load(i ) );
4873  const SIMDType x2( x.load(i1) );
4874  y[j ] -= sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4875  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4876  }
4877 
4878  for( ; i<ipos; i+=SIMDSIZE ) {
4879  const SIMDType x1( x.load(i) );
4880  y[j ] -= sum( x1 * A.load(i,j ) ) * scalar;
4881  y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
4882  }
4883 
4884  for( ; remainder && i<iend; ++i ) {
4885  y[j ] -= x[i] * A(i,j ) * scalar;
4886  y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4887  }
4888  }
4889 
      // At most one column is left at this point.
4890  if( j < N )
4891  {
4892  const size_t ibegin( ( IsLower<MT1>::value )
4893  ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) & size_t(-SIMDSIZE) )
4894  :( 0UL ) );
4895  const size_t iend( ( IsUpper<MT1>::value )
4896  ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4897  :( M ) );
4898  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4899 
4900  const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );
4901  BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos, "Invalid end calculation" );
4902 
4903  size_t i( ibegin );
4904 
4905  for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4906  const size_t i1( i+SIMDSIZE );
4907  const size_t i2( i+SIMDSIZE*2UL );
4908  const size_t i3( i+SIMDSIZE*3UL );
4909  const SIMDType x1( x.load(i ) );
4910  const SIMDType x2( x.load(i1) );
4911  const SIMDType x3( x.load(i2) );
4912  const SIMDType x4( x.load(i3) );
4913  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4914  }
4915 
4916  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4917  const size_t i1( i+SIMDSIZE );
4918  const SIMDType x1( x.load(i ) );
4919  const SIMDType x2( x.load(i1) );
4920  y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4921  }
4922 
4923  for( ; i<ipos; i+=SIMDSIZE ) {
4924  const SIMDType x1( x.load(i) );
4925  y[j] -= sum( x1 * A.load(i,j) ) * scalar;
4926  }
4927 
4928  for( ; remainder && i<iend; ++i ) {
4929  y[j] -= x[i] * A(i,j) * scalar;
4930  }
4931  }
4932  }
4933  //**********************************************************************************************
4934 
4935  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback overload of the BLAS subtraction-assignment kernel (y -= scalar * x * A).
// It performs no BLAS call itself; it forwards all four arguments unchanged to
// selectLargeSubAssignKernel().
// NOTE(review): the declaration line carrying the return type (and whatever condition
// separates this overload from the BLAS-backed one) is not visible in this listing --
// confirm against the full header.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
4950  template< typename VT1 // Type of the left-hand side target vector
4951  , typename VT2 // Type of the left-hand side vector operand
4952  , typename MT1 // Type of the right-hand side matrix operand
4953  , typename ST2 > // Type of the scalar value
4955  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4956  {
4957  selectLargeSubAssignKernel( y, x, A, scalar );
4958  }
4959  //**********************************************************************************************
4960 
4961  //**BLAS-based subtraction assignment to dense vectors******************************************
4962 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4963 
// BLAS-backed variant of the subtraction assignment kernel; only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled (see the
// surrounding #if/#endif).
//   y      : target vector that is updated
//   x      : left-hand side vector operand
//   A      : right-hand side matrix operand
//   scalar : scaling factor of the product
// NOTE(review): the return-type line (original line 4980) is absent from this listing.
4976  template< typename VT1 // Type of the left-hand side target vector
4977  , typename VT2 // Type of the left-hand side vector operand
4978  , typename MT1 // Type of the right-hand side matrix operand
4979  , typename ST2 > // Type of the scalar value
4981  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4982  {
4983  using ET = ElementType_<VT1>;
4984 
// Triangular matrices take the trmv route: the scaled vector is evaluated serially into a
// temporary, passed to trmv together with the lower/upper flag, and the temporary is then
// subtracted from y. Presumably trmv overwrites tmp with the product -- confirm in trmv.h.
4985  if( IsTriangular<MT1>::value ) {
4986  ResultType_<VT1> tmp( serial( scalar * x ) );
4987  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4988  subAssign( y, tmp );
4989  }
// General matrices use a single gemv call with the negated scalar and a factor of 1 as the
// last two arguments (presumably alpha and beta, i.e. y = -scalar*(x*A) + y -- confirm in gemv.h).
4990  else {
4991  gemv( y, x, A, ET(-scalar), ET(1) );
4992  }
4993  }
4994 #endif
4995  //**********************************************************************************************
4996 
4997  //**Subtraction assignment to sparse vectors****************************************************
4998  // No special implementation for the subtraction assignment to sparse vectors.
4999  //**********************************************************************************************
5000 
5001  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of the scaled expression (rhs is a DVecScalarMultExpr) to a dense
// vector. The expression is first evaluated serially into a temporary of type ResultType,
// which is then passed to multAssign() on the unwrapped target (~lhs).
// NOTE(review): original lines 5017 and 5019-5021 are absent from this listing.
5013  template< typename VT1 // Type of the target dense vector
5014  , bool TF > // Transpose flag of the target dense vector
5015  friend inline void multAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5016  {
5018 
5022 
5023  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5024 
5025  const ResultType tmp( serial( rhs ) );
5026  multAssign( ~lhs, tmp );
5027  }
5028  //**********************************************************************************************
5029 
5030  //**Multiplication assignment to sparse vectors*************************************************
5031  // No special implementation for the multiplication assignment to sparse vectors.
5032  //**********************************************************************************************
5033 
5034  //**Division assignment to dense vectors********************************************************
// Division assignment of the scaled expression (rhs is a DVecScalarMultExpr) to a dense
// vector. The expression is first evaluated serially into a temporary of type ResultType,
// which is then passed to divAssign() on the unwrapped target (~lhs).
// NOTE(review): original lines 5050 and 5052-5054 are absent from this listing.
5046  template< typename VT1 // Type of the target dense vector
5047  , bool TF > // Transpose flag of the target dense vector
5048  friend inline void divAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
5049  {
5051 
5055 
5056  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5057 
5058  const ResultType tmp( serial( rhs ) );
5059  divAssign( ~lhs, tmp );
5060  }
5061  //**********************************************************************************************
5062 
5063  //**Division assignment to sparse vectors*******************************************************
5064  // No special implementation for the division assignment to sparse vectors.
5065  //**********************************************************************************************
5066 
5067  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of the scaled vector/matrix product to a dense vector, enabled only when
// UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (original line 5084)
// is absent from this listing; the body uses the parameters as `lhs` and `rhs`.
5081  template< typename VT1 // Type of the target dense vector
5082  , bool TF > // Transpose flag of the target dense vector
5083  friend inline EnableIf_< UseSMPAssign<VT1> >
5085  {
5087 
5088  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5089 
// Operands of the vector/matrix product wrapped by the scaled expression.
5090  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5091  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5092 
// A matrix without rows resets the target; a matrix without columns leaves nothing to assign.
5093  if( right.rows() == 0UL ) {
5094  reset( ~lhs );
5095  return;
5096  }
5097  else if( right.columns() == 0UL ) {
5098  return;
5099  }
5100 
5101  LT x( left ); // Evaluation of the left-hand side dense vector operand
5102  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5103 
5104  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5105  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5106  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5107  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5108 
// The product is rebuilt from the evaluated operands and handed to smpAssign().
5109  smpAssign( ~lhs, x * A * rhs.scalar_ );
5110  }
5111  //**********************************************************************************************
5112 
5113  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of the scaled expression to a sparse vector, enabled only when
// UseSMPAssign<VT1> holds. The expression is evaluated into a temporary of type ResultType,
// which is then passed to smpAssign() on the unwrapped target (~lhs).
// NOTE(review): the line carrying the function name and parameter list (original line 5130)
// and original lines 5132 and 5134-5136 are absent from this listing.
5127  template< typename VT1 // Type of the target sparse vector
5128  , bool TF > // Transpose flag of the target sparse vector
5129  friend inline EnableIf_< UseSMPAssign<VT1> >
5131  {
5133 
5137 
5138  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5139 
5140  const ResultType tmp( rhs );
5141  smpAssign( ~lhs, tmp );
5142  }
5143  //**********************************************************************************************
5144 
5145  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of the scaled vector/matrix product to a dense vector, enabled only
// when UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (original line 5162)
// is absent from this listing; the body uses the parameters as `lhs` and `rhs`.
5159  template< typename VT1 // Type of the target dense vector
5160  , bool TF > // Transpose flag of the target dense vector
5161  friend inline EnableIf_< UseSMPAssign<VT1> >
5163  {
5165 
5166  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5167 
// Operands of the vector/matrix product wrapped by the scaled expression.
5168  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5169  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5170 
// An empty matrix (no rows or no columns) leaves the target untouched.
5171  if( right.rows() == 0UL || right.columns() == 0UL ) {
5172  return;
5173  }
5174 
5175  LT x( left ); // Evaluation of the left-hand side dense vector operand
5176  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5177 
5178  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5179  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5180  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5181  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5182 
// The product is rebuilt from the evaluated operands and handed to smpAddAssign().
5183  smpAddAssign( ~lhs, x * A * rhs.scalar_ );
5184  }
5185  //**********************************************************************************************
5186 
5187  //**SMP addition assignment to sparse vectors***************************************************
5188  // No special implementation for the SMP addition assignment to sparse vectors.
5189  //**********************************************************************************************
5190 
5191  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of the scaled vector/matrix product to a dense vector, enabled
// only when UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (original line 5208)
// is absent from this listing; the body uses the parameters as `lhs` and `rhs`.
5205  template< typename VT1 // Type of the target dense vector
5206  , bool TF > // Transpose flag of the target dense vector
5207  friend inline EnableIf_< UseSMPAssign<VT1> >
5209  {
5211 
5212  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5213 
// Operands of the vector/matrix product wrapped by the scaled expression.
5214  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
5215  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
5216 
// An empty matrix (no rows or no columns) leaves the target untouched.
5217  if( right.rows() == 0UL || right.columns() == 0UL ) {
5218  return;
5219  }
5220 
5221  LT x( left ); // Evaluation of the left-hand side dense vector operand
5222  RT A( right ); // Evaluation of the right-hand side dense matrix operand
5223 
5224  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
5225  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
5226  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
5227  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
5228 
// The product is rebuilt from the evaluated operands and handed to smpSubAssign().
5229  smpSubAssign( ~lhs, x * A * rhs.scalar_ );
5230  }
5231  //**********************************************************************************************
5232 
5233  //**SMP subtraction assignment to sparse vectors************************************************
5234  // No special implementation for the SMP subtraction assignment to sparse vectors.
5235  //**********************************************************************************************
5236 
5237  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of the scaled expression to a dense vector, enabled only when
// UseSMPAssign<VT1> holds. The expression is evaluated into a temporary of type ResultType,
// which is then passed to smpMultAssign() on the unwrapped target (~lhs).
// NOTE(review): the line carrying the function name and parameter list (original line 5254)
// and original lines 5256 and 5258-5260 are absent from this listing.
5251  template< typename VT1 // Type of the target dense vector
5252  , bool TF > // Transpose flag of the target dense vector
5253  friend inline EnableIf_< UseSMPAssign<VT1> >
5255  {
5257 
5261 
5262  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5263 
5264  const ResultType tmp( rhs );
5265  smpMultAssign( ~lhs, tmp );
5266  }
5267  //**********************************************************************************************
5268 
5269  //**SMP multiplication assignment to sparse vectors*********************************************
5270  // No special implementation for the SMP multiplication assignment to sparse vectors.
5271  //**********************************************************************************************
5272 
5273  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of the scaled expression to a dense vector, enabled only when
// UseSMPAssign<VT1> holds. The expression is evaluated into a temporary of type ResultType,
// which is then passed to smpDivAssign() on the unwrapped target (~lhs).
// NOTE(review): the line carrying the function name and parameter list (original line 5290)
// and original lines 5292 and 5294-5296 are absent from this listing.
5287  template< typename VT1 // Type of the target dense vector
5288  , bool TF > // Transpose flag of the target dense vector
5289  friend inline EnableIf_< UseSMPAssign<VT1> >
5291  {
5293 
5297 
5298  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
5299 
5300  const ResultType tmp( rhs );
5301  smpDivAssign( ~lhs, tmp );
5302  }
5303  //**********************************************************************************************
5304 
5305  //**SMP division assignment to sparse vectors***************************************************
5306  // No special implementation for the SMP division assignment to sparse vectors.
5307  //**********************************************************************************************
5308 
5309  //**Compile time checks*************************************************************************
5318  //**********************************************************************************************
5319 };
5321 //*************************************************************************************************
5322 
5323 
5324 
5325 
5326 //=================================================================================================
5327 //
5328 // GLOBAL BINARY ARITHMETIC OPERATORS
5329 //
5330 //=================================================================================================
5331 
5332 //*************************************************************************************************
// Multiplication operator for a transpose dense vector and a column-major dense matrix
// (both are passed with the flag `true`).
//   vec : left-hand side vector of the product
//   mat : right-hand side matrix of the product
// Returns a TDVecTDMatMultExpr<VT,MT> expression object built from the unwrapped operands;
// no product is computed in this function itself.
// Raises an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT when the vector size
// differs from the number of matrix rows.
// NOTE(review): original lines 5368 and 5370 are absent from this listing.
5363 template< typename VT // Type of the left-hand side dense vector
5364  , typename MT > // Type of the right-hand side dense matrix
5365 inline decltype(auto)
5366  operator*( const DenseVector<VT,true>& vec, const DenseMatrix<MT,true>& mat )
5367 {
5369 
5371 
5372  if( (~vec).size() != (~mat).rows() ) {
5373  BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
5374  }
5375 
5376  using ReturnType = const TDVecTDMatMultExpr<VT,MT>;
5377  return ReturnType( ~vec, ~mat );
5378 }
5379 //*************************************************************************************************
5380 
5381 
5382 
5383 
5384 //=================================================================================================
5385 //
5386 // SIZE SPECIALIZATIONS
5387 //
5388 //=================================================================================================
5389 
5390 //*************************************************************************************************
// Size specialization for TDVecTDMatMultExpr: the compile-time size of the product is taken
// from Columns<MT>, i.e. the number of columns of the right-hand side matrix type.
5392 template< typename VT, typename MT >
5393 struct Size< TDVecTDMatMultExpr<VT,MT> >
5394  : public Columns<MT>
5395 {};
5397 //*************************************************************************************************
5398 
5399 
5400 
5401 
5402 //=================================================================================================
5403 //
5404 // ISALIGNED SPECIALIZATIONS
5405 //
5406 //=================================================================================================
5407 
5408 //*************************************************************************************************
// IsAligned specialization for TDVecTDMatMultExpr: the expression counts as aligned only if
// both IsAligned<VT> and IsAligned<MT> hold (combined through And<>).
5410 template< typename VT, typename MT >
5411 struct IsAligned< TDVecTDMatMultExpr<VT,MT> >
5412  : public BoolConstant< And< IsAligned<VT>, IsAligned<MT> >::value >
5413 {};
5415 //*************************************************************************************************
5416 
5417 } // namespace blaze
5418 
5419 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:158
Header file for auxiliary alias declarations.
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecTDMatMultExpr.h:206
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:203
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:220
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:369
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
TDVecTDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:246
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:260
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:521
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:381
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:325
Column< MT > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:124
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:305
Header file for the DisableIf class template.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:208
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:127
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
If_< IsExpression< MT >, const MT, const MT &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:214
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the Columns type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:349
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:126
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:88
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsTriangular type trait.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:204
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:129
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:315
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:593
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
Header file for the HasSIMDMult type trait.
BLAZE_ALWAYS_INLINE ValueType_< T > sum(const SIMDi8< T > &a) noexcept
Returns the sum of all elements in the 8-bit integral SIMD vector.
Definition: Reduction.h:65
Header file for run time assertion macros.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecTDMatMultExpr.h:292
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:382
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Header file for the TVecMatMultExpr base class.
Constraints on the storage order of matrix types.
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:217
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:324
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:109
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:106
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of a vector.The Size type trait evaluates the size of the given v...
Definition: Size.h:74
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:205
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:75
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:130
Header file for the IsComplex type trait.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:131
Constraint on the data type.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:337
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:128
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:207
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:359
Header file for the Size type trait.
If_< IsExpression< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:211
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.