TDVecDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/Exception.h>
60 #include <blaze/math/shims/Reset.h>
62 #include <blaze/math/SIMD.h>
83 #include <blaze/math/views/Check.h>
84 #include <blaze/system/BLAS.h>
89 #include <blaze/util/Assert.h>
90 #include <blaze/util/Complex.h>
92 #include <blaze/util/DisableIf.h>
93 #include <blaze/util/EnableIf.h>
95 #include <blaze/util/mpl/And.h>
96 #include <blaze/util/mpl/If.h>
97 #include <blaze/util/Types.h>
105 
106 
107 namespace blaze {
108 
109 //=================================================================================================
110 //
111 // CLASS TDVECDMATMULTEXPR
112 //
113 //=================================================================================================
114 
115 //*************************************************************************************************
122 template< typename VT // Type of the left-hand side dense vector
123  , typename MT > // Type of the right-hand side dense matrix
124 class TDVecDMatMultExpr
125  : public TVecMatMultExpr< DenseVector< TDVecDMatMultExpr<VT,MT>, true > >
126  , private Computation
127 {
128  private:
129  //**Type definitions****************************************************************************
136  //**********************************************************************************************
137 
138  //**********************************************************************************************
// Compile-time flag for the vector operand: true when VT is a computation expression
// (IsComputation) or requires an intermediate evaluation (RequiresEvaluation).
140  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
141  //**********************************************************************************************
142 
143  //**********************************************************************************************
// Compile-time flag for the matrix operand. The visible part of the condition requires MT to be
// a computation expression and the types MET and VET to be identical.
// NOTE(review): the remainder of this expression (listing line 146) is not present in this view,
// so the complete condition cannot be stated here.
145  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
147  //**********************************************************************************************
148 
149  //**********************************************************************************************
151 
// Helper trait: value is true when at least one of the two operands is flagged for evaluation
// (evaluateVector or evaluateMatrix). The template parameter T1 does not appear in the visible
// definition of value.
155  template< typename T1 >
156  struct UseSMPAssign {
157  enum : bool { value = ( evaluateVector || evaluateMatrix ) };
158  };
160  //**********************************************************************************************
161 
162  //**********************************************************************************************
164 
// Helper trait over three types T1, T2, T3 (presumably target vector, vector operand and matrix
// operand, matching the kernel signatures below - confirm against the full header).
// Visible parts of the condition: all three types must have simdEnabled set, and the element
// types of T1 and T3 must be identical.
// NOTE(review): listing lines 169-173 and 175-178 are not present in this view, so the complete
// condition (including the name of the enumerator) cannot be stated here.
167  template< typename T1, typename T2, typename T3 >
168  struct UseBlasKernel {
174  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
179  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
180  };
182  //**********************************************************************************************
183 
184  //**********************************************************************************************
186 
// Helper trait over three types T1, T2, T3. The visible parts of the condition require the
// useOptimizedKernels switch and simdEnabled on all three types; further visible fragments
// involve the element type of T3.
// NOTE(review): listing lines 193, 195-196 and 198-199 are not present in this view, so the
// complete condition cannot be stated here.
190  template< typename T1, typename T2, typename T3 >
191  struct UseVectorizedDefaultKernel {
192  enum : bool { value = useOptimizedKernels &&
194  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
197  , ElementType_<T3> >::value &&
200  };
202  //**********************************************************************************************
203 
204  public:
205  //**Type definitions****************************************************************************
// Return type of the element access functions operator[] and at().
211  using ReturnType = const ElementType;
// Alias for a const ResultType (ResultType itself is declared outside this view).
212  using CompositeType = const ResultType;
213 
// Type of the stored left-hand side operand: chosen between const VT (by value) and const VT&
// depending on IsExpression<VT>. Presumably If_ selects the first alternative when the
// condition holds - confirm against the If_ definition.
215  using LeftOperand = If_< IsExpression<VT>, const VT, const VT& >;
216 
// Type of the stored right-hand side operand: same selection scheme, based on IsExpression<MT>.
218  using RightOperand = If_< IsExpression<MT>, const MT, const MT& >;
219 
222 
225  //**********************************************************************************************
226 
227  //**Compilation flags***************************************************************************
// Compilation flag for SIMD evaluation. Visible parts of the condition: MT must not be a diagonal
// matrix type and both operand types must have simdEnabled set.
// NOTE(review): the remainder of this expression (listing lines 231-232) is not present in this
// view.
229  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
230  VT::simdEnabled && MT::simdEnabled &&
233 
// Compilation flag for SMP assignment: true only if neither operand is flagged for evaluation
// and both operand types report smpAssignable.
235  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
236  !evaluateMatrix && MT::smpAssignable };
237  //**********************************************************************************************
238 
239  //**SIMD properties*****************************************************************************
// SIMD width constant taken from SIMDTrait<ElementType>::size; the vectorized kernels in this
// class use it as the column stride of their SIMD loads and stores.
241  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
242  //**********************************************************************************************
243 
244  //**Constructor*********************************************************************************
// Constructs the multiplication expression from its two operands.
//   vec  left-hand side dense vector operand, stored in vec_
//   mat  right-hand side dense matrix operand, stored in mat_
// The constructor is noexcept; the size compatibility of the operands
// (vec_.size() == mat_.rows()) is only verified through BLAZE_INTERNAL_ASSERT.
250  explicit inline TDVecDMatMultExpr( const VT& vec, const MT& mat ) noexcept
251  : vec_( vec ) // Left-hand side dense vector of the multiplication expression
252  , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
253  {
254  BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
255  }
256  //**********************************************************************************************
257 
258  //**Subscript operator**************************************************************************
// Returns the element of the product vector at position index.
// index is expected to be smaller than mat_.columns(); this is only checked by an internal
// assertion (see at() for the checked variant).
264  inline ReturnType operator[]( size_t index ) const {
265  BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
266 
// NOTE(review): listing line 267 (the condition guarding the following block) is not present in
// this view. The guarded block uses only the diagonal element mat_(index,index).
268  {
269  return vec_[index] * mat_(index,index);
270  }
// Lower matrix type and index > 8: multiply only rows [begin, mat_.rows()) of the selected
// column, where begin skips the diagonal for strictly lower matrix types.
271  else if( IsLower<MT>::value && ( index > 8UL ) )
272  {
273  const size_t begin( IsStrictlyLower<MT>::value ? index+1UL : index );
274  const size_t n ( mat_.rows() - begin );
275  return subvector( vec_, begin, n, unchecked ) *
276  subvector( column( mat_, index, unchecked ), begin, n, unchecked );
277  }
// Upper matrix type and index + 8 < mat_.rows(): multiply only the first n rows of the selected
// column, where n excludes the diagonal for strictly upper matrix types.
278  else if( IsUpper<MT>::value && ( index + 8UL < mat_.rows() ) )
279  {
280  const size_t n( IsStrictlyUpper<MT>::value ? index : index+1UL );
281  return subvector( vec_, 0UL, n, unchecked ) *
282  subvector( column( mat_, index, unchecked ), 0UL, n, unchecked );
283  }
// All remaining cases: product of the complete vector with the complete column.
284  else
285  {
286  return vec_ * column( mat_, index, unchecked );
287  }
288  }
289  //**********************************************************************************************
290 
291  //**At function*********************************************************************************
// Checked element access: reports an out-of-range error via BLAZE_THROW_OUT_OF_RANGE when
// index >= mat_.columns(), otherwise forwards to operator[].
298  inline ReturnType at( size_t index ) const {
299  if( index >= mat_.columns() ) {
300  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
301  }
302  return (*this)[index];
303  }
304  //**********************************************************************************************
305 
306  //**Size function*******************************************************************************
// Returns the size of the resulting vector, which equals the number of columns of the matrix
// operand.
311  inline size_t size() const noexcept {
312  return mat_.columns();
313  }
314  //**********************************************************************************************
315 
316  //**Left operand access*************************************************************************
// Returns the stored left-hand side dense vector operand (vec_).
321  inline LeftOperand leftOperand() const noexcept {
322  return vec_;
323  }
324  //**********************************************************************************************
325 
326  //**Right operand access************************************************************************
// Returns the stored right-hand side dense matrix operand (mat_).
331  inline RightOperand rightOperand() const noexcept {
332  return mat_;
333  }
334  //**********************************************************************************************
335 
336  //**********************************************************************************************
// Returns true when either operand reports being aliased with the given address.
//   alias  address to check against both operands
// Note that the check is delegated to isAliased() of the operands (not to a canAlias() member),
// so the body is identical to that of isAliased() below.
342  template< typename T >
343  inline bool canAlias( const T* alias ) const noexcept {
344  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
345  }
346  //**********************************************************************************************
347 
348  //**********************************************************************************************
// Returns true when either operand reports being aliased with the given address.
//   alias  address to check against both operands
354  template< typename T >
355  inline bool isAliased( const T* alias ) const noexcept {
356  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
357  }
358  //**********************************************************************************************
359 
360  //**********************************************************************************************
// Returns true only if both operands report being aligned.
365  inline bool isAligned() const noexcept {
366  return vec_.isAligned() && mat_.isAligned();
367  }
368  //**********************************************************************************************
369 
370  //**********************************************************************************************
// Run-time check whether the expression can be used in an SMP assignment.
// The result requires size() > SMP_TDVECDMATMULT_THRESHOLD and, in addition, at least one of the
// alternatives of the first group: BLAS mode disabled, a computation matrix operand that is not
// evaluated, or a matrix with fewer than TDVECDMATMULT_THRESHOLD elements.
// NOTE(review): listing lines 377-378 (further alternatives of the first group) are not present
// in this view.
375  inline bool canSMPAssign() const noexcept {
376  return ( !BLAZE_BLAS_MODE ||
379  ( IsComputation<MT>::value && !evaluateMatrix ) ||
380  ( mat_.rows() * mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
381  ( size() > SMP_TDVECDMATMULT_THRESHOLD );
382  }
383  //**********************************************************************************************
384 
385  private:
386  //**Member variables****************************************************************************
389  //**********************************************************************************************
390 
391  //**Assignment to dense vectors*****************************************************************
// Assignment of the vector/matrix product to a dense (transpose) vector: lhs = rhs.
//   lhs  target dense vector; its size is expected to match rhs.size() (internal assertion)
//   rhs  multiplication expression to be evaluated
// NOTE(review): listing line 408 is not present in this view; the types LT and RT used below are
// declared outside the visible part of the class.
404  template< typename VT1 > // Type of the target dense vector
405  friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
406  {
408 
409  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
410 
// Degenerate shapes: with zero matrix rows the target is reset and nothing is multiplied;
// with zero matrix columns the target is empty and there is nothing to do.
411  if( rhs.mat_.rows() == 0UL ) {
412  reset( ~lhs );
413  return;
414  }
415  else if( rhs.mat_.columns() == 0UL ) {
416  return;
417  }
418 
419  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
420  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
421 
422  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
423  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
424  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
425  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
426 
// Dispatch to the kernel selection with the evaluated operands.
427  TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
428  }
430  //**********************************************************************************************
431 
432  //**Assignment to dense vectors (kernel selection)**********************************************
// Kernel selection for the assignment y = x * A.
// The small-matrix kernel is chosen when MT1 is a diagonal matrix type, when the original matrix
// operand MT is a computation that is not evaluated, or when A has fewer than
// TDVECDMATMULT_THRESHOLD elements; otherwise the BLAS kernel selection is used.
443  template< typename VT1 // Type of the left-hand side target vector
444  , typename VT2 // Type of the left-hand side vector operand
445  , typename MT1 > // Type of the right-hand side matrix operand
446  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
447  {
448  if( ( IsDiagonal<MT1>::value ) ||
449  ( IsComputation<MT>::value && !evaluateMatrix ) ||
450  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
451  selectSmallAssignKernel( y, x, A );
452  else
453  selectBlasAssignKernel( y, x, A );
454  }
456  //**********************************************************************************************
457 
458  //**Default assignment to dense vectors*********************************************************
// Default (scalar) assignment kernel y = x * A.
// The result is built row by row: the first row of A initializes y (for non-lower matrix types),
// every further row i adds x[i] * A(i,j) to the affected elements y[j].
// NOTE(review): listing lines 480, 494 and 524 (the conditions guarding three of the blocks
// below) are not present in this view.
472  template< typename VT1 // Type of the left-hand side target vector
473  , typename VT2 // Type of the left-hand side vector operand
474  , typename MT1 > // Type of the right-hand side matrix operand
475  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
476  {
477  const size_t M( A.rows() );
478  const size_t N( A.columns() );
479 
// Guarded by a condition that is not visible here (listing line 480): clears the first element.
481  reset( y[0] );
482  }
483 
// For matrix types that are not lower: initialize y from row 0 of A (skipping column 0 for
// strictly upper matrix types).
484  if( !IsLower<MT1>::value )
485  {
486  const size_t jbegin( IsStrictlyUpper<MT1>::value ? 1UL : 0UL );
487  for( size_t j=jbegin; j<N; ++j ) {
488  y[j] = x[0UL] * A(0UL,j);
489  }
490  }
491 
// Remaining rows; the loop starts at row 0 only for lower, not strictly lower matrix types.
492  for( size_t i=( IsLower<MT1>::value && !IsStrictlyLower<MT1>::value ? 0UL : 1UL ); i<M; ++i )
493  {
// Guarded by a condition that is not visible here (listing line 494): only the diagonal element
// contributes.
495  {
496  y[i] = x[i] * A(i,i);
497  }
498  else
499  {
// Column range [jbegin, jend) of row i that is accumulated into y; narrowed for upper and
// lower matrix types.
500  const size_t jbegin( ( IsUpper<MT1>::value )
501  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
502  :( 0UL ) );
503  const size_t jend( ( IsLower<MT1>::value )
504  ?( IsStrictlyLower<MT1>::value ? i-1UL : i )
505  :( N ) );
506  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
507 
// jpos marks the end of the part of the range that can be processed two columns at a time.
508  const size_t jnum( jend - jbegin );
509  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
510 
511  for( size_t j=jbegin; j<jpos; j+=2UL ) {
512  y[j ] += x[i] * A(i,j );
513  y[j+1UL] += x[i] * A(i,j+1UL);
514  }
515  if( jpos < jend ) {
516  y[jpos] += x[i] * A(i,jpos);
517  }
// For lower matrix types the element y[jend] has not been written before, so it is assigned
// (not accumulated).
518  if( IsLower<MT1>::value ) {
519  y[jend] = x[i] * A(i,jend);
520  }
521  }
522  }
523 
// Guarded by a condition that is not visible here (listing line 524): clears the last element.
525  reset( y[N-1UL] );
526  }
527  }
529  //**********************************************************************************************
530 
531  //**Default assignment to dense vectors (small matrices)****************************************
// Small-matrix assignment kernel, non-vectorized overload: forwards to the default kernel.
// NOTE(review): listing line 548 (between the template header and the function name, where the
// return type/specifiers would be) is not present in this view, so the condition selecting this
// overload is not visible.
545  template< typename VT1 // Type of the left-hand side target vector
546  , typename VT2 // Type of the left-hand side vector operand
547  , typename MT1 > // Type of the right-hand side matrix operand
549  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
550  {
551  selectDefaultAssignKernel( y, x, A );
552  }
554  //**********************************************************************************************
555 
556  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Small-matrix assignment kernel y = x * A, SIMD overload.
// The columns of A are processed in chunks of 8, 4, 3, 2 and 1 SIMD vectors (SIMDSIZE elements
// each). For each chunk the products x[i] * A(i,j..) are accumulated over the rows [ibegin,iend)
// and then stored into y. A scalar loop handles the remaining columns when VT1 or MT1 is not
// padded.
// NOTE(review): listing line 573 (between the template header and the function name, where the
// return type/specifiers would be) is not present in this view. The SIMDType accumulators are
// default-constructed before being accumulated into - presumably this zero-initializes them;
// confirm against the SIMDType definition.
570  template< typename VT1 // Type of the left-hand side target vector
571  , typename VT2 // Type of the left-hand side vector operand
572  , typename MT1 > // Type of the right-hand side matrix operand
574  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
575  {
// A scalar remainder loop is required unless both the target vector and the matrix are padded.
576  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
577 
578  const size_t M( A.rows() );
579  const size_t N( A.columns() );
580 
// jpos is N rounded down to a multiple of SIMDSIZE when a remainder loop is needed (see the
// assertion), otherwise N itself.
581  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
582  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
583 
584  size_t j( 0UL );
585 
// Chunks of 8 SIMD vectors.
586  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
587  {
// Row range contributing to this chunk; narrowed for lower and upper matrix types.
588  const size_t ibegin( ( IsLower<MT1>::value )
589  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
590  :( 0UL ) );
591  const size_t iend( ( IsUpper<MT1>::value )
592  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
593  :( M ) );
594  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
595 
596  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
597 
598  for( size_t i=ibegin; i<iend; ++i ) {
599  const SIMDType x1( set( x[i] ) );
600  xmm1 += x1 * A.load(i,j );
601  xmm2 += x1 * A.load(i,j+SIMDSIZE );
602  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
603  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
604  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
605  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
606  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
607  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
608  }
609 
610  y.store( j , xmm1 );
611  y.store( j+SIMDSIZE , xmm2 );
612  y.store( j+SIMDSIZE*2UL, xmm3 );
613  y.store( j+SIMDSIZE*3UL, xmm4 );
614  y.store( j+SIMDSIZE*4UL, xmm5 );
615  y.store( j+SIMDSIZE*5UL, xmm6 );
616  y.store( j+SIMDSIZE*6UL, xmm7 );
617  y.store( j+SIMDSIZE*7UL, xmm8 );
618  }
619 
// Chunks of 4 SIMD vectors.
620  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
621  {
622  const size_t ibegin( ( IsLower<MT1>::value )
623  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
624  :( 0UL ) );
625  const size_t iend( ( IsUpper<MT1>::value )
626  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
627  :( M ) );
628  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
629 
630  SIMDType xmm1, xmm2, xmm3, xmm4;
631 
632  for( size_t i=ibegin; i<iend; ++i ) {
633  const SIMDType x1( set( x[i] ) );
634  xmm1 += x1 * A.load(i,j );
635  xmm2 += x1 * A.load(i,j+SIMDSIZE );
636  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
637  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
638  }
639 
640  y.store( j , xmm1 );
641  y.store( j+SIMDSIZE , xmm2 );
642  y.store( j+SIMDSIZE*2UL, xmm3 );
643  y.store( j+SIMDSIZE*3UL, xmm4 );
644  }
645 
// Chunks of 3 SIMD vectors.
646  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
647  {
648  const size_t ibegin( ( IsLower<MT1>::value )
649  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
650  :( 0UL ) );
651  const size_t iend( ( IsUpper<MT1>::value )
652  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
653  :( M ) );
654  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
655 
656  SIMDType xmm1, xmm2, xmm3;
657 
658  for( size_t i=ibegin; i<iend; ++i ) {
659  const SIMDType x1( set( x[i] ) );
660  xmm1 += x1 * A.load(i,j );
661  xmm2 += x1 * A.load(i,j+SIMDSIZE );
662  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
663  }
664 
665  y.store( j , xmm1 );
666  y.store( j+SIMDSIZE , xmm2 );
667  y.store( j+SIMDSIZE*2UL, xmm3 );
668  }
669 
// Chunks of 2 SIMD vectors.
670  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
671  {
672  const size_t ibegin( ( IsLower<MT1>::value )
673  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
674  :( 0UL ) );
675  const size_t iend( ( IsUpper<MT1>::value )
676  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
677  :( M ) );
678  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
679 
680  SIMDType xmm1, xmm2;
681 
682  for( size_t i=ibegin; i<iend; ++i ) {
683  const SIMDType x1( set( x[i] ) );
684  xmm1 += x1 * A.load(i,j );
685  xmm2 += x1 * A.load(i,j+SIMDSIZE);
686  }
687 
688  y.store( j , xmm1 );
689  y.store( j+SIMDSIZE, xmm2 );
690  }
691 
// Single SIMD vectors up to jpos.
692  for( ; j<jpos; j+=SIMDSIZE )
693  {
694  const size_t ibegin( ( IsLower<MT1>::value )
695  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
696  :( 0UL ) );
697  const size_t iend( ( IsUpper<MT1>::value )
698  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
699  :( M ) );
700  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
701 
702  SIMDType xmm1;
703 
704  for( size_t i=ibegin; i<iend; ++i ) {
705  xmm1 += set( x[i] ) * A.load(i,j);
706  }
707 
708  y.store( j, xmm1 );
709  }
710 
// Scalar handling of the columns [jpos, N); only active when remainder is true.
711  for( ; remainder && j<N; ++j )
712  {
713  const size_t ibegin( ( IsLower<MT1>::value )
714  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
715  :( 0UL ) );
716  const size_t iend( ( IsUpper<MT1>::value )
717  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
718  :( M ) );
719  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
720 
721  ElementType value{};
722 
723  for( size_t i=ibegin; i<iend; ++i ) {
724  value += x[i] * A(i,j);
725  }
726 
727  y[j] = value;
728  }
729  }
731  //**********************************************************************************************
732 
733  //**Default assignment to dense vectors (large matrices)****************************************
// Large-matrix assignment kernel, non-vectorized overload: forwards to the default kernel.
// NOTE(review): listing line 750 (between the template header and the function name, where the
// return type/specifiers would be) is not present in this view, so the condition selecting this
// overload is not visible.
747  template< typename VT1 // Type of the left-hand side target vector
748  , typename VT2 // Type of the left-hand side vector operand
749  , typename MT1 > // Type of the right-hand side matrix operand
751  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
752  {
753  selectDefaultAssignKernel( y, x, A );
754  }
756  //**********************************************************************************************
757 
758  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Large-matrix assignment kernel y = x * A, SIMD overload with blocking.
// The target y is reset first; afterwards the matrix is traversed in tiles of iblock rows and
// jblock columns, and the partial products of each tile are added onto y. Within a tile the
// columns are processed in chunks of 8, 4, 3, 2 and 1 SIMD vectors, followed by a scalar loop
// for the remaining columns when VT1 or MT1 is not padded.
// NOTE(review): listing line 775 (between the template header and the function name, where the
// return type/specifiers would be) is not present in this view. The SIMDType accumulators are
// default-constructed before being accumulated into - presumably this zero-initializes them;
// confirm against the SIMDType definition.
772  template< typename VT1 // Type of the left-hand side target vector
773  , typename VT2 // Type of the left-hand side vector operand
774  , typename MT1 > // Type of the right-hand side matrix operand
776  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
777  {
// A scalar remainder loop is required unless both the target vector and the matrix are padded.
778  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
779 
780  const size_t M( A.rows() );
781  const size_t N( A.columns() );
782 
// Tile sizes: jblock columns (32768 bytes worth of elements) and 8 or 4 rows, depending on
// whether all columns fit into a single column block.
783  const size_t jblock( 32768UL / sizeof( ElementType ) );
784  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
785 
786  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
787 
// All tiles accumulate onto y, so it has to start from its reset state.
788  reset( y );
789 
790  for( size_t jj=0U; jj<N; jj+=jblock ) {
791  for( size_t ii=0UL; ii<M; ii+=iblock )
792  {
// Row range [ii, iend) and column range [j, jend) of the current tile; the column range is
// narrowed for lower and upper matrix types.
793  const size_t iend( min( ii+iblock, M ) );
794  const size_t jtmp( min( jj+jblock, N ) );
795  const size_t jend( ( IsLower<MT1>::value )
796  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
797  :( jtmp ) );
798 
// jpos is jend rounded down to a multiple of SIMDSIZE when a remainder loop is needed (see the
// assertion), otherwise jend itself.
799  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
800  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
801 
// First column of the tile; for upper matrix types the start is derived from the row index and
// rounded down to a multiple of SIMDSIZE, but never lies before jj.
802  size_t j( ( IsUpper<MT1>::value )
803  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
804  :( jj ) );
805 
// Chunks of 8 SIMD vectors.
806  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
807  {
808  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
809 
810  for( size_t i=ii; i<iend; ++i ) {
811  const SIMDType x1( set( x[i] ) );
812  xmm1 += x1 * A.load(i,j );
813  xmm2 += x1 * A.load(i,j+SIMDSIZE );
814  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
815  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
816  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
817  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
818  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
819  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
820  }
821 
822  y.store( j , y.load(j ) + xmm1 );
823  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
824  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
825  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
826  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
827  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
828  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
829  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
830  }
831 
// Chunks of 4 SIMD vectors.
832  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
833  {
834  SIMDType xmm1, xmm2, xmm3, xmm4;
835 
836  for( size_t i=ii; i<iend; ++i ) {
837  const SIMDType x1( set( x[i] ) );
838  xmm1 += x1 * A.load(i,j );
839  xmm2 += x1 * A.load(i,j+SIMDSIZE );
840  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
841  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
842  }
843 
844  y.store( j , y.load(j ) + xmm1 );
845  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
846  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
847  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
848  }
849 
// Chunks of 3 SIMD vectors.
850  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
851  {
852  SIMDType xmm1, xmm2, xmm3;
853 
854  for( size_t i=ii; i<iend; ++i ) {
855  const SIMDType x1( set( x[i] ) );
856  xmm1 += x1 * A.load(i,j );
857  xmm2 += x1 * A.load(i,j+SIMDSIZE );
858  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
859  }
860 
861  y.store( j , y.load(j ) + xmm1 );
862  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
863  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
864  }
865 
// Chunks of 2 SIMD vectors.
866  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
867  {
868  SIMDType xmm1, xmm2;
869 
870  for( size_t i=ii; i<iend; ++i ) {
871  const SIMDType x1( set( x[i] ) );
872  xmm1 += x1 * A.load(i,j );
873  xmm2 += x1 * A.load(i,j+SIMDSIZE);
874  }
875 
876  y.store( j , y.load(j ) + xmm1 );
877  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
878  }
879 
// Single SIMD vectors up to jpos.
880  for( ; j<jpos; j+=SIMDSIZE )
881  {
882  SIMDType xmm1;
883 
884  for( size_t i=ii; i<iend; ++i ) {
885  xmm1 += set( x[i] ) * A.load(i,j);
886  }
887 
888  y.store( j, y.load(j) + xmm1 );
889  }
890 
// Scalar handling of the columns [jpos, jend); only active when remainder is true.
891  for( ; remainder && j<jend; ++j )
892  {
893  ElementType value{};
894 
895  for( size_t i=ii; i<iend; ++i ) {
896  value += x[i] * A(i,j);
897  }
898 
899  y[j] += value;
900  }
901  }
902  }
903  }
905  //**********************************************************************************************
906 
907  //**BLAS-based assignment to dense vectors (default)********************************************
// BLAS assignment kernel, fallback overload: forwards to the large-matrix kernel.
// NOTE(review): listing line 924 (between the template header and the function name, where the
// return type/specifiers would be) is not present in this view, so the condition selecting this
// overload is not visible.
921  template< typename VT1 // Type of the left-hand side target vector
922  , typename VT2 // Type of the left-hand side vector operand
923  , typename MT1 > // Type of the right-hand side matrix operand
925  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
926  {
927  selectLargeAssignKernel( y, x, A );
928  }
930  //**********************************************************************************************
931 
932  //**BLAS-based assignment to dense vectors******************************************************
933 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
934 
// BLAS-based assignment kernel y = x * A; only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are both enabled (see the enclosing #if).
// Two paths are visible: one copies x into y and applies trmv() in place, selecting CblasLower
// or CblasUpper from IsLower<MT1>; the other calls gemv() with the scalar factors ET(1), ET(0).
// NOTE(review): listing lines 950 (return type/specifiers) and 955 (the condition selecting the
// trmv path) are not present in this view.
947  template< typename VT1 // Type of the left-hand side target vector
948  , typename VT2 // Type of the left-hand side vector operand
949  , typename MT1 > // Type of the right-hand side matrix operand
951  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
952  {
953  using ET = ElementType_<VT1>;
954 
956  assign( y, x );
957  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
958  }
959  else {
960  gemv( y, x, A, ET(1), ET(0) );
961  }
962  }
964 #endif
965  //**********************************************************************************************
966 
967  //**Assignment to sparse vectors****************************************************************
// Assignment of the vector/matrix product to a sparse (transpose) vector: lhs = rhs.
// The expression is first evaluated serially into a temporary of type ResultType, which is then
// assigned to the target.
//   lhs  target sparse vector; its size is expected to match rhs.size() (internal assertion)
//   rhs  multiplication expression to be evaluated
// NOTE(review): listing lines 983 and 985-987 are not present in this view.
980  template< typename VT1 > // Type of the target sparse vector
981  friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
982  {
984 
988 
989  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
990 
991  const ResultType tmp( serial( rhs ) );
992  assign( ~lhs, tmp );
993  }
995  //**********************************************************************************************
996 
997  //**Addition assignment to dense vectors********************************************************
// Addition assignment of the vector/matrix product to a dense (transpose) vector: lhs += rhs.
//   lhs  target dense vector; its size is expected to match rhs.size() (internal assertion)
//   rhs  multiplication expression to be evaluated
// NOTE(review): listing line 1013 is not present in this view; the types LT and RT used below
// are declared outside the visible part of the class.
1010  template< typename VT1 > // Type of the target dense vector
1011  friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
1012  {
1014 
1015  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1016 
// An empty matrix contributes nothing, so the target stays untouched.
1017  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1018  return;
1019  }
1020 
1021  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1022  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1023 
1024  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1025  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1026  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1027  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1028 
// Dispatch to the kernel selection with the evaluated operands.
1029  TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1030  }
1032  //**********************************************************************************************
1033 
1034  //**Addition assignment to dense vectors (kernel selection)*************************************
// Kernel selection for the addition assignment y += x * A.
// The small-matrix kernel is chosen when MT1 is a diagonal matrix type, when the original matrix
// operand MT is a computation that is not evaluated, or when A has fewer than
// TDVECDMATMULT_THRESHOLD elements; otherwise the BLAS kernel selection is used.
1045  template< typename VT1 // Type of the left-hand side target vector
1046  , typename VT2 // Type of the left-hand side vector operand
1047  , typename MT1 > // Type of the right-hand side matrix operand
1048  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1049  {
1050  if( ( IsDiagonal<MT1>::value ) ||
1051  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1052  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1053  selectSmallAddAssignKernel( y, x, A );
1054  else
1055  selectBlasAddAssignKernel( y, x, A );
1056  }
1058  //**********************************************************************************************
1059 
1060  //**Default addition assignment to dense vectors************************************************
// Default (scalar) addition assignment kernel y += x * A.
// Every row i of A adds x[i] * A(i,j) onto the affected elements y[j].
// NOTE(review): listing line 1084 (the condition guarding the first block inside the loop) is
// not present in this view.
1074  template< typename VT1 // Type of the left-hand side target vector
1075  , typename VT2 // Type of the left-hand side vector operand
1076  , typename MT1 > // Type of the right-hand side matrix operand
1077  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1078  {
1079  const size_t M( A.rows() );
1080  const size_t N( A.columns() );
1081 
1082  for( size_t i=0UL; i<M; ++i )
1083  {
// Guarded by a condition that is not visible here (listing line 1084): only the diagonal element
// contributes.
1085  {
1086  y[i] += x[i] * A(i,i);
1087  }
1088  else
1089  {
// Column range [jbegin, jend) of row i that is accumulated onto y; narrowed for upper and lower
// matrix types.
1090  const size_t jbegin( ( IsUpper<MT1>::value )
1091  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1092  :( 0UL ) );
1093  const size_t jend( ( IsLower<MT1>::value )
1094  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1095  :( N ) );
1096  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1097 
// jpos marks the end of the part of the range that can be processed two columns at a time.
1098  const size_t jnum( jend - jbegin );
1099  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1100 
1101  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1102  y[j ] += x[i] * A(i,j );
1103  y[j+1UL] += x[i] * A(i,j+1UL);
1104  }
1105  if( jpos < jend ) {
1106  y[jpos] += x[i] * A(i,jpos);
1107  }
1108  }
1109  }
1110  }
1112  //**********************************************************************************************
1113 
1114  //**Default addition assignment to dense vectors (small matrices)*******************************
// Small-matrix addition assignment kernel, non-vectorized overload: forwards to the default
// addition assignment kernel.
// NOTE(review): listing line 1131 (between the template header and the function name, where the
// return type/specifiers would be) is not present in this view, so the condition selecting this
// overload is not visible.
1128  template< typename VT1 // Type of the left-hand side target vector
1129  , typename VT2 // Type of the left-hand side vector operand
1130  , typename MT1 > // Type of the right-hand side matrix operand
1132  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1133  {
1134  selectDefaultAddAssignKernel( y, x, A );
1135  }
1137  //**********************************************************************************************
1138 
1139  //**Vectorized default addition assignment to dense vectors (small matrices)********************
      // Vectorized kernel for small matrices: adds the product x * A onto y (y += x * A).
      //   y : target vector, x : left-hand side vector operand, A : right-hand side matrix operand.
      // The columns are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors, followed by a scalar
      // loop for the columns that the SIMD loops do not cover.
      // NOTE(review): this listing skips original line 1156 between the template header and the
      // function name; presumably it carried the return type / enable-if condition -- confirm
      // against the actual header.
1153  template< typename VT1 // Type of the left-hand side target vector
1154  , typename VT2 // Type of the left-hand side vector operand
1155  , typename MT1 > // Type of the right-hand side matrix operand
1157  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1158  {
      // A scalar remainder loop is needed unless both the target vector and the matrix are padded.
1159  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1160 
1161  const size_t M( A.rows() );
1162  const size_t N( A.columns() );
1163 
      // End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE when a
      // remainder loop is required (see the assertion below), N itself otherwise.
1164  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1165  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1166 
1167  size_t j( 0UL );
1168 
      // Eight SIMD vectors of columns per iteration.
1169  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1170  {
      // Row range [ibegin,iend) for this column group: it starts at j (j+1 for strictly lower) for
      // lower matrices and ends at min(j+SIMDSIZE*8,M) (one less for strictly upper) for upper
      // matrices; otherwise all rows [0,M) are visited.
1171  const size_t ibegin( ( IsLower<MT1>::value )
1172  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1173  :( 0UL ) );
1174  const size_t iend( ( IsUpper<MT1>::value )
1175  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1176  :( M ) );
1177  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1178 
      // The accumulators start from the current values of y, so the products are added on top.
1179  SIMDType xmm1( y.load(j ) );
1180  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1181  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1182  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1183  SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1184  SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1185  SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1186  SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1187 
1188  for( size_t i=ibegin; i<iend; ++i ) {
      // x[i] is broadcast once and multiplied with eight consecutive SIMD vectors of row i.
1189  const SIMDType x1( set( x[i] ) );
1190  xmm1 += x1 * A.load(i,j );
1191  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1192  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1193  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1194  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1195  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1196  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1197  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1198  }
1199 
1200  y.store( j , xmm1 );
1201  y.store( j+SIMDSIZE , xmm2 );
1202  y.store( j+SIMDSIZE*2UL, xmm3 );
1203  y.store( j+SIMDSIZE*3UL, xmm4 );
1204  y.store( j+SIMDSIZE*4UL, xmm5 );
1205  y.store( j+SIMDSIZE*5UL, xmm6 );
1206  y.store( j+SIMDSIZE*6UL, xmm7 );
1207  y.store( j+SIMDSIZE*7UL, xmm8 );
1208  }
1209 
      // Four SIMD vectors of columns per iteration.
1210  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1211  {
1212  const size_t ibegin( ( IsLower<MT1>::value )
1213  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1214  :( 0UL ) );
1215  const size_t iend( ( IsUpper<MT1>::value )
1216  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1217  :( M ) );
1218  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1219 
1220  SIMDType xmm1( y.load(j ) );
1221  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1222  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1223  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1224 
1225  for( size_t i=ibegin; i<iend; ++i ) {
1226  const SIMDType x1( set( x[i] ) );
1227  xmm1 += x1 * A.load(i,j );
1228  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1229  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1230  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1231  }
1232 
1233  y.store( j , xmm1 );
1234  y.store( j+SIMDSIZE , xmm2 );
1235  y.store( j+SIMDSIZE*2UL, xmm3 );
1236  y.store( j+SIMDSIZE*3UL, xmm4 );
1237  }
1238 
      // Three SIMD vectors of columns per iteration.
1239  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1240  {
1241  const size_t ibegin( ( IsLower<MT1>::value )
1242  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1243  :( 0UL ) );
1244  const size_t iend( ( IsUpper<MT1>::value )
1245  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1246  :( M ) );
1247  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1248 
1249  SIMDType xmm1( y.load(j ) );
1250  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1251  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1252 
1253  for( size_t i=ibegin; i<iend; ++i ) {
1254  const SIMDType x1( set( x[i] ) );
1255  xmm1 += x1 * A.load(i,j );
1256  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1257  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1258  }
1259 
1260  y.store( j , xmm1 );
1261  y.store( j+SIMDSIZE , xmm2 );
1262  y.store( j+SIMDSIZE*2UL, xmm3 );
1263  }
1264 
      // Two SIMD vectors of columns per iteration.
1265  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1266  {
1267  const size_t ibegin( ( IsLower<MT1>::value )
1268  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1269  :( 0UL ) );
1270  const size_t iend( ( IsUpper<MT1>::value )
1271  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1272  :( M ) );
1273  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1274 
1275  SIMDType xmm1( y.load(j ) );
1276  SIMDType xmm2( y.load(j+SIMDSIZE) );
1277 
1278  for( size_t i=ibegin; i<iend; ++i ) {
1279  const SIMDType x1( set( x[i] ) );
1280  xmm1 += x1 * A.load(i,j );
1281  xmm2 += x1 * A.load(i,j+SIMDSIZE);
1282  }
1283 
1284  y.store( j , xmm1 );
1285  y.store( j+SIMDSIZE, xmm2 );
1286  }
1287 
      // One SIMD vector of columns per iteration.
1288  for( ; j<jpos; j+=SIMDSIZE )
1289  {
1290  const size_t ibegin( ( IsLower<MT1>::value )
1291  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1292  :( 0UL ) );
1293  const size_t iend( ( IsUpper<MT1>::value )
1294  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1295  :( M ) );
1296  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1297 
1298  SIMDType xmm1( y.load(j) );
1299 
1300  for( size_t i=ibegin; i<iend; ++i ) {
1301  xmm1 += set( x[i] ) * A.load(i,j);
1302  }
1303 
1304  y.store( j, xmm1 );
1305  }
1306 
      // Scalar loop for the columns the SIMD loops left over; only active when 'remainder' is true.
1307  for( ; remainder && j<N; ++j )
1308  {
1309  const size_t ibegin( ( IsLower<MT1>::value )
1310  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1311  :( 0UL ) );
1312  const size_t iend( ( IsUpper<MT1>::value )
1313  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1314  :( M ) );
1315  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1316 
      // The column's dot product is summed in a value-initialized scalar and then added to y[j].
1317  ElementType value{};
1318 
1319  for( size_t i=ibegin; i<iend; ++i ) {
1320  value += x[i] * A(i,j);
1321  }
1322 
1323  y[j] += value;
1324  }
1325  }
1327  //**********************************************************************************************
1328 
1329  //**Default addition assignment to dense vectors (large matrices)*******************************
      // Non-vectorized large-matrix kernel: simply forwards y, x and A to
      // selectDefaultAddAssignKernel().
      // NOTE(review): original line 1346 (between the template header and the function name) is
      // not part of this listing; presumably it holds the return type / enable-if condition that
      // separates this overload from the vectorized one -- confirm against the actual header.
1343  template< typename VT1 // Type of the left-hand side target vector
1344  , typename VT2 // Type of the left-hand side vector operand
1345  , typename MT1 > // Type of the right-hand side matrix operand
1347  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1348  {
1349  selectDefaultAddAssignKernel( y, x, A );
1350  }
1352  //**********************************************************************************************
1353 
1354  //**Vectorized default addition assignment to dense vectors (large matrices)********************
      // Vectorized kernel for large matrices: adds the product x * A onto y (y += x * A), working
      // tile by tile. A tile spans up to 'jblock' columns and up to 'iblock' rows; within a tile
      // the columns are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors plus a scalar loop.
      //   y : target vector, x : left-hand side vector operand, A : right-hand side matrix operand.
      // NOTE(review): original line 1371 (between the template header and the function name) is
      // not part of this listing; presumably it holds the return type / enable-if condition --
      // confirm against the actual header.
1368  template< typename VT1 // Type of the left-hand side target vector
1369  , typename VT2 // Type of the left-hand side vector operand
1370  , typename MT1 > // Type of the right-hand side matrix operand
1372  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1373  {
      // A scalar remainder loop is needed unless both the target vector and the matrix are padded.
1374  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1375 
1376  const size_t M( A.rows() );
1377  const size_t N( A.columns() );
1378 
      // Column block: as many elements as fit into 32768 bytes. Row block: 8 rows if all columns
      // fit into a single column block, 4 rows otherwise.
1379  const size_t jblock( 32768UL / sizeof( ElementType ) );
1380  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1381 
1382  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1383 
1384  for( size_t jj=0U; jj<N; jj+=jblock ) {
1385  for( size_t ii=0UL; ii<M; ii+=iblock )
1386  {
1387  const size_t iend( min( ii+iblock, M ) );
1388  const size_t jtmp( min( jj+jblock, N ) );
      // For lower matrices the tile's column range is additionally cut at iend (iend-1 for
      // strictly lower matrices).
1389  const size_t jend( ( IsLower<MT1>::value )
1390  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
1391  :( jtmp ) );
1392 
      // End of the SIMD-processed columns: jend rounded down to a multiple of SIMDSIZE when a
      // remainder loop is required (see the assertion below), jend itself otherwise.
1393  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1394  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1395 
      // For upper matrices the first column is ii (ii+1 for strictly upper) masked with
      // size_t(-SIMDSIZE), but never smaller than jj; otherwise the tile starts at jj.
1396  size_t j( ( IsUpper<MT1>::value )
1397  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
1398  :( jj ) );
1399 
1400  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1401  {
      // NOTE(review): the accumulators are default-constructed and then only updated with +=;
      // this presumably relies on SIMDType's default constructor producing zeros -- confirm.
1402  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1403 
1404  for( size_t i=ii; i<iend; ++i ) {
1405  const SIMDType x1( set( x[i] ) );
1406  xmm1 += x1 * A.load(i,j );
1407  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1408  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1409  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1410  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1411  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1412  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1413  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1414  }
1415 
      // The tile's partial sums are added to the values already stored in y.
1416  y.store( j , y.load(j ) + xmm1 );
1417  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1418  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1419  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1420  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
1421  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
1422  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
1423  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
1424  }
1425 
1426  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1427  {
1428  SIMDType xmm1, xmm2, xmm3, xmm4;
1429 
1430  for( size_t i=ii; i<iend; ++i ) {
1431  const SIMDType x1( set( x[i] ) );
1432  xmm1 += x1 * A.load(i,j );
1433  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1434  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1435  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1436  }
1437 
1438  y.store( j , y.load(j ) + xmm1 );
1439  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1440  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1441  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1442  }
1443 
1444  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1445  {
1446  SIMDType xmm1, xmm2, xmm3;
1447 
1448  for( size_t i=ii; i<iend; ++i ) {
1449  const SIMDType x1( set( x[i] ) );
1450  xmm1 += x1 * A.load(i,j );
1451  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1452  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1453  }
1454 
1455  y.store( j , y.load(j ) + xmm1 );
1456  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1457  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1458  }
1459 
1460  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1461  {
1462  SIMDType xmm1, xmm2;
1463 
1464  for( size_t i=ii; i<iend; ++i ) {
1465  const SIMDType x1( set( x[i] ) );
1466  xmm1 += x1 * A.load(i,j );
1467  xmm2 += x1 * A.load(i,j+SIMDSIZE);
1468  }
1469 
1470  y.store( j , y.load(j ) + xmm1 );
1471  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
1472  }
1473 
1474  for( ; j<jpos; j+=SIMDSIZE )
1475  {
1476  SIMDType xmm1;
1477 
1478  for( size_t i=ii; i<iend; ++i ) {
1479  xmm1 += set( x[i] ) * A.load(i,j);
1480  }
1481 
1482  y.store( j, y.load(j) + xmm1 );
1483  }
1484 
      // Scalar loop for the tile's remaining columns; only active when 'remainder' is true.
1485  for( ; remainder && j<jend; ++j )
1486  {
1487  ElementType value{};
1488 
1489  for( size_t i=ii; i<iend; ++i ) {
1490  value += x[i] * A(i,j);
1491  }
1492 
1493  y[j] += value;
1494  }
1495  }
1496  }
1497  }
1499  //**********************************************************************************************
1500 
1501  //**BLAS-based addition assignment to dense vectors (default)***********************************
      // Fallback for the BLAS-based addition assignment: forwards y, x and A to
      // selectLargeAddAssignKernel().
      // NOTE(review): original line 1518 (between the template header and the function name) is
      // not part of this listing; presumably it holds the return type / enable-if condition --
      // confirm against the actual header.
1515  template< typename VT1 // Type of the left-hand side target vector
1516  , typename VT2 // Type of the left-hand side vector operand
1517  , typename MT1 > // Type of the right-hand side matrix operand
1519  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1520  {
1521  selectLargeAddAssignKernel( y, x, A );
1522  }
1524  //**********************************************************************************************
1525 
1526  //**BLAS-based addition assignment to dense vectors*********************************************
      // Only compiled when both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
      // are enabled.
1527 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1528 
      // BLAS-based kernel for the addition assignment of x * A to y.
      // NOTE(review): original line 1544 (between the template header and the function name) is
      // not part of this listing; presumably it holds the return type / enable-if condition --
      // confirm against the actual header.
1541  template< typename VT1 // Type of the left-hand side target vector
1542  , typename VT2 // Type of the left-hand side vector operand
1543  , typename MT1 > // Type of the right-hand side matrix operand
1545  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1546  {
1547  using ET = ElementType_<VT1>;
1548 
1549  if( IsTriangular<MT1>::value ) {
      // Triangular matrix: x is copied into a temporary, trmv() is applied to that temporary
      // (lower or upper variant depending on IsLower), and the temporary is then added to y.
1550  ResultType_<VT1> tmp( serial( x ) );
1551  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1552  addAssign( y, tmp );
1553  }
1554  else {
      // General matrix: a single gemv() call with both scalars set to ET(1).
      // NOTE(review): presumably these are alpha and beta, i.e. y = 1*(x*A) + 1*y -- confirm
      // against the gemv() wrapper.
1555  gemv( y, x, A, ET(1), ET(1) );
1556  }
1557  }
1559 #endif
1560  //**********************************************************************************************
1561 
1562  //**Addition assignment to sparse vectors*******************************************************
1563  // No special implementation for the addition assignment to sparse vectors.
1564  //**********************************************************************************************
1565 
1566  //**Subtraction assignment to dense vectors*****************************************************
      // Subtraction assignment of a transpose dense vector-dense matrix multiplication to a dense
      // vector (lhs -= rhs). Both operands of the expression are evaluated serially and the work
      // is handed to selectSubAssignKernel().
      //   lhs : target dense vector, rhs : multiplication expression to be subtracted.
      // NOTE(review): original line 1582 (first line of the body) is not part of this listing.
1579  template< typename VT1 > // Type of the target dense vector
1580  friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
1581  {
1583 
1584  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1585 
      // Nothing is subtracted when the matrix operand has no rows or no columns.
1586  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1587  return;
1588  }
1589 
1590  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1591  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1592 
1593  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1594  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1595  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1596  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1597 
1598  TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1599  }
1601  //**********************************************************************************************
1602 
1603  //**Subtraction assignment to dense vectors (kernel selection)**********************************
      // Chooses the kernel for the subtraction assignment y -= x * A.
      //   y : target vector, x : left-hand side vector operand, A : right-hand side matrix operand.
      // The small-matrix kernel is used when A is diagonal, when the matrix operand type MT is a
      // computation that is not evaluated up front (!evaluateMatrix), or when the number of matrix
      // elements is below TDVECDMATMULT_THRESHOLD. In all other cases the BLAS kernel is selected.
1614  template< typename VT1 // Type of the left-hand side target vector
1615  , typename VT2 // Type of the left-hand side vector operand
1616  , typename MT1 > // Type of the right-hand side matrix operand
1617  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1618  {
1619  if( ( IsDiagonal<MT1>::value ) ||
1620  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1621  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1622  selectSmallSubAssignKernel( y, x, A );
1623  else
1624  selectBlasSubAssignKernel( y, x, A );
1625  }
1627  //**********************************************************************************************
1628 
1629  //**Default subtraction assignment to dense vectors*********************************************
      // Scalar (non-vectorized) kernel for the subtraction assignment y -= x * A. The matrix is
      // traversed row by row and each row i contributes x[i] * A(i,j) to the elements y[j].
      //   y : target vector, x : left-hand side vector operand, A : right-hand side matrix operand.
1643  template< typename VT1 // Type of the left-hand side target vector
1644  , typename VT2 // Type of the left-hand side vector operand
1645  , typename MT1 > // Type of the right-hand side matrix operand
1646  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1647  {
1648  const size_t M( A.rows() );
1649  const size_t N( A.columns() );
1650 
1651  for( size_t i=0UL; i<M; ++i )
1652  {
      // NOTE(review): original line 1653 is not part of this listing. Since the following block
      // is paired with an 'else', it presumably held the condition that selects this
      // diagonal-only update (only element (i,i) is used) -- confirm against the actual header.
1654  {
1655  y[i] -= x[i] * A(i,i);
1656  }
1657  else
1658  {
      // Column range [jbegin,jend) of row i: it starts at i (i+1 for strictly upper) for upper
      // matrices and ends at i+1 (i for strictly lower) for lower matrices; otherwise [0,N).
1659  const size_t jbegin( ( IsUpper<MT1>::value )
1660  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1661  :( 0UL ) );
1662  const size_t jend( ( IsLower<MT1>::value )
1663  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1664  :( N ) );
1665  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1666 
      // The range is processed two columns at a time; jpos marks the end of the even-sized part.
1667  const size_t jnum( jend - jbegin );
1668  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1669 
1670  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1671  y[j ] -= x[i] * A(i,j );
1672  y[j+1UL] -= x[i] * A(i,j+1UL);
1673  }
      // Single leftover column when the range has an odd number of elements.
1674  if( jpos < jend ) {
1675  y[jpos] -= x[i] * A(i,jpos);
1676  }
1677  }
1678  }
1679  }
1681  //**********************************************************************************************
1682 
1683  //**Default subtraction assignment to dense vectors (small matrices)****************************
      // Non-vectorized small-matrix kernel: simply forwards y, x and A to
      // selectDefaultSubAssignKernel().
      // NOTE(review): original line 1700 (between the template header and the function name) is
      // not part of this listing; presumably it holds the return type / enable-if condition --
      // confirm against the actual header.
1697  template< typename VT1 // Type of the left-hand side target vector
1698  , typename VT2 // Type of the left-hand side vector operand
1699  , typename MT1 > // Type of the right-hand side matrix operand
1701  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1702  {
1703  selectDefaultSubAssignKernel( y, x, A );
1704  }
1706  //**********************************************************************************************
1707 
1708  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
      // Vectorized kernel for small matrices: subtracts the product x * A from y (y -= x * A).
      //   y : target vector, x : left-hand side vector operand, A : right-hand side matrix operand.
      // The columns are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors, followed by a scalar
      // loop for the columns that the SIMD loops do not cover.
      // NOTE(review): original line 1726 (between the template header and the function name) is
      // not part of this listing; presumably it holds the return type / enable-if condition --
      // confirm against the actual header.
1723  template< typename VT1 // Type of the left-hand side target vector
1724  , typename VT2 // Type of the left-hand side vector operand
1725  , typename MT1 > // Type of the right-hand side matrix operand
1727  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1728  {
      // A scalar remainder loop is needed unless both the target vector and the matrix are padded.
1729  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1730 
1731  const size_t M( A.rows() );
1732  const size_t N( A.columns() );
1733 
      // End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE when a
      // remainder loop is required (see the assertion below), N itself otherwise.
1734  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1735  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1736 
1737  size_t j( 0UL );
1738 
      // Eight SIMD vectors of columns per iteration.
1739  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1740  {
      // Row range [ibegin,iend) for this column group: it starts at j (j+1 for strictly lower) for
      // lower matrices and ends at min(j+SIMDSIZE*8,M) (one less for strictly upper) for upper
      // matrices; otherwise all rows [0,M) are visited.
1741  const size_t ibegin( ( IsLower<MT1>::value )
1742  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1743  :( 0UL ) );
1744  const size_t iend( ( IsUpper<MT1>::value )
1745  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1746  :( M ) );
1747  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1748 
      // The accumulators start from the current values of y; the products are subtracted from them.
1749  SIMDType xmm1( y.load(j ) );
1750  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1751  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1752  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1753  SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1754  SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1755  SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1756  SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1757 
1758  for( size_t i=ibegin; i<iend; ++i ) {
1759  const SIMDType x1( set( x[i] ) );
1760  xmm1 -= x1 * A.load(i,j );
1761  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1762  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1763  xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1764  xmm5 -= x1 * A.load(i,j+SIMDSIZE*4UL);
1765  xmm6 -= x1 * A.load(i,j+SIMDSIZE*5UL);
1766  xmm7 -= x1 * A.load(i,j+SIMDSIZE*6UL);
1767  xmm8 -= x1 * A.load(i,j+SIMDSIZE*7UL);
1768  }
1769 
1770  y.store( j , xmm1 );
1771  y.store( j+SIMDSIZE , xmm2 );
1772  y.store( j+SIMDSIZE*2UL, xmm3 );
1773  y.store( j+SIMDSIZE*3UL, xmm4 );
1774  y.store( j+SIMDSIZE*4UL, xmm5 );
1775  y.store( j+SIMDSIZE*5UL, xmm6 );
1776  y.store( j+SIMDSIZE*6UL, xmm7 );
1777  y.store( j+SIMDSIZE*7UL, xmm8 );
1778  }
1779 
      // Four SIMD vectors of columns per iteration.
1780  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1781  {
1782  const size_t ibegin( ( IsLower<MT1>::value )
1783  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1784  :( 0UL ) );
1785  const size_t iend( ( IsUpper<MT1>::value )
1786  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1787  :( M ) );
1788  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1789 
1790  SIMDType xmm1( y.load(j ) );
1791  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1792  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1793  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1794 
1795  for( size_t i=ibegin; i<iend; ++i ) {
1796  const SIMDType x1( set( x[i] ) );
1797  xmm1 -= x1 * A.load(i,j );
1798  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1799  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1800  xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1801  }
1802 
1803  y.store( j , xmm1 );
1804  y.store( j+SIMDSIZE , xmm2 );
1805  y.store( j+SIMDSIZE*2UL, xmm3 );
1806  y.store( j+SIMDSIZE*3UL, xmm4 );
1807  }
1808 
      // Three SIMD vectors of columns per iteration.
1809  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1810  {
1811  const size_t ibegin( ( IsLower<MT1>::value )
1812  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1813  :( 0UL ) );
1814  const size_t iend( ( IsUpper<MT1>::value )
1815  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1816  :( M ) );
1817  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1818 
1819  SIMDType xmm1( y.load(j ) );
1820  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1821  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1822 
1823  for( size_t i=ibegin; i<iend; ++i ) {
1824  const SIMDType x1( set( x[i] ) );
1825  xmm1 -= x1 * A.load(i,j );
1826  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1827  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1828  }
1829 
1830  y.store( j , xmm1 );
1831  y.store( j+SIMDSIZE , xmm2 );
1832  y.store( j+SIMDSIZE*2UL, xmm3 );
1833  }
1834 
      // Two SIMD vectors of columns per iteration.
1835  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1836  {
1837  const size_t ibegin( ( IsLower<MT1>::value )
1838  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1839  :( 0UL ) );
1840  const size_t iend( ( IsUpper<MT1>::value )
1841  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1842  :( M ) );
1843  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1844 
1845  SIMDType xmm1( y.load(j ) );
1846  SIMDType xmm2( y.load(j+SIMDSIZE) );
1847 
1848  for( size_t i=ibegin; i<iend; ++i ) {
1849  const SIMDType x1( set( x[i] ) );
1850  xmm1 -= x1 * A.load(i,j );
1851  xmm2 -= x1 * A.load(i,j+SIMDSIZE);
1852  }
1853 
1854  y.store( j , xmm1 );
1855  y.store( j+SIMDSIZE, xmm2 );
1856  }
1857 
      // One SIMD vector of columns per iteration.
1858  for( ; j<jpos; j+=SIMDSIZE )
1859  {
1860  const size_t ibegin( ( IsLower<MT1>::value )
1861  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1862  :( 0UL ) );
1863  const size_t iend( ( IsUpper<MT1>::value )
1864  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1865  :( M ) );
1866  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1867 
1868  SIMDType xmm1( y.load(j) );
1869 
1870  for( size_t i=ibegin; i<iend; ++i ) {
1871  xmm1 -= set( x[i] ) * A.load(i,j);
1872  }
1873 
1874  y.store( j, xmm1 );
1875  }
1876 
      // Scalar loop for the columns the SIMD loops left over; only active when 'remainder' is true.
1877  for( ; remainder && j<N; ++j )
1878  {
1879  const size_t ibegin( ( IsLower<MT1>::value )
1880  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1881  :( 0UL ) );
1882  const size_t iend( ( IsUpper<MT1>::value )
1883  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1884  :( M ) );
1885  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1886 
      // The column's dot product is summed first and then subtracted from y[j] in one step.
1887  ElementType value{};
1888 
1889  for( size_t i=ibegin; i<iend; ++i ) {
1890  value += x[i] * A(i,j);
1891  }
1892 
1893  y[j] -= value;
1894  }
1895  }
1897  //**********************************************************************************************
1898 
1899  //**Default subtraction assignment to dense vectors (large matrices)****************************
      // Non-vectorized large-matrix kernel: simply forwards y, x and A to
      // selectDefaultSubAssignKernel().
      // NOTE(review): original line 1916 (between the template header and the function name) is
      // not part of this listing; presumably it holds the return type / enable-if condition --
      // confirm against the actual header.
1913  template< typename VT1 // Type of the left-hand side target vector
1914  , typename VT2 // Type of the left-hand side vector operand
1915  , typename MT1 > // Type of the right-hand side matrix operand
1917  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1918  {
1919  selectDefaultSubAssignKernel( y, x, A );
1920  }
1922  //**********************************************************************************************
1923 
1924  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
      // Vectorized kernel for large matrices: subtracts the product x * A from y (y -= x * A),
      // working tile by tile. A tile spans up to 'jblock' columns and up to 'iblock' rows; within a
      // tile the columns are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors plus a scalar loop.
      //   y : target vector, x : left-hand side vector operand, A : right-hand side matrix operand.
      // NOTE(review): original line 1942 (between the template header and the function name) is
      // not part of this listing; presumably it holds the return type / enable-if condition --
      // confirm against the actual header.
1939  template< typename VT1 // Type of the left-hand side target vector
1940  , typename VT2 // Type of the left-hand side vector operand
1941  , typename MT1 > // Type of the right-hand side matrix operand
1943  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1944  {
      // A scalar remainder loop is needed unless both the target vector and the matrix are padded.
1945  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1946 
1947  const size_t M( A.rows() );
1948  const size_t N( A.columns() );
1949 
      // Column block: as many elements as fit into 32768 bytes. Row block: 8 rows if all columns
      // fit into a single column block, 4 rows otherwise.
1950  const size_t jblock( 32768UL / sizeof( ElementType ) );
1951  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1952 
1953  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1954 
1955  for( size_t jj=0U; jj<N; jj+=jblock ) {
1956  for( size_t ii=0UL; ii<M; ii+=iblock )
1957  {
1958  const size_t iend( min( ii+iblock, M ) );
1959  const size_t jtmp( min( jj+jblock, N ) );
      // For lower matrices the tile's column range is additionally cut at iend (iend-1 for
      // strictly lower matrices).
1960  const size_t jend( ( IsLower<MT1>::value )
1961  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
1962  :( jtmp ) );
1963 
      // End of the SIMD-processed columns: jend rounded down to a multiple of SIMDSIZE when a
      // remainder loop is required (see the assertion below), jend itself otherwise.
1964  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1965  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1966 
      // For upper matrices the first column is ii (ii+1 for strictly upper) masked with
      // size_t(-SIMDSIZE), but never smaller than jj; otherwise the tile starts at jj.
1967  size_t j( ( IsUpper<MT1>::value )
1968  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
1969  :( jj ) );
1970 
1971  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1972  {
      // NOTE(review): the accumulators are default-constructed and then only updated with +=;
      // this presumably relies on SIMDType's default constructor producing zeros -- confirm.
1973  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1974 
      // The products of the tile are summed up first ...
1975  for( size_t i=ii; i<iend; ++i ) {
1976  const SIMDType x1( set( x[i] ) );
1977  xmm1 += x1 * A.load(i,j );
1978  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1979  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1980  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1981  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1982  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1983  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1984  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1985  }
1986 
      // ... and the sums are then subtracted from the values already stored in y.
1987  y.store( j , y.load(j ) - xmm1 );
1988  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
1989  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
1990  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
1991  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5 );
1992  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6 );
1993  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7 );
1994  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8 );
1995  }
1996 
1997  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1998  {
1999  SIMDType xmm1, xmm2, xmm3, xmm4;
2000 
2001  for( size_t i=ii; i<iend; ++i ) {
2002  const SIMDType x1( set( x[i] ) );
2003  xmm1 += x1 * A.load(i,j );
2004  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2005  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2006  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2007  }
2008 
2009  y.store( j , y.load(j ) - xmm1 );
2010  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2011  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2012  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
2013  }
2014 
2015  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2016  {
2017  SIMDType xmm1, xmm2, xmm3;
2018 
2019  for( size_t i=ii; i<iend; ++i ) {
2020  const SIMDType x1( set( x[i] ) );
2021  xmm1 += x1 * A.load(i,j );
2022  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2023  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2024  }
2025 
2026  y.store( j , y.load(j ) - xmm1 );
2027  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2028  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2029  }
2030 
2031  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2032  {
2033  SIMDType xmm1, xmm2;
2034 
2035  for( size_t i=ii; i<iend; ++i ) {
2036  const SIMDType x1( set( x[i] ) );
2037  xmm1 += x1 * A.load(i,j );
2038  xmm2 += x1 * A.load(i,j+SIMDSIZE);
2039  }
2040 
2041  y.store( j , y.load(j ) - xmm1 );
2042  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2 );
2043  }
2044 
2045  for( ; j<jpos; j+=SIMDSIZE )
2046  {
2047  SIMDType xmm1;
2048 
2049  for( size_t i=ii; i<iend; ++i ) {
2050  xmm1 += set( x[i] ) * A.load(i,j);
2051  }
2052 
2053  y.store( j, y.load(j) - xmm1 );
2054  }
2055 
      // Scalar loop for the tile's remaining columns; only active when 'remainder' is true.
2056  for( ; remainder && j<jend; ++j )
2057  {
2058  ElementType value{};
2059 
2060  for( size_t i=ii; i<iend; ++i ) {
2061  value += x[i] * A(i,j);
2062  }
2063 
2064  y[j] -= value;
2065  }
2066  }
2067  }
2068  }
2070  //**********************************************************************************************
2071 
2072  //**BLAS-based subtraction assignment to dense vectors (default)********************************
      // Fallback for the BLAS-based subtraction assignment: forwards y, x and A to
      // selectLargeSubAssignKernel().
      // NOTE(review): original line 2089 (between the template header and the function name) is
      // not part of this listing; presumably it holds the return type / enable-if condition --
      // confirm against the actual header.
2086  template< typename VT1 // Type of the left-hand side target vector
2087  , typename VT2 // Type of the left-hand side vector operand
2088  , typename MT1 > // Type of the right-hand side matrix operand
2090  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2091  {
2092  selectLargeSubAssignKernel( y, x, A );
2093  }
2095  //**********************************************************************************************
2096 
2097  //**BLAS-based subtraction assignment to dense vectors******************************************
      // Only compiled when both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
      // are enabled.
2098 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2099 
      // BLAS-based kernel for the subtraction assignment of x * A from y.
      // NOTE(review): original line 2115 (between the template header and the function name) is
      // not part of this listing; presumably it holds the return type / enable-if condition --
      // confirm against the actual header.
2112  template< typename VT1 // Type of the left-hand side target vector
2113  , typename VT2 // Type of the left-hand side vector operand
2114  , typename MT1 > // Type of the right-hand side matrix operand
2116  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2117  {
2118  using ET = ElementType_<VT1>;
2119 
2120  if( IsTriangular<MT1>::value ) {
      // Triangular matrix: x is copied into a temporary, trmv() is applied to that temporary
      // (lower or upper variant depending on IsLower), and the temporary is then subtracted from y.
2121  ResultType_<VT1> tmp( serial( x ) );
2122  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2123  subAssign( y, tmp );
2124  }
2125  else {
      // General matrix: a single gemv() call with the scalars ET(-1) and ET(1).
      // NOTE(review): presumably these are alpha and beta, i.e. y = -1*(x*A) + 1*y -- confirm
      // against the gemv() wrapper.
2126  gemv( y, x, A, ET(-1), ET(1) );
2127  }
2128  }
2130 #endif
2131  //**********************************************************************************************
2132 
2133  //**Subtraction assignment to sparse vectors****************************************************
2134  // No special implementation for the subtraction assignment to sparse vectors.
2135  //**********************************************************************************************
2136 
2137  //**Multiplication assignment to dense vectors**************************************************
      // Multiplication assignment of a transpose dense vector-dense matrix multiplication to a
      // dense vector. The expression is first evaluated serially into a temporary of type
      // ResultType, which is then passed on to multAssign() together with the target vector.
      //   lhs : target dense vector, rhs : multiplication expression.
      // NOTE(review): original lines 2153 and 2155-2157 of the body are not part of this listing.
2150  template< typename VT1 > // Type of the target dense vector
2151  friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2152  {
2154 
2158 
2159  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2160 
2161  const ResultType tmp( serial( rhs ) );
2162  multAssign( ~lhs, tmp );
2163  }
2165  //**********************************************************************************************
2166 
2167  //**Multiplication assignment to sparse vectors*************************************************
2168  // No special implementation for the multiplication assignment to sparse vectors.
2169  //**********************************************************************************************
2170 
2171  //**Division assignment to dense vectors********************************************************
      // Division assignment of a transpose dense vector-dense matrix multiplication to a dense
      // vector. The expression is first evaluated serially into a temporary of type ResultType,
      // which is then passed on to divAssign() together with the target vector.
      //   lhs : target dense vector, rhs : multiplication expression acting as divisor.
      // NOTE(review): original lines 2187 and 2189-2191 of the body are not part of this listing.
2184  template< typename VT1 > // Type of the target dense vector
2185  friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2186  {
2188 
2192 
2193  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2194 
2195  const ResultType tmp( serial( rhs ) );
2196  divAssign( ~lhs, tmp );
2197  }
2199  //**********************************************************************************************
2200 
2201  //**Division assignment to sparse vectors*******************************************************
2202  // No special implementation for the division assignment to sparse vectors.
2203  //**********************************************************************************************
2204 
2205  //**SMP assignment to dense vectors*************************************************************
      // SMP assignment of a transpose dense vector-dense matrix multiplication to a dense vector.
      // Only available when UseSMPAssign<VT1> enables it. Both operands are evaluated and the
      // product of the evaluated operands is forwarded to smpAssign().
      // NOTE(review): original line 2222, which presumably holds the function name and its
      // parameters (the body uses 'lhs' and 'rhs'), and line 2224 are not part of this listing --
      // confirm against the actual header.
2220  template< typename VT1 > // Type of the target dense vector
2221  friend inline EnableIf_< UseSMPAssign<VT1> >
2223  {
2225 
2226  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2227 
      // Without matrix rows the target is reset; without matrix columns there is nothing to do.
2228  if( rhs.mat_.rows() == 0UL ) {
2229  reset( ~lhs );
2230  return;
2231  }
2232  else if( rhs.mat_.columns() == 0UL ) {
2233  return;
2234  }
2235 
2236  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2237  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2238 
2239  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2240  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2241  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2242  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2243 
2244  smpAssign( ~lhs, x * A );
2245  }
2247  //**********************************************************************************************
2248 
2249  //**SMP assignment to sparse vectors************************************************************
      // SMP assignment of a transpose dense vector-dense matrix multiplication to a sparse vector.
      // Only available when UseSMPAssign<VT1> enables it. The expression is evaluated into a
      // temporary of type ResultType, which is then forwarded to smpAssign().
      // NOTE(review): original line 2266, which presumably holds the function name and its
      // parameters (the body uses 'lhs' and 'rhs'), as well as lines 2268 and 2270-2272 are not
      // part of this listing -- confirm against the actual header.
2264  template< typename VT1 > // Type of the target sparse vector
2265  friend inline EnableIf_< UseSMPAssign<VT1> >
2267  {
2269 
2273 
2274  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2275 
2276  const ResultType tmp( rhs );
2277  smpAssign( ~lhs, tmp );
2278  }
2280  //**********************************************************************************************
2281 
2282  //**SMP addition assignment to dense vectors****************************************************
      // SMP addition assignment of a transpose dense vector-dense matrix multiplication to a dense
      // vector. Only available when UseSMPAssign<VT1> enables it. Both operands are evaluated and
      // the product of the evaluated operands is forwarded to smpAddAssign().
      // NOTE(review): original line 2299, which presumably holds the function name and its
      // parameters (the body uses 'lhs' and 'rhs'), and line 2301 are not part of this listing --
      // confirm against the actual header.
2297  template< typename VT1 > // Type of the target dense vector
2298  friend inline EnableIf_< UseSMPAssign<VT1> >
2300  {
2302 
2303  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2304 
      // Nothing is added when the matrix operand has no rows or no columns.
2305  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2306  return;
2307  }
2308 
2309  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2310  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2311 
2312  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2313  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2314  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2315  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2316 
2317  smpAddAssign( ~lhs, x * A );
2318  }
2320  //**********************************************************************************************
2321 
2322  //**SMP addition assignment to sparse vectors***************************************************
2323  // No special implementation for the SMP addition assignment to sparse vectors.
2324  //**********************************************************************************************
2325 
2326  //**SMP subtraction assignment to dense vectors*************************************************
/*!\brief SMP subtraction assignment of the multiplication expression to a dense vector.
//
// Both operands of \a rhs are evaluated into local objects (\a x and \a A) and the product
// x * A is forwarded to smpSubAssign(). Nothing is done if the matrix operand has zero rows
// or zero columns. The overload is only enabled when UseSMPAssign<VT1> holds.
//
// NOTE(review): the declarator line (function name and parameter list, source line 2343) is
// missing from this listing; the parameter names \a lhs and \a rhs are taken from the body.
*/
2341  template< typename VT1 > // Type of the target dense vector
2342  friend inline EnableIf_< UseSMPAssign<VT1> >
2344  {
2346 
2347  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2348 
      // An empty matrix operand contributes nothing to the target
2349  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2350  return;
2351  }
2352 
2353  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2354  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2355 
2356  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2357  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2358  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2359  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2360 
2361  smpSubAssign( ~lhs, x * A );
2362  }
2364  //**********************************************************************************************
2365 
2366  //**SMP subtraction assignment to sparse vectors************************************************
2367  // No special implementation for the SMP subtraction assignment to sparse vectors.
2368  //**********************************************************************************************
2369 
2370  //**SMP multiplication assignment to dense vectors**********************************************
/*!\brief SMP multiplication assignment of the multiplication expression to a dense vector.
//
// The expression \a rhs is evaluated into a temporary of type \a ResultType, which is then
// passed to smpMultAssign() together with the target \a lhs. The overload is only enabled
// when UseSMPAssign<VT1> holds.
//
// NOTE(review): the declarator line (function name and parameter list, source line 2387) is
// missing from this listing; the parameter names \a lhs and \a rhs are taken from the body.
*/
2385  template< typename VT1 > // Type of the target dense vector
2386  friend inline EnableIf_< UseSMPAssign<VT1> >
2388  {
2390 
2394 
2395  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2396 
      // Materialize the expression first, then multiply the target by the temporary
2397  const ResultType tmp( rhs );
2398  smpMultAssign( ~lhs, tmp );
2399  }
2401  //**********************************************************************************************
2402 
2403  //**SMP multiplication assignment to sparse vectors*********************************************
2404  // No special implementation for the SMP multiplication assignment to sparse vectors.
2405  //**********************************************************************************************
2406 
2407  //**SMP division assignment to dense vectors****************************************************
/*!\brief SMP division assignment of the multiplication expression to a dense vector.
//
// The expression \a rhs is evaluated into a temporary of type \a ResultType, which is then
// passed to smpDivAssign() together with the target \a lhs. The overload is only enabled
// when UseSMPAssign<VT1> holds.
//
// NOTE(review): the declarator line (function name and parameter list, source line 2424) is
// missing from this listing; the parameter names \a lhs and \a rhs are taken from the body.
*/
2422  template< typename VT1 > // Type of the target dense vector
2423  friend inline EnableIf_< UseSMPAssign<VT1> >
2425  {
2427 
2431 
2432  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2433 
      // Materialize the expression first, then divide the target by the temporary
2434  const ResultType tmp( rhs );
2435  smpDivAssign( ~lhs, tmp );
2436  }
2438  //**********************************************************************************************
2439 
2440  //**SMP division assignment to sparse vectors***************************************************
2441  // No special implementation for the SMP division assignment to sparse vectors.
2442  //**********************************************************************************************
2443 
2444  //**Compile time checks*************************************************************************
2452  //**********************************************************************************************
2453 };
2454 //*************************************************************************************************
2455 
2456 
2457 
2458 
2459 //=================================================================================================
2460 //
2461 // DVECSCALARMULTEXPR SPECIALIZATION
2462 //
2463 //=================================================================================================
2464 
2465 //*************************************************************************************************
2473 template< typename VT // Type of the left-hand side dense vector
2474  , typename MT // Type of the right-hand side dense matrix
2475  , typename ST > // Type of the side scalar value
2476 class DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >
2477  : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true > >
2478  , private Computation
2479 {
2480  private:
2481  //**Type definitions****************************************************************************
2482  using VMM = TDVecDMatMultExpr<VT,MT>; //!< Type of the dense vector/dense matrix multiplication expression.
2483  using RES = ResultType_<VMM>; //!< Result type of the dense vector/dense matrix multiplication expression.
2484  using VRT = ResultType_<VT>; //!< Result type of the left-hand side dense vector expression.
2485  using MRT = ResultType_<MT>; //!< Result type of the right-hand side dense matrix expression.
2486  using VET = ElementType_<VRT>; //!< Element type of the left-hand side dense vector expression.
2487  using MET = ElementType_<MRT>; //!< Element type of the right-hand side dense matrix expression.
2488  using VCT = CompositeType_<VT>; //!< Composite type of the left-hand side dense vector expression.
2489  using MCT = CompositeType_<MT>; //!< Composite type of the right-hand side dense matrix expression.
2490  //**********************************************************************************************
2491 
2492  //**********************************************************************************************
//! Compile time switch for the left-hand side vector operand: \a true if \a VT is a
//! computation expression or requires an intermediate evaluation, \a false otherwise.
2494  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2495  //**********************************************************************************************
2496 
2497  //**********************************************************************************************
//! Compile time switch for the right-hand side matrix operand. The visible part of the
//! condition requires \a MT to be a computation expression whose element type \a MET equals
//! the vector element type \a VET.
//! NOTE(review): the remainder of this expression (source line 2500) is missing from the
//! listing; the full condition must be taken from the original header.
2499  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2501  //**********************************************************************************************
2502 
2503  //**********************************************************************************************
2505 
/*!\brief Helper structure selecting the SMP assignment overloads.
//
// The nested \a value is \a true if either operand needs an intermediate evaluation
// (\a evaluateVector or \a evaluateMatrix). The template parameter \a T1 does not take part
// in the computation of \a value.
*/
2508  template< typename T1 >
2509  struct UseSMPAssign {
2510  enum : bool { value = ( evaluateVector || evaluateMatrix ) };
2511  };
2512  //**********************************************************************************************
2513 
2514  //**********************************************************************************************
2516 
/*!\brief Helper structure deciding whether a BLAS kernel is used for the given types.
//
// The visible conditions require SIMD support in \a T1, \a T2 and \a T3 and identical element
// types for \a T1 and \a T3.
// NOTE(review): several lines of this definition (source lines 2520-2524, 2526-2529 and 2531,
// including the opening of the enum) are missing from the listing; the complete condition
// must be taken from the original header.
*/
2518  template< typename T1, typename T2, typename T3, typename T4 >
2519  struct UseBlasKernel {
2525  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2530  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2532  };
2533  //**********************************************************************************************
2534 
2535  //**********************************************************************************************
2537 
/*!\brief Helper structure deciding whether the vectorized default kernels are used.
//
// The visible conditions require \a useOptimizedKernels, SIMD support in \a T1, \a T2 and
// \a T3, and available SIMD addition and multiplication for the element types of \a T2 and
// \a T3.
// NOTE(review): source lines 2543 and 2545-2547 are missing from the listing, so the
// condition involving \a T4 is only partially visible; confirm against the original header.
*/
2540  template< typename T1, typename T2, typename T3, typename T4 >
2541  struct UseVectorizedDefaultKernel {
2542  enum : bool { value = useOptimizedKernels &&
2544  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2548  , T4 >::value &&
2549  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2550  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
2551  };
2552  //**********************************************************************************************
2553 
2554  public:
2555  //**Type definitions****************************************************************************
// NOTE(review): the aliases between source lines 2557 and 2561 (among them ElementType, which
// is used below) are missing from this listing.
2557  using ResultType = MultTrait_<RES,ST>; //!< Result type for expression template evaluations.
2561  using ReturnType = const ElementType; //!< Return type for expression template evaluations.
2562  using CompositeType = const ResultType; //!< Data type for composite expression templates.
2563 
      //! Type of the left-hand side operand: the wrapped vector/matrix multiplication expression.
2565  using LeftOperand = const TDVecDMatMultExpr<VT,MT>;
2566 
      //! Type of the right-hand side operand: the scalar.
2568  using RightOperand = ST;
2569 
2572 
2575  //**********************************************************************************************
2576 
2577  //**Compilation flags***************************************************************************
//! Compilation switch for SIMD evaluation. The visible part requires a non-diagonal matrix
//! type and SIMD support in both \a VT and \a MT.
//! NOTE(review): the rest of the simdEnabled expression (source lines 2581-2583) is missing
//! from the listing.
2579  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
2580  VT::simdEnabled && MT::simdEnabled &&
2584 
      //! Compilation switch for SMP assignment: \a true only if neither operand needs an
      //! intermediate evaluation and both operand types are themselves SMP-assignable.
2586  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2587  !evaluateMatrix && MT::smpAssignable };
2588  //**********************************************************************************************
2589 
2590  //**SIMD properties*****************************************************************************
//! Number of elements of type \a ElementType held by one SIMD vector, as reported by SIMDTrait.
2592  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
2593  //**********************************************************************************************
2594 
2595  //**Constructor*********************************************************************************
/*!\brief Constructor for the DVecScalarMultExpr specialization.
//
// \param vector The left-hand side dense vector/dense matrix multiplication expression.
// \param scalar The right-hand side scalar of the multiplication expression.
*/
2601  explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )
2602  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2603  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2604  {}
2605  //**********************************************************************************************
2606 
2607  //**Subscript operator**************************************************************************
/*!\brief Subscript operator for direct access to the vector elements.
//
// \param index Access index; must be smaller than the size of the wrapped expression (only
//        checked by an internal assertion).
// \return The element of the wrapped expression at \a index multiplied by the scalar.
*/
2613  inline ReturnType operator[]( size_t index ) const {
2614  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2615  return vector_[index] * scalar_;
2616  }
2617  //**********************************************************************************************
2618 
2619  //**At function*********************************************************************************
/*!\brief Checked access to the vector elements.
//
// \param index Access index.
// \return The result of the subscript operator for \a index.
//
// In contrast to the subscript operator the index is always checked: if it is not smaller
// than the size of the wrapped expression, BLAZE_THROW_OUT_OF_RANGE is invoked.
*/
2626  inline ReturnType at( size_t index ) const {
2627  if( index >= vector_.size() ) {
2628  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2629  }
2630  return (*this)[index];
2631  }
2632  //**********************************************************************************************
2633 
2634  //**Size function*******************************************************************************
/*!\brief Returns the current size of the vector.
//
// \return The size of the wrapped multiplication expression.
*/
2639  inline size_t size() const {
2640  return vector_.size();
2641  }
2642  //**********************************************************************************************
2643 
2644  //**Left operand access*************************************************************************
/*!\brief Returns the left-hand side operand of the scaling expression.
//
// \return The wrapped dense vector/dense matrix multiplication expression.
*/
2649  inline LeftOperand leftOperand() const {
2650  return vector_;
2651  }
2652  //**********************************************************************************************
2653 
2654  //**Right operand access************************************************************************
/*!\brief Returns the right-hand side operand of the scaling expression.
//
// \return The scalar factor.
*/
2659  inline RightOperand rightOperand() const {
2660  return scalar_;
2661  }
2662  //**********************************************************************************************
2663 
2664  //**********************************************************************************************
2670  template< typename T >
2671  inline bool canAlias( const T* alias ) const {
2672  return vector_.canAlias( alias );
2673  }
2674  //**********************************************************************************************
2675 
2676  //**********************************************************************************************
/*!\brief Returns whether the expression is aliased with the given address \a alias.
//
// \param alias The alias to be checked.
// \return The result of the same query on the wrapped multiplication expression.
*/
2682  template< typename T >
2683  inline bool isAliased( const T* alias ) const {
2684  return vector_.isAliased( alias );
2685  }
2686  //**********************************************************************************************
2687 
2688  //**********************************************************************************************
/*!\brief Returns whether the operands of the expression are properly aligned in memory.
//
// \return The result of the same query on the wrapped multiplication expression.
*/
2693  inline bool isAligned() const {
2694  return vector_.isAligned();
2695  }
2696  //**********************************************************************************************
2697 
2698  //**********************************************************************************************
/*!\brief Returns whether the expression can be used in SMP assignments.
//
// \return \a true if the expression qualifies for an SMP assignment, \a false otherwise.
//
// The visible part of the condition accepts the expression if its size exceeds
// SMP_TDVECDMATMULT_THRESHOLD and, in addition, BLAS mode is off, or the matrix operand is a
// computation that is not evaluated, or the matrix has fewer than TDVECDMATMULT_THRESHOLD
// elements.
// NOTE(review): two further alternatives of the disjunction (source lines 2706-2707) are
// missing from the listing.
*/
2703  inline bool canSMPAssign() const noexcept {
2704  RightOperand_<VMM> A( vector_.rightOperand() );
2705  return ( !BLAZE_BLAS_MODE ||
2708  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2709  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2710  ( size() > SMP_TDVECDMATMULT_THRESHOLD );
2711  }
2712  //**********************************************************************************************
2713 
2714  private:
2715  //**Member variables****************************************************************************
2716  LeftOperand vector_; //!< Left-hand side operand: the dense vector/dense matrix multiplication expression.
2717  RightOperand scalar_; //!< Right-hand side operand: the scalar factor.
2718  //**********************************************************************************************
2719 
2720  //**Assignment to dense vectors*****************************************************************
/*!\brief Assignment of the scaled vector/matrix multiplication to a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// The vector and matrix operands of the wrapped multiplication are evaluated serially into
// local objects and handed, together with the scalar, to selectAssignKernel(). If the matrix
// has no rows the target is reset; if it has no columns nothing is done.
*/
2732  template< typename VT1 > // Type of the target dense vector
2733  friend inline void assign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
2734  {
2736 
2737  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2738 
2739  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
2740  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
2741 
      // Zero rows: every element of the product is an empty sum, so the target is reset
2742  if( right.rows() == 0UL ) {
2743  reset( ~lhs );
2744  return;
2745  }
      // Zero columns: the target has no elements to assign (see the size assertion above)
2746  else if( right.columns() == 0UL ) {
2747  return;
2748  }
2749 
2750  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
2751  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
2752 
2753  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
2754  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
2755  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
2756  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2757 
2758  DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
2759  }
2760  //**********************************************************************************************
2761 
2762  //**Assignment to dense vectors (kernel selection)**********************************************
/*!\brief Selection of the kernel for an assignment of a scaled vector/matrix multiplication.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
//
// The small-matrix kernel is chosen for diagonal matrices, for matrix operands that are
// computations but are not evaluated, and for matrices with fewer than
// TDVECDMATMULT_THRESHOLD elements; all other cases go to the BLAS kernel selection.
*/
2773  template< typename VT1 // Type of the left-hand side target vector
2774  , typename VT2 // Type of the left-hand side vector operand
2775  , typename MT1 // Type of the right-hand side matrix operand
2776  , typename ST2 > // Type of the scalar value
2777  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2778  {
2779  if( ( IsDiagonal<MT1>::value ) ||
2780  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2781  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2782  selectSmallAssignKernel( y, x, A, scalar );
2783  else
2784  selectBlasAssignKernel( y, x, A, scalar );
2785  }
2786  //**********************************************************************************************
2787 
2788  //**Default assignment to dense vectors*********************************************************
/*!\brief Default (scalar, non-SIMD) assignment kernel for y = ( x * A ) * scalar.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
//
// The kernel walks the matrix row by row, accumulating x[i] * A(i,j) into y[j], and restricts
// the column range according to the IsLower/IsUpper traits of \a MT1. In the diagonal branch
// the scalar is applied immediately; otherwise the accumulated elements are scaled in a final
// pass.
// NOTE(review): three condition lines (source lines 2811, 2824 and 2854) are missing from
// this listing, so the exact guards of the two reset() calls and of the diagonal branch must
// be confirmed against the original header.
*/
2802  template< typename VT1 // Type of the left-hand side target vector
2803  , typename VT2 // Type of the left-hand side vector operand
2804  , typename MT1 // Type of the right-hand side matrix operand
2805  , typename ST2 > // Type of the scalar value
2806  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2807  {
2808  const size_t M( A.rows() );
2809  const size_t N( A.columns() );
2810 
      // NOTE(review): the guard of this block (source line 2811) is not visible here
2812  reset( y[0] );
2813  }
2814 
      // For non-lower matrices the first matrix row initializes the target elements (unscaled)
2815  if( !IsLower<MT1>::value )
2816  {
2817  for( size_t j=( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ); j<N; ++j ) {
2818  y[j] = x[0UL] * A(0UL,j);
2819  }
2820  }
2821 
      // Remaining rows; row 0 is only visited here for lower, non-strictly-lower matrices
2822  for( size_t i=( IsLower<MT1>::value && !IsStrictlyLower<MT1>::value ? 0UL : 1UL ); i<M; ++i )
2823  {
      // NOTE(review): the condition selecting this branch (source line 2824) is not visible here
2825  {
2826  y[i] = x[i] * A(i,i) * scalar;
2827  }
2828  else
2829  {
      // Column range of row i that is touched, depending on the triangular structure
2830  const size_t jbegin( ( IsUpper<MT1>::value )
2831  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
2832  :( 0UL ) );
2833  const size_t jend( ( IsLower<MT1>::value )
2834  ?( IsStrictlyLower<MT1>::value ? i-1UL : i )
2835  :( N ) );
2836  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2837 
      // jpos is the end of the part of [jbegin,jend) that is processed two columns at a time
2838  const size_t jnum( jend - jbegin );
2839  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2840 
2841  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2842  y[j ] += x[i] * A(i,j );
2843  y[j+1UL] += x[i] * A(i,j+1UL);
2844  }
2845  if( jpos < jend ) {
2846  y[jpos] += x[i] * A(i,jpos);
2847  }
      // For lower matrices element jend receives its first contribution from this row,
      // hence a plain assignment instead of an accumulation
2848  if( IsLower<MT1>::value ) {
2849  y[jend] = x[i] * A(i,jend);
2850  }
2851  }
2852  }
2853 
      // NOTE(review): the guard of this block (source line 2854) is not visible here
2855  reset( y[N-1UL] );
2856  }
2857 
      // Final scaling pass; the diagonal branch above has already applied the scalar
2858  if( !IsDiagonal<MT1>::value )
2859  {
2860  const size_t iend( IsStrictlyLower<MT1>::value ? N-1UL : N );
2861  for( size_t j=( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ); j<iend; ++j ) {
2862  y[j] *= scalar;
2863  }
2864  }
2865  }
2866  //**********************************************************************************************
2867 
2868  //**Default assignment to dense vectors (small matrices)****************************************
/*!\brief Small-matrix assignment kernel that forwards to the default kernel.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type line (source line 2886) is missing from this listing;
// presumably it holds the SFINAE condition that separates this overload from the SIMD
// overload of the same name -- confirm against the original header.
*/
2882  template< typename VT1 // Type of the left-hand side target vector
2883  , typename VT2 // Type of the left-hand side vector operand
2884  , typename MT1 // Type of the right-hand side matrix operand
2885  , typename ST2 > // Type of the scalar value
2887  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2888  {
2889  selectDefaultAssignKernel( y, x, A, scalar );
2890  }
2891  //**********************************************************************************************
2892 
2893  //**Vectorized default assignment to dense vectors (small matrices)*****************************
/*!\brief SIMD assignment kernel for small matrices: y = ( x * A ) * scalar.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
//
// The columns are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors. For each group the
// products x[i] * A(i,j..) are accumulated over the relevant rows in SIMD registers, multiplied
// by the broadcast scalar and stored to \a y. Columns that do not fill a SIMD vector are
// handled by a scalar loop when the operands are not padded.
// NOTE(review): the return-type line (source line 2911) is missing from this listing.
// NOTE(review): the accumulators are declared without initializer and then updated with +=,
// so the code relies on SIMDType's default construction yielding zero -- confirm.
*/
2907  template< typename VT1 // Type of the left-hand side target vector
2908  , typename VT2 // Type of the left-hand side vector operand
2909  , typename MT1 // Type of the right-hand side matrix operand
2910  , typename ST2 > // Type of the scalar value
2912  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2913  {
      // A scalar remainder loop is only needed if the target or the matrix is not padded
2914  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
2915 
2916  const size_t M( A.rows() );
2917  const size_t N( A.columns() );
2918 
      // End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE
      // when a remainder loop exists (checked by the assertion below), N otherwise
2919  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2920  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2921 
      // Scalar broadcast to all SIMD lanes
2922  const SIMDType factor( set( scalar ) );
2923 
2924  size_t j( 0UL );
2925 
      // Eight SIMD vectors of columns per iteration
2926  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
2927  {
      // Row range contributing to these columns, narrowed for lower/upper matrices
2928  const size_t ibegin( ( IsLower<MT1>::value )
2929  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2930  :( 0UL ) );
2931  const size_t iend( ( IsUpper<MT1>::value )
2932  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2933  :( M ) );
2934  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2935 
2936  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2937 
2938  for( size_t i=ibegin; i<iend; ++i ) {
2939  const SIMDType x1( set( x[i] ) );
2940  xmm1 += x1 * A.load(i,j );
2941  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2942  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2943  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2944  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
2945  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
2946  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
2947  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
2948  }
2949 
2950  y.store( j , xmm1*factor );
2951  y.store( j+SIMDSIZE , xmm2*factor );
2952  y.store( j+SIMDSIZE*2UL, xmm3*factor );
2953  y.store( j+SIMDSIZE*3UL, xmm4*factor );
2954  y.store( j+SIMDSIZE*4UL, xmm5*factor );
2955  y.store( j+SIMDSIZE*5UL, xmm6*factor );
2956  y.store( j+SIMDSIZE*6UL, xmm7*factor );
2957  y.store( j+SIMDSIZE*7UL, xmm8*factor );
2958  }
2959 
      // Four SIMD vectors of columns per iteration
2960  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2961  {
2962  const size_t ibegin( ( IsLower<MT1>::value )
2963  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2964  :( 0UL ) );
2965  const size_t iend( ( IsUpper<MT1>::value )
2966  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2967  :( M ) );
2968  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2969 
2970  SIMDType xmm1, xmm2, xmm3, xmm4;
2971 
2972  for( size_t i=ibegin; i<iend; ++i ) {
2973  const SIMDType x1( set( x[i] ) );
2974  xmm1 += x1 * A.load(i,j );
2975  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2976  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2977  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2978  }
2979 
2980  y.store( j , xmm1*factor );
2981  y.store( j+SIMDSIZE , xmm2*factor );
2982  y.store( j+SIMDSIZE*2UL, xmm3*factor );
2983  y.store( j+SIMDSIZE*3UL, xmm4*factor );
2984  }
2985 
      // Three SIMD vectors of columns per iteration
2986  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2987  {
2988  const size_t ibegin( ( IsLower<MT1>::value )
2989  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2990  :( 0UL ) );
2991  const size_t iend( ( IsUpper<MT1>::value )
2992  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2993  :( M ) );
2994  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2995 
2996  SIMDType xmm1, xmm2, xmm3;
2997 
2998  for( size_t i=ibegin; i<iend; ++i ) {
2999  const SIMDType x1( set( x[i] ) );
3000  xmm1 += x1 * A.load(i,j );
3001  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3002  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3003  }
3004 
3005  y.store( j , xmm1*factor );
3006  y.store( j+SIMDSIZE , xmm2*factor );
3007  y.store( j+SIMDSIZE*2UL, xmm3*factor );
3008  }
3009 
      // Two SIMD vectors of columns per iteration
3010  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3011  {
3012  const size_t ibegin( ( IsLower<MT1>::value )
3013  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3014  :( 0UL ) );
3015  const size_t iend( ( IsUpper<MT1>::value )
3016  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3017  :( M ) );
3018  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3019 
3020  SIMDType xmm1, xmm2;
3021 
3022  for( size_t i=ibegin; i<iend; ++i ) {
3023  const SIMDType x1( set( x[i] ) );
3024  xmm1 += x1 * A.load(i,j );
3025  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3026  }
3027 
3028  y.store( j , xmm1*factor );
3029  y.store( j+SIMDSIZE, xmm2*factor );
3030  }
3031 
      // One SIMD vector of columns per iteration
3032  for( ; j<jpos; j+=SIMDSIZE )
3033  {
3034  const size_t ibegin( ( IsLower<MT1>::value )
3035  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3036  :( 0UL ) );
3037  const size_t iend( ( IsUpper<MT1>::value )
3038  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3039  :( M ) );
3040  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3041 
3042  SIMDType xmm1;
3043 
3044  for( size_t i=ibegin; i<iend; ++i ) {
3045  xmm1 += set( x[i] ) * A.load(i,j);
3046  }
3047 
3048  y.store( j, xmm1*factor );
3049  }
3050 
      // Scalar loop for the columns behind the last full SIMD vector
3051  for( ; remainder && j<N; ++j )
3052  {
3053  const size_t ibegin( ( IsLower<MT1>::value )
3054  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3055  :( 0UL ) );
3056  const size_t iend( ( IsUpper<MT1>::value )
3057  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3058  :( M ) );
3059  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3060 
3061  ElementType value{};
3062 
3063  for( size_t i=ibegin; i<iend; ++i ) {
3064  value += x[i] * A(i,j);
3065  }
3066 
3067  y[j] = value * scalar;
3068  }
3069  }
3070  //**********************************************************************************************
3071 
3072  //**Default assignment to dense vectors (large matrices)****************************************
/*!\brief Large-matrix assignment kernel that forwards to the default kernel.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type line (source line 3090) is missing from this listing;
// presumably it holds the SFINAE condition that separates this overload from the SIMD
// overload of the same name -- confirm against the original header.
*/
3086  template< typename VT1 // Type of the left-hand side target vector
3087  , typename VT2 // Type of the left-hand side vector operand
3088  , typename MT1 // Type of the right-hand side matrix operand
3089  , typename ST2 > // Type of the scalar value
3091  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3092  {
3093  selectDefaultAssignKernel( y, x, A, scalar );
3094  }
3095  //**********************************************************************************************
3096 
3097  //**Vectorized default assignment to dense vectors (large matrices)*****************************
/*!\brief SIMD assignment kernel for large matrices: y = ( x * A ) * scalar.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
//
// The target is reset and the matrix is then traversed in tiles of \a iblock rows by
// \a jblock columns. Within a tile the columns are processed in groups of 8, 4, 3, 2 and 1
// SIMD vectors; each partial sum is multiplied by the broadcast scalar and added to the
// current content of \a y. Columns that do not fill a SIMD vector are handled by a scalar
// loop when the operands are not padded.
// NOTE(review): the return-type line (source line 3115) is missing from this listing.
// NOTE(review): the accumulators are declared without initializer and then updated with +=,
// so the code relies on SIMDType's default construction yielding zero -- confirm.
*/
3111  template< typename VT1 // Type of the left-hand side target vector
3112  , typename VT2 // Type of the left-hand side vector operand
3113  , typename MT1 // Type of the right-hand side matrix operand
3114  , typename ST2 > // Type of the scalar value
3116  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3117  {
      // A scalar remainder loop is only needed if the target or the matrix is not padded
3118  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
3119 
3120  const size_t M( A.rows() );
3121  const size_t N( A.columns() );
3122 
      // Tile sizes: jblock columns (32768 bytes worth of elements) by 8 or 4 rows
3123  const size_t jblock( 32768UL / sizeof( ElementType ) );
3124  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3125 
      // Scalar broadcast to all SIMD lanes
3126  const SIMDType factor( set( scalar ) );
3127 
3128  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3129 
      // All tiles accumulate into y, therefore it has to start from zero
3130  reset( y );
3131 
      // NOTE(review): the initializer below uses 0U while the rest of the file uses 0UL;
      // the value is the same, the suffix is merely inconsistent
3132  for( size_t jj=0U; jj<N; jj+=jblock ) {
3133  for( size_t ii=0UL; ii<M; ii+=iblock )
3134  {
3135  const size_t iend( min( ii+iblock, M ) );
3136  const size_t jtmp( min( jj+jblock, N ) );
      // For lower matrices the columns beyond the last row of the tile are skipped
3137  const size_t jend( ( IsLower<MT1>::value )
3138  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
3139  :( jtmp ) );
3140 
      // End of the SIMD-processed column range of this tile
3141  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3142  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3143 
      // For upper matrices the start column is derived from the first row of the tile,
      // masked down to a multiple of SIMDSIZE, but never placed before the tile start jj
3144  size_t j( ( IsUpper<MT1>::value )
3145  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
3146  :( jj ) );
3147 
      // Eight SIMD vectors of columns per iteration
3148  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3149  {
3150  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3151 
3152  for( size_t i=ii; i<iend; ++i ) {
3153  const SIMDType x1( set( x[i] ) );
3154  xmm1 += x1 * A.load(i,j );
3155  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3156  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3157  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3158  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3159  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3160  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3161  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3162  }
3163 
3164  y.store( j , y.load(j ) + xmm1*factor );
3165  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3166  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3167  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3168  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3169  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3170  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3171  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3172  }
3173 
      // Four SIMD vectors of columns per iteration
3174  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3175  {
3176  SIMDType xmm1, xmm2, xmm3, xmm4;
3177 
3178  for( size_t i=ii; i<iend; ++i ) {
3179  const SIMDType x1( set( x[i] ) );
3180  xmm1 += x1 * A.load(i,j );
3181  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3182  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3183  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3184  }
3185 
3186  y.store( j , y.load(j ) + xmm1*factor );
3187  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3188  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3189  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3190  }
3191 
      // Three SIMD vectors of columns per iteration
3192  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3193  {
3194  SIMDType xmm1, xmm2, xmm3;
3195 
3196  for( size_t i=ii; i<iend; ++i ) {
3197  const SIMDType x1( set( x[i] ) );
3198  xmm1 += x1 * A.load(i,j );
3199  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3200  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3201  }
3202 
3203  y.store( j , y.load(j ) + xmm1*factor );
3204  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3205  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3206  }
3207 
      // Two SIMD vectors of columns per iteration
3208  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3209  {
3210  SIMDType xmm1, xmm2;
3211 
3212  for( size_t i=ii; i<iend; ++i ) {
3213  const SIMDType x1( set( x[i] ) );
3214  xmm1 += x1 * A.load(i,j );
3215  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3216  }
3217 
3218  y.store( j , y.load(j ) + xmm1*factor );
3219  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3220  }
3221 
      // One SIMD vector of columns per iteration
3222  for( ; j<jpos; j+=SIMDSIZE )
3223  {
3224  SIMDType xmm1;
3225 
3226  for( size_t i=ii; i<iend; ++i ) {
3227  xmm1 += set( x[i] ) * A.load(i,j);
3228  }
3229 
3230  y.store( j, y.load(j) + xmm1*factor );
3231  }
3232 
      // Scalar loop for the columns behind the last full SIMD vector of this tile
3233  for( ; remainder && j<jend; ++j )
3234  {
3235  ElementType value{};
3236 
3237  for( size_t i=ii; i<iend; ++i ) {
3238  value += x[i] * A(i,j);
3239  }
3240 
3241  y[j] += value * scalar;
3242  }
3243  }
3244  }
3245  }
3246  //**********************************************************************************************
3247 
3248  //**BLAS-based assignment to dense vectors (default)********************************************
/*!\brief Fallback of the BLAS kernel selection: forwards to the large-matrix kernel.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
//
// NOTE(review): the return-type line (source line 3265) is missing from this listing;
// presumably it holds the SFINAE condition that separates this overload from the BLAS-based
// overload of the same name -- confirm against the original header.
*/
3261  template< typename VT1 // Type of the left-hand side target vector
3262  , typename VT2 // Type of the left-hand side vector operand
3263  , typename MT1 // Type of the right-hand side matrix operand
3264  , typename ST2 > // Type of the scalar value
3266  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3267  {
3268  selectLargeAssignKernel( y, x, A, scalar );
3269  }
3270  //**********************************************************************************************
3271 
3272  //**BLAS-based assignment to dense vectors******************************************************
3273 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3274 
/*!\brief BLAS-based assignment kernel for y = ( x * A ) * scalar.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
//
// Triangular matrices are handled by assigning scalar * x to \a y and then calling trmv() on
// \a y; all other matrices are handled by a single gemv() call. This overload is only
// compiled when BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set.
// NOTE(review): the return-type line (source line 3291) is missing from this listing.
*/
3287  template< typename VT1 // Type of the left-hand side target vector
3288  , typename VT2 // Type of the left-hand side vector operand
3289  , typename MT1 // Type of the right-hand side matrix operand
3290  , typename ST2 > // Type of the scalar value
3292  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3293  {
3294  using ET = ElementType_<VT1>;
3295 
3296  if( IsTriangular<MT1>::value ) {
      // Pre-scale the vector, then multiply it in place with the triangular matrix
3297  assign( y, scalar * x );
3298  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3299  }
3300  else {
      // Presumably the last two arguments are the alpha and beta factors of the wrapper
      // (alpha = scalar, beta = 0) -- confirm against the gemv() wrapper's documentation
3301  gemv( y, x, A, ET(scalar), ET(0) );
3302  }
3303  }
3304 #endif
3305  //**********************************************************************************************
3306 
3307  //**Assignment to sparse vectors****************************************************************
/*!\brief Assignment of the scaled vector/matrix multiplication to a sparse vector.
//
// \param lhs The target left-hand side sparse vector.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// The expression is serially evaluated into a temporary of type \a ResultType, which is then
// assigned to the target.
*/
3319  template< typename VT1 > // Type of the target sparse vector
3320  friend inline void assign( SparseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3321  {
3323 
3327 
3328  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3329 
3330  const ResultType tmp( serial( rhs ) );
3331  assign( ~lhs, tmp );
3332  }
3333  //**********************************************************************************************
3334 
3335  //**Addition assignment to dense vectors********************************************************
/*!\brief Addition assignment of the scaled vector/matrix multiplication to a dense vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// The vector and matrix operands of the wrapped multiplication are evaluated serially into
// local objects and handed, together with the scalar, to selectAddAssignKernel(). Nothing is
// done if the matrix operand has zero rows or zero columns.
*/
3347  template< typename VT1 > // Type of the target dense vector
3348  friend inline void addAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3349  {
3351 
3352  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3353 
3354  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3355  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3356 
      // An empty matrix operand contributes nothing to the target
3357  if( right.rows() == 0UL || right.columns() == 0UL ) {
3358  return;
3359  }
3360 
3361  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3362  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3363 
3364  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3365  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3366  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3367  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3368 
3369  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3370  }
3371  //**********************************************************************************************
3372 
3373  //**Addition assignment to dense vectors (kernel selection)*************************************
// Kernel selection for the addition assignment y += scalar * ( x * A ).
//
// \param y      Target vector.
// \param x      Evaluated vector operand.
// \param A      Evaluated matrix operand.
// \param scalar Scaling factor.
//
// The "small" kernel is used when A is diagonal, when the class-level matrix type MT is a
// computation and `evaluateMatrix` is false, or when A.rows() * A.columns() is below
// TDVECDMATMULT_THRESHOLD. Otherwise the BLAS kernel selection is used.
// NOTE(review): the second test inspects the class template parameter MT, not MT1;
// `evaluateMatrix` is defined outside this excerpt.
3384  template< typename VT1 // Type of the left-hand side target vector
3385  , typename VT2 // Type of the left-hand side vector operand
3386  , typename MT1 // Type of the right-hand side matrix operand
3387  , typename ST2 > // Type of the scalar value
3388  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3389  {
3390  if( ( IsDiagonal<MT1>::value ) ||
3391  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3392  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3393  selectSmallAddAssignKernel( y, x, A, scalar );
3394  else
3395  selectBlasAddAssignKernel( y, x, A, scalar );
3396  }
3397  //**********************************************************************************************
3398 
3399  //**Default addition assignment to dense vectors************************************************
/*!\brief Default addition assignment kernel: adds \a x * \a A * \a scalar to \a y.
//
// \param y The target left-hand side vector; must provide an addAssign() member.
// \param x The left-hand side vector operand.
// \param A The right-hand side matrix operand.
// \param scalar The scaling factor.
// \return void
//
// The kernel builds the expression <tt>x * A * scalar</tt> and passes it to
// <tt>y.addAssign()</tt>; all actual work is done by that member function. The listing
// line numbers that were fused into this definition have been removed so that it is
// well-formed C++ again.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.addAssign( x * A * scalar );
}
3421  //**********************************************************************************************
3422 
3423  //**Default addition assignment to dense vectors (small matrices)*******************************
// Non-vectorized small-matrix kernel for y += scalar * ( x * A ): forwards all arguments
// to selectDefaultAddAssignKernel().
//
// NOTE(review): the declarator below starts without a return type because listing line 3441
// is missing from this excerpt. A second overload with the same parameter list follows, so
// the missing line presumably carries the return type together with a compile-time selection
// condition - confirm against the original header.
3437  template< typename VT1 // Type of the left-hand side target vector
3438  , typename VT2 // Type of the left-hand side vector operand
3439  , typename MT1 // Type of the right-hand side matrix operand
3440  , typename ST2 > // Type of the scalar value
3442  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3443  {
3444  selectDefaultAddAssignKernel( y, x, A, scalar );
3445  }
3446  //**********************************************************************************************
3447 
3448  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized small-matrix kernel for y += scalar * ( x * A ).
//
// \param y      Target vector (one entry per column of A).
// \param x      Vector operand (one entry per row of A).
// \param A      Matrix operand, accessed row-wise via load(i,j).
// \param scalar Scaling factor.
//
// The columns are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors, followed by an
// element-wise loop for the columns that do not fill a SIMD vector. For every group the
// products x[i] * A(i,j..) are accumulated over the relevant rows, multiplied by the scalar
// and added to the matching part of y.
// NOTE(review): the declarator starts without a return type because listing line 3467 is
// missing from this excerpt - confirm against the original header.
// NOTE(review): the xmm accumulators are only ever updated with +=; this relies on the
// default constructor of SIMDType producing zero - confirm.
3463  template< typename VT1 // Type of the left-hand side target vector
3464  , typename VT2 // Type of the left-hand side vector operand
3465  , typename MT1 // Type of the right-hand side matrix operand
3466  , typename ST2 > // Type of the scalar value
3468  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3469  {
      // An element-wise tail loop is needed unless both the target and the matrix are padded.
3470  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
3471 
3472  const size_t M( A.rows() );
3473  const size_t N( A.columns() );
3474 
      // End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE
      // when a tail loop is required (verified by the assertion), N itself otherwise.
3475  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3476  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3477 
      // SIMD representation of the scaling factor.
3478  const SIMDType factor( set( scalar ) );
3479 
3480  size_t j( 0UL );
3481 
      // Eight SIMD vectors of columns per iteration.
3482  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3483  {
      // Row range: lower matrices start at row j (j+1 if strictly lower); upper matrices
      // stop at the last column of this group (one row earlier if strictly upper).
3484  const size_t ibegin( ( IsLower<MT1>::value )
3485  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3486  :( 0UL ) );
3487  const size_t iend( ( IsUpper<MT1>::value )
3488  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3489  :( M ) );
3490  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3491 
3492  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3493 
3494  for( size_t i=ibegin; i<iend; ++i ) {
3495  const SIMDType x1( set( x[i] ) );
3496  xmm1 += x1 * A.load(i,j );
3497  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3498  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3499  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3500  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3501  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3502  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3503  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3504  }
3505 
      // Scale the accumulated sums and add them to the target.
3506  y.store( j , y.load(j ) + xmm1*factor );
3507  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3508  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3509  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3510  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3511  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3512  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3513  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3514  }
3515 
      // Four SIMD vectors of columns per iteration.
3516  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3517  {
3518  const size_t ibegin( ( IsLower<MT1>::value )
3519  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3520  :( 0UL ) );
3521  const size_t iend( ( IsUpper<MT1>::value )
3522  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3523  :( M ) );
3524  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3525 
3526  SIMDType xmm1, xmm2, xmm3, xmm4;
3527 
3528  for( size_t i=ibegin; i<iend; ++i ) {
3529  const SIMDType x1( set( x[i] ) );
3530  xmm1 += x1 * A.load(i,j );
3531  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3532  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3533  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3534  }
3535 
3536  y.store( j , y.load(j ) + xmm1*factor );
3537  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3538  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3539  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3540  }
3541 
      // Three SIMD vectors of columns per iteration.
3542  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3543  {
3544  const size_t ibegin( ( IsLower<MT1>::value )
3545  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3546  :( 0UL ) );
3547  const size_t iend( ( IsUpper<MT1>::value )
3548  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3549  :( M ) );
3550  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3551 
3552  SIMDType xmm1, xmm2, xmm3;
3553 
3554  for( size_t i=ibegin; i<iend; ++i ) {
3555  const SIMDType x1( set( x[i] ) );
3556  xmm1 += x1 * A.load(i,j );
3557  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3558  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3559  }
3560 
3561  y.store( j , y.load(j ) + xmm1*factor );
3562  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3563  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3564  }
3565 
      // Two SIMD vectors of columns per iteration.
3566  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3567  {
3568  const size_t ibegin( ( IsLower<MT1>::value )
3569  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3570  :( 0UL ) );
3571  const size_t iend( ( IsUpper<MT1>::value )
3572  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3573  :( M ) );
3574  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3575 
3576  SIMDType xmm1, xmm2;
3577 
3578  for( size_t i=ibegin; i<iend; ++i ) {
3579  const SIMDType x1( set( x[i] ) );
3580  xmm1 += x1 * A.load(i,j );
3581  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3582  }
3583 
3584  y.store( j , y.load(j ) + xmm1*factor );
3585  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3586  }
3587 
      // One SIMD vector of columns per iteration.
3588  for( ; j<jpos; j+=SIMDSIZE )
3589  {
3590  const size_t ibegin( ( IsLower<MT1>::value )
3591  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3592  :( 0UL ) );
3593  const size_t iend( ( IsUpper<MT1>::value )
3594  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3595  :( M ) );
3596  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3597 
3598  SIMDType xmm1;
3599 
3600  for( size_t i=ibegin; i<iend; ++i ) {
3601  xmm1 += set( x[i] ) * A.load(i,j);
3602  }
3603 
3604  y.store( j, y.load(j) + xmm1*factor );
3605  }
3606 
      // Element-wise handling of the remaining columns (only if a tail is possible).
3607  for( ; remainder && j<N; ++j )
3608  {
3609  const size_t ibegin( ( IsLower<MT1>::value )
3610  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3611  :( 0UL ) );
3612  const size_t iend( ( IsUpper<MT1>::value )
3613  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3614  :( M ) );
3615  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3616 
3617  ElementType value{};
3618 
3619  for( size_t i=ibegin; i<iend; ++i ) {
3620  value += x[i] * A(i,j);
3621  }
3622 
3623  y[j] += value * scalar;
3624  }
3625  }
3626  //**********************************************************************************************
3627 
3628  //**Default addition assignment to dense vectors (large matrices)*******************************
// Non-vectorized large-matrix kernel for y += scalar * ( x * A ): forwards all arguments
// to selectDefaultAddAssignKernel().
//
// NOTE(review): the declarator below starts without a return type because listing line 3646
// is missing from this excerpt. A second overload with the same parameter list follows, so
// the missing line presumably carries the return type together with a compile-time selection
// condition - confirm against the original header.
3642  template< typename VT1 // Type of the left-hand side target vector
3643  , typename VT2 // Type of the left-hand side vector operand
3644  , typename MT1 // Type of the right-hand side matrix operand
3645  , typename ST2 > // Type of the scalar value
3647  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3648  {
3649  selectDefaultAddAssignKernel( y, x, A, scalar );
3650  }
3651  //**********************************************************************************************
3652 
3653  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized, blocked large-matrix kernel for y += scalar * ( x * A ).
//
// \param y      Target vector (one entry per column of A).
// \param x      Vector operand (one entry per row of A).
// \param A      Matrix operand, accessed row-wise via load(i,j).
// \param scalar Scaling factor.
//
// The matrix is traversed in tiles of jblock columns by iblock rows. For every tile the
// partial sums of x[i] * A(i,j) over the rows of the tile are scaled and added to y, using
// groups of 8, 4, 3, 2 and 1 SIMD vectors plus an element-wise tail loop.
// NOTE(review): the declarator starts without a return type because listing line 3672 is
// missing from this excerpt - confirm against the original header.
// NOTE(review): the xmm accumulators are only ever updated with +=; this relies on the
// default constructor of SIMDType producing zero - confirm.
3668  template< typename VT1 // Type of the left-hand side target vector
3669  , typename VT2 // Type of the left-hand side vector operand
3670  , typename MT1 // Type of the right-hand side matrix operand
3671  , typename ST2 > // Type of the scalar value
3673  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3674  {
      // An element-wise tail loop is needed unless both the target and the matrix are padded.
3675  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
3676 
3677  const size_t M( A.rows() );
3678  const size_t N( A.columns() );
3679 
      // Column block: the number of elements that occupy 32768 bytes (presumably chosen to
      // match a cache size - confirm). Row block: 8 rows if all columns fit into a single
      // column block, 4 rows otherwise.
3680  const size_t jblock( 32768UL / sizeof( ElementType ) );
3681  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3682 
      // SIMD representation of the scaling factor.
3683  const SIMDType factor( set( scalar ) );
3684 
3685  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3686 
3687  for( size_t jj=0U; jj<N; jj+=jblock ) {
3688  for( size_t ii=0UL; ii<M; ii+=iblock )
3689  {
3690  const size_t iend( min( ii+iblock, M ) );
3691  const size_t jtmp( min( jj+jblock, N ) );
      // For lower matrices the column range is cut off at the last row of the tile
      // (one column earlier if strictly lower).
3692  const size_t jend( ( IsLower<MT1>::value )
3693  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
3694  :( jtmp ) );
3695 
      // End of the SIMD-processed columns of this tile (rounded down if a tail is possible).
3696  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3697  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3698 
      // For upper matrices the columns start at the first row of the tile (ii, or ii+1 if
      // strictly upper), rounded down to a multiple of SIMDSIZE, but not before jj.
3699  size_t j( ( IsUpper<MT1>::value )
3700  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
3701  :( jj ) );
3702 
      // Eight SIMD vectors of columns per iteration.
3703  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3704  {
3705  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3706 
3707  for( size_t i=ii; i<iend; ++i ) {
3708  const SIMDType x1( set( x[i] ) );
3709  xmm1 += x1 * A.load(i,j );
3710  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3711  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3712  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3713  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3714  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3715  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3716  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3717  }
3718 
3719  y.store( j , y.load(j ) + xmm1*factor );
3720  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3721  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3722  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3723  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3724  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3725  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3726  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3727  }
3728 
      // Four SIMD vectors of columns per iteration.
3729  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3730  {
3731  SIMDType xmm1, xmm2, xmm3, xmm4;
3732 
3733  for( size_t i=ii; i<iend; ++i ) {
3734  const SIMDType x1( set( x[i] ) );
3735  xmm1 += x1 * A.load(i,j );
3736  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3737  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3738  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3739  }
3740 
3741  y.store( j , y.load(j ) + xmm1*factor );
3742  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3743  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3744  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3745  }
3746 
      // Three SIMD vectors of columns per iteration.
3747  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3748  {
3749  SIMDType xmm1, xmm2, xmm3;
3750 
3751  for( size_t i=ii; i<iend; ++i ) {
3752  const SIMDType x1( set( x[i] ) );
3753  xmm1 += x1 * A.load(i,j );
3754  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3755  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3756  }
3757 
3758  y.store( j , y.load(j ) + xmm1*factor );
3759  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3760  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3761  }
3762 
      // Two SIMD vectors of columns per iteration.
3763  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3764  {
3765  SIMDType xmm1, xmm2;
3766 
3767  for( size_t i=ii; i<iend; ++i ) {
3768  const SIMDType x1( set( x[i] ) );
3769  xmm1 += x1 * A.load(i,j );
3770  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3771  }
3772 
3773  y.store( j , y.load(j ) + xmm1*factor );
3774  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3775  }
3776 
      // One SIMD vector of columns per iteration.
3777  for( ; j<jpos; j+=SIMDSIZE )
3778  {
3779  SIMDType xmm1;
3780 
3781  for( size_t i=ii; i<iend; ++i ) {
3782  xmm1 += set( x[i] ) * A.load(i,j);
3783  }
3784 
3785  y.store( j, y.load(j) + xmm1*factor );
3786  }
3787 
      // Element-wise handling of the remaining columns of this tile.
3788  for( ; remainder && j<jend; ++j )
3789  {
3790  ElementType value{};
3791 
3792  for( size_t i=ii; i<iend; ++i ) {
3793  value += x[i] * A(i,j);
3794  }
3795 
3796  y[j] += value * scalar;
3797  }
3798  }
3799  }
3800  }
3801  //**********************************************************************************************
3802 
3803  //**BLAS-based addition assignment to dense vectors (default)***********************************
// Fallback of the BLAS kernel selection for y += scalar * ( x * A ): forwards all arguments
// to selectLargeAddAssignKernel().
//
// NOTE(review): the declarator below starts without a return type because listing line 3821
// is missing from this excerpt. A BLAS-based overload with the same parameter list follows
// inside a preprocessor conditional, so the missing line presumably carries the return type
// together with a compile-time selection condition - confirm against the original header.
3817  template< typename VT1 // Type of the left-hand side target vector
3818  , typename VT2 // Type of the left-hand side vector operand
3819  , typename MT1 // Type of the right-hand side matrix operand
3820  , typename ST2 > // Type of the scalar value
3822  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3823  {
3824  selectLargeAddAssignKernel( y, x, A, scalar );
3825  }
3826  //**********************************************************************************************
3827 
3828  //**BLAS-based addition assignment to dense vectors*********************************************
3829 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3830 
// BLAS-based kernel for y += scalar * ( x * A ); only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are both enabled.
//
// \param y      Target vector.
// \param x      Vector operand.
// \param A      Matrix operand.
// \param scalar Scaling factor.
//
// Triangular matrices: scalar * x is evaluated into a temporary, trmv() is applied to it with
// the lower/upper flag matching A (presumably overwriting the temporary with its product
// with A - confirm against the trmv wrapper), and the temporary is added to y.
// All other matrices: a single gemv() call with the factors ET(scalar) and ET(1)
// (presumably the multipliers of the product and of the existing y - confirm).
// NOTE(review): the declarator starts without a return type because listing line 3847 is
// missing from this excerpt - confirm against the original header.
3843  template< typename VT1 // Type of the left-hand side target vector
3844  , typename VT2 // Type of the left-hand side vector operand
3845  , typename MT1 // Type of the right-hand side matrix operand
3846  , typename ST2 > // Type of the scalar value
3848  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3849  {
3850  using ET = ElementType_<VT1>;
3851 
3852  if( IsTriangular<MT1>::value ) {
3853  ResultType_<VT1> tmp( serial( scalar * x ) );
3854  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3855  addAssign( y, tmp );
3856  }
3857  else {
3858  gemv( y, x, A, ET(scalar), ET(1) );
3859  }
3860  }
3861 #endif
3862  //**********************************************************************************************
3863 
3864  //**Addition assignment to sparse vectors*******************************************************
3865  // No special implementation for the addition assignment to sparse vectors.
3866  //**********************************************************************************************
3867 
3868  //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of the scaled vector-matrix multiplication `rhs` to a dense vector.
//
// \param lhs Target dense vector; its size must equal rhs.size() (internal assertion).
// \param rhs Scaled multiplication expression to be subtracted.
//
// The vector and matrix operands are taken from rhs.vector_, evaluated into LT / RT objects
// and handed, together with rhs.scalar_, to selectSubAssignKernel().
// NOTE(review): listing line 3883 is absent from this excerpt.
3880  template< typename VT1 > // Type of the target dense vector
3881  friend inline void subAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3882  {
3884 
3885  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3886 
3887  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3888  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3889 
      // A matrix without rows or columns contributes nothing to the target.
3890  if( right.rows() == 0UL || right.columns() == 0UL ) {
3891  return;
3892  }
3893 
3894  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3895  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3896 
3897  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3898  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3899  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3900  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3901 
3902  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
3903  }
3904  //**********************************************************************************************
3905 
3906  //**Subtraction assignment to dense vectors (kernel selection)**********************************
// Kernel selection for the subtraction assignment y -= scalar * ( x * A ).
//
// \param y      Target vector.
// \param x      Evaluated vector operand.
// \param A      Evaluated matrix operand.
// \param scalar Scaling factor.
//
// The "small" kernel is used when A is diagonal, when the class-level matrix type MT is a
// computation and `evaluateMatrix` is false, or when A.rows() * A.columns() is below
// TDVECDMATMULT_THRESHOLD. Otherwise the BLAS kernel selection is used.
// NOTE(review): the second test inspects the class template parameter MT, not MT1;
// `evaluateMatrix` is defined outside this excerpt.
3917  template< typename VT1 // Type of the left-hand side target vector
3918  , typename VT2 // Type of the left-hand side vector operand
3919  , typename MT1 // Type of the right-hand side matrix operand
3920  , typename ST2 > // Type of the scalar value
3921  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3922  {
3923  if( ( IsDiagonal<MT1>::value ) ||
3924  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3925  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3926  selectSmallSubAssignKernel( y, x, A, scalar );
3927  else
3928  selectBlasSubAssignKernel( y, x, A, scalar );
3929  }
3930  //**********************************************************************************************
3931 
3932  //**Default subtraction assignment to dense vectors*********************************************
/*!\brief Default subtraction assignment kernel: subtracts \a x * \a A * \a scalar from \a y.
//
// \param y The target left-hand side vector; must provide a subAssign() member.
// \param x The left-hand side vector operand.
// \param A The right-hand side matrix operand.
// \param scalar The scaling factor.
// \return void
//
// The kernel builds the expression <tt>x * A * scalar</tt> and passes it to
// <tt>y.subAssign()</tt>; all actual work is done by that member function. The listing
// line numbers that were fused into this definition have been removed so that it is
// well-formed C++ again.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.subAssign( x * A * scalar );
}
3954  //**********************************************************************************************
3955 
3956  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Non-vectorized small-matrix kernel for y -= scalar * ( x * A ): forwards all arguments
// to selectDefaultSubAssignKernel().
//
// NOTE(review): the declarator below starts without a return type because listing line 3974
// is missing from this excerpt. A second overload with the same parameter list follows, so
// the missing line presumably carries the return type together with a compile-time selection
// condition - confirm against the original header.
3970  template< typename VT1 // Type of the left-hand side target vector
3971  , typename VT2 // Type of the left-hand side vector operand
3972  , typename MT1 // Type of the right-hand side matrix operand
3973  , typename ST2 > // Type of the scalar value
3975  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3976  {
3977  selectDefaultSubAssignKernel( y, x, A, scalar );
3978  }
3979  //**********************************************************************************************
3980 
3981  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized small-matrix kernel for y -= scalar * ( x * A ).
//
// \param y      Target vector (one entry per column of A).
// \param x      Vector operand (one entry per row of A).
// \param A      Matrix operand, accessed row-wise via load(i,j).
// \param scalar Scaling factor.
//
// The columns are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors, followed by an
// element-wise loop for the columns that do not fill a SIMD vector. For every group the
// products x[i] * A(i,j..) are accumulated over the relevant rows, multiplied by the scalar
// and subtracted from the matching part of y.
// NOTE(review): the declarator starts without a return type because listing line 4000 is
// missing from this excerpt - confirm against the original header.
// NOTE(review): the xmm accumulators are only ever updated with +=; this relies on the
// default constructor of SIMDType producing zero - confirm.
3996  template< typename VT1 // Type of the left-hand side target vector
3997  , typename VT2 // Type of the left-hand side vector operand
3998  , typename MT1 // Type of the right-hand side matrix operand
3999  , typename ST2 > // Type of the scalar value
4001  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4002  {
      // An element-wise tail loop is needed unless both the target and the matrix are padded.
4003  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
4004 
4005  const size_t M( A.rows() );
4006  const size_t N( A.columns() );
4007 
      // End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE
      // when a tail loop is required (verified by the assertion), N itself otherwise.
4008  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
4009  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4010 
      // SIMD representation of the scaling factor.
4011  const SIMDType factor( set( scalar ) );
4012 
4013  size_t j( 0UL );
4014 
      // Eight SIMD vectors of columns per iteration.
4015  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4016  {
      // Row range: lower matrices start at row j (j+1 if strictly lower); upper matrices
      // stop at the last column of this group (one row earlier if strictly upper).
4017  const size_t ibegin( ( IsLower<MT1>::value )
4018  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4019  :( 0UL ) );
4020  const size_t iend( ( IsUpper<MT1>::value )
4021  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4022  :( M ) );
4023  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4024 
4025  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4026 
4027  for( size_t i=ibegin; i<iend; ++i ) {
4028  const SIMDType x1( set( x[i] ) );
4029  xmm1 += x1 * A.load(i,j );
4030  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4031  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4032  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4033  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4034  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4035  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4036  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4037  }
4038 
      // Scale the accumulated sums and subtract them from the target.
4039  y.store( j , y.load(j ) - xmm1*factor );
4040  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4041  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4042  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4043  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4044  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4045  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4046  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4047  }
4048 
      // Four SIMD vectors of columns per iteration.
4049  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4050  {
4051  const size_t ibegin( ( IsLower<MT1>::value )
4052  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4053  :( 0UL ) );
4054  const size_t iend( ( IsUpper<MT1>::value )
4055  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4056  :( M ) );
4057  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4058 
4059  SIMDType xmm1, xmm2, xmm3, xmm4;
4060 
4061  for( size_t i=ibegin; i<iend; ++i ) {
4062  const SIMDType x1( set( x[i] ) );
4063  xmm1 += x1 * A.load(i,j );
4064  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4065  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4066  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4067  }
4068 
4069  y.store( j , y.load(j ) - xmm1*factor );
4070  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4071  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4072  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4073  }
4074 
      // Three SIMD vectors of columns per iteration.
4075  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4076  {
4077  const size_t ibegin( ( IsLower<MT1>::value )
4078  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4079  :( 0UL ) );
4080  const size_t iend( ( IsUpper<MT1>::value )
4081  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4082  :( M ) );
4083  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4084 
4085  SIMDType xmm1, xmm2, xmm3;
4086 
4087  for( size_t i=ibegin; i<iend; ++i ) {
4088  const SIMDType x1( set( x[i] ) );
4089  xmm1 += x1 * A.load(i,j );
4090  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4091  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4092  }
4093 
4094  y.store( j , y.load(j ) - xmm1*factor );
4095  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4096  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4097  }
4098 
      // Two SIMD vectors of columns per iteration.
4099  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4100  {
4101  const size_t ibegin( ( IsLower<MT1>::value )
4102  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4103  :( 0UL ) );
4104  const size_t iend( ( IsUpper<MT1>::value )
4105  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4106  :( M ) );
4107  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4108 
4109  SIMDType xmm1, xmm2;
4110 
4111  for( size_t i=ibegin; i<iend; ++i ) {
4112  const SIMDType x1( set( x[i] ) );
4113  xmm1 += x1 * A.load(i,j );
4114  xmm2 += x1 * A.load(i,j+SIMDSIZE);
4115  }
4116 
4117  y.store( j , y.load(j ) - xmm1*factor );
4118  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4119  }
4120 
      // One SIMD vector of columns per iteration.
4121  for( ; j<jpos; j+=SIMDSIZE )
4122  {
4123  const size_t ibegin( ( IsLower<MT1>::value )
4124  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4125  :( 0UL ) );
4126  const size_t iend( ( IsUpper<MT1>::value )
4127  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4128  :( M ) );
4129  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4130 
4131  SIMDType xmm1;
4132 
4133  for( size_t i=ibegin; i<iend; ++i ) {
4134  xmm1 += set( x[i] ) * A.load(i,j);
4135  }
4136 
4137  y.store( j, y.load(j) - xmm1*factor );
4138  }
4139 
      // Element-wise handling of the remaining columns (only if a tail is possible).
4140  for( ; remainder && j<N; ++j )
4141  {
4142  const size_t ibegin( ( IsLower<MT1>::value )
4143  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4144  :( 0UL ) );
4145  const size_t iend( ( IsUpper<MT1>::value )
4146  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4147  :( M ) );
4148  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4149 
4150  ElementType value{};
4151 
4152  for( size_t i=ibegin; i<iend; ++i ) {
4153  value += x[i] * A(i,j);
4154  }
4155 
4156  y[j] -= value * scalar;
4157  }
4158  }
4159  //**********************************************************************************************
4160 
4161  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Non-vectorized large-matrix kernel for y -= scalar * ( x * A ): forwards all arguments
// to selectDefaultSubAssignKernel().
//
// NOTE(review): the declarator below starts without a return type because listing line 4179
// is missing from this excerpt. A second overload with the same parameter list follows, so
// the missing line presumably carries the return type together with a compile-time selection
// condition - confirm against the original header.
4175  template< typename VT1 // Type of the left-hand side target vector
4176  , typename VT2 // Type of the left-hand side vector operand
4177  , typename MT1 // Type of the right-hand side matrix operand
4178  , typename ST2 > // Type of the scalar value
4180  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4181  {
4182  selectDefaultSubAssignKernel( y, x, A, scalar );
4183  }
4184  //**********************************************************************************************
4185 
4186  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized, blocked large-matrix kernel for y -= scalar * ( x * A ).
//
// \param y      Target vector (one entry per column of A).
// \param x      Vector operand (one entry per row of A).
// \param A      Matrix operand, accessed row-wise via load(i,j).
// \param scalar Scaling factor.
//
// The matrix is traversed in tiles of jblock columns by iblock rows. For every tile the
// partial sums of x[i] * A(i,j) over the rows of the tile are scaled and subtracted from y,
// using groups of 8, 4, 3, 2 and 1 SIMD vectors plus an element-wise tail loop.
// NOTE(review): the declarator starts without a return type because listing line 4205 is
// missing from this excerpt - confirm against the original header.
// NOTE(review): the xmm accumulators are only ever updated with +=; this relies on the
// default constructor of SIMDType producing zero - confirm.
4201  template< typename VT1 // Type of the left-hand side target vector
4202  , typename VT2 // Type of the left-hand side vector operand
4203  , typename MT1 // Type of the right-hand side matrix operand
4204  , typename ST2 > // Type of the scalar value
4206  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4207  {
      // An element-wise tail loop is needed unless both the target and the matrix are padded.
4208  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
4209 
4210  const size_t M( A.rows() );
4211  const size_t N( A.columns() );
4212 
      // Column block: the number of elements that occupy 32768 bytes (presumably chosen to
      // match a cache size - confirm). Row block: 8 rows if all columns fit into a single
      // column block, 4 rows otherwise.
4213  const size_t jblock( 32768UL / sizeof( ElementType ) );
4214  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4215 
      // SIMD representation of the scaling factor.
4216  const SIMDType factor( set( scalar ) );
4217 
4218  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
4219 
4220  for( size_t jj=0U; jj<N; jj+=jblock ) {
4221  for( size_t ii=0UL; ii<M; ii+=iblock )
4222  {
4223  const size_t iend( min( ii+iblock, M ) );
4224  const size_t jtmp( min( jj+jblock, N ) );
      // For lower matrices the column range is cut off at the last row of the tile
      // (one column earlier if strictly lower).
4225  const size_t jend( ( IsLower<MT1>::value )
4226  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
4227  :( jtmp ) );
4228 
      // End of the SIMD-processed columns of this tile (rounded down if a tail is possible).
4229  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4230  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4231 
      // For upper matrices the columns start at the first row of the tile (ii, or ii+1 if
      // strictly upper), rounded down to a multiple of SIMDSIZE, but not before jj.
4232  size_t j( ( IsUpper<MT1>::value )
4233  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
4234  :( jj ) );
4235 
      // Eight SIMD vectors of columns per iteration.
4236  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4237  {
4238  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4239 
4240  for( size_t i=ii; i<iend; ++i ) {
4241  const SIMDType x1( set( x[i] ) );
4242  xmm1 += x1 * A.load(i,j );
4243  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4244  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4245  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4246  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4247  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4248  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4249  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4250  }
4251 
4252  y.store( j , y.load(j ) - xmm1*factor );
4253  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4254  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4255  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4256  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4257  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4258  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4259  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4260  }
4261 
      // Four SIMD vectors of columns per iteration.
4262  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4263  {
4264  SIMDType xmm1, xmm2, xmm3, xmm4;
4265 
4266  for( size_t i=ii; i<iend; ++i ) {
4267  const SIMDType x1( set( x[i] ) );
4268  xmm1 += x1 * A.load(i,j );
4269  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4270  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4271  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4272  }
4273 
4274  y.store( j , y.load(j ) - xmm1*factor );
4275  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4276  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4277  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4278  }
4279 
      // Three SIMD vectors of columns per iteration.
4280  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4281  {
4282  SIMDType xmm1, xmm2, xmm3;
4283 
4284  for( size_t i=ii; i<iend; ++i ) {
4285  const SIMDType x1( set( x[i] ) );
4286  xmm1 += x1 * A.load(i,j );
4287  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4288  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4289  }
4290 
4291  y.store( j , y.load(j ) - xmm1*factor );
4292  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4293  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4294  }
4295 
      // Two SIMD vectors of columns per iteration.
4296  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4297  {
4298  SIMDType xmm1, xmm2;
4299 
4300  for( size_t i=ii; i<iend; ++i ) {
4301  const SIMDType x1( set( x[i] ) );
4302  xmm1 += x1 * A.load(i,j );
4303  xmm2 += x1 * A.load(i,j+SIMDSIZE);
4304  }
4305 
4306  y.store( j , y.load(j ) - xmm1*factor );
4307  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4308  }
4309 
      // One SIMD vector of columns per iteration.
4310  for( ; j<jpos; j+=SIMDSIZE )
4311  {
4312  SIMDType xmm1;
4313 
4314  for( size_t i=ii; i<iend; ++i ) {
4315  xmm1 += set( x[i] ) * A.load(i,j);
4316  }
4317 
4318  y.store( j, y.load(j) - xmm1*factor );
4319  }
4320 
      // Element-wise handling of the remaining columns of this tile.
4321  for( ; remainder && j<jend; ++j )
4322  {
4323  ElementType value{};
4324 
4325  for( size_t i=ii; i<iend; ++i ) {
4326  value += x[i] * A(i,j);
4327  }
4328 
4329  y[j] -= value * scalar;
4330  }
4331  }
4332  }
4333  }
4334  //**********************************************************************************************
4335 
4336  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Fallback of the BLAS kernel selection for y -= scalar * ( x * A ): forwards all arguments
// to selectLargeSubAssignKernel().
//
// NOTE(review): the declarator below starts without a return type because listing line 4354
// is missing from this excerpt. A BLAS-based overload with the same parameter list follows
// inside a preprocessor conditional, so the missing line presumably carries the return type
// together with a compile-time selection condition - confirm against the original header.
4350  template< typename VT1 // Type of the left-hand side target vector
4351  , typename VT2 // Type of the left-hand side vector operand
4352  , typename MT1 // Type of the right-hand side matrix operand
4353  , typename ST2 > // Type of the scalar value
4355  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4356  {
4357  selectLargeSubAssignKernel( y, x, A, scalar );
4358  }
4359  //**********************************************************************************************
4360 
4361  //**BLAS-based subtraction assignment to dense vectors******************************************
4362 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4363 
// BLAS-based kernel for y -= scalar * ( x * A ); only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are both enabled.
//
// \param y      Target vector.
// \param x      Vector operand.
// \param A      Matrix operand.
// \param scalar Scaling factor.
//
// Triangular matrices: scalar * x is evaluated into a temporary, trmv() is applied to it with
// the lower/upper flag matching A (presumably overwriting the temporary with its product
// with A - confirm against the trmv wrapper), and the temporary is subtracted from y.
// All other matrices: a single gemv() call with the factors ET(-scalar) and ET(1)
// (presumably the multipliers of the product and of the existing y - confirm); the
// subtraction is expressed through the negated scalar.
// NOTE(review): the declarator starts without a return type because listing line 4380 is
// missing from this excerpt - confirm against the original header.
4376  template< typename VT1 // Type of the left-hand side target vector
4377  , typename VT2 // Type of the left-hand side vector operand
4378  , typename MT1 // Type of the right-hand side matrix operand
4379  , typename ST2 > // Type of the scalar value
4381  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4382  {
4383  using ET = ElementType_<VT1>;
4384 
4385  if( IsTriangular<MT1>::value ) {
4386  ResultType_<VT1> tmp( serial( scalar * x ) );
4387  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4388  subAssign( y, tmp );
4389  }
4390  else {
4391  gemv( y, x, A, ET(-scalar), ET(1) );
4392  }
4393  }
4394 #endif
4395  //**********************************************************************************************
4396 
4397  //**Subtraction assignment to sparse vectors****************************************************
4398  // No special implementation for the subtraction assignment to sparse vectors.
4399  //**********************************************************************************************
4400 
4401  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of the scaled vector/matrix product expression to a dense
// vector (DenseVector with transpose flag 'true').
// The right-hand side expression is first evaluated serially into a temporary of type
// ResultType; that temporary is then handed to multAssign() together with the target.
//   lhs - target dense vector
//   rhs - right-hand side DVecScalarMultExpr
// NOTE(review): listing lines 4416 and 4418-4420 are absent from this listing (presumably
// a function trace and compile-time constraints) -- confirm against the original header.
4413  template< typename VT1 > // Type of the target dense vector
4414  friend inline void multAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4415  {
4417 
4421 
      // Internal precondition: target and expression must have the same size.
4422  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4423 
4424  const ResultType tmp( serial( rhs ) );
4425  multAssign( ~lhs, tmp );
4426  }
4427  //**********************************************************************************************
4428 
4429  //**Multiplication assignment to sparse vectors*************************************************
4430  // No special implementation for the multiplication assignment to sparse vectors.
4431  //**********************************************************************************************
4432 
4433  //**Division assignment to dense vectors********************************************************
// Division assignment of the scaled vector/matrix product expression to a dense
// vector (DenseVector with transpose flag 'true').
// The right-hand side expression is first evaluated serially into a temporary of type
// ResultType; that temporary is then handed to divAssign() together with the target.
//   lhs - target dense vector
//   rhs - right-hand side DVecScalarMultExpr
// NOTE(review): listing lines 4448 and 4450-4452 are absent from this listing (presumably
// a function trace and compile-time constraints) -- confirm against the original header.
4445  template< typename VT1 > // Type of the target dense vector
4446  friend inline void divAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4447  {
4449 
4453 
      // Internal precondition: target and expression must have the same size.
4454  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4455 
4456  const ResultType tmp( serial( rhs ) );
4457  divAssign( ~lhs, tmp );
4458  }
4459  //**********************************************************************************************
4460 
4461  //**Division assignment to sparse vectors*******************************************************
4462  // No special implementation for the division assignment to sparse vectors.
4463  //**********************************************************************************************
4464 
4465  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of the scaled vector/matrix product expression to a dense vector.
// The vector and matrix operands are taken from rhs.vector_, evaluated into x and A,
// and the expression x * A * rhs.scalar_ is then passed to smpAssign().
// Participation of this overload is controlled by UseSMPAssign<VT1>.
// NOTE(review): listing line 4481 (function name and parameter list) and line 4483 are
// absent from this listing; the body refers to the parameters as 'lhs' (target) and
// 'rhs' (expression) -- confirm the exact signature against the original header.
4479  template< typename VT1 > // Type of the target dense vector
4480  friend inline EnableIf_< UseSMPAssign<VT1> >
4482  {
4484 
4485  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4486 
4487  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4488  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4489 
      // A matrix without rows contributes no terms to the product, so the target is only reset.
4490  if( right.rows() == 0UL ) {
4491  reset( ~lhs );
4492  return;
4493  }
      // A matrix without columns: nothing is written to the target.
4494  else if( right.columns() == 0UL ) {
4495  return;
4496  }
4497 
4498  LT x( left ); // Evaluation of the left-hand side dense vector operand
4499  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4500 
      // Internal checks: evaluation must not change the operand dimensions, and the
      // number of matrix columns must equal the size of the target.
4501  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4502  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4503  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4504  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4505 
4506  smpAssign( ~lhs, x * A * rhs.scalar_ );
4507  }
4508  //**********************************************************************************************
4509 
4510  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of the scaled vector/matrix product expression to a sparse vector.
// The expression is first evaluated into a temporary of type ResultType, which is then
// passed to smpAssign() together with the target.
// Participation of this overload is controlled by UseSMPAssign<VT1>.
// NOTE(review): listing line 4526 (function name and parameter list) and lines 4528,
// 4530-4532 are absent from this listing; the body refers to the parameters as 'lhs'
// (target) and 'rhs' (expression) -- confirm against the original header.
4524  template< typename VT1 > // Type of the target sparse vector
4525  friend inline EnableIf_< UseSMPAssign<VT1> >
4527  {
4529 
4533 
      // Internal precondition: target and expression must have the same size.
4534  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4535 
4536  const ResultType tmp( rhs );
4537  smpAssign( ~lhs, tmp );
4538  }
4539  //**********************************************************************************************
4540 
4541  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of the scaled vector/matrix product expression to a dense vector.
// The vector and matrix operands are taken from rhs.vector_, evaluated into x and A,
// and the expression x * A * rhs.scalar_ is then passed to smpAddAssign().
// Participation of this overload is controlled by UseSMPAssign<VT1>.
// NOTE(review): listing line 4557 (function name and parameter list) and line 4559 are
// absent from this listing; the body refers to the parameters as 'lhs' (target) and
// 'rhs' (expression) -- confirm the exact signature against the original header.
4555  template< typename VT1 > // Type of the target dense vector
4556  friend inline EnableIf_< UseSMPAssign<VT1> >
4558  {
4560 
4561  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4562 
4563  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4564  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4565 
      // With an empty matrix dimension the target is left untouched.
4566  if( right.rows() == 0UL || right.columns() == 0UL ) {
4567  return;
4568  }
4569 
4570  LT x( left ); // Evaluation of the left-hand side dense vector operand
4571  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4572 
      // Internal checks: evaluation must not change the operand dimensions, and the
      // number of matrix columns must equal the size of the target.
4573  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4574  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4575  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4576  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4577 
4578  smpAddAssign( ~lhs, x * A * rhs.scalar_ );
4579  }
4580  //**********************************************************************************************
4581 
4582  //**SMP addition assignment to sparse vectors***************************************************
4583  // No special implementation for the SMP addition assignment to sparse vectors.
4584  //**********************************************************************************************
4585 
4586  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of the scaled vector/matrix product expression to a dense vector.
// The vector and matrix operands are taken from rhs.vector_, evaluated into x and A,
// and the expression x * A * rhs.scalar_ is then passed to smpSubAssign().
// Participation of this overload is controlled by UseSMPAssign<VT1>.
// NOTE(review): listing line 4602 (function name and parameter list) and line 4604 are
// absent from this listing; the body refers to the parameters as 'lhs' (target) and
// 'rhs' (expression) -- confirm the exact signature against the original header.
4600  template< typename VT1 > // Type of the target dense vector
4601  friend inline EnableIf_< UseSMPAssign<VT1> >
4603  {
4605 
4606  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4607 
4608  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4609  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4610 
      // With an empty matrix dimension the target is left untouched.
4611  if( right.rows() == 0UL || right.columns() == 0UL ) {
4612  return;
4613  }
4614 
4615  LT x( left ); // Evaluation of the left-hand side dense vector operand
4616  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4617 
      // Internal checks: evaluation must not change the operand dimensions, and the
      // number of matrix columns must equal the size of the target.
4618  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4619  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4620  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4621  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4622 
4623  smpSubAssign( ~lhs, x * A * rhs.scalar_ );
4624  }
4625  //**********************************************************************************************
4626 
4627  //**SMP subtraction assignment to sparse vectors************************************************
4628  // No special implementation for the SMP subtraction assignment to sparse vectors.
4629  //**********************************************************************************************
4630 
4631  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of the scaled vector/matrix product expression to a
// dense vector. The expression is first evaluated into a temporary of type ResultType,
// which is then passed to smpMultAssign() together with the target.
// Participation of this overload is controlled by UseSMPAssign<VT1>.
// NOTE(review): listing line 4648 (function name and parameter list) and lines 4650,
// 4652-4654 are absent from this listing; the body refers to the parameters as 'lhs'
// (target) and 'rhs' (expression) -- confirm against the original header.
4646  template< typename VT1 > // Type of the target dense vector
4647  friend inline EnableIf_< UseSMPAssign<VT1> >
4649  {
4651 
4655 
      // Internal precondition: target and expression must have the same size.
4656  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4657 
4658  const ResultType tmp( rhs );
4659  smpMultAssign( ~lhs, tmp );
4660  }
4661  //**********************************************************************************************
4662 
4663  //**SMP multiplication assignment to sparse vectors*********************************************
4664  // No special implementation for the SMP multiplication assignment to sparse vectors.
4665  //**********************************************************************************************
4666 
4667  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of the scaled vector/matrix product expression to a dense
// vector. The expression is first evaluated into a temporary of type ResultType, which
// is then passed to smpDivAssign() together with the target.
// Participation of this overload is controlled by UseSMPAssign<VT1>.
// NOTE(review): listing line 4683 (function name and parameter list) and lines 4685,
// 4687-4689 are absent from this listing; the body refers to the parameters as 'lhs'
// (target) and 'rhs' (expression) -- confirm against the original header.
4681  template< typename VT1 > // Type of the target dense vector
4682  friend inline EnableIf_< UseSMPAssign<VT1> >
4684  {
4686 
4690 
      // Internal precondition: target and expression must have the same size.
4691  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4692 
4693  const ResultType tmp( rhs );
4694  smpDivAssign( ~lhs, tmp );
4695  }
4696  //**********************************************************************************************
4697 
4698  //**SMP division assignment to sparse vectors***************************************************
4699  // No special implementation for the SMP division assignment to sparse vectors.
4700  //**********************************************************************************************
4701 
4702  //**Compile time checks*************************************************************************
4711  //**********************************************************************************************
4712 };
4714 //*************************************************************************************************
4715 
4716 
4717 
4718 
4719 //=================================================================================================
4720 //
4721 // GLOBAL BINARY ARITHMETIC OPERATORS
4722 //
4723 //=================================================================================================
4724 
4725 //*************************************************************************************************
// Multiplication operator for a dense vector with transpose flag 'true' and a dense
// matrix with storage-order flag 'false' (vec * mat).
// The function only validates the operand sizes and constructs the expression object;
// it returns a const TDVecDMatMultExpr<VT,MT> built from the two operands.
//   vec - left-hand side dense vector
//   mat - right-hand side dense matrix
// Emits an exception through BLAZE_THROW_INVALID_ARGUMENT if the vector size differs
// from the number of matrix rows.
// NOTE(review): listing lines 4761 and 4763 are absent from this listing (presumably a
// function trace and a compile-time constraint) -- confirm against the original header.
4756 template< typename VT // Type of the left-hand side dense vector
4757  , typename MT > // Type of the right-hand side dense matrix
4758 inline decltype(auto)
4759  operator*( const DenseVector<VT,true>& vec, const DenseMatrix<MT,false>& mat )
4760 {
4762 
4764 
4765  if( (~vec).size() != (~mat).rows() ) {
4766  BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
4767  }
4768 
4769  using ReturnType = const TDVecDMatMultExpr<VT,MT>;
4770  return ReturnType( ~vec, ~mat );
4771 }
4772 //*************************************************************************************************
4773 
4774 
4775 
4776 
4777 //=================================================================================================
4778 //
4779 // GLOBAL RESTRUCTURING BINARY ARITHMETIC OPERATORS
4780 //
4781 //=================================================================================================
4782 
4783 //*************************************************************************************************
// Restructuring multiplication operator for a dense vector (transpose flag 'true') and
// a matrix/matrix multiplication expression.
// The product vec * ( A * B ) is re-associated as ( vec * A ) * B, where A and B are the
// left and right operands of the given matrix/matrix multiplication expression.
//   vec - left-hand side dense vector
//   mat - right-hand side matrix/matrix multiplication expression
// NOTE(review): listing line 4802 is absent from this listing (presumably a function
// trace) -- confirm against the original header.
4797 template< typename VT // Type of the left-hand side dense vector
4798  , typename MT > // Matrix base type of the right-hand side expression
4799 inline decltype(auto)
4800  operator*( const DenseVector<VT,true>& vec, const MatMatMultExpr<MT>& mat )
4801 {
4803 
4804  return ( vec * (~mat).leftOperand() ) * (~mat).rightOperand();
4805 }
4807 //*************************************************************************************************
4808 
4809 
4810 
4811 
4812 //=================================================================================================
4813 //
4814 // SIZE SPECIALIZATIONS
4815 //
4816 //=================================================================================================
4817 
4818 //*************************************************************************************************
// Size specialization for TDVecDMatMultExpr: the compile-time size of the expression in
// dimension 0 is inherited from Size<MT,1UL>, i.e. from the matrix operand's size in
// dimension 1 (presumably its number of columns -- TODO confirm the dimension convention).
4820 template< typename VT, typename MT >
4821 struct Size< TDVecDMatMultExpr<VT,MT>, 0UL >
4822  : public Size<MT,1UL>
4823 {};
4825 //*************************************************************************************************
4826 
4827 
4828 
4829 
4830 //=================================================================================================
4831 //
4832 // ISALIGNED SPECIALIZATIONS
4833 //
4834 //=================================================================================================
4835 
4836 //*************************************************************************************************
// IsAligned specialization for TDVecDMatMultExpr: the trait is the compile-time logical
// 'and' of IsAligned<VT> and IsAligned<MT>, i.e. it is derived from both operand types.
4838 template< typename VT, typename MT >
4839 struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4840  : public And< IsAligned<VT>, IsAligned<MT> >
4841 {};
4843 //*************************************************************************************************
4844 
4845 } // namespace blaze
4846 
4847 #endif
If_< IsExpression< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:215
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:131
Data type constraint.
Headerfile for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:71
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:208
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:131
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:298
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:130
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:519
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1903
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:87
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:365
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:387
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
Constraints on the storage order of matrix types.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:331
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1950
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:355
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for the memory layout of data types.This type trait tests whether the given data t...
Definition: IsContiguous.h:86
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:133
Headerfile for the generic max algorithm.
Header file for the DisableIf class template.
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:134
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:210
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:264
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:209
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsTriangular type trait.
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:388
decltype(auto) operator*(const DenseMatrix< MT1, false > &lhs, const DenseMatrix< MT2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:8893
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:590
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:135
Header file for the IsPadded type trait.
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:250
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:224
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
If_< IsExpression< MT >, const MT, const MT &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:218
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:375
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:211
Header file for the IsContiguous type trait.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:221
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:132
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Header file for the TVecMatMultExpr base class.
Expression object for transpose dense vector-dense matrix multiplications.The TDVecDMatMultExpr class...
Definition: Forward.h:156
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:311
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:343
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:490
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:108
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:212
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:104
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of vectors and matrices.The Size type trait evaluates the size of...
Definition: Size.h:80
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:321
Header file for the IsComplexFloat type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation. The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
Constraint on the data type.
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:207
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.