TDVecDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemv.h>
44 #include <blaze/math/blas/trmv.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/Exception.h>
59 #include <blaze/math/shims/Reset.h>
61 #include <blaze/math/SIMD.h>
81 #include <blaze/system/BLAS.h>
86 #include <blaze/util/Assert.h>
87 #include <blaze/util/Complex.h>
89 #include <blaze/util/DisableIf.h>
90 #include <blaze/util/EnableIf.h>
93 #include <blaze/util/mpl/And.h>
94 #include <blaze/util/mpl/If.h>
95 #include <blaze/util/Types.h>
103 
104 
105 namespace blaze {
106 
107 //=================================================================================================
108 //
109 // CLASS TDVECDMATMULTEXPR
110 //
111 //=================================================================================================
112 
113 //*************************************************************************************************
120 template< typename VT // Type of the left-hand side dense vector
121  , typename MT > // Type of the right-hand side dense matrix
122 class TDVecDMatMultExpr
123  : public TVecMatMultExpr< DenseVector< TDVecDMatMultExpr<VT,MT>, true > >
124  , private Computation
125 {
126  private:
127  //**Type definitions****************************************************************************
134  //**********************************************************************************************
135 
136  //**********************************************************************************************
// Compile-time flag: true when the vector operand type VT is a computation or
// requires an intermediate evaluation.
138  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
139  //**********************************************************************************************
140 
141  //**********************************************************************************************
// Compile-time flag for the matrix operand. The visible part requires MT to be a
// computation and the types MET and VET to be identical.
// NOTE(review): the remainder of this expression is not visible in this listing;
// confirm the full condition against the original header.
143  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
145  //**********************************************************************************************
146 
147  //**********************************************************************************************
149 
// Compile-time helper whose nested 'value' is true when at least one of the two
// operands is flagged for intermediate evaluation (evaluateVector || evaluateMatrix).
// The template parameter T1 does not take part in the computation of 'value'.
153  template< typename T1 >
154  struct UseSMPAssign {
155  enum : bool { value = ( evaluateVector || evaluateMatrix ) };
156  };
158  //**********************************************************************************************
159 
160  //**********************************************************************************************
162 
// Compile-time helper parameterized on three types. The visible conditions require
// all three types to be SIMD enabled and T1 and T3 to have the same element type.
// NOTE(review): the enum declaration and several conditions of this trait are
// missing from this listing; the name suggests it gates the BLAS-based kernels,
// but confirm the complete condition against the original header.
165  template< typename T1, typename T2, typename T3 >
166  struct UseBlasKernel {
172  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
177  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
178  };
180  //**********************************************************************************************
181 
182  //**********************************************************************************************
184 
// Compile-time helper parameterized on three types. The visible conditions require
// useOptimizedKernels to be set and all three types to be SIMD enabled; a further
// visible fragment involves the element type of T3.
// NOTE(review): several conditions and the closing of the enum are missing from
// this listing; the name suggests it gates the vectorized default kernels, but
// confirm the complete condition against the original header.
188  template< typename T1, typename T2, typename T3 >
189  struct UseVectorizedDefaultKernel {
190  enum : bool { value = useOptimizedKernels &&
192  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
195  , ElementType_<T3> >::value &&
198  };
200  //**********************************************************************************************
201 
202  public:
203  //**Type definitions****************************************************************************
// Return type of the element access functions: a const element.
209  using ReturnType = const ElementType;
// Type used when this expression is itself an operand: a const ResultType.
210  using CompositeType = const ResultType;
211 
// Storage type of the left-hand side vector operand, selected on IsExpression<VT>.
// Presumably If_ yields 'const VT' (by value) for expressions and 'const VT&'
// otherwise -- If_ is defined outside this excerpt, confirm.
213  using LeftOperand = If_< IsExpression<VT>, const VT, const VT& >;
214 
// Storage type of the right-hand side matrix operand, selected the same way on
// IsExpression<MT>.
216  using RightOperand = If_< IsExpression<MT>, const MT, const MT& >;
217 
220 
223  //**********************************************************************************************
224 
225  //**Compilation flags***************************************************************************
// Compilation flag for SIMD support. The visible conditions require that MT is not
// diagonal and that both operand types are SIMD enabled.
// NOTE(review): the remainder of this expression is missing from this listing.
227  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
228  VT::simdEnabled && MT::simdEnabled &&
231 
// Compilation flag for SMP assignments: true only if neither operand is flagged for
// intermediate evaluation and both operand types are themselves SMP assignable.
233  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
234  !evaluateMatrix && MT::smpAssignable };
235  //**********************************************************************************************
236 
237  //**SIMD properties*****************************************************************************
// Size reported by SIMDTrait for ElementType; the vectorized kernels in this class
// use it as their column step width.
239  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
240  //**********************************************************************************************
241 
242  //**Constructor*********************************************************************************
// Constructs the multiplication expression from its two operands.
//   vec  left-hand side dense vector operand, stored in vec_
//   mat  right-hand side dense matrix operand, stored in mat_
// The internal assertion checks that the vector size equals the number of matrix
// rows (the assertion macro is defined outside this excerpt; presumably it is only
// active in debug configurations -- confirm).
248  explicit inline TDVecDMatMultExpr( const VT& vec, const MT& mat ) noexcept
249  : vec_( vec ) // Left-hand side dense vector of the multiplication expression
250  , mat_( mat ) // Right-hand side dense matrix of the multiplication expression
251  {
252  BLAZE_INTERNAL_ASSERT( vec_.size() == mat_.rows(), "Invalid vector and matrix sizes" );
253  }
254  //**********************************************************************************************
255 
256  //**Subscript operator**************************************************************************
// Unchecked access to a single element of the result vector.
//   index  position in the result; asserted to be smaller than mat_.columns()
// Returns the product of the vector operand with column 'index' of the matrix
// operand, restricted to a sub-range of rows for lower/upper matrix types.
262  inline ReturnType operator[]( size_t index ) const {
263  BLAZE_INTERNAL_ASSERT( index < mat_.columns(), "Invalid vector access index" );
264 
// NOTE(review): the condition guarding this first branch is missing from this
// listing. The branch only uses the diagonal element mat_(index,index), so it is
// presumably taken for diagonal matrix types -- confirm.
266  {
267  return vec_[index] * mat_(index,index);
268  }
// Lower matrices: use only rows from 'begin' to the end (one row later for
// strictly lower matrices). Taken only for index > 8.
269  else if( IsLower<MT>::value && ( index > 8UL ) )
270  {
271  const size_t begin( IsStrictlyLower<MT>::value ? index+1UL : index );
272  const size_t n ( mat_.rows() - begin );
273  return subvector( vec_, begin, n ) * subvector( column( mat_, index ), begin, n );
274  }
// Upper matrices: use only the first n rows (one row fewer for strictly upper
// matrices). Taken only if index + 8 < number of rows.
275  else if( IsUpper<MT>::value && ( index + 8UL < mat_.rows() ) )
276  {
277  const size_t n( IsStrictlyUpper<MT>::value ? index : index+1UL );
278  return subvector( vec_, 0UL, n ) * subvector( column( mat_, index ), 0UL, n );
279  }
// Fallback: product of the full vector with the full column.
280  else
281  {
282  return vec_ * column( mat_, index );
283  }
284  }
285  //**********************************************************************************************
286 
287  //**At function*********************************************************************************
// Checked access to a single element of the result vector.
//   index  position in the result
// Invokes BLAZE_THROW_OUT_OF_RANGE with "Invalid vector access index" if index is
// not smaller than mat_.columns(); otherwise forwards to operator[].
294  inline ReturnType at( size_t index ) const {
295  if( index >= mat_.columns() ) {
296  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
297  }
298  return (*this)[index];
299  }
300  //**********************************************************************************************
301 
302  //**Size function*******************************************************************************
// Returns the size of the result vector, which is the number of columns of the
// matrix operand.
307  inline size_t size() const noexcept {
308  return mat_.columns();
309  }
310  //**********************************************************************************************
311 
312  //**Left operand access*************************************************************************
// Returns the stored left-hand side dense vector operand (vec_).
317  inline LeftOperand leftOperand() const noexcept {
318  return vec_;
319  }
320  //**********************************************************************************************
321 
322  //**Right operand access************************************************************************
// Returns the stored right-hand side dense matrix operand (mat_).
327  inline RightOperand rightOperand() const noexcept {
328  return mat_;
329  }
330  //**********************************************************************************************
331 
332  //**********************************************************************************************
// Reports whether the expression can alias with the given address.
//   alias  address to check
// Returns true if either operand reports being aliased with 'alias'.
338  template< typename T >
339  inline bool canAlias( const T* alias ) const noexcept {
340  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
341  }
342  //**********************************************************************************************
343 
344  //**********************************************************************************************
// Reports whether the expression is aliased with the given address.
//   alias  address to check
// Returns true if either operand reports being aliased with 'alias'.
350  template< typename T >
351  inline bool isAliased( const T* alias ) const noexcept {
352  return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
353  }
354  //**********************************************************************************************
355 
356  //**********************************************************************************************
// Returns true only if both the vector operand and the matrix operand report
// being aligned.
361  inline bool isAligned() const noexcept {
362  return vec_.isAligned() && mat_.isAligned();
363  }
364  //**********************************************************************************************
365 
366  //**********************************************************************************************
// Reports whether the expression can be used in SMP assignments.
// Returns true only if size() exceeds SMP_TDVECDMATMULT_THRESHOLD and at least one
// of the visible conditions holds: BLAZE_BLAS_MODE is disabled, the matrix operand
// is a computation that is not flagged for evaluation, or the number of matrix
// elements is below TDVECDMATMULT_THRESHOLD.
// NOTE(review): further alternatives of the first condition may be missing from
// this listing; confirm against the original header.
371  inline bool canSMPAssign() const noexcept {
372  return ( !BLAZE_BLAS_MODE ||
375  ( IsComputation<MT>::value && !evaluateMatrix ) ||
376  ( mat_.rows() * mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
377  ( size() > SMP_TDVECDMATMULT_THRESHOLD );
378  }
379  //**********************************************************************************************
380 
381  private:
382  //**Member variables****************************************************************************
385  //**********************************************************************************************
386 
387  //**Assignment to dense vectors*****************************************************************
// Assignment of the vector/matrix product to a dense (transpose) vector.
//   lhs  target dense vector; its size is asserted to match rhs.size()
//   rhs  multiplication expression to be assigned
// The types LT and RT used for the evaluated operands are declared outside this
// excerpt.
400  template< typename VT1 > // Type of the target dense vector
401  friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
402  {
404 
405  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
406 
// A matrix without rows contributes nothing: the target is reset. A matrix without
// columns means there is nothing to write, so the target is left untouched.
407  if( rhs.mat_.rows() == 0UL ) {
408  reset( ~lhs );
409  return;
410  }
411  else if( rhs.mat_.columns() == 0UL ) {
412  return;
413  }
414 
415  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
416  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
417 
418  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
419  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
420  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
421  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
422 
// Hand the evaluated operands over to the kernel selection.
423  TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
424  }
426  //**********************************************************************************************
427 
428  //**Assignment to dense vectors (kernel selection)**********************************************
// Selects the kernel for the assignment y = x * A.
//   y  target vector
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
// The small-matrix kernel is used if MT1 is diagonal, if the original matrix
// operand MT is a computation that is not flagged for evaluation, or if the number
// of matrix elements is below TDVECDMATMULT_THRESHOLD. Otherwise the BLAS kernel
// selection is used.
439  template< typename VT1 // Type of the left-hand side target vector
440  , typename VT2 // Type of the left-hand side vector operand
441  , typename MT1 > // Type of the right-hand side matrix operand
442  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
443  {
444  if( ( IsDiagonal<MT1>::value ) ||
445  ( IsComputation<MT>::value && !evaluateMatrix ) ||
446  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
447  selectSmallAssignKernel( y, x, A );
448  else
449  selectBlasAssignKernel( y, x, A );
450  }
452  //**********************************************************************************************
453 
454  //**Default assignment to dense vectors*********************************************************
// Default (scalar) kernel for the assignment y = x * A.
//   y  target vector
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
// The matrix is traversed row by row; each row i adds x[i] * A(i,j) to y[j] over a
// column range that is narrowed for lower/upper matrix types.
// NOTE(review): three guarding conditions of this function are missing from this
// listing (before the two reset() calls and before the diagonal branch); they are
// marked below.
468  template< typename VT1 // Type of the left-hand side target vector
469  , typename VT2 // Type of the left-hand side vector operand
470  , typename MT1 > // Type of the right-hand side matrix operand
471  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
472  {
473  const size_t M( A.rows() );
474  const size_t N( A.columns() );
475 
// NOTE(review): opening condition not visible; the first element of y is reset here.
477  reset( y[0] );
478  }
479 
// For non-lower matrices the first row initializes y by plain assignment
// (skipping column 0 for strictly upper matrices).
480  if( !IsLower<MT1>::value )
481  {
482  const size_t jbegin( IsStrictlyUpper<MT1>::value ? 1UL : 0UL );
483  for( size_t j=jbegin; j<N; ++j ) {
484  y[j] = x[0UL] * A(0UL,j);
485  }
486  }
487 
// Remaining rows. The loop starts at row 0 only for lower, non-strictly-lower
// matrices, since in all other cases row 0 needs no handling in this loop.
488  for( size_t i=( IsLower<MT1>::value && !IsStrictlyLower<MT1>::value ? 0UL : 1UL ); i<M; ++i )
489  {
// NOTE(review): condition of this branch not visible; it only touches A(i,i).
491  {
492  y[i] = x[i] * A(i,i);
493  }
494  else
495  {
496  const size_t jbegin( ( IsUpper<MT1>::value )
497  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
498  :( 0UL ) );
499  const size_t jend( ( IsLower<MT1>::value )
500  ?( IsStrictlyLower<MT1>::value ? i-1UL : i )
501  :( N ) );
502  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
503 
// jpos is jbegin plus the largest even count not exceeding jnum, so the loop
// below can process two columns per iteration.
504  const size_t jnum( jend - jbegin );
505  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
506 
507  for( size_t j=jbegin; j<jpos; j+=2UL ) {
508  y[j ] += x[i] * A(i,j );
509  y[j+1UL] += x[i] * A(i,j+1UL);
510  }
// Single leftover column when the column count is odd.
511  if( jpos < jend ) {
512  y[jpos] += x[i] * A(i,jpos);
513  }
// For lower matrices y[jend] is written for the first time by this row, hence
// assignment instead of accumulation.
514  if( IsLower<MT1>::value ) {
515  y[jend] = x[i] * A(i,jend);
516  }
517  }
518  }
519 
// NOTE(review): opening condition not visible; the last element of y is reset here.
521  reset( y[N-1UL] );
522  }
523  }
525  //**********************************************************************************************
526 
527  //**Default assignment to dense vectors (small matrices)****************************************
// Small-matrix assignment kernel, non-vectorized overload: forwards to the default
// assignment kernel.
// NOTE(review): the line carrying the return type (and presumably the condition
// that separates this overload from the vectorized one) is missing from this
// listing.
541  template< typename VT1 // Type of the left-hand side target vector
542  , typename VT2 // Type of the left-hand side vector operand
543  , typename MT1 > // Type of the right-hand side matrix operand
545  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
546  {
547  selectDefaultAssignKernel( y, x, A );
548  }
550  //**********************************************************************************************
551 
552  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Small-matrix assignment kernel, vectorized overload, for y = x * A.
//   y  target vector
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
// The columns are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors (SIMDSIZE
// elements each); any columns left over are handled element-wise. For every column
// group the row range [ibegin,iend) is narrowed for lower/upper matrix types.
// NOTE(review): the line carrying the return type of this overload is missing from
// this listing. The xmm accumulators are default-constructed before being used
// with +=; presumably a default-constructed SIMDType is zero -- confirm.
566  template< typename VT1 // Type of the left-hand side target vector
567  , typename VT2 // Type of the left-hand side vector operand
568  , typename MT1 > // Type of the right-hand side matrix operand
570  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
571  {
// A scalar remainder loop is needed unless both the target vector and the matrix
// are padded.
572  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
573 
574  const size_t M( A.rows() );
575  const size_t N( A.columns() );
576 
// End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE
// if a remainder loop is required, otherwise N itself.
577  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
578  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
579 
580  size_t j( 0UL );
581 
// Column groups of 8 SIMD vectors.
582  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
583  {
584  const size_t ibegin( ( IsLower<MT1>::value )
585  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
586  :( 0UL ) );
587  const size_t iend( ( IsUpper<MT1>::value )
588  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
589  :( M ) );
590  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
591 
592  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
593 
594  for( size_t i=ibegin; i<iend; ++i ) {
595  const SIMDType x1( set( x[i] ) );
596  xmm1 += x1 * A.load(i,j );
597  xmm2 += x1 * A.load(i,j+SIMDSIZE );
598  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
599  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
600  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
601  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
602  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
603  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
604  }
605 
606  y.store( j , xmm1 );
607  y.store( j+SIMDSIZE , xmm2 );
608  y.store( j+SIMDSIZE*2UL, xmm3 );
609  y.store( j+SIMDSIZE*3UL, xmm4 );
610  y.store( j+SIMDSIZE*4UL, xmm5 );
611  y.store( j+SIMDSIZE*5UL, xmm6 );
612  y.store( j+SIMDSIZE*6UL, xmm7 );
613  y.store( j+SIMDSIZE*7UL, xmm8 );
614  }
615 
// Column groups of 4 SIMD vectors.
616  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
617  {
618  const size_t ibegin( ( IsLower<MT1>::value )
619  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
620  :( 0UL ) );
621  const size_t iend( ( IsUpper<MT1>::value )
622  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
623  :( M ) );
624  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
625 
626  SIMDType xmm1, xmm2, xmm3, xmm4;
627 
628  for( size_t i=ibegin; i<iend; ++i ) {
629  const SIMDType x1( set( x[i] ) );
630  xmm1 += x1 * A.load(i,j );
631  xmm2 += x1 * A.load(i,j+SIMDSIZE );
632  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
633  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
634  }
635 
636  y.store( j , xmm1 );
637  y.store( j+SIMDSIZE , xmm2 );
638  y.store( j+SIMDSIZE*2UL, xmm3 );
639  y.store( j+SIMDSIZE*3UL, xmm4 );
640  }
641 
// Column groups of 3 SIMD vectors.
642  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
643  {
644  const size_t ibegin( ( IsLower<MT1>::value )
645  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
646  :( 0UL ) );
647  const size_t iend( ( IsUpper<MT1>::value )
648  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
649  :( M ) );
650  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
651 
652  SIMDType xmm1, xmm2, xmm3;
653 
654  for( size_t i=ibegin; i<iend; ++i ) {
655  const SIMDType x1( set( x[i] ) );
656  xmm1 += x1 * A.load(i,j );
657  xmm2 += x1 * A.load(i,j+SIMDSIZE );
658  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
659  }
660 
661  y.store( j , xmm1 );
662  y.store( j+SIMDSIZE , xmm2 );
663  y.store( j+SIMDSIZE*2UL, xmm3 );
664  }
665 
// Column groups of 2 SIMD vectors.
666  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
667  {
668  const size_t ibegin( ( IsLower<MT1>::value )
669  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
670  :( 0UL ) );
671  const size_t iend( ( IsUpper<MT1>::value )
672  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
673  :( M ) );
674  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
675 
676  SIMDType xmm1, xmm2;
677 
678  for( size_t i=ibegin; i<iend; ++i ) {
679  const SIMDType x1( set( x[i] ) );
680  xmm1 += x1 * A.load(i,j );
681  xmm2 += x1 * A.load(i,j+SIMDSIZE);
682  }
683 
684  y.store( j , xmm1 );
685  y.store( j+SIMDSIZE, xmm2 );
686  }
687 
// Single SIMD vectors.
688  for( ; j<jpos; j+=SIMDSIZE )
689  {
690  const size_t ibegin( ( IsLower<MT1>::value )
691  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
692  :( 0UL ) );
693  const size_t iend( ( IsUpper<MT1>::value )
694  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
695  :( M ) );
696  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
697 
698  SIMDType xmm1;
699 
700  for( size_t i=ibegin; i<iend; ++i ) {
701  xmm1 += set( x[i] ) * A.load(i,j);
702  }
703 
704  y.store( j, xmm1 );
705  }
706 
// Element-wise handling of the columns that do not fill a complete SIMD vector.
707  for( ; remainder && j<N; ++j )
708  {
709  const size_t ibegin( ( IsLower<MT1>::value )
710  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
711  :( 0UL ) );
712  const size_t iend( ( IsUpper<MT1>::value )
713  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
714  :( M ) );
715  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
716 
717  ElementType value = ElementType();
718 
719  for( size_t i=ibegin; i<iend; ++i ) {
720  value += x[i] * A(i,j);
721  }
722 
723  y[j] = value;
724  }
725  }
727  //**********************************************************************************************
728 
729  //**Default assignment to dense vectors (large matrices)****************************************
// Large-matrix assignment kernel, non-vectorized overload: forwards to the default
// assignment kernel.
// NOTE(review): the line carrying the return type (and presumably the condition
// that separates this overload from the vectorized one) is missing from this
// listing.
743  template< typename VT1 // Type of the left-hand side target vector
744  , typename VT2 // Type of the left-hand side vector operand
745  , typename MT1 > // Type of the right-hand side matrix operand
747  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
748  {
749  selectDefaultAssignKernel( y, x, A );
750  }
752  //**********************************************************************************************
753 
754  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Large-matrix assignment kernel, vectorized overload, for y = x * A.
//   y  target vector
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
// The target is reset first and the result is then accumulated block by block:
// the matrix is traversed in column blocks of jblock elements and row blocks of
// iblock rows, and within each block the columns are processed in groups of
// 8, 4, 3, 2 and 1 SIMD vectors followed by an element-wise remainder loop.
// NOTE(review): the line carrying the return type of this overload is missing from
// this listing. The xmm accumulators are default-constructed before being used
// with +=; presumably a default-constructed SIMDType is zero -- confirm.
768  template< typename VT1 // Type of the left-hand side target vector
769  , typename VT2 // Type of the left-hand side vector operand
770  , typename MT1 > // Type of the right-hand side matrix operand
772  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
773  {
// A scalar remainder loop is needed unless both the target vector and the matrix
// are padded.
774  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
775 
776  const size_t M( A.rows() );
777  const size_t N( A.columns() );
778 
// Block sizes: jblock columns correspond to 32768 bytes of ElementType; iblock is
// 8 rows if all columns fit into one column block, otherwise 4 rows.
779  const size_t jblock( 32768UL / sizeof( ElementType ) );
780  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
781 
782  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
783 
// Every block below adds its contribution to y, so y has to start out reset.
784  reset( y );
785 
786  for( size_t jj=0U; jj<N; jj+=jblock ) {
787  for( size_t ii=0UL; ii<M; ii+=iblock )
788  {
789  const size_t iend( min( ii+iblock, M ) );
790  const size_t jtmp( min( jj+jblock, N ) );
// For lower matrices the column range of the block is cut off at the last row of
// the block (one column earlier for strictly lower matrices).
791  const size_t jend( ( IsLower<MT1>::value )
792  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
793  :( jtmp ) );
794 
795  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
796  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
797 
// For upper matrices the first column is derived from the first row of the block,
// rounded down to a multiple of SIMDSIZE, but never before the block start jj.
798  size_t j( ( IsUpper<MT1>::value )
799  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
800  :( jj ) );
801 
// Column groups of 8 SIMD vectors.
802  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
803  {
804  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
805 
806  for( size_t i=ii; i<iend; ++i ) {
807  const SIMDType x1( set( x[i] ) );
808  xmm1 += x1 * A.load(i,j );
809  xmm2 += x1 * A.load(i,j+SIMDSIZE );
810  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
811  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
812  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
813  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
814  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
815  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
816  }
817 
818  y.store( j , y.load(j ) + xmm1 );
819  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
820  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
821  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
822  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
823  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
824  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
825  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
826  }
827 
// Column groups of 4 SIMD vectors.
828  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
829  {
830  SIMDType xmm1, xmm2, xmm3, xmm4;
831 
832  for( size_t i=ii; i<iend; ++i ) {
833  const SIMDType x1( set( x[i] ) );
834  xmm1 += x1 * A.load(i,j );
835  xmm2 += x1 * A.load(i,j+SIMDSIZE );
836  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
837  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
838  }
839 
840  y.store( j , y.load(j ) + xmm1 );
841  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
842  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
843  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
844  }
845 
// Column groups of 3 SIMD vectors.
846  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
847  {
848  SIMDType xmm1, xmm2, xmm3;
849 
850  for( size_t i=ii; i<iend; ++i ) {
851  const SIMDType x1( set( x[i] ) );
852  xmm1 += x1 * A.load(i,j );
853  xmm2 += x1 * A.load(i,j+SIMDSIZE );
854  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
855  }
856 
857  y.store( j , y.load(j ) + xmm1 );
858  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
859  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
860  }
861 
// Column groups of 2 SIMD vectors.
862  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
863  {
864  SIMDType xmm1, xmm2;
865 
866  for( size_t i=ii; i<iend; ++i ) {
867  const SIMDType x1( set( x[i] ) );
868  xmm1 += x1 * A.load(i,j );
869  xmm2 += x1 * A.load(i,j+SIMDSIZE);
870  }
871 
872  y.store( j , y.load(j ) + xmm1 );
873  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
874  }
875 
// Single SIMD vectors.
876  for( ; j<jpos; j+=SIMDSIZE )
877  {
878  SIMDType xmm1;
879 
880  for( size_t i=ii; i<iend; ++i ) {
881  xmm1 += set( x[i] ) * A.load(i,j);
882  }
883 
884  y.store( j, y.load(j) + xmm1 );
885  }
886 
// Element-wise handling of the columns that do not fill a complete SIMD vector.
887  for( ; remainder && j<jend; ++j )
888  {
889  ElementType value = ElementType();
890 
891  for( size_t i=ii; i<iend; ++i ) {
892  value += x[i] * A(i,j);
893  }
894 
895  y[j] += value;
896  }
897  }
898  }
899  }
901  //**********************************************************************************************
902 
903  //**BLAS-based assignment to dense vectors (default)********************************************
// BLAS assignment kernel, fallback overload: forwards to the large-matrix
// assignment kernel.
// NOTE(review): the line carrying the return type (and presumably the condition
// that separates this overload from the BLAS-based one) is missing from this
// listing.
917  template< typename VT1 // Type of the left-hand side target vector
918  , typename VT2 // Type of the left-hand side vector operand
919  , typename MT1 > // Type of the right-hand side matrix operand
921  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
922  {
923  selectLargeAssignKernel( y, x, A );
924  }
926  //**********************************************************************************************
927 
928  //**BLAS-based assignment to dense vectors******************************************************
929 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
930 
// BLAS-based assignment kernel for y = x * A. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set.
//   y  target vector
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
// NOTE(review): the line carrying the return type and the condition of the first
// branch are missing from this listing.
943  template< typename VT1 // Type of the left-hand side target vector
944  , typename VT2 // Type of the left-hand side vector operand
945  , typename MT1 > // Type of the right-hand side matrix operand
947  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
948  {
949  using ET = ElementType_<VT1>;
950 
// First branch (condition not visible; presumably triangular matrices -- confirm):
// copy x into y and let trmv work on y, passing the lower/upper flag that matches
// the matrix type.
952  assign( y, x );
953  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
954  }
// Otherwise: call gemv with the scalar arguments ET(1) and ET(0).
955  else {
956  gemv( y, x, A, ET(1), ET(0) );
957  }
958  }
960 #endif
961  //**********************************************************************************************
962 
963  //**Assignment to sparse vectors****************************************************************
// Assignment of the vector/matrix product to a sparse (transpose) vector.
//   lhs  target sparse vector; its size is asserted to match rhs.size()
//   rhs  multiplication expression to be assigned
// The expression is first evaluated serially into a temporary of type ResultType,
// which is then assigned to the target.
976  template< typename VT1 > // Type of the target sparse vector
977  friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
978  {
980 
984 
985  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
986 
987  const ResultType tmp( serial( rhs ) );
988  assign( ~lhs, tmp );
989  }
991  //**********************************************************************************************
992 
993  //**Addition assignment to dense vectors********************************************************
// Addition assignment of the vector/matrix product to a dense (transpose) vector.
//   lhs  target dense vector; its size is asserted to match rhs.size()
//   rhs  multiplication expression to be added
// The types LT and RT used for the evaluated operands are declared outside this
// excerpt.
1006  template< typename VT1 > // Type of the target dense vector
1007  friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
1008  {
1010 
1011  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1012 
// An empty matrix contributes nothing, so the target is left untouched.
1013  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1014  return;
1015  }
1016 
1017  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1018  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1019 
1020  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1021  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1022  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1023  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1024 
// Hand the evaluated operands over to the kernel selection.
1025  TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1026  }
1028  //**********************************************************************************************
1029 
1030  //**Addition assignment to dense vectors (kernel selection)*************************************
// Selects the kernel for the addition assignment y += x * A.
//   y  target vector
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
// The small-matrix kernel is used if MT1 is diagonal, if the original matrix
// operand MT is a computation that is not flagged for evaluation, or if the number
// of matrix elements is below TDVECDMATMULT_THRESHOLD. Otherwise the BLAS kernel
// selection is used.
1041  template< typename VT1 // Type of the left-hand side target vector
1042  , typename VT2 // Type of the left-hand side vector operand
1043  , typename MT1 > // Type of the right-hand side matrix operand
1044  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1045  {
1046  if( ( IsDiagonal<MT1>::value ) ||
1047  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1048  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1049  selectSmallAddAssignKernel( y, x, A );
1050  else
1051  selectBlasAddAssignKernel( y, x, A );
1052  }
1054  //**********************************************************************************************
1055 
1056  //**Default addition assignment to dense vectors************************************************
// Default (scalar) kernel for the addition assignment y += x * A.
//   y  target vector
//   x  left-hand side vector operand
//   A  right-hand side matrix operand
// The matrix is traversed row by row; each row i adds x[i] * A(i,j) to y[j] over a
// column range that is narrowed for lower/upper matrix types.
// NOTE(review): the condition of the first branch inside the loop is missing from
// this listing.
1070  template< typename VT1 // Type of the left-hand side target vector
1071  , typename VT2 // Type of the left-hand side vector operand
1072  , typename MT1 > // Type of the right-hand side matrix operand
1073  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1074  {
1075  const size_t M( A.rows() );
1076  const size_t N( A.columns() );
1077 
1078  for( size_t i=0UL; i<M; ++i )
1079  {
// NOTE(review): condition of this branch not visible; it only touches A(i,i).
1081  {
1082  y[i] += x[i] * A(i,i);
1083  }
1084  else
1085  {
1086  const size_t jbegin( ( IsUpper<MT1>::value )
1087  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1088  :( 0UL ) );
1089  const size_t jend( ( IsLower<MT1>::value )
1090  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1091  :( N ) );
1092  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1093 
// jpos is jbegin plus the largest even count not exceeding jnum, so the loop
// below can process two columns per iteration.
1094  const size_t jnum( jend - jbegin );
1095  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1096 
1097  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1098  y[j ] += x[i] * A(i,j );
1099  y[j+1UL] += x[i] * A(i,j+1UL);
1100  }
// Single leftover column when the column count is odd.
1101  if( jpos < jend ) {
1102  y[jpos] += x[i] * A(i,jpos);
1103  }
1104  }
1105  }
1106  }
1108  //**********************************************************************************************
1109 
1110  //**Default addition assignment to dense vectors (small matrices)*******************************
// Small-matrix addition assignment kernel, non-vectorized overload: forwards to the
// default addition assignment kernel.
// NOTE(review): the line carrying the return type (and presumably the condition
// that separates this overload from the vectorized one) is missing from this
// listing.
1124  template< typename VT1 // Type of the left-hand side target vector
1125  , typename VT2 // Type of the left-hand side vector operand
1126  , typename MT1 > // Type of the right-hand side matrix operand
1128  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1129  {
1130  selectDefaultAddAssignKernel( y, x, A );
1131  }
1133  //**********************************************************************************************
1134 
1135  //**Vectorized default addition assignment to dense vectors (small matrices)********************
// Vectorized kernel for the addition assignment y += x * A; per the section banner this is the
// variant for small matrices.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// The columns of A are processed in strips of 8, 4, 3, 2 and finally 1 SIMD vector(s). For each
// strip the accumulators are preloaded from y, every row i in [ibegin,iend) contributes
// set( x[i] ) * A.load(i,...), and the accumulators are stored back to y.
//
// NOTE(review): the return-type line that belongs between the template header and the function
// name is not present in this listing.
1149  template< typename VT1 // Type of the left-hand side target vector
1150  , typename VT2 // Type of the left-hand side vector operand
1151  , typename MT1 > // Type of the right-hand side matrix operand
1153  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1154  {
// A scalar remainder loop is needed unless both the target vector and the matrix are padded.
1155  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1156 
1157  const size_t M( A.rows() );
1158  const size_t N( A.columns() );
1159 
// jpos: N rounded down to a multiple of SIMDSIZE if a remainder loop is needed, N otherwise.
1160  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1161  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1162 
1163  size_t j( 0UL );
1164 
// Strip of 8 SIMD vectors.
1165  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1166  {
// Row range: starts at j (j+1 if strictly lower) for lower matrix types, ends at the last column
// of the strip (one less if strictly upper, clamped to M) for upper matrix types.
1167  const size_t ibegin( ( IsLower<MT1>::value )
1168  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1169  :( 0UL ) );
1170  const size_t iend( ( IsUpper<MT1>::value )
1171  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1172  :( M ) );
1173  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1174 
// The accumulators start from the current content of y, so the stores below write y + x*A.
1175  SIMDType xmm1( y.load(j ) );
1176  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1177  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1178  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1179  SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1180  SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1181  SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1182  SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1183 
1184  for( size_t i=ibegin; i<iend; ++i ) {
1185  const SIMDType x1( set( x[i] ) );
1186  xmm1 += x1 * A.load(i,j );
1187  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1188  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1189  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1190  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1191  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1192  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1193  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1194  }
1195 
1196  y.store( j , xmm1 );
1197  y.store( j+SIMDSIZE , xmm2 );
1198  y.store( j+SIMDSIZE*2UL, xmm3 );
1199  y.store( j+SIMDSIZE*3UL, xmm4 );
1200  y.store( j+SIMDSIZE*4UL, xmm5 );
1201  y.store( j+SIMDSIZE*5UL, xmm6 );
1202  y.store( j+SIMDSIZE*6UL, xmm7 );
1203  y.store( j+SIMDSIZE*7UL, xmm8 );
1204  }
1205 
// Strip of 4 SIMD vectors (same scheme as the strip of 8).
1206  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1207  {
1208  const size_t ibegin( ( IsLower<MT1>::value )
1209  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1210  :( 0UL ) );
1211  const size_t iend( ( IsUpper<MT1>::value )
1212  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1213  :( M ) );
1214  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1215 
1216  SIMDType xmm1( y.load(j ) );
1217  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1218  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1219  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1220 
1221  for( size_t i=ibegin; i<iend; ++i ) {
1222  const SIMDType x1( set( x[i] ) );
1223  xmm1 += x1 * A.load(i,j );
1224  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1225  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1226  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1227  }
1228 
1229  y.store( j , xmm1 );
1230  y.store( j+SIMDSIZE , xmm2 );
1231  y.store( j+SIMDSIZE*2UL, xmm3 );
1232  y.store( j+SIMDSIZE*3UL, xmm4 );
1233  }
1234 
// Strip of 3 SIMD vectors.
1235  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1236  {
1237  const size_t ibegin( ( IsLower<MT1>::value )
1238  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1239  :( 0UL ) );
1240  const size_t iend( ( IsUpper<MT1>::value )
1241  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1242  :( M ) );
1243  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1244 
1245  SIMDType xmm1( y.load(j ) );
1246  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1247  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1248 
1249  for( size_t i=ibegin; i<iend; ++i ) {
1250  const SIMDType x1( set( x[i] ) );
1251  xmm1 += x1 * A.load(i,j );
1252  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1253  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1254  }
1255 
1256  y.store( j , xmm1 );
1257  y.store( j+SIMDSIZE , xmm2 );
1258  y.store( j+SIMDSIZE*2UL, xmm3 );
1259  }
1260 
// Strip of 2 SIMD vectors.
1261  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1262  {
1263  const size_t ibegin( ( IsLower<MT1>::value )
1264  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1265  :( 0UL ) );
1266  const size_t iend( ( IsUpper<MT1>::value )
1267  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1268  :( M ) );
1269  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1270 
1271  SIMDType xmm1( y.load(j ) );
1272  SIMDType xmm2( y.load(j+SIMDSIZE) );
1273 
1274  for( size_t i=ibegin; i<iend; ++i ) {
1275  const SIMDType x1( set( x[i] ) );
1276  xmm1 += x1 * A.load(i,j );
1277  xmm2 += x1 * A.load(i,j+SIMDSIZE);
1278  }
1279 
1280  y.store( j , xmm1 );
1281  y.store( j+SIMDSIZE, xmm2 );
1282  }
1283 
// Single SIMD vector per iteration for whatever is left below jpos.
1284  for( ; j<jpos; j+=SIMDSIZE )
1285  {
1286  const size_t ibegin( ( IsLower<MT1>::value )
1287  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1288  :( 0UL ) );
1289  const size_t iend( ( IsUpper<MT1>::value )
1290  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1291  :( M ) );
1292  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1293 
1294  SIMDType xmm1( y.load(j) );
1295 
1296  for( size_t i=ibegin; i<iend; ++i ) {
1297  xmm1 += set( x[i] ) * A.load(i,j);
1298  }
1299 
1300  y.store( j, xmm1 );
1301  }
1302 
// Scalar tail for the columns in [jpos,N); never entered when both operands are padded.
1303  for( ; remainder && j<N; ++j )
1304  {
1305  const size_t ibegin( ( IsLower<MT1>::value )
1306  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1307  :( 0UL ) );
1308  const size_t iend( ( IsUpper<MT1>::value )
1309  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1310  :( M ) );
1311  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1312 
1313  ElementType value = ElementType();
1314 
1315  for( size_t i=ibegin; i<iend; ++i ) {
1316  value += x[i] * A(i,j);
1317  }
1318 
1319  y[j] += value;
1320  }
1321  }
1323  //**********************************************************************************************
1324 
1325  //**Default addition assignment to dense vectors (large matrices)*******************************
// Default large-matrix kernel for the addition assignment (see banner): forwards its arguments
// unchanged to selectDefaultAddAssignKernel().
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// NOTE(review): the return-type line is not present in this listing; presumably it carries the
// condition that separates this overload from the vectorized one below -- confirm.
1339  template< typename VT1 // Type of the left-hand side target vector
1340  , typename VT2 // Type of the left-hand side vector operand
1341  , typename MT1 > // Type of the right-hand side matrix operand
1343  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1344  {
1345  selectDefaultAddAssignKernel( y, x, A );
1346  }
1348  //**********************************************************************************************
1349 
1350  //**Vectorized default addition assignment to dense vectors (large matrices)********************
// Vectorized, blocked kernel for the addition assignment y += x * A; per the section banner this
// is the variant for large matrices.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// A is traversed in column blocks of jblock elements and, inside each column block, in row
// blocks of iblock rows. For every (row block, column block) pair the partial products are
// accumulated in SIMD registers over the rows [ii,iend) and then added to the matching part of y.
//
// NOTE(review): the return-type line that belongs between the template header and the function
// name is not present in this listing.
1364  template< typename VT1 // Type of the left-hand side target vector
1365  , typename VT2 // Type of the left-hand side vector operand
1366  , typename MT1 > // Type of the right-hand side matrix operand
1368  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1369  {
// A scalar remainder loop is needed unless both the target vector and the matrix are padded.
1370  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1371 
1372  const size_t M( A.rows() );
1373  const size_t N( A.columns() );
1374 
// Column block length: the number of ElementType values that fit into 32768 bytes.
// NOTE(review): presumably chosen for cache blocking -- confirm.
1375  const size_t jblock( 32768UL / sizeof( ElementType ) );
// Rows per block: 8 if N is smaller than one column block, 4 otherwise.
1376  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1377 
1378  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1379 
1380  for( size_t jj=0U; jj<N; jj+=jblock ) {
1381  for( size_t ii=0UL; ii<M; ii+=iblock )
1382  {
1383  const size_t iend( min( ii+iblock, M ) );
1384  const size_t jtmp( min( jj+jblock, N ) );
// For lower matrix types the column range is additionally cut at iend (iend-1 if strictly lower).
1385  const size_t jend( ( IsLower<MT1>::value )
1386  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
1387  :( jtmp ) );
1388 
// jpos: jend rounded down to a multiple of SIMDSIZE if a remainder loop is needed.
1389  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1390  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1391 
// For upper matrix types start at ii (ii+1 if strictly upper) rounded down to a multiple of
// SIMDSIZE, but never before the start of the column block jj.
1392  size_t j( ( IsUpper<MT1>::value )
1393  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
1394  :( jj ) );
1395 
// Strip of 8 SIMD vectors. The accumulators are default-constructed and the partial sums are
// added to y on store. NOTE(review): this relies on a default-constructed SIMDType being zero --
// confirm against the SIMD type definitions.
1396  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1397  {
1398  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1399 
1400  for( size_t i=ii; i<iend; ++i ) {
1401  const SIMDType x1( set( x[i] ) );
1402  xmm1 += x1 * A.load(i,j );
1403  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1404  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1405  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1406  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1407  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1408  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1409  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1410  }
1411 
1412  y.store( j , y.load(j ) + xmm1 );
1413  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1414  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1415  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1416  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
1417  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
1418  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
1419  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
1420  }
1421 
// Strip of 4 SIMD vectors.
1422  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1423  {
1424  SIMDType xmm1, xmm2, xmm3, xmm4;
1425 
1426  for( size_t i=ii; i<iend; ++i ) {
1427  const SIMDType x1( set( x[i] ) );
1428  xmm1 += x1 * A.load(i,j );
1429  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1430  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1431  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1432  }
1433 
1434  y.store( j , y.load(j ) + xmm1 );
1435  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1436  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1437  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1438  }
1439 
// Strip of 3 SIMD vectors.
1440  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1441  {
1442  SIMDType xmm1, xmm2, xmm3;
1443 
1444  for( size_t i=ii; i<iend; ++i ) {
1445  const SIMDType x1( set( x[i] ) );
1446  xmm1 += x1 * A.load(i,j );
1447  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1448  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1449  }
1450 
1451  y.store( j , y.load(j ) + xmm1 );
1452  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1453  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1454  }
1455 
// Strip of 2 SIMD vectors.
1456  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1457  {
1458  SIMDType xmm1, xmm2;
1459 
1460  for( size_t i=ii; i<iend; ++i ) {
1461  const SIMDType x1( set( x[i] ) );
1462  xmm1 += x1 * A.load(i,j );
1463  xmm2 += x1 * A.load(i,j+SIMDSIZE);
1464  }
1465 
1466  y.store( j , y.load(j ) + xmm1 );
1467  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
1468  }
1469 
// Single SIMD vector per iteration for whatever is left below jpos.
1470  for( ; j<jpos; j+=SIMDSIZE )
1471  {
1472  SIMDType xmm1;
1473 
1474  for( size_t i=ii; i<iend; ++i ) {
1475  xmm1 += set( x[i] ) * A.load(i,j);
1476  }
1477 
1478  y.store( j, y.load(j) + xmm1 );
1479  }
1480 
// Scalar tail for the columns in [jpos,jend); never entered when both operands are padded.
1481  for( ; remainder && j<jend; ++j )
1482  {
1483  ElementType value = ElementType();
1484 
1485  for( size_t i=ii; i<iend; ++i ) {
1486  value += x[i] * A(i,j);
1487  }
1488 
1489  y[j] += value;
1490  }
1491  }
1492  }
1493  }
1495  //**********************************************************************************************
1496 
1497  //**BLAS-based addition assignment to dense vectors (default)***********************************
// Default variant of the BLAS-based addition assignment kernel (see banner): forwards its
// arguments unchanged to selectLargeAddAssignKernel().
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// NOTE(review): the return-type line is not present in this listing; presumably it carries the
// condition that separates this overload from the BLAS-backed one -- confirm.
1511  template< typename VT1 // Type of the left-hand side target vector
1512  , typename VT2 // Type of the left-hand side vector operand
1513  , typename MT1 > // Type of the right-hand side matrix operand
1515  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1516  {
1517  selectLargeAddAssignKernel( y, x, A );
1518  }
1520  //**********************************************************************************************
1521 
1522  //**BLAS-based addition assignment to dense vectors*********************************************
1523 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1524 
// BLAS-based kernel for the addition assignment y += x * A. Only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// NOTE(review): the return-type line that belongs between the template header and the function
// name is not present in this listing.
1537  template< typename VT1 // Type of the left-hand side target vector
1538  , typename VT2 // Type of the left-hand side vector operand
1539  , typename MT1 > // Type of the right-hand side matrix operand
1541  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
1542  {
1543  using ET = ElementType_<VT1>;
1544 
// Triangular matrices: trmv() works on a temporary that starts as a copy of x; the temporary is
// then added to y.
1545  if( IsTriangular<MT1>::value ) {
1546  ResultType_<VT1> tmp( serial( x ) );
1547  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1548  addAssign( y, tmp );
1549  }
1550  else {
// NOTE(review): the two scalars are presumably alpha and beta, i.e. y = 1*(x*A) + 1*y -- confirm
// against the gemv() wrapper.
1551  gemv( y, x, A, ET(1), ET(1) );
1552  }
1553  }
1555 #endif
1556  //**********************************************************************************************
1557 
1558  //**Addition assignment to sparse vectors*******************************************************
1559  // No special implementation for the addition assignment to sparse vectors.
1560  //**********************************************************************************************
1561 
1562  //**Subtraction assignment to dense vectors*****************************************************
// Subtraction assignment of a transpose dense vector / dense matrix multiplication to a dense
// vector (lhs -= rhs.vec_ * rhs.mat_).
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression to be subtracted.
//
// Returns immediately if the matrix operand has no rows or no columns. Otherwise both operands
// are evaluated via serial() and the work is handed to selectSubAssignKernel().
1575  template< typename VT1 > // Type of the target dense vector
1576  friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
1577  {
1579 
1580  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
1581 
1582  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1583  return;
1584  }
1585 
1586  LT x( serial( rhs.vec_ ) ); // Evaluation of the left-hand side dense vector operand
1587  RT A( serial( rhs.mat_ ) ); // Evaluation of the right-hand side dense matrix operand
1588 
1589  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
1590  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
1591  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
1592  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
1593 
1594  TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1595  }
1597  //**********************************************************************************************
1598 
1599  //**Subtraction assignment to dense vectors (kernel selection)**********************************
// Chooses the kernel for the subtraction assignment y -= x * A.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// The small kernel is used if MT1 is diagonal, if the matrix operand type MT is a computation
// that is not evaluated (!evaluateMatrix), or if the number of matrix elements is below
// TDVECDMATMULT_THRESHOLD. In all other cases the BLAS kernel is selected.
1610  template< typename VT1 // Type of the left-hand side target vector
1611  , typename VT2 // Type of the left-hand side vector operand
1612  , typename MT1 > // Type of the right-hand side matrix operand
1613  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1614  {
1615  if( ( IsDiagonal<MT1>::value ) ||
1616  ( IsComputation<MT>::value && !evaluateMatrix ) ||
1617  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1618  selectSmallSubAssignKernel( y, x, A );
1619  else
1620  selectBlasSubAssignKernel( y, x, A );
1621  }
1623  //**********************************************************************************************
1624 
1625  //**Default subtraction assignment to dense vectors*********************************************
// Default (scalar) kernel for the subtraction assignment y -= x * A.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// Iterates over the rows of A; each row i subtracts x[i] * A(i,j) from y[j] for the columns j of
// that row's range.
1639  template< typename VT1 // Type of the left-hand side target vector
1640  , typename VT2 // Type of the left-hand side vector operand
1641  , typename MT1 > // Type of the right-hand side matrix operand
1642  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1643  {
1644  const size_t M( A.rows() );
1645  const size_t N( A.columns() );
1646 
1647  for( size_t i=0UL; i<M; ++i )
1648  {
// NOTE(review): the line holding the condition of this branch is missing from the listing; the
// `else` below implies an `if` here. Since only the element A(i,i) is used, it is presumably a
// diagonal-matrix check -- confirm against the original header.
1650  {
1651  y[i] -= x[i] * A(i,i);
1652  }
1653  else
1654  {
// Column range of row i: starts at i (i+1 if strictly upper) for upper matrix types and ends
// after i (before i if strictly lower) for lower matrix types; the full range [0,N) otherwise.
1655  const size_t jbegin( ( IsUpper<MT1>::value )
1656  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1657  :( 0UL ) );
1658  const size_t jend( ( IsLower<MT1>::value )
1659  ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1660  :( N ) );
1661  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1662 
// The column loop is unrolled by two; jpos marks the end of the part with an even element count.
1663  const size_t jnum( jend - jbegin );
1664  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1665 
1666  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1667  y[j ] -= x[i] * A(i,j );
1668  y[j+1UL] -= x[i] * A(i,j+1UL);
1669  }
// Single leftover column if the range has an odd number of elements.
1670  if( jpos < jend ) {
1671  y[jpos] -= x[i] * A(i,jpos);
1672  }
1673  }
1674  }
1675  }
1677  //**********************************************************************************************
1678 
1679  //**Default subtraction assignment to dense vectors (small matrices)****************************
// Default small-matrix kernel for the subtraction assignment (see banner): forwards its
// arguments unchanged to selectDefaultSubAssignKernel().
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// NOTE(review): the return-type line is not present in this listing; presumably it carries the
// condition that separates this overload from the vectorized one below -- confirm.
1693  template< typename VT1 // Type of the left-hand side target vector
1694  , typename VT2 // Type of the left-hand side vector operand
1695  , typename MT1 > // Type of the right-hand side matrix operand
1697  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1698  {
1699  selectDefaultSubAssignKernel( y, x, A );
1700  }
1702  //**********************************************************************************************
1703 
1704  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
// Vectorized kernel for the subtraction assignment y -= x * A; per the section banner this is
// the variant for small matrices.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// The columns of A are processed in strips of 8, 4, 3, 2 and finally 1 SIMD vector(s). For each
// strip the accumulators are preloaded from y, every row i in [ibegin,iend) subtracts
// set( x[i] ) * A.load(i,...), and the accumulators are stored back to y.
//
// NOTE(review): the return-type line that belongs between the template header and the function
// name is not present in this listing.
1719  template< typename VT1 // Type of the left-hand side target vector
1720  , typename VT2 // Type of the left-hand side vector operand
1721  , typename MT1 > // Type of the right-hand side matrix operand
1723  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1724  {
// A scalar remainder loop is needed unless both the target vector and the matrix are padded.
1725  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1726 
1727  const size_t M( A.rows() );
1728  const size_t N( A.columns() );
1729 
// jpos: N rounded down to a multiple of SIMDSIZE if a remainder loop is needed, N otherwise.
1730  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1731  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1732 
1733  size_t j( 0UL );
1734 
// Strip of 8 SIMD vectors.
1735  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1736  {
// Row range: starts at j (j+1 if strictly lower) for lower matrix types, ends at the last column
// of the strip (one less if strictly upper, clamped to M) for upper matrix types.
1737  const size_t ibegin( ( IsLower<MT1>::value )
1738  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1739  :( 0UL ) );
1740  const size_t iend( ( IsUpper<MT1>::value )
1741  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1742  :( M ) );
1743  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1744 
// The accumulators start from the current content of y, so the stores below write y - x*A.
1745  SIMDType xmm1( y.load(j ) );
1746  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1747  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1748  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1749  SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1750  SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1751  SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1752  SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1753 
1754  for( size_t i=ibegin; i<iend; ++i ) {
1755  const SIMDType x1( set( x[i] ) );
1756  xmm1 -= x1 * A.load(i,j );
1757  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1758  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1759  xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1760  xmm5 -= x1 * A.load(i,j+SIMDSIZE*4UL);
1761  xmm6 -= x1 * A.load(i,j+SIMDSIZE*5UL);
1762  xmm7 -= x1 * A.load(i,j+SIMDSIZE*6UL);
1763  xmm8 -= x1 * A.load(i,j+SIMDSIZE*7UL);
1764  }
1765 
1766  y.store( j , xmm1 );
1767  y.store( j+SIMDSIZE , xmm2 );
1768  y.store( j+SIMDSIZE*2UL, xmm3 );
1769  y.store( j+SIMDSIZE*3UL, xmm4 );
1770  y.store( j+SIMDSIZE*4UL, xmm5 );
1771  y.store( j+SIMDSIZE*5UL, xmm6 );
1772  y.store( j+SIMDSIZE*6UL, xmm7 );
1773  y.store( j+SIMDSIZE*7UL, xmm8 );
1774  }
1775 
// Strip of 4 SIMD vectors (same scheme as the strip of 8).
1776  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1777  {
1778  const size_t ibegin( ( IsLower<MT1>::value )
1779  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1780  :( 0UL ) );
1781  const size_t iend( ( IsUpper<MT1>::value )
1782  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1783  :( M ) );
1784  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1785 
1786  SIMDType xmm1( y.load(j ) );
1787  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1788  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1789  SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1790 
1791  for( size_t i=ibegin; i<iend; ++i ) {
1792  const SIMDType x1( set( x[i] ) );
1793  xmm1 -= x1 * A.load(i,j );
1794  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1795  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1796  xmm4 -= x1 * A.load(i,j+SIMDSIZE*3UL);
1797  }
1798 
1799  y.store( j , xmm1 );
1800  y.store( j+SIMDSIZE , xmm2 );
1801  y.store( j+SIMDSIZE*2UL, xmm3 );
1802  y.store( j+SIMDSIZE*3UL, xmm4 );
1803  }
1804 
// Strip of 3 SIMD vectors.
1805  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1806  {
1807  const size_t ibegin( ( IsLower<MT1>::value )
1808  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1809  :( 0UL ) );
1810  const size_t iend( ( IsUpper<MT1>::value )
1811  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1812  :( M ) );
1813  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1814 
1815  SIMDType xmm1( y.load(j ) );
1816  SIMDType xmm2( y.load(j+SIMDSIZE ) );
1817  SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1818 
1819  for( size_t i=ibegin; i<iend; ++i ) {
1820  const SIMDType x1( set( x[i] ) );
1821  xmm1 -= x1 * A.load(i,j );
1822  xmm2 -= x1 * A.load(i,j+SIMDSIZE );
1823  xmm3 -= x1 * A.load(i,j+SIMDSIZE*2UL);
1824  }
1825 
1826  y.store( j , xmm1 );
1827  y.store( j+SIMDSIZE , xmm2 );
1828  y.store( j+SIMDSIZE*2UL, xmm3 );
1829  }
1830 
// Strip of 2 SIMD vectors.
1831  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1832  {
1833  const size_t ibegin( ( IsLower<MT1>::value )
1834  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1835  :( 0UL ) );
1836  const size_t iend( ( IsUpper<MT1>::value )
1837  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1838  :( M ) );
1839  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1840 
1841  SIMDType xmm1( y.load(j ) );
1842  SIMDType xmm2( y.load(j+SIMDSIZE) );
1843 
1844  for( size_t i=ibegin; i<iend; ++i ) {
1845  const SIMDType x1( set( x[i] ) );
1846  xmm1 -= x1 * A.load(i,j );
1847  xmm2 -= x1 * A.load(i,j+SIMDSIZE);
1848  }
1849 
1850  y.store( j , xmm1 );
1851  y.store( j+SIMDSIZE, xmm2 );
1852  }
1853 
// Single SIMD vector per iteration for whatever is left below jpos.
1854  for( ; j<jpos; j+=SIMDSIZE )
1855  {
1856  const size_t ibegin( ( IsLower<MT1>::value )
1857  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1858  :( 0UL ) );
1859  const size_t iend( ( IsUpper<MT1>::value )
1860  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1861  :( M ) );
1862  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1863 
1864  SIMDType xmm1( y.load(j) );
1865 
1866  for( size_t i=ibegin; i<iend; ++i ) {
1867  xmm1 -= set( x[i] ) * A.load(i,j);
1868  }
1869 
1870  y.store( j, xmm1 );
1871  }
1872 
// Scalar tail for the columns in [jpos,N); never entered when both operands are padded. The dot
// product is summed up first and subtracted from y[j] once.
1873  for( ; remainder && j<N; ++j )
1874  {
1875  const size_t ibegin( ( IsLower<MT1>::value )
1876  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1877  :( 0UL ) );
1878  const size_t iend( ( IsUpper<MT1>::value )
1879  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1880  :( M ) );
1881  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1882 
1883  ElementType value = ElementType();
1884 
1885  for( size_t i=ibegin; i<iend; ++i ) {
1886  value += x[i] * A(i,j);
1887  }
1888 
1889  y[j] -= value;
1890  }
1891  }
1893  //**********************************************************************************************
1894 
1895  //**Default subtraction assignment to dense vectors (large matrices)****************************
// Default large-matrix kernel for the subtraction assignment (see banner): forwards its
// arguments unchanged to selectDefaultSubAssignKernel().
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// NOTE(review): the return-type line is not present in this listing; presumably it carries the
// condition that separates this overload from the vectorized one below -- confirm.
1909  template< typename VT1 // Type of the left-hand side target vector
1910  , typename VT2 // Type of the left-hand side vector operand
1911  , typename MT1 > // Type of the right-hand side matrix operand
1913  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1914  {
1915  selectDefaultSubAssignKernel( y, x, A );
1916  }
1918  //**********************************************************************************************
1919 
1920  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
// Vectorized, blocked kernel for the subtraction assignment y -= x * A; per the section banner
// this is the variant for large matrices.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// A is traversed in column blocks of jblock elements and, inside each column block, in row
// blocks of iblock rows. For every (row block, column block) pair the partial products are
// accumulated (with +=) in SIMD registers over the rows [ii,iend) and then subtracted from the
// matching part of y.
//
// NOTE(review): the return-type line that belongs between the template header and the function
// name is not present in this listing.
1935  template< typename VT1 // Type of the left-hand side target vector
1936  , typename VT2 // Type of the left-hand side vector operand
1937  , typename MT1 > // Type of the right-hand side matrix operand
1939  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
1940  {
// A scalar remainder loop is needed unless both the target vector and the matrix are padded.
1941  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1942 
1943  const size_t M( A.rows() );
1944  const size_t N( A.columns() );
1945 
// Column block length: the number of ElementType values that fit into 32768 bytes.
// NOTE(review): presumably chosen for cache blocking -- confirm.
1946  const size_t jblock( 32768UL / sizeof( ElementType ) );
// Rows per block: 8 if N is smaller than one column block, 4 otherwise.
1947  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1948 
1949  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
1950 
1951  for( size_t jj=0U; jj<N; jj+=jblock ) {
1952  for( size_t ii=0UL; ii<M; ii+=iblock )
1953  {
1954  const size_t iend( min( ii+iblock, M ) );
1955  const size_t jtmp( min( jj+jblock, N ) );
// For lower matrix types the column range is additionally cut at iend (iend-1 if strictly lower).
1956  const size_t jend( ( IsLower<MT1>::value )
1957  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
1958  :( jtmp ) );
1959 
// jpos: jend rounded down to a multiple of SIMDSIZE if a remainder loop is needed.
1960  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
1961  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1962 
// For upper matrix types start at ii (ii+1 if strictly upper) rounded down to a multiple of
// SIMDSIZE, but never before the start of the column block jj.
1963  size_t j( ( IsUpper<MT1>::value )
1964  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
1965  :( jj ) );
1966 
// Strip of 8 SIMD vectors. The accumulators are default-constructed and the partial sums are
// subtracted from y on store. NOTE(review): this relies on a default-constructed SIMDType being
// zero -- confirm against the SIMD type definitions.
1967  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1968  {
1969  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1970 
1971  for( size_t i=ii; i<iend; ++i ) {
1972  const SIMDType x1( set( x[i] ) );
1973  xmm1 += x1 * A.load(i,j );
1974  xmm2 += x1 * A.load(i,j+SIMDSIZE );
1975  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
1976  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
1977  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
1978  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
1979  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
1980  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
1981  }
1982 
1983  y.store( j , y.load(j ) - xmm1 );
1984  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
1985  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
1986  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
1987  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5 );
1988  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6 );
1989  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7 );
1990  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8 );
1991  }
1992 
// Strip of 4 SIMD vectors.
1993  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1994  {
1995  SIMDType xmm1, xmm2, xmm3, xmm4;
1996 
1997  for( size_t i=ii; i<iend; ++i ) {
1998  const SIMDType x1( set( x[i] ) );
1999  xmm1 += x1 * A.load(i,j );
2000  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2001  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2002  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2003  }
2004 
2005  y.store( j , y.load(j ) - xmm1 );
2006  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2007  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2008  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
2009  }
2010 
// Strip of 3 SIMD vectors.
2011  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2012  {
2013  SIMDType xmm1, xmm2, xmm3;
2014 
2015  for( size_t i=ii; i<iend; ++i ) {
2016  const SIMDType x1( set( x[i] ) );
2017  xmm1 += x1 * A.load(i,j );
2018  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2019  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2020  }
2021 
2022  y.store( j , y.load(j ) - xmm1 );
2023  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2024  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2025  }
2026 
// Strip of 2 SIMD vectors.
2027  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2028  {
2029  SIMDType xmm1, xmm2;
2030 
2031  for( size_t i=ii; i<iend; ++i ) {
2032  const SIMDType x1( set( x[i] ) );
2033  xmm1 += x1 * A.load(i,j );
2034  xmm2 += x1 * A.load(i,j+SIMDSIZE);
2035  }
2036 
2037  y.store( j , y.load(j ) - xmm1 );
2038  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2 );
2039  }
2040 
// Single SIMD vector per iteration for whatever is left below jpos.
2041  for( ; j<jpos; j+=SIMDSIZE )
2042  {
2043  SIMDType xmm1;
2044 
2045  for( size_t i=ii; i<iend; ++i ) {
2046  xmm1 += set( x[i] ) * A.load(i,j);
2047  }
2048 
2049  y.store( j, y.load(j) - xmm1 );
2050  }
2051 
// Scalar tail for the columns in [jpos,jend); never entered when both operands are padded.
2052  for( ; remainder && j<jend; ++j )
2053  {
2054  ElementType value = ElementType();
2055 
2056  for( size_t i=ii; i<iend; ++i ) {
2057  value += x[i] * A(i,j);
2058  }
2059 
2060  y[j] -= value;
2061  }
2062  }
2063  }
2064  }
2066  //**********************************************************************************************
2067 
2068  //**BLAS-based subtraction assignment to dense vectors (default)********************************
// Default variant of the BLAS-based subtraction assignment kernel (see banner): forwards its
// arguments unchanged to selectLargeSubAssignKernel().
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// NOTE(review): the return-type line is not present in this listing; presumably it carries the
// condition that separates this overload from the BLAS-backed one -- confirm.
2082  template< typename VT1 // Type of the left-hand side target vector
2083  , typename VT2 // Type of the left-hand side vector operand
2084  , typename MT1 > // Type of the right-hand side matrix operand
2086  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2087  {
2088  selectLargeSubAssignKernel( y, x, A );
2089  }
2091  //**********************************************************************************************
2092 
2093  //**BLAS-based subtraction assignment to dense vectors******************************************
2094 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2095 
// BLAS-based kernel for the subtraction assignment y -= x * A. Only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
//
// NOTE(review): the return-type line that belongs between the template header and the function
// name is not present in this listing.
2108  template< typename VT1 // Type of the left-hand side target vector
2109  , typename VT2 // Type of the left-hand side vector operand
2110  , typename MT1 > // Type of the right-hand side matrix operand
2112  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
2113  {
2114  using ET = ElementType_<VT1>;
2115 
// Triangular matrices: trmv() works on a temporary that starts as a copy of x; the temporary is
// then subtracted from y.
2116  if( IsTriangular<MT1>::value ) {
2117  ResultType_<VT1> tmp( serial( x ) );
2118  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2119  subAssign( y, tmp );
2120  }
2121  else {
// NOTE(review): the two scalars are presumably alpha and beta, i.e. y = -1*(x*A) + 1*y -- confirm
// against the gemv() wrapper.
2122  gemv( y, x, A, ET(-1), ET(1) );
2123  }
2124  }
2126 #endif
2127  //**********************************************************************************************
2128 
2129  //**Subtraction assignment to sparse vectors****************************************************
2130  // No special implementation for the subtraction assignment to sparse vectors.
2131  //**********************************************************************************************
2132 
2133  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of a transpose dense vector / dense matrix multiplication to a dense
// vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression.
//
// The expression is first evaluated via serial() into a temporary of type ResultType; the
// temporary is then passed on to multAssign().
2146  template< typename VT1 > // Type of the target dense vector
2147  friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2148  {
2150 
2154 
2155  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2156 
2157  const ResultType tmp( serial( rhs ) );
2158  multAssign( ~lhs, tmp );
2159  }
2161  //**********************************************************************************************
2162 
2163  //**Multiplication assignment to sparse vectors*************************************************
2164  // No special implementation for the multiplication assignment to sparse vectors.
2165  //**********************************************************************************************
2166 
2167  //**Division assignment to dense vectors********************************************************
// Division assignment of a transpose dense vector / dense matrix multiplication to a dense
// vector.
//
// \param lhs The target left-hand side dense vector.
// \param rhs The right-hand side multiplication expression (the divisor).
//
// The expression is first evaluated via serial() into a temporary of type ResultType; the
// temporary is then passed on to divAssign().
2180  template< typename VT1 > // Type of the target dense vector
2181  friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecDMatMultExpr& rhs )
2182  {
2184 
2188 
2189  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2190 
2191  const ResultType tmp( serial( rhs ) );
2192  divAssign( ~lhs, tmp );
2193  }
2195  //**********************************************************************************************
2196 
2197  //**Division assignment to sparse vectors*******************************************************
2198  // No special implementation for the division assignment to sparse vectors.
2199  //**********************************************************************************************
2200 
2201  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of a transpose dense vector / dense matrix multiplication to a dense vector
// (see banner). Only enabled if UseSMPAssign<VT1> holds.
//
// NOTE(review): the line carrying the function name and parameter list is missing from this
// listing; the body uses the parameters `lhs` and `rhs`, and the function is presumably named
// smpAssign -- confirm against the original header.
//
// If the matrix operand has no rows, the target is reset; if it has no columns, nothing is done.
// Otherwise both operands are evaluated into LT/RT (without serial()) and the product x * A is
// handed to smpAssign().
2216  template< typename VT1 > // Type of the target dense vector
2217  friend inline EnableIf_< UseSMPAssign<VT1> >
2219  {
2221 
2222  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2223 
2224  if( rhs.mat_.rows() == 0UL ) {
2225  reset( ~lhs );
2226  return;
2227  }
2228  else if( rhs.mat_.columns() == 0UL ) {
2229  return;
2230  }
2231 
2232  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2233  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2234 
2235  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2236  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2237  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2238  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2239 
2240  smpAssign( ~lhs, x * A );
2241  }
2243  //**********************************************************************************************
2244 
2245  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of a transpose dense vector / dense matrix multiplication to a sparse vector
// (see banner). Only enabled if UseSMPAssign<VT1> holds.
//
// NOTE(review): the line carrying the function name and parameter list is missing from this
// listing; the body uses the parameters `lhs` and `rhs`, and the function is presumably named
// smpAssign -- confirm against the original header.
//
// The expression is first evaluated into a temporary of type ResultType (without serial()); the
// temporary is then passed on to smpAssign().
2260  template< typename VT1 > // Type of the target sparse vector
2261  friend inline EnableIf_< UseSMPAssign<VT1> >
2263  {
2265 
2269 
2270  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2271 
2272  const ResultType tmp( rhs );
2273  smpAssign( ~lhs, tmp );
2274  }
2276  //**********************************************************************************************
2277 
2278  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of a transpose dense vector / dense matrix multiplication to a dense
// vector (see banner). Only enabled if UseSMPAssign<VT1> holds.
//
// NOTE(review): the line carrying the function name and parameter list is missing from this
// listing; the body uses the parameters `lhs` and `rhs`, and the function is presumably named
// smpAddAssign -- confirm against the original header.
//
// Returns immediately if the matrix operand has no rows or no columns. Otherwise both operands
// are evaluated into LT/RT (without serial()) and the product x * A is handed to smpAddAssign().
2293  template< typename VT1 > // Type of the target dense vector
2294  friend inline EnableIf_< UseSMPAssign<VT1> >
2296  {
2298 
2299  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2300 
2301  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2302  return;
2303  }
2304 
2305  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2306  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2307 
2308  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2309  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2310  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2311  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2312 
2313  smpAddAssign( ~lhs, x * A );
2314  }
2316  //**********************************************************************************************
2317 
2318  //**SMP addition assignment to sparse vectors***************************************************
2319  // No special implementation for the SMP addition assignment to sparse vectors.
2320  //**********************************************************************************************
2321 
2322  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of this expression to a dense vector (per the section banner; the
// line that carries the function name and parameter list is absent from this listing).
// The return type is EnableIf_< UseSMPAssign<VT1> >, so the overload is constrained by that trait.
// Both operands are evaluated into locals (x, A) and the product x * A is forwarded to
// smpSubAssign().
2337  template< typename VT1 > // Type of the target dense vector
2338  friend inline EnableIf_< UseSMPAssign<VT1> >
2340  {
2342 
2343  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2344 
      // A matrix without rows or without columns contributes nothing: leave the target as is.
2345  if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2346  return;
2347  }
2348 
2349  LT x( rhs.vec_ ); // Evaluation of the left-hand side dense vector operand
2350  RT A( rhs.mat_ ); // Evaluation of the right-hand side dense matrix operand
2351 
      // The evaluated operands must keep the dimensions of the original operands.
2352  BLAZE_INTERNAL_ASSERT( x.size() == rhs.vec_.size() , "Invalid vector size" );
2353  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.mat_.rows() , "Invalid number of rows" );
2354  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.mat_.columns(), "Invalid number of columns" );
2355  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2356 
2357  smpSubAssign( ~lhs, x * A );
2358  }
2360  //**********************************************************************************************
2361 
2362  //**SMP subtraction assignment to sparse vectors************************************************
2363  // No special implementation for the SMP subtraction assignment to sparse vectors.
2364  //**********************************************************************************************
2365 
2366  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of this expression to a dense vector (per the section banner;
// the line that carries the function name and parameter list is absent from this listing).
// The return type is EnableIf_< UseSMPAssign<VT1> >, so the overload is constrained by that trait.
// The expression is evaluated into a temporary of type ResultType, which is then passed to
// smpMultAssign() together with the target.
2381  template< typename VT1 > // Type of the target dense vector
2382  friend inline EnableIf_< UseSMPAssign<VT1> >
2384  {
2386 
2390 
2391  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2392 
      // Materialize the expression first; the target is updated from the temporary.
2393  const ResultType tmp( rhs );
2394  smpMultAssign( ~lhs, tmp );
2395  }
2397  //**********************************************************************************************
2398 
2399  //**SMP multiplication assignment to sparse vectors*********************************************
2400  // No special implementation for the SMP multiplication assignment to sparse vectors.
2401  //**********************************************************************************************
2402 
2403  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of this expression to a dense vector (per the section banner; the
// line that carries the function name and parameter list is absent from this listing).
// The return type is EnableIf_< UseSMPAssign<VT1> >, so the overload is constrained by that trait.
// The expression is evaluated into a temporary of type ResultType, which is then passed to
// smpDivAssign() together with the target.
2418  template< typename VT1 > // Type of the target dense vector
2419  friend inline EnableIf_< UseSMPAssign<VT1> >
2421  {
2423 
2427 
2428  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2429 
      // Materialize the expression first; the target is updated from the temporary.
2430  const ResultType tmp( rhs );
2431  smpDivAssign( ~lhs, tmp );
2432  }
2434  //**********************************************************************************************
2435 
2436  //**SMP division assignment to sparse vectors***************************************************
2437  // No special implementation for the SMP division assignment to sparse vectors.
2438  //**********************************************************************************************
2439 
2440  //**Compile time checks*************************************************************************
2448  //**********************************************************************************************
2449 };
2450 //*************************************************************************************************
2451 
2452 
2453 
2454 
2455 //=================================================================================================
2456 //
2457 // DVECSCALARMULTEXPR SPECIALIZATION
2458 //
2459 //=================================================================================================
2460 
2461 //*************************************************************************************************
2469 template< typename VT // Type of the left-hand side dense vector
2470  , typename MT // Type of the right-hand side dense matrix
2471  , typename ST > // Type of the side scalar value
2472 class DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >
2473  : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true > >
2474  , private Computation
2475 {
2476  private:
2477  //**Type definitions****************************************************************************
2478  using VMM = TDVecDMatMultExpr<VT,MT>;
2479  using RES = ResultType_<VMM>;
2480  using VRT = ResultType_<VT>;
2481  using MRT = ResultType_<MT>;
2482  using VET = ElementType_<VRT>;
2483  using MET = ElementType_<MRT>;
2484  using VCT = CompositeType_<VT>;
2485  using MCT = CompositeType_<MT>;
2486  //**********************************************************************************************
2487 
2488  //**********************************************************************************************
2490  enum : bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2491  //**********************************************************************************************
2492 
2493  //**********************************************************************************************
2495  enum : bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2497  //**********************************************************************************************
2498 
2499  //**********************************************************************************************
2501 
// Compile-time switch: 'value' is true when at least one of the two operands has to be
// evaluated first (evaluateVector or evaluateMatrix). The template parameter T1 does not
// take part in the computation of 'value'.
2504  template< typename T1 >
2505  struct UseSMPAssign {
2506  enum : bool { value = ( evaluateVector || evaluateMatrix ) };
2507  };
2508  //**********************************************************************************************
2509 
2510  //**********************************************************************************************
2512 
// Compile-time predicate over four types T1..T4 that, going by its name, decides whether a
// BLAS kernel may be used. NOTE(review): several lines of the condition (including the one
// that opens the 'value' definition) are absent from this listing; only the visible terms are
// described here: all of T1, T2 and T3 must report simdEnabled, and T1 and T3 must share the
// same element type.
2514  template< typename T1, typename T2, typename T3, typename T4 >
2515  struct UseBlasKernel {
2521  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2526  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2528  };
2529  //**********************************************************************************************
2530 
2531  //**********************************************************************************************
2533 
// Compile-time predicate over four types T1..T4 that, going by its name, decides whether the
// vectorized default kernel may be used. NOTE(review): some lines of the condition are absent
// from this listing; the visible terms require useOptimizedKernels, simdEnabled on T1, T2 and
// T3, and SIMD addition and multiplication support (HasSIMDAdd / HasSIMDMult) for the element
// types of T2 and T3.
2536  template< typename T1, typename T2, typename T3, typename T4 >
2537  struct UseVectorizedDefaultKernel {
2538  enum : bool { value = useOptimizedKernels &&
2540  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2544  , T4 >::value &&
2545  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2546  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
2547  };
2548  //**********************************************************************************************
2549 
2550  public:
2551  //**Type definitions****************************************************************************
2553  using ResultType = MultTrait_<RES,ST>;
2557  using ReturnType = const ElementType;
2558  using CompositeType = const ResultType;
2559 
2561  using LeftOperand = const TDVecDMatMultExpr<VT,MT>;
2562 
2564  using RightOperand = ST;
2565 
2568 
2571  //**********************************************************************************************
2572 
2573  //**Compilation flags***************************************************************************
2575  enum : bool { simdEnabled = !IsDiagonal<MT>::value &&
2576  VT::simdEnabled && MT::simdEnabled &&
2580 
2582  enum : bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2583  !evaluateMatrix && MT::smpAssignable };
2584  //**********************************************************************************************
2585 
2586  //**SIMD properties*****************************************************************************
2588  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
2589  //**********************************************************************************************
2590 
2591  //**Constructor*********************************************************************************
// Constructor: stores the vector/matrix multiplication expression and the scalar factor.
//   vector - the left-hand side TDVecDMatMultExpr operand of the scaling
//   scalar - the right-hand side scalar of the scaling
2597  explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )
2598  : vector_( vector ) // Left-hand side dense vector of the multiplication expression
2599  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
2600  {}
2601  //**********************************************************************************************
2602 
2603  //**Subscript operator**************************************************************************
// Element access: returns element 'index' of the wrapped expression multiplied by the scalar.
// The index is only checked by an internal assertion; see at() for the checked variant.
2609  inline ReturnType operator[]( size_t index ) const {
2610  BLAZE_INTERNAL_ASSERT( index < vector_.size(), "Invalid vector access index" );
2611  return vector_[index] * scalar_;
2612  }
2613  //**********************************************************************************************
2614 
2615  //**At function*********************************************************************************
// Checked element access: raises an out-of-range error via BLAZE_THROW_OUT_OF_RANGE when
// 'index' is not smaller than the size of the wrapped expression, otherwise forwards to
// operator[].
2622  inline ReturnType at( size_t index ) const {
2623  if( index >= vector_.size() ) {
2624  BLAZE_THROW_OUT_OF_RANGE( "Invalid vector access index" );
2625  }
2626  return (*this)[index];
2627  }
2628  //**********************************************************************************************
2629 
2630  //**Size function*******************************************************************************
// Returns the size of the expression, which is the size of the wrapped multiplication
// expression.
2635  inline size_t size() const {
2636  return vector_.size();
2637  }
2638  //**********************************************************************************************
2639 
2640  //**Left operand access*************************************************************************
// Returns the left-hand side operand of the scaling, i.e. the stored vector/matrix
// multiplication expression.
2645  inline LeftOperand leftOperand() const {
2646  return vector_;
2647  }
2648  //**********************************************************************************************
2649 
2650  //**Right operand access************************************************************************
// Returns the right-hand side operand of the scaling, i.e. the stored scalar.
2655  inline RightOperand rightOperand() const {
2656  return scalar_;
2657  }
2658  //**********************************************************************************************
2659 
2660  //**********************************************************************************************
// Alias query: delegates to canAlias() of the wrapped multiplication expression. The scalar is
// not consulted.
//   alias - address to be checked
2666  template< typename T >
2667  inline bool canAlias( const T* alias ) const {
2668  return vector_.canAlias( alias );
2669  }
2670  //**********************************************************************************************
2671 
2672  //**********************************************************************************************
// Alias query: delegates to isAliased() of the wrapped multiplication expression. The scalar
// is not consulted.
//   alias - address to be checked
2678  template< typename T >
2679  inline bool isAliased( const T* alias ) const {
2680  return vector_.isAliased( alias );
2681  }
2682  //**********************************************************************************************
2683 
2684  //**********************************************************************************************
// Alignment query: reports whatever the wrapped multiplication expression reports.
2689  inline bool isAligned() const {
2690  return vector_.isAligned();
2691  }
2692  //**********************************************************************************************
2693 
2694  //**********************************************************************************************
// Reports whether the expression can be used in an SMP assignment. The result requires the
// size to exceed SMP_TDVECDMATMULT_THRESHOLD and, in addition, at least one term of the
// leading disjunction to hold: BLAS mode is off, the matrix operand is a computation that is
// not evaluated, or the matrix has fewer than TDVECDMATMULT_THRESHOLD elements.
// NOTE(review): two lines of the disjunction are absent from this listing, so further terms
// may exist.
2699  inline bool canSMPAssign() const noexcept {
2700  RightOperand_<VMM> A( vector_.rightOperand() );
2701  return ( !BLAZE_BLAS_MODE ||
2704  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2705  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2706  ( size() > SMP_TDVECDMATMULT_THRESHOLD );
2707  }
2708  //**********************************************************************************************
2709 
2710  private:
2711  //**Member variables****************************************************************************
2712  LeftOperand vector_;
2713  RightOperand scalar_;
2714  //**********************************************************************************************
2715 
2716  //**Assignment to dense vectors*****************************************************************
// Assignment of the scaled vector/matrix multiplication to a dense vector.
//   lhs - target dense vector
//   rhs - scaled multiplication expression to be assigned
// The two operands of the wrapped multiplication are extracted, evaluated serially into the
// locals x and A, and passed with the scalar to selectAssignKernel().
2728  template< typename VT1 > // Type of the target dense vector
2729  friend inline void assign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
2730  {
2732 
2733  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
2734 
2735  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
2736  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
2737 
      // No rows: there is nothing to sum up, so the target is reset.
      // No columns: the early return leaves the target untouched.
2738  if( right.rows() == 0UL ) {
2739  reset( ~lhs );
2740  return;
2741  }
2742  else if( right.columns() == 0UL ) {
2743  return;
2744  }
2745 
2746  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
2747  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
2748 
      // The evaluated operands must keep the dimensions of the original operands.
2749  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
2750  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
2751  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
2752  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
2753 
2754  DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
2755  }
2756  //**********************************************************************************************
2757 
2758  //**Assignment to dense vectors (kernel selection)**********************************************
// Kernel dispatch for the assignment y = x * A * scalar.
//   y      - target vector
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor
// The small-matrix kernel is chosen when A is diagonal, when the matrix operand is a
// computation that is not evaluated, or when A has fewer than TDVECDMATMULT_THRESHOLD
// elements; in all other cases the BLAS kernel selection is used.
2769  template< typename VT1 // Type of the left-hand side target vector
2770  , typename VT2 // Type of the left-hand side vector operand
2771  , typename MT1 // Type of the right-hand side matrix operand
2772  , typename ST2 > // Type of the scalar value
2773  static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2774  {
2775  if( ( IsDiagonal<MT1>::value ) ||
2776  ( IsComputation<MT>::value && !evaluateMatrix ) ||
2777  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2778  selectSmallAssignKernel( y, x, A, scalar );
2779  else
2780  selectBlasAssignKernel( y, x, A, scalar );
2781  }
2782  //**********************************************************************************************
2783 
2784  //**Default assignment to dense vectors*********************************************************
// Element-wise (non-SIMD) kernel for the assignment y = x * A * scalar.
//   y      - target vector
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor
// The kernel walks over the rows of A, accumulates x[i] * A(i,j) into y[j], and applies the
// scalar afterwards in a separate pass (the diagonal branch applies it directly). The
// IsLower/IsUpper/IsStrictly*/IsDiagonal traits narrow the column range visited per row.
// NOTE(review): three guarding 'if' lines (before the two reset() calls and before the
// diagonal branch) are absent from this listing; the closing braces that follow them are
// still present.
2798  template< typename VT1 // Type of the left-hand side target vector
2799  , typename VT2 // Type of the left-hand side vector operand
2800  , typename MT1 // Type of the right-hand side matrix operand
2801  , typename ST2 > // Type of the scalar value
2802  static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2803  {
2804  const size_t M( A.rows() );
2805  const size_t N( A.columns() );
2806 
2808  reset( y[0] );
2809  }
2810 
      // For matrices that are not lower, row 0 initializes the target by plain assignment.
2811  if( !IsLower<MT1>::value )
2812  {
2813  for( size_t j=( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ); j<N; ++j ) {
2814  y[j] = x[0UL] * A(0UL,j);
2815  }
2816  }
2817 
      // Remaining rows; the start row is 0 only for lower matrices that are not strictly lower.
2818  for( size_t i=( IsLower<MT1>::value && !IsStrictlyLower<MT1>::value ? 0UL : 1UL ); i<M; ++i )
2819  {
2821  {
2822  y[i] = x[i] * A(i,i) * scalar;
2823  }
2824  else
2825  {
2826  const size_t jbegin( ( IsUpper<MT1>::value )
2827  ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
2828  :( 0UL ) );
2829  const size_t jend( ( IsLower<MT1>::value )
2830  ?( IsStrictlyLower<MT1>::value ? i-1UL : i )
2831  :( N ) );
2832  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2833 
      // jnum & size_t(-2) clears the lowest bit, so [jbegin,jpos) holds an even number of
      // columns and can be processed two at a time.
2834  const size_t jnum( jend - jbegin );
2835  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2836 
2837  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2838  y[j ] += x[i] * A(i,j );
2839  y[j+1UL] += x[i] * A(i,j+1UL);
2840  }
      // Odd leftover column of the pairwise loop.
2841  if( jpos < jend ) {
2842  y[jpos] += x[i] * A(i,jpos);
2843  }
      // For lower matrices element jend is written by assignment instead of accumulation.
2844  if( IsLower<MT1>::value ) {
2845  y[jend] = x[i] * A(i,jend);
2846  }
2847  }
2848  }
2849 
2851  reset( y[N-1UL] );
2852  }
2853 
      // Final scaling pass; skipped for diagonal matrices, whose branch above already
      // multiplied by the scalar.
2854  if( !IsDiagonal<MT1>::value )
2855  {
2856  const size_t iend( IsStrictlyLower<MT1>::value ? N-1UL : N );
2857  for( size_t j=( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ); j<iend; ++j ) {
2858  y[j] *= scalar;
2859  }
2860  }
2861  }
2862  //**********************************************************************************************
2863 
2864  //**Default assignment to dense vectors (small matrices)****************************************
// Small-matrix assignment kernel, fallback overload: forwards unchanged to
// selectDefaultAssignKernel(). NOTE(review): the line holding the return type (and with it
// any overload constraint) is absent from this listing.
2878  template< typename VT1 // Type of the left-hand side target vector
2879  , typename VT2 // Type of the left-hand side vector operand
2880  , typename MT1 // Type of the right-hand side matrix operand
2881  , typename ST2 > // Type of the scalar value
2883  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2884  {
2885  selectDefaultAssignKernel( y, x, A, scalar );
2886  }
2887  //**********************************************************************************************
2888 
2889  //**Vectorized default assignment to dense vectors (small matrices)*****************************
// Small-matrix assignment kernel, SIMD overload, for y = x * A * scalar.
//   y      - target vector
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor
// The columns of A are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors; for each group
// the products x[i] * A(i,j..) are accumulated over the relevant rows, multiplied by the
// scalar and stored into y. A final element-wise loop covers columns beyond the last full
// SIMD vector when the operands are not both padded.
// NOTE(review): the line holding the return type (and with it any overload constraint) is
// absent from this listing. The accumulators are declared without an initializer, so the
// kernel relies on SIMDType's default construction - presumably zero; confirm.
2903  template< typename VT1 // Type of the left-hand side target vector
2904  , typename VT2 // Type of the left-hand side vector operand
2905  , typename MT1 // Type of the right-hand side matrix operand
2906  , typename ST2 > // Type of the scalar value
2908  selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
2909  {
      // A scalar remainder loop is required unless both the target and the matrix are padded.
2910  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
2911 
2912  const size_t M( A.rows() );
2913  const size_t N( A.columns() );
2914 
      // With remainder handling, jpos is N rounded down to a multiple of SIMDSIZE (see the
      // assertion below); otherwise the SIMD loops run over all N columns.
2915  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2916  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2917 
      // SIMD value created from the scalar; every accumulated vector is multiplied by it.
2918  const SIMDType factor( set( scalar ) );
2919 
2920  size_t j( 0UL );
2921 
      // Groups of 8 SIMD vectors.
2922  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
2923  {
      // Row range that is visited for this column group, narrowed for lower/upper matrices.
2924  const size_t ibegin( ( IsLower<MT1>::value )
2925  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2926  :( 0UL ) );
2927  const size_t iend( ( IsUpper<MT1>::value )
2928  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2929  :( M ) );
2930  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2931 
2932  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2933 
2934  for( size_t i=ibegin; i<iend; ++i ) {
2935  const SIMDType x1( set( x[i] ) );
2936  xmm1 += x1 * A.load(i,j );
2937  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2938  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2939  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2940  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
2941  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
2942  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
2943  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
2944  }
2945 
2946  y.store( j , xmm1*factor );
2947  y.store( j+SIMDSIZE , xmm2*factor );
2948  y.store( j+SIMDSIZE*2UL, xmm3*factor );
2949  y.store( j+SIMDSIZE*3UL, xmm4*factor );
2950  y.store( j+SIMDSIZE*4UL, xmm5*factor );
2951  y.store( j+SIMDSIZE*5UL, xmm6*factor );
2952  y.store( j+SIMDSIZE*6UL, xmm7*factor );
2953  y.store( j+SIMDSIZE*7UL, xmm8*factor );
2954  }
2955 
      // Groups of 4 SIMD vectors.
2956  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2957  {
2958  const size_t ibegin( ( IsLower<MT1>::value )
2959  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2960  :( 0UL ) );
2961  const size_t iend( ( IsUpper<MT1>::value )
2962  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2963  :( M ) );
2964  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2965 
2966  SIMDType xmm1, xmm2, xmm3, xmm4;
2967 
2968  for( size_t i=ibegin; i<iend; ++i ) {
2969  const SIMDType x1( set( x[i] ) );
2970  xmm1 += x1 * A.load(i,j );
2971  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2972  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2973  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
2974  }
2975 
2976  y.store( j , xmm1*factor );
2977  y.store( j+SIMDSIZE , xmm2*factor );
2978  y.store( j+SIMDSIZE*2UL, xmm3*factor );
2979  y.store( j+SIMDSIZE*3UL, xmm4*factor );
2980  }
2981 
      // Groups of 3 SIMD vectors.
2982  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2983  {
2984  const size_t ibegin( ( IsLower<MT1>::value )
2985  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2986  :( 0UL ) );
2987  const size_t iend( ( IsUpper<MT1>::value )
2988  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2989  :( M ) );
2990  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2991 
2992  SIMDType xmm1, xmm2, xmm3;
2993 
2994  for( size_t i=ibegin; i<iend; ++i ) {
2995  const SIMDType x1( set( x[i] ) );
2996  xmm1 += x1 * A.load(i,j );
2997  xmm2 += x1 * A.load(i,j+SIMDSIZE );
2998  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
2999  }
3000 
3001  y.store( j , xmm1*factor );
3002  y.store( j+SIMDSIZE , xmm2*factor );
3003  y.store( j+SIMDSIZE*2UL, xmm3*factor );
3004  }
3005 
      // Groups of 2 SIMD vectors.
3006  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3007  {
3008  const size_t ibegin( ( IsLower<MT1>::value )
3009  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3010  :( 0UL ) );
3011  const size_t iend( ( IsUpper<MT1>::value )
3012  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3013  :( M ) );
3014  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3015 
3016  SIMDType xmm1, xmm2;
3017 
3018  for( size_t i=ibegin; i<iend; ++i ) {
3019  const SIMDType x1( set( x[i] ) );
3020  xmm1 += x1 * A.load(i,j );
3021  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3022  }
3023 
3024  y.store( j , xmm1*factor );
3025  y.store( j+SIMDSIZE, xmm2*factor );
3026  }
3027 
      // Single SIMD vectors.
3028  for( ; j<jpos; j+=SIMDSIZE )
3029  {
3030  const size_t ibegin( ( IsLower<MT1>::value )
3031  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3032  :( 0UL ) );
3033  const size_t iend( ( IsUpper<MT1>::value )
3034  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3035  :( M ) );
3036  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3037 
3038  SIMDType xmm1;
3039 
3040  for( size_t i=ibegin; i<iend; ++i ) {
3041  xmm1 += set( x[i] ) * A.load(i,j);
3042  }
3043 
3044  y.store( j, xmm1*factor );
3045  }
3046 
      // Element-wise loop over the columns that were not covered by the SIMD loops.
3047  for( ; remainder && j<N; ++j )
3048  {
3049  const size_t ibegin( ( IsLower<MT1>::value )
3050  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3051  :( 0UL ) );
3052  const size_t iend( ( IsUpper<MT1>::value )
3053  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3054  :( M ) );
3055  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3056 
3057  ElementType value = ElementType();
3058 
3059  for( size_t i=ibegin; i<iend; ++i ) {
3060  value += x[i] * A(i,j);
3061  }
3062 
3063  y[j] = value * scalar;
3064  }
3065  }
3066  //**********************************************************************************************
3067 
3068  //**Default assignment to dense vectors (large matrices)****************************************
// Large-matrix assignment kernel, fallback overload: forwards unchanged to
// selectDefaultAssignKernel(). NOTE(review): the line holding the return type (and with it
// any overload constraint) is absent from this listing.
3082  template< typename VT1 // Type of the left-hand side target vector
3083  , typename VT2 // Type of the left-hand side vector operand
3084  , typename MT1 // Type of the right-hand side matrix operand
3085  , typename ST2 > // Type of the scalar value
3087  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3088  {
3089  selectDefaultAssignKernel( y, x, A, scalar );
3090  }
3091  //**********************************************************************************************
3092 
3093  //**Vectorized default assignment to dense vectors (large matrices)*****************************
// Large-matrix assignment kernel, SIMD overload, for y = x * A * scalar.
//   y      - target vector
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor
// The target is reset first. A is then traversed in blocks of jblock columns by iblock rows;
// within a block the columns are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors, and
// each scaled partial sum is added to the value already stored in y. An element-wise loop
// covers the columns beyond the last full SIMD vector of a block.
// NOTE(review): the line holding the return type (and with it any overload constraint) is
// absent from this listing. The accumulators are declared without an initializer, so the
// kernel relies on SIMDType's default construction - presumably zero; confirm.
3107  template< typename VT1 // Type of the left-hand side target vector
3108  , typename VT2 // Type of the left-hand side vector operand
3109  , typename MT1 // Type of the right-hand side matrix operand
3110  , typename ST2 > // Type of the scalar value
3112  selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3113  {
      // A scalar remainder loop is required unless both the target and the matrix are padded.
3114  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
3115 
3116  const size_t M( A.rows() );
3117  const size_t N( A.columns() );
3118 
      // Column block: as many elements as fit into 32768 bytes. Row block: 8 rows when all
      // columns fit into one column block, 4 rows otherwise.
3119  const size_t jblock( 32768UL / sizeof( ElementType ) );
3120  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3121 
      // SIMD value created from the scalar; every accumulated vector is multiplied by it.
3122  const SIMDType factor( set( scalar ) );
3123 
3124  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3125 
      // The blocks below add onto y, so it has to start from the reset state.
3126  reset( y );
3127 
3128  for( size_t jj=0U; jj<N; jj+=jblock ) {
3129  for( size_t ii=0UL; ii<M; ii+=iblock )
3130  {
      // Bounds of the current block; jend is additionally limited for lower matrices.
3131  const size_t iend( min( ii+iblock, M ) );
3132  const size_t jtmp( min( jj+jblock, N ) );
3133  const size_t jend( ( IsLower<MT1>::value )
3134  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
3135  :( jtmp ) );
3136 
      // With remainder handling, jpos is jend rounded down to a multiple of SIMDSIZE (see
      // the assertion below).
3137  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3138  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3139 
      // First column of the block; for upper matrices it is moved forward to the row index
      // (ii, or ii+1 when strictly upper) masked with size_t(-SIMDSIZE), but never before jj.
3140  size_t j( ( IsUpper<MT1>::value )
3141  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
3142  :( jj ) );
3143 
      // Groups of 8 SIMD vectors.
3144  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3145  {
3146  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3147 
3148  for( size_t i=ii; i<iend; ++i ) {
3149  const SIMDType x1( set( x[i] ) );
3150  xmm1 += x1 * A.load(i,j );
3151  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3152  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3153  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3154  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3155  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3156  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3157  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3158  }
3159 
3160  y.store( j , y.load(j ) + xmm1*factor );
3161  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3162  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3163  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3164  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3165  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3166  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3167  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3168  }
3169 
      // Groups of 4 SIMD vectors.
3170  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3171  {
3172  SIMDType xmm1, xmm2, xmm3, xmm4;
3173 
3174  for( size_t i=ii; i<iend; ++i ) {
3175  const SIMDType x1( set( x[i] ) );
3176  xmm1 += x1 * A.load(i,j );
3177  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3178  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3179  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3180  }
3181 
3182  y.store( j , y.load(j ) + xmm1*factor );
3183  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3184  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3185  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3186  }
3187 
      // Groups of 3 SIMD vectors.
3188  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3189  {
3190  SIMDType xmm1, xmm2, xmm3;
3191 
3192  for( size_t i=ii; i<iend; ++i ) {
3193  const SIMDType x1( set( x[i] ) );
3194  xmm1 += x1 * A.load(i,j );
3195  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3196  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3197  }
3198 
3199  y.store( j , y.load(j ) + xmm1*factor );
3200  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3201  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3202  }
3203 
      // Groups of 2 SIMD vectors.
3204  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3205  {
3206  SIMDType xmm1, xmm2;
3207 
3208  for( size_t i=ii; i<iend; ++i ) {
3209  const SIMDType x1( set( x[i] ) );
3210  xmm1 += x1 * A.load(i,j );
3211  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3212  }
3213 
3214  y.store( j , y.load(j ) + xmm1*factor );
3215  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3216  }
3217 
      // Single SIMD vectors.
3218  for( ; j<jpos; j+=SIMDSIZE )
3219  {
3220  SIMDType xmm1;
3221 
3222  for( size_t i=ii; i<iend; ++i ) {
3223  xmm1 += set( x[i] ) * A.load(i,j);
3224  }
3225 
3226  y.store( j, y.load(j) + xmm1*factor );
3227  }
3228 
      // Element-wise loop over the columns of the block not covered by the SIMD loops.
3229  for( ; remainder && j<jend; ++j )
3230  {
3231  ElementType value = ElementType();
3232 
3233  for( size_t i=ii; i<iend; ++i ) {
3234  value += x[i] * A(i,j);
3235  }
3236 
3237  y[j] += value * scalar;
3238  }
3239  }
3240  }
3241  }
3242  //**********************************************************************************************
3243 
3244  //**BLAS-based assignment to dense vectors (default)********************************************
// BLAS assignment kernel, fallback overload: forwards unchanged to selectLargeAssignKernel().
// NOTE(review): the line holding the return type (and with it any overload constraint) is
// absent from this listing.
3257  template< typename VT1 // Type of the left-hand side target vector
3258  , typename VT2 // Type of the left-hand side vector operand
3259  , typename MT1 // Type of the right-hand side matrix operand
3260  , typename ST2 > // Type of the scalar value
3262  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3263  {
3264  selectLargeAssignKernel( y, x, A, scalar );
3265  }
3266  //**********************************************************************************************
3267 
3268  //**BLAS-based assignment to dense vectors******************************************************
3269 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3270 
// BLAS assignment kernel, compiled only when the enclosing
// BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION condition holds.
//   y      - target vector
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor
// Triangular matrices: the scaled vector is assigned to y first and trmv() is then called on
// y and A with the matching CblasLower/CblasUpper flag. All other matrices: a single gemv()
// call receives the scalar and a zero, both converted to the target's element type.
// NOTE(review): the line holding the return type (and with it any overload constraint) is
// absent from this listing.
3283  template< typename VT1 // Type of the left-hand side target vector
3284  , typename VT2 // Type of the left-hand side vector operand
3285  , typename MT1 // Type of the right-hand side matrix operand
3286  , typename ST2 > // Type of the scalar value
3288  selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3289  {
3290  using ET = ElementType_<VT1>;
3291 
3292  if( IsTriangular<MT1>::value ) {
3293  assign( y, scalar * x );
3294  trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3295  }
3296  else {
3297  gemv( y, x, A, ET(scalar), ET(0) );
3298  }
3299  }
3300 #endif
3301  //**********************************************************************************************
3302 
3303  //**Assignment to sparse vectors****************************************************************
// Assignment of the scaled vector/matrix multiplication to a sparse vector.
//   lhs - target sparse vector
//   rhs - scaled multiplication expression to be assigned
// The expression is evaluated serially into a temporary of type ResultType, and that
// temporary is then assigned to the target.
3315  template< typename VT1 > // Type of the target sparse vector
3316  friend inline void assign( SparseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3317  {
3319 
3323 
3324  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3325 
      // Materialize the expression first; the target is assigned from the temporary.
3326  const ResultType tmp( serial( rhs ) );
3327  assign( ~lhs, tmp );
3328  }
3329  //**********************************************************************************************
3330 
3331  //**Addition assignment to dense vectors********************************************************
// Addition assignment of the scaled vector/matrix multiplication to a dense vector.
//   lhs - target dense vector
//   rhs - scaled multiplication expression to be added
// The two operands of the wrapped multiplication are extracted, evaluated serially into the
// locals x and A, and passed with the scalar to selectAddAssignKernel().
3343  template< typename VT1 > // Type of the target dense vector
3344  friend inline void addAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3345  {
3347 
3348  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3349 
3350  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3351  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3352 
      // A matrix without rows or without columns contributes nothing: leave the target as is.
3353  if( right.rows() == 0UL || right.columns() == 0UL ) {
3354  return;
3355  }
3356 
3357  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3358  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3359 
      // The evaluated operands must keep the dimensions of the original operands.
3360  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3361  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3362  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3363  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3364 
3365  DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3366  }
3367  //**********************************************************************************************
3368 
3369  //**Addition assignment to dense vectors (kernel selection)*************************************
      // Chooses the kernel used for the addition assignment y += x * A * scalar.
      //
      // y      : target vector
      // x      : evaluated left-hand side vector operand
      // A      : evaluated right-hand side matrix operand
      // scalar : scaling factor
      //
      // The "small" kernel is selected if any of the following holds:
      //  - MT1 is a diagonal matrix type,
      //  - MT (the matrix type of the enclosing expression, not MT1) is a computation and
      //    'evaluateMatrix' is false ('evaluateMatrix' is defined outside this section),
      //  - the number of matrix elements (rows * columns) is below TDVECDMATMULT_THRESHOLD.
      // Otherwise the decision is delegated to selectBlasAddAssignKernel().
3380  template< typename VT1 // Type of the left-hand side target vector
3381  , typename VT2 // Type of the left-hand side vector operand
3382  , typename MT1 // Type of the right-hand side matrix operand
3383  , typename ST2 > // Type of the scalar value
3384  static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3385  {
3386  if( ( IsDiagonal<MT1>::value ) ||
3387  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3388  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3389  selectSmallAddAssignKernel( y, x, A, scalar );
3390  else
3391  selectBlasAddAssignKernel( y, x, A, scalar );
3392  }
3393  //**********************************************************************************************
3394 
3395  //**Default addition assignment to dense vectors************************************************
      // Default (non-specialized) kernel for the addition assignment y += x * A * scalar.
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // The kernel rebuilds the expression x * A * scalar from the evaluated operands and passes
      // it to the addAssign() member function of the target vector.
3409  template< typename VT1 // Type of the left-hand side target vector
3410  , typename VT2 // Type of the left-hand side vector operand
3411  , typename MT1 // Type of the right-hand side matrix operand
3412  , typename ST2 > // Type of the scalar value
3413  static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3414  {
3415  y.addAssign( x * A * scalar );
3416  }
3417  //**********************************************************************************************
3418 
3419  //**Default addition assignment to dense vectors (small matrices)*******************************
      // Small-matrix kernel for y += x * A * scalar that simply forwards to
      // selectDefaultAddAssignKernel().
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // NOTE(review): the declaration line carrying the return type (listing line 3437) is not
      // part of this listing. Presumably it restricts this overload to the case where the
      // vectorized small kernel below is not applicable -- confirm against the original header.
3433  template< typename VT1 // Type of the left-hand side target vector
3434  , typename VT2 // Type of the left-hand side vector operand
3435  , typename MT1 // Type of the right-hand side matrix operand
3436  , typename ST2 > // Type of the scalar value
3438  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3439  {
3440  selectDefaultAddAssignKernel( y, x, A, scalar );
3441  }
3442  //**********************************************************************************************
3443 
3444  //**Vectorized default addition assignment to dense vectors (small matrices)********************
      // SIMD kernel for the addition assignment y += x * A * scalar (small matrices).
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // The columns of A are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors (SIMDSIZE
      // elements each). For every group the products x[i] * A(i,j..) are accumulated over the
      // rows i, multiplied by the broadcast scalar and added to the matching elements of y.
      // Columns that do not fill a complete SIMD vector are handled by a scalar loop at the end.
      //
      // For lower/upper (and strictly lower/upper) matrix types the row range [ibegin,iend) is
      // narrowed per column group based on the column index j, so fewer rows are visited.
      //
      // NOTE(review): the declaration line carrying the return type (listing line 3463) is not
      // part of this listing -- confirm the overload condition against the original header.
      // NOTE(review): the xmm accumulators are declared without an initializer and then updated
      // with '+='; this presumably relies on SIMDType default construction yielding zero --
      // confirm against the SIMD type definitions.
3459  template< typename VT1 // Type of the left-hand side target vector
3460  , typename VT2 // Type of the left-hand side vector operand
3461  , typename MT1 // Type of the right-hand side matrix operand
3462  , typename ST2 > // Type of the scalar value
3464  selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3465  {
      // A scalar remainder loop is required unless both the target vector and the matrix are padded.
3466  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
3467 
3468  const size_t M( A.rows() );
3469  const size_t N( A.columns() );
3470 
      // End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE when a
      // remainder loop is used (see the assertion below), otherwise N itself.
3471  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3472  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3473 
      // Scalar factor broadcast into a SIMD vector.
3474  const SIMDType factor( set( scalar ) );
3475 
3476  size_t j( 0UL );
3477 
      // Column groups of 8 SIMD vectors.
3478  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3479  {
3480  const size_t ibegin( ( IsLower<MT1>::value )
3481  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3482  :( 0UL ) );
3483  const size_t iend( ( IsUpper<MT1>::value )
3484  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3485  :( M ) );
3486  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3487 
3488  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3489 
3490  for( size_t i=ibegin; i<iend; ++i ) {
3491  const SIMDType x1( set( x[i] ) );
3492  xmm1 += x1 * A.load(i,j );
3493  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3494  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3495  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3496  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3497  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3498  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3499  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3500  }
3501 
3502  y.store( j , y.load(j ) + xmm1*factor );
3503  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3504  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3505  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3506  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3507  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3508  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3509  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3510  }
3511 
      // Column groups of 4 SIMD vectors.
3512  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3513  {
3514  const size_t ibegin( ( IsLower<MT1>::value )
3515  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3516  :( 0UL ) );
3517  const size_t iend( ( IsUpper<MT1>::value )
3518  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3519  :( M ) );
3520  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3521 
3522  SIMDType xmm1, xmm2, xmm3, xmm4;
3523 
3524  for( size_t i=ibegin; i<iend; ++i ) {
3525  const SIMDType x1( set( x[i] ) );
3526  xmm1 += x1 * A.load(i,j );
3527  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3528  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3529  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3530  }
3531 
3532  y.store( j , y.load(j ) + xmm1*factor );
3533  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3534  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3535  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3536  }
3537 
      // Column groups of 3 SIMD vectors.
3538  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3539  {
3540  const size_t ibegin( ( IsLower<MT1>::value )
3541  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3542  :( 0UL ) );
3543  const size_t iend( ( IsUpper<MT1>::value )
3544  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3545  :( M ) );
3546  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3547 
3548  SIMDType xmm1, xmm2, xmm3;
3549 
3550  for( size_t i=ibegin; i<iend; ++i ) {
3551  const SIMDType x1( set( x[i] ) );
3552  xmm1 += x1 * A.load(i,j );
3553  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3554  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3555  }
3556 
3557  y.store( j , y.load(j ) + xmm1*factor );
3558  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3559  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3560  }
3561 
      // Column groups of 2 SIMD vectors.
3562  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3563  {
3564  const size_t ibegin( ( IsLower<MT1>::value )
3565  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3566  :( 0UL ) );
3567  const size_t iend( ( IsUpper<MT1>::value )
3568  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3569  :( M ) );
3570  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3571 
3572  SIMDType xmm1, xmm2;
3573 
3574  for( size_t i=ibegin; i<iend; ++i ) {
3575  const SIMDType x1( set( x[i] ) );
3576  xmm1 += x1 * A.load(i,j );
3577  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3578  }
3579 
3580  y.store( j , y.load(j ) + xmm1*factor );
3581  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3582  }
3583 
      // Single SIMD vectors.
3584  for( ; j<jpos; j+=SIMDSIZE )
3585  {
3586  const size_t ibegin( ( IsLower<MT1>::value )
3587  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3588  :( 0UL ) );
3589  const size_t iend( ( IsUpper<MT1>::value )
3590  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3591  :( M ) );
3592  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3593 
3594  SIMDType xmm1;
3595 
3596  for( size_t i=ibegin; i<iend; ++i ) {
3597  xmm1 += set( x[i] ) * A.load(i,j);
3598  }
3599 
3600  y.store( j, y.load(j) + xmm1*factor );
3601  }
3602 
      // Scalar handling of the remaining columns (only if a remainder is possible).
3603  for( ; remainder && j<N; ++j )
3604  {
3605  const size_t ibegin( ( IsLower<MT1>::value )
3606  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3607  :( 0UL ) );
3608  const size_t iend( ( IsUpper<MT1>::value )
3609  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3610  :( M ) );
3611  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3612 
3613  ElementType value = ElementType();
3614 
3615  for( size_t i=ibegin; i<iend; ++i ) {
3616  value += x[i] * A(i,j);
3617  }
3618 
3619  y[j] += value * scalar;
3620  }
3621  }
3622  //**********************************************************************************************
3623 
3624  //**Default addition assignment to dense vectors (large matrices)*******************************
      // Large-matrix kernel for y += x * A * scalar that simply forwards to
      // selectDefaultAddAssignKernel().
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // NOTE(review): the declaration line carrying the return type (listing line 3642) is not
      // part of this listing. Presumably it restricts this overload to the case where the
      // vectorized large kernel below is not applicable -- confirm against the original header.
3638  template< typename VT1 // Type of the left-hand side target vector
3639  , typename VT2 // Type of the left-hand side vector operand
3640  , typename MT1 // Type of the right-hand side matrix operand
3641  , typename ST2 > // Type of the scalar value
3643  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3644  {
3645  selectDefaultAddAssignKernel( y, x, A, scalar );
3646  }
3647  //**********************************************************************************************
3648 
3649  //**Vectorized default addition assignment to dense vectors (large matrices)********************
      // Blocked SIMD kernel for the addition assignment y += x * A * scalar (large matrices).
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // The matrix is traversed in blocks of 'jblock' columns and 'iblock' rows. For each block
      // the partial sums over the rows [ii,iend) are computed with the same 8/4/3/2/1 SIMD vector
      // unrolling as in the small kernel, multiplied by the broadcast scalar and added to y. As a
      // consequence every element of y is updated once per row block.
      //
      // For lower matrix types the column range of a block ends at the last row of the block; for
      // upper matrix types it starts at the first row of the block (rounded down to a multiple of
      // SIMDSIZE).
      //
      // NOTE(review): the declaration line carrying the return type (listing line 3668) is not
      // part of this listing -- confirm the overload condition against the original header.
      // NOTE(review): the xmm accumulators are declared without an initializer and then updated
      // with '+='; this presumably relies on SIMDType default construction yielding zero --
      // confirm against the SIMD type definitions.
3664  template< typename VT1 // Type of the left-hand side target vector
3665  , typename VT2 // Type of the left-hand side vector operand
3666  , typename MT1 // Type of the right-hand side matrix operand
3667  , typename ST2 > // Type of the scalar value
3669  selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3670  {
      // A scalar remainder loop is required unless both the target vector and the matrix are padded.
3671  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
3672 
3673  const size_t M( A.rows() );
3674  const size_t N( A.columns() );
3675 
      // Block sizes: 32768 bytes worth of elements per column block; 8 rows per row block if the
      // matrix has fewer columns than one column block, otherwise 4 rows.
3676  const size_t jblock( 32768UL / sizeof( ElementType ) );
3677  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3678 
      // Scalar factor broadcast into a SIMD vector.
3679  const SIMDType factor( set( scalar ) );
3680 
3681  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
3682 
3683  for( size_t jj=0U; jj<N; jj+=jblock ) {
3684  for( size_t ii=0UL; ii<M; ii+=iblock )
3685  {
3686  const size_t iend( min( ii+iblock, M ) );
3687  const size_t jtmp( min( jj+jblock, N ) );
      // Lower matrices: columns at or beyond the last row of this block are not visited.
3688  const size_t jend( ( IsLower<MT1>::value )
3689  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
3690  :( jtmp ) );
3691 
      // End of the SIMD-processed column range within this block (see the assertion below).
3692  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
3693  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3694 
      // Upper matrices: start at the first row of this block, rounded down to a SIMDSIZE multiple.
3695  size_t j( ( IsUpper<MT1>::value )
3696  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
3697  :( jj ) );
3698 
      // Column groups of 8 SIMD vectors.
3699  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3700  {
3701  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3702 
3703  for( size_t i=ii; i<iend; ++i ) {
3704  const SIMDType x1( set( x[i] ) );
3705  xmm1 += x1 * A.load(i,j );
3706  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3707  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3708  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3709  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
3710  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
3711  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
3712  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
3713  }
3714 
3715  y.store( j , y.load(j ) + xmm1*factor );
3716  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3717  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3718  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3719  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3720  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3721  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3722  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
3723  }
3724 
      // Column groups of 4 SIMD vectors.
3725  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3726  {
3727  SIMDType xmm1, xmm2, xmm3, xmm4;
3728 
3729  for( size_t i=ii; i<iend; ++i ) {
3730  const SIMDType x1( set( x[i] ) );
3731  xmm1 += x1 * A.load(i,j );
3732  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3733  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3734  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
3735  }
3736 
3737  y.store( j , y.load(j ) + xmm1*factor );
3738  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3739  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3740  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3741  }
3742 
      // Column groups of 3 SIMD vectors.
3743  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3744  {
3745  SIMDType xmm1, xmm2, xmm3;
3746 
3747  for( size_t i=ii; i<iend; ++i ) {
3748  const SIMDType x1( set( x[i] ) );
3749  xmm1 += x1 * A.load(i,j );
3750  xmm2 += x1 * A.load(i,j+SIMDSIZE );
3751  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
3752  }
3753 
3754  y.store( j , y.load(j ) + xmm1*factor );
3755  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3756  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3757  }
3758 
      // Column groups of 2 SIMD vectors.
3759  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3760  {
3761  SIMDType xmm1, xmm2;
3762 
3763  for( size_t i=ii; i<iend; ++i ) {
3764  const SIMDType x1( set( x[i] ) );
3765  xmm1 += x1 * A.load(i,j );
3766  xmm2 += x1 * A.load(i,j+SIMDSIZE);
3767  }
3768 
3769  y.store( j , y.load(j ) + xmm1*factor );
3770  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
3771  }
3772 
      // Single SIMD vectors.
3773  for( ; j<jpos; j+=SIMDSIZE )
3774  {
3775  SIMDType xmm1;
3776 
3777  for( size_t i=ii; i<iend; ++i ) {
3778  xmm1 += set( x[i] ) * A.load(i,j);
3779  }
3780 
3781  y.store( j, y.load(j) + xmm1*factor );
3782  }
3783 
      // Scalar handling of the remaining columns of this block (only if a remainder is possible).
3784  for( ; remainder && j<jend; ++j )
3785  {
3786  ElementType value = ElementType();
3787 
3788  for( size_t i=ii; i<iend; ++i ) {
3789  value += x[i] * A(i,j);
3790  }
3791 
3792  y[j] += value * scalar;
3793  }
3794  }
3795  }
3796  }
3797  //**********************************************************************************************
3798 
3799  //**BLAS-based addition assignment to dense vectors (default)***********************************
      // Fallback of the BLAS kernel selection for y += x * A * scalar: forwards to
      // selectLargeAddAssignKernel().
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // NOTE(review): the declaration line carrying the return type (listing line 3817) is not
      // part of this listing. Presumably it restricts this overload to the case where the
      // BLAS-based kernel is not applicable -- confirm against the original header.
3813  template< typename VT1 // Type of the left-hand side target vector
3814  , typename VT2 // Type of the left-hand side vector operand
3815  , typename MT1 // Type of the right-hand side matrix operand
3816  , typename ST2 > // Type of the scalar value
3818  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3819  {
3820  selectLargeAddAssignKernel( y, x, A, scalar );
3821  }
3822  //**********************************************************************************************
3823 
3824  //**BLAS-based addition assignment to dense vectors*********************************************
      // BLAS-backed kernel for y += x * A * scalar. It is only compiled if both BLAZE_BLAS_MODE
      // and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // Triangular matrices: the scaled vector scalar * x is evaluated into a temporary, the
      // temporary is multiplied in place with A via trmv() (lower or upper variant depending on
      // MT1), and the result is added to y.
      // All other matrices: a single gemv() call is issued with ET(scalar) and ET(1) as the last
      // two arguments (presumably the BLAS alpha and beta factors, i.e. y = scalar*x*A + 1*y --
      // confirm against the gemv wrapper).
      //
      // NOTE(review): the declaration line carrying the return type (listing line 3843) is not
      // part of this listing -- confirm the overload condition against the original header.
3825 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3826 
3839  template< typename VT1 // Type of the left-hand side target vector
3840  , typename VT2 // Type of the left-hand side vector operand
3841  , typename MT1 // Type of the right-hand side matrix operand
3842  , typename ST2 > // Type of the scalar value
3844  selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3845  {
3846  using ET = ElementType_<VT1>;
3847 
3848  if( IsTriangular<MT1>::value ) {
3849  ResultType_<VT1> tmp( serial( scalar * x ) );
3850  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3851  addAssign( y, tmp );
3852  }
3853  else {
3854  gemv( y, x, A, ET(scalar), ET(1) );
3855  }
3856  }
3857 #endif
3858  //**********************************************************************************************
3859 
3860  //**Addition assignment to sparse vectors*******************************************************
3861  // No special implementation for the addition assignment to sparse vectors.
3862  //**********************************************************************************************
3863 
3864  //**Subtraction assignment to dense vectors*****************************************************
      // Subtracts the scaled vector/matrix multiplication expression 'rhs' from the dense vector
      // 'lhs'.
      //
      // lhs : target vector of type DenseVector<VT1,true>; it is accessed through (~lhs).
      // rhs : the scaled multiplication expression; rhs.vector_ is the nested vector/matrix
      //       product and rhs.scalar_ the scaling factor.
      //
      // Steps:
      //  1. Fetch the left (vector) and right (matrix) operand of the nested product.
      //  2. Return without touching 'lhs' if the matrix has no rows or no columns.
      //  3. Evaluate both operands into objects of type LT and RT (via serial()).
      //  4. Hand the evaluated operands and the scalar to selectSubAssignKernel(), which picks
      //     the actual computation kernel.
3876  template< typename VT1 > // Type of the target dense vector
3877  friend inline void subAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
3878  {
3880 
3881  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
3882 
3883  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3884  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3885 
      // Nothing to subtract for an empty matrix operand.
3886  if( right.rows() == 0UL || right.columns() == 0UL ) {
3887  return;
3888  }
3889 
3890  LT x( serial( left ) ); // Evaluation of the left-hand side dense vector operand
3891  RT A( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
3892 
3893  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
3894  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
3895  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
3896  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
3897 
3898  DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
3899  }
3900  //**********************************************************************************************
3901 
3902  //**Subtraction assignment to dense vectors (kernel selection)**********************************
      // Chooses the kernel used for the subtraction assignment y -= x * A * scalar.
      //
      // y      : target vector
      // x      : evaluated left-hand side vector operand
      // A      : evaluated right-hand side matrix operand
      // scalar : scaling factor
      //
      // The "small" kernel is selected if any of the following holds:
      //  - MT1 is a diagonal matrix type,
      //  - MT (the matrix type of the enclosing expression, not MT1) is a computation and
      //    'evaluateMatrix' is false ('evaluateMatrix' is defined outside this section),
      //  - the number of matrix elements (rows * columns) is below TDVECDMATMULT_THRESHOLD.
      // Otherwise the decision is delegated to selectBlasSubAssignKernel().
3913  template< typename VT1 // Type of the left-hand side target vector
3914  , typename VT2 // Type of the left-hand side vector operand
3915  , typename MT1 // Type of the right-hand side matrix operand
3916  , typename ST2 > // Type of the scalar value
3917  static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3918  {
3919  if( ( IsDiagonal<MT1>::value ) ||
3920  ( IsComputation<MT>::value && !evaluateMatrix ) ||
3921  ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3922  selectSmallSubAssignKernel( y, x, A, scalar );
3923  else
3924  selectBlasSubAssignKernel( y, x, A, scalar );
3925  }
3926  //**********************************************************************************************
3927 
3928  //**Default subtraction assignment to dense vectors*********************************************
      // Default (non-specialized) kernel for the subtraction assignment y -= x * A * scalar.
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // The kernel rebuilds the expression x * A * scalar from the evaluated operands and passes
      // it to the subAssign() member function of the target vector.
3942  template< typename VT1 // Type of the left-hand side target vector
3943  , typename VT2 // Type of the left-hand side vector operand
3944  , typename MT1 // Type of the right-hand side matrix operand
3945  , typename ST2 > // Type of the scalar value
3946  static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3947  {
3948  y.subAssign( x * A * scalar );
3949  }
3950  //**********************************************************************************************
3951 
3952  //**Default subtraction assignment to dense vectors (small matrices)****************************
      // Small-matrix kernel for y -= x * A * scalar that simply forwards to
      // selectDefaultSubAssignKernel().
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // NOTE(review): the declaration line carrying the return type (listing line 3970) is not
      // part of this listing. Presumably it restricts this overload to the case where the
      // vectorized small kernel below is not applicable -- confirm against the original header.
3966  template< typename VT1 // Type of the left-hand side target vector
3967  , typename VT2 // Type of the left-hand side vector operand
3968  , typename MT1 // Type of the right-hand side matrix operand
3969  , typename ST2 > // Type of the scalar value
3971  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3972  {
3973  selectDefaultSubAssignKernel( y, x, A, scalar );
3974  }
3975  //**********************************************************************************************
3976 
3977  //**Vectorized default subtraction assignment to dense vectors (small matrices)*****************
      // SIMD kernel for the subtraction assignment y -= x * A * scalar (small matrices).
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // The columns of A are processed in groups of 8, 4, 3, 2 and 1 SIMD vectors (SIMDSIZE
      // elements each). For every group the products x[i] * A(i,j..) are accumulated over the
      // rows i, multiplied by the broadcast scalar and subtracted from the matching elements of
      // y. Columns that do not fill a complete SIMD vector are handled by a scalar loop at the end.
      //
      // For lower/upper (and strictly lower/upper) matrix types the row range [ibegin,iend) is
      // narrowed per column group based on the column index j, so fewer rows are visited.
      //
      // NOTE(review): the declaration line carrying the return type (listing line 3996) is not
      // part of this listing -- confirm the overload condition against the original header.
      // NOTE(review): the xmm accumulators are declared without an initializer and then updated
      // with '+='; this presumably relies on SIMDType default construction yielding zero --
      // confirm against the SIMD type definitions.
3992  template< typename VT1 // Type of the left-hand side target vector
3993  , typename VT2 // Type of the left-hand side vector operand
3994  , typename MT1 // Type of the right-hand side matrix operand
3995  , typename ST2 > // Type of the scalar value
3997  selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
3998  {
      // A scalar remainder loop is required unless both the target vector and the matrix are padded.
3999  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
4000 
4001  const size_t M( A.rows() );
4002  const size_t N( A.columns() );
4003 
      // End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE when a
      // remainder loop is used (see the assertion below), otherwise N itself.
4004  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
4005  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4006 
      // Scalar factor broadcast into a SIMD vector.
4007  const SIMDType factor( set( scalar ) );
4008 
4009  size_t j( 0UL );
4010 
      // Column groups of 8 SIMD vectors.
4011  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4012  {
4013  const size_t ibegin( ( IsLower<MT1>::value )
4014  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4015  :( 0UL ) );
4016  const size_t iend( ( IsUpper<MT1>::value )
4017  ?( min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4018  :( M ) );
4019  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4020 
4021  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4022 
4023  for( size_t i=ibegin; i<iend; ++i ) {
4024  const SIMDType x1( set( x[i] ) );
4025  xmm1 += x1 * A.load(i,j );
4026  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4027  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4028  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4029  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4030  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4031  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4032  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4033  }
4034 
4035  y.store( j , y.load(j ) - xmm1*factor );
4036  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4037  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4038  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4039  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4040  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4041  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4042  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4043  }
4044 
      // Column groups of 4 SIMD vectors.
4045  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4046  {
4047  const size_t ibegin( ( IsLower<MT1>::value )
4048  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4049  :( 0UL ) );
4050  const size_t iend( ( IsUpper<MT1>::value )
4051  ?( min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4052  :( M ) );
4053  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4054 
4055  SIMDType xmm1, xmm2, xmm3, xmm4;
4056 
4057  for( size_t i=ibegin; i<iend; ++i ) {
4058  const SIMDType x1( set( x[i] ) );
4059  xmm1 += x1 * A.load(i,j );
4060  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4061  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4062  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4063  }
4064 
4065  y.store( j , y.load(j ) - xmm1*factor );
4066  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4067  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4068  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4069  }
4070 
      // Column groups of 3 SIMD vectors.
4071  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4072  {
4073  const size_t ibegin( ( IsLower<MT1>::value )
4074  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4075  :( 0UL ) );
4076  const size_t iend( ( IsUpper<MT1>::value )
4077  ?( min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4078  :( M ) );
4079  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4080 
4081  SIMDType xmm1, xmm2, xmm3;
4082 
4083  for( size_t i=ibegin; i<iend; ++i ) {
4084  const SIMDType x1( set( x[i] ) );
4085  xmm1 += x1 * A.load(i,j );
4086  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4087  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4088  }
4089 
4090  y.store( j , y.load(j ) - xmm1*factor );
4091  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4092  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4093  }
4094 
      // Column groups of 2 SIMD vectors.
4095  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4096  {
4097  const size_t ibegin( ( IsLower<MT1>::value )
4098  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4099  :( 0UL ) );
4100  const size_t iend( ( IsUpper<MT1>::value )
4101  ?( min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4102  :( M ) );
4103  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4104 
4105  SIMDType xmm1, xmm2;
4106 
4107  for( size_t i=ibegin; i<iend; ++i ) {
4108  const SIMDType x1( set( x[i] ) );
4109  xmm1 += x1 * A.load(i,j );
4110  xmm2 += x1 * A.load(i,j+SIMDSIZE);
4111  }
4112 
4113  y.store( j , y.load(j ) - xmm1*factor );
4114  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4115  }
4116 
      // Single SIMD vectors.
4117  for( ; j<jpos; j+=SIMDSIZE )
4118  {
4119  const size_t ibegin( ( IsLower<MT1>::value )
4120  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4121  :( 0UL ) );
4122  const size_t iend( ( IsUpper<MT1>::value )
4123  ?( min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4124  :( M ) );
4125  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4126 
4127  SIMDType xmm1;
4128 
4129  for( size_t i=ibegin; i<iend; ++i ) {
4130  xmm1 += set( x[i] ) * A.load(i,j);
4131  }
4132 
4133  y.store( j, y.load(j) - xmm1*factor );
4134  }
4135 
      // Scalar handling of the remaining columns (only if a remainder is possible).
4136  for( ; remainder && j<N; ++j )
4137  {
4138  const size_t ibegin( ( IsLower<MT1>::value )
4139  ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4140  :( 0UL ) );
4141  const size_t iend( ( IsUpper<MT1>::value )
4142  ?( min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4143  :( M ) );
4144  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4145 
4146  ElementType value = ElementType();
4147 
4148  for( size_t i=ibegin; i<iend; ++i ) {
4149  value += x[i] * A(i,j);
4150  }
4151 
4152  y[j] -= value * scalar;
4153  }
4154  }
4155  //**********************************************************************************************
4156 
4157  //**Default subtraction assignment to dense vectors (large matrices)****************************
      // Large-matrix kernel for y -= x * A * scalar that simply forwards to
      // selectDefaultSubAssignKernel().
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // NOTE(review): the declaration line carrying the return type (listing line 4175) is not
      // part of this listing. Presumably it restricts this overload to the case where the
      // vectorized large kernel below is not applicable -- confirm against the original header.
4171  template< typename VT1 // Type of the left-hand side target vector
4172  , typename VT2 // Type of the left-hand side vector operand
4173  , typename MT1 // Type of the right-hand side matrix operand
4174  , typename ST2 > // Type of the scalar value
4176  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4177  {
4178  selectDefaultSubAssignKernel( y, x, A, scalar );
4179  }
4180  //**********************************************************************************************
4181 
4182  //**Vectorized default subtraction assignment to dense vectors (large matrices)*****************
      // Blocked SIMD kernel for the subtraction assignment y -= x * A * scalar (large matrices).
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // The matrix is traversed in blocks of 'jblock' columns and 'iblock' rows. For each block
      // the partial sums over the rows [ii,iend) are computed with the same 8/4/3/2/1 SIMD vector
      // unrolling as in the small kernel, multiplied by the broadcast scalar and subtracted from
      // y. As a consequence every element of y is updated once per row block.
      //
      // For lower matrix types the column range of a block ends at the last row of the block; for
      // upper matrix types it starts at the first row of the block (rounded down to a multiple of
      // SIMDSIZE).
      //
      // NOTE(review): the declaration line carrying the return type (listing line 4201) is not
      // part of this listing -- confirm the overload condition against the original header.
      // NOTE(review): the xmm accumulators are declared without an initializer and then updated
      // with '+='; this presumably relies on SIMDType default construction yielding zero --
      // confirm against the SIMD type definitions.
4197  template< typename VT1 // Type of the left-hand side target vector
4198  , typename VT2 // Type of the left-hand side vector operand
4199  , typename MT1 // Type of the right-hand side matrix operand
4200  , typename ST2 > // Type of the scalar value
4202  selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4203  {
      // A scalar remainder loop is required unless both the target vector and the matrix are padded.
4204  constexpr bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
4205 
4206  const size_t M( A.rows() );
4207  const size_t N( A.columns() );
4208 
      // Block sizes: 32768 bytes worth of elements per column block; 8 rows per row block if the
      // matrix has fewer columns than one column block, otherwise 4 rows.
4209  const size_t jblock( 32768UL / sizeof( ElementType ) );
4210  const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4211 
      // Scalar factor broadcast into a SIMD vector.
4212  const SIMDType factor( set( scalar ) );
4213 
4214  BLAZE_INTERNAL_ASSERT( ( jblock % SIMDSIZE ) == 0UL, "Invalid block size detected" );
4215 
4216  for( size_t jj=0U; jj<N; jj+=jblock ) {
4217  for( size_t ii=0UL; ii<M; ii+=iblock )
4218  {
4219  const size_t iend( min( ii+iblock, M ) );
4220  const size_t jtmp( min( jj+jblock, N ) );
      // Lower matrices: columns at or beyond the last row of this block are not visited.
4221  const size_t jend( ( IsLower<MT1>::value )
4222  ?( min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
4223  :( jtmp ) );
4224 
      // End of the SIMD-processed column range within this block (see the assertion below).
4225  const size_t jpos( remainder ? ( jend & size_t(-SIMDSIZE) ) : jend );
4226  BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4227 
      // Upper matrices: start at the first row of this block, rounded down to a SIMDSIZE multiple.
4228  size_t j( ( IsUpper<MT1>::value )
4229  ?( max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) & size_t(-SIMDSIZE) ) )
4230  :( jj ) );
4231 
      // Column groups of 8 SIMD vectors.
4232  for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4233  {
4234  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4235 
4236  for( size_t i=ii; i<iend; ++i ) {
4237  const SIMDType x1( set( x[i] ) );
4238  xmm1 += x1 * A.load(i,j );
4239  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4240  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4241  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4242  xmm5 += x1 * A.load(i,j+SIMDSIZE*4UL);
4243  xmm6 += x1 * A.load(i,j+SIMDSIZE*5UL);
4244  xmm7 += x1 * A.load(i,j+SIMDSIZE*6UL);
4245  xmm8 += x1 * A.load(i,j+SIMDSIZE*7UL);
4246  }
4247 
4248  y.store( j , y.load(j ) - xmm1*factor );
4249  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4250  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4251  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4252  y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4253  y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4254  y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4255  y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4256  }
4257 
      // Column groups of 4 SIMD vectors.
4258  for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4259  {
4260  SIMDType xmm1, xmm2, xmm3, xmm4;
4261 
4262  for( size_t i=ii; i<iend; ++i ) {
4263  const SIMDType x1( set( x[i] ) );
4264  xmm1 += x1 * A.load(i,j );
4265  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4266  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4267  xmm4 += x1 * A.load(i,j+SIMDSIZE*3UL);
4268  }
4269 
4270  y.store( j , y.load(j ) - xmm1*factor );
4271  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4272  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4273  y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4274  }
4275 
      // Column groups of 3 SIMD vectors.
4276  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4277  {
4278  SIMDType xmm1, xmm2, xmm3;
4279 
4280  for( size_t i=ii; i<iend; ++i ) {
4281  const SIMDType x1( set( x[i] ) );
4282  xmm1 += x1 * A.load(i,j );
4283  xmm2 += x1 * A.load(i,j+SIMDSIZE );
4284  xmm3 += x1 * A.load(i,j+SIMDSIZE*2UL);
4285  }
4286 
4287  y.store( j , y.load(j ) - xmm1*factor );
4288  y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4289  y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4290  }
4291 
      // Column groups of 2 SIMD vectors.
4292  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4293  {
4294  SIMDType xmm1, xmm2;
4295 
4296  for( size_t i=ii; i<iend; ++i ) {
4297  const SIMDType x1( set( x[i] ) );
4298  xmm1 += x1 * A.load(i,j );
4299  xmm2 += x1 * A.load(i,j+SIMDSIZE);
4300  }
4301 
4302  y.store( j , y.load(j ) - xmm1*factor );
4303  y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4304  }
4305 
      // Single SIMD vectors.
4306  for( ; j<jpos; j+=SIMDSIZE )
4307  {
4308  SIMDType xmm1;
4309 
4310  for( size_t i=ii; i<iend; ++i ) {
4311  xmm1 += set( x[i] ) * A.load(i,j);
4312  }
4313 
4314  y.store( j, y.load(j) - xmm1*factor );
4315  }
4316 
      // Scalar handling of the remaining columns of this block (only if a remainder is possible).
4317  for( ; remainder && j<jend; ++j )
4318  {
4319  ElementType value = ElementType();
4320 
4321  for( size_t i=ii; i<iend; ++i ) {
4322  value += x[i] * A(i,j);
4323  }
4324 
4325  y[j] -= value * scalar;
4326  }
4327  }
4328  }
4329  }
4330  //**********************************************************************************************
4331 
4332  //**BLAS-based subtraction assignment to dense vectors (default)********************************
      // Fallback of the BLAS kernel selection for y -= x * A * scalar: forwards to
      // selectLargeSubAssignKernel().
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // NOTE(review): the declaration line carrying the return type (listing line 4350) is not
      // part of this listing. Presumably it restricts this overload to the case where the
      // BLAS-based kernel is not applicable -- confirm against the original header.
4346  template< typename VT1 // Type of the left-hand side target vector
4347  , typename VT2 // Type of the left-hand side vector operand
4348  , typename MT1 // Type of the right-hand side matrix operand
4349  , typename ST2 > // Type of the scalar value
4351  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4352  {
4353  selectLargeSubAssignKernel( y, x, A, scalar );
4354  }
4355  //**********************************************************************************************
4356 
4357  //**BLAS-based subtraction assignment to dense vectors******************************************
      // BLAS-backed kernel for y -= x * A * scalar. It is only compiled if both BLAZE_BLAS_MODE
      // and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
      //
      // y      : target vector
      // x      : left-hand side vector operand
      // A      : right-hand side matrix operand
      // scalar : scaling factor
      //
      // Triangular matrices: the scaled vector scalar * x is evaluated into a temporary, the
      // temporary is multiplied in place with A via trmv() (lower or upper variant depending on
      // MT1), and the result is subtracted from y.
      // All other matrices: a single gemv() call is issued with the negated scalar ET(-scalar) and
      // ET(1) as the last two arguments (presumably the BLAS alpha and beta factors, i.e.
      // y = -scalar*x*A + 1*y -- confirm against the gemv wrapper).
      //
      // NOTE(review): the declaration line carrying the return type (listing line 4376) is not
      // part of this listing -- confirm the overload condition against the original header.
4358 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4359 
4372  template< typename VT1 // Type of the left-hand side target vector
4373  , typename VT2 // Type of the left-hand side vector operand
4374  , typename MT1 // Type of the right-hand side matrix operand
4375  , typename ST2 > // Type of the scalar value
4377  selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
4378  {
4379  using ET = ElementType_<VT1>;
4380 
4381  if( IsTriangular<MT1>::value ) {
4382  ResultType_<VT1> tmp( serial( scalar * x ) );
4383  trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4384  subAssign( y, tmp );
4385  }
4386  else {
4387  gemv( y, x, A, ET(-scalar), ET(1) );
4388  }
4389  }
4390 #endif
4391  //**********************************************************************************************
4392 
4393  //**Subtraction assignment to sparse vectors****************************************************
4394  // No special implementation for the subtraction assignment to sparse vectors.
4395  //**********************************************************************************************
4396 
4397  //**Multiplication assignment to dense vectors**************************************************
// Multiplication assignment of the scaled vector/matrix product to a dense vector.
//   lhs : target dense vector
//   rhs : the scaled multiplication expression
// The expression is first evaluated serially into a temporary of type ResultType; the
// temporary is then passed to multAssign together with the target.
4409  template< typename VT1 > // Type of the target dense vector
4410  friend inline void multAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4411  {
4413
4417
// Internal sanity check: target and expression must have the same size.
4418  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4419
4420  const ResultType tmp( serial( rhs ) );
4421  multAssign( ~lhs, tmp );
4422  }
4423  //**********************************************************************************************
4424 
4425  //**Multiplication assignment to sparse vectors*************************************************
4426  // No special implementation for the multiplication assignment to sparse vectors.
4427  //**********************************************************************************************
4428 
4429  //**Division assignment to dense vectors********************************************************
// Division assignment of the scaled vector/matrix product to a dense vector.
//   lhs : target dense vector
//   rhs : the scaled multiplication expression
// The expression is first evaluated serially into a temporary of type ResultType; the
// temporary is then passed to divAssign together with the target.
4441  template< typename VT1 > // Type of the target dense vector
4442  friend inline void divAssign( DenseVector<VT1,true>& lhs, const DVecScalarMultExpr& rhs )
4443  {
4445
4449
// Internal sanity check: target and expression must have the same size.
4450  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4451
4452  const ResultType tmp( serial( rhs ) );
4453  divAssign( ~lhs, tmp );
4454  }
4455  //**********************************************************************************************
4456 
4457  //**Division assignment to sparse vectors*******************************************************
4458  // No special implementation for the division assignment to sparse vectors.
4459  //**********************************************************************************************
4460 
4461  //**SMP assignment to dense vectors*************************************************************
// SMP assignment of the scaled vector/matrix product to a dense vector; participates in
// overload resolution only when UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (4477) is not part
// of this listing; the body uses the parameters lhs (target) and rhs (expression).
4475  template< typename VT1 > // Type of the target dense vector
4476  friend inline EnableIf_< UseSMPAssign<VT1> >
4478  {
4480
4481  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4482
// Pull the vector and matrix operands out of the nested multiplication expression.
4483  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4484  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4485
// A matrix without rows: the target is reset and nothing else is done.
// A matrix without columns: return immediately, there is nothing to assign.
4486  if( right.rows() == 0UL ) {
4487  reset( ~lhs );
4488  return;
4489  }
4490  else if( right.columns() == 0UL ) {
4491  return;
4492  }
4493
4494  LT x( left ); // Evaluation of the left-hand side dense vector operand
4495  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4496
4497  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4498  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4499  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4500  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4501
// Rebuild the scaled product from the evaluated operands and hand it to smpAssign.
4502  smpAssign( ~lhs, x * A * rhs.scalar_ );
4503  }
4504  //**********************************************************************************************
4505 
4506  //**SMP assignment to sparse vectors************************************************************
// SMP assignment of the scaled vector/matrix product to a sparse vector; participates in
// overload resolution only when UseSMPAssign<VT1> holds.
// The expression is evaluated into a temporary of type ResultType, which is then passed
// to smpAssign together with the target.
// NOTE(review): the line carrying the function name and parameter list (4522) is not part
// of this listing; the body uses the parameters lhs (target) and rhs (expression).
4520  template< typename VT1 > // Type of the target sparse vector
4521  friend inline EnableIf_< UseSMPAssign<VT1> >
4523  {
4525
4529
4530  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4531
4532  const ResultType tmp( rhs );
4533  smpAssign( ~lhs, tmp );
4534  }
4535  //**********************************************************************************************
4536 
4537  //**SMP addition assignment to dense vectors****************************************************
// SMP addition assignment of the scaled vector/matrix product to a dense vector;
// participates in overload resolution only when UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (4553) is not part
// of this listing; the body uses the parameters lhs (target) and rhs (expression).
4551  template< typename VT1 > // Type of the target dense vector
4552  friend inline EnableIf_< UseSMPAssign<VT1> >
4554  {
4556
4557  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4558
// Pull the vector and matrix operands out of the nested multiplication expression.
4559  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4560  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4561
// A matrix with zero rows or zero columns leaves the target untouched.
4562  if( right.rows() == 0UL || right.columns() == 0UL ) {
4563  return;
4564  }
4565
4566  LT x( left ); // Evaluation of the left-hand side dense vector operand
4567  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4568
4569  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4570  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4571  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4572  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4573
// Rebuild the scaled product from the evaluated operands and hand it to smpAddAssign.
4574  smpAddAssign( ~lhs, x * A * rhs.scalar_ );
4575  }
4576  //**********************************************************************************************
4577 
4578  //**SMP addition assignment to sparse vectors***************************************************
4579  // No special implementation for the SMP addition assignment to sparse vectors.
4580  //**********************************************************************************************
4581 
4582  //**SMP subtraction assignment to dense vectors*************************************************
// SMP subtraction assignment of the scaled vector/matrix product to a dense vector;
// participates in overload resolution only when UseSMPAssign<VT1> holds.
// NOTE(review): the line carrying the function name and parameter list (4598) is not part
// of this listing; the body uses the parameters lhs (target) and rhs (expression).
4596  template< typename VT1 > // Type of the target dense vector
4597  friend inline EnableIf_< UseSMPAssign<VT1> >
4599  {
4601
4602  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4603
// Pull the vector and matrix operands out of the nested multiplication expression.
4604  LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4605  RightOperand_<VMM> right( rhs.vector_.rightOperand() );
4606
// A matrix with zero rows or zero columns leaves the target untouched.
4607  if( right.rows() == 0UL || right.columns() == 0UL ) {
4608  return;
4609  }
4610
4611  LT x( left ); // Evaluation of the left-hand side dense vector operand
4612  RT A( right ); // Evaluation of the right-hand side dense matrix operand
4613
4614  BLAZE_INTERNAL_ASSERT( x.size() == left.size() , "Invalid vector size" );
4615  BLAZE_INTERNAL_ASSERT( A.rows() == right.rows() , "Invalid number of rows" );
4616  BLAZE_INTERNAL_ASSERT( A.columns() == right.columns(), "Invalid number of columns" );
4617  BLAZE_INTERNAL_ASSERT( A.columns() == (~lhs).size() , "Invalid vector size" );
4618
// Rebuild the scaled product from the evaluated operands and hand it to smpSubAssign.
4619  smpSubAssign( ~lhs, x * A * rhs.scalar_ );
4620  }
4621  //**********************************************************************************************
4622 
4623  //**SMP subtraction assignment to sparse vectors************************************************
4624  // No special implementation for the SMP subtraction assignment to sparse vectors.
4625  //**********************************************************************************************
4626 
4627  //**SMP multiplication assignment to dense vectors**********************************************
// SMP multiplication assignment of the scaled vector/matrix product to a dense vector;
// participates in overload resolution only when UseSMPAssign<VT1> holds.
// The expression is evaluated into a temporary of type ResultType, which is then passed
// to smpMultAssign together with the target.
// NOTE(review): the line carrying the function name and parameter list (4644) is not part
// of this listing; the body uses the parameters lhs (target) and rhs (expression).
4642  template< typename VT1 > // Type of the target dense vector
4643  friend inline EnableIf_< UseSMPAssign<VT1> >
4645  {
4647
4651
4652  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4653
4654  const ResultType tmp( rhs );
4655  smpMultAssign( ~lhs, tmp );
4656  }
4657  //**********************************************************************************************
4658 
4659  //**SMP multiplication assignment to sparse vectors*********************************************
4660  // No special implementation for the SMP multiplication assignment to sparse vectors.
4661  //**********************************************************************************************
4662 
4663  //**SMP division assignment to dense vectors****************************************************
// SMP division assignment of the scaled vector/matrix product to a dense vector;
// participates in overload resolution only when UseSMPAssign<VT1> holds.
// The expression is evaluated into a temporary of type ResultType, which is then passed
// to smpDivAssign together with the target.
// NOTE(review): the line carrying the function name and parameter list (4679) is not part
// of this listing; the body uses the parameters lhs (target) and rhs (expression).
4677  template< typename VT1 > // Type of the target dense vector
4678  friend inline EnableIf_< UseSMPAssign<VT1> >
4680  {
4682
4686
4687  BLAZE_INTERNAL_ASSERT( (~lhs).size() == rhs.size(), "Invalid vector sizes" );
4688
4689  const ResultType tmp( rhs );
4690  smpDivAssign( ~lhs, tmp );
4691  }
4692  //**********************************************************************************************
4693 
4694  //**SMP division assignment to sparse vectors***************************************************
4695  // No special implementation for the SMP division assignment to sparse vectors.
4696  //**********************************************************************************************
4697 
4698  //**Compile time checks*************************************************************************
4707  //**********************************************************************************************
4708 };
4710 //*************************************************************************************************
4711 
4712 
4713 
4714 
4715 //=================================================================================================
4716 //
4717 // GLOBAL BINARY ARITHMETIC OPERATORS
4718 //
4719 //=================================================================================================
4720 
4721 //*************************************************************************************************
// Multiplication operator for a transpose dense vector and a row-major dense matrix.
//   vec : left-hand side dense vector
//   mat : right-hand side dense matrix
// Returns a const TDVecDMatMultExpr<VT,MT> built from the two operands; no product is
// computed in this function itself.
// Raises an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT when the vector size
// differs from the number of matrix rows.
4752 template< typename VT // Type of the left-hand side dense vector
4753  , typename MT > // Type of the right-hand side dense matrix
4754 inline decltype(auto)
4755  operator*( const DenseVector<VT,true>& vec, const DenseMatrix<MT,false>& mat )
4756 {
4758
4760
4761  if( (~vec).size() != (~mat).rows() ) {
4762  BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
4763  }
4764
4765  using ReturnType = const TDVecDMatMultExpr<VT,MT>;
4766  return ReturnType( ~vec, ~mat );
4767 }
4768 //*************************************************************************************************
4769 
4770 
4771 
4772 
4773 //=================================================================================================
4774 //
4775 // GLOBAL RESTRUCTURING BINARY ARITHMETIC OPERATORS
4776 //
4777 //=================================================================================================
4778 
4779 //*************************************************************************************************
// Restructuring multiplication operator for a transpose dense vector and a matrix/matrix
// multiplication expression.
//   vec : left-hand side dense vector
//   mat : right-hand side matrix/matrix multiplication expression
// The product is re-associated: the vector is multiplied with the left operand of the
// matrix product first, and that result is then multiplied with the right operand.
4793 template< typename VT // Type of the left-hand side dense vector
4794  , typename MT > // Matrix base type of the right-hand side expression
4795 inline decltype(auto)
4796  operator*( const DenseVector<VT,true>& vec, const MatMatMultExpr<MT>& mat )
4797 {
4799
4800  return ( vec * (~mat).leftOperand() ) * (~mat).rightOperand();
4801 }
4803 //*************************************************************************************************
4804 
4805 
4806 
4807 
4808 //=================================================================================================
4809 //
4810 // SIZE SPECIALIZATIONS
4811 //
4812 //=================================================================================================
4813 
4814 //*************************************************************************************************
// Size trait specialization for TDVecDMatMultExpr: it inherits from Columns<MT>, so the
// compile-time size of the expression is whatever Columns reports for the matrix operand.
4816 template< typename VT, typename MT >
4817 struct Size< TDVecDMatMultExpr<VT,MT> >
4818  : public Columns<MT>
4819 {};
4821 //*************************************************************************************************
4822 
4823 
4824 
4825 
4826 //=================================================================================================
4827 //
4828 // ISALIGNED SPECIALIZATIONS
4829 //
4830 //=================================================================================================
4831 
4832 //*************************************************************************************************
// IsAligned trait specialization for TDVecDMatMultExpr: the expression is reported as
// aligned only when both IsAligned<VT> and IsAligned<MT> hold.
4834 template< typename VT, typename MT >
4835 struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4836  : public BoolConstant< And< IsAligned<VT>, IsAligned<MT> >::value >
4837 {};
4839 //*************************************************************************************************
4840 
4841 } // namespace blaze
4842 
4843 #endif
If_< IsExpression< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:213
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Data type constraint.
Headerfile for the generic min algorithm.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:206
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:129
Header file for the IsDiagonal type trait.
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:294
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:128
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:521
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1762
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:361
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:383
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Column< MT > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:124
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:327
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1809
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:351
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:131
Headerfile for the generic max algorithm.
Header file for the DisableIf class template.
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:132
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:208
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:262
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the Columns type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:207
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:88
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsTriangular type trait.
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:384
decltype(auto) operator*(const DenseMatrix< MT1, false > &lhs, const DenseMatrix< MT2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A = B * C$ ).
Definition: DMatDMatMultExpr.h:8893
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:133
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:248
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:222
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:593
If_< IsExpression< MT >, const MT, const MT &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:216
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:371
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:209
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:219
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:130
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Header file for the TVecMatMultExpr base class.
Expression object for transpose dense vector-dense matrix multiplications.The TDVecDMatMultExpr class...
Definition: Forward.h:156
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:307
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
Header file for BLAS general matrix/vector multiplication functions (gemv)
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:339
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:324
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:109
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:210
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:106
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of a vector.The Size type trait evaluates the size of the given v...
Definition: Size.h:74
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:317
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:75
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:205
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.