Blaze  3.6
DMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/dense/MMM.h>
53 #include <blaze/math/Exception.h>
66 #include <blaze/math/shims/Reset.h>
68 #include <blaze/math/SIMD.h>
96 #include <blaze/math/views/Check.h>
97 #include <blaze/system/BLAS.h>
98 #include <blaze/system/Blocking.h>
100 #include <blaze/system/Thresholds.h>
103 #include <blaze/util/Assert.h>
104 #include <blaze/util/Complex.h>
107 #include <blaze/util/DisableIf.h>
108 #include <blaze/util/EnableIf.h>
111 #include <blaze/util/mpl/If.h>
112 #include <blaze/util/Types.h>
120 
121 
122 namespace blaze {
123 
124 //=================================================================================================
125 //
126 // CLASS DMATTDMATMULTEXPR
127 //
128 //=================================================================================================
129 
130 //*************************************************************************************************
137 template< typename MT1 // Type of the left-hand side dense matrix
138  , typename MT2 // Type of the right-hand side dense matrix
139  , bool SF // Symmetry flag
140  , bool HF // Hermitian flag
141  , bool LF // Lower flag
142  , bool UF > // Upper flag
144  : public MatMatMultExpr< DenseMatrix< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
145  , private Computation
146 {
147  private:
148  //**Type definitions****************************************************************************
155  //**********************************************************************************************
156 
157  //**********************************************************************************************
159  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
160  //**********************************************************************************************
161 
162  //**********************************************************************************************
164  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
165  //**********************************************************************************************
166 
167  //**********************************************************************************************
168  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
169  static constexpr bool HERM = ( HF && !( LF || UF ) );
170  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
171  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
172  //**********************************************************************************************
173 
174  //**********************************************************************************************
176 
180  template< typename T1, typename T2, typename T3 >
181  static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
183  //**********************************************************************************************
184 
185  //**********************************************************************************************
187 
190  template< typename T1, typename T2, typename T3 >
191  static constexpr bool UseBlasKernel_v =
192  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
193  !SYM && !HERM && !LOW && !UPP &&
194  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
195  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
196  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
197  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
198  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
199  IsBLASCompatible_v< ElementType_t<T1> > &&
200  IsBLASCompatible_v< ElementType_t<T2> > &&
201  IsBLASCompatible_v< ElementType_t<T3> > &&
202  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
203  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
205  //**********************************************************************************************
206 
207  //**********************************************************************************************
209 
// Helper variable template: true if the vectorized default kernel can be used for the target
// type T1 and the operand types T2 and T3. As written it requires useOptimizedKernels,
// non-diagonal operands, SIMD-enabled types for all three matrices, SIMD-combinable element
// types, and SIMD addition and multiplication for the two operand element types.
// NOTE(review): the embedded listing numbers jump from 217 to 219 inside the
// IsSIMDCombinable_v argument list, so one argument line appears to be missing from this
// listing (presumably the element type of T2) -- confirm against the original header.
212  template< typename T1, typename T2, typename T3 >
213  static constexpr bool UseVectorizedDefaultKernel_v =
214  ( useOptimizedKernels &&
215  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
216  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
217  IsSIMDCombinable_v< ElementType_t<T1>
219  , ElementType_t<T3> > &&
220  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
221  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
223  //**********************************************************************************************
224 
225  //**********************************************************************************************
227 
230  using ForwardFunctor = If_t< HERM
231  , DeclHerm
232  , If_t< SYM
233  , DeclSym
234  , If_t< LOW
235  , If_t< UPP
236  , DeclDiag
237  , DeclLow >
238  , If_t< UPP
239  , DeclUpp
240  , Noop > > > >;
242  //**********************************************************************************************
243 
244  public:
245  //**Type definitions****************************************************************************
248 
251 
// Result type of the multiplication expression, selected by the effective structure flags
// HERM, SYM, LOW and UPP through a nested If_t chain.
// NOTE(review): the embedded listing numbers skip 254, 256, 259-260 and 262, i.e. the types
// chosen in most branches are not visible in this listing. Only the final fallback
// MultTrait<RT1,RT2> can be read here; restore the missing lines from the original header
// before relying on this definition.
253  using ResultType = typename If_t< HERM
255  , If_t< SYM
257  , If_t< LOW
258  , If_t< UPP
261  , If_t< UPP
263  , MultTrait<RT1,RT2> > > > >::Type;
264 
269  using ReturnType = const ElementType;
270  using CompositeType = const ResultType;
271 
273  using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
274 
276  using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
277 
280 
283  //**********************************************************************************************
284 
285  //**Compilation flags***************************************************************************
287  static constexpr bool simdEnabled =
288  ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
289  MT1::simdEnabled && MT2::simdEnabled &&
290  HasSIMDAdd_v<ET1,ET2> &&
291  HasSIMDMult_v<ET1,ET2> );
292 
294  static constexpr bool smpAssignable =
295  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
296  //**********************************************************************************************
297 
298  //**SIMD properties*****************************************************************************
300  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
301  //**********************************************************************************************
302 
303  //**Constructor*********************************************************************************
309  explicit inline DMatTDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
310  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
311  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
312  {
313  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
314  }
315  //**********************************************************************************************
316 
317  //**Access operator*****************************************************************************
// Computes the element (i,j) of the product on demand.
//
// i: row index, j: column index. Both are only checked by internal assertions (see at() for
// the checked variant). The result is returned by value.
//
// The structure of the operand types is used to shorten the inner product:
//  - diagonal left operand : single product lhs_(i,i) * rhs_(i,j)
//  - diagonal right operand: single product lhs_(i,j) * rhs_(j,j)
//  - triangular operand(s) : inner product over the index range [begin,end) only
//  - otherwise             : full inner product of row i of lhs_ and column j of rhs_
324  inline ReturnType operator()( size_t i, size_t j ) const {
325  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
326  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
327 
328  if( IsDiagonal_v<MT1> ) {
329  return lhs_(i,i) * rhs_(i,j);
330  }
331  else if( IsDiagonal_v<MT2> ) {
332  return lhs_(i,j) * rhs_(j,j);
333  }
334  else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
// First inner index to visit: bounded from below by i for an upper left operand and by j
// for a lower right operand (shifted by one for the strictly triangular variants).
335  const size_t begin( ( IsUpper_v<MT1> )
336  ?( ( IsLower_v<MT2> )
337  ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
338  , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
339  :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
340  :( ( IsLower_v<MT2> )
341  ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
342  :( 0UL ) ) );
// One past the last inner index to visit: bounded from above by i+1 for a lower left operand
// and by j+1 for an upper right operand (reduced by one for the strictly triangular variants).
343  const size_t end( ( IsLower_v<MT1> )
344  ?( ( IsUpper_v<MT2> )
345  ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
346  , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
347  :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
348  :( ( IsUpper_v<MT2> )
349  ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
350  :( lhs_.columns() ) ) );
351 
// Empty index range: return a default-constructed element.
352  if( begin >= end ) return ElementType();
353 
354  const size_t n( end - begin );
355 
// Inner product of the two matching sub-ranges of row i and column j.
356  return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
357  subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
358  }
359  else {
360  return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
361  }
362  }
363  //**********************************************************************************************
364 
365  //**At function*********************************************************************************
373  inline ReturnType at( size_t i, size_t j ) const {
374  if( i >= lhs_.rows() ) {
375  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
376  }
377  if( j >= rhs_.columns() ) {
378  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
379  }
380  return (*this)(i,j);
381  }
382  //**********************************************************************************************
383 
384  //**Rows function*******************************************************************************
389  inline size_t rows() const noexcept {
390  return lhs_.rows();
391  }
392  //**********************************************************************************************
393 
394  //**Columns function****************************************************************************
399  inline size_t columns() const noexcept {
400  return rhs_.columns();
401  }
402  //**********************************************************************************************
403 
404  //**Left operand access*************************************************************************
409  inline LeftOperand leftOperand() const noexcept {
410  return lhs_;
411  }
412  //**********************************************************************************************
413 
414  //**Right operand access************************************************************************
419  inline RightOperand rightOperand() const noexcept {
420  return rhs_;
421  }
422  //**********************************************************************************************
423 
424  //**********************************************************************************************
430  template< typename T >
431  inline bool canAlias( const T* alias ) const noexcept {
432  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
433  }
434  //**********************************************************************************************
435 
436  //**********************************************************************************************
442  template< typename T >
443  inline bool isAliased( const T* alias ) const noexcept {
444  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
445  }
446  //**********************************************************************************************
447 
448  //**********************************************************************************************
453  inline bool isAligned() const noexcept {
454  return lhs_.isAligned() && rhs_.isAligned();
455  }
456  //**********************************************************************************************
457 
458  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments.
//
// As written, the result is true only if
//  - BLAS is disabled (in general or for matrix/matrix multiplications) or the result has
//    fewer than DMATTDMATMULT_THRESHOLD elements, and
//  - the result has at least SMP_DMATTDMATMULT_THRESHOLD elements, and
//  - neither operand is diagonal.
// NOTE(review): the embedded listing numbers jump from 465 to 467 inside the first
// parenthesized condition, so one alternative of that disjunction appears to be missing
// from this listing -- confirm against the original header.
463  inline bool canSMPAssign() const noexcept {
464  return ( !BLAZE_BLAS_MODE ||
465  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
467  ( rows() * columns() < DMATTDMATMULT_THRESHOLD ) ) &&
468  ( rows() * columns() >= SMP_DMATTDMATMULT_THRESHOLD ) &&
469  !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
470  }
471  //**********************************************************************************************
472 
473  private:
474  //**Member variables****************************************************************************
477  //**********************************************************************************************
478 
479  //**Assignment to dense matrices****************************************************************
492  template< typename MT // Type of the target dense matrix
493  , bool SO > // Storage order of the target dense matrix
494  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
495  {
497 
498  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
499  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
500 
501  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
502  return;
503  }
504  else if( rhs.lhs_.columns() == 0UL ) {
505  reset( ~lhs );
506  return;
507  }
508 
509  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
510  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
511 
512  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
513  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
514  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
515  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
516  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
517  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
518 
519  DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
520  }
522  //**********************************************************************************************
523 
524  //**Assignment to dense matrices (kernel selection)*********************************************
535  template< typename MT3 // Type of the left-hand side target matrix
536  , typename MT4 // Type of the left-hand side matrix operand
537  , typename MT5 > // Type of the right-hand side matrix operand
538  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
539  {
540  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
541  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
542  selectSmallAssignKernel( C, A, B );
543  else
544  selectBlasAssignKernel( C, A, B );
545  }
547  //**********************************************************************************************
548 
549  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (scalar) assignment kernel C = A * B for a row-major target matrix, enabled only if
// neither operand is diagonal.
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
//
// The kernel walks the target row by row. For every row it derives from the triangular
// structure of A and B and from the SYM/HERM/LOW/UPP flags the column range [jbegin,jend)
// that is actually computed; the remaining elements of the row are reset. For SYM and HERM
// only the part from the diagonal to the right is computed and the part left of the diagonal
// is filled by the mirroring pass at the end.
563  template< typename MT3 // Type of the left-hand side target matrix
564  , typename MT4 // Type of the left-hand side matrix operand
565  , typename MT5 > // Type of the right-hand side matrix operand
566  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
567  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
568  {
569  const size_t M( A.rows() );
570  const size_t N( B.columns() );
571  const size_t K( A.columns() );
572 
573  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
574 
// Rows outside of [ibegin,iend) are reset completely: leading rows for a strictly lower A,
// trailing rows for a strictly upper A (two rows if B is strictly triangular as well).
575  const size_t ibegin( ( IsStrictlyLower_v<MT4> )
576  ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
577  :( 0UL ) );
578  const size_t iend( ( IsStrictlyUpper_v<MT4> )
579  ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
580  :( M ) );
581  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
582 
583  for( size_t i=0UL; i<ibegin; ++i ) {
584  for( size_t j=0UL; j<N; ++j ) {
585  reset( C(i,j) );
586  }
587  }
588  for( size_t i=ibegin; i<iend; ++i )
589  {
// Column range of row i that is computed. With SYM, HERM or UPP the range does not start
// before column i; with LOW it does not extend beyond column i.
590  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
591  ?( ( IsStrictlyUpper_v<MT4> )
592  ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
593  :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
594  :( ( IsStrictlyUpper_v<MT5> )
595  ?( SYM || HERM || UPP ? max( i, 1UL ) : 1UL )
596  :( SYM || HERM || UPP ? i : 0UL ) ) );
597  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
598  ?( ( IsStrictlyLower_v<MT4> )
599  ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
600  :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
601  :( ( IsStrictlyLower_v<MT5> )
602  ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
603  :( LOW ? i+1UL : N ) ) );
604 
// With an active structure flag the column range may be empty; the whole row is reset then.
605  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
606  for( size_t j=0UL; j<N; ++j ) {
607  reset( C(i,j) );
608  }
609  continue;
610  }
611 
612  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
613 
// Reset the elements in front of the computed range. For SYM/HERM the elements left of the
// diagonal are skipped here because the mirroring pass below writes them.
614  for( size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
615  reset( C(i,j) );
616  }
617  for( size_t j=jbegin; j<jend; ++j )
618  {
// Inner index range [kbegin,kend) restricted by the triangular structure of A and B.
619  const size_t kbegin( ( IsUpper_v<MT4> )
620  ?( ( IsLower_v<MT5> )
621  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
622  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
623  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
624  :( ( IsLower_v<MT5> )
625  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
626  :( 0UL ) ) );
627  const size_t kend( ( IsLower_v<MT4> )
628  ?( ( IsUpper_v<MT5> )
629  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
630  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
631  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
632  :( ( IsUpper_v<MT5> )
633  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
634  :( K ) ) );
635  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
636 
// The first product is assigned (not added), so C(i,j) does not have to be zeroed beforehand.
637  C(i,j) = A(i,kbegin) * B(kbegin,j);
638  for( size_t k=kbegin+1UL; k<kend; ++k ) {
639  C(i,j) += A(i,k) * B(k,j);
640  }
641  }
// Reset the elements behind the computed range.
642  for( size_t j=jend; j<N; ++j ) {
643  reset( C(i,j) );
644  }
645  }
646  for( size_t i=iend; i<M; ++i ) {
647  for( size_t j=0UL; j<N; ++j ) {
648  reset( C(i,j) );
649  }
650  }
651 
// Mirroring pass for SYM/HERM: copy the computed upper part into the lower part, using the
// complex conjugate in the Hermitian case.
652  if( SYM || HERM ) {
653  for( size_t i=1UL; i<M; ++i ) {
654  for( size_t j=0UL; j<i; ++j ) {
655  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
656  }
657  }
658  }
659  }
661  //**********************************************************************************************
662 
663  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (scalar) assignment kernel C = A * B for a column-major target matrix, enabled only
// if neither operand is diagonal.
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
//
// The kernel walks the target column by column. For every column it derives from the
// triangular structure of A and B and from the SYM/HERM/LOW/UPP flags the row range
// [ibegin,iend) that is actually computed; the remaining elements of the column are reset.
// For SYM and HERM only the part from the diagonal downwards is computed and the part above
// the diagonal is filled by the mirroring pass at the end.
677  template< typename MT3 // Type of the left-hand side target matrix
678  , typename MT4 // Type of the left-hand side matrix operand
679  , typename MT5 > // Type of the right-hand side matrix operand
680  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
681  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
682  {
683  const size_t M( A.rows() );
684  const size_t N( B.columns() );
685  const size_t K( A.columns() );
686 
687  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
688 
// Columns outside of [jbegin,jend) are reset completely: leading columns for a strictly upper
// B, trailing columns for a strictly lower B (two columns if A is strictly triangular as well).
689  const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
690  ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
691  :( 0UL ) );
692  const size_t jend( ( IsStrictlyLower_v<MT5> )
693  ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
694  :( N ) );
695  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
696 
697  for( size_t j=0UL; j<jbegin; ++j ) {
698  for( size_t i=0UL; i<M; ++i ) {
699  reset( C(i,j) );
700  }
701  }
702  for( size_t j=jbegin; j<jend; ++j )
703  {
// Row range of column j that is computed. With SYM, HERM or LOW the range does not start
// before row j; with UPP it does not extend beyond row j.
704  const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
705  ?( ( IsStrictlyLower_v<MT4> )
706  ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
707  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
708  :( ( IsStrictlyLower_v<MT4> )
709  ?( SYM || HERM || LOW ? max( j, 1UL ) : 1UL )
710  :( SYM || HERM || LOW ? j : 0UL ) ) );
711  const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
712  ?( ( IsStrictlyUpper_v<MT4> )
713  ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
714  :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
715  :( ( IsStrictlyUpper_v<MT4> )
716  ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
717  :( UPP ? j+1UL : M ) ) );
718 
// With an active structure flag the row range may be empty; the whole column is reset then.
719  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
720  for( size_t i=0UL; i<M; ++i ) {
721  reset( C(i,j) );
722  }
723  continue;
724  }
725 
726  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
727 
// Reset the elements in front of the computed range. For SYM/HERM the elements above the
// diagonal are skipped here because the mirroring pass below writes them.
728  for( size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
729  reset( C(i,j) );
730  }
731  for( size_t i=ibegin; i<iend; ++i )
732  {
// Inner index range [kbegin,kend) restricted by the triangular structure of A and B.
733  const size_t kbegin( ( IsUpper_v<MT4> )
734  ?( ( IsLower_v<MT5> )
735  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
736  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
737  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
738  :( ( IsLower_v<MT5> )
739  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
740  :( 0UL ) ) );
741  const size_t kend( ( IsLower_v<MT4> )
742  ?( ( IsUpper_v<MT5> )
743  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
744  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
745  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
746  :( ( IsUpper_v<MT5> )
747  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
748  :( K ) ) );
749  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
750 
// The first product is assigned (not added), so C(i,j) does not have to be zeroed beforehand.
751  C(i,j) = A(i,kbegin) * B(kbegin,j);
752  for( size_t k=kbegin+1UL; k<kend; ++k ) {
753  C(i,j) += A(i,k) * B(k,j);
754  }
755  }
// Reset the elements behind the computed range.
756  for( size_t i=iend; i<M; ++i ) {
757  reset( C(i,j) );
758  }
759  }
760  for( size_t j=jend; j<N; ++j ) {
761  for( size_t i=0UL; i<M; ++i ) {
762  reset( C(i,j) );
763  }
764  }
765 
// Mirroring pass for SYM/HERM: copy the computed lower part into the upper part, using the
// complex conjugate in the Hermitian case.
766  if( SYM || HERM ) {
767  for( size_t j=1UL; j<N; ++j ) {
768  for( size_t i=0UL; i<j; ++i ) {
769  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
770  }
771  }
772  }
773  }
775  //**********************************************************************************************
776 
777  //**Default assignment to row-major dense matrices (general/diagonal)***************************
791  template< typename MT3 // Type of the left-hand side target matrix
792  , typename MT4 // Type of the left-hand side matrix operand
793  , typename MT5 > // Type of the right-hand side matrix operand
794  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
795  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
796  {
797  const size_t M( A.rows() );
798  const size_t N( B.columns() );
799 
800  for( size_t i=0UL; i<M; ++i )
801  {
802  const size_t jbegin( ( IsUpper_v<MT4> )
803  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
804  :( 0UL ) );
805  const size_t jend( ( IsLower_v<MT4> )
806  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
807  :( N ) );
808  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
809 
810  if( IsUpper_v<MT4> ) {
811  for( size_t j=0UL; j<jbegin; ++j ) {
812  reset( C(i,j) );
813  }
814  }
815  for( size_t j=jbegin; j<jend; ++j ) {
816  C(i,j) = A(i,j) * B(j,j);
817  }
818  if( IsLower_v<MT4> ) {
819  for( size_t j=jend; j<N; ++j ) {
820  reset( C(i,j) );
821  }
822  }
823  }
824  }
826  //**********************************************************************************************
827 
828  //**Default assignment to column-major dense matrices (general/diagonal)************************
842  template< typename MT3 // Type of the left-hand side target matrix
843  , typename MT4 // Type of the left-hand side matrix operand
844  , typename MT5 > // Type of the right-hand side matrix operand
845  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
846  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
847  {
848  constexpr size_t block( BLOCK_SIZE );
849 
850  const size_t M( A.rows() );
851  const size_t N( B.columns() );
852 
853  for( size_t jj=0UL; jj<N; jj+=block ) {
854  const size_t jend( min( N, jj+block ) );
855  for( size_t ii=0UL; ii<M; ii+=block ) {
856  const size_t iend( min( M, ii+block ) );
857  for( size_t j=jj; j<jend; ++j )
858  {
859  const size_t ibegin( ( IsLower_v<MT4> )
860  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
861  :( ii ) );
862  const size_t ipos( ( IsUpper_v<MT4> )
863  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
864  :( iend ) );
865 
866  if( IsLower_v<MT4> ) {
867  for( size_t i=ii; i<ibegin; ++i ) {
868  reset( C(i,j) );
869  }
870  }
871  for( size_t i=ibegin; i<ipos; ++i ) {
872  C(i,j) = A(i,j) * B(j,j);
873  }
874  if( IsUpper_v<MT4> ) {
875  for( size_t i=ipos; i<iend; ++i ) {
876  reset( C(i,j) );
877  }
878  }
879  }
880  }
881  }
882  }
884  //**********************************************************************************************
885 
886  //**Default assignment to row-major dense matrices (diagonal/general)***************************
900  template< typename MT3 // Type of the left-hand side target matrix
901  , typename MT4 // Type of the left-hand side matrix operand
902  , typename MT5 > // Type of the right-hand side matrix operand
903  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
904  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
905  {
906  constexpr size_t block( BLOCK_SIZE );
907 
908  const size_t M( A.rows() );
909  const size_t N( B.columns() );
910 
911  for( size_t ii=0UL; ii<M; ii+=block ) {
912  const size_t iend( min( M, ii+block ) );
913  for( size_t jj=0UL; jj<N; jj+=block ) {
914  const size_t jend( min( N, jj+block ) );
915  for( size_t i=ii; i<iend; ++i )
916  {
917  const size_t jbegin( ( IsUpper_v<MT5> )
918  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
919  :( jj ) );
920  const size_t jpos( ( IsLower_v<MT5> )
921  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
922  :( jend ) );
923 
924  if( IsUpper_v<MT5> ) {
925  for( size_t j=jj; j<jbegin; ++j ) {
926  reset( C(i,j) );
927  }
928  }
929  for( size_t j=jbegin; j<jpos; ++j ) {
930  C(i,j) = A(i,i) * B(i,j);
931  }
932  if( IsLower_v<MT5> ) {
933  for( size_t j=jpos; j<jend; ++j ) {
934  reset( C(i,j) );
935  }
936  }
937  }
938  }
939  }
940  }
942  //**********************************************************************************************
943 
944  //**Default assignment to column-major dense matrices (diagonal/general)************************
958  template< typename MT3 // Type of the left-hand side target matrix
959  , typename MT4 // Type of the left-hand side matrix operand
960  , typename MT5 > // Type of the right-hand side matrix operand
961  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
962  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
963  {
964  const size_t M( A.rows() );
965  const size_t N( B.columns() );
966 
967  for( size_t j=0UL; j<N; ++j )
968  {
969  const size_t ibegin( ( IsLower_v<MT5> )
970  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
971  :( 0UL ) );
972  const size_t iend( ( IsUpper_v<MT5> )
973  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
974  :( M ) );
975  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
976 
977  if( IsLower_v<MT5> ) {
978  for( size_t i=0UL; i<ibegin; ++i ) {
979  reset( C(i,j) );
980  }
981  }
982  for( size_t i=ibegin; i<iend; ++i ) {
983  C(i,j) = A(i,i) * B(i,j);
984  }
985  if( IsUpper_v<MT5> ) {
986  for( size_t i=iend; i<M; ++i ) {
987  reset( C(i,j) );
988  }
989  }
990  }
991  }
993  //**********************************************************************************************
994 
995  //**Default assignment to dense matrices (diagonal/diagonal)************************************
1009  template< typename MT3 // Type of the left-hand side target matrix
1010  , typename MT4 // Type of the left-hand side matrix operand
1011  , typename MT5 > // Type of the right-hand side matrix operand
1012  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
1013  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1014  {
1015  reset( C );
1016 
1017  for( size_t i=0UL; i<A.rows(); ++i ) {
1018  C(i,i) = A(i,i) * B(i,i);
1019  }
1020  }
1022  //**********************************************************************************************
1023 
1024  //**Default assignment to dense matrices (small matrices)***************************************
1038  template< typename MT3 // Type of the left-hand side target matrix
1039  , typename MT4 // Type of the left-hand side matrix operand
1040  , typename MT5 > // Type of the right-hand side matrix operand
1041  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1042  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1043  {
1044  selectDefaultAssignKernel( C, A, B );
1045  }
1047  //**********************************************************************************************
1048 
1049  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
1064  template< typename MT3 // Type of the left-hand side target matrix
1065  , typename MT4 // Type of the left-hand side matrix operand
1066  , typename MT5 > // Type of the right-hand side matrix operand
1067  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1068  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1069  {
1070  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
1071 
1072  const size_t M( A.rows() );
1073  const size_t N( B.columns() );
1074  const size_t K( A.columns() );
1075 
1076  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1077 
1078  size_t i( 0UL );
1079 
1080  for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
1081  {
1082  const size_t jend( LOW ? i+2UL : N );
1083  size_t j( 0UL );
1084 
1085  if( SYM || HERM ) {
1086  for( ; j<i; ++j ) {
1087  C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
1088  C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
1089  }
1090  }
1091  else if( UPP ) {
1092  for( ; j<i; ++j ) {
1093  reset( C(i ,j) );
1094  reset( C(i+1UL,j) );
1095  }
1096  }
1097 
1098  for( ; (j+4UL) <= jend; j+=4UL )
1099  {
1100  const size_t kbegin( ( IsUpper_v<MT4> )
1101  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1102  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1103  const size_t kend( ( IsLower_v<MT4> )
1104  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
1105  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
1106 
1107  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1108  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1109 
1110  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1111  size_t k( kbegin );
1112 
1113  for( ; k<kpos; k+=SIMDSIZE ) {
1114  const SIMDType a1( A.load(i ,k) );
1115  const SIMDType a2( A.load(i+1UL,k) );
1116  const SIMDType b1( B.load(k,j ) );
1117  const SIMDType b2( B.load(k,j+1UL) );
1118  const SIMDType b3( B.load(k,j+2UL) );
1119  const SIMDType b4( B.load(k,j+3UL) );
1120  xmm1 += a1 * b1;
1121  xmm2 += a1 * b2;
1122  xmm3 += a1 * b3;
1123  xmm4 += a1 * b4;
1124  xmm5 += a2 * b1;
1125  xmm6 += a2 * b2;
1126  xmm7 += a2 * b3;
1127  xmm8 += a2 * b4;
1128  }
1129 
1130  C(i ,j ) = sum( xmm1 );
1131  C(i ,j+1UL) = sum( xmm2 );
1132  C(i ,j+2UL) = sum( xmm3 );
1133  C(i ,j+3UL) = sum( xmm4 );
1134  C(i+1UL,j ) = sum( xmm5 );
1135  C(i+1UL,j+1UL) = sum( xmm6 );
1136  C(i+1UL,j+2UL) = sum( xmm7 );
1137  C(i+1UL,j+3UL) = sum( xmm8 );
1138 
1139  for( ; remainder && k<kend; ++k ) {
1140  C(i ,j ) += A(i ,k) * B(k,j );
1141  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1142  C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1143  C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1144  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1145  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1146  C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1147  C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
1148  }
1149  }
1150 
1151  for( ; (j+2UL) <= jend; j+=2UL )
1152  {
1153  const size_t kbegin( ( IsUpper_v<MT4> )
1154  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1155  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1156  const size_t kend( ( IsLower_v<MT4> )
1157  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
1158  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1159 
1160  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1161  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1162 
1163  SIMDType xmm1, xmm2, xmm3, xmm4;
1164  size_t k( kbegin );
1165 
1166  for( ; k<kpos; k+=SIMDSIZE ) {
1167  const SIMDType a1( A.load(i ,k) );
1168  const SIMDType a2( A.load(i+1UL,k) );
1169  const SIMDType b1( B.load(k,j ) );
1170  const SIMDType b2( B.load(k,j+1UL) );
1171  xmm1 += a1 * b1;
1172  xmm2 += a1 * b2;
1173  xmm3 += a2 * b1;
1174  xmm4 += a2 * b2;
1175  }
1176 
1177  C(i ,j ) = sum( xmm1 );
1178  C(i ,j+1UL) = sum( xmm2 );
1179  C(i+1UL,j ) = sum( xmm3 );
1180  C(i+1UL,j+1UL) = sum( xmm4 );
1181 
1182  for( ; remainder && k<kend; ++k ) {
1183  C(i ,j ) += A(i ,k) * B(k,j );
1184  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1185  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1186  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1187  }
1188  }
1189 
1190  if( j < jend )
1191  {
1192  const size_t kbegin( ( IsUpper_v<MT4> )
1193  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1194  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1195  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
1196 
1197  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1198  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1199 
1200  SIMDType xmm1, xmm2;
1201  size_t k( kbegin );
1202 
1203  for( ; k<kpos; k+=SIMDSIZE ) {
1204  const SIMDType b1( B.load(k,j) );
1205  xmm1 += A.load(i ,k) * b1;
1206  xmm2 += A.load(i+1UL,k) * b1;
1207  }
1208 
1209  C(i ,j) = sum( xmm1 );
1210  C(i+1UL,j) = sum( xmm2 );
1211 
1212  for( ; remainder && k<kend; ++k ) {
1213  C(i ,j) += A(i ,k) * B(k,j);
1214  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1215  }
1216 
1217  if( LOW ) ++j;
1218  }
1219 
1220  if( LOW ) {
1221  for( ; j<N; ++j ) {
1222  reset( C(i ,j) );
1223  reset( C(i+1UL,j) );
1224  }
1225  }
1226  }
1227 
1228  for( ; i<M; ++i )
1229  {
1230  const size_t jend( LOW ? i+1UL : N );
1231  size_t j( 0UL );
1232 
1233  if( SYM || HERM ) {
1234  for( ; j<i; ++j ) {
1235  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1236  }
1237  }
1238  else if( UPP ) {
1239  for( ; j<i; ++j ) {
1240  reset( C(i,j) );
1241  }
1242  }
1243 
1244  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
1245  {
1246  const size_t kbegin( ( IsUpper_v<MT4> )
1247  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1248  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1249  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
1250 
1251  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1252  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1253 
1254  SIMDType xmm1, xmm2, xmm3, xmm4;
1255  size_t k( kbegin );
1256 
1257  for( ; k<kpos; k+=SIMDSIZE ) {
1258  const SIMDType a1( A.load(i,k) );
1259  xmm1 += a1 * B.load(k,j );
1260  xmm2 += a1 * B.load(k,j+1UL);
1261  xmm3 += a1 * B.load(k,j+2UL);
1262  xmm4 += a1 * B.load(k,j+3UL);
1263  }
1264 
1265  C(i,j ) = sum( xmm1 );
1266  C(i,j+1UL) = sum( xmm2 );
1267  C(i,j+2UL) = sum( xmm3 );
1268  C(i,j+3UL) = sum( xmm4 );
1269 
1270  for( ; remainder && k<kend; ++k ) {
1271  C(i,j ) += A(i,k) * B(k,j );
1272  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1273  C(i,j+2UL) += A(i,k) * B(k,j+2UL);
1274  C(i,j+3UL) += A(i,k) * B(k,j+3UL);
1275  }
1276  }
1277 
1278  for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
1279  {
1280  const size_t kbegin( ( IsUpper_v<MT4> )
1281  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1282  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1283  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
1284 
1285  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1286  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1287 
1288  SIMDType xmm1, xmm2;
1289  size_t k( kbegin );
1290 
1291  for( ; k<kpos; k+=SIMDSIZE ) {
1292  const SIMDType a1( A.load(i,k) );
1293  xmm1 += a1 * B.load(k,j );
1294  xmm2 += a1 * B.load(k,j+1UL);
1295  }
1296 
1297  C(i,j ) = sum( xmm1 );
1298  C(i,j+1UL) = sum( xmm2 );
1299 
1300  for( ; remainder && k<kend; ++k ) {
1301  C(i,j ) += A(i,k) * B(k,j );
1302  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1303  }
1304  }
1305 
1306  if( j < jend )
1307  {
1308  const size_t kbegin( ( IsUpper_v<MT4> )
1309  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1310  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1311 
1312  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
1313  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1314 
1315  SIMDType xmm1;
1316  size_t k( kbegin );
1317 
1318  for( ; k<kpos; k+=SIMDSIZE ) {
1319  xmm1 += A.load(i,k) * B.load(k,j);
1320  }
1321 
1322  C(i,j) = sum( xmm1 );
1323 
1324  for( ; remainder && k<K; ++k ) {
1325  C(i,j) += A(i,k) * B(k,j);
1326  }
1327 
1328  if( LOW ) ++j;
1329  }
1330 
1331  if( LOW ) {
1332  for( ; j<N; ++j ) {
1333  reset( C(i,j) );
1334  }
1335  }
1336  }
1337  }
1339  //**********************************************************************************************
1340 
1341  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
/*!\brief Vectorized default assignment kernel for small products with a column-major target.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Every computed element C(i,j) is the sum over k of A(i,k) * B(k,j). The k range is walked in
// steps of SIMDSIZE via A.load(i,k) and B.load(k,j) and the per-element SIMD accumulators are
// collapsed with sum(). If at least one operand is not padded (\a remainder is \a true), the
// trailing k values that do not fill a whole SIMD step are added by a scalar loop. Rows are
// handled in blocks of four, then two, then one; within a row block the columns are handled in
// pairs followed by a single-column tail. The SYM/HERM/LOW/UPP flags restrict the computed
// region: mirrored elements are copied (conjugated for HERM) and elements outside the selected
// triangle are reset.
*/
1356  template< typename MT3 // Type of the left-hand side target matrix
1357  , typename MT4 // Type of the left-hand side matrix operand
1358  , typename MT5 > // Type of the right-hand side matrix operand
1359  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1360  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1361  {
      // A scalar clean-up loop over k is required unless both operands are padded
1362  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
1363 
1364  const size_t M( A.rows() );
1365  const size_t N( B.columns() );
1366  const size_t K( A.columns() );
1367 
1368  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1369 
1370  size_t i( 0UL );
1371 
      // Row blocks of four; skipped entirely when the result is both lower and upper
1372  for( ; !( LOW && UPP ) && (i+4UL) <= M; i+=4UL )
1373  {
      // For a lower result only the columns up to the end of this row block are computed
1374  const size_t jend( LOW ? i+4UL : N );
1375  size_t j( 0UL );
1376 
      // Columns left of the block: mirror the elements C(j,i..i+3) written by earlier row
      // iterations (conjugated for HERM)
1377  if( SYM || HERM ) {
1378  for( ; j<i; ++j ) {
1379  C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
1380  C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
1381  C(i+2UL,j) = HERM ? conj( C(j,i+2UL) ) : C(j,i+2UL);
1382  C(i+3UL,j) = HERM ? conj( C(j,i+3UL) ) : C(j,i+3UL);
1383  }
1384  }
      // Upper result: all columns left of the block are reset
1385  else if( UPP ) {
1386  for( ; j<i; ++j ) {
1387  reset( C(i ,j) );
1388  reset( C(i+1UL,j) );
1389  reset( C(i+2UL,j) );
1390  reset( C(i+3UL,j) );
1391  }
1392  }
1393 
      // 4x2 tile: four rows of A against two columns of B
1394  for( ; (j+2UL) <= jend; j+=2UL )
1395  {
      // The k range is narrowed according to the lower/upper structure of the operands.
      // kbegin is masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE
      // (NOTE(review): this relies on SIMDSIZE being a power of two - confirm).
1396  const size_t kbegin( ( IsUpper_v<MT4> )
1397  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1398  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1399  const size_t kend( ( IsLower_v<MT4> )
1400  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
1401  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1402 
      // End of the SIMD loop: the whole range without remainder handling, otherwise kend
      // masked down with size_t(-SIMDSIZE)
1403  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1404  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1405 
      // NOTE(review): the accumulators rely on a default-constructed SIMDType being zero - confirm
1406  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1407  size_t k( kbegin );
1408 
1409  for( ; k<kpos; k+=SIMDSIZE ) {
1410  const SIMDType a1( A.load(i ,k) );
1411  const SIMDType a2( A.load(i+1UL,k) );
1412  const SIMDType a3( A.load(i+2UL,k) );
1413  const SIMDType a4( A.load(i+3UL,k) );
1414  const SIMDType b1( B.load(k,j ) );
1415  const SIMDType b2( B.load(k,j+1UL) );
1416  xmm1 += a1 * b1;
1417  xmm2 += a1 * b2;
1418  xmm3 += a2 * b1;
1419  xmm4 += a2 * b2;
1420  xmm5 += a3 * b1;
1421  xmm6 += a3 * b2;
1422  xmm7 += a4 * b1;
1423  xmm8 += a4 * b2;
1424  }
1425 
      // Collapse each accumulator into its target element (plain assignment, not +=)
1426  C(i ,j ) = sum( xmm1 );
1427  C(i ,j+1UL) = sum( xmm2 );
1428  C(i+1UL,j ) = sum( xmm3 );
1429  C(i+1UL,j+1UL) = sum( xmm4 );
1430  C(i+2UL,j ) = sum( xmm5 );
1431  C(i+2UL,j+1UL) = sum( xmm6 );
1432  C(i+3UL,j ) = sum( xmm7 );
1433  C(i+3UL,j+1UL) = sum( xmm8 );
1434 
      // Scalar clean-up for the k values not covered by the SIMD loop
1435  for( ; remainder && k<kend; ++k ) {
1436  C(i ,j ) += A(i ,k) * B(k,j );
1437  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1438  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1439  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1440  C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1441  C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1442  C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1443  C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
1444  }
1445  }
1446 
      // Single remaining column of the four-row block (4x1 tile)
1447  if( j < jend )
1448  {
1449  const size_t kbegin( ( IsUpper_v<MT4> )
1450  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1451  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1452  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
1453 
1454  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1455  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1456 
1457  SIMDType xmm1, xmm2, xmm3, xmm4;
1458  size_t k( kbegin );
1459 
1460  for( ; k<kpos; k+=SIMDSIZE ) {
1461  const SIMDType b1( B.load(k,j) );
1462  xmm1 += A.load(i ,k) * b1;
1463  xmm2 += A.load(i+1UL,k) * b1;
1464  xmm3 += A.load(i+2UL,k) * b1;
1465  xmm4 += A.load(i+3UL,k) * b1;
1466  }
1467 
1468  C(i ,j) = sum( xmm1 );
1469  C(i+1UL,j) = sum( xmm2 );
1470  C(i+2UL,j) = sum( xmm3 );
1471  C(i+3UL,j) = sum( xmm4 );
1472 
1473  for( ; remainder && k<kend; ++k ) {
1474  C(i ,j) += A(i ,k) * B(k,j);
1475  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1476  C(i+2UL,j) += A(i+2UL,k) * B(k,j);
1477  C(i+3UL,j) += A(i+3UL,k) * B(k,j);
1478  }
1479 
      // Step past the column just computed so the reset loop below starts to its right
1480  if( LOW ) ++j;
1481  }
1482 
      // Lower result: reset every column right of the computed ones
1483  if( LOW ) {
1484  for( ; j<N; ++j ) {
1485  reset( C(i ,j) );
1486  reset( C(i+1UL,j) );
1487  reset( C(i+2UL,j) );
1488  reset( C(i+3UL,j) );
1489  }
1490  }
1491  }
1492 
      // Row blocks of two for the rows left over by the four-row loop
1493  for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
1494  {
1495  const size_t jend( LOW ? i+2UL : N );
1496  size_t j( 0UL );
1497 
      // Same mirroring / resetting of the columns left of the block as in the four-row loop
1498  if( SYM || HERM ) {
1499  for( ; j<i; ++j ) {
1500  C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
1501  C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
1502  }
1503  }
1504  else if( UPP ) {
1505  for( ; j<i; ++j ) {
1506  reset( C(i ,j) );
1507  reset( C(i+1UL,j) );
1508  }
1509  }
1510 
      // 2x2 tile
1511  for( ; (j+2UL) <= jend; j+=2UL )
1512  {
1513  const size_t kbegin( ( IsUpper_v<MT4> )
1514  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1515  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1516  const size_t kend( ( IsLower_v<MT4> )
1517  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
1518  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1519 
1520  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1521  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1522 
1523  SIMDType xmm1, xmm2, xmm3, xmm4;
1524  size_t k( kbegin );
1525 
1526  for( ; k<kpos; k+=SIMDSIZE ) {
1527  const SIMDType a1( A.load(i ,k) );
1528  const SIMDType a2( A.load(i+1UL,k) );
1529  const SIMDType b1( B.load(k,j ) );
1530  const SIMDType b2( B.load(k,j+1UL) );
1531  xmm1 += a1 * b1;
1532  xmm2 += a1 * b2;
1533  xmm3 += a2 * b1;
1534  xmm4 += a2 * b2;
1535  }
1536 
1537  C(i ,j ) = sum( xmm1 );
1538  C(i ,j+1UL) = sum( xmm2 );
1539  C(i+1UL,j ) = sum( xmm3 );
1540  C(i+1UL,j+1UL) = sum( xmm4 );
1541 
1542  for( ; remainder && k<kend; ++k ) {
1543  C(i ,j ) += A(i ,k) * B(k,j );
1544  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1545  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1546  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1547  }
1548  }
1549 
      // Single remaining column of the two-row block (2x1 tile)
1550  if( j < jend )
1551  {
1552  const size_t kbegin( ( IsUpper_v<MT4> )
1553  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1554  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1555  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
1556 
1557  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1558  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1559 
1560  SIMDType xmm1, xmm2;
1561  size_t k( kbegin );
1562 
1563  for( ; k<kpos; k+=SIMDSIZE ) {
1564  const SIMDType b1( B.load(k,j) );
1565  xmm1 += A.load(i ,k) * b1;
1566  xmm2 += A.load(i+1UL,k) * b1;
1567  }
1568 
1569  C(i ,j) = sum( xmm1 );
1570  C(i+1UL,j) = sum( xmm2 );
1571 
1572  for( ; remainder && k<kend; ++k ) {
1573  C(i ,j) += A(i ,k) * B(k,j);
1574  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1575  }
1576 
1577  if( LOW ) ++j;
1578  }
1579 
1580  if( LOW ) {
1581  for( ; j<N; ++j ) {
1582  reset( C(i ,j) );
1583  reset( C(i+1UL,j) );
1584  }
1585  }
1586  }
1587 
      // Remaining rows one at a time. This is the only row loop that also runs for LOW && UPP.
1588  for( ; i<M; ++i )
1589  {
1590  const size_t jend( LOW ? i+1UL : N );
1591  size_t j( 0UL );
1592 
1593  if( SYM || HERM ) {
1594  for( ; j<i; ++j ) {
1595  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1596  }
1597  }
1598  else if( UPP ) {
1599  for( ; j<i; ++j ) {
1600  reset( C(i,j) );
1601  }
1602  }
1603 
      // 1x2 tile; disabled for LOW && UPP so that only the single-column path below runs
1604  for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
1605  {
1606  const size_t kbegin( ( IsUpper_v<MT4> )
1607  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1608  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1609  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
1610 
1611  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1612  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1613 
1614  SIMDType xmm1, xmm2;
1615  size_t k( kbegin );
1616 
1617  for( ; k<kpos; k+=SIMDSIZE ) {
1618  const SIMDType a1( A.load(i,k) );
1619  xmm1 += a1 * B.load(k,j );
1620  xmm2 += a1 * B.load(k,j+1UL);
1621  }
1622 
1623  C(i,j ) = sum( xmm1 );
1624  C(i,j+1UL) = sum( xmm2 );
1625 
1626  for( ; remainder && k<kend; ++k ) {
1627  C(i,j ) += A(i,k) * B(k,j );
1628  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1629  }
1630  }
1631 
      // Single element C(i,j); here the k range always extends to K (no upper bound from the
      // operand structure is applied)
1632  if( j < jend )
1633  {
1634  const size_t kbegin( ( IsUpper_v<MT4> )
1635  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1636  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1637 
1638  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
1639  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1640 
1641  SIMDType xmm1;
1642  size_t k( kbegin );
1643 
1644  for( ; k<kpos; k+=SIMDSIZE ) {
1645  xmm1 += A.load(i,k) * B.load(k,j);
1646  }
1647 
1648  C(i,j) = sum( xmm1 );
1649 
1650  for( ; remainder && k<K; ++k ) {
1651  C(i,j) += A(i,k) * B(k,j);
1652  }
1653 
1654  if( LOW ) ++j;
1655  }
1656 
1657  if( LOW ) {
1658  for( ; j<N; ++j ) {
1659  reset( C(i,j) );
1660  }
1661  }
1662  }
1663  }
1665  //**********************************************************************************************
1666 
1667  //**Default assignment to dense matrices (large matrices)***************************************
1681  template< typename MT3 // Type of the left-hand side target matrix
1682  , typename MT4 // Type of the left-hand side matrix operand
1683  , typename MT5 > // Type of the right-hand side matrix operand
1684  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1685  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1686  {
1687  selectDefaultAssignKernel( C, A, B );
1688  }
1690  //**********************************************************************************************
1691 
1692  //**Vectorized default assignment to dense matrices (large matrices)****************************
1707  template< typename MT3 // Type of the left-hand side target matrix
1708  , typename MT4 // Type of the left-hand side matrix operand
1709  , typename MT5 > // Type of the right-hand side matrix operand
1710  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1711  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1712  {
1713  if( SYM )
1714  smmm( C, A, B, ElementType(1) );
1715  else if( HERM )
1716  hmmm( C, A, B, ElementType(1) );
1717  else if( LOW )
1718  lmmm( C, A, B, ElementType(1), ElementType(0) );
1719  else if( UPP )
1720  ummm( C, A, B, ElementType(1), ElementType(0) );
1721  else
1722  mmm( C, A, B, ElementType(1), ElementType(0) );
1723  }
1725  //**********************************************************************************************
1726 
1727  //**BLAS-based assignment to dense matrices (default)*******************************************
1741  template< typename MT3 // Type of the left-hand side target matrix
1742  , typename MT4 // Type of the left-hand side matrix operand
1743  , typename MT5 > // Type of the right-hand side matrix operand
1744  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1745  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1746  {
1747  selectLargeAssignKernel( C, A, B );
1748  }
1750  //**********************************************************************************************
1751 
1752  //**BLAS-based assignment to dense matrices*****************************************************
1753 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1754 
1767  template< typename MT3 // Type of the left-hand side target matrix
1768  , typename MT4 // Type of the left-hand side matrix operand
1769  , typename MT5 > // Type of the right-hand side matrix operand
1770  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1771  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1772  {
1773  using ET = ElementType_t<MT3>;
1774 
1775  if( IsTriangular_v<MT4> ) {
1776  assign( C, B );
1777  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
1778  }
1779  else if( IsTriangular_v<MT5> ) {
1780  assign( C, A );
1781  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
1782  }
1783  else {
1784  gemm( C, A, B, ET(1), ET(0) );
1785  }
1786  }
1788 #endif
1789  //**********************************************************************************************
1790 
1791  //**Assignment to sparse matrices***************************************************************
1804  template< typename MT // Type of the target sparse matrix
1805  , bool SO > // Storage order of the target sparse matrix
1806  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
1807  {
1809 
1810  using TmpType = If_t< SO, OppositeType, ResultType >;
1811 
1818 
1819  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1820  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1821 
1822  const ForwardFunctor fwd;
1823 
1824  const TmpType tmp( serial( rhs ) );
1825  assign( ~lhs, fwd( tmp ) );
1826  }
1828  //**********************************************************************************************
1829 
1830  //**Addition assignment to dense matrices*******************************************************
1843  template< typename MT // Type of the target dense matrix
1844  , bool SO > // Storage order of the target dense matrix
1845  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
1846  {
1848 
1849  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1850  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1851 
1852  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1853  return;
1854  }
1855 
1856  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1857  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1858 
1859  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1860  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1861  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1862  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1863  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1864  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1865 
1866  DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1867  }
1869  //**********************************************************************************************
1870 
1871  //**Addition assignment to dense matrices (kernel selection)************************************
1882  template< typename MT3 // Type of the left-hand side target matrix
1883  , typename MT4 // Type of the left-hand side matrix operand
1884  , typename MT5 > // Type of the right-hand side matrix operand
1885  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1886  {
1887  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
1888  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
1889  selectSmallAddAssignKernel( C, A, B );
1890  else
1891  selectBlasAddAssignKernel( C, A, B );
1892  }
1894  //**********************************************************************************************
1895 
1896  //**Default addition assignment to row-major dense matrices (general/general)*******************
/*!\brief Default addition assignment kernel for row-major targets and two non-diagonal
//        operands (\f$ C+=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Scalar triple loop over rows i, columns j and the inner index k that adds A(i,k) * B(k,j) to
// C(i,j). All three index ranges are narrowed at compile time according to the lower/upper
// (strictly) triangular traits of \a MT4 and \a MT5; the column range is additionally limited
// by the LOW and UPP flags of the expression. The inner loop handles two k values per
// iteration, with one trailing update for an odd count.
*/
1910  template< typename MT3 // Type of the left-hand side target matrix
1911  , typename MT4 // Type of the left-hand side matrix operand
1912  , typename MT5 > // Type of the right-hand side matrix operand
1913  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1914  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1915  {
1916  const size_t M( A.rows() );
1917  const size_t N( B.columns() );
1918  const size_t K( A.columns() );
1919 
1920  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1921 
      // Row range: a strictly lower A skips the first row (the first two if B is strictly lower
      // as well and M > 1); a strictly upper A skips the last row(s) in the same way
1922  const size_t ibegin( ( IsStrictlyLower_v<MT4> )
1923  ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
1924  :( 0UL ) );
1925  const size_t iend( ( IsStrictlyUpper_v<MT4> )
1926  ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
1927  :( M ) );
1928  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1929 
1930  for( size_t i=ibegin; i<iend; ++i )
1931  {
      // Column range of row i: bounded by the diagonal (shifted by one for every strictly
      // triangular operand) if both operands are upper respectively lower, otherwise by the
      // UPP/LOW flag and the strictness of B
1932  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
1933  ?( ( IsStrictlyUpper_v<MT4> )
1934  ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
1935  :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
1936  :( ( IsStrictlyUpper_v<MT5> )
1937  ?( UPP ? max( i, 1UL ) : 1UL )
1938  :( UPP ? i : 0UL ) ) );
1939  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
1940  ?( ( IsStrictlyLower_v<MT4> )
1941  ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
1942  :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
1943  :( ( IsStrictlyLower_v<MT5> )
1944  ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
1945  :( LOW ? i+1UL : N ) ) );
1946 
      // With a LOW or UPP restriction the range of a row may be inverted; such rows are skipped
1947  if( ( LOW || UPP ) && ( jbegin > jend ) ) continue;
1948  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1949 
1950  for( size_t j=jbegin; j<jend; ++j )
1951  {
      // Inner range for element (i,j): starts at the row/column index (plus one for a strictly
      // triangular operand) for an upper A or a lower B, and ends correspondingly for a lower A
      // or an upper B
1952  const size_t kbegin( ( IsUpper_v<MT4> )
1953  ?( ( IsLower_v<MT5> )
1954  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1955  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1956  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1957  :( ( IsLower_v<MT5> )
1958  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
1959  :( 0UL ) ) );
1960  const size_t kend( ( IsLower_v<MT4> )
1961  ?( ( IsUpper_v<MT5> )
1962  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
1963  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
1964  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
1965  :( ( IsUpper_v<MT5> )
1966  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
1967  :( K ) ) );
1968  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
1969 
      // kpos marks the end of the part of [kbegin,kend) that is processed two at a time
1970  const size_t knum( kend - kbegin );
1971  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
1972 
1973  for( size_t k=kbegin; k<kpos; k+=2UL ) {
1974  C(i,j) += A(i,k ) * B(k ,j);
1975  C(i,j) += A(i,k+1UL) * B(k+1UL,j);
1976  }
      // Trailing update for an odd number of inner indices
1977  if( kpos < kend ) {
1978  C(i,j) += A(i,kpos) * B(kpos,j);
1979  }
1980  }
1981  }
1982  }
1984  //**********************************************************************************************
1985 
1986  //**Default addition assignment to column-major dense matrices (general/general)****************
/*!\brief Default addition assignment kernel for column-major targets and two non-diagonal
//        operands (\f$ C+=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Scalar triple loop over columns j, rows i and the inner index k that adds A(i,k) * B(k,j) to
// C(i,j). Columns form the outer loop here. All three index ranges are narrowed at compile time
// according to the lower/upper (strictly) triangular traits of \a MT4 and \a MT5; the row range
// is additionally limited by the LOW and UPP flags of the expression. The inner loop handles
// two k values per iteration, with one trailing update for an odd count.
*/
2000  template< typename MT3 // Type of the left-hand side target matrix
2001  , typename MT4 // Type of the left-hand side matrix operand
2002  , typename MT5 > // Type of the right-hand side matrix operand
2003  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2004  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2005  {
2006  const size_t M( A.rows() );
2007  const size_t N( B.columns() );
2008  const size_t K( A.columns() );
2009 
2010  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2011 
      // Column range: a strictly upper B skips the first column (the first two if A is strictly
      // upper as well and N > 1); a strictly lower B skips the last column(s) in the same way
2012  const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
2013  ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
2014  :( 0UL ) );
2015  const size_t jend( ( IsStrictlyLower_v<MT5> )
2016  ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
2017  :( N ) );
2018  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2019 
2020  for( size_t j=jbegin; j<jend; ++j )
2021  {
      // Row range of column j: bounded by the diagonal (shifted by one for every strictly
      // triangular operand) if both operands are lower respectively upper, otherwise by the
      // LOW/UPP flag and the strictness of A
2022  const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
2023  ?( ( IsStrictlyLower_v<MT4> )
2024  ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
2025  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2026  :( ( IsStrictlyLower_v<MT4> )
2027  ?( LOW ? max( j, 1UL ) : 1UL )
2028  :( LOW ? j : 0UL ) ) );
2029  const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
2030  ?( ( IsStrictlyUpper_v<MT4> )
2031  ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
2032  :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
2033  :( ( IsStrictlyUpper_v<MT4> )
2034  ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
2035  :( UPP ? j+1UL : M ) ) );
2036 
      // With a LOW or UPP restriction the range of a column may be inverted; such columns are
      // skipped
2037  if( ( LOW || UPP ) && ( ibegin > iend ) ) continue;
2038  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2039 
2040  for( size_t i=ibegin; i<iend; ++i )
2041  {
      // Inner range for element (i,j): starts at the row/column index (plus one for a strictly
      // triangular operand) for an upper A or a lower B, and ends correspondingly for a lower A
      // or an upper B
2042  const size_t kbegin( ( IsUpper_v<MT4> )
2043  ?( ( IsLower_v<MT5> )
2044  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2045  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2046  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2047  :( ( IsLower_v<MT5> )
2048  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2049  :( 0UL ) ) );
2050  const size_t kend( ( IsLower_v<MT4> )
2051  ?( ( IsUpper_v<MT5> )
2052  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
2053  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2054  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2055  :( ( IsUpper_v<MT5> )
2056  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2057  :( K ) ) );
2058  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
2059 
      // kpos marks the end of the part of [kbegin,kend) that is processed two at a time
2060  const size_t knum( kend - kbegin );
2061  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
2062 
2063  for( size_t k=kbegin; k<kpos; k+=2UL ) {
2064  C(i,j) += A(i,k ) * B(k ,j);
2065  C(i,j) += A(i,k+1UL) * B(k+1UL,j);
2066  }
      // Trailing update for an odd number of inner indices
2067  if( kpos < kend ) {
2068  C(i,j) += A(i,kpos) * B(kpos,j);
2069  }
2070  }
2071  }
2072  }
2074  //**********************************************************************************************
2075 
2076  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
2090  template< typename MT3 // Type of the left-hand side target matrix
2091  , typename MT4 // Type of the left-hand side matrix operand
2092  , typename MT5 > // Type of the right-hand side matrix operand
2093  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2094  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2095  {
2096  const size_t M( A.rows() );
2097  const size_t N( B.columns() );
2098 
2099  for( size_t i=0UL; i<M; ++i )
2100  {
2101  const size_t jbegin( ( IsUpper_v<MT4> )
2102  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2103  :( 0UL ) );
2104  const size_t jend( ( IsLower_v<MT4> )
2105  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2106  :( N ) );
2107  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2108 
2109  const size_t jnum( jend - jbegin );
2110  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2111 
2112  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2113  C(i,j ) += A(i,j ) * B(j ,j );
2114  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
2115  }
2116  if( jpos < jend ) {
2117  C(i,jpos) += A(i,jpos) * B(jpos,jpos);
2118  }
2119  }
2120  }
2122  //**********************************************************************************************
2123 
2124  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
2138  template< typename MT3 // Type of the left-hand side target matrix
2139  , typename MT4 // Type of the left-hand side matrix operand
2140  , typename MT5 > // Type of the right-hand side matrix operand
2141  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2142  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2143  {
2144  constexpr size_t block( BLOCK_SIZE );
2145 
2146  const size_t M( A.rows() );
2147  const size_t N( B.columns() );
2148 
2149  for( size_t jj=0UL; jj<N; jj+=block ) {
2150  const size_t jend( min( N, jj+block ) );
2151  for( size_t ii=0UL; ii<M; ii+=block ) {
2152  const size_t iend( min( M, ii+block ) );
2153  for( size_t j=jj; j<jend; ++j )
2154  {
2155  const size_t ibegin( ( IsLower_v<MT4> )
2156  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
2157  :( ii ) );
2158  const size_t ipos( ( IsUpper_v<MT4> )
2159  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
2160  :( iend ) );
2161 
2162  for( size_t i=ibegin; i<ipos; ++i ) {
2163  C(i,j) += A(i,j) * B(j,j);
2164  }
2165  }
2166  }
2167  }
2168  }
2170  //**********************************************************************************************
2171 
2172  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
2186  template< typename MT3 // Type of the left-hand side target matrix
2187  , typename MT4 // Type of the left-hand side matrix operand
2188  , typename MT5 > // Type of the right-hand side matrix operand
2189  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2190  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2191  {
2192  constexpr size_t block( BLOCK_SIZE );
2193 
2194  const size_t M( A.rows() );
2195  const size_t N( B.columns() );
2196 
2197  for( size_t ii=0UL; ii<M; ii+=block ) {
2198  const size_t iend( min( M, ii+block ) );
2199  for( size_t jj=0UL; jj<N; jj+=block ) {
2200  const size_t jend( min( N, jj+block ) );
2201  for( size_t i=ii; i<iend; ++i )
2202  {
2203  const size_t jbegin( ( IsUpper_v<MT5> )
2204  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
2205  :( jj ) );
2206  const size_t jpos( ( IsLower_v<MT5> )
2207  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
2208  :( jend ) );
2209 
2210  for( size_t j=jbegin; j<jpos; ++j ) {
2211  C(i,j) += A(i,i) * B(i,j);
2212  }
2213  }
2214  }
2215  }
2216  }
2218  //**********************************************************************************************
2219 
2220  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
2234  template< typename MT3 // Type of the left-hand side target matrix
2235  , typename MT4 // Type of the left-hand side matrix operand
2236  , typename MT5 > // Type of the right-hand side matrix operand
2237  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2238  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2239  {
2240  const size_t M( A.rows() );
2241  const size_t N( B.columns() );
2242 
2243  for( size_t j=0UL; j<N; ++j )
2244  {
2245  const size_t ibegin( ( IsLower_v<MT5> )
2246  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2247  :( 0UL ) );
2248  const size_t iend( ( IsUpper_v<MT5> )
2249  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2250  :( M ) );
2251  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2252 
2253  const size_t inum( iend - ibegin );
2254  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2255 
2256  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2257  C(i ,j) += A(i ,i ) * B(i ,j);
2258  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
2259  }
2260  if( ipos < iend ) {
2261  C(ipos,j) += A(ipos,ipos) * B(ipos,j);
2262  }
2263  }
2264  }
2266  //**********************************************************************************************
2267 
2268  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2282  template< typename MT3 // Type of the left-hand side target matrix
2283  , typename MT4 // Type of the left-hand side matrix operand
2284  , typename MT5 > // Type of the right-hand side matrix operand
2285  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2286  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2287  {
2288  for( size_t i=0UL; i<A.rows(); ++i ) {
2289  C(i,i) += A(i,i) * B(i,i);
2290  }
2291  }
2293  //**********************************************************************************************
2294 
2295  //**Default addition assignment to dense matrices (small matrices)******************************
2309  template< typename MT3 // Type of the left-hand side target matrix
2310  , typename MT4 // Type of the left-hand side matrix operand
2311  , typename MT5 > // Type of the right-hand side matrix operand
2312  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2313  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2314  {
2315  selectDefaultAddAssignKernel( C, A, B );
2316  }
2318  //**********************************************************************************************
2319 
2320  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
/*!\brief Vectorized default addition assignment kernel for small matrices (row-major target).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Adds the product of \a A and \a B to \a C by means of SIMD dot products along the inner index
// k: A.load(i,k) and B.load(k,j) are multiplied element-wise, accumulated, and reduced via sum().
// NOTE(review): this presumes that both loads fetch SIMDSIZE consecutive k-elements, i.e. a
// row-major \a A and a column-major \a B -- confirm against the enclosing class.
// Rows are processed in pairs and columns in groups of four, two and one; a trailing odd row is
// handled at the end. LOW and UPP (defined outside this function) restrict the visited part of
// \a C, and the triangular traits of \a A and \a B shorten the k-range of each dot product.
*/
2335  template< typename MT3 // Type of the left-hand side target matrix
2336  , typename MT4 // Type of the left-hand side matrix operand
2337  , typename MT5 > // Type of the right-hand side matrix operand
2338  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2339  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2340  {
      // A scalar clean-up loop over k is only required if at least one operand is not padded
2341  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
2342 
2343  const size_t M( A.rows() );
2344  const size_t N( B.columns() );
2345  const size_t K( A.columns() );
2346 
2347  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2348 
2349  size_t i( 0UL );
2350 
      // Main part: two rows of C per iteration
2351  for( ; (i+2UL) <= M; i+=2UL )
2352  {
      // Column range of the row pair: ends at i+2 for LOW, starts at i for UPP
2353  const size_t jend( LOW ? i+2UL : N );
2354  size_t j( UPP ? i : 0UL );
2355 
      // 2x4 tile of C (step is disabled if both LOW and UPP are set)
2356  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
2357  {
      // Start of the k-range; rounded down to a multiple of SIMDSIZE by the bit mask
2358  const size_t kbegin( ( IsUpper_v<MT4> )
2359  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2360  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
      // End of the k-range, limited by the last row/column of the tile for lower A / upper B
2361  const size_t kend( ( IsLower_v<MT4> )
2362  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
2363  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
2364 
      // End of the vectorized part: kend rounded down to a multiple of SIMDSIZE if a remainder loop exists
2365  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2366  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2367 
      // One accumulator per element of the tile
      // NOTE(review): relies on SIMDType being zero after default construction -- confirm
2368  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2369  size_t k( kbegin );
2370 
2371  for( ; k<kpos; k+=SIMDSIZE ) {
2372  const SIMDType a1( A.load(i ,k) );
2373  const SIMDType a2( A.load(i+1UL,k) );
2374  const SIMDType b1( B.load(k,j ) );
2375  const SIMDType b2( B.load(k,j+1UL) );
2376  const SIMDType b3( B.load(k,j+2UL) );
2377  const SIMDType b4( B.load(k,j+3UL) );
2378  xmm1 += a1 * b1;
2379  xmm2 += a1 * b2;
2380  xmm3 += a1 * b3;
2381  xmm4 += a1 * b4;
2382  xmm5 += a2 * b1;
2383  xmm6 += a2 * b2;
2384  xmm7 += a2 * b3;
2385  xmm8 += a2 * b4;
2386  }
2387 
      // Horizontal reduction of the accumulators into C
2388  C(i ,j ) += sum( xmm1 );
2389  C(i ,j+1UL) += sum( xmm2 );
2390  C(i ,j+2UL) += sum( xmm3 );
2391  C(i ,j+3UL) += sum( xmm4 );
2392  C(i+1UL,j ) += sum( xmm5 );
2393  C(i+1UL,j+1UL) += sum( xmm6 );
2394  C(i+1UL,j+2UL) += sum( xmm7 );
2395  C(i+1UL,j+3UL) += sum( xmm8 );
2396 
      // Scalar handling of the k-elements beyond the vectorized part
2397  for( ; remainder && k<kend; ++k ) {
2398  C(i ,j ) += A(i ,k) * B(k,j );
2399  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2400  C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2401  C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
2402  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2403  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2404  C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2405  C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
2406  }
2407  }
2408 
      // 2x2 tile of C for the remaining column pairs
2409  for( ; (j+2UL) <= jend; j+=2UL )
2410  {
2411  const size_t kbegin( ( IsUpper_v<MT4> )
2412  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2413  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2414  const size_t kend( ( IsLower_v<MT4> )
2415  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
2416  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2417 
2418  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2419  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2420 
2421  SIMDType xmm1, xmm2, xmm3, xmm4;
2422  size_t k( kbegin );
2423 
2424  for( ; k<kpos; k+=SIMDSIZE ) {
2425  const SIMDType a1( A.load(i ,k) );
2426  const SIMDType a2( A.load(i+1UL,k) );
2427  const SIMDType b1( B.load(k,j ) );
2428  const SIMDType b2( B.load(k,j+1UL) );
2429  xmm1 += a1 * b1;
2430  xmm2 += a1 * b2;
2431  xmm3 += a2 * b1;
2432  xmm4 += a2 * b2;
2433  }
2434 
2435  C(i ,j ) += sum( xmm1 );
2436  C(i ,j+1UL) += sum( xmm2 );
2437  C(i+1UL,j ) += sum( xmm3 );
2438  C(i+1UL,j+1UL) += sum( xmm4 );
2439 
2440  for( ; remainder && k<kend; ++k ) {
2441  C(i ,j ) += A(i ,k) * B(k,j );
2442  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2443  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2444  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2445  }
2446  }
2447 
      // 2x1 tile of C for a single remaining column
2448  if( j < jend )
2449  {
2450  const size_t kbegin( ( IsUpper_v<MT4> )
2451  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2452  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
      // Here the k-range is limited only by a lower A, not by the structure of B
2453  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
2454 
2455  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2456  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2457 
2458  SIMDType xmm1, xmm2;
2459  size_t k( kbegin );
2460 
2461  for( ; k<kpos; k+=SIMDSIZE ) {
2462  const SIMDType b1( B.load(k,j) );
2463  xmm1 += A.load(i ,k) * b1;
2464  xmm2 += A.load(i+1UL,k) * b1;
2465  }
2466 
2467  C(i ,j) += sum( xmm1 );
2468  C(i+1UL,j) += sum( xmm2 );
2469 
2470  for( ; remainder && k<kend; ++k ) {
2471  C(i ,j) += A(i ,k) * B(k,j);
2472  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2473  }
2474  }
2475  }
2476 
      // Trailing single row (odd number of rows)
2477  if( i < M )
2478  {
2479  const size_t jend( LOW ? i+1UL : N );
2480  size_t j( UPP ? i : 0UL );
2481 
      // 1x4 tile of C (step is disabled if both LOW and UPP are set)
2482  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
2483  {
2484  const size_t kbegin( ( IsUpper_v<MT4> )
2485  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2486  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2487  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
2488 
2489  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2490  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2491 
2492  SIMDType xmm1, xmm2, xmm3, xmm4;
2493  size_t k( kbegin );
2494 
2495  for( ; k<kpos; k+=SIMDSIZE ) {
2496  const SIMDType a1( A.load(i,k) );
2497  xmm1 += a1 * B.load(k,j );
2498  xmm2 += a1 * B.load(k,j+1UL);
2499  xmm3 += a1 * B.load(k,j+2UL);
2500  xmm4 += a1 * B.load(k,j+3UL);
2501  }
2502 
2503  C(i,j ) += sum( xmm1 );
2504  C(i,j+1UL) += sum( xmm2 );
2505  C(i,j+2UL) += sum( xmm3 );
2506  C(i,j+3UL) += sum( xmm4 );
2507 
2508  for( ; remainder && k<kend; ++k ) {
2509  C(i,j ) += A(i,k) * B(k,j );
2510  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2511  C(i,j+2UL) += A(i,k) * B(k,j+2UL);
2512  C(i,j+3UL) += A(i,k) * B(k,j+3UL);
2513  }
2514  }
2515 
      // 1x2 tile of C for the remaining column pairs
2516  for( ; (j+2UL) <= jend; j+=2UL )
2517  {
2518  const size_t kbegin( ( IsUpper_v<MT4> )
2519  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2520  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2521  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
2522 
2523  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2524  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2525 
2526  SIMDType xmm1, xmm2;
2527  size_t k( kbegin );
2528 
2529  for( ; k<kpos; k+=SIMDSIZE ) {
2530  const SIMDType a1( A.load(i,k) );
2531  xmm1 += a1 * B.load(k,j );
2532  xmm2 += a1 * B.load(k,j+1UL);
2533  }
2534 
2535  C(i,j ) += sum( xmm1 );
2536  C(i,j+1UL) += sum( xmm2 );
2537 
2538  for( ; remainder && k<kend; ++k ) {
2539  C(i,j ) += A(i,k) * B(k,j );
2540  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2541  }
2542  }
2543 
      // Single remaining element of C; the k-range always runs up to K here
2544  if( j < jend )
2545  {
2546  const size_t kbegin( ( IsUpper_v<MT4> )
2547  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2548  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2549 
2550  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
2551  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2552 
2553  SIMDType xmm1;
2554  size_t k( kbegin );
2555 
2556  for( ; k<kpos; k+=SIMDSIZE ) {
2557  xmm1 += A.load(i,k) * B.load(k,j);
2558  }
2559 
2560  C(i,j) += sum( xmm1 );
2561 
2562  for( ; remainder && k<K; ++k ) {
2563  C(i,j) += A(i,k) * B(k,j);
2564  }
2565  }
2566  }
2567  }
2569  //**********************************************************************************************
2570 
2571  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
/*!\brief Vectorized default addition assignment kernel for small matrices (column-major target).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Adds the product of \a A and \a B to \a C by means of SIMD dot products along the inner index
// k: A.load(i,k) and B.load(k,j) are multiplied element-wise, accumulated, and reduced via sum().
// NOTE(review): this presumes that both loads fetch SIMDSIZE consecutive k-elements, i.e. a
// row-major \a A and a column-major \a B -- confirm against the enclosing class.
// If neither LOW nor UPP is set, rows are first processed in groups of four; afterwards rows are
// processed in pairs and finally a single trailing row is handled. Columns are processed in
// pairs plus one trailing column. The triangular traits of \a A and \a B shorten the k-range of
// each dot product.
*/
2586  template< typename MT3 // Type of the left-hand side target matrix
2587  , typename MT4 // Type of the left-hand side matrix operand
2588  , typename MT5 > // Type of the right-hand side matrix operand
2589  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2590  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2591  {
      // A scalar clean-up loop over k is only required if at least one operand is not padded
2592  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
2593 
2594  const size_t M( A.rows() );
2595  const size_t N( B.columns() );
2596  const size_t K( A.columns() );
2597 
2598  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2599 
2600  size_t i( 0UL );
2601 
      // Four rows of C per iteration; only used if neither LOW nor UPP is set
2602  for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
2603  {
2604  size_t j( 0UL );
2605 
      // 4x2 tile of C
2606  for( ; (j+2UL) <= N; j+=2UL )
2607  {
      // Start of the k-range; rounded down to a multiple of SIMDSIZE by the bit mask
2608  const size_t kbegin( ( IsUpper_v<MT4> )
2609  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2610  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
      // End of the k-range, limited by the last row/column of the tile for lower A / upper B
2611  const size_t kend( ( IsLower_v<MT4> )
2612  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
2613  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2614 
      // End of the vectorized part: kend rounded down to a multiple of SIMDSIZE if a remainder loop exists
2615  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2616  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2617 
      // One accumulator per element of the tile
      // NOTE(review): relies on SIMDType being zero after default construction -- confirm
2618  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2619  size_t k( kbegin );
2620 
2621  for( ; k<kpos; k+=SIMDSIZE ) {
2622  const SIMDType a1( A.load(i ,k) );
2623  const SIMDType a2( A.load(i+1UL,k) );
2624  const SIMDType a3( A.load(i+2UL,k) );
2625  const SIMDType a4( A.load(i+3UL,k) );
2626  const SIMDType b1( B.load(k,j ) );
2627  const SIMDType b2( B.load(k,j+1UL) );
2628  xmm1 += a1 * b1;
2629  xmm2 += a1 * b2;
2630  xmm3 += a2 * b1;
2631  xmm4 += a2 * b2;
2632  xmm5 += a3 * b1;
2633  xmm6 += a3 * b2;
2634  xmm7 += a4 * b1;
2635  xmm8 += a4 * b2;
2636  }
2637 
      // Horizontal reduction of the accumulators into C
2638  C(i ,j ) += sum( xmm1 );
2639  C(i ,j+1UL) += sum( xmm2 );
2640  C(i+1UL,j ) += sum( xmm3 );
2641  C(i+1UL,j+1UL) += sum( xmm4 );
2642  C(i+2UL,j ) += sum( xmm5 );
2643  C(i+2UL,j+1UL) += sum( xmm6 );
2644  C(i+3UL,j ) += sum( xmm7 );
2645  C(i+3UL,j+1UL) += sum( xmm8 );
2646 
      // Scalar handling of the k-elements beyond the vectorized part
2647  for( ; remainder && k<kend; ++k ) {
2648  C(i ,j ) += A(i ,k) * B(k,j );
2649  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2650  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2651  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2652  C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2653  C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2654  C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
2655  C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
2656  }
2657  }
2658 
      // 4x1 tile of C for a single remaining column
2659  if( j < N )
2660  {
2661  const size_t kbegin( ( IsUpper_v<MT4> )
2662  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2663  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
      // Here the k-range is limited only by a lower A, not by the structure of B
2664  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
2665 
2666  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2667  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2668 
2669  SIMDType xmm1, xmm2, xmm3, xmm4;
2670  size_t k( kbegin );
2671 
2672  for( ; k<kpos; k+=SIMDSIZE ) {
2673  const SIMDType b1( B.load(k,j) );
2674  xmm1 += A.load(i ,k) * b1;
2675  xmm2 += A.load(i+1UL,k) * b1;
2676  xmm3 += A.load(i+2UL,k) * b1;
2677  xmm4 += A.load(i+3UL,k) * b1;
2678  }
2679 
2680  C(i ,j) += sum( xmm1 );
2681  C(i+1UL,j) += sum( xmm2 );
2682  C(i+2UL,j) += sum( xmm3 );
2683  C(i+3UL,j) += sum( xmm4 );
2684 
2685  for( ; remainder && k<kend; ++k ) {
2686  C(i ,j) += A(i ,k) * B(k,j);
2687  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2688  C(i+2UL,j) += A(i+2UL,k) * B(k,j);
2689  C(i+3UL,j) += A(i+3UL,k) * B(k,j);
2690  }
2691  }
2692  }
2693 
      // Two rows of C per iteration (all rows if LOW or UPP is set, otherwise the rows left over above)
2694  for( ; (i+2UL) <= M; i+=2UL )
2695  {
      // Column range of the row pair: ends at i+2 for LOW, starts at i for UPP
2696  const size_t jend( LOW ? i+2UL : N );
2697  size_t j( UPP ? i : 0UL );
2698 
      // 2x2 tile of C
2699  for( ; (j+2UL) <= jend; j+=2UL )
2700  {
2701  const size_t kbegin( ( IsUpper_v<MT4> )
2702  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2703  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2704  const size_t kend( ( IsLower_v<MT4> )
2705  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
2706  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2707 
2708  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2709  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2710 
2711  SIMDType xmm1, xmm2, xmm3, xmm4;
2712  size_t k( kbegin );
2713 
2714  for( ; k<kpos; k+=SIMDSIZE ) {
2715  const SIMDType a1( A.load(i ,k) );
2716  const SIMDType a2( A.load(i+1UL,k) );
2717  const SIMDType b1( B.load(k,j ) );
2718  const SIMDType b2( B.load(k,j+1UL) );
2719  xmm1 += a1 * b1;
2720  xmm2 += a1 * b2;
2721  xmm3 += a2 * b1;
2722  xmm4 += a2 * b2;
2723  }
2724 
2725  C(i ,j ) += sum( xmm1 );
2726  C(i ,j+1UL) += sum( xmm2 );
2727  C(i+1UL,j ) += sum( xmm3 );
2728  C(i+1UL,j+1UL) += sum( xmm4 );
2729 
2730  for( ; remainder && k<kend; ++k ) {
2731  C(i ,j ) += A(i ,k) * B(k,j );
2732  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2733  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2734  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2735  }
2736  }
2737 
      // 2x1 tile of C for a single remaining column
2738  if( j < jend )
2739  {
2740  const size_t kbegin( ( IsUpper_v<MT4> )
2741  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2742  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2743  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
2744 
2745  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2746  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2747 
2748  SIMDType xmm1, xmm2;
2749  size_t k( kbegin );
2750 
2751  for( ; k<kpos; k+=SIMDSIZE ) {
2752  const SIMDType b1( B.load(k,j) );
2753  xmm1 += A.load(i ,k) * b1;
2754  xmm2 += A.load(i+1UL,k) * b1;
2755  }
2756 
2757  C(i ,j) += sum( xmm1 );
2758  C(i+1UL,j) += sum( xmm2 );
2759 
2760  for( ; remainder && k<kend; ++k ) {
2761  C(i ,j) += A(i ,k) * B(k,j);
2762  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2763  }
2764  }
2765  }
2766 
      // Trailing single row (odd number of rows)
2767  if( i < M )
2768  {
2769  const size_t jend( LOW ? i+1UL : N );
2770  size_t j( UPP ? i : 0UL );
2771 
      // 1x2 tile of C
2772  for( ; (j+2UL) <= jend; j+=2UL )
2773  {
2774  const size_t kbegin( ( IsUpper_v<MT4> )
2775  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2776  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2777  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
2778 
2779  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2780  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2781 
2782  SIMDType xmm1, xmm2;
2783  size_t k( kbegin );
2784 
2785  for( ; k<kpos; k+=SIMDSIZE ) {
2786  const SIMDType a1( A.load(i,k) );
2787  xmm1 += a1 * B.load(k,j );
2788  xmm2 += a1 * B.load(k,j+1UL);
2789  }
2790 
2791  C(i,j ) += sum( xmm1 );
2792  C(i,j+1UL) += sum( xmm2 );
2793 
2794  for( ; remainder && k<kend; ++k ) {
2795  C(i,j ) += A(i,k) * B(k,j );
2796  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2797  }
2798  }
2799 
      // Single remaining element of C; the k-range always runs up to K here
2800  if( j < jend )
2801  {
2802  const size_t kbegin( ( IsUpper_v<MT4> )
2803  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2804  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2805 
2806  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
2807  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2808 
2809  SIMDType xmm1;
2810  size_t k( kbegin );
2811 
2812  for( ; k<kpos; k+=SIMDSIZE ) {
2813  xmm1 += A.load(i,k) * B.load(k,j);
2814  }
2815 
2816  C(i,j) += sum( xmm1 );
2817 
2818  for( ; remainder && k<K; ++k ) {
2819  C(i,j) += A(i,k) * B(k,j);
2820  }
2821  }
2822  }
2823  }
2825  //**********************************************************************************************
2826 
2827  //**Default addition assignment to dense matrices (large matrices)******************************
2841  template< typename MT3 // Type of the left-hand side target matrix
2842  , typename MT4 // Type of the left-hand side matrix operand
2843  , typename MT5 > // Type of the right-hand side matrix operand
2844  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2845  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2846  {
2847  selectDefaultAddAssignKernel( C, A, B );
2848  }
2850  //**********************************************************************************************
2851 
2852  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
2867  template< typename MT3 // Type of the left-hand side target matrix
2868  , typename MT4 // Type of the left-hand side matrix operand
2869  , typename MT5 > // Type of the right-hand side matrix operand
2870  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2871  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2872  {
2873  if( LOW )
2874  lmmm( C, A, B, ElementType(1), ElementType(1) );
2875  else if( UPP )
2876  ummm( C, A, B, ElementType(1), ElementType(1) );
2877  else
2878  mmm( C, A, B, ElementType(1), ElementType(1) );
2879  }
2881  //**********************************************************************************************
2882 
2883  //**BLAS-based addition assignment to dense matrices (default)**********************************
2897  template< typename MT3 // Type of the left-hand side target matrix
2898  , typename MT4 // Type of the left-hand side matrix operand
2899  , typename MT5 > // Type of the right-hand side matrix operand
2900  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2901  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2902  {
2903  selectLargeAddAssignKernel( C, A, B );
2904  }
2906  //**********************************************************************************************
2907 
2908  //**BLAS-based addition assignment to dense matrices********************************************
2909 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2910 
2923  template< typename MT3 // Type of the left-hand side target matrix
2924  , typename MT4 // Type of the left-hand side matrix operand
2925  , typename MT5 > // Type of the right-hand side matrix operand
2926  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2927  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2928  {
2929  using ET = ElementType_t<MT3>;
2930 
2931  if( IsTriangular_v<MT4> ) {
2932  ResultType_t<MT3> tmp( serial( B ) );
2933  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2934  addAssign( C, tmp );
2935  }
2936  else if( IsTriangular_v<MT5> ) {
2937  ResultType_t<MT3> tmp( serial( A ) );
2938  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2939  addAssign( C, tmp );
2940  }
2941  else {
2942  gemm( C, A, B, ET(1), ET(1) );
2943  }
2944  }
2946 #endif
2947  //**********************************************************************************************
2948 
2949  //**Addition assignment to sparse matrices******************************************************
2950  // No special implementation for the addition assignment to sparse matrices.
2951  //**********************************************************************************************
2952 
2953  //**Subtraction assignment to dense matrices****************************************************
2966  template< typename MT // Type of the target dense matrix
2967  , bool SO > // Storage order of the target dense matrix
2968  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
2969  {
2971 
2972  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2973  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2974 
2975  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2976  return;
2977  }
2978 
2979  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2980  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2981 
2982  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2983  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2984  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2985  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2986  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2987  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2988 
2989  DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
2990  }
2992  //**********************************************************************************************
2993 
2994  //**Subtraction assignment to dense matrices (kernel selection)*********************************
3005  template< typename MT3 // Type of the left-hand side target matrix
3006  , typename MT4 // Type of the left-hand side matrix operand
3007  , typename MT5 > // Type of the right-hand side matrix operand
3008  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3009  {
3010  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
3011  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
3012  selectSmallSubAssignKernel( C, A, B );
3013  else
3014  selectBlasSubAssignKernel( C, A, B );
3015  }
3017  //**********************************************************************************************
3018 
3019  //**Default subtraction assignment to row-major dense matrices (general/general)****************
/*!\brief Default subtraction assignment kernel for a row-major target (general/general).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand (non-diagonal).
// \param B The right-hand side multiplication operand (non-diagonal).
// \return void
//
// Scalar triple loop computing \f$ C(i,j) -= A(i,k) * B(k,j) \f$ with the rows of \a C in the
// outermost loop. The ranges of i, j and k are narrowed via the (strictly) lower/upper traits of
// \a A and \a B and the LOW/UPP flags (defined outside this function). The innermost loop over k
// is manually unrolled by a factor of two.
*/
3033  template< typename MT3 // Type of the left-hand side target matrix
3034  , typename MT4 // Type of the left-hand side matrix operand
3035  , typename MT5 > // Type of the right-hand side matrix operand
3036  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3037  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3038  {
3039  const size_t M( A.rows() );
3040  const size_t N( B.columns() );
3041  const size_t K( A.columns() );
3042 
3043  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3044 
      // Strictly lower A: skip the first row (the first two rows if B is strictly lower as well)
3045  const size_t ibegin( ( IsStrictlyLower_v<MT4> )
3046  ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
3047  :( 0UL ) );
      // Strictly upper A: skip the last row (the last two rows if B is strictly upper as well)
3048  const size_t iend( ( IsStrictlyUpper_v<MT4> )
3049  ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
3050  :( M ) );
3051  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3052 
3053  for( size_t i=ibegin; i<iend; ++i )
3054  {
      // First column of row i: derived from the upper traits of A and B, otherwise from UPP
3055  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
3056  ?( ( IsStrictlyUpper_v<MT4> )
3057  ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
3058  :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
3059  :( ( IsStrictlyUpper_v<MT5> )
3060  ?( UPP ? max( i, 1UL ) : 1UL )
3061  :( UPP ? i : 0UL ) ) );
      // One past the last column of row i: derived from the lower traits of A and B, otherwise from LOW
3062  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
3063  ?( ( IsStrictlyLower_v<MT4> )
3064  ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
3065  :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
3066  :( ( IsStrictlyLower_v<MT5> )
3067  ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
3068  :( LOW ? i+1UL : N ) ) );
3069 
      // With LOW or UPP the column range of a row may be inverted; such rows are skipped
3070  if( ( LOW || UPP ) && ( jbegin > jend ) ) continue;
3071  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3072 
3073  for( size_t j=jbegin; j<jend; ++j )
3074  {
      // Start of the k-range: bounded from below by i for upper A and by j for lower B
3075  const size_t kbegin( ( IsUpper_v<MT4> )
3076  ?( ( IsLower_v<MT5> )
3077  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3078  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3079  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3080  :( ( IsLower_v<MT5> )
3081  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3082  :( 0UL ) ) );
      // End of the k-range: bounded from above by i for lower A and by j for upper B
3083  const size_t kend( ( IsLower_v<MT4> )
3084  ?( ( IsUpper_v<MT5> )
3085  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
3086  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3087  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3088  :( ( IsUpper_v<MT5> )
3089  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3090  :( K ) ) );
3091  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
3092 
      // kpos marks the end of the largest even-sized prefix of [kbegin,kend)
3093  const size_t knum( kend - kbegin );
3094  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
3095 
      // Two k-elements per iteration
3096  for( size_t k=kbegin; k<kpos; k+=2UL ) {
3097  C(i,j) -= A(i,k ) * B(k ,j);
3098  C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
3099  }
      // Trailing k-element of an odd-sized range
3100  if( kpos < kend ) {
3101  C(i,j) -= A(i,kpos) * B(kpos,j);
3102  }
3103  }
3104  }
3105  }
3107  //**********************************************************************************************
3108 
3109  //**Default subtraction assignment to column-major dense matrices (general/general)*************
/*!\brief Default subtraction assignment kernel for a column-major target (general/general).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand (non-diagonal).
// \param B The right-hand side multiplication operand (non-diagonal).
// \return void
//
// Scalar triple loop computing \f$ C(i,j) -= A(i,k) * B(k,j) \f$ with the columns of \a C in the
// outermost loop. The ranges of j, i and k are narrowed via the (strictly) lower/upper traits of
// \a A and \a B and the LOW/UPP flags (defined outside this function). The innermost loop over k
// is manually unrolled by a factor of two.
*/
3123  template< typename MT3 // Type of the left-hand side target matrix
3124  , typename MT4 // Type of the left-hand side matrix operand
3125  , typename MT5 > // Type of the right-hand side matrix operand
3126  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3127  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3128  {
3129  const size_t M( A.rows() );
3130  const size_t N( B.columns() );
3131  const size_t K( A.columns() );
3132 
3133  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3134 
      // Strictly upper B: skip the first column (the first two columns if A is strictly upper as well)
3135  const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
3136  ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
3137  :( 0UL ) );
      // Strictly lower B: skip the last column (the last two columns if A is strictly lower as well)
3138  const size_t jend( ( IsStrictlyLower_v<MT5> )
3139  ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
3140  :( N ) );
3141  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3142 
3143  for( size_t j=jbegin; j<jend; ++j )
3144  {
      // First row of column j: derived from the lower traits of A and B, otherwise from LOW
3145  const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
3146  ?( ( IsStrictlyLower_v<MT4> )
3147  ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
3148  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3149  :( ( IsStrictlyLower_v<MT4> )
3150  ?( LOW ? max( j, 1UL ) : 1UL )
3151  :( LOW ? j : 0UL ) ) );
      // One past the last row of column j: derived from the upper traits of A and B, otherwise from UPP
3152  const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
3153  ?( ( IsStrictlyUpper_v<MT4> )
3154  ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
3155  :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
3156  :( ( IsStrictlyUpper_v<MT4> )
3157  ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
3158  :( UPP ? j+1UL : M ) ) );
3159 
      // With LOW or UPP the row range of a column may be inverted; such columns are skipped
3160  if( ( LOW || UPP ) && ( ibegin > iend ) ) continue;
3161  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3162 
3163  for( size_t i=ibegin; i<iend; ++i )
3164  {
      // Start of the k-range: bounded from below by i for upper A and by j for lower B
3165  const size_t kbegin( ( IsUpper_v<MT4> )
3166  ?( ( IsLower_v<MT5> )
3167  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3168  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3169  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3170  :( ( IsLower_v<MT5> )
3171  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3172  :( 0UL ) ) );
      // End of the k-range: bounded from above by i for lower A and by j for upper B
3173  const size_t kend( ( IsLower_v<MT4> )
3174  ?( ( IsUpper_v<MT5> )
3175  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
3176  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3177  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3178  :( ( IsUpper_v<MT5> )
3179  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3180  :( K ) ) );
3181  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
3182 
      // kpos marks the end of the largest even-sized prefix of [kbegin,kend)
3183  const size_t knum( kend - kbegin );
3184  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
3185 
      // Two k-elements per iteration
3186  for( size_t k=kbegin; k<kpos; k+=2UL ) {
3187  C(i,j) -= A(i,k ) * B(k ,j);
3188  C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
3189  }
      // Trailing k-element of an odd-sized range
3190  if( kpos < kend ) {
3191  C(i,j) -= A(i,kpos) * B(kpos,j);
3192  }
3193  }
3194  }
3195  }
3197  //**********************************************************************************************
3198 
3199  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
3213  template< typename MT3 // Type of the left-hand side target matrix
3214  , typename MT4 // Type of the left-hand side matrix operand
3215  , typename MT5 > // Type of the right-hand side matrix operand
3216  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3217  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3218  {
3219  const size_t M( A.rows() );
3220  const size_t N( B.columns() );
3221 
3222  for( size_t i=0UL; i<M; ++i )
3223  {
3224  const size_t jbegin( ( IsUpper_v<MT4> )
3225  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3226  :( 0UL ) );
3227  const size_t jend( ( IsLower_v<MT4> )
3228  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3229  :( N ) );
3230  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3231 
3232  const size_t jnum( jend - jbegin );
3233  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3234 
3235  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3236  C(i,j ) -= A(i,j ) * B(j ,j );
3237  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
3238  }
3239  if( jpos < jend ) {
3240  C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
3241  }
3242  }
3243  }
3245  //**********************************************************************************************
3246 
3247  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
3261  template< typename MT3 // Type of the left-hand side target matrix
3262  , typename MT4 // Type of the left-hand side matrix operand
3263  , typename MT5 > // Type of the right-hand side matrix operand
3264  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3265  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3266  {
3267  constexpr size_t block( BLOCK_SIZE );
3268 
3269  const size_t M( A.rows() );
3270  const size_t N( B.columns() );
3271 
3272  for( size_t jj=0UL; jj<N; jj+=block ) {
3273  const size_t jend( min( N, jj+block ) );
3274  for( size_t ii=0UL; ii<M; ii+=block ) {
3275  const size_t iend( min( M, ii+block ) );
3276  for( size_t j=jj; j<jend; ++j )
3277  {
3278  const size_t ibegin( ( IsLower_v<MT4> )
3279  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
3280  :( ii ) );
3281  const size_t ipos( ( IsUpper_v<MT4> )
3282  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
3283  :( iend ) );
3284 
3285  for( size_t i=ibegin; i<ipos; ++i ) {
3286  C(i,j) -= A(i,j) * B(j,j);
3287  }
3288  }
3289  }
3290  }
3291  }
3293  //**********************************************************************************************
3294 
3295  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
3309  template< typename MT3 // Type of the left-hand side target matrix
3310  , typename MT4 // Type of the left-hand side matrix operand
3311  , typename MT5 > // Type of the right-hand side matrix operand
3312  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3313  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3314  {
3315  constexpr size_t block( BLOCK_SIZE );
3316 
3317  const size_t M( A.rows() );
3318  const size_t N( B.columns() );
3319 
3320  for( size_t ii=0UL; ii<M; ii+=block ) {
3321  const size_t iend( min( M, ii+block ) );
3322  for( size_t jj=0UL; jj<N; jj+=block ) {
3323  const size_t jend( min( N, jj+block ) );
3324  for( size_t i=ii; i<iend; ++i )
3325  {
3326  const size_t jbegin( ( IsUpper_v<MT5> )
3327  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
3328  :( jj ) );
3329  const size_t jpos( ( IsLower_v<MT5> )
3330  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
3331  :( jend ) );
3332 
3333  for( size_t j=jbegin; j<jpos; ++j ) {
3334  C(i,j) -= A(i,i) * B(i,j);
3335  }
3336  }
3337  }
3338  }
3339  }
3341  //**********************************************************************************************
3342 
3343  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
3357  template< typename MT3 // Type of the left-hand side target matrix
3358  , typename MT4 // Type of the left-hand side matrix operand
3359  , typename MT5 > // Type of the right-hand side matrix operand
3360  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3361  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3362  {
3363  const size_t M( A.rows() );
3364  const size_t N( B.columns() );
3365 
3366  for( size_t j=0UL; j<N; ++j )
3367  {
3368  const size_t ibegin( ( IsLower_v<MT5> )
3369  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3370  :( 0UL ) );
3371  const size_t iend( ( IsUpper_v<MT5> )
3372  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3373  :( M ) );
3374  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3375 
3376  const size_t inum( iend - ibegin );
3377  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3378 
3379  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3380  C(i ,j) -= A(i ,i ) * B(i ,j);
3381  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
3382  }
3383  if( ipos < iend ) {
3384  C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
3385  }
3386  }
3387  }
3389  //**********************************************************************************************
3390 
3391  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
3405  template< typename MT3 // Type of the left-hand side target matrix
3406  , typename MT4 // Type of the left-hand side matrix operand
3407  , typename MT5 > // Type of the right-hand side matrix operand
3408  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3409  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3410  {
3411  for( size_t i=0UL; i<A.rows(); ++i ) {
3412  C(i,i) -= A(i,i) * B(i,i);
3413  }
3414  }
3416  //**********************************************************************************************
3417 
3418  //**Default subtraction assignment to dense matrices (small matrices)***************************
3432  template< typename MT3 // Type of the left-hand side target matrix
3433  , typename MT4 // Type of the left-hand side matrix operand
3434  , typename MT5 > // Type of the right-hand side matrix operand
3435  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3436  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3437  {
3438  selectDefaultSubAssignKernel( C, A, B );
3439  }
3441  //**********************************************************************************************
3442 
3443  //**Default subtraction assignment to row-major dense matrices (small matrices)*****************
/*!\brief Vectorized small-matrix subtraction assignment kernel for row-major targets
//        (\f$ C-=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// Rows of \a C are processed in pairs, followed by at most one trailing row. Within a row
// (pair) the columns are handled in groups of four, then two, then a single column. For each
// tile the products A(i,k)*B(k,j) are accumulated over k in SIMD accumulators, reduced with
// sum() and subtracted from \a C. If one of the operands is not padded, a scalar loop deals
// with the k values beyond the last full SIMD step.
*/
3458  template< typename MT3 // Type of the left-hand side target matrix
3459  , typename MT4 // Type of the left-hand side matrix operand
3460  , typename MT5 > // Type of the right-hand side matrix operand
3461  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3462  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3463  {
   // A scalar tail loop over k is only required if at least one operand is not padded
3464  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
3465 
3466  const size_t M( A.rows() );
3467  const size_t N( B.columns() );
3468  const size_t K( A.columns() );
3469 
3470  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3471 
3472  size_t i( 0UL );
3473 
   // ---- Two rows of C per iteration ----
3474  for( ; (i+2UL) <= M; i+=2UL )
3475  {
   // LOW: columns end at i+2 (exclusive); UPP: columns start at i
3476  const size_t jend( LOW ? i+2UL : N );
3477  size_t j( UPP ? i : 0UL );
3478 
   // 2x4 tile; skipped entirely when both LOW and UPP are set
3479  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
3480  {
   // k range restricted by the triangular structure of A and B; the lower bound is masked
   // with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE
   // (assumes SIMDSIZE is a power of two - TODO confirm)
3481  const size_t kbegin( ( IsUpper_v<MT4> )
3482  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3483  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3484  const size_t kend( ( IsLower_v<MT4> )
3485  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
3486  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
3487 
   // End of the SIMD loop: with a remainder only full SIMD steps are vectorized
3488  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3489  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3490 
   // NOTE(review): the accumulators are used without explicit initialization; this relies on
   // SIMDType's default constructor producing zero - confirm.
3491  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3492  size_t k( kbegin );
3493 
3494  for( ; k<kpos; k+=SIMDSIZE ) {
3495  const SIMDType a1( A.load(i ,k) );
3496  const SIMDType a2( A.load(i+1UL,k) );
3497  const SIMDType b1( B.load(k,j ) );
3498  const SIMDType b2( B.load(k,j+1UL) );
3499  const SIMDType b3( B.load(k,j+2UL) );
3500  const SIMDType b4( B.load(k,j+3UL) );
3501  xmm1 += a1 * b1;
3502  xmm2 += a1 * b2;
3503  xmm3 += a1 * b3;
3504  xmm4 += a1 * b4;
3505  xmm5 += a2 * b1;
3506  xmm6 += a2 * b2;
3507  xmm7 += a2 * b3;
3508  xmm8 += a2 * b4;
3509  }
3510 
   // Horizontal reduction of each accumulator, subtracted from the matching element of C
3511  C(i ,j ) -= sum( xmm1 );
3512  C(i ,j+1UL) -= sum( xmm2 );
3513  C(i ,j+2UL) -= sum( xmm3 );
3514  C(i ,j+3UL) -= sum( xmm4 );
3515  C(i+1UL,j ) -= sum( xmm5 );
3516  C(i+1UL,j+1UL) -= sum( xmm6 );
3517  C(i+1UL,j+2UL) -= sum( xmm7 );
3518  C(i+1UL,j+3UL) -= sum( xmm8 );
3519 
   // Scalar tail for k in [kpos,kend)
3520  for( ; remainder && k<kend; ++k ) {
3521  C(i ,j ) -= A(i ,k) * B(k,j );
3522  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3523  C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
3524  C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
3525  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3526  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3527  C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
3528  C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
3529  }
3530  }
3531 
   // 2x2 tile
3532  for( ; (j+2UL) <= jend; j+=2UL )
3533  {
3534  const size_t kbegin( ( IsUpper_v<MT4> )
3535  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3536  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3537  const size_t kend( ( IsLower_v<MT4> )
3538  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
3539  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3540 
3541  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3542  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3543 
3544  SIMDType xmm1, xmm2, xmm3, xmm4;
3545  size_t k( kbegin );
3546 
3547  for( ; k<kpos; k+=SIMDSIZE ) {
3548  const SIMDType a1( A.load(i ,k) );
3549  const SIMDType a2( A.load(i+1UL,k) );
3550  const SIMDType b1( B.load(k,j ) );
3551  const SIMDType b2( B.load(k,j+1UL) );
3552  xmm1 += a1 * b1;
3553  xmm2 += a1 * b2;
3554  xmm3 += a2 * b1;
3555  xmm4 += a2 * b2;
3556  }
3557 
3558  C(i ,j ) -= sum( xmm1 );
3559  C(i ,j+1UL) -= sum( xmm2 );
3560  C(i+1UL,j ) -= sum( xmm3 );
3561  C(i+1UL,j+1UL) -= sum( xmm4 );
3562 
3563  for( ; remainder && k<kend; ++k ) {
3564  C(i ,j ) -= A(i ,k) * B(k,j );
3565  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3566  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3567  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3568  }
3569  }
3570 
   // 2x1 tile: a single column is left
3571  if( j < jend )
3572  {
3573  const size_t kbegin( ( IsUpper_v<MT4> )
3574  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3575  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
   // Here the upper bound is derived from A only
3576  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
3577 
3578  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3579  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3580 
3581  SIMDType xmm1, xmm2;
3582  size_t k( kbegin );
3583 
3584  for( ; k<kpos; k+=SIMDSIZE ) {
3585  const SIMDType b1( B.load(k,j) );
3586  xmm1 += A.load(i ,k) * b1;
3587  xmm2 += A.load(i+1UL,k) * b1;
3588  }
3589 
3590  C(i ,j) -= sum( xmm1 );
3591  C(i+1UL,j) -= sum( xmm2 );
3592 
3593  for( ; remainder && k<kend; ++k ) {
3594  C(i ,j) -= A(i ,k) * B(k,j);
3595  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3596  }
3597  }
3598  }
3599 
   // ---- Trailing single row (odd M) ----
3600  if( i < M )
3601  {
3602  const size_t jend( LOW ? i+1UL : N );
3603  size_t j( UPP ? i : 0UL );
3604 
   // 1x4 tile; skipped entirely when both LOW and UPP are set
3605  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
3606  {
3607  const size_t kbegin( ( IsUpper_v<MT4> )
3608  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3609  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3610  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
3611 
3612  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3613  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3614 
3615  SIMDType xmm1, xmm2, xmm3, xmm4;
3616  size_t k( kbegin );
3617 
3618  for( ; k<kpos; k+=SIMDSIZE ) {
3619  const SIMDType a1( A.load(i,k) );
3620  xmm1 += a1 * B.load(k,j );
3621  xmm2 += a1 * B.load(k,j+1UL);
3622  xmm3 += a1 * B.load(k,j+2UL);
3623  xmm4 += a1 * B.load(k,j+3UL);
3624  }
3625 
3626  C(i,j ) -= sum( xmm1 );
3627  C(i,j+1UL) -= sum( xmm2 );
3628  C(i,j+2UL) -= sum( xmm3 );
3629  C(i,j+3UL) -= sum( xmm4 );
3630 
3631  for( ; remainder && k<kend; ++k ) {
3632  C(i,j ) -= A(i,k) * B(k,j );
3633  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3634  C(i,j+2UL) -= A(i,k) * B(k,j+2UL);
3635  C(i,j+3UL) -= A(i,k) * B(k,j+3UL);
3636  }
3637  }
3638 
   // 1x2 tile
3639  for( ; (j+2UL) <= jend; j+=2UL )
3640  {
3641  const size_t kbegin( ( IsUpper_v<MT4> )
3642  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3643  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3644  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
3645 
3646  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3647  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3648 
3649  SIMDType xmm1, xmm2;
3650  size_t k( kbegin );
3651 
3652  for( ; k<kpos; k+=SIMDSIZE ) {
3653  const SIMDType a1( A.load(i,k) );
3654  xmm1 += a1 * B.load(k,j );
3655  xmm2 += a1 * B.load(k,j+1UL);
3656  }
3657 
3658  C(i,j ) -= sum( xmm1 );
3659  C(i,j+1UL) -= sum( xmm2 );
3660 
3661  for( ; remainder && k<kend; ++k ) {
3662  C(i,j ) -= A(i,k) * B(k,j );
3663  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3664  }
3665  }
3666 
   // Final single element; the k range runs up to K without an upper restriction
3667  if( j < jend )
3668  {
3669  const size_t kbegin( ( IsUpper_v<MT4> )
3670  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3671  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3672 
3673  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
3674  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3675 
3676  SIMDType xmm1;
3677  size_t k( kbegin );
3678 
3679  for( ; k<kpos; k+=SIMDSIZE ) {
3680  xmm1 += A.load(i,k) * B.load(k,j);
3681  }
3682 
3683  C(i,j) -= sum( xmm1 );
3684 
3685  for( ; remainder && k<K; ++k ) {
3686  C(i,j) -= A(i,k) * B(k,j);
3687  }
3688  }
3689  }
3690  }
3692  //**********************************************************************************************
3693 
3694  //**Default subtraction assignment to column-major dense matrices (small matrices)**************
/*!\brief Vectorized small-matrix subtraction assignment kernel for column-major targets
//        (\f$ C-=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// If neither LOW nor UPP is set, rows of \a C are first processed four at a time. The
// remaining rows are processed in pairs, followed by at most one trailing row. Columns are
// handled in pairs plus an optional single column. For each tile the products A(i,k)*B(k,j)
// are accumulated over k in SIMD accumulators, reduced with sum() and subtracted from \a C.
// If one of the operands is not padded, a scalar loop deals with the k values beyond the last
// full SIMD step.
*/
3709  template< typename MT3 // Type of the left-hand side target matrix
3710  , typename MT4 // Type of the left-hand side matrix operand
3711  , typename MT5 > // Type of the right-hand side matrix operand
3712  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3713  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3714  {
   // A scalar tail loop over k is only required if at least one operand is not padded
3715  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
3716 
3717  const size_t M( A.rows() );
3718  const size_t N( B.columns() );
3719  const size_t K( A.columns() );
3720 
3721  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3722 
3723  size_t i( 0UL );
3724 
   // ---- Four rows of C per iteration; only taken when neither LOW nor UPP is set ----
3725  for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
3726  {
3727  size_t j( 0UL );
3728 
   // 4x2 tile
3729  for( ; (j+2UL) <= N; j+=2UL )
3730  {
   // k range restricted by the triangular structure of A and B; the lower bound is masked
   // with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE
   // (assumes SIMDSIZE is a power of two - TODO confirm)
3731  const size_t kbegin( ( IsUpper_v<MT4> )
3732  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3733  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3734  const size_t kend( ( IsLower_v<MT4> )
3735  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
3736  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3737 
   // End of the SIMD loop: with a remainder only full SIMD steps are vectorized
3738  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3739  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3740 
   // NOTE(review): the accumulators are used without explicit initialization; this relies on
   // SIMDType's default constructor producing zero - confirm.
3741  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3742  size_t k( kbegin );
3743 
3744  for( ; k<kpos; k+=SIMDSIZE ) {
3745  const SIMDType a1( A.load(i ,k) );
3746  const SIMDType a2( A.load(i+1UL,k) );
3747  const SIMDType a3( A.load(i+2UL,k) );
3748  const SIMDType a4( A.load(i+3UL,k) );
3749  const SIMDType b1( B.load(k,j ) );
3750  const SIMDType b2( B.load(k,j+1UL) );
3751  xmm1 += a1 * b1;
3752  xmm2 += a1 * b2;
3753  xmm3 += a2 * b1;
3754  xmm4 += a2 * b2;
3755  xmm5 += a3 * b1;
3756  xmm6 += a3 * b2;
3757  xmm7 += a4 * b1;
3758  xmm8 += a4 * b2;
3759  }
3760 
   // Horizontal reduction of each accumulator, subtracted from the matching element of C
3761  C(i ,j ) -= sum( xmm1 );
3762  C(i ,j+1UL) -= sum( xmm2 );
3763  C(i+1UL,j ) -= sum( xmm3 );
3764  C(i+1UL,j+1UL) -= sum( xmm4 );
3765  C(i+2UL,j ) -= sum( xmm5 );
3766  C(i+2UL,j+1UL) -= sum( xmm6 );
3767  C(i+3UL,j ) -= sum( xmm7 );
3768  C(i+3UL,j+1UL) -= sum( xmm8 );
3769 
   // Scalar tail for k in [kpos,kend)
3770  for( ; remainder && k<kend; ++k ) {
3771  C(i ,j ) -= A(i ,k) * B(k,j );
3772  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3773  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3774  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3775  C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3776  C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
3777  C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3778  C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
3779  }
3780  }
3781 
   // 4x1 tile: a single column is left
3782  if( j < N )
3783  {
3784  const size_t kbegin( ( IsUpper_v<MT4> )
3785  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3786  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
   // Here the upper bound is derived from A only
3787  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
3788 
3789  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3790  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3791 
3792  SIMDType xmm1, xmm2, xmm3, xmm4;
3793  size_t k( kbegin );
3794 
3795  for( ; k<kpos; k+=SIMDSIZE ) {
3796  const SIMDType b1( B.load(k,j) );
3797  xmm1 += A.load(i ,k) * b1;
3798  xmm2 += A.load(i+1UL,k) * b1;
3799  xmm3 += A.load(i+2UL,k) * b1;
3800  xmm4 += A.load(i+3UL,k) * b1;
3801  }
3802 
3803  C(i ,j) -= sum( xmm1 );
3804  C(i+1UL,j) -= sum( xmm2 );
3805  C(i+2UL,j) -= sum( xmm3 );
3806  C(i+3UL,j) -= sum( xmm4 );
3807 
3808  for( ; remainder && k<kend; ++k ) {
3809  C(i ,j ) -= A(i ,k) * B(k,j );
3810  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3811  C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3812  C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3813  }
3814  }
3815  }
3816 
   // ---- Two rows of C per iteration ----
3817  for( ; (i+2UL) <= M; i+=2UL )
3818  {
   // LOW: columns end at i+2 (exclusive); UPP: columns start at i
3819  const size_t jend( LOW ? i+2UL : N );
3820  size_t j( UPP ? i : 0UL );
3821 
   // 2x2 tile
3822  for( ; (j+2UL) <= jend; j+=2UL )
3823  {
3824  const size_t kbegin( ( IsUpper_v<MT4> )
3825  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3826  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3827  const size_t kend( ( IsLower_v<MT4> )
3828  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
3829  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3830 
3831  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3832  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3833 
3834  SIMDType xmm1, xmm2, xmm3, xmm4;
3835  size_t k( kbegin );
3836 
3837  for( ; k<kpos; k+=SIMDSIZE ) {
3838  const SIMDType a1( A.load(i ,k) );
3839  const SIMDType a2( A.load(i+1UL,k) );
3840  const SIMDType b1( B.load(k,j ) );
3841  const SIMDType b2( B.load(k,j+1UL) );
3842  xmm1 += a1 * b1;
3843  xmm2 += a1 * b2;
3844  xmm3 += a2 * b1;
3845  xmm4 += a2 * b2;
3846  }
3847 
3848  C(i ,j ) -= sum( xmm1 );
3849  C(i ,j+1UL) -= sum( xmm2 );
3850  C(i+1UL,j ) -= sum( xmm3 );
3851  C(i+1UL,j+1UL) -= sum( xmm4 );
3852 
3853  for( ; remainder && k<kend; ++k ) {
3854  C(i ,j ) -= A(i ,k) * B(k,j );
3855  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3856  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3857  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3858  }
3859  }
3860 
   // 2x1 tile: a single column is left
3861  if( j < jend )
3862  {
3863  const size_t kbegin( ( IsUpper_v<MT4> )
3864  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3865  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3866  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
3867 
3868  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3869  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3870 
3871  SIMDType xmm1, xmm2;
3872  size_t k( kbegin );
3873 
3874  for( ; k<kpos; k+=SIMDSIZE ) {
3875  const SIMDType b1( B.load(k,j) );
3876  xmm1 += A.load(i ,k) * b1;
3877  xmm2 += A.load(i+1UL,k) * b1;
3878  }
3879 
3880  C(i ,j) -= sum( xmm1 );
3881  C(i+1UL,j) -= sum( xmm2 );
3882 
3883  for( ; remainder && k<kend; ++k ) {
3884  C(i ,j) -= A(i ,k) * B(k,j);
3885  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3886  }
3887  }
3888  }
3889 
   // ---- Trailing single row ----
3890  if( i < M )
3891  {
3892  const size_t jend( LOW ? i+1UL : N );
3893  size_t j( UPP ? i : 0UL );
3894 
   // 1x2 tile
3895  for( ; (j+2UL) <= jend; j+=2UL )
3896  {
3897  const size_t kbegin( ( IsUpper_v<MT4> )
3898  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3899  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3900  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
3901 
3902  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3903  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3904 
3905  SIMDType xmm1, xmm2;
3906  size_t k( kbegin );
3907 
3908  for( ; k<kpos; k+=SIMDSIZE ) {
3909  const SIMDType a1( A.load(i,k) );
3910  xmm1 += a1 * B.load(k,j );
3911  xmm2 += a1 * B.load(k,j+1UL);
3912  }
3913 
3914  C(i,j ) -= sum( xmm1 );
3915  C(i,j+1UL) -= sum( xmm2 );
3916 
3917  for( ; remainder && k<kend; ++k ) {
3918  C(i,j ) -= A(i,k) * B(k,j );
3919  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3920  }
3921  }
3922 
   // Final single element; the k range runs up to K without an upper restriction
3923  if( j < jend )
3924  {
3925  const size_t kbegin( ( IsUpper_v<MT4> )
3926  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3927  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3928 
3929  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
3930  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3931 
3932  SIMDType xmm1;
3933  size_t k( kbegin );
3934 
3935  for( ; k<kpos; k+=SIMDSIZE ) {
3936  xmm1 += A.load(i,k) * B.load(k,j);
3937  }
3938 
3939  C(i,j) -= sum( xmm1 );
3940 
3941  for( ; remainder && k<K; ++k ) {
3942  C(i,j) -= A(i,k) * B(k,j);
3943  }
3944  }
3945  }
3946  }
3948  //**********************************************************************************************
3949 
3950  //**Default subtraction assignment to dense matrices (large matrices)***************************
3964  template< typename MT3 // Type of the left-hand side target matrix
3965  , typename MT4 // Type of the left-hand side matrix operand
3966  , typename MT5 > // Type of the right-hand side matrix operand
3967  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3968  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3969  {
3970  selectDefaultSubAssignKernel( C, A, B );
3971  }
3973  //**********************************************************************************************
3974 
3975  //**Default subtraction assignment to dense matrices (large matrices)***************************
3990  template< typename MT3 // Type of the left-hand side target matrix
3991  , typename MT4 // Type of the left-hand side matrix operand
3992  , typename MT5 > // Type of the right-hand side matrix operand
3993  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3994  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3995  {
3996  if( LOW )
3997  lmmm( C, A, B, ElementType(-1), ElementType(1) );
3998  else if( UPP )
3999  ummm( C, A, B, ElementType(-1), ElementType(1) );
4000  else
4001  mmm( C, A, B, ElementType(-1), ElementType(1) );
4002  }
4004  //**********************************************************************************************
4005 
4006  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
4020  template< typename MT3 // Type of the left-hand side target matrix
4021  , typename MT4 // Type of the left-hand side matrix operand
4022  , typename MT5 > // Type of the right-hand side matrix operand
4023  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4024  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4025  {
4026  selectLargeSubAssignKernel( C, A, B );
4027  }
4029  //**********************************************************************************************
4030 
4031  //**BLAS-based subtraction assignment to dense matrices*****************************************
4032 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4033 
4046  template< typename MT3 // Type of the left-hand side target matrix
4047  , typename MT4 // Type of the left-hand side matrix operand
4048  , typename MT5 > // Type of the right-hand side matrix operand
4049  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4050  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4051  {
4052  using ET = ElementType_t<MT3>;
4053 
4054  if( IsTriangular_v<MT4> ) {
4055  ResultType_t<MT3> tmp( serial( B ) );
4056  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4057  subAssign( C, tmp );
4058  }
4059  else if( IsTriangular_v<MT5> ) {
4060  ResultType_t<MT3> tmp( serial( A ) );
4061  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4062  subAssign( C, tmp );
4063  }
4064  else {
4065  gemm( C, A, B, ET(-1), ET(1) );
4066  }
4067  }
4069 #endif
4070  //**********************************************************************************************
4071 
4072  //**Subtraction assignment to sparse matrices***************************************************
4073  // No special implementation for the subtraction assignment to sparse matrices.
4074  //**********************************************************************************************
4075 
4076  //**Schur product assignment to dense matrices**************************************************
4089  template< typename MT // Type of the target dense matrix
4090  , bool SO > // Storage order of the target dense matrix
4091  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4092  {
4094 
4098 
4099  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4100  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4101 
4102  const ResultType tmp( serial( rhs ) );
4103  schurAssign( ~lhs, tmp );
4104  }
4106  //**********************************************************************************************
4107 
4108  //**Schur product assignment to sparse matrices*************************************************
4109  // No special implementation for the Schur product assignment to sparse matrices.
4110  //**********************************************************************************************
4111 
4112  //**Multiplication assignment to dense matrices*************************************************
4113  // No special implementation for the multiplication assignment to dense matrices.
4114  //**********************************************************************************************
4115 
4116  //**Multiplication assignment to sparse matrices************************************************
4117  // No special implementation for the multiplication assignment to sparse matrices.
4118  //**********************************************************************************************
4119 
4120  //**SMP assignment to dense matrices************************************************************
4135  template< typename MT // Type of the target dense matrix
4136  , bool SO > // Storage order of the target dense matrix
4137  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4138  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4139  {
4141 
4142  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4143  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4144 
4145  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4146  return;
4147  }
4148  else if( rhs.lhs_.columns() == 0UL ) {
4149  reset( ~lhs );
4150  return;
4151  }
4152 
4153  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4154  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4155 
4156  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4157  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4158  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4159  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4160  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4161  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4162 
4163  smpAssign( ~lhs, A * B );
4164  }
4166  //**********************************************************************************************
4167 
4168  //**SMP assignment to sparse matrices***********************************************************
4183  template< typename MT // Type of the target sparse matrix
4184  , bool SO > // Storage order of the target sparse matrix
4185  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4186  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4187  {
4189 
4190  using TmpType = If_t< SO, OppositeType, ResultType >;
4191 
4198 
4199  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4200  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4201 
4202  const ForwardFunctor fwd;
4203 
4204  const TmpType tmp( rhs );
4205  smpAssign( ~lhs, fwd( tmp ) );
4206  }
4208  //**********************************************************************************************
4209 
4210  //**SMP addition assignment to dense matrices***************************************************
4226  template< typename MT // Type of the target dense matrix
4227  , bool SO > // Storage order of the target dense matrix
4228  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4229  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4230  {
4232 
4233  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4234  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4235 
4236  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4237  return;
4238  }
4239 
4240  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4241  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4242 
4243  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4244  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4245  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4246  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4247  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4248  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4249 
4250  smpAddAssign( ~lhs, A * B );
4251  }
4253  //**********************************************************************************************
4254 
4255  //**SMP addition assignment to sparse matrices**************************************************
4256  // No special implementation for the SMP addition assignment to sparse matrices.
4257  //**********************************************************************************************
4258 
4259  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of this multiplication expression to a dense matrix (lhs -= A*B).
// The overload only participates when IsEvaluationRequired_v<MT,MT1,MT2> is true (see the
// EnableIf_t return type).
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be subtracted.
4275  template< typename MT // Type of the target dense matrix
4276  , bool SO > // Storage order of the target dense matrix
4277  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4278  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4279  {
4281 
4282  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4283  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4284 
      // Nothing to do for an empty target or a zero inner dimension.
4285  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4286  return;
4287  }
4288 
4289  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4290  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4291 
4292  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4293  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4294  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4295  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4296  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4297  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4298 
      // Re-dispatch the subtraction assignment with the product of the two evaluated operands.
4299  smpSubAssign( ~lhs, A * B );
4300  }
4302  //**********************************************************************************************
4303 
4304  //**SMP subtraction assignment to sparse matrices***********************************************
4305  // No special implementation for the SMP subtraction assignment to sparse matrices.
4306  //**********************************************************************************************
4307 
4308  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of this multiplication expression to a dense matrix.
// The expression is first evaluated into a temporary of type ResultType, which is then
// passed on to smpSchurAssign together with the target matrix.
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression for the Schur product.
4321  template< typename MT // Type of the target dense matrix
4322  , bool SO > // Storage order of the target dense matrix
4323  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4324  {
4326 
4330 
4331  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4332  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4333 
      // Evaluate the complete product once, then apply the element-wise operation.
4334  const ResultType tmp( rhs );
4335  smpSchurAssign( ~lhs, tmp );
4336  }
4338  //**********************************************************************************************
4339 
4340  //**SMP Schur product assignment to sparse matrices*********************************************
4341  // No special implementation for the SMP Schur product assignment to sparse matrices.
4342  //**********************************************************************************************
4343 
4344  //**SMP multiplication assignment to dense matrices*********************************************
4345  // No special implementation for the SMP multiplication assignment to dense matrices.
4346  //**********************************************************************************************
4347 
4348  //**SMP multiplication assignment to sparse matrices********************************************
4349  // No special implementation for the SMP multiplication assignment to sparse matrices.
4350  //**********************************************************************************************
4351 
4352  //**Compile time checks*************************************************************************
4360  //**********************************************************************************************
4361 };
4362 //*************************************************************************************************
4363 
4364 
4365 
4366 
4367 //=================================================================================================
4368 //
4369 // DMATSCALARMULTEXPR SPECIALIZATION
4370 //
4371 //=================================================================================================
4372 
4373 //*************************************************************************************************
4381 template< typename MT1 // Type of the left-hand side dense matrix
4382  , typename MT2 // Type of the right-hand side dense matrix
4383  , bool SF // Symmetry flag
4384  , bool HF // Hermitian flag
4385  , bool LF // Lower flag
4386  , bool UF // Upper flag
4387  , typename ST > // Type of the right-hand side scalar value
4388 class DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4389  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4390  , private Computation
4391 {
4392  private:
4393  //**Type definitions****************************************************************************
// Private shorthand aliases for the scaled multiplication expression and its two operands.
4395  using MMM = DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>; // Type of the scaled matrix multiplication expression
4396 
4397  using RES = ResultType_t<MMM>; // Result type of the multiplication expression
4398  using RT1 = ResultType_t<MT1>; // Result type of the left-hand side matrix operand
4399  using RT2 = ResultType_t<MT2>; // Result type of the right-hand side matrix operand
4400  using ET1 = ElementType_t<RT1>; // Element type of the left-hand side result type
4401  using ET2 = ElementType_t<RT2>; // Element type of the right-hand side result type
4402  using CT1 = CompositeType_t<MT1>; // Composite type of the left-hand side matrix operand
4403  using CT2 = CompositeType_t<MT2>; // Composite type of the right-hand side matrix operand
4404  //**********************************************************************************************
4405 
4406  //**********************************************************************************************
// Compile-time switch: true if the left-hand side operand type MT1 is a computation or
// requires an intermediate evaluation. It selects the type LT used for the left operand.
4408  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
4409  //**********************************************************************************************
4410 
4411  //**********************************************************************************************
      // Compile-time switch: true if the right-hand side operand type MT2 is a computation or
      // requires an intermediate evaluation. It selects the type RT used for the right operand.
4413  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
4414  //**********************************************************************************************
4415 
4416  //**********************************************************************************************
// Effective structure flags derived from the template flags SF, HF, LF and UF.
// Combinations are folded so that at most one of SYM/HERM is set, and so that a symmetric or
// Hermitian flag together with a triangular flag yields both LOW and UPP.
4417  static constexpr bool SYM = ( SF && !( HF || LF || UF ) ); // Symmetric only: SF set, no other flag
4418  static constexpr bool HERM = ( HF && !( LF || UF ) ); // Hermitian: HF set, no triangular flag
4419  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) ); // Lower: LF set, or UF combined with SF/HF
4420  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) ); // Upper: UF set, or LF combined with SF/HF
4421  //**********************************************************************************************
4422 
4423  //**********************************************************************************************
4425 
// Helper variable template used to enable/disable the evaluation-based assignment overloads.
// The template parameters T1, T2 and T3 do not enter the expression: the value is true
// whenever either operand has to be evaluated (evaluateLeft || evaluateRight).
4428  template< typename T1, typename T2, typename T3 >
4429  static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
4430  //**********************************************************************************************
4431 
4432  //**********************************************************************************************
4434 
// Helper variable template that decides at compile time whether a BLAS kernel is eligible.
// T1 is the target type, T2 and T3 are the two matrix operand types, T4 is the scalar type.
// All of the following must hold:
//  - BLAS mode and the BLAS matrix/matrix multiplication switch are enabled,
//  - none of the structure flags SYM, HERM, LOW, UPP is set,
//  - all three matrices are contiguous; the target offers mutable data access, the operands
//    offer constant data access,
//  - neither operand is diagonal and all three types are SIMD enabled,
//  - all three element types are BLAS compatible and identical,
//  - the combination "built-in element type with complex scalar type" is excluded.
4436  template< typename T1, typename T2, typename T3, typename T4 >
4437  static constexpr bool UseBlasKernel_v =
4438  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4439  !SYM && !HERM && !LOW && !UPP &&
4440  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4441  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4442  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4443  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4444  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4445  IsBLASCompatible_v< ElementType_t<T1> > &&
4446  IsBLASCompatible_v< ElementType_t<T2> > &&
4447  IsBLASCompatible_v< ElementType_t<T3> > &&
4448  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4449  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4450  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
4451  //**********************************************************************************************
4452 
4453  //**********************************************************************************************
4455 
// Helper variable template that decides at compile time whether the vectorized default
// kernels are eligible. T1 is the target type, T2 and T3 are the two matrix operand types,
// T4 is the scalar type. All of the following must hold:
//  - useOptimizedKernels is enabled,
//  - neither operand is diagonal and all three types are SIMD enabled,
//  - the three element types and the scalar type are SIMD combinable,
//  - SIMD addition and SIMD multiplication exist for the two operand element types.
4457  template< typename T1, typename T2, typename T3, typename T4 >
4458  static constexpr bool UseVectorizedDefaultKernel_v =
4459  ( useOptimizedKernels &&
4460  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4461  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4462  IsSIMDCombinable_v< ElementType_t<T1>
4463  , ElementType_t<T2>
4464  , ElementType_t<T3>
4465  , T4 > &&
4466  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4467  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
4468  //**********************************************************************************************
4469 
4470  //**********************************************************************************************
4472 
// Functor type selected from the effective structure flags:
//   HERM         -> DeclHerm
//   SYM          -> DeclSym
//   LOW && UPP   -> DeclDiag
//   LOW only     -> DeclLow
//   UPP only     -> DeclUpp
//   none         -> Noop
4474  using ForwardFunctor = If_t< HERM
4475  , DeclHerm
4476  , If_t< SYM
4477  , DeclSym
4478  , If_t< LOW
4479  , If_t< UPP
4480  , DeclDiag
4481  , DeclLow >
4482  , If_t< UPP
4483  , DeclUpp
4484  , Noop > > > >;
4485  //**********************************************************************************************
4486 
4487  public:
4488  //**Type definitions****************************************************************************
// Public type definitions of the scaled matrix multiplication expression.
4490  using This = DMatScalarMultExpr<MMM,ST,false>; // Type of this DMatScalarMultExpr instance
4491 
4493  using BaseType = DenseMatrix<This,false>; // Base type of this expression (non-transposed dense matrix)
4494 
      // Result type of the expression: MultTrait of RES and ST, wrapped in the Decl*Trait that
      // corresponds to the effective structure flags (Herm, Sym, Diag, Low or Upp). Without any
      // flag the nested ::Type of MultTrait<RES,ST> is used directly.
4496  using ResultType = typename If_t< HERM
4497  , DeclHermTrait< MultTrait_t<RES,ST> >
4498  , If_t< SYM
4499  , DeclSymTrait< MultTrait_t<RES,ST> >
4500  , If_t< LOW
4501  , If_t< UPP
4502  , DeclDiagTrait< MultTrait_t<RES,ST> >
4503  , DeclLowTrait< MultTrait_t<RES,ST> > >
4504  , If_t< UPP
4505  , DeclUppTrait< MultTrait_t<RES,ST> >
4506  , MultTrait<RES,ST> > > > >::Type;
4507 
4508  using OppositeType = OppositeType_t<ResultType>; // Opposite type of ResultType
4509  using TransposeType = TransposeType_t<ResultType>; // Transpose type of ResultType
4510  using ElementType = ElementType_t<ResultType>; // Element type of ResultType
4511  using SIMDType = SIMDTrait_t<ElementType>; // SIMD type associated with ElementType
4512  using ReturnType = const ElementType; // Return type of the element access functions
4513  using CompositeType = const ResultType; // Data type for composite expression templates
4514 
4516  using LeftOperand = const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>; // Type of the left-hand side (matrix expression) operand
4517 
4519  using RightOperand = ST; // Type of the right-hand side (scalar) operand
4520 
      // Type used for the left-hand side matrix operand inside the assignment functions:
      // an evaluated RT1 if evaluation is required, otherwise the composite type CT1.
4522  using LT = If_t< evaluateLeft, const RT1, CT1 >;
4523 
      // Type used for the right-hand side matrix operand inside the assignment functions:
      // an evaluated RT2 if evaluation is required, otherwise the composite type CT2.
4525  using RT = If_t< evaluateRight, const RT2, CT2 >;
4526  //**********************************************************************************************
4527 
4528  //**Compilation flags***************************************************************************
// Compilation switch for SIMD: set only if neither operand is diagonal, both operand types are
// SIMD enabled, the element types and the scalar type are SIMD combinable, and SIMD addition
// and multiplication are available for the two element types.
4530  static constexpr bool simdEnabled =
4531  ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
4532  MT1::simdEnabled && MT2::simdEnabled &&
4533  IsSIMDCombinable_v<ET1,ET2,ST> &&
4534  HasSIMDAdd_v<ET1,ET2> &&
4535  HasSIMDMult_v<ET1,ET2> );
4536 
      // Compilation switch for SMP assignment: set only if neither operand requires an
      // intermediate evaluation and both operand types are themselves SMP assignable.
4538  static constexpr bool smpAssignable =
4539  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4540  //**********************************************************************************************
4541 
4542  //**SIMD properties*****************************************************************************
// Number of elements packed within a single SIMD element, as reported by SIMDTrait<ElementType>.
4544  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
4545  //**********************************************************************************************
4546 
4547  //**Constructor*********************************************************************************
// Constructor: stores the given multiplication expression and scalar in the members
// matrix_ and scalar_.
// \param matrix The left-hand side dense matrix multiplication expression.
// \param scalar The right-hand side scalar of the multiplication expression.
4553  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4554  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4555  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4556  {}
4557  //**********************************************************************************************
4558 
4559  //**Access operator*****************************************************************************
// 2D element access without run time bounds check (indices are only verified via
// BLAZE_INTERNAL_ASSERT).
// \param i Row access index; expected to be smaller than matrix_.rows().
// \param j Column access index; expected to be smaller than matrix_.columns().
// \return The element (i,j) of the wrapped matrix expression multiplied by the scalar.
4566  inline ReturnType operator()( size_t i, size_t j ) const {
4567  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4568  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4569  return matrix_(i,j) * scalar_;
4570  }
4571  //**********************************************************************************************
4572 
4573  //**At function*********************************************************************************
// Checked 2D element access.
// \param i Row access index.
// \param j Column access index.
// \return The result of operator()(i,j).
// An out-of-range row or column index is reported via BLAZE_THROW_OUT_OF_RANGE before the
// element is accessed.
4581  inline ReturnType at( size_t i, size_t j ) const {
4582  if( i >= matrix_.rows() ) {
4583  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4584  }
4585  if( j >= matrix_.columns() ) {
4586  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4587  }
4588  return (*this)(i,j);
4589  }
4590  //**********************************************************************************************
4591 
4592  //**Rows function*******************************************************************************
// Returns the current number of rows, which is the number of rows of the wrapped
// multiplication expression.
4597  inline size_t rows() const {
4598  return matrix_.rows();
4599  }
4600  //**********************************************************************************************
4601 
4602  //**Columns function****************************************************************************
// Returns the current number of columns, which is the number of columns of the wrapped
// multiplication expression.
4607  inline size_t columns() const {
4608  return matrix_.columns();
4609  }
4610  //**********************************************************************************************
4611 
4612  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the stored dense matrix multiplication expression.
4617  inline LeftOperand leftOperand() const {
4618  return matrix_;
4619  }
4620  //**********************************************************************************************
4621 
4622  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the stored scalar.
4627  inline RightOperand rightOperand() const {
4628  return scalar_;
4629  }
4630  //**********************************************************************************************
4631 
4632  //**********************************************************************************************
// Returns whether the expression can alias with the given address.
// The query is forwarded unchanged to the wrapped multiplication expression.
// \param alias The alias to be checked.
4638  template< typename T >
4639  inline bool canAlias( const T* alias ) const {
4640  return matrix_.canAlias( alias );
4641  }
4642  //**********************************************************************************************
4643 
4644  //**********************************************************************************************
// Returns whether the expression is aliased with the given address.
// The query is forwarded unchanged to the wrapped multiplication expression.
// \param alias The alias to be checked.
4650  template< typename T >
4651  inline bool isAliased( const T* alias ) const {
4652  return matrix_.isAliased( alias );
4653  }
4654  //**********************************************************************************************
4655 
4656  //**********************************************************************************************
// Returns whether the operands of the expression are properly aligned in memory, as reported
// by the wrapped multiplication expression.
4661  inline bool isAligned() const {
4662  return matrix_.isAligned();
4663  }
4664  //**********************************************************************************************
4665 
4666  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments.
// As visible here, the result is true if the element count rows()*columns() reaches
// SMP_DMATTDMATMULT_THRESHOLD and, in addition, either BLAS mode or the BLAS matrix/matrix
// multiplication switch is disabled or the element count is below DMATTDMATMULT_THRESHOLD.
// NOTE(review): the listing numbering skips line 4674 inside the first parenthesis, so a
// further alternative of that disjunction may be missing from this view - confirm against
// the original header.
4671  inline bool canSMPAssign() const noexcept {
4672  return ( !BLAZE_BLAS_MODE ||
4673  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4675  ( rows() * columns() < DMATTDMATMULT_THRESHOLD ) ) &&
4676  ( rows() * columns() >= SMP_DMATTDMATMULT_THRESHOLD );
4677  }
4678  //**********************************************************************************************
4679 
4680  private:
4681  //**Member variables****************************************************************************
4684  //**********************************************************************************************
4685 
4686  //**Assignment to dense matrices****************************************************************
// Assignment of a scaled dense matrix/transpose dense matrix multiplication to a dense matrix
// (lhs = s*A*B).
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// The two matrix operands are extracted from the wrapped multiplication expression, evaluated
// serially, and passed together with the scalar to the kernel selection.
4698  template< typename MT // Type of the target dense matrix
4699  , bool SO > // Storage order of the target dense matrix
4700  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4701  {
4703 
4704  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4705  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4706 
4707  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
4708  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
4709 
      // An empty target needs no work; a zero inner dimension leaves nothing to multiply,
      // so the target is reset instead.
4710  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4711  return;
4712  }
4713  else if( left.columns() == 0UL ) {
4714  reset( ~lhs );
4715  return;
4716  }
4717 
4718  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4719  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4720 
4721  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4722  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4723  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4724  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4725  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4726  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4727 
4728  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4729  }
4730  //**********************************************************************************************
4731 
4732  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = scalar*A*B.
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// The small-matrix kernel is chosen if either operand type is diagonal or if the number of
// elements of C is below DMATTDMATMULT_THRESHOLD; otherwise the BLAS kernel selection is used.
4743  template< typename MT3 // Type of the left-hand side target matrix
4744  , typename MT4 // Type of the left-hand side matrix operand
4745  , typename MT5 // Type of the right-hand side matrix operand
4746  , typename ST2 > // Type of the scalar value
4747  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4748  {
4749  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
4750  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4751  selectSmallAssignKernel( C, A, B, scalar );
4752  else
4753  selectBlasAssignKernel( C, A, B, scalar );
4754  }
4755  //**********************************************************************************************
4756 
4757  //**Default assignment to row-major dense matrices (general/general)****************************
// Default assignment kernel C = scalar*A*B for a row-major target and two non-diagonal operands.
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// The kernel walks C row by row. Index ranges are narrowed at compile time from the
// triangular properties of A and B and from the structure flags; elements outside the
// computed ranges are reset. For SYM/HERM only the part from the diagonal to the right is
// computed and the remaining part is mirrored at the end.
4771  template< typename MT3 // Type of the left-hand side target matrix
4772  , typename MT4 // Type of the left-hand side matrix operand
4773  , typename MT5 // Type of the right-hand side matrix operand
4774  , typename ST2 > // Type of the scalar value
4775  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4776  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4777  {
4778  const size_t M( A.rows() );
4779  const size_t N( B.columns() );
4780  const size_t K( A.columns() );
4781 
4782  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4783 
      // Row range [ibegin,iend) that is actually computed; rows outside of it are only reset.
4784  const size_t ibegin( ( IsStrictlyLower_v<MT4> )
4785  ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
4786  :( 0UL ) );
4787  const size_t iend( ( IsStrictlyUpper_v<MT4> )
4788  ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
4789  :( M ) );
4790  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4791 
4792  for( size_t i=0UL; i<ibegin; ++i ) {
4793  for( size_t j=0UL; j<N; ++j ) {
4794  reset( C(i,j) );
4795  }
4796  }
4797  for( size_t i=ibegin; i<iend; ++i )
4798  {
      // Column range [jbegin,jend) computed for row i. With SYM/HERM/UPP the range starts
      // no earlier than the diagonal; with LOW it ends right after the diagonal.
4799  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4800  ?( ( IsStrictlyUpper_v<MT4> )
4801  ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
4802  :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
4803  :( ( IsStrictlyUpper_v<MT5> )
4804  ?( SYM || HERM || UPP ? max( i, 1UL ) : 1UL )
4805  :( SYM || HERM || UPP ? i : 0UL ) ) );
4806  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
4807  ?( ( IsStrictlyLower_v<MT4> )
4808  ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
4809  :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
4810  :( ( IsStrictlyLower_v<MT5> )
4811  ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
4812  :( LOW ? i+1UL : N ) ) );
4813 
      // With a structure flag set the range may be inverted; the whole row is reset then.
4814  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
4815  for( size_t j=0UL; j<N; ++j ) {
4816  reset( C(i,j) );
4817  }
4818  continue;
4819  }
4820 
4821  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4822 
      // For SYM/HERM the elements left of the diagonal are not touched here; they are
      // written by the mirroring loop at the end of the function.
4823  for( size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
4824  reset( C(i,j) );
4825  }
4826  for( size_t j=jbegin; j<jend; ++j )
4827  {
      // Range [kbegin,kend) of the inner dimension used for element (i,j).
4828  const size_t kbegin( ( IsUpper_v<MT4> )
4829  ?( ( IsLower_v<MT5> )
4830  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4831  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4832  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4833  :( ( IsLower_v<MT5> )
4834  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4835  :( 0UL ) ) );
4836  const size_t kend( ( IsLower_v<MT4> )
4837  ?( ( IsUpper_v<MT5> )
4838  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4839  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4840  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4841  :( ( IsUpper_v<MT5> )
4842  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
4843  :( K ) ) );
4844  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
4845 
      // The first product initializes C(i,j), the remaining products are accumulated,
      // and the finished sum is scaled once.
4846  C(i,j) = A(i,kbegin) * B(kbegin,j);
4847  for( size_t k=kbegin+1UL; k<kend; ++k ) {
4848  C(i,j) += A(i,k) * B(k,j);
4849  }
4850  C(i,j) *= scalar;
4851  }
4852  for( size_t j=jend; j<N; ++j ) {
4853  reset( C(i,j) );
4854  }
4855  }
4856  for( size_t i=iend; i<M; ++i ) {
4857  for( size_t j=0UL; j<N; ++j ) {
4858  reset( C(i,j) );
4859  }
4860  }
4861 
      // Mirror step: fill the elements below the diagonal from their counterparts above it
      // (complex conjugated in the Hermitian case).
4862  if( SYM || HERM ) {
4863  for( size_t i=1UL; i<M; ++i ) {
4864  for( size_t j=0UL; j<i; ++j ) {
4865  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
4866  }
4867  }
4868  }
4869  }
4870  //**********************************************************************************************
4871 
4872  //**Default assignment to column-major dense matrices (general/general)*************************
// Default assignment kernel C = scalar*A*B for a column-major target and two non-diagonal
// operands.
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// The kernel walks C column by column. Index ranges are narrowed at compile time from the
// triangular properties of A and B and from the structure flags; elements outside the
// computed ranges are reset. For SYM/HERM only the part from the diagonal downwards is
// computed and the remaining part is mirrored at the end.
4886  template< typename MT3 // Type of the left-hand side target matrix
4887  , typename MT4 // Type of the left-hand side matrix operand
4888  , typename MT5 // Type of the right-hand side matrix operand
4889  , typename ST2 > // Type of the scalar value
4890  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4891  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4892  {
4893  const size_t M( A.rows() );
4894  const size_t N( B.columns() );
4895  const size_t K( A.columns() );
4896 
4897  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4898 
      // Column range [jbegin,jend) that is actually computed; columns outside of it are only reset.
4899  const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
4900  ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
4901  :( 0UL ) );
4902  const size_t jend( ( IsStrictlyLower_v<MT5> )
4903  ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
4904  :( N ) );
4905  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4906 
4907  for( size_t j=0UL; j<jbegin; ++j ) {
4908  for( size_t i=0UL; i<M; ++i ) {
4909  reset( C(i,j) );
4910  }
4911  }
4912  for( size_t j=jbegin; j<jend; ++j )
4913  {
      // Row range [ibegin,iend) computed for column j. With SYM/HERM/LOW the range starts
      // no earlier than the diagonal; with UPP it ends right after the diagonal.
4914  const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
4915  ?( ( IsStrictlyLower_v<MT4> )
4916  ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
4917  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4918  :( ( IsStrictlyLower_v<MT4> )
4919  ?( SYM || HERM || LOW ? max( j, 1UL ) : 1UL )
4920  :( SYM || HERM || LOW ? j : 0UL ) ) );
4921  const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4922  ?( ( IsStrictlyUpper_v<MT4> )
4923  ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
4924  :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
4925  :( ( IsStrictlyUpper_v<MT4> )
4926  ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
4927  :( UPP ? j+1UL : M ) ) );
4928 
      // With a structure flag set the range may be inverted; the whole column is reset then.
4929  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
4930  for( size_t i=0UL; i<M; ++i ) {
4931  reset( C(i,j) );
4932  }
4933  continue;
4934  }
4935 
4936  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4937 
      // For SYM/HERM the elements above the diagonal are not touched here; they are
      // written by the mirroring loop at the end of the function.
4938  for( size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
4939  reset( C(i,j) );
4940  }
4941  for( size_t i=ibegin; i<iend; ++i )
4942  {
      // Range [kbegin,kend) of the inner dimension used for element (i,j).
4943  const size_t kbegin( ( IsUpper_v<MT4> )
4944  ?( ( IsLower_v<MT5> )
4945  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4946  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4947  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4948  :( ( IsLower_v<MT5> )
4949  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4950  :( 0UL ) ) );
4951  const size_t kend( ( IsLower_v<MT4> )
4952  ?( ( IsUpper_v<MT5> )
4953  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4954  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4955  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4956  :( ( IsUpper_v<MT5> )
4957  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
4958  :( K ) ) );
4959  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
4960 
      // The first product initializes C(i,j), the remaining products are accumulated,
      // and the finished sum is scaled once.
4961  C(i,j) = A(i,kbegin) * B(kbegin,j);
4962  for( size_t k=kbegin+1UL; k<kend; ++k ) {
4963  C(i,j) += A(i,k) * B(k,j);
4964  }
4965  C(i,j) *= scalar;
4966  }
4967  for( size_t i=iend; i<M; ++i ) {
4968  reset( C(i,j) );
4969  }
4970  }
4971  for( size_t j=jend; j<N; ++j ) {
4972  for( size_t i=0UL; i<M; ++i ) {
4973  reset( C(i,j) );
4974  }
4975  }
4976 
      // Mirror step: fill the elements above the diagonal from their counterparts below it
      // (complex conjugated in the Hermitian case).
4977  if( SYM || HERM ) {
4978  for( size_t j=1UL; j<N; ++j ) {
4979  for( size_t i=0UL; i<j; ++i ) {
4980  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
4981  }
4982  }
4983  }
4984  }
4985  //**********************************************************************************************
4986 
4987  //**Default assignment to row-major dense matrices (general/diagonal)***************************
// Default assignment kernel C = scalar*A*B for a row-major target, a non-diagonal left operand
// and a diagonal right operand.
// \param C The target left-hand side dense matrix.
// \param A The left-hand side (general) multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
// Each element reduces to C(i,j) = A(i,j) * B(j,j) * scalar. If A is upper or lower
// triangular, only the corresponding column range of each row is computed and the rest of
// the row is reset.
5001  template< typename MT3 // Type of the left-hand side target matrix
5002  , typename MT4 // Type of the left-hand side matrix operand
5003  , typename MT5 // Type of the right-hand side matrix operand
5004  , typename ST2 > // Type of the scalar value
5005  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5006  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5007  {
5008  const size_t M( A.rows() );
5009  const size_t N( B.columns() );
5010 
5011  for( size_t i=0UL; i<M; ++i )
5012  {
      // Column range [jbegin,jend) of row i derived from the triangular property of A.
5013  const size_t jbegin( ( IsUpper_v<MT4> )
5014  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5015  :( 0UL ) );
5016  const size_t jend( ( IsLower_v<MT4> )
5017  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
5018  :( N ) );
5019  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5020 
5021  if( IsUpper_v<MT4> ) {
5022  for( size_t j=0UL; j<jbegin; ++j ) {
5023  reset( C(i,j) );
5024  }
5025  }
5026  for( size_t j=jbegin; j<jend; ++j ) {
5027  C(i,j) = A(i,j) * B(j,j) * scalar;
5028  }
5029  if( IsLower_v<MT4> ) {
5030  for( size_t j=jend; j<N; ++j ) {
5031  reset( C(i,j) );
5032  }
5033  }
5034  }
5035  }
5036  //**********************************************************************************************
5037 
5038  //**Default assignment to column-major dense matrices (general/diagonal)************************
// Default assignment kernel C = scalar*A*B for a column-major target, a non-diagonal left
// operand and a diagonal right operand.
// \param C The target left-hand side dense matrix.
// \param A The left-hand side (general) multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
// Each element reduces to C(i,j) = A(i,j) * B(j,j) * scalar. The traversal is tiled into
// blocks of BLOCK_SIZE x BLOCK_SIZE elements; within each tile the row range is clipped
// according to the triangular property of A and the clipped-off elements are reset.
5052  template< typename MT3 // Type of the left-hand side target matrix
5053  , typename MT4 // Type of the left-hand side matrix operand
5054  , typename MT5 // Type of the right-hand side matrix operand
5055  , typename ST2 > // Type of the scalar value
5056  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5057  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5058  {
5059  constexpr size_t block( BLOCK_SIZE );
5060 
5061  const size_t M( A.rows() );
5062  const size_t N( B.columns() );
5063 
5064  for( size_t jj=0UL; jj<N; jj+=block ) {
5065  const size_t jend( min( N, jj+block ) );
5066  for( size_t ii=0UL; ii<M; ii+=block ) {
5067  const size_t iend( min( M, ii+block ) );
5068  for( size_t j=jj; j<jend; ++j )
5069  {
      // Row range [ibegin,ipos) of column j within the current tile [ii,iend).
5070  const size_t ibegin( ( IsLower_v<MT4> )
5071  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
5072  :( ii ) );
5073  const size_t ipos( ( IsUpper_v<MT4> )
5074  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
5075  :( iend ) );
5076 
5077  if( IsLower_v<MT4> ) {
5078  for( size_t i=ii; i<ibegin; ++i ) {
5079  reset( C(i,j) );
5080  }
5081  }
5082  for( size_t i=ibegin; i<ipos; ++i ) {
5083  C(i,j) = A(i,j) * B(j,j) * scalar;
5084  }
5085  if( IsUpper_v<MT4> ) {
5086  for( size_t i=ipos; i<iend; ++i ) {
5087  reset( C(i,j) );
5088  }
5089  }
5090  }
5091  }
5092  }
5093  }
5094  //**********************************************************************************************
5095 
5096  //**Default assignment to row-major dense matrices (diagonal/general)***************************
// Default assignment kernel C = scalar*A*B for a row-major target, a diagonal left operand
// and a non-diagonal right operand.
// \param C The target left-hand side dense matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side (general) multiplication operand.
// \param scalar The scaling factor.
// Each element reduces to C(i,j) = A(i,i) * B(i,j) * scalar. The traversal is tiled into
// blocks of BLOCK_SIZE x BLOCK_SIZE elements; within each tile the column range is clipped
// according to the triangular property of B and the clipped-off elements are reset.
5110  template< typename MT3 // Type of the left-hand side target matrix
5111  , typename MT4 // Type of the left-hand side matrix operand
5112  , typename MT5 // Type of the right-hand side matrix operand
5113  , typename ST2 > // Type of the scalar value
5114  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5115  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5116  {
5117  constexpr size_t block( BLOCK_SIZE );
5118 
5119  const size_t M( A.rows() );
5120  const size_t N( B.columns() );
5121 
5122  for( size_t ii=0UL; ii<M; ii+=block ) {
5123  const size_t iend( min( M, ii+block ) );
5124  for( size_t jj=0UL; jj<N; jj+=block ) {
5125  const size_t jend( min( N, jj+block ) );
5126  for( size_t i=ii; i<iend; ++i )
5127  {
      // Column range [jbegin,jpos) of row i within the current tile [jj,jend).
5128  const size_t jbegin( ( IsUpper_v<MT5> )
5129  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
5130  :( jj ) );
5131  const size_t jpos( ( IsLower_v<MT5> )
5132  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
5133  :( jend ) );
5134 
5135  if( IsUpper_v<MT5> ) {
5136  for( size_t j=jj; j<jbegin; ++j ) {
5137  reset( C(i,j) );
5138  }
5139  }
5140  for( size_t j=jbegin; j<jpos; ++j ) {
5141  C(i,j) = A(i,i) * B(i,j) * scalar;
5142  }
5143  if( IsLower_v<MT5> ) {
5144  for( size_t j=jpos; j<jend; ++j ) {
5145  reset( C(i,j) );
5146  }
5147  }
5148  }
5149  }
5150  }
5151  }
5152  //**********************************************************************************************
5153 
5154  //**Default assignment to column-major dense matrices (diagonal/general)************************
// Default assignment kernel C = scalar*A*B for a column-major target, a diagonal left operand
// and a non-diagonal right operand.
// \param C The target left-hand side dense matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side (general) multiplication operand.
// \param scalar The scaling factor.
// Each element reduces to C(i,j) = A(i,i) * B(i,j) * scalar. If B is lower or upper
// triangular, only the corresponding row range of each column is computed and the rest of
// the column is reset.
5168  template< typename MT3 // Type of the left-hand side target matrix
5169  , typename MT4 // Type of the left-hand side matrix operand
5170  , typename MT5 // Type of the right-hand side matrix operand
5171  , typename ST2 > // Type of the scalar value
5172  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5173  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5174  {
5175  const size_t M( A.rows() );
5176  const size_t N( B.columns() );
5177 
5178  for( size_t j=0UL; j<N; ++j )
5179  {
      // Row range [ibegin,iend) of column j derived from the triangular property of B.
5180  const size_t ibegin( ( IsLower_v<MT5> )
5181  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5182  :( 0UL ) );
5183  const size_t iend( ( IsUpper_v<MT5> )
5184  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5185  :( M ) );
5186  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5187 
5188  if( IsLower_v<MT5> ) {
5189  for( size_t i=0UL; i<ibegin; ++i ) {
5190  reset( C(i,j) );
5191  }
5192  }
5193  for( size_t i=ibegin; i<iend; ++i ) {
5194  C(i,j) = A(i,i) * B(i,j) * scalar;
5195  }
5196  if( IsUpper_v<MT5> ) {
5197  for( size_t i=iend; i<M; ++i ) {
5198  reset( C(i,j) );
5199  }
5200  }
5201  }
5202  }
5203  //**********************************************************************************************
5204 
5205  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default assignment kernel C = scalar*A*B for two diagonal operands (any target storage order).
// \param C The target left-hand side dense matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
// The target is reset as a whole and afterwards only its diagonal elements are written.
5219  template< typename MT3 // Type of the left-hand side target matrix
5220  , typename MT4 // Type of the left-hand side matrix operand
5221  , typename MT5 // Type of the right-hand side matrix operand
5222  , typename ST2 > // Type of the scalar value
5223  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5224  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5225  {
5226  reset( C );
5227 
5228  for( size_t i=0UL; i<A.rows(); ++i ) {
5229  C(i,i) = A(i,i) * B(i,i) * scalar;
5230  }
5231  }
5232  //**********************************************************************************************
5233 
5234  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel, non-vectorized fallback.
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// This overload is selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false and
// simply forwards to the matching selectDefaultAssignKernel overload.
5248  template< typename MT3 // Type of the left-hand side target matrix
5249  , typename MT4 // Type of the left-hand side matrix operand
5250  , typename MT5 // Type of the right-hand side matrix operand
5251  , typename ST2 > // Type of the scalar value
5252  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5253  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5254  {
5255  selectDefaultAssignKernel( C, A, B, scalar );
5256  }
5257  //**********************************************************************************************
5258 
5259  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
/*!\brief Vectorized default assignment of a small, scaled dense matrix-transpose dense matrix
//        multiplication to a row-major dense matrix (\f$ C=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Each element of \a C is computed as a dot product along \a k: SIMD vectors loaded with
// \c A.load(i,k) and \c B.load(k,j) are multiplied and accumulated, reduced via \c sum() and
// multiplied by \a scalar. Rows are handled in pairs and columns in groups of four, two and
// one; leftover rows are handled one at a time. The \a k range of every tile is narrowed by
// the \c IsUpper_v / \c IsLower_v traits of the operands. The SYM, HERM, LOW and UPP flags
// decide which elements of \a C are computed, copied from the mirrored position, or reset.
*/
5274  template< typename MT3 // Type of the left-hand side target matrix
5275  , typename MT4 // Type of the left-hand side matrix operand
5276  , typename MT5 // Type of the right-hand side matrix operand
5277  , typename ST2 > // Type of the scalar value
5278  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5279  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5280  {
      // A scalar tail loop over k is only required if at least one operand is not padded.
5281  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5282 
5283  const size_t M( A.rows() );
5284  const size_t N( B.columns() );
5285  const size_t K( A.columns() );
5286 
5287  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5288 
5289  size_t i( 0UL );
5290 
      // Main loop: two rows of C per iteration. It is skipped completely if both LOW and UPP
      // are set; in that case only the single-row loop further below runs.
5291  for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
5292  {
      // With LOW set, columns are only computed up to the end of the current 2-row band.
5293  const size_t jend( LOW ? i+2UL : N );
5294  size_t j( 0UL );
5295 
      // Columns left of the band: copy the mirrored element C(j,i) (conjugated for HERM),
      // which belongs to a row written in an earlier iteration, or reset them for UPP.
5296  if( SYM || HERM ) {
5297  for( ; j<i; ++j ) {
5298  C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
5299  C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
5300  }
5301  }
5302  else if( UPP ) {
5303  for( ; j<i; ++j ) {
5304  reset( C(i ,j) );
5305  reset( C(i+1UL,j) );
5306  }
5307  }
5308 
      // 2x4 tile: two rows and four columns of C at once.
5309  for( ; (j+4UL) <= jend; j+=4UL )
5310  {
      // k range restricted by the upper/lower structure of A and B. The mask
      // size_t(-SIMDSIZE) rounds kbegin down to a multiple of SIMDSIZE
      // (presumes SIMDSIZE is a power of two).
5311  const size_t kbegin( ( IsUpper_v<MT4> )
5312  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5313  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5314  const size_t kend( ( IsLower_v<MT4> )
5315  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
5316  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
5317 
      // kpos marks the end of the part of the k range processed with SIMD loads.
5318  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5319  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5320 
      // NOTE(review): the accumulators are not explicitly initialized; presumably the
      // SIMDType default constructor yields zero - confirm.
5321  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5322  size_t k( kbegin );
5323 
5324  for( ; k<kpos; k+=SIMDSIZE ) {
5325  const SIMDType a1( A.load(i ,k) );
5326  const SIMDType a2( A.load(i+1UL,k) );
5327  const SIMDType b1( B.load(k,j ) );
5328  const SIMDType b2( B.load(k,j+1UL) );
5329  const SIMDType b3( B.load(k,j+2UL) );
5330  const SIMDType b4( B.load(k,j+3UL) );
5331  xmm1 += a1 * b1;
5332  xmm2 += a1 * b2;
5333  xmm3 += a1 * b3;
5334  xmm4 += a1 * b4;
5335  xmm5 += a2 * b1;
5336  xmm6 += a2 * b2;
5337  xmm7 += a2 * b3;
5338  xmm8 += a2 * b4;
5339  }
5340 
      // Horizontal reduction of each accumulator, scaled and stored (plain assignment).
5341  C(i ,j ) = sum( xmm1 ) * scalar;
5342  C(i ,j+1UL) = sum( xmm2 ) * scalar;
5343  C(i ,j+2UL) = sum( xmm3 ) * scalar;
5344  C(i ,j+3UL) = sum( xmm4 ) * scalar;
5345  C(i+1UL,j ) = sum( xmm5 ) * scalar;
5346  C(i+1UL,j+1UL) = sum( xmm6 ) * scalar;
5347  C(i+1UL,j+2UL) = sum( xmm7 ) * scalar;
5348  C(i+1UL,j+3UL) = sum( xmm8 ) * scalar;
5349 
      // Scalar handling of the k values not covered by the SIMD loop (up to kend).
5350  for( ; remainder && k<kend; ++k ) {
5351  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5352  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5353  C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
5354  C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
5355  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5356  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5357  C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
5358  C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
5359  }
5360  }
5361 
      // 2x2 tile for the columns the 2x4 loop could not cover.
5362  for( ; (j+2UL) <= jend; j+=2UL )
5363  {
5364  const size_t kbegin( ( IsUpper_v<MT4> )
5365  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5366  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5367  const size_t kend( ( IsLower_v<MT4> )
5368  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
5369  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5370 
5371  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5372  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5373 
5374  SIMDType xmm1, xmm2, xmm3, xmm4;
5375  size_t k( kbegin );
5376 
5377  for( ; k<kpos; k+=SIMDSIZE ) {
5378  const SIMDType a1( A.load(i ,k) );
5379  const SIMDType a2( A.load(i+1UL,k) );
5380  const SIMDType b1( B.load(k,j ) );
5381  const SIMDType b2( B.load(k,j+1UL) );
5382  xmm1 += a1 * b1;
5383  xmm2 += a1 * b2;
5384  xmm3 += a2 * b1;
5385  xmm4 += a2 * b2;
5386  }
5387 
5388  C(i ,j ) = sum( xmm1 ) * scalar;
5389  C(i ,j+1UL) = sum( xmm2 ) * scalar;
5390  C(i+1UL,j ) = sum( xmm3 ) * scalar;
5391  C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
5392 
5393  for( ; remainder && k<kend; ++k ) {
5394  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5395  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5396  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5397  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5398  }
5399  }
5400 
      // At most one column is left for this pair of rows (2x1 tile).
5401  if( j < jend )
5402  {
5403  const size_t kbegin( ( IsUpper_v<MT4> )
5404  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5405  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5406  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
5407 
5408  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5409  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5410 
5411  SIMDType xmm1, xmm2;
5412  size_t k( kbegin );
5413 
5414  for( ; k<kpos; k+=SIMDSIZE ) {
5415  const SIMDType b1( B.load(k,j) );
5416  xmm1 += A.load(i ,k) * b1;
5417  xmm2 += A.load(i+1UL,k) * b1;
5418  }
5419 
5420  C(i ,j) = sum( xmm1 ) * scalar;
5421  C(i+1UL,j) = sum( xmm2 ) * scalar;
5422 
5423  for( ; remainder && k<kend; ++k ) {
5424  C(i ,j) += A(i ,k) * B(k,j) * scalar;
5425  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5426  }
5427 
      // Step past the column just computed so the reset loop below starts behind it.
5428  if( LOW ) ++j;
5429  }
5430 
      // With LOW set, all remaining columns of the two rows are reset.
5431  if( LOW ) {
5432  for( ; j<N; ++j ) {
5433  reset( C(i ,j) );
5434  reset( C(i+1UL,j) );
5435  }
5436  }
5437  }
5438 
      // Remaining rows, one row of C per iteration (1x4, 1x2 and 1x1 tiles).
5439  for( ; i<M; ++i )
5440  {
5441  const size_t jend( LOW ? i+1UL : N );
5442  size_t j( 0UL );
5443 
5444  if( SYM || HERM ) {
5445  for( ; j<i; ++j ) {
5446  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5447  }
5448  }
5449  else if( UPP ) {
5450  for( ; j<i; ++j ) {
5451  reset( C(i,j) );
5452  }
5453  }
5454 
      // The multi-column tiles are skipped if both LOW and UPP are set; then only the
      // single element handled by the 'if( j < jend )' branch below is computed per row.
5455  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
5456  {
5457  const size_t kbegin( ( IsUpper_v<MT4> )
5458  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5459  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5460  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
5461 
5462  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5463  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5464 
5465  SIMDType xmm1, xmm2, xmm3, xmm4;
5466  size_t k( kbegin );
5467 
5468  for( ; k<kpos; k+=SIMDSIZE ) {
5469  const SIMDType a1( A.load(i,k) );
5470  xmm1 += a1 * B.load(k,j );
5471  xmm2 += a1 * B.load(k,j+1UL);
5472  xmm3 += a1 * B.load(k,j+2UL);
5473  xmm4 += a1 * B.load(k,j+3UL);
5474  }
5475 
5476  C(i,j ) = sum( xmm1 ) * scalar;
5477  C(i,j+1UL) = sum( xmm2 ) * scalar;
5478  C(i,j+2UL) = sum( xmm3 ) * scalar;
5479  C(i,j+3UL) = sum( xmm4 ) * scalar;
5480 
5481  for( ; remainder && k<kend; ++k ) {
5482  C(i,j ) += A(i,k) * B(k,j ) * scalar;
5483  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5484  C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
5485  C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
5486  }
5487  }
5488 
5489  for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
5490  {
5491  const size_t kbegin( ( IsUpper_v<MT4> )
5492  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5493  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5494  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
5495 
5496  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5497  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5498 
5499  SIMDType xmm1, xmm2;
5500  size_t k( kbegin );
5501 
5502  for( ; k<kpos; k+=SIMDSIZE ) {
5503  const SIMDType a1( A.load(i,k) );
5504  xmm1 += a1 * B.load(k,j );
5505  xmm2 += a1 * B.load(k,j+1UL);
5506  }
5507 
5508  C(i,j ) = sum( xmm1 ) * scalar;
5509  C(i,j+1UL) = sum( xmm2 ) * scalar;
5510 
5511  for( ; remainder && k<kend; ++k ) {
5512  C(i,j ) += A(i,k) * B(k,j ) * scalar;
5513  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5514  }
5515  }
5516 
      // Single remaining element of the row; here the k range always ends at K.
5517  if( j < jend )
5518  {
5519  const size_t kbegin( ( IsUpper_v<MT4> )
5520  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5521  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5522 
5523  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
5524  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5525 
5526  SIMDType xmm1;
5527  size_t k( kbegin );
5528 
5529  for( ; k<kpos; k+=SIMDSIZE ) {
5530  xmm1 += A.load(i,k) * B.load(k,j);
5531  }
5532 
5533  C(i,j) = sum( xmm1 ) * scalar;
5534 
5535  for( ; remainder && k<K; ++k ) {
5536  C(i,j) += A(i,k) * B(k,j) * scalar;
5537  }
5538 
5539  if( LOW ) ++j;
5540  }
5541 
5542  if( LOW ) {
5543  for( ; j<N; ++j ) {
5544  reset( C(i,j) );
5545  }
5546  }
5547  }
5548  }
5549  //**********************************************************************************************
5550 
5551  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
/*!\brief Vectorized default assignment of a small, scaled dense matrix-transpose dense matrix
//        multiplication to a column-major dense matrix (\f$ C=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Each element of \a C is computed as a dot product along \a k: SIMD vectors loaded with
// \c A.load(i,k) and \c B.load(k,j) are multiplied and accumulated, reduced via \c sum() and
// multiplied by \a scalar. Rows are handled in groups of four, then two, then one; within a
// row group the columns are handled in pairs followed by at most one single column. The \a k
// range of every tile is narrowed by the \c IsUpper_v / \c IsLower_v traits of the operands.
// The SYM, HERM, LOW and UPP flags decide which elements of \a C are computed, copied from
// the mirrored position, or reset.
*/
5566  template< typename MT3 // Type of the left-hand side target matrix
5567  , typename MT4 // Type of the left-hand side matrix operand
5568  , typename MT5 // Type of the right-hand side matrix operand
5569  , typename ST2 > // Type of the scalar value
5570  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5571  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5572  {
      // A scalar tail loop over k is only required if at least one operand is not padded.
5573  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5574 
5575  const size_t M( A.rows() );
5576  const size_t N( B.columns() );
5577  const size_t K( A.columns() );
5578 
5579  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5580 
5581  size_t i( 0UL );
5582 
      // First pass: four rows of C per iteration. Skipped if both LOW and UPP are set.
5583  for( ; !( LOW && UPP ) && (i+4UL) <= M; i+=4UL )
5584  {
      // With LOW set, columns are only computed up to the end of the current 4-row band.
5585  const size_t jend( LOW ? i+4UL : N );
5586  size_t j( 0UL );
5587 
      // Columns left of the band: copy the mirrored element C(j,i) (conjugated for HERM),
      // which belongs to a row written in an earlier iteration, or reset them for UPP.
5588  if( SYM || HERM ) {
5589  for( ; j<i; ++j ) {
5590  C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
5591  C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
5592  C(i+2UL,j) = HERM ? conj( C(j,i+2UL) ) : C(j,i+2UL);
5593  C(i+3UL,j) = HERM ? conj( C(j,i+3UL) ) : C(j,i+3UL);
5594  }
5595  }
5596  else if( UPP ) {
5597  for( ; j<i; ++j ) {
5598  reset( C(i ,j) );
5599  reset( C(i+1UL,j) );
5600  reset( C(i+2UL,j) );
5601  reset( C(i+3UL,j) );
5602  }
5603  }
5604 
      // 4x2 tile: four rows and two columns of C at once.
5605  for( ; (j+2UL) <= jend; j+=2UL )
5606  {
      // k range restricted by the upper/lower structure of A and B. The mask
      // size_t(-SIMDSIZE) rounds kbegin down to a multiple of SIMDSIZE
      // (presumes SIMDSIZE is a power of two).
5607  const size_t kbegin( ( IsUpper_v<MT4> )
5608  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5609  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5610  const size_t kend( ( IsLower_v<MT4> )
5611  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
5612  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5613 
      // kpos marks the end of the part of the k range processed with SIMD loads.
5614  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5615  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5616 
      // NOTE(review): the accumulators are not explicitly initialized; presumably the
      // SIMDType default constructor yields zero - confirm.
5617  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5618  size_t k( kbegin );
5619 
5620  for( ; k<kpos; k+=SIMDSIZE ) {
5621  const SIMDType a1( A.load(i ,k) );
5622  const SIMDType a2( A.load(i+1UL,k) );
5623  const SIMDType a3( A.load(i+2UL,k) );
5624  const SIMDType a4( A.load(i+3UL,k) );
5625  const SIMDType b1( B.load(k,j ) );
5626  const SIMDType b2( B.load(k,j+1UL) );
5627  xmm1 += a1 * b1;
5628  xmm2 += a1 * b2;
5629  xmm3 += a2 * b1;
5630  xmm4 += a2 * b2;
5631  xmm5 += a3 * b1;
5632  xmm6 += a3 * b2;
5633  xmm7 += a4 * b1;
5634  xmm8 += a4 * b2;
5635  }
5636 
      // Horizontal reduction of each accumulator, scaled and stored (plain assignment).
5637  C(i ,j ) = sum( xmm1 ) * scalar;
5638  C(i ,j+1UL) = sum( xmm2 ) * scalar;
5639  C(i+1UL,j ) = sum( xmm3 ) * scalar;
5640  C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
5641  C(i+2UL,j ) = sum( xmm5 ) * scalar;
5642  C(i+2UL,j+1UL) = sum( xmm6 ) * scalar;
5643  C(i+3UL,j ) = sum( xmm7 ) * scalar;
5644  C(i+3UL,j+1UL) = sum( xmm8 ) * scalar;
5645 
      // Scalar handling of the k values not covered by the SIMD loop (up to kend).
5646  for( ; remainder && k<kend; ++k ) {
5647  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5648  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5649  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5650  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5651  C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
5652  C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
5653  C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
5654  C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
5655  }
5656  }
5657 
      // At most one column is left for this group of four rows (4x1 tile).
5658  if( j < jend )
5659  {
5660  const size_t kbegin( ( IsUpper_v<MT4> )
5661  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5662  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5663  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
5664 
5665  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5666  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5667 
5668  SIMDType xmm1, xmm2, xmm3, xmm4;
5669  size_t k( kbegin );
5670 
5671  for( ; k<kpos; k+=SIMDSIZE ) {
5672  const SIMDType b1( B.load(k,j) );
5673  xmm1 += A.load(i ,k) * b1;
5674  xmm2 += A.load(i+1UL,k) * b1;
5675  xmm3 += A.load(i+2UL,k) * b1;
5676  xmm4 += A.load(i+3UL,k) * b1;
5677  }
5678 
5679  C(i ,j) = sum( xmm1 ) * scalar;
5680  C(i+1UL,j) = sum( xmm2 ) * scalar;
5681  C(i+2UL,j) = sum( xmm3 ) * scalar;
5682  C(i+3UL,j) = sum( xmm4 ) * scalar;
5683 
5684  for( ; remainder && k<kend; ++k ) {
5685  C(i ,j) += A(i ,k) * B(k,j) * scalar;
5686  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5687  C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
5688  C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
5689  }
5690 
      // Step past the column just computed so the reset loop below starts behind it.
5691  if( LOW ) ++j;
5692  }
5693 
      // With LOW set, all remaining columns of the four rows are reset.
5694  if( LOW ) {
5695  for( ; j<N; ++j ) {
5696  reset( C(i ,j) );
5697  reset( C(i+1UL,j) );
5698  reset( C(i+2UL,j) );
5699  reset( C(i+3UL,j) );
5700  }
5701  }
5702  }
5703 
      // Second pass: two rows of C per iteration (2x2 and 2x1 tiles). Also skipped if both
      // LOW and UPP are set.
5704  for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
5705  {
5706  const size_t jend( LOW ? i+2UL : N );
5707  size_t j( 0UL );
5708 
5709  if( SYM || HERM ) {
5710  for( ; j<i; ++j ) {
5711  C(i ,j) = HERM ? conj( C(j,i ) ) : C(j,i );
5712  C(i+1UL,j) = HERM ? conj( C(j,i+1UL) ) : C(j,i+1UL);
5713  }
5714  }
5715  else if( UPP ) {
5716  for( ; j<i; ++j ) {
5717  reset( C(i ,j) );
5718  reset( C(i+1UL,j) );
5719  }
5720  }
5721 
5722  for( ; (j+2UL) <= jend; j+=2UL )
5723  {
5724  const size_t kbegin( ( IsUpper_v<MT4> )
5725  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5726  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5727  const size_t kend( ( IsLower_v<MT4> )
5728  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
5729  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5730 
5731  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5732  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5733 
5734  SIMDType xmm1, xmm2, xmm3, xmm4;
5735  size_t k( kbegin );
5736 
5737  for( ; k<kpos; k+=SIMDSIZE ) {
5738  const SIMDType a1( A.load(i ,k) );
5739  const SIMDType a2( A.load(i+1UL,k) );
5740  const SIMDType b1( B.load(k,j ) );
5741  const SIMDType b2( B.load(k,j+1UL) );
5742  xmm1 += a1 * b1;
5743  xmm2 += a1 * b2;
5744  xmm3 += a2 * b1;
5745  xmm4 += a2 * b2;
5746  }
5747 
5748  C(i ,j ) = sum( xmm1 ) * scalar;
5749  C(i ,j+1UL) = sum( xmm2 ) * scalar;
5750  C(i+1UL,j ) = sum( xmm3 ) * scalar;
5751  C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
5752 
5753  for( ; remainder && k<kend; ++k ) {
5754  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5755  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5756  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5757  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5758  }
5759  }
5760 
5761  if( j < jend )
5762  {
5763  const size_t kbegin( ( IsUpper_v<MT4> )
5764  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5765  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5766  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
5767 
5768  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5769  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5770 
5771  SIMDType xmm1, xmm2;
5772  size_t k( kbegin );
5773 
5774  for( ; k<kpos; k+=SIMDSIZE ) {
5775  const SIMDType b1( B.load(k,j) );
5776  xmm1 += A.load(i ,k) * b1;
5777  xmm2 += A.load(i+1UL,k) * b1;
5778  }
5779 
5780  C(i ,j) = sum( xmm1 ) * scalar;
5781  C(i+1UL,j) = sum( xmm2 ) * scalar;
5782 
5783  for( ; remainder && k<kend; ++k ) {
5784  C(i ,j) += A(i ,k) * B(k,j) * scalar;
5785  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5786  }
5787 
5788  if( LOW ) ++j;
5789  }
5790 
5791  if( LOW ) {
5792  for( ; j<N; ++j ) {
5793  reset( C(i ,j) );
5794  reset( C(i+1UL,j) );
5795  }
5796  }
5797  }
5798 
      // Remaining rows, one row of C per iteration (1x2 and 1x1 tiles). This is the only
      // loop that runs if both LOW and UPP are set.
5799  for( ; i<M; ++i )
5800  {
5801  const size_t jend( LOW ? i+1UL : N );
5802  size_t j( 0UL );
5803 
5804  if( SYM || HERM ) {
5805  for( ; j<i; ++j ) {
5806  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5807  }
5808  }
5809  else if( UPP ) {
5810  for( ; j<i; ++j ) {
5811  reset( C(i,j) );
5812  }
5813  }
5814 
5815  for( ; (j+2UL) <= jend; j+=2UL )
5816  {
5817  const size_t kbegin( ( IsUpper_v<MT4> )
5818  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5819  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5820  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
5821 
5822  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5823  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5824 
5825  SIMDType xmm1, xmm2;
5826  size_t k( kbegin );
5827 
5828  for( ; k<kpos; k+=SIMDSIZE ) {
5829  const SIMDType a1( A.load(i,k) );
5830  xmm1 += a1 * B.load(k,j );
5831  xmm2 += a1 * B.load(k,j+1UL);
5832  }
5833 
5834  C(i,j ) = sum( xmm1 ) * scalar;
5835  C(i,j+1UL) = sum( xmm2 ) * scalar;
5836 
5837  for( ; remainder && k<kend; ++k ) {
5838  C(i,j ) += A(i,k) * B(k,j ) * scalar;
5839  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5840  }
5841  }
5842 
      // Single remaining element of the row; here the k range always ends at K.
5843  if( j < jend )
5844  {
5845  const size_t kbegin( ( IsUpper_v<MT4> )
5846  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5847  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5848 
5849  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
5850  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5851 
5852  SIMDType xmm1;
5853  size_t k( kbegin );
5854 
5855  for( ; k<kpos; k+=SIMDSIZE ) {
5856  xmm1 += A.load(i,k) * B.load(k,j);
5857  }
5858 
5859  C(i,j) = sum( xmm1 ) * scalar;
5860 
5861  for( ; remainder && k<K; ++k ) {
5862  C(i,j) += A(i,k) * B(k,j) * scalar;
5863  }
5864 
5865  if( LOW ) ++j;
5866  }
5867 
5868  if( LOW ) {
5869  for( ; j<N; ++j ) {
5870  reset( C(i,j) );
5871  }
5872  }
5873  }
5874  }
5875  //**********************************************************************************************
5876 
5877  //**Default assignment to dense matrices (large matrices)***************************************
5891  template< typename MT3 // Type of the left-hand side target matrix
5892  , typename MT4 // Type of the left-hand side matrix operand
5893  , typename MT5 // Type of the right-hand side matrix operand
5894  , typename ST2 > // Type of the scalar value
5895  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5896  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5897  {
5898  selectDefaultAssignKernel( C, A, B, scalar );
5899  }
5900  //**********************************************************************************************
5901 
5902  //**Vectorized default assignment to dense matrices (large matrices)****************************
5917  template< typename MT3 // Type of the left-hand side target matrix
5918  , typename MT4 // Type of the left-hand side matrix operand
5919  , typename MT5 // Type of the right-hand side matrix operand
5920  , typename ST2 > // Type of the scalar value
5921  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5922  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5923  {
5924  if( SYM )
5925  smmm( C, A, B, scalar );
5926  else if( HERM )
5927  hmmm( C, A, B, scalar );
5928  else if( LOW )
5929  lmmm( C, A, B, scalar, ST2(0) );
5930  else if( UPP )
5931  ummm( C, A, B, scalar, ST2(0) );
5932  else
5933  mmm( C, A, B, scalar, ST2(0) );
5934  }
5935  //**********************************************************************************************
5936 
5937  //**BLAS-based assignment to dense matrices (default)*******************************************
5951  template< typename MT3 // Type of the left-hand side target matrix
5952  , typename MT4 // Type of the left-hand side matrix operand
5953  , typename MT5 // Type of the right-hand side matrix operand
5954  , typename ST2 > // Type of the scalar value
5955  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5956  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
5957  {
5958  selectLargeAssignKernel( C, A, B, scalar );
5959  }
5960  //**********************************************************************************************
5961 
5962  //**BLAS-based assignment to dense matrices*****************************************************
5963 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
5964 
5977  template< typename MT3 // Type of the left-hand side target matrix
5978  , typename MT4 // Type of the left-hand side matrix operand
5979  , typename MT5 // Type of the right-hand side matrix operand
5980  , typename ST2 > // Type of the scalar value
5981  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5982  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
5983  {
5984  using ET = ElementType_t<MT3>;
5985 
5986  if( IsTriangular_v<MT4> ) {
5987  assign( C, B );
5988  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
5989  }
5990  else if( IsTriangular_v<MT5> ) {
5991  assign( C, A );
5992  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
5993  }
5994  else {
5995  gemm( C, A, B, ET(scalar), ET(0) );
5996  }
5997  }
5998 #endif
5999  //**********************************************************************************************
6000 
6001  //**Assignment to sparse matrices***************************************************************
/*!\brief Assignment of a scaled dense matrix-transpose dense matrix multiplication to a
//        sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void
//
// The expression is first evaluated serially into a temporary of type \c TmpType, which is
// then passed through the forward functor and assigned to the sparse target.
//
// NOTE(review): the embedded listing numbers skip 6017 and 6021-6026 inside this body, so the
// listing has probably dropped statements here - compare against the original header.
*/
6013  template< typename MT // Type of the target sparse matrix
6014  , bool SO > // Storage order of the target sparse matrix
6015  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6016  {
6018 
      // The temporary is of type OppositeType if SO is true and of type ResultType otherwise.
6019  using TmpType = If_t< SO, OppositeType, ResultType >;
6020 
6027 
6028  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6029  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6030 
6031  const ForwardFunctor fwd;
6032 
      // Serial evaluation of the complete expression, then assignment of the forwarded result.
6033  const TmpType tmp( serial( rhs ) );
6034  assign( ~lhs, fwd( tmp ) );
6035  }
6036  //**********************************************************************************************
6037 
6038  //**Addition assignment to dense matrices*******************************************************
/*!\brief Addition assignment of a scaled dense matrix-transpose dense matrix multiplication
//        to a dense matrix (\f$ C+=s*A*B \f$).
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
// \return void
//
// Both operands of the wrapped multiplication are evaluated serially; the actual work is
// delegated to selectAddAssignKernel() together with the scalar of the expression.
//
// NOTE(review): the embedded listing numbers skip 6054 inside this body, so the listing has
// probably dropped a statement here - compare against the original header.
*/
6050  template< typename MT // Type of the target dense matrix
6051  , bool SO > // Storage order of the target dense matrix
6052  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6053  {
6055 
6056  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6057  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6058 
6059  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6060  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6061 
      // Nothing to add if the target is empty or the inner dimension of the product is zero.
6062  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6063  return;
6064  }
6065 
6066  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6067  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6068 
6069  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6070  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6071  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6072  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6073  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6074  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6075 
6076  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
6077  }
6078  //**********************************************************************************************
6079 
6080  //**Addition assignment to dense matrices (kernel selection)************************************
6091  template< typename MT3 // Type of the left-hand side target matrix
6092  , typename MT4 // Type of the left-hand side matrix operand
6093  , typename MT5 // Type of the right-hand side matrix operand
6094  , typename ST2 > // Type of the scalar value
6095  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6096  {
6097  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
6098  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
6099  selectSmallAddAssignKernel( C, A, B, scalar );
6100  else
6101  selectBlasAddAssignKernel( C, A, B, scalar );
6102  }
6103  //**********************************************************************************************
6104 
6105  //**Default addition assignment to dense matrices (general/general)*****************************
6119  template< typename MT3 // Type of the left-hand side target matrix
6120  , typename MT4 // Type of the left-hand side matrix operand
6121  , typename MT5 // Type of the right-hand side matrix operand
6122  , typename ST2 > // Type of the scalar value
6123  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6124  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6125  {
6126  const ResultType tmp( serial( A * B * scalar ) );
6127  addAssign( C, tmp );
6128  }
6129  //**********************************************************************************************
6130 
6131  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
6145  template< typename MT3 // Type of the left-hand side target matrix
6146  , typename MT4 // Type of the left-hand side matrix operand
6147  , typename MT5 // Type of the right-hand side matrix operand
6148  , typename ST2 > // Type of the scalar value
6149  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6150  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6151  {
6152  const size_t M( A.rows() );
6153  const size_t N( B.columns() );
6154 
6155  for( size_t i=0UL; i<M; ++i )
6156  {
6157  const size_t jbegin( ( IsUpper_v<MT4> )
6158  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
6159  :( 0UL ) );
6160  const size_t jend( ( IsLower_v<MT4> )
6161  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
6162  :( N ) );
6163  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6164 
6165  const size_t jnum( jend - jbegin );
6166  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6167 
6168  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6169  C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6170  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6171  }
6172  if( jpos < jend ) {
6173  C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
6174  }
6175  }
6176  }
6177  //**********************************************************************************************
6178 
6179  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
6193  template< typename MT3 // Type of the left-hand side target matrix
6194  , typename MT4 // Type of the left-hand side matrix operand
6195  , typename MT5 // Type of the right-hand side matrix operand
6196  , typename ST2 > // Type of the scalar value
6197  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6198  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6199  {
6200  constexpr size_t block( BLOCK_SIZE );
6201 
6202  const size_t M( A.rows() );
6203  const size_t N( B.columns() );
6204 
6205  for( size_t jj=0UL; jj<N; jj+=block ) {
6206  const size_t jend( min( N, jj+block ) );
6207  for( size_t ii=0UL; ii<M; ii+=block ) {
6208  const size_t iend( min( M, ii+block ) );
6209  for( size_t j=jj; j<jend; ++j )
6210  {
6211  const size_t ibegin( ( IsLower_v<MT4> )
6212  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
6213  :( ii ) );
6214  const size_t ipos( ( IsUpper_v<MT4> )
6215  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
6216  :( iend ) );
6217 
6218  for( size_t i=ibegin; i<ipos; ++i ) {
6219  C(i,j) += A(i,j) * B(j,j) * scalar;
6220  }
6221  }
6222  }
6223  }
6224  }
6225  //**********************************************************************************************
6226 
6227  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
6241  template< typename MT3 // Type of the left-hand side target matrix
6242  , typename MT4 // Type of the left-hand side matrix operand
6243  , typename MT5 // Type of the right-hand side matrix operand
6244  , typename ST2 > // Type of the scalar value
6245  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6246  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6247  {
6248  constexpr size_t block( BLOCK_SIZE );
6249 
6250  const size_t M( A.rows() );
6251  const size_t N( B.columns() );
6252 
6253  for( size_t ii=0UL; ii<M; ii+=block ) {
6254  const size_t iend( min( M, ii+block ) );
6255  for( size_t jj=0UL; jj<N; jj+=block ) {
6256  const size_t jend( min( N, jj+block ) );
6257  for( size_t i=ii; i<iend; ++i )
6258  {
6259  const size_t jbegin( ( IsUpper_v<MT5> )
6260  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
6261  :( jj ) );
6262  const size_t jpos( ( IsLower_v<MT5> )
6263  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
6264  :( jend ) );
6265 
6266  for( size_t j=jbegin; j<jpos; ++j ) {
6267  C(i,j) += A(i,i) * B(i,j) * scalar;
6268  }
6269  }
6270  }
6271  }
6272  }
6273  //**********************************************************************************************
6274 
6275  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
6289  template< typename MT3 // Type of the left-hand side target matrix
6290  , typename MT4 // Type of the left-hand side matrix operand
6291  , typename MT5 // Type of the right-hand side matrix operand
6292  , typename ST2 > // Type of the scalar value
6293  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6294  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6295  {
6296  const size_t M( A.rows() );
6297  const size_t N( B.columns() );
6298 
6299  for( size_t j=0UL; j<N; ++j )
6300  {
6301  const size_t ibegin( ( IsLower_v<MT5> )
6302  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
6303  :( 0UL ) );
6304  const size_t iend( ( IsUpper_v<MT5> )
6305  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
6306  :( M ) );
6307  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6308 
6309  const size_t inum( iend - ibegin );
6310  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6311 
6312  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6313  C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6314  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6315  }
6316  if( ipos < iend ) {
6317  C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
6318  }
6319  }
6320  }
6321  //**********************************************************************************************
6322 
6323  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
6337  template< typename MT3 // Type of the left-hand side target matrix
6338  , typename MT4 // Type of the left-hand side matrix operand
6339  , typename MT5 // Type of the right-hand side matrix operand
6340  , typename ST2 > // Type of the scalar value
6341  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6342  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6343  {
6344  for( size_t i=0UL; i<A.rows(); ++i ) {
6345  C(i,i) += A(i,i) * B(i,i) * scalar;
6346  }
6347  }
6348  //**********************************************************************************************
6349 
6350  //**Default addition assignment to dense matrices (small matrices)******************************
6364  template< typename MT3 // Type of the left-hand side target matrix
6365  , typename MT4 // Type of the left-hand side matrix operand
6366  , typename MT5 // Type of the right-hand side matrix operand
6367  , typename ST2 > // Type of the scalar value
6368  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6369  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6370  {
6371  selectDefaultAddAssignKernel( C, A, B, scalar );
6372  }
6373  //**********************************************************************************************
6374 
6375  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
/*!\brief Vectorized addition assignment kernel for small matrices and a row-major target
//        (C += A * B * scalar).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The kernel walks the target in small tiles: two rows at a time against four, two and finally
// one column, followed by the same column tiling for a single leftover row. For every tile the
// products of A and B are accumulated over k in SIMDType accumulators via A.load()/B.load(),
// reduced with sum(), multiplied by \a scalar and added to C. If at least one operand is not
// padded, the vectorized k-loop stops at kend masked with size_t(-SIMDSIZE) and the remaining
// k-values are processed by a scalar loop. The LOW/UPP flags and the upper/lower properties of
// A and B only narrow the visited j- and k-ranges.
*/
6390  template< typename MT3 // Type of the left-hand side target matrix
6391  , typename MT4 // Type of the left-hand side matrix operand
6392  , typename MT5 // Type of the right-hand side matrix operand
6393  , typename ST2 > // Type of the scalar value
6394  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6395  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6396  {
      // A scalar clean-up loop over k is required unless both operands are padded
6397  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
6398 
6399  const size_t M( A.rows() );
6400  const size_t N( B.columns() );
6401  const size_t K( A.columns() );
6402 
6403  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6404 
6405  size_t i( 0UL );
6406 
      // Main part: two rows of C per iteration
6407  for( ; (i+2UL) <= M; i+=2UL )
6408  {
      // With LOW the columns end at i+2, with UPP they start at column i
6409  const size_t jend( LOW ? i+2UL : N );
6410  size_t j( UPP ? i : 0UL );
6411 
      // 2x4 tile (never entered if both LOW and UPP are set)
6412  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
6413  {
      // k-range of the tile: narrowed for upper/lower operands; the start index is masked
      // with size_t(-SIMDSIZE) (cf. the assertion on kpos for the same mask applied to kend)
6414  const size_t kbegin( ( IsUpper_v<MT4> )
6415  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6416  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6417  const size_t kend( ( IsLower_v<MT4> )
6418  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
6419  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
6420 
      // End of the vectorized k-loop
6421  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6422  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6423 
      // NOTE(review): the accumulators are not explicitly initialized; presumably a
      // default-constructed SIMDType is zero - confirm against the SIMD type definition
6424  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6425  size_t k( kbegin );
6426 
6427  for( ; k<kpos; k+=SIMDSIZE ) {
6428  const SIMDType a1( A.load(i ,k) );
6429  const SIMDType a2( A.load(i+1UL,k) );
6430  const SIMDType b1( B.load(k,j ) );
6431  const SIMDType b2( B.load(k,j+1UL) );
6432  const SIMDType b3( B.load(k,j+2UL) );
6433  const SIMDType b4( B.load(k,j+3UL) );
6434  xmm1 += a1 * b1;
6435  xmm2 += a1 * b2;
6436  xmm3 += a1 * b3;
6437  xmm4 += a1 * b4;
6438  xmm5 += a2 * b1;
6439  xmm6 += a2 * b2;
6440  xmm7 += a2 * b3;
6441  xmm8 += a2 * b4;
6442  }
6443 
      // Reduce the accumulators and add the scaled results to the target
6444  C(i ,j ) += sum( xmm1 ) * scalar;
6445  C(i ,j+1UL) += sum( xmm2 ) * scalar;
6446  C(i ,j+2UL) += sum( xmm3 ) * scalar;
6447  C(i ,j+3UL) += sum( xmm4 ) * scalar;
6448  C(i+1UL,j ) += sum( xmm5 ) * scalar;
6449  C(i+1UL,j+1UL) += sum( xmm6 ) * scalar;
6450  C(i+1UL,j+2UL) += sum( xmm7 ) * scalar;
6451  C(i+1UL,j+3UL) += sum( xmm8 ) * scalar;
6452 
      // Scalar clean-up for the k-values not covered by the vectorized loop
6453  for( ; remainder && k<kend; ++k ) {
6454  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6455  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6456  C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
6457  C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
6458  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6459  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6460  C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
6461  C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
6462  }
6463  }
6464 
      // 2x2 tile
6465  for( ; (j+2UL) <= jend; j+=2UL )
6466  {
6467  const size_t kbegin( ( IsUpper_v<MT4> )
6468  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6469  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6470  const size_t kend( ( IsLower_v<MT4> )
6471  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
6472  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6473 
6474  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6475  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6476 
6477  SIMDType xmm1, xmm2, xmm3, xmm4;
6478  size_t k( kbegin );
6479 
6480  for( ; k<kpos; k+=SIMDSIZE ) {
6481  const SIMDType a1( A.load(i ,k) );
6482  const SIMDType a2( A.load(i+1UL,k) );
6483  const SIMDType b1( B.load(k,j ) );
6484  const SIMDType b2( B.load(k,j+1UL) );
6485  xmm1 += a1 * b1;
6486  xmm2 += a1 * b2;
6487  xmm3 += a2 * b1;
6488  xmm4 += a2 * b2;
6489  }
6490 
6491  C(i ,j ) += sum( xmm1 ) * scalar;
6492  C(i ,j+1UL) += sum( xmm2 ) * scalar;
6493  C(i+1UL,j ) += sum( xmm3 ) * scalar;
6494  C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
6495 
6496  for( ; remainder && k<kend; ++k ) {
6497  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6498  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6499  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6500  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6501  }
6502  }
6503 
      // 2x1 tile: single leftover column
6504  if( j < jend )
6505  {
6506  const size_t kbegin( ( IsUpper_v<MT4> )
6507  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6508  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6509  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
6510 
6511  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6512  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6513 
6514  SIMDType xmm1, xmm2;
6515  size_t k( kbegin );
6516 
6517  for( ; k<kpos; k+=SIMDSIZE ) {
6518  const SIMDType b1( B.load(k,j) );
6519  xmm1 += A.load(i ,k) * b1;
6520  xmm2 += A.load(i+1UL,k) * b1;
6521  }
6522 
6523  C(i ,j) += sum( xmm1 ) * scalar;
6524  C(i+1UL,j) += sum( xmm2 ) * scalar;
6525 
6526  for( ; remainder && k<kend; ++k ) {
6527  C(i ,j) += A(i ,k) * B(k,j) * scalar;
6528  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6529  }
6530  }
6531  }
6532 
      // Single leftover row (odd number of rows)
6533  if( i < M )
6534  {
6535  const size_t jend( LOW ? i+1UL : N );
6536  size_t j( UPP ? i : 0UL );
6537 
      // 1x4 tile (never entered if both LOW and UPP are set)
6538  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
6539  {
6540  const size_t kbegin( ( IsUpper_v<MT4> )
6541  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6542  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6543  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
6544 
6545  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6546  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6547 
6548  SIMDType xmm1, xmm2, xmm3, xmm4;
6549  size_t k( kbegin );
6550 
6551  for( ; k<kpos; k+=SIMDSIZE ) {
6552  const SIMDType a1( A.load(i,k) );
6553  xmm1 += a1 * B.load(k,j );
6554  xmm2 += a1 * B.load(k,j+1UL);
6555  xmm3 += a1 * B.load(k,j+2UL);
6556  xmm4 += a1 * B.load(k,j+3UL);
6557  }
6558 
6559  C(i,j ) += sum( xmm1 ) * scalar;
6560  C(i,j+1UL) += sum( xmm2 ) * scalar;
6561  C(i,j+2UL) += sum( xmm3 ) * scalar;
6562  C(i,j+3UL) += sum( xmm4 ) * scalar;
6563 
6564  for( ; remainder && k<kend; ++k ) {
6565  C(i,j ) += A(i,k) * B(k,j ) * scalar;
6566  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6567  C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
6568  C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
6569  }
6570  }
6571 
      // 1x2 tile
6572  for( ; (j+2UL) <= jend; j+=2UL )
6573  {
6574  const size_t kbegin( ( IsUpper_v<MT4> )
6575  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6576  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6577  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
6578 
6579  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6580  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6581 
6582  SIMDType xmm1, xmm2;
6583  size_t k( kbegin );
6584 
6585  for( ; k<kpos; k+=SIMDSIZE ) {
6586  const SIMDType a1( A.load(i,k) );
6587  xmm1 += a1 * B.load(k,j );
6588  xmm2 += a1 * B.load(k,j+1UL);
6589  }
6590 
6591  C(i,j ) += sum( xmm1 ) * scalar;
6592  C(i,j+1UL) += sum( xmm2 ) * scalar;
6593 
6594  for( ; remainder && k<kend; ++k ) {
6595  C(i,j ) += A(i,k) * B(k,j ) * scalar;
6596  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6597  }
6598  }
6599 
      // 1x1 tile: last element of the leftover row; the k-range always extends to K here
6600  if( j < jend )
6601  {
6602  const size_t kbegin( ( IsUpper_v<MT4> )
6603  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6604  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6605 
6606  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
6607  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6608 
6609  SIMDType xmm1;
6610  size_t k( kbegin );
6611 
6612  for( ; k<kpos; k+=SIMDSIZE ) {
6613  xmm1 += A.load(i,k) * B.load(k,j);
6614  }
6615 
6616  C(i,j) += sum( xmm1 ) * scalar;
6617 
6618  for( ; remainder && k<K; ++k ) {
6619  C(i,j) += A(i,k) * B(k,j) * scalar;
6620  }
6621  }
6622  }
6623  }
6624  //**********************************************************************************************
6625 
6626  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
/*!\brief Vectorized addition assignment kernel for small matrices and a column-major target
//        (C += A * B * scalar).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The kernel walks the target in small tiles: if neither LOW nor UPP is set, four rows at a
// time against two and one column; afterwards two rows at a time against two and one column;
// finally a single leftover row against two and one column. For every tile the products of A
// and B are accumulated over k in SIMDType accumulators via A.load()/B.load(), reduced with
// sum(), multiplied by \a scalar and added to C. If at least one operand is not padded, the
// vectorized k-loop stops at kend masked with size_t(-SIMDSIZE) and the remaining k-values are
// processed by a scalar loop. The upper/lower properties of A and B only narrow the k-ranges.
*/
6641  template< typename MT3 // Type of the left-hand side target matrix
6642  , typename MT4 // Type of the left-hand side matrix operand
6643  , typename MT5 // Type of the right-hand side matrix operand
6644  , typename ST2 > // Type of the scalar value
6645  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6646  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6647  {
      // A scalar clean-up loop over k is required unless both operands are padded
6648  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
6649 
6650  const size_t M( A.rows() );
6651  const size_t N( B.columns() );
6652  const size_t K( A.columns() );
6653 
6654  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6655 
6656  size_t i( 0UL );
6657 
      // Four rows of C per iteration; only used if neither LOW nor UPP is set
6658  for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
6659  {
6660  size_t j( 0UL );
6661 
      // 4x2 tile
6662  for( ; (j+2UL) <= N; j+=2UL )
6663  {
      // k-range of the tile: narrowed for upper/lower operands; the start index is masked
      // with size_t(-SIMDSIZE) (cf. the assertion on kpos for the same mask applied to kend)
6664  const size_t kbegin( ( IsUpper_v<MT4> )
6665  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6666  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6667  const size_t kend( ( IsLower_v<MT4> )
6668  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
6669  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6670 
      // End of the vectorized k-loop
6671  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6672  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6673 
      // NOTE(review): the accumulators are not explicitly initialized; presumably a
      // default-constructed SIMDType is zero - confirm against the SIMD type definition
6674  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6675  size_t k( kbegin );
6676 
6677  for( ; k<kpos; k+=SIMDSIZE ) {
6678  const SIMDType a1( A.load(i ,k) );
6679  const SIMDType a2( A.load(i+1UL,k) );
6680  const SIMDType a3( A.load(i+2UL,k) );
6681  const SIMDType a4( A.load(i+3UL,k) );
6682  const SIMDType b1( B.load(k,j ) );
6683  const SIMDType b2( B.load(k,j+1UL) );
6684  xmm1 += a1 * b1;
6685  xmm2 += a1 * b2;
6686  xmm3 += a2 * b1;
6687  xmm4 += a2 * b2;
6688  xmm5 += a3 * b1;
6689  xmm6 += a3 * b2;
6690  xmm7 += a4 * b1;
6691  xmm8 += a4 * b2;
6692  }
6693 
      // Reduce the accumulators and add the scaled results to the target
6694  C(i ,j ) += sum( xmm1 ) * scalar;
6695  C(i ,j+1UL) += sum( xmm2 ) * scalar;
6696  C(i+1UL,j ) += sum( xmm3 ) * scalar;
6697  C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
6698  C(i+2UL,j ) += sum( xmm5 ) * scalar;
6699  C(i+2UL,j+1UL) += sum( xmm6 ) * scalar;
6700  C(i+3UL,j ) += sum( xmm7 ) * scalar;
6701  C(i+3UL,j+1UL) += sum( xmm8 ) * scalar;
6702 
      // Scalar clean-up for the k-values not covered by the vectorized loop
6703  for( ; remainder && k<kend; ++k ) {
6704  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6705  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6706  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6707  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6708  C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
6709  C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
6710  C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
6711  C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
6712  }
6713  }
6714 
      // 4x1 tile: single leftover column
6715  if( j < N )
6716  {
6717  const size_t kbegin( ( IsUpper_v<MT4> )
6718  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6719  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6720  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
6721 
6722  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6723  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6724 
6725  SIMDType xmm1, xmm2, xmm3, xmm4;
6726  size_t k( kbegin );
6727 
6728  for( ; k<kpos; k+=SIMDSIZE ) {
6729  const SIMDType b1( B.load(k,j) );
6730  xmm1 += A.load(i ,k) * b1;
6731  xmm2 += A.load(i+1UL,k) * b1;
6732  xmm3 += A.load(i+2UL,k) * b1;
6733  xmm4 += A.load(i+3UL,k) * b1;
6734  }
6735 
6736  C(i ,j) += sum( xmm1 ) * scalar;
6737  C(i+1UL,j) += sum( xmm2 ) * scalar;
6738  C(i+2UL,j) += sum( xmm3 ) * scalar;
6739  C(i+3UL,j) += sum( xmm4 ) * scalar;
6740 
6741  for( ; remainder && k<kend; ++k ) {
6742  C(i ,j) += A(i ,k) * B(k,j) * scalar;
6743  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6744  C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
6745  C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
6746  }
6747  }
6748  }
6749 
      // Two rows of C per iteration (all rows in case LOW or UPP is set)
6750  for( ; (i+2UL) <= M; i+=2UL )
6751  {
      // With LOW the columns end at i+2, with UPP they start at column i
6752  const size_t jend( LOW ? i+2UL : N );
6753  size_t j( UPP ? i : 0UL );
6754 
      // 2x2 tile
6755  for( ; (j+2UL) <= jend; j+=2UL )
6756  {
6757  const size_t kbegin( ( IsUpper_v<MT4> )
6758  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6759  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6760  const size_t kend( ( IsLower_v<MT4> )
6761  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
6762  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6763 
6764  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6765  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6766 
6767  SIMDType xmm1, xmm2, xmm3, xmm4;
6768  size_t k( kbegin );
6769 
6770  for( ; k<kpos; k+=SIMDSIZE ) {
6771  const SIMDType a1( A.load(i ,k) );
6772  const SIMDType a2( A.load(i+1UL,k) );
6773  const SIMDType b1( B.load(k,j ) );
6774  const SIMDType b2( B.load(k,j+1UL) );
6775  xmm1 += a1 * b1;
6776  xmm2 += a1 * b2;
6777  xmm3 += a2 * b1;
6778  xmm4 += a2 * b2;
6779  }
6780 
6781  C(i ,j ) += sum( xmm1 ) * scalar;
6782  C(i ,j+1UL) += sum( xmm2 ) * scalar;
6783  C(i+1UL,j ) += sum( xmm3 ) * scalar;
6784  C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
6785 
6786  for( ; remainder && k<kend; ++k ) {
6787  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6788  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6789  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6790  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6791  }
6792  }
6793 
      // 2x1 tile: single leftover column
6794  if( j < jend )
6795  {
6796  const size_t kbegin( ( IsUpper_v<MT4> )
6797  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6798  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6799  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
6800 
6801  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6802  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6803 
6804  SIMDType xmm1, xmm2;
6805  size_t k( kbegin );
6806 
6807  for( ; k<kpos; k+=SIMDSIZE ) {
6808  const SIMDType b1( B.load(k,j) );
6809  xmm1 += A.load(i ,k) * b1;
6810  xmm2 += A.load(i+1UL,k) * b1;
6811  }
6812 
6813  C(i ,j) += sum( xmm1 ) * scalar;
6814  C(i+1UL,j) += sum( xmm2 ) * scalar;
6815 
6816  for( ; remainder && k<kend; ++k ) {
6817  C(i ,j) += A(i ,k) * B(k,j) * scalar;
6818  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6819  }
6820  }
6821  }
6822 
      // Single leftover row (odd number of rows)
6823  if( i < M )
6824  {
6825  const size_t jend( LOW ? i+1UL : N );
6826  size_t j( UPP ? i : 0UL );
6827 
      // 1x2 tile
6828  for( ; (j+2UL) <= jend; j+=2UL )
6829  {
6830  const size_t kbegin( ( IsUpper_v<MT4> )
6831  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6832  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6833  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
6834 
6835  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6836  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6837 
6838  SIMDType xmm1, xmm2;
6839  size_t k( kbegin );
6840 
6841  for( ; k<kpos; k+=SIMDSIZE ) {
6842  const SIMDType a1( A.load(i,k) );
6843  xmm1 += a1 * B.load(k,j );
6844  xmm2 += a1 * B.load(k,j+1UL);
6845  }
6846 
6847  C(i,j ) += sum( xmm1 ) * scalar;
6848  C(i,j+1UL) += sum( xmm2 ) * scalar;
6849 
6850  for( ; remainder && k<kend; ++k ) {
6851  C(i,j ) += A(i,k) * B(k,j ) * scalar;
6852  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6853  }
6854  }
6855 
      // 1x1 tile: last element of the leftover row; the k-range always extends to K here
6856  if( j < jend )
6857  {
6858  const size_t kbegin( ( IsUpper_v<MT4> )
6859  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6860  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6861 
6862  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
6863  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6864 
6865  SIMDType xmm1;
6866  size_t k( kbegin );
6867 
6868  for( ; k<kpos; k+=SIMDSIZE ) {
6869  xmm1 += A.load(i,k) * B.load(k,j);
6870  }
6871 
6872  C(i,j) += sum( xmm1 ) * scalar;
6873 
6874  for( ; remainder && k<K; ++k ) {
6875  C(i,j) += A(i,k) * B(k,j) * scalar;
6876  }
6877  }
6878  }
6879  }
6880  //**********************************************************************************************
6881 
6882  //**Default addition assignment to dense matrices (large matrices)******************************
6896  template< typename MT3 // Type of the left-hand side target matrix
6897  , typename MT4 // Type of the left-hand side matrix operand
6898  , typename MT5 // Type of the right-hand side matrix operand
6899  , typename ST2 > // Type of the scalar value
6900  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6901  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6902  {
6903  selectDefaultAddAssignKernel( C, A, B, scalar );
6904  }
6905  //**********************************************************************************************
6906 
6907  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
6922  template< typename MT3 // Type of the left-hand side target matrix
6923  , typename MT4 // Type of the left-hand side matrix operand
6924  , typename MT5 // Type of the right-hand side matrix operand
6925  , typename ST2 > // Type of the scalar value
6926  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6927  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6928  {
6929  if( LOW )
6930  lmmm( C, A, B, scalar, ST2(1) );
6931  else if( UPP )
6932  ummm( C, A, B, scalar, ST2(1) );
6933  else
6934  mmm( C, A, B, scalar, ST2(1) );
6935  }
6936  //**********************************************************************************************
6937 
6938  //**BLAS-based addition assignment to dense matrices (default)**********************************
6952  template< typename MT3 // Type of the left-hand side target matrix
6953  , typename MT4 // Type of the left-hand side matrix operand
6954  , typename MT5 // Type of the right-hand side matrix operand
6955  , typename ST2 > // Type of the scalar value
6956  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6957  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6958  {
6959  selectLargeAddAssignKernel( C, A, B, scalar );
6960  }
6961  //**********************************************************************************************
6962 
6963  //**BLAS-based addition assignment to dense matrices********************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION

/*!\brief BLAS-based addition assignment of a scaled dense matrix multiplication
//        (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Three cases are distinguished:
//  - \a A is triangular: a temporary of the result type of \a C is created from \a B, passed to
//    trmm() together with \a A (CblasLeft) and the scalar, and finally added to \a C.
//  - otherwise, \a B is triangular: a temporary is created from \a A, passed to trmm() together
//    with \a B (CblasRight) and the scalar, and finally added to \a C.
//  - otherwise: gemm() is called directly on \a C with the scalar and ET(1) as the two trailing
//    arguments.
// The scalar is converted to the element type of \a C before it is handed to the BLAS wrappers.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
{
   using ET = ElementType_t<MT3>;

   if( IsTriangular_v<MT4> )
   {
      // Triangular left operand: multiply the temporary from the left
      const auto uplo = ( IsLower_v<MT4> ? CblasLower : CblasUpper );

      ResultType_t<MT3> product( serial( B ) );
      trmm( product, A, CblasLeft, uplo, ET(scalar) );
      addAssign( C, product );
      return;
   }

   if( IsTriangular_v<MT5> )
   {
      // Triangular right operand: multiply the temporary from the right
      const auto uplo = ( IsLower_v<MT5> ? CblasLower : CblasUpper );

      ResultType_t<MT3> product( serial( A ) );
      trmm( product, B, CblasRight, uplo, ET(scalar) );
      addAssign( C, product );
      return;
   }

   // General case
   gemm( C, A, B, ET(scalar), ET(1) );
}
#endif
7002  //**********************************************************************************************
7003 
7004  //**Addition assignment to sparse matrices******************************************************
7005  // No special implementation for the addition assignment to sparse matrices.
7006  //**********************************************************************************************
7007 
7008  //**Subtraction assignment to dense matrices****************************************************
7020  template< typename MT // Type of the target dense matrix
7021  , bool SO > // Storage order of the target dense matrix
7022  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7023  {
7025 
7026  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7027  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7028 
7029  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7030  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7031 
7032  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7033  return;
7034  }
7035 
7036  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7037  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7038 
7039  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7040  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7041  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7042  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7043  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7044  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7045 
7046  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
7047  }
7048  //**********************************************************************************************
7049 
7050  //**Subtraction assignment to dense matrices (kernel selection)*********************************
7061  template< typename MT3 // Type of the left-hand side target matrix
7062  , typename MT4 // Type of the left-hand side matrix operand
7063  , typename MT5 // Type of the right-hand side matrix operand
7064  , typename ST2 > // Type of the scalar value
7065  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7066  {
7067  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
7068  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
7069  selectSmallSubAssignKernel( C, A, B, scalar );
7070  else
7071  selectBlasSubAssignKernel( C, A, B, scalar );
7072  }
7073  //**********************************************************************************************
7074 
7075  //**Default subtraction assignment to dense matrices (general/general)**************************
7089  template< typename MT3 // Type of the left-hand side target matrix
7090  , typename MT4 // Type of the left-hand side matrix operand
7091  , typename MT5 // Type of the right-hand side matrix operand
7092  , typename ST2 > // Type of the scalar value
7093  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7094  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7095  {
7096  const ResultType tmp( serial( A * B * scalar ) );
7097  subAssign( C, tmp );
7098  }
7099  //**********************************************************************************************
7100 
7101  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
7115  template< typename MT3 // Type of the left-hand side target matrix
7116  , typename MT4 // Type of the left-hand side matrix operand
7117  , typename MT5 // Type of the right-hand side matrix operand
7118  , typename ST2 > // Type of the scalar value
7119  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7120  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7121  {
7122  const size_t M( A.rows() );
7123  const size_t N( B.columns() );
7124 
7125  for( size_t i=0UL; i<M; ++i )
7126  {
7127  const size_t jbegin( ( IsUpper_v<MT4> )
7128  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7129  :( 0UL ) );
7130  const size_t jend( ( IsLower_v<MT4> )
7131  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
7132  :( N ) );
7133  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7134 
7135  const size_t jnum( jend - jbegin );
7136  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
7137 
7138  for( size_t j=jbegin; j<jpos; j+=2UL ) {
7139  C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7140  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
7141  }
7142  if( jpos < jend ) {
7143  C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
7144  }
7145  }
7146  }
7147  //**********************************************************************************************
7148 
7149  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
7163  template< typename MT3 // Type of the left-hand side target matrix
7164  , typename MT4 // Type of the left-hand side matrix operand
7165  , typename MT5 // Type of the right-hand side matrix operand
7166  , typename ST2 > // Type of the scalar value
7167  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7168  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7169  {
7170  constexpr size_t block( BLOCK_SIZE );
7171 
7172  const size_t M( A.rows() );
7173  const size_t N( B.columns() );
7174 
7175  for( size_t jj=0UL; jj<N; jj+=block ) {
7176  const size_t jend( min( N, jj+block ) );
7177  for( size_t ii=0UL; ii<M; ii+=block ) {
7178  const size_t iend( min( M, ii+block ) );
7179  for( size_t j=jj; j<jend; ++j )
7180  {
7181  const size_t ibegin( ( IsLower_v<MT4> )
7182  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
7183  :( ii ) );
7184  const size_t ipos( ( IsUpper_v<MT4> )
7185  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
7186  :( iend ) );
7187 
7188  for( size_t i=ibegin; i<ipos; ++i ) {
7189  C(i,j) -= A(i,j) * B(j,j) * scalar;
7190  }
7191  }
7192  }
7193  }
7194  }
7195  //**********************************************************************************************
7196 
7197  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
7212  template< typename MT3 // Type of the left-hand side target matrix
7213  , typename MT4 // Type of the left-hand side matrix operand
7214  , typename MT5 // Type of the right-hand side matrix operand
7215  , typename ST2 > // Type of the scalar value
7216  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7217  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7218  {
7219  constexpr size_t block( BLOCK_SIZE );
7220 
7221  const size_t M( A.rows() );
7222  const size_t N( B.columns() );
7223 
7224  for( size_t ii=0UL; ii<M; ii+=block ) {
7225  const size_t iend( min( M, ii+block ) );
7226  for( size_t jj=0UL; jj<N; jj+=block ) {
7227  const size_t jend( min( N, jj+block ) );
7228  for( size_t i=ii; i<iend; ++i )
7229  {
7230  const size_t jbegin( ( IsUpper_v<MT5> )
7231  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
7232  :( jj ) );
7233  const size_t jpos( ( IsLower_v<MT5> )
7234  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
7235  :( jend ) );
7236 
7237  for( size_t j=jbegin; j<jpos; ++j ) {
7238  C(i,j) -= A(i,i) * B(i,j) * scalar;
7239  }
7240  }
7241  }
7242  }
7243  }
7244  //**********************************************************************************************
7245 
7246  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
7261  template< typename MT3 // Type of the left-hand side target matrix
7262  , typename MT4 // Type of the left-hand side matrix operand
7263  , typename MT5 // Type of the right-hand side matrix operand
7264  , typename ST2 > // Type of the scalar value
7265  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7266  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7267  {
7268  const size_t M( A.rows() );
7269  const size_t N( B.columns() );
7270 
7271  for( size_t j=0UL; j<N; ++j )
7272  {
7273  const size_t ibegin( ( IsLower_v<MT5> )
7274  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7275  :( 0UL ) );
7276  const size_t iend( ( IsUpper_v<MT5> )
7277  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7278  :( M ) );
7279  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7280 
7281  const size_t inum( iend - ibegin );
7282  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7283 
7284  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7285  C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7286  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
7287  }
7288  if( ipos < iend ) {
7289  C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
7290  }
7291  }
7292  }
7293  //**********************************************************************************************
7294 
7295  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
7309  template< typename MT3 // Type of the left-hand side target matrix
7310  , typename MT4 // Type of the left-hand side matrix operand
7311  , typename MT5 // Type of the right-hand side matrix operand
7312  , typename ST2 > // Type of the scalar value
7313  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7314  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7315  {
7316  for( size_t i=0UL; i<A.rows(); ++i ) {
7317  C(i,i) -= A(i,i) * B(i,i) * scalar;
7318  }
7319  }
7320  //**********************************************************************************************
7321 
7322  //**Default subtraction assignment to dense matrices (small matrices)***************************
7336  template< typename MT3 // Type of the left-hand side target matrix
7337  , typename MT4 // Type of the left-hand side matrix operand
7338  , typename MT5 // Type of the right-hand side matrix operand
7339  , typename ST2 > // Type of the scalar value
7340  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7341  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7342  {
7343  selectDefaultSubAssignKernel( C, A, B, scalar );
7344  }
7345  //**********************************************************************************************
7346 
7347  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
      // Vectorized small-matrix kernel: every update below has the form
      //    C(i,j) -= ( sum over k of A(i,k) * B(k,j) ) * scalar.
      //
      // C      : target matrix, updated in place (overload enabled only for row-major targets).
      // A      : left-hand side operand, read via A.load(i,k) and A(i,k).
      // B      : right-hand side operand, read via B.load(k,j) and B(k,j).
      // scalar : factor applied to every accumulated product.
      //
      // The target is processed in tiles: pairs of rows combined with blocks of 4, 2 and 1
      // columns, followed by one trailing row (odd M) combined with blocks of 4, 2 and 1
      // columns. Within a tile the k dimension is traversed SIMDSIZE elements per step.
      //
      // NOTE(review): LOW, UPP, SIMDType, SIMDSIZE and sum() are declared outside this chunk.
      // LOW/UPP presumably flag a lower/upper result and sum() presumably reduces a SIMD
      // register to a single value -- confirm against their definitions.
7362  template< typename MT3 // Type of the left-hand side target matrix
7363  , typename MT4 // Type of the left-hand side matrix operand
7364  , typename MT5 // Type of the right-hand side matrix operand
7365  , typename ST2 > // Type of the scalar value
7366  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7367  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7368  {
      // A scalar tail loop over k is required as soon as one operand is not padded.
7369  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
7370 
7371  const size_t M( A.rows() );
7372  const size_t N( B.columns() );
7373  const size_t K( A.columns() );
7374 
      // With LOW or UPP set the column bounds below are derived from the row index, which
      // requires a square target.
7375  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7376 
7377  size_t i( 0UL );
7378 
      // Main loop: two rows of C per iteration.
7379  for( ; (i+2UL) <= M; i+=2UL )
7380  {
      // LOW ends the column range right after the current row pair, UPP starts it at row i.
7381  const size_t jend( LOW ? i+2UL : N );
7382  size_t j( UPP ? i : 0UL );
7383 
      // 2x4 tile; never entered if both LOW and UPP are set.
7384  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
7385  {
      // Start of the k range, narrowed for an upper A and/or lower B. The mask rounds the
      // value down to a multiple of SIMDSIZE (assumes SIMDSIZE is a power of two -- TODO confirm).
7386  const size_t kbegin( ( IsUpper_v<MT4> )
7387  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7388  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
      // End of the k range, narrowed for a lower A and/or upper B.
7389  const size_t kend( ( IsLower_v<MT4> )
7390  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
7391  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
7392 
      // End of the SIMD loop; with 'remainder' the scalar loop covers [kpos,kend).
7393  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7394  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7395 
      // One accumulator per element of the tile.
      // NOTE(review): relies on a default-constructed SIMDType being zero -- confirm.
7396  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7397  size_t k( kbegin );
7398 
7399  for( ; k<kpos; k+=SIMDSIZE ) {
7400  const SIMDType a1( A.load(i ,k) );
7401  const SIMDType a2( A.load(i+1UL,k) );
7402  const SIMDType b1( B.load(k,j ) );
7403  const SIMDType b2( B.load(k,j+1UL) );
7404  const SIMDType b3( B.load(k,j+2UL) );
7405  const SIMDType b4( B.load(k,j+3UL) );
7406  xmm1 += a1 * b1;
7407  xmm2 += a1 * b2;
7408  xmm3 += a1 * b3;
7409  xmm4 += a1 * b4;
7410  xmm5 += a2 * b1;
7411  xmm6 += a2 * b2;
7412  xmm7 += a2 * b3;
7413  xmm8 += a2 * b4;
7414  }
7415 
      // Fold each accumulator into its target element.
7416  C(i ,j ) -= sum( xmm1 ) * scalar;
7417  C(i ,j+1UL) -= sum( xmm2 ) * scalar;
7418  C(i ,j+2UL) -= sum( xmm3 ) * scalar;
7419  C(i ,j+3UL) -= sum( xmm4 ) * scalar;
7420  C(i+1UL,j ) -= sum( xmm5 ) * scalar;
7421  C(i+1UL,j+1UL) -= sum( xmm6 ) * scalar;
7422  C(i+1UL,j+2UL) -= sum( xmm7 ) * scalar;
7423  C(i+1UL,j+3UL) -= sum( xmm8 ) * scalar;
7424 
      // Scalar tail for the k values not covered by the SIMD loop.
7425  for( ; remainder && k<kend; ++k ) {
7426  C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7427  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7428  C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
7429  C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
7430  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7431  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7432  C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
7433  C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
7434  }
7435  }
7436 
      // 2x2 tile for the remaining column pairs.
7437  for( ; (j+2UL) <= jend; j+=2UL )
7438  {
7439  const size_t kbegin( ( IsUpper_v<MT4> )
7440  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7441  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7442  const size_t kend( ( IsLower_v<MT4> )
7443  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
7444  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7445 
7446  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7447  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7448 
7449  SIMDType xmm1, xmm2, xmm3, xmm4;
7450  size_t k( kbegin );
7451 
7452  for( ; k<kpos; k+=SIMDSIZE ) {
7453  const SIMDType a1( A.load(i ,k) );
7454  const SIMDType a2( A.load(i+1UL,k) );
7455  const SIMDType b1( B.load(k,j ) );
7456  const SIMDType b2( B.load(k,j+1UL) );
7457  xmm1 += a1 * b1;
7458  xmm2 += a1 * b2;
7459  xmm3 += a2 * b1;
7460  xmm4 += a2 * b2;
7461  }
7462 
7463  C(i ,j ) -= sum( xmm1 ) * scalar;
7464  C(i ,j+1UL) -= sum( xmm2 ) * scalar;
7465  C(i+1UL,j ) -= sum( xmm3 ) * scalar;
7466  C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
7467 
7468  for( ; remainder && k<kend; ++k ) {
7469  C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7470  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7471  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7472  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7473  }
7474  }
7475 
      // 2x1 tile for a single leftover column. Here kend depends on A only.
7476  if( j < jend )
7477  {
7478  const size_t kbegin( ( IsUpper_v<MT4> )
7479  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7480  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7481  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
7482 
7483  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7484  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7485 
7486  SIMDType xmm1, xmm2;
7487  size_t k( kbegin );
7488 
7489  for( ; k<kpos; k+=SIMDSIZE ) {
7490  const SIMDType b1( B.load(k,j) );
7491  xmm1 += A.load(i ,k) * b1;
7492  xmm2 += A.load(i+1UL,k) * b1;
7493  }
7494 
7495  C(i ,j) -= sum( xmm1 ) * scalar;
7496  C(i+1UL,j) -= sum( xmm2 ) * scalar;
7497 
7498  for( ; remainder && k<kend; ++k ) {
7499  C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7500  C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7501  }
7502  }
7503  }
7504 
      // Trailing single row (only reached if M is odd).
7505  if( i < M )
7506  {
7507  const size_t jend( LOW ? i+1UL : N );
7508  size_t j( UPP ? i : 0UL );
7509 
      // 1x4 tile; never entered if both LOW and UPP are set.
7510  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
7511  {
7512  const size_t kbegin( ( IsUpper_v<MT4> )
7513  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7514  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7515  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
7516 
7517  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7518  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7519 
7520  SIMDType xmm1, xmm2, xmm3, xmm4;
7521  size_t k( kbegin );
7522 
7523  for( ; k<kpos; k+=SIMDSIZE ) {
7524  const SIMDType a1( A.load(i,k) );
7525  xmm1 += a1 * B.load(k,j );
7526  xmm2 += a1 * B.load(k,j+1UL);
7527  xmm3 += a1 * B.load(k,j+2UL);
7528  xmm4 += a1 * B.load(k,j+3UL);
7529  }
7530 
7531  C(i,j ) -= sum( xmm1 ) * scalar;
7532  C(i,j+1UL) -= sum( xmm2 ) * scalar;
7533  C(i,j+2UL) -= sum( xmm3 ) * scalar;
7534  C(i,j+3UL) -= sum( xmm4 ) * scalar;
7535 
7536  for( ; remainder && k<kend; ++k ) {
7537  C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7538  C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7539  C(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
7540  C(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
7541  }
7542  }
7543 
      // 1x2 tile.
7544  for( ; (j+2UL) <= jend; j+=2UL )
7545  {
7546  const size_t kbegin( ( IsUpper_v<MT4> )
7547  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7548  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7549  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
7550 
7551  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7552  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7553 
7554  SIMDType xmm1, xmm2;
7555  size_t k( kbegin );
7556 
7557  for( ; k<kpos; k+=SIMDSIZE ) {
7558  const SIMDType a1( A.load(i,k) );
7559  xmm1 += a1 * B.load(k,j );
7560  xmm2 += a1 * B.load(k,j+1UL);
7561  }
7562 
7563  C(i,j ) -= sum( xmm1 ) * scalar;
7564  C(i,j+1UL) -= sum( xmm2 ) * scalar;
7565 
7566  for( ; remainder && k<kend; ++k ) {
7567  C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7568  C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7569  }
7570  }
7571 
      // 1x1 tile: the last element of the row. The k range always ends at K here.
7572  if( j < jend )
7573  {
7574  const size_t kbegin( ( IsUpper_v<MT4> )
7575  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7576  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7577 
7578  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
7579  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7580 
7581  SIMDType xmm1;
7582  size_t k( kbegin );
7583 
7584  for( ; k<kpos; k+=SIMDSIZE ) {
7585  xmm1 += A.load(i,k) * B.load(k,j);
7586  }
7587 
7588  C(i,j) -= sum( xmm1 ) * scalar;
7589 
7590  for( ; remainder && k<K; ++k ) {
7591  C(i,j) -= A(i,k) * B(k,j) * scalar;
7592  }
7593  }
7594  }
7595  }
7596  //**********************************************************************************************
7597 
7598  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
      // Vectorized small-matrix kernel: every update below has the form
      //    C(i,j) -= ( sum over k of A(i,k) * B(k,j) ) * scalar.
      //
      // C      : target matrix, updated in place (overload enabled only for column-major targets).
      // A      : left-hand side operand, read via A.load(i,k) and A(i,k).
      // B      : right-hand side operand, read via B.load(k,j) and B(k,j).
      // scalar : factor applied to every accumulated product.
      //
      // The target is processed in tiles: blocks of 4 rows (only if neither LOW nor UPP is set)
      // combined with 2 and 1 columns, then blocks of 2 rows combined with 2 and 1 columns, and
      // finally one trailing row combined with 2 and 1 columns. Within a tile the k dimension
      // is traversed SIMDSIZE elements per step.
      //
      // NOTE(review): LOW, UPP, SIMDType, SIMDSIZE and sum() are declared outside this chunk.
      // LOW/UPP presumably flag a lower/upper result and sum() presumably reduces a SIMD
      // register to a single value -- confirm against their definitions.
7613  template< typename MT3 // Type of the left-hand side target matrix
7614  , typename MT4 // Type of the left-hand side matrix operand
7615  , typename MT5 // Type of the right-hand side matrix operand
7616  , typename ST2 > // Type of the scalar value
7617  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7618  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7619  {
      // A scalar tail loop over k is required as soon as one operand is not padded.
7620  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
7621 
7622  const size_t M( A.rows() );
7623  const size_t N( B.columns() );
7624  const size_t K( A.columns() );
7625 
      // With LOW or UPP set the column bounds below are derived from the row index, which
      // requires a square target.
7626  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7627 
7628  size_t i( 0UL );
7629 
      // Four rows of C per iteration; only used if neither LOW nor UPP is set, so all N
      // columns are visited.
7630  for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
7631  {
7632  size_t j( 0UL );
7633 
      // 4x2 tile.
7634  for( ; (j+2UL) <= N; j+=2UL )
7635  {
      // Start of the k range, narrowed for an upper A and/or lower B. The mask rounds the
      // value down to a multiple of SIMDSIZE (assumes SIMDSIZE is a power of two -- TODO confirm).
7636  const size_t kbegin( ( IsUpper_v<MT4> )
7637  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7638  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
      // End of the k range, narrowed for a lower A and/or upper B.
7639  const size_t kend( ( IsLower_v<MT4> )
7640  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
7641  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7642 
      // End of the SIMD loop; with 'remainder' the scalar loop covers [kpos,kend).
7643  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7644  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7645 
      // One accumulator per element of the tile.
      // NOTE(review): relies on a default-constructed SIMDType being zero -- confirm.
7646  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7647  size_t k( kbegin );
7648 
7649  for( ; k<kpos; k+=SIMDSIZE )
7650  {
7651  const SIMDType a1( A.load(i ,k) );
7652  const SIMDType a2( A.load(i+1UL,k) );
7653  const SIMDType a3( A.load(i+2UL,k) );
7654  const SIMDType a4( A.load(i+3UL,k) );
7655  const SIMDType b1( B.load(k,j ) );
7656  const SIMDType b2( B.load(k,j+1UL) );
7657  xmm1 += a1 * b1;
7658  xmm2 += a1 * b2;
7659  xmm3 += a2 * b1;
7660  xmm4 += a2 * b2;
7661  xmm5 += a3 * b1;
7662  xmm6 += a3 * b2;
7663  xmm7 += a4 * b1;
7664  xmm8 += a4 * b2;
7665  }
7666 
      // Fold each accumulator into its target element.
7667  C(i ,j ) -= sum( xmm1 ) * scalar;
7668  C(i ,j+1UL) -= sum( xmm2 ) * scalar;
7669  C(i+1UL,j ) -= sum( xmm3 ) * scalar;
7670  C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
7671  C(i+2UL,j ) -= sum( xmm5 ) * scalar;
7672  C(i+2UL,j+1UL) -= sum( xmm6 ) * scalar;
7673  C(i+3UL,j ) -= sum( xmm7 ) * scalar;
7674  C(i+3UL,j+1UL) -= sum( xmm8 ) * scalar;
7675 
      // Scalar tail for the k values not covered by the SIMD loop.
7676  for( ; remainder && k<kend; ++k ) {
7677  C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7678  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7679  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7680  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7681  C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
7682  C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
7683  C(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
7684  C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
7685  }
7686  }
7687 
      // 4x1 tile for a single leftover column. Here kend depends on A only.
7688  if( j < N )
7689  {
7690  const size_t kbegin( ( IsUpper_v<MT4> )
7691  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7692  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7693  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
7694 
7695  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7696  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7697 
7698  SIMDType xmm1, xmm2, xmm3, xmm4;
7699  size_t k( kbegin );
7700 
7701  for( ; k<kpos; k+=SIMDSIZE ) {
7702  const SIMDType b1( B.load(k,j) );
7703  xmm1 += A.load(i ,k) * b1;
7704  xmm2 += A.load(i+1UL,k) * b1;
7705  xmm3 += A.load(i+2UL,k) * b1;
7706  xmm4 += A.load(i+3UL,k) * b1;
7707  }
7708 
7709  C(i ,j) -= sum( xmm1 ) * scalar;
7710  C(i+1UL,j) -= sum( xmm2 ) * scalar;
7711  C(i+2UL,j) -= sum( xmm3 ) * scalar;
7712  C(i+3UL,j) -= sum( xmm4 ) * scalar;
7713 
7714  for( ; remainder && k<kend; ++k ) {
7715  C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7716  C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7717  C(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
7718  C(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
7719  }
7720  }
7721  }
7722 
      // Two rows of C per iteration (all row pairs if LOW or UPP is set, otherwise the rows
      // left over by the 4-row loop).
7723  for( ; (i+2UL) <= M; i+=2UL )
7724  {
      // LOW ends the column range right after the current row pair, UPP starts it at row i.
7725  const size_t jend( LOW ? i+2UL : N );
7726  size_t j( UPP ? i : 0UL );
7727 
      // 2x2 tile.
7728  for( ; (j+2UL) <= jend; j+=2UL )
7729  {
7730  const size_t kbegin( ( IsUpper_v<MT4> )
7731  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7732  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7733  const size_t kend( ( IsLower_v<MT4> )
7734  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
7735  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7736 
7737  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7738  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7739 
7740  SIMDType xmm1, xmm2, xmm3, xmm4;
7741  size_t k( kbegin );
7742 
7743  for( ; k<kpos; k+=SIMDSIZE ) {
7744  const SIMDType a1( A.load(i ,k) );
7745  const SIMDType a2( A.load(i+1UL,k) );
7746  const SIMDType b1( B.load(k,j ) );
7747  const SIMDType b2( B.load(k,j+1UL) );
7748  xmm1 += a1 * b1;
7749  xmm2 += a1 * b2;
7750  xmm3 += a2 * b1;
7751  xmm4 += a2 * b2;
7752  }
7753 
7754  C(i ,j ) -= sum( xmm1 ) * scalar;
7755  C(i ,j+1UL) -= sum( xmm2 ) * scalar;
7756  C(i+1UL,j ) -= sum( xmm3 ) * scalar;
7757  C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
7758 
7759  for( ; remainder && k<kend; ++k ) {
7760  C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7761  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7762  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7763  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7764  }
7765  }
7766 
      // 2x1 tile for a single leftover column.
7767  if( j < jend )
7768  {
7769  const size_t kbegin( ( IsUpper_v<MT4> )
7770  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7771  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7772  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
7773 
7774  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7775  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7776 
7777  SIMDType xmm1, xmm2;
7778  size_t k( kbegin );
7779 
7780  for( ; k<kpos; k+=SIMDSIZE ) {
7781  const SIMDType b1( B.load(k,j) );
7782  xmm1 += A.load(i ,k) * b1;
7783  xmm2 += A.load(i+1UL,k) * b1;
7784  }
7785 
7786  C(i ,j) -= sum( xmm1 ) * scalar;
7787  C(i+1UL,j) -= sum( xmm2 ) * scalar;
7788 
7789  for( ; remainder && k<kend; ++k ) {
7790  C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7791  C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7792  }
7793  }
7794  }
7795 
      // Trailing single row (only reached if a row is left over).
7796  if( i < M )
7797  {
7798  const size_t jend( LOW ? i+1UL : N );
7799  size_t j( UPP ? i : 0UL );
7800 
      // 1x2 tile.
7801  for( ; (j+2UL) <= jend; j+=2UL )
7802  {
7803  const size_t kbegin( ( IsUpper_v<MT4> )
7804  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7805  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7806  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
7807 
7808  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7809  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7810 
7811  SIMDType xmm1, xmm2;
7812  size_t k( kbegin );
7813 
7814  for( ; k<kpos; k+=SIMDSIZE ) {
7815  const SIMDType a1( A.load(i,k) );
7816  xmm1 += a1 * B.load(k,j );
7817  xmm2 += a1 * B.load(k,j+1UL);
7818  }
7819 
7820  C(i,j ) -= sum( xmm1 ) * scalar;
7821  C(i,j+1UL) -= sum( xmm2 ) * scalar;
7822 
7823  for( ; remainder && k<kend; ++k ) {
7824  C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7825  C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7826  }
7827  }
7828 
      // 1x1 tile: the last element of the row. The k range always ends at K here.
7829  if( j < jend )
7830  {
7831  const size_t kbegin( ( IsUpper_v<MT4> )
7832  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7833  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7834 
7835  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
7836  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7837 
7838  SIMDType xmm1;
7839  size_t k( kbegin );
7840 
7841  for( ; k<kpos; k+=SIMDSIZE ) {
7842  xmm1 += A.load(i,k) * B.load(k,j);
7843  }
7844 
7845  C(i,j) -= sum( xmm1 ) * scalar;
7846 
7847  for( ; remainder && k<K; ++k ) {
7848  C(i,j) -= A(i,k) * B(k,j) * scalar;
7849  }
7850  }
7851  }
7852  }
7853  //**********************************************************************************************
7854 
7855  //**Default subtraction assignment to dense matrices (large matrices)***************************
7869  template< typename MT3 // Type of the left-hand side target matrix
7870  , typename MT4 // Type of the left-hand side matrix operand
7871  , typename MT5 // Type of the right-hand side matrix operand
7872  , typename ST2 > // Type of the scalar value
7873  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7874  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7875  {
7876  selectDefaultSubAssignKernel( C, A, B, scalar );
7877  }
7878  //**********************************************************************************************
7879 
7880  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
7895  template< typename MT3 // Type of the left-hand side target matrix
7896  , typename MT4 // Type of the left-hand side matrix operand
7897  , typename MT5 // Type of the right-hand side matrix operand
7898  , typename ST2 > // Type of the scalar value
7899  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7900  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7901  {
7902  if( LOW )
7903  lmmm( C, A, B, -scalar, ST2(1) );
7904  else if( UPP )
7905  ummm( C, A, B, -scalar, ST2(1) );
7906  else
7907  mmm( C, A, B, -scalar, ST2(1) );
7908  }
7909  //**********************************************************************************************
7910 
7911  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
7925  template< typename MT3 // Type of the left-hand side target matrix
7926  , typename MT4 // Type of the left-hand side matrix operand
7927  , typename MT5 // Type of the right-hand side matrix operand
7928  , typename ST2 > // Type of the scalar value
7929  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7930  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7931  {
7932  selectLargeSubAssignKernel( C, A, B, scalar );
7933  }
7934  //**********************************************************************************************
7935 
7936  //**BLAS-based subtraction assignment to dense matrices******************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION

/*!\brief BLAS-based subtraction assignment kernel.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected whenever UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
//  - If \a A is triangular, a serially evaluated copy of \a B is passed to trmm() together
//    with \a A (CblasLeft) and the scalar; the result is then subtracted from \a C.
//  - Otherwise, if \a B is triangular, a serially evaluated copy of \a A is passed to trmm()
//    together with \a B (CblasRight) and the scalar; the result is then subtracted from \a C.
//  - Otherwise gemm() is called on \a C directly with the negated scalar and a factor of 1.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
{
   using ElemType = ElementType_t<MT3>;

   // Triangular left-hand side operand: scaled product via trmm() from the left
   if( IsTriangular_v<MT4> ) {
      ResultType_t<MT3> product( serial( B ) );
      const auto uplo( IsLower_v<MT4> ? CblasLower : CblasUpper );
      trmm( product, A, CblasLeft, uplo, ElemType(scalar) );
      subAssign( C, product );
      return;
   }

   // Triangular right-hand side operand: scaled product via trmm() from the right
   if( IsTriangular_v<MT5> ) {
      ResultType_t<MT3> product( serial( A ) );
      const auto uplo( IsLower_v<MT5> ? CblasLower : CblasUpper );
      trmm( product, B, CblasRight, uplo, ElemType(scalar) );
      subAssign( C, product );
      return;
   }

   // General case: update the target in place
   gemm( C, A, B, ElemType(-scalar), ElemType(1) );
}
#endif
7975  //**********************************************************************************************
7976 
7977  //**Subtraction assignment to sparse matrices***************************************************
7978  // No special implementation for the subtraction assignment to sparse matrices.
7979  //**********************************************************************************************
7980 
7981  //**Schur product assignment to dense matrices**************************************************
7993  template< typename MT // Type of the target dense matrix
7994  , bool SO > // Storage order of the target dense matrix
7995  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7996  {
7998 
8002 
8003  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8004  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8005 
8006  const ResultType tmp( serial( rhs ) );
8007  schurAssign( ~lhs, tmp );
8008  }
8009  //**********************************************************************************************
8010 
8011  //**Schur product assignment to sparse matrices*************************************************
8012  // No special implementation for the Schur product assignment to sparse matrices.
8013  //**********************************************************************************************
8014 
8015  //**Multiplication assignment to dense matrices*************************************************
8016  // No special implementation for the multiplication assignment to dense matrices.
8017  //**********************************************************************************************
8018 
8019  //**Multiplication assignment to sparse matrices************************************************
8020  // No special implementation for the multiplication assignment to sparse matrices.
8021  //**********************************************************************************************
8022 
8023  //**SMP assignment to dense matrices************************************************************
8038  template< typename MT // Type of the target dense matrix
8039  , bool SO > // Storage order of the target dense matrix
8040  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8041  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8042  {
8044 
8045  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8046  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8047 
8048  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8049  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8050 
8051  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
8052  return;
8053  }
8054  else if( left.columns() == 0UL ) {
8055  reset( ~lhs );
8056  return;
8057  }
8058 
8059  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8060  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8061 
8062  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8063  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8064  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8065  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8066  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8067  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8068 
8069  smpAssign( ~lhs, A * B * rhs.scalar_ );
8070  }
8071  //**********************************************************************************************
8072 
8073  //**SMP assignment to sparse matrices***********************************************************
8088  template< typename MT // Type of the target sparse matrix
8089  , bool SO > // Storage order of the target sparse matrix
8090  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8091  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8092  {
8094 
8095  using TmpType = If_t< SO, OppositeType, ResultType >;
8096 
8103 
8104  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8105  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8106 
8107  const ForwardFunctor fwd;
8108 
8109  const TmpType tmp( rhs );
8110  smpAssign( ~lhs, fwd( tmp ) );
8111  }
8112  //**********************************************************************************************
8113 
8114  //**SMP addition assignment to dense matrices***************************************************
8129  template< typename MT // Type of the target dense matrix
8130  , bool SO > // Storage order of the target dense matrix
8131  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8132  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8133  {
8135 
8136  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8137  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8138 
8139  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8140  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8141 
8142  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8143  return;
8144  }
8145 
8146  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8147  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8148 
8149  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8150  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8151  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8152  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8153  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8154  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8155 
8156  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
8157  }
8158  //**********************************************************************************************
8159 
8160  //**SMP addition assignment to sparse matrices**************************************************
8161  // No special implementation for the SMP addition assignment to sparse matrices.
8162  //**********************************************************************************************
8163 
8164  //**SMP subtraction assignment to dense matrices************************************************
8179  template< typename MT // Type of the target dense matrix
8180  , bool SO > // Storage order of the target dense matrix
8181  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8182  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8183  {
8185 
8186  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8187  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8188 
8189  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8190  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8191 
8192  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8193  return;
8194  }
8195 
8196  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8197  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8198 
8199  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8200  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8201  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8202  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8203  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8204  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8205 
8206  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
8207  }
8208  //**********************************************************************************************
8209 
8210  //**SMP subtraction assignment to sparse matrices***********************************************
8211  // No special implementation for the SMP subtraction assignment to sparse matrices.
8212  //**********************************************************************************************
8213 
8214  //**SMP Schur product assignment to dense matrices**********************************************
8226  template< typename MT // Type of the target dense matrix
8227  , bool SO > // Storage order of the target dense matrix
8228  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8229  {
8231 
8235 
8236  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8237  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8238 
8239  const ResultType tmp( rhs );
8240  smpSchurAssign( ~lhs, tmp );
8241  }
8242  //**********************************************************************************************
8243 
8244  //**SMP Schur product assignment to sparse matrices*********************************************
8245  // No special implementation for the SMP Schur product assignment to sparse matrices.
8246  //**********************************************************************************************
8247 
8248  //**SMP multiplication assignment to dense matrices*********************************************
8249  // No special implementation for the SMP multiplication assignment to dense matrices.
8250  //**********************************************************************************************
8251 
8252  //**SMP multiplication assignment to sparse matrices********************************************
8253  // No special implementation for the SMP multiplication assignment to sparse matrices.
8254  //**********************************************************************************************
8255 
8256  //**Compile time checks*************************************************************************
8265  //**********************************************************************************************
8266 };
8268 //*************************************************************************************************
8269 
8270 
8271 
8272 
8273 //=================================================================================================
8274 //
8275 // GLOBAL BINARY ARITHMETIC OPERATORS
8276 //
8277 //=================================================================================================
8278 
8279 //*************************************************************************************************
// Multiplication operator for a dense matrix with storage-order flag 'false' and a
// dense matrix with storage-order flag 'true' (presumably row-major times column-major;
// confirm against the DenseMatrix documentation).
//
// lhs : left-hand side dense matrix operand.
// rhs : right-hand side dense matrix operand.
// Returns a const DMatTDMatMultExpr that refers to both operands, with all four
// structure flags (symmetric, Hermitian, lower, upper) set to false.
// Raises std::invalid_argument (through BLAZE_THROW_INVALID_ARGUMENT) when the number
// of columns of lhs differs from the number of rows of rhs.
// NOTE(review): listing number 8314 is absent from this rendering.
8309 template< typename MT1 // Type of the left-hand side dense matrix
8310  , typename MT2 > // Type of the right-hand side dense matrix
8311 inline decltype(auto)
8312  operator*( const DenseMatrix<MT1,false>& lhs, const DenseMatrix<MT2,true>& rhs )
8313 {
8315 
8316  if( (~lhs).columns() != (~rhs).rows() ) { // inner dimensions must agree
8317  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
8318  }
8319 
8320  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,false,false,false,false>;
8321  return ReturnType( ~lhs, ~rhs ); // only the expression object is built here
8322 }
8323 //*************************************************************************************************
8324 
8325 
8326 
8327 
8328 //=================================================================================================
8329 //
8330 // GLOBAL FUNCTIONS
8331 //
8332 //=================================================================================================
8333 
8334 //*************************************************************************************************
// Declares the given dense matrix/transpose dense matrix multiplication expression
// as symmetric.
//
// dm : the multiplication expression to be re-tagged.
// Returns a const DMatTDMatMultExpr built from the same left and right operands with
// the symmetry flag (SF) forced to true; HF, LF and UF are carried over unchanged.
// Raises std::invalid_argument (through BLAZE_THROW_INVALID_ARGUMENT) when
// isSquare( dm ) is false.
// NOTE(review): listing number 8367 is absent from this rendering.
8359 template< typename MT1 // Type of the left-hand side dense matrix
8360  , typename MT2 // Type of the right-hand side dense matrix
8361  , bool SF // Symmetry flag
8362  , bool HF // Hermitian flag
8363  , bool LF // Lower flag
8364  , bool UF > // Upper flag
8365 inline decltype(auto) declsym( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8366 {
8368 
8369  if( !isSquare( dm ) ) { // the declaration is rejected for non-square expressions
8370  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
8371  }
8372 
8373  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>; // SF -> true
8374  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8375 }
8377 //*************************************************************************************************
8378 
8379 
8380 //*************************************************************************************************
// Declares the given dense matrix/transpose dense matrix multiplication expression
// as Hermitian.
//
// dm : the multiplication expression to be re-tagged.
// Returns a const DMatTDMatMultExpr built from the same left and right operands with
// the Hermitian flag (HF) forced to true; SF, LF and UF are carried over unchanged.
// Raises std::invalid_argument (through BLAZE_THROW_INVALID_ARGUMENT) when
// isSquare( dm ) is false.
// NOTE(review): listing number 8413 is absent from this rendering.
8405 template< typename MT1 // Type of the left-hand side dense matrix
8406  , typename MT2 // Type of the right-hand side dense matrix
8407  , bool SF // Symmetry flag
8408  , bool HF // Hermitian flag
8409  , bool LF // Lower flag
8410  , bool UF > // Upper flag
8411 inline decltype(auto) declherm( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8412 {
8414 
8415  if( !isSquare( dm ) ) { // the declaration is rejected for non-square expressions
8416  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
8417  }
8418 
8419  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>; // HF -> true
8420  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8421 }
8423 //*************************************************************************************************
8424 
8425 
8426 //*************************************************************************************************
// Declares the given dense matrix/transpose dense matrix multiplication expression
// as lower.
//
// dm : the multiplication expression to be re-tagged.
// Returns a const DMatTDMatMultExpr built from the same left and right operands with
// the lower flag (LF) forced to true; SF, HF and UF are carried over unchanged.
// Raises std::invalid_argument (through BLAZE_THROW_INVALID_ARGUMENT) when
// isSquare( dm ) is false.
// NOTE(review): listing number 8459 is absent from this rendering.
8451 template< typename MT1 // Type of the left-hand side dense matrix
8452  , typename MT2 // Type of the right-hand side dense matrix
8453  , bool SF // Symmetry flag
8454  , bool HF // Hermitian flag
8455  , bool LF // Lower flag
8456  , bool UF > // Upper flag
8457 inline decltype(auto) decllow( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8458 {
8460 
8461  if( !isSquare( dm ) ) { // the declaration is rejected for non-square expressions
8462  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
8463  }
8464 
8465  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>; // LF -> true
8466  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8467 }
8469 //*************************************************************************************************
8470 
8471 
8472 //*************************************************************************************************
// Declares the given dense matrix/transpose dense matrix multiplication expression
// as upper.
//
// dm : the multiplication expression to be re-tagged.
// Returns a const DMatTDMatMultExpr built from the same left and right operands with
// the upper flag (UF) forced to true; SF, HF and LF are carried over unchanged.
// Raises std::invalid_argument (through BLAZE_THROW_INVALID_ARGUMENT) when
// isSquare( dm ) is false.
// NOTE(review): listing number 8505 is absent from this rendering.
8497 template< typename MT1 // Type of the left-hand side dense matrix
8498  , typename MT2 // Type of the right-hand side dense matrix
8499  , bool SF // Symmetry flag
8500  , bool HF // Hermitian flag
8501  , bool LF // Lower flag
8502  , bool UF > // Upper flag
8503 inline decltype(auto) declupp( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8504 {
8506 
8507  if( !isSquare( dm ) ) { // the declaration is rejected for non-square expressions
8508  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
8509  }
8510 
8511  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>; // UF -> true
8512  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8513 }
8515 //*************************************************************************************************
8516 
8517 
8518 //*************************************************************************************************
// Declares the given dense matrix/transpose dense matrix multiplication expression
// as diagonal.
//
// dm : the multiplication expression to be re-tagged.
// Returns a const DMatTDMatMultExpr built from the same left and right operands with
// both the lower flag (LF) and the upper flag (UF) forced to true; SF and HF are
// carried over unchanged.
// Raises std::invalid_argument (through BLAZE_THROW_INVALID_ARGUMENT) when
// isSquare( dm ) is false.
// NOTE(review): listing number 8551 is absent from this rendering.
8543 template< typename MT1 // Type of the left-hand side dense matrix
8544  , typename MT2 // Type of the right-hand side dense matrix
8545  , bool SF // Symmetry flag
8546  , bool HF // Hermitian flag
8547  , bool LF // Lower flag
8548  , bool UF > // Upper flag
8549 inline decltype(auto) decldiag( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8550 {
8552 
8553  if( !isSquare( dm ) ) { // the declaration is rejected for non-square expressions
8554  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
8555  }
8556 
8557  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>; // LF and UF -> true
8558  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8559 }
8561 //*************************************************************************************************
8562 
8563 
8564 
8565 
8566 //=================================================================================================
8567 //
8568 // SIZE SPECIALIZATIONS
8569 //
8570 //=================================================================================================
8571 
8572 //*************************************************************************************************
// Size specialization for dimension index 0 of a DMatTDMatMultExpr: the value is
// inherited from Size<MT1,0UL>, i.e. from the left-hand side operand type.
// (Index 0 presumably denotes the number of rows - confirm against the Size trait.)
8574 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
8575 struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
8576  : public Size<MT1,0UL>
8577 {};
8578 
// Size specialization for dimension index 1 of a DMatTDMatMultExpr: the value is
// inherited from Size<MT2,1UL>, i.e. from the right-hand side operand type.
// (Index 1 presumably denotes the number of columns - confirm against the Size trait.)
8579 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
8580 struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
8581  : public Size<MT2,1UL>
8582 {};
8584 //*************************************************************************************************
8585 
8586 
8587 
8588 
8589 //=================================================================================================
8590 //
8591 // ISALIGNED SPECIALIZATIONS
8592 //
8593 //=================================================================================================
8594 
8595 //*************************************************************************************************
// IsAligned specialization for DMatTDMatMultExpr: the trait is true exactly when
// IsAligned_v holds for both operand types MT1 and MT2; the structure flags
// SF, HF, LF and UF do not influence the result.
8597 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
8598 struct IsAligned< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8599  : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
8600 {};
8602 //*************************************************************************************************
8603 
8604 } // namespace blaze
8605 
8606 #endif
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatTDMatMultExpr.h:287
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:427
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Data type constraint.
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatTDMatMultExpr.h:171
Header file for basic type definitions.
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:149
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatTDMatMultExpr.h:169
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:151
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias template for the If class template. The If_t alias template provides a convenient shor...
Definition: If.h:109
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:153
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:270
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:533
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:373
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type,...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:265
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:606
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:595
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:523
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:154
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1001
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:597
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
Header file for the Computation base class.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:273
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:432
Header file for the IsBLASCompatible type trait.
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes....
Definition: DenseMatrix.h:81
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.The ElementType_t alias declaration provide...
Definition: Aliases.h:170
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:475
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:389
Header file for the IsComplexDouble type trait.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:309
Constraint on the data type.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:150
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:154
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:565
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:59
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:443
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1162
Header file for the decllow trait.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2147
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatTDMatMultExpr.h:168
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:168
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1001
Header file for the IsLower type trait.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatTDMatMultExpr.h:300
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:279
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:431
Header file for the IsAligned type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:553
Generic wrapper for the null function.
Definition: Noop.h:60
Header file for the IsTriangular type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:269
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:162
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:158
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:267
Header file for the exception macros of the math module.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:476
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1198
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions.The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:469
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:453
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:161
Header file for the MatScalarMultExpr base class.
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:263
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:174
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:422
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:164
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Header file for all forward declarations for expression class templates.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:419
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:266
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:59
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:105
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.The BoolConstant alias template represents ...
Definition: IntegralConstant.h:110
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:577
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatTDMatMultExpr.h:294
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:409
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:59
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:282
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatTDMatMultExpr.h:268
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:324
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode....
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatTDMatMultExpr.h:170
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:454
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:441
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:152
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:59
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
Expression object for dense matrix-transpose dense matrix multiplications. The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:143
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:587
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:107
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1324
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:59
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:399
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:543
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:171
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression,...
Definition: Assert.h:101
Header file for the DeclSym functor.
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:276
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:463
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:159
Header file for the IsExpression type trait class.
Header file for the function trace functionality.