DMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/dense/MMM.h>
53 #include <blaze/math/Exception.h>
66 #include <blaze/math/shims/Reset.h>
68 #include <blaze/math/SIMD.h>
96 #include <blaze/math/views/Check.h>
97 #include <blaze/system/BLAS.h>
98 #include <blaze/system/Blocking.h>
100 #include <blaze/system/Thresholds.h>
103 #include <blaze/util/Assert.h>
104 #include <blaze/util/Complex.h>
107 #include <blaze/util/DisableIf.h>
108 #include <blaze/util/EnableIf.h>
111 #include <blaze/util/mpl/If.h>
112 #include <blaze/util/TrueType.h>
113 #include <blaze/util/Types.h>
121 
122 
123 namespace blaze {
124 
125 //=================================================================================================
126 //
127 // CLASS DMATTDMATMULTEXPR
128 //
129 //=================================================================================================
130 
131 //*************************************************************************************************
138 template< typename MT1 // Type of the left-hand side dense matrix
139  , typename MT2 // Type of the right-hand side dense matrix
140  , bool SF // Symmetry flag
141  , bool HF // Hermitian flag
142  , bool LF // Lower flag
143  , bool UF > // Upper flag
145  : public MatMatMultExpr< DenseMatrix< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
146  , private Computation
147 {
148  private:
149  //**Type definitions****************************************************************************
156  //**********************************************************************************************
157 
158  //**********************************************************************************************
160  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
161  //**********************************************************************************************
162 
163  //**********************************************************************************************
165  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
166  //**********************************************************************************************
167 
168  //**********************************************************************************************
169  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
170  static constexpr bool HERM = ( HF && !( LF || UF ) );
171  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
172  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
173  //**********************************************************************************************
174 
175  //**********************************************************************************************
177 
181  template< typename T1, typename T2, typename T3 >
182  static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
184  //**********************************************************************************************
185 
186  //**********************************************************************************************
188 
191  template< typename T1, typename T2, typename T3 >
192  static constexpr bool UseBlasKernel_v =
193  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
194  !SYM && !HERM && !LOW && !UPP &&
195  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
196  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
197  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
198  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
199  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
200  IsBLASCompatible_v< ElementType_t<T1> > &&
201  IsBLASCompatible_v< ElementType_t<T2> > &&
202  IsBLASCompatible_v< ElementType_t<T3> > &&
203  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
204  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
206  //**********************************************************************************************
207 
208  //**********************************************************************************************
210 
213  template< typename T1, typename T2, typename T3 >
214  static constexpr bool UseVectorizedDefaultKernel_v =
215  ( useOptimizedKernels &&
216  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
217  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
218  IsSIMDCombinable_v< ElementType_t<T1>
220  , ElementType_t<T3> > &&
221  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
222  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
224  //**********************************************************************************************
225 
226  //**********************************************************************************************
228 
231  using ForwardFunctor = If_t< HERM
232  , DeclHerm
233  , If_t< SYM
234  , DeclSym
235  , If_t< LOW
236  , If_t< UPP
237  , DeclDiag
238  , DeclLow >
239  , If_t< UPP
240  , DeclUpp
241  , Noop > > > >;
243  //**********************************************************************************************
244 
245  public:
246  //**Type definitions****************************************************************************
249 
252 
// Result type of the expression, selected via the HERM/SYM/LOW/UPP flags.
// NOTE(review): this listing is incomplete — the embedded numbering skips
// 255, 257, 260, 261 and 263, i.e. the alternatives chosen for HERM, SYM,
// LOW && UPP, LOW and UPP are not visible here. Only the fallback
// MultTrait<RT1,RT2> is shown. Restore the missing lines from the original
// header before compiling.
254  using ResultType = typename If_t< HERM
256  , If_t< SYM
258  , If_t< LOW
259  , If_t< UPP
262  , If_t< UPP
264  , MultTrait<RT1,RT2> > > > >::Type;
265 
270  using ReturnType = const ElementType;
271  using CompositeType = const ResultType;
272 
274  using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
275 
277  using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
278 
281 
284  //**********************************************************************************************
285 
286  //**Compilation flags***************************************************************************
288  static constexpr bool simdEnabled =
289  ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
290  MT1::simdEnabled && MT2::simdEnabled &&
291  HasSIMDAdd_v<ET1,ET2> &&
292  HasSIMDMult_v<ET1,ET2> );
293 
// Compilation switch for SMP assignment.
// NOTE(review): the initializer of this flag is not part of this listing (the
// embedded numbering jumps from 295 to 297) — restore it from the original
// header; as shown the declaration is incomplete.
295  static constexpr bool smpAssignable =
297  //**********************************************************************************************
298 
299  //**SIMD properties*****************************************************************************
301  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
302  //**********************************************************************************************
303 
304  //**Constructor*********************************************************************************
310  explicit inline DMatTDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
311  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
312  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
313  {
314  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
315  }
316  //**********************************************************************************************
317 
318  //**Access operator*****************************************************************************
325  inline ReturnType operator()( size_t i, size_t j ) const {
326  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
327  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
328 
329  if( IsDiagonal_v<MT1> ) {
330  return lhs_(i,i) * rhs_(i,j);
331  }
332  else if( IsDiagonal_v<MT2> ) {
333  return lhs_(i,j) * rhs_(j,j);
334  }
335  else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
336  const size_t begin( ( IsUpper_v<MT1> )
337  ?( ( IsLower_v<MT2> )
338  ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
339  , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
340  :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
341  :( ( IsLower_v<MT2> )
342  ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
343  :( 0UL ) ) );
344  const size_t end( ( IsLower_v<MT1> )
345  ?( ( IsUpper_v<MT2> )
346  ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
347  , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
348  :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
349  :( ( IsUpper_v<MT2> )
350  ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
351  :( lhs_.columns() ) ) );
352 
353  if( begin >= end ) return ElementType();
354 
355  const size_t n( end - begin );
356 
357  return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
358  subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
359  }
360  else {
361  return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
362  }
363  }
364  //**********************************************************************************************
365 
366  //**At function*********************************************************************************
374  inline ReturnType at( size_t i, size_t j ) const {
375  if( i >= lhs_.rows() ) {
376  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
377  }
378  if( j >= rhs_.columns() ) {
379  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
380  }
381  return (*this)(i,j);
382  }
383  //**********************************************************************************************
384 
385  //**Rows function*******************************************************************************
390  inline size_t rows() const noexcept {
391  return lhs_.rows();
392  }
393  //**********************************************************************************************
394 
395  //**Columns function****************************************************************************
400  inline size_t columns() const noexcept {
401  return rhs_.columns();
402  }
403  //**********************************************************************************************
404 
405  //**Left operand access*************************************************************************
410  inline LeftOperand leftOperand() const noexcept {
411  return lhs_;
412  }
413  //**********************************************************************************************
414 
415  //**Right operand access************************************************************************
420  inline RightOperand rightOperand() const noexcept {
421  return rhs_;
422  }
423  //**********************************************************************************************
424 
425  //**********************************************************************************************
431  template< typename T >
432  inline bool canAlias( const T* alias ) const noexcept {
433  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
434  }
435  //**********************************************************************************************
436 
437  //**********************************************************************************************
443  template< typename T >
444  inline bool isAliased( const T* alias ) const noexcept {
445  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
446  }
447  //**********************************************************************************************
448 
449  //**********************************************************************************************
454  inline bool isAligned() const noexcept {
455  return lhs_.isAligned() && rhs_.isAligned();
456  }
457  //**********************************************************************************************
458 
459  //**********************************************************************************************
// Reports whether the expression can be used in SMP assignments.
// As visible here, the result is true if
//   - BLAS mode or BLAS matrix/matrix multiplication is disabled, or the
//     result (rows*columns) is smaller than DMATTDMATMULT_THRESHOLD, and
//   - the result is at least SMP_DMATTDMATMULT_THRESHOLD elements large, and
//   - neither operand type is diagonal.
// NOTE(review): the embedded listing numbering skips 467 inside the first
// parenthesized condition; a term of the disjunction may be missing from this
// listing — confirm against the original header.
464  inline bool canSMPAssign() const noexcept {
465  return ( !BLAZE_BLAS_MODE ||
466  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
468  ( rows() * columns() < DMATTDMATMULT_THRESHOLD ) ) &&
469  ( rows() * columns() >= SMP_DMATTDMATMULT_THRESHOLD ) &&
470  !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
471  }
472  //**********************************************************************************************
473 
474  private:
475  //**Member variables****************************************************************************
478  //**********************************************************************************************
479 
480  //**Assignment to dense matrices****************************************************************
// Assignment of the multiplication expression to a dense matrix (lhs = rhs).
//   lhs: target dense matrix; its dimensions must already match the expression
//   rhs: the multiplication expression to be assigned
// NOTE(review): the embedded listing numbering skips 497 at the start of the
// body, and the operand types LT and RT are declared outside this view.
493  template< typename MT // Type of the target dense matrix
494  , bool SO > // Storage order of the target dense matrix
495  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
496  {
498 
499  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
500  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
501 
// Empty target: nothing to do. Empty inner dimension: every element of the
// product is an empty sum, so the target is reset instead.
502  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
503  return;
504  }
505  else if( rhs.lhs_.columns() == 0UL ) {
506  reset( ~lhs );
507  return;
508  }
509 
// Both operands are wrapped in serial() before being bound to A and B.
510  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
511  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
512 
513  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
514  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
515  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
516  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
517  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
518  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
519 
// Kernel selection (small vs. BLAS) happens in selectAssignKernel().
520  DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
521  }
523  //**********************************************************************************************
524 
525  //**Assignment to dense matrices (kernel selection)*********************************************
536  template< typename MT3 // Type of the left-hand side target matrix
537  , typename MT4 // Type of the left-hand side matrix operand
538  , typename MT5 > // Type of the right-hand side matrix operand
539  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
540  {
541  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
542  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
543  selectSmallAssignKernel( C, A, B );
544  else
545  selectBlasAssignKernel( C, A, B );
546  }
548  //**********************************************************************************************
549 
550  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (scalar) kernel for C = A * B with a row-major target and two
// non-diagonal operands.
// The kernel walks over the rows of C. For every row the column range
// [jbegin,jend) and, per element, the inner range [kbegin,kend) are narrowed
// according to the compile-time structure of A and B (lower/upper, strict)
// and the SYM/HERM/LOW/UPP flags; elements outside the computed ranges are
// reset. For SYM/HERM only elements with j >= i are computed and the lower
// part is filled by mirroring at the end.
564  template< typename MT3 // Type of the left-hand side target matrix
565  , typename MT4 // Type of the left-hand side matrix operand
566  , typename MT5 > // Type of the right-hand side matrix operand
567  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
568  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
569  {
570  const size_t M( A.rows() );
571  const size_t N( B.columns() );
572  const size_t K( A.columns() );
573 
574  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
575 
// Rows [0,ibegin) and [iend,M) are reset as a whole: they are excluded when
// A is strictly lower (leading rows) or strictly upper (trailing rows).
576  const size_t ibegin( ( IsStrictlyLower_v<MT4> )
577  ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
578  :( 0UL ) );
579  const size_t iend( ( IsStrictlyUpper_v<MT4> )
580  ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
581  :( M ) );
582  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
583 
584  for( size_t i=0UL; i<ibegin; ++i ) {
585  for( size_t j=0UL; j<N; ++j ) {
586  reset( C(i,j) );
587  }
588  }
589  for( size_t i=ibegin; i<iend; ++i )
590  {
// Column range of row i that is actually computed.
591  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
592  ?( ( IsStrictlyUpper_v<MT4> )
593  ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
594  :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
595  :( ( IsStrictlyUpper_v<MT5> )
596  ?( SYM || HERM || UPP ? max( i, 1UL ) : 1UL )
597  :( SYM || HERM || UPP ? i : 0UL ) ) );
598  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
599  ?( ( IsStrictlyLower_v<MT4> )
600  ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
601  :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
602  :( ( IsStrictlyLower_v<MT5> )
603  ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
604  :( LOW ? i+1UL : N ) ) );
605 
// With structure flags the computed range can be empty (jbegin > jend); the
// whole row is reset in that case.
606  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
607  for( size_t j=0UL; j<N; ++j ) {
608  reset( C(i,j) );
609  }
610  continue;
611  }
612 
613  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
614 
// Leading elements of the row are reset. For SYM/HERM the reset starts at
// column i because columns [0,i) are written by the mirroring step below.
615  for( size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
616  reset( C(i,j) );
617  }
618  for( size_t j=jbegin; j<jend; ++j )
619  {
// Inner index range contributing to C(i,j), narrowed by the triangular
// structure of A (row i) and B (column j).
620  const size_t kbegin( ( IsUpper_v<MT4> )
621  ?( ( IsLower_v<MT5> )
622  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
623  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
624  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
625  :( ( IsLower_v<MT5> )
626  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
627  :( 0UL ) ) );
628  const size_t kend( ( IsLower_v<MT4> )
629  ?( ( IsUpper_v<MT5> )
630  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
631  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
632  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
633  :( ( IsUpper_v<MT5> )
634  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
635  :( K ) ) );
636  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
637 
// The first product initializes C(i,j); the remaining products are accumulated.
638  C(i,j) = A(i,kbegin) * B(kbegin,j);
639  for( size_t k=kbegin+1UL; k<kend; ++k ) {
640  C(i,j) += A(i,k) * B(k,j);
641  }
642  }
643  for( size_t j=jend; j<N; ++j ) {
644  reset( C(i,j) );
645  }
646  }
647  for( size_t i=iend; i<M; ++i ) {
648  for( size_t j=0UL; j<N; ++j ) {
649  reset( C(i,j) );
650  }
651  }
652 
// SYM/HERM: fill the strictly lower part from the computed upper part
// (conjugated for HERM).
653  if( SYM || HERM ) {
654  for( size_t i=1UL; i<M; ++i ) {
655  for( size_t j=0UL; j<i; ++j ) {
656  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
657  }
658  }
659  }
660  }
662  //**********************************************************************************************
663 
664  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (scalar) kernel for C = A * B with a column-major target and two
// non-diagonal operands.
// The kernel walks over the columns of C. For every column the row range
// [ibegin,iend) and, per element, the inner range [kbegin,kend) are narrowed
// according to the compile-time structure of A and B (lower/upper, strict)
// and the SYM/HERM/LOW/UPP flags; elements outside the computed ranges are
// reset. For SYM/HERM only elements with i >= j are computed and the upper
// part is filled by mirroring at the end.
678  template< typename MT3 // Type of the left-hand side target matrix
679  , typename MT4 // Type of the left-hand side matrix operand
680  , typename MT5 > // Type of the right-hand side matrix operand
681  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
682  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
683  {
684  const size_t M( A.rows() );
685  const size_t N( B.columns() );
686  const size_t K( A.columns() );
687 
688  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
689 
// Columns [0,jbegin) and [jend,N) are reset as a whole: they are excluded when
// B is strictly upper (leading columns) or strictly lower (trailing columns).
690  const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
691  ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
692  :( 0UL ) );
693  const size_t jend( ( IsStrictlyLower_v<MT5> )
694  ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
695  :( N ) );
696  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
697 
698  for( size_t j=0UL; j<jbegin; ++j ) {
699  for( size_t i=0UL; i<M; ++i ) {
700  reset( C(i,j) );
701  }
702  }
703  for( size_t j=jbegin; j<jend; ++j )
704  {
// Row range of column j that is actually computed.
705  const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
706  ?( ( IsStrictlyLower_v<MT4> )
707  ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
708  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
709  :( ( IsStrictlyLower_v<MT4> )
710  ?( SYM || HERM || LOW ? max( j, 1UL ) : 1UL )
711  :( SYM || HERM || LOW ? j : 0UL ) ) );
712  const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
713  ?( ( IsStrictlyUpper_v<MT4> )
714  ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
715  :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
716  :( ( IsStrictlyUpper_v<MT4> )
717  ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
718  :( UPP ? j+1UL : M ) ) );
719 
// With structure flags the computed range can be empty (ibegin > iend); the
// whole column is reset in that case.
720  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
721  for( size_t i=0UL; i<M; ++i ) {
722  reset( C(i,j) );
723  }
724  continue;
725  }
726 
727  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
728 
// Leading elements of the column are reset. For SYM/HERM the reset starts at
// row j because rows [0,j) are written by the mirroring step below.
729  for( size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
730  reset( C(i,j) );
731  }
732  for( size_t i=ibegin; i<iend; ++i )
733  {
// Inner index range contributing to C(i,j), narrowed by the triangular
// structure of A (row i) and B (column j).
734  const size_t kbegin( ( IsUpper_v<MT4> )
735  ?( ( IsLower_v<MT5> )
736  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
737  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
738  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
739  :( ( IsLower_v<MT5> )
740  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
741  :( 0UL ) ) );
742  const size_t kend( ( IsLower_v<MT4> )
743  ?( ( IsUpper_v<MT5> )
744  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
745  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
746  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
747  :( ( IsUpper_v<MT5> )
748  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
749  :( K ) ) );
750  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
751 
// The first product initializes C(i,j); the remaining products are accumulated.
752  C(i,j) = A(i,kbegin) * B(kbegin,j);
753  for( size_t k=kbegin+1UL; k<kend; ++k ) {
754  C(i,j) += A(i,k) * B(k,j);
755  }
756  }
757  for( size_t i=iend; i<M; ++i ) {
758  reset( C(i,j) );
759  }
760  }
761  for( size_t j=jend; j<N; ++j ) {
762  for( size_t i=0UL; i<M; ++i ) {
763  reset( C(i,j) );
764  }
765  }
766 
// SYM/HERM: fill the strictly upper part from the computed lower part
// (conjugated for HERM).
767  if( SYM || HERM ) {
768  for( size_t j=1UL; j<N; ++j ) {
769  for( size_t i=0UL; i<j; ++i ) {
770  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
771  }
772  }
773  }
774  }
776  //**********************************************************************************************
777 
778  //**Default assignment to row-major dense matrices (general/diagonal)***************************
792  template< typename MT3 // Type of the left-hand side target matrix
793  , typename MT4 // Type of the left-hand side matrix operand
794  , typename MT5 > // Type of the right-hand side matrix operand
795  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
796  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
797  {
798  const size_t M( A.rows() );
799  const size_t N( B.columns() );
800 
801  for( size_t i=0UL; i<M; ++i )
802  {
803  const size_t jbegin( ( IsUpper_v<MT4> )
804  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
805  :( 0UL ) );
806  const size_t jend( ( IsLower_v<MT4> )
807  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
808  :( N ) );
809  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
810 
811  if( IsUpper_v<MT4> ) {
812  for( size_t j=0UL; j<jbegin; ++j ) {
813  reset( C(i,j) );
814  }
815  }
816  for( size_t j=jbegin; j<jend; ++j ) {
817  C(i,j) = A(i,j) * B(j,j);
818  }
819  if( IsLower_v<MT4> ) {
820  for( size_t j=jend; j<N; ++j ) {
821  reset( C(i,j) );
822  }
823  }
824  }
825  }
827  //**********************************************************************************************
828 
829  //**Default assignment to column-major dense matrices (general/diagonal)************************
843  template< typename MT3 // Type of the left-hand side target matrix
844  , typename MT4 // Type of the left-hand side matrix operand
845  , typename MT5 > // Type of the right-hand side matrix operand
846  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
847  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
848  {
849  constexpr size_t block( BLOCK_SIZE );
850 
851  const size_t M( A.rows() );
852  const size_t N( B.columns() );
853 
854  for( size_t jj=0UL; jj<N; jj+=block ) {
855  const size_t jend( min( N, jj+block ) );
856  for( size_t ii=0UL; ii<M; ii+=block ) {
857  const size_t iend( min( M, ii+block ) );
858  for( size_t j=jj; j<jend; ++j )
859  {
860  const size_t ibegin( ( IsLower_v<MT4> )
861  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
862  :( ii ) );
863  const size_t ipos( ( IsUpper_v<MT4> )
864  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
865  :( iend ) );
866 
867  if( IsLower_v<MT4> ) {
868  for( size_t i=ii; i<ibegin; ++i ) {
869  reset( C(i,j) );
870  }
871  }
872  for( size_t i=ibegin; i<ipos; ++i ) {
873  C(i,j) = A(i,j) * B(j,j);
874  }
875  if( IsUpper_v<MT4> ) {
876  for( size_t i=ipos; i<iend; ++i ) {
877  reset( C(i,j) );
878  }
879  }
880  }
881  }
882  }
883  }
885  //**********************************************************************************************
886 
887  //**Default assignment to row-major dense matrices (diagonal/general)***************************
901  template< typename MT3 // Type of the left-hand side target matrix
902  , typename MT4 // Type of the left-hand side matrix operand
903  , typename MT5 > // Type of the right-hand side matrix operand
904  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
905  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
906  {
907  constexpr size_t block( BLOCK_SIZE );
908 
909  const size_t M( A.rows() );
910  const size_t N( B.columns() );
911 
912  for( size_t ii=0UL; ii<M; ii+=block ) {
913  const size_t iend( min( M, ii+block ) );
914  for( size_t jj=0UL; jj<N; jj+=block ) {
915  const size_t jend( min( N, jj+block ) );
916  for( size_t i=ii; i<iend; ++i )
917  {
918  const size_t jbegin( ( IsUpper_v<MT5> )
919  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
920  :( jj ) );
921  const size_t jpos( ( IsLower_v<MT5> )
922  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
923  :( jend ) );
924 
925  if( IsUpper_v<MT5> ) {
926  for( size_t j=jj; j<jbegin; ++j ) {
927  reset( C(i,j) );
928  }
929  }
930  for( size_t j=jbegin; j<jpos; ++j ) {
931  C(i,j) = A(i,i) * B(i,j);
932  }
933  if( IsLower_v<MT5> ) {
934  for( size_t j=jpos; j<jend; ++j ) {
935  reset( C(i,j) );
936  }
937  }
938  }
939  }
940  }
941  }
943  //**********************************************************************************************
944 
945  //**Default assignment to column-major dense matrices (diagonal/general)************************
959  template< typename MT3 // Type of the left-hand side target matrix
960  , typename MT4 // Type of the left-hand side matrix operand
961  , typename MT5 > // Type of the right-hand side matrix operand
962  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
963  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
964  {
965  const size_t M( A.rows() );
966  const size_t N( B.columns() );
967 
968  for( size_t j=0UL; j<N; ++j )
969  {
970  const size_t ibegin( ( IsLower_v<MT5> )
971  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
972  :( 0UL ) );
973  const size_t iend( ( IsUpper_v<MT5> )
974  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
975  :( M ) );
976  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
977 
978  if( IsLower_v<MT5> ) {
979  for( size_t i=0UL; i<ibegin; ++i ) {
980  reset( C(i,j) );
981  }
982  }
983  for( size_t i=ibegin; i<iend; ++i ) {
984  C(i,j) = A(i,i) * B(i,j);
985  }
986  if( IsUpper_v<MT5> ) {
987  for( size_t i=iend; i<M; ++i ) {
988  reset( C(i,j) );
989  }
990  }
991  }
992  }
994  //**********************************************************************************************
995 
996  //**Default assignment to dense matrices (diagonal/diagonal)************************************
1010  template< typename MT3 // Type of the left-hand side target matrix
1011  , typename MT4 // Type of the left-hand side matrix operand
1012  , typename MT5 > // Type of the right-hand side matrix operand
1013  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
1014  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1015  {
1016  reset( C );
1017 
1018  for( size_t i=0UL; i<A.rows(); ++i ) {
1019  C(i,i) = A(i,i) * B(i,i);
1020  }
1021  }
1023  //**********************************************************************************************
1024 
1025  //**Default assignment to dense matrices (small matrices)***************************************
1039  template< typename MT3 // Type of the left-hand side target matrix
1040  , typename MT4 // Type of the left-hand side matrix operand
1041  , typename MT5 > // Type of the right-hand side matrix operand
1042  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1043  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1044  {
1045  selectDefaultAssignKernel( C, A, B );
1046  }
1048  //**********************************************************************************************
1049 
1050  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
1065  template< typename MT3 // Type of the left-hand side target matrix
1066  , typename MT4 // Type of the left-hand side matrix operand
1067  , typename MT5 > // Type of the right-hand side matrix operand
1068  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1069  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1070  {
1071  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
1072 
1073  const size_t M( A.rows() );
1074  const size_t N( B.columns() );
1075  const size_t K( A.columns() );
1076 
1077  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1078 
1079  if( LOW && UPP ) {
1080  reset( C );
1081  }
1082 
1083  {
1084  size_t i( 0UL );
1085 
1086  for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
1087  {
1088  const size_t jend( LOW ? i+2UL : N );
1089  size_t j( SYM || HERM || UPP ? i : 0UL );
1090 
1091  for( ; (j+4UL) <= jend; j+=4UL )
1092  {
1093  const size_t kbegin( ( IsUpper_v<MT4> )
1094  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1095  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1096  const size_t kend( ( IsLower_v<MT4> )
1097  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
1098  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
1099 
1100  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1101  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1102 
1103  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1104  size_t k( kbegin );
1105 
1106  for( ; k<kpos; k+=SIMDSIZE ) {
1107  const SIMDType a1( A.load(i ,k) );
1108  const SIMDType a2( A.load(i+1UL,k) );
1109  const SIMDType b1( B.load(k,j ) );
1110  const SIMDType b2( B.load(k,j+1UL) );
1111  const SIMDType b3( B.load(k,j+2UL) );
1112  const SIMDType b4( B.load(k,j+3UL) );
1113  xmm1 += a1 * b1;
1114  xmm2 += a1 * b2;
1115  xmm3 += a1 * b3;
1116  xmm4 += a1 * b4;
1117  xmm5 += a2 * b1;
1118  xmm6 += a2 * b2;
1119  xmm7 += a2 * b3;
1120  xmm8 += a2 * b4;
1121  }
1122 
1123  C(i ,j ) = sum( xmm1 );
1124  C(i ,j+1UL) = sum( xmm2 );
1125  C(i ,j+2UL) = sum( xmm3 );
1126  C(i ,j+3UL) = sum( xmm4 );
1127  C(i+1UL,j ) = sum( xmm5 );
1128  C(i+1UL,j+1UL) = sum( xmm6 );
1129  C(i+1UL,j+2UL) = sum( xmm7 );
1130  C(i+1UL,j+3UL) = sum( xmm8 );
1131 
1132  for( ; remainder && k<kend; ++k ) {
1133  C(i ,j ) += A(i ,k) * B(k,j );
1134  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1135  C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1136  C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1137  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1138  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1139  C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1140  C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
1141  }
1142  }
1143 
1144  for( ; (j+2UL) <= jend; j+=2UL )
1145  {
1146  const size_t kbegin( ( IsUpper_v<MT4> )
1147  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1148  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1149  const size_t kend( ( IsLower_v<MT4> )
1150  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
1151  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1152 
1153  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1154  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1155 
1156  SIMDType xmm1, xmm2, xmm3, xmm4;
1157  size_t k( kbegin );
1158 
1159  for( ; k<kpos; k+=SIMDSIZE ) {
1160  const SIMDType a1( A.load(i ,k) );
1161  const SIMDType a2( A.load(i+1UL,k) );
1162  const SIMDType b1( B.load(k,j ) );
1163  const SIMDType b2( B.load(k,j+1UL) );
1164  xmm1 += a1 * b1;
1165  xmm2 += a1 * b2;
1166  xmm3 += a2 * b1;
1167  xmm4 += a2 * b2;
1168  }
1169 
1170  C(i ,j ) = sum( xmm1 );
1171  C(i ,j+1UL) = sum( xmm2 );
1172  C(i+1UL,j ) = sum( xmm3 );
1173  C(i+1UL,j+1UL) = sum( xmm4 );
1174 
1175  for( ; remainder && k<kend; ++k ) {
1176  C(i ,j ) += A(i ,k) * B(k,j );
1177  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1178  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1179  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1180  }
1181  }
1182 
1183  if( j < jend )
1184  {
1185  const size_t kbegin( ( IsUpper_v<MT4> )
1186  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1187  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1188  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
1189 
1190  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1191  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1192 
1193  SIMDType xmm1, xmm2;
1194  size_t k( kbegin );
1195 
1196  for( ; k<kpos; k+=SIMDSIZE ) {
1197  const SIMDType b1( B.load(k,j) );
1198  xmm1 += A.load(i ,k) * b1;
1199  xmm2 += A.load(i+1UL,k) * b1;
1200  }
1201 
1202  C(i ,j) = sum( xmm1 );
1203  C(i+1UL,j) = sum( xmm2 );
1204 
1205  for( ; remainder && k<kend; ++k ) {
1206  C(i ,j) += A(i ,k) * B(k,j);
1207  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1208  }
1209  }
1210  }
1211 
1212  for( ; i<M; ++i )
1213  {
1214  const size_t jend( LOW ? i+1UL : N );
1215  size_t j( SYM || HERM || UPP ? i : 0UL );
1216 
1217  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
1218  {
1219  const size_t kbegin( ( IsUpper_v<MT4> )
1220  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1221  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1222  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
1223 
1224  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1225  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1226 
1227  SIMDType xmm1, xmm2, xmm3, xmm4;
1228  size_t k( kbegin );
1229 
1230  for( ; k<kpos; k+=SIMDSIZE ) {
1231  const SIMDType a1( A.load(i,k) );
1232  xmm1 += a1 * B.load(k,j );
1233  xmm2 += a1 * B.load(k,j+1UL);
1234  xmm3 += a1 * B.load(k,j+2UL);
1235  xmm4 += a1 * B.load(k,j+3UL);
1236  }
1237 
1238  C(i,j ) = sum( xmm1 );
1239  C(i,j+1UL) = sum( xmm2 );
1240  C(i,j+2UL) = sum( xmm3 );
1241  C(i,j+3UL) = sum( xmm4 );
1242 
1243  for( ; remainder && k<kend; ++k ) {
1244  C(i,j ) += A(i,k) * B(k,j );
1245  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1246  C(i,j+2UL) += A(i,k) * B(k,j+2UL);
1247  C(i,j+3UL) += A(i,k) * B(k,j+3UL);
1248  }
1249  }
1250 
1251  for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
1252  {
1253  const size_t kbegin( ( IsUpper_v<MT4> )
1254  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1255  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1256  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
1257 
1258  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1259  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1260 
1261  SIMDType xmm1, xmm2;
1262  size_t k( kbegin );
1263 
1264  for( ; k<kpos; k+=SIMDSIZE ) {
1265  const SIMDType a1( A.load(i,k) );
1266  xmm1 += a1 * B.load(k,j );
1267  xmm2 += a1 * B.load(k,j+1UL);
1268  }
1269 
1270  C(i,j ) = sum( xmm1 );
1271  C(i,j+1UL) = sum( xmm2 );
1272 
1273  for( ; remainder && k<kend; ++k ) {
1274  C(i,j ) += A(i,k) * B(k,j );
1275  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1276  }
1277  }
1278 
1279  for( ; j<jend; ++j )
1280  {
1281  const size_t kbegin( ( IsUpper_v<MT4> )
1282  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1283  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1284 
1285  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
1286  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1287 
1288  SIMDType xmm1;
1289  size_t k( kbegin );
1290 
1291  for( ; k<kpos; k+=SIMDSIZE ) {
1292  xmm1 += A.load(i,k) * B.load(k,j);
1293  }
1294 
1295  C(i,j) = sum( xmm1 );
1296 
1297  for( ; remainder && k<K; ++k ) {
1298  C(i,j) += A(i,k) * B(k,j);
1299  }
1300  }
1301  }
1302  }
1303 
1304  if( SYM || HERM ) {
1305  for( size_t i=2UL; i<M; ++i ) {
1306  const size_t jend( 2UL * ( i/2UL ) );
1307  for( size_t j=0UL; j<jend; ++j ) {
1308  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1309  }
1310  }
1311  }
1312  else if( LOW && !UPP ) {
1313  for( size_t j=2UL; j<N; ++j ) {
1314  const size_t iend( 2UL * ( j/2UL ) );
1315  for( size_t i=0UL; i<iend; ++i ) {
1316  reset( C(i,j) );
1317  }
1318  }
1319  }
1320  else if( !LOW && UPP ) {
1321  for( size_t i=2UL; i<M; ++i ) {
1322  const size_t jend( 2UL * ( i/2UL ) );
1323  for( size_t j=0UL; j<jend; ++j ) {
1324  reset( C(i,j) );
1325  }
1326  }
1327  }
1328  }
1330  //**********************************************************************************************
1331 
1332  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
/*!\brief Vectorized default assignment kernel for small matrices, column-major target
//        (\f$ C=A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// Every computed element of \a C is a SIMD dot product: vectors are loaded from row i of \a A
// and column j of \a B along the inner dimension k, multiplied and accumulated, and the
// accumulator is collapsed with sum(). Rows of \a C are handled in groups of four, then two,
// then one; columns in groups of two, then one. The k range is narrowed according to the
// lower/upper structure of \a A and \a B. Depending on the SYM/HERM/LOW/UPP flags only a part
// of \a C is computed; the remaining part is mirrored or reset at the end of the function.
//
// NOTE(review): the SIMDType accumulators are default-constructed and then only updated via
// '+=', so this presumably relies on SIMDType's default constructor producing zeros - confirm.
*/
1347  template< typename MT3 // Type of the left-hand side target matrix
1348  , typename MT4 // Type of the left-hand side matrix operand
1349  , typename MT5 > // Type of the right-hand side matrix operand
1350  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1351  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1352  {
// A scalar tail loop over k is only required if at least one operand is not padded.
1353  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
1354 
1355  const size_t M( A.rows() );
1356  const size_t N( B.columns() );
1357  const size_t K( A.columns() );
1358 
1359  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1360 
// Lower and upper at the same time: C is cleared and only the diagonal is computed below
// (the blocked loops are disabled, the final row loop visits column i only).
1361  if( LOW && UPP ) {
1362  reset( C );
1363  }
1364 
1365  {
1366  size_t i( 0UL );
1367 
// Four rows of C at a time.
1368  for( ; !( LOW && UPP ) && (i+4UL) <= M; i+=4UL )
1369  {
// For SYM/HERM/LOW only columns up to the end of the current row group are computed;
// for UPP the columns start at the beginning of the current row group.
1370  const size_t jend( SYM || HERM || LOW ? i+4UL : N );
1371  size_t j( UPP ? i : 0UL );
1372 
// 4x2 tile of C, eight accumulators.
1373  for( ; (j+2UL) <= jend; j+=2UL )
1374  {
// Start of the k range, derived from the upper structure of A and the lower structure of B;
// the mask '& size_t(-SIMDSIZE)' aligns the start index down for the SIMD loads.
1375  const size_t kbegin( ( IsUpper_v<MT4> )
1376  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1377  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
// End of the k range, derived from the lower structure of A and the upper structure of B.
1378  const size_t kend( ( IsLower_v<MT4> )
1379  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
1380  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1381 
// kpos is the end of the SIMD loop; with a remainder it is kend rounded down to a multiple
// of SIMDSIZE (checked by the assertion), otherwise the SIMD loop runs up to kend.
1382  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1383  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1384 
1385  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1386  size_t k( kbegin );
1387 
1388  for( ; k<kpos; k+=SIMDSIZE ) {
1389  const SIMDType a1( A.load(i ,k) );
1390  const SIMDType a2( A.load(i+1UL,k) );
1391  const SIMDType a3( A.load(i+2UL,k) );
1392  const SIMDType a4( A.load(i+3UL,k) );
1393  const SIMDType b1( B.load(k,j ) );
1394  const SIMDType b2( B.load(k,j+1UL) );
1395  xmm1 += a1 * b1;
1396  xmm2 += a1 * b2;
1397  xmm3 += a2 * b1;
1398  xmm4 += a2 * b2;
1399  xmm5 += a3 * b1;
1400  xmm6 += a3 * b2;
1401  xmm7 += a4 * b1;
1402  xmm8 += a4 * b2;
1403  }
1404 
// Plain assignment: previous content of C is overwritten.
1405  C(i ,j ) = sum( xmm1 );
1406  C(i ,j+1UL) = sum( xmm2 );
1407  C(i+1UL,j ) = sum( xmm3 );
1408  C(i+1UL,j+1UL) = sum( xmm4 );
1409  C(i+2UL,j ) = sum( xmm5 );
1410  C(i+2UL,j+1UL) = sum( xmm6 );
1411  C(i+3UL,j ) = sum( xmm7 );
1412  C(i+3UL,j+1UL) = sum( xmm8 );
1413 
// Scalar tail for the k values not covered by the SIMD loop.
1414  for( ; remainder && k<kend; ++k ) {
1415  C(i ,j ) += A(i ,k) * B(k,j );
1416  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1417  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1418  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1419  C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1420  C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1421  C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1422  C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
1423  }
1424  }
1425 
// One leftover column for the current group of four rows (4x1 tile).
1426  if( j < jend )
1427  {
1428  const size_t kbegin( ( IsUpper_v<MT4> )
1429  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1430  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1431  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
1432 
1433  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1434  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1435 
1436  SIMDType xmm1, xmm2, xmm3, xmm4;
1437  size_t k( kbegin );
1438 
1439  for( ; k<kpos; k+=SIMDSIZE ) {
1440  const SIMDType b1( B.load(k,j) );
1441  xmm1 += A.load(i ,k) * b1;
1442  xmm2 += A.load(i+1UL,k) * b1;
1443  xmm3 += A.load(i+2UL,k) * b1;
1444  xmm4 += A.load(i+3UL,k) * b1;
1445  }
1446 
1447  C(i ,j) = sum( xmm1 );
1448  C(i+1UL,j) = sum( xmm2 );
1449  C(i+2UL,j) = sum( xmm3 );
1450  C(i+3UL,j) = sum( xmm4 );
1451 
1452  for( ; remainder && k<kend; ++k ) {
1453  C(i ,j) += A(i ,k) * B(k,j);
1454  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1455  C(i+2UL,j) += A(i+2UL,k) * B(k,j);
1456  C(i+3UL,j) += A(i+3UL,k) * B(k,j);
1457  }
1458  }
1459  }
1460 
// Two rows of C at a time; here all N columns are computed regardless of the structure flags.
1461  for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
1462  {
1463  size_t j( 0UL );
1464 
// 2x2 tile of C.
1465  for( ; (j+2UL) <= N; j+=2UL )
1466  {
1467  const size_t kbegin( ( IsUpper_v<MT4> )
1468  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1469  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1470  const size_t kend( ( IsLower_v<MT4> )
1471  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
1472  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1473 
1474  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1475  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1476 
1477  SIMDType xmm1, xmm2, xmm3, xmm4;
1478  size_t k( kbegin );
1479 
1480  for( ; k<kpos; k+=SIMDSIZE ) {
1481  const SIMDType a1( A.load(i ,k) );
1482  const SIMDType a2( A.load(i+1UL,k) );
1483  const SIMDType b1( B.load(k,j ) );
1484  const SIMDType b2( B.load(k,j+1UL) );
1485  xmm1 += a1 * b1;
1486  xmm2 += a1 * b2;
1487  xmm3 += a2 * b1;
1488  xmm4 += a2 * b2;
1489  }
1490 
1491  C(i ,j ) = sum( xmm1 );
1492  C(i ,j+1UL) = sum( xmm2 );
1493  C(i+1UL,j ) = sum( xmm3 );
1494  C(i+1UL,j+1UL) = sum( xmm4 );
1495 
1496  for( ; remainder && k<kend; ++k ) {
1497  C(i ,j ) += A(i ,k) * B(k,j );
1498  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1499  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1500  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1501  }
1502  }
1503 
// One leftover column for the current pair of rows (2x1 tile).
1504  if( j < N )
1505  {
1506  const size_t kbegin( ( IsUpper_v<MT4> )
1507  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1508  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1509  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
1510 
1511  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1512  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1513 
1514  SIMDType xmm1, xmm2;
1515  size_t k( kbegin );
1516 
1517  for( ; k<kpos; k+=SIMDSIZE ) {
1518  const SIMDType b1( B.load(k,j) );
1519  xmm1 += A.load(i ,k) * b1;
1520  xmm2 += A.load(i+1UL,k) * b1;
1521  }
1522 
1523  C(i ,j) = sum( xmm1 );
1524  C(i+1UL,j) = sum( xmm2 );
1525 
1526  for( ; remainder && k<kend; ++k ) {
1527  C(i ,j) += A(i ,k) * B(k,j);
1528  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1529  }
1530  }
1531  }
1532 
// Remaining single rows. In the LOW && UPP case this loop handles all rows (i is still 0)
// and the column range collapses to the diagonal element [i,i+1).
1533  for( ; i<M; ++i )
1534  {
1535  const size_t jend( LOW && UPP ? i+1UL : N );
1536  size_t j( LOW && UPP ? i : 0UL );
1537 
// 1x2 tile of C.
1538  for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
1539  {
1540  const size_t kbegin( ( IsUpper_v<MT4> )
1541  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1542  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1543  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
1544 
1545  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
1546  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1547 
1548  SIMDType xmm1, xmm2;
1549  size_t k( kbegin );
1550 
1551  for( ; k<kpos; k+=SIMDSIZE ) {
1552  const SIMDType a1( A.load(i,k) );
1553  xmm1 += a1 * B.load(k,j );
1554  xmm2 += a1 * B.load(k,j+1UL);
1555  }
1556 
1557  C(i,j ) = sum( xmm1 );
1558  C(i,j+1UL) = sum( xmm2 );
1559 
1560  for( ; remainder && k<kend; ++k ) {
1561  C(i,j ) += A(i,k) * B(k,j );
1562  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1563  }
1564  }
1565 
// Single elements; the k range always extends to K here.
1566  for( ; j<jend; ++j )
1567  {
1568  const size_t kbegin( ( IsUpper_v<MT4> )
1569  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
1570  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
1571 
1572  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
1573  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
1574 
1575  SIMDType xmm1;
1576  size_t k( kbegin );
1577 
1578  for( ; k<kpos; k+=SIMDSIZE ) {
1579  xmm1 += A.load(i,k) * B.load(k,j);
1580  }
1581 
1582  C(i,j) = sum( xmm1 );
1583 
1584  for( ; remainder && k<K; ++k ) {
1585  C(i,j) += A(i,k) * B(k,j);
1586  }
1587  }
1588  }
1589  }
1590 
// Post-processing of the part of C that was skipped by the four-row loop above:
// SYM/HERM copy (or conjugate) the mirrored element, LOW resets the entries above the
// 4-row groups, UPP resets the entries to the left of them.
1591  if( ( SYM || HERM ) && ( N > 4UL ) ) {
1592  for( size_t j=4UL; j<N; ++j ) {
1593  const size_t iend( 4UL * ( j/4UL ) );
1594  for( size_t i=0UL; i<iend; ++i ) {
1595  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1596  }
1597  }
1598  }
1599  else if( LOW && !UPP ) {
1600  for( size_t j=4UL; j<N; ++j ) {
1601  const size_t iend( 4UL * ( j/4UL ) );
1602  for( size_t i=0UL; i<iend; ++i ) {
1603  reset( C(i,j) );
1604  }
1605  }
1606  }
1607  else if( !LOW && UPP ) {
// The row bound is N here; M == N is asserted at the top for all structured cases.
1608  for( size_t i=4UL; i<N; ++i ) {
1609  const size_t jend( 4UL * ( i/4UL ) );
1610  for( size_t j=0UL; j<jend; ++j ) {
1611  reset( C(i,j) );
1612  }
1613  }
1614  }
1615  }
1617  //**********************************************************************************************
1618 
1619  //**Default assignment to dense matrices (large matrices)***************************************
1633  template< typename MT3 // Type of the left-hand side target matrix
1634  , typename MT4 // Type of the left-hand side matrix operand
1635  , typename MT5 > // Type of the right-hand side matrix operand
1636  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1637  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1638  {
1639  selectDefaultAssignKernel( C, A, B );
1640  }
1642  //**********************************************************************************************
1643 
1644  //**Vectorized default assignment to dense matrices (large matrices)****************************
1659  template< typename MT3 // Type of the left-hand side target matrix
1660  , typename MT4 // Type of the left-hand side matrix operand
1661  , typename MT5 > // Type of the right-hand side matrix operand
1662  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1663  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1664  {
1665  if( SYM )
1666  smmm( C, A, B, ElementType(1) );
1667  else if( HERM )
1668  hmmm( C, A, B, ElementType(1) );
1669  else if( LOW )
1670  lmmm( C, A, B, ElementType(1), ElementType(0) );
1671  else if( UPP )
1672  ummm( C, A, B, ElementType(1), ElementType(0) );
1673  else
1674  mmm( C, A, B, ElementType(1), ElementType(0) );
1675  }
1677  //**********************************************************************************************
1678 
1679  //**BLAS-based assignment to dense matrices (default)*******************************************
1693  template< typename MT3 // Type of the left-hand side target matrix
1694  , typename MT4 // Type of the left-hand side matrix operand
1695  , typename MT5 > // Type of the right-hand side matrix operand
1696  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1697  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1698  {
1699  selectLargeAssignKernel( C, A, B );
1700  }
1702  //**********************************************************************************************
1703 
1704  //**BLAS-based assignment to dense matrices*****************************************************
1705 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1706 
1719  template< typename MT3 // Type of the left-hand side target matrix
1720  , typename MT4 // Type of the left-hand side matrix operand
1721  , typename MT5 > // Type of the right-hand side matrix operand
1722  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1723  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1724  {
1725  using ET = ElementType_t<MT3>;
1726 
1727  if( IsTriangular_v<MT4> ) {
1728  assign( C, B );
1729  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
1730  }
1731  else if( IsTriangular_v<MT5> ) {
1732  assign( C, A );
1733  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
1734  }
1735  else {
1736  gemm( C, A, B, ET(1), ET(0) );
1737  }
1738  }
1740 #endif
1741  //**********************************************************************************************
1742 
1743  //**Assignment to sparse matrices***************************************************************
/*!\brief Assignment of the multiplication expression to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// The expression is first evaluated serially into a temporary of type TmpType (OppositeType
// if SO is true, ResultType otherwise); the temporary is then passed through a ForwardFunctor
// and assigned to the sparse target.
//
// NOTE(review): the embedded listing numbers jump from 1759 to 1761 and from 1763 to 1770
// inside this body, i.e. some lines of the original function are not visible in this view.
// Do not rewrite this function based on this listing alone.
*/
1756  template< typename MT // Type of the target sparse matrix
1757  , bool SO > // Storage order of the target sparse matrix
1758  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
1759  {
1761 
// Type of the intermediate result, selected by the storage order flag of the target.
1762  using TmpType = If_t< SO, OppositeType, ResultType >;
1763 
1770 
1771  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1772  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1773 
1774  const ForwardFunctor fwd;
1775 
// Evaluate the whole expression into the temporary (wrapped in serial()), then assign it.
1776  const TmpType tmp( serial( rhs ) );
1777  assign( ~lhs, fwd( tmp ) );
1778  }
1780  //**********************************************************************************************
1781 
1782  //**Addition assignment to dense matrices*******************************************************
/*!\brief Addition assignment of the multiplication expression to a dense matrix
//        (\f$ C+=A*B \f$).
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be added.
//
// Returns without touching \a lhs if the target has no rows or no columns, or if the inner
// dimension (the number of columns of the left operand) is zero. Otherwise both operands are
// evaluated (wrapped in serial()) into objects of type LT and RT, and the actual computation
// is delegated to selectAddAssignKernel().
//
// NOTE(review): the embedded listing numbers jump from 1798 to 1800 inside this body, i.e. one
// line of the original function is not visible in this view.
*/
1795  template< typename MT // Type of the target dense matrix
1796  , bool SO > // Storage order of the target dense matrix
1797  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
1798  {
1800 
1801  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1802  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1803 
// Nothing to add for an empty target or an empty inner dimension.
1804  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1805  return;
1806  }
1807 
1808  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1809  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1810 
// The evaluated operands must have the dimensions of the original operands and must match
// the dimensions of the target.
1811  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1812  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1813  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1814  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1815  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1816  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1817 
1818  DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1819  }
1821  //**********************************************************************************************
1822 
1823  //**Addition assignment to dense matrices (kernel selection)************************************
1834  template< typename MT3 // Type of the left-hand side target matrix
1835  , typename MT4 // Type of the left-hand side matrix operand
1836  , typename MT5 > // Type of the right-hand side matrix operand
1837  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1838  {
1839  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
1840  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
1841  selectSmallAddAssignKernel( C, A, B );
1842  else
1843  selectBlasAddAssignKernel( C, A, B );
1844  }
1846  //**********************************************************************************************
1847 
1848  //**Default addition assignment to row-major dense matrices (general/general)*******************
1862  template< typename MT3 // Type of the left-hand side target matrix
1863  , typename MT4 // Type of the left-hand side matrix operand
1864  , typename MT5 > // Type of the right-hand side matrix operand
1865  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1866  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1867  {
1868  const size_t M( A.rows() );
1869  const size_t N( B.columns() );
1870  const size_t K( A.columns() );
1871 
1872  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1873 
1874  const size_t ibegin( ( IsStrictlyLower_v<MT4> )
1875  ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
1876  :( 0UL ) );
1877  const size_t iend( ( IsStrictlyUpper_v<MT4> )
1878  ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
1879  :( M ) );
1880  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1881 
1882  for( size_t i=ibegin; i<iend; ++i )
1883  {
1884  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
1885  ?( ( IsStrictlyUpper_v<MT4> )
1886  ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
1887  :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
1888  :( ( IsStrictlyUpper_v<MT5> )
1889  ?( UPP ? max( i, 1UL ) : 1UL )
1890  :( UPP ? i : 0UL ) ) );
1891  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
1892  ?( ( IsStrictlyLower_v<MT4> )
1893  ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
1894  :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
1895  :( ( IsStrictlyLower_v<MT5> )
1896  ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
1897  :( LOW ? i+1UL : N ) ) );
1898 
1899  if( ( LOW || UPP ) && ( jbegin > jend ) ) continue;
1900  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1901 
1902  for( size_t j=jbegin; j<jend; ++j )
1903  {
1904  const size_t kbegin( ( IsUpper_v<MT4> )
1905  ?( ( IsLower_v<MT5> )
1906  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1907  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1908  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1909  :( ( IsLower_v<MT5> )
1910  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
1911  :( 0UL ) ) );
1912  const size_t kend( ( IsLower_v<MT4> )
1913  ?( ( IsUpper_v<MT5> )
1914  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
1915  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
1916  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
1917  :( ( IsUpper_v<MT5> )
1918  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
1919  :( K ) ) );
1920  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
1921 
1922  const size_t knum( kend - kbegin );
1923  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
1924 
1925  for( size_t k=kbegin; k<kpos; k+=2UL ) {
1926  C(i,j) += A(i,k ) * B(k ,j);
1927  C(i,j) += A(i,k+1UL) * B(k+1UL,j);
1928  }
1929  if( kpos < kend ) {
1930  C(i,j) += A(i,kpos) * B(kpos,j);
1931  }
1932  }
1933  }
1934  }
1936  //**********************************************************************************************
1937 
1938  //**Default addition assignment to column-major dense matrices (general/general)****************
1952  template< typename MT3 // Type of the left-hand side target matrix
1953  , typename MT4 // Type of the left-hand side matrix operand
1954  , typename MT5 > // Type of the right-hand side matrix operand
1955  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1956  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1957  {
1958  const size_t M( A.rows() );
1959  const size_t N( B.columns() );
1960  const size_t K( A.columns() );
1961 
1962  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1963 
1964  const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
1965  ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
1966  :( 0UL ) );
1967  const size_t jend( ( IsStrictlyLower_v<MT5> )
1968  ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
1969  :( N ) );
1970  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1971 
1972  for( size_t j=jbegin; j<jend; ++j )
1973  {
1974  const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
1975  ?( ( IsStrictlyLower_v<MT4> )
1976  ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
1977  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1978  :( ( IsStrictlyLower_v<MT4> )
1979  ?( LOW ? max( j, 1UL ) : 1UL )
1980  :( LOW ? j : 0UL ) ) );
1981  const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
1982  ?( ( IsStrictlyUpper_v<MT4> )
1983  ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
1984  :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
1985  :( ( IsStrictlyUpper_v<MT4> )
1986  ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
1987  :( UPP ? j+1UL : M ) ) );
1988 
1989  if( ( LOW || UPP ) && ( ibegin > iend ) ) continue;
1990  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1991 
1992  for( size_t i=ibegin; i<iend; ++i )
1993  {
1994  const size_t kbegin( ( IsUpper_v<MT4> )
1995  ?( ( IsLower_v<MT5> )
1996  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1997  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1998  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1999  :( ( IsLower_v<MT5> )
2000  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2001  :( 0UL ) ) );
2002  const size_t kend( ( IsLower_v<MT4> )
2003  ?( ( IsUpper_v<MT5> )
2004  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
2005  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2006  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2007  :( ( IsUpper_v<MT5> )
2008  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2009  :( K ) ) );
2010  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
2011 
2012  const size_t knum( kend - kbegin );
2013  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
2014 
2015  for( size_t k=kbegin; k<kpos; k+=2UL ) {
2016  C(i,j) += A(i,k ) * B(k ,j);
2017  C(i,j) += A(i,k+1UL) * B(k+1UL,j);
2018  }
2019  if( kpos < kend ) {
2020  C(i,j) += A(i,kpos) * B(kpos,j);
2021  }
2022  }
2023  }
2024  }
2026  //**********************************************************************************************
2027 
2028  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
2042  template< typename MT3 // Type of the left-hand side target matrix
2043  , typename MT4 // Type of the left-hand side matrix operand
2044  , typename MT5 > // Type of the right-hand side matrix operand
2045  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2046  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2047  {
2048  const size_t M( A.rows() );
2049  const size_t N( B.columns() );
2050 
2051  for( size_t i=0UL; i<M; ++i )
2052  {
2053  const size_t jbegin( ( IsUpper_v<MT4> )
2054  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2055  :( 0UL ) );
2056  const size_t jend( ( IsLower_v<MT4> )
2057  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2058  :( N ) );
2059  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2060 
2061  const size_t jnum( jend - jbegin );
2062  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2063 
2064  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2065  C(i,j ) += A(i,j ) * B(j ,j );
2066  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
2067  }
2068  if( jpos < jend ) {
2069  C(i,jpos) += A(i,jpos) * B(jpos,jpos);
2070  }
2071  }
2072  }
2074  //**********************************************************************************************
2075 
2076  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
2090  template< typename MT3 // Type of the left-hand side target matrix
2091  , typename MT4 // Type of the left-hand side matrix operand
2092  , typename MT5 > // Type of the right-hand side matrix operand
2093  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2094  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2095  {
2096  constexpr size_t block( BLOCK_SIZE );
2097 
2098  const size_t M( A.rows() );
2099  const size_t N( B.columns() );
2100 
2101  for( size_t jj=0UL; jj<N; jj+=block ) {
2102  const size_t jend( min( N, jj+block ) );
2103  for( size_t ii=0UL; ii<M; ii+=block ) {
2104  const size_t iend( min( M, ii+block ) );
2105  for( size_t j=jj; j<jend; ++j )
2106  {
2107  const size_t ibegin( ( IsLower_v<MT4> )
2108  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
2109  :( ii ) );
2110  const size_t ipos( ( IsUpper_v<MT4> )
2111  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
2112  :( iend ) );
2113 
2114  for( size_t i=ibegin; i<ipos; ++i ) {
2115  C(i,j) += A(i,j) * B(j,j);
2116  }
2117  }
2118  }
2119  }
2120  }
2122  //**********************************************************************************************
2123 
2124  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
2138  template< typename MT3 // Type of the left-hand side target matrix
2139  , typename MT4 // Type of the left-hand side matrix operand
2140  , typename MT5 > // Type of the right-hand side matrix operand
2141  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2142  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2143  {
2144  constexpr size_t block( BLOCK_SIZE );
2145 
2146  const size_t M( A.rows() );
2147  const size_t N( B.columns() );
2148 
2149  for( size_t ii=0UL; ii<M; ii+=block ) {
2150  const size_t iend( min( M, ii+block ) );
2151  for( size_t jj=0UL; jj<N; jj+=block ) {
2152  const size_t jend( min( N, jj+block ) );
2153  for( size_t i=ii; i<iend; ++i )
2154  {
2155  const size_t jbegin( ( IsUpper_v<MT5> )
2156  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
2157  :( jj ) );
2158  const size_t jpos( ( IsLower_v<MT5> )
2159  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
2160  :( jend ) );
2161 
2162  for( size_t j=jbegin; j<jpos; ++j ) {
2163  C(i,j) += A(i,i) * B(i,j);
2164  }
2165  }
2166  }
2167  }
2168  }
2170  //**********************************************************************************************
2171 
2172  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
2186  template< typename MT3 // Type of the left-hand side target matrix
2187  , typename MT4 // Type of the left-hand side matrix operand
2188  , typename MT5 > // Type of the right-hand side matrix operand
2189  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2190  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2191  {
2192  const size_t M( A.rows() );
2193  const size_t N( B.columns() );
2194 
2195  for( size_t j=0UL; j<N; ++j )
2196  {
2197  const size_t ibegin( ( IsLower_v<MT5> )
2198  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2199  :( 0UL ) );
2200  const size_t iend( ( IsUpper_v<MT5> )
2201  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2202  :( M ) );
2203  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2204 
2205  const size_t inum( iend - ibegin );
2206  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2207 
2208  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2209  C(i ,j) += A(i ,i ) * B(i ,j);
2210  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
2211  }
2212  if( ipos < iend ) {
2213  C(ipos,j) += A(ipos,ipos) * B(ipos,j);
2214  }
2215  }
2216  }
2218  //**********************************************************************************************
2219 
2220  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2234  template< typename MT3 // Type of the left-hand side target matrix
2235  , typename MT4 // Type of the left-hand side matrix operand
2236  , typename MT5 > // Type of the right-hand side matrix operand
2237  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2238  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2239  {
2240  for( size_t i=0UL; i<A.rows(); ++i ) {
2241  C(i,i) += A(i,i) * B(i,i);
2242  }
2243  }
2245  //**********************************************************************************************
2246 
2247  //**Default addition assignment to dense matrices (small matrices)******************************
2261  template< typename MT3 // Type of the left-hand side target matrix
2262  , typename MT4 // Type of the left-hand side matrix operand
2263  , typename MT5 > // Type of the right-hand side matrix operand
2264  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2265  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2266  {
2267  selectDefaultAddAssignKernel( C, A, B );
2268  }
2270  //**********************************************************************************************
2271 
2272  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
2287  template< typename MT3 // Type of the left-hand side target matrix
2288  , typename MT4 // Type of the left-hand side matrix operand
2289  , typename MT5 > // Type of the right-hand side matrix operand
2290  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2291  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2292  {
2293  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
2294 
2295  const size_t M( A.rows() );
2296  const size_t N( B.columns() );
2297  const size_t K( A.columns() );
2298 
2299  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2300 
2301  size_t i( 0UL );
2302 
2303  for( ; (i+2UL) <= M; i+=2UL )
2304  {
2305  const size_t jend( LOW ? i+2UL : N );
2306  size_t j( UPP ? i : 0UL );
2307 
2308  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
2309  {
2310  const size_t kbegin( ( IsUpper_v<MT4> )
2311  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2312  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2313  const size_t kend( ( IsLower_v<MT4> )
2314  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
2315  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
2316 
2317  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2318  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2319 
2320  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2321  size_t k( kbegin );
2322 
2323  for( ; k<kpos; k+=SIMDSIZE ) {
2324  const SIMDType a1( A.load(i ,k) );
2325  const SIMDType a2( A.load(i+1UL,k) );
2326  const SIMDType b1( B.load(k,j ) );
2327  const SIMDType b2( B.load(k,j+1UL) );
2328  const SIMDType b3( B.load(k,j+2UL) );
2329  const SIMDType b4( B.load(k,j+3UL) );
2330  xmm1 += a1 * b1;
2331  xmm2 += a1 * b2;
2332  xmm3 += a1 * b3;
2333  xmm4 += a1 * b4;
2334  xmm5 += a2 * b1;
2335  xmm6 += a2 * b2;
2336  xmm7 += a2 * b3;
2337  xmm8 += a2 * b4;
2338  }
2339 
2340  C(i ,j ) += sum( xmm1 );
2341  C(i ,j+1UL) += sum( xmm2 );
2342  C(i ,j+2UL) += sum( xmm3 );
2343  C(i ,j+3UL) += sum( xmm4 );
2344  C(i+1UL,j ) += sum( xmm5 );
2345  C(i+1UL,j+1UL) += sum( xmm6 );
2346  C(i+1UL,j+2UL) += sum( xmm7 );
2347  C(i+1UL,j+3UL) += sum( xmm8 );
2348 
2349  for( ; remainder && k<kend; ++k ) {
2350  C(i ,j ) += A(i ,k) * B(k,j );
2351  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2352  C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2353  C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
2354  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2355  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2356  C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2357  C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
2358  }
2359  }
2360 
2361  for( ; (j+2UL) <= jend; j+=2UL )
2362  {
2363  const size_t kbegin( ( IsUpper_v<MT4> )
2364  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2365  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2366  const size_t kend( ( IsLower_v<MT4> )
2367  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
2368  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2369 
2370  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2371  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2372 
2373  SIMDType xmm1, xmm2, xmm3, xmm4;
2374  size_t k( kbegin );
2375 
2376  for( ; k<kpos; k+=SIMDSIZE ) {
2377  const SIMDType a1( A.load(i ,k) );
2378  const SIMDType a2( A.load(i+1UL,k) );
2379  const SIMDType b1( B.load(k,j ) );
2380  const SIMDType b2( B.load(k,j+1UL) );
2381  xmm1 += a1 * b1;
2382  xmm2 += a1 * b2;
2383  xmm3 += a2 * b1;
2384  xmm4 += a2 * b2;
2385  }
2386 
2387  C(i ,j ) += sum( xmm1 );
2388  C(i ,j+1UL) += sum( xmm2 );
2389  C(i+1UL,j ) += sum( xmm3 );
2390  C(i+1UL,j+1UL) += sum( xmm4 );
2391 
2392  for( ; remainder && k<kend; ++k ) {
2393  C(i ,j ) += A(i ,k) * B(k,j );
2394  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2395  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2396  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2397  }
2398  }
2399 
2400  if( j < jend )
2401  {
2402  const size_t kbegin( ( IsUpper_v<MT4> )
2403  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2404  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2405  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
2406 
2407  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2408  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2409 
2410  SIMDType xmm1, xmm2;
2411  size_t k( kbegin );
2412 
2413  for( ; k<kpos; k+=SIMDSIZE ) {
2414  const SIMDType b1( B.load(k,j) );
2415  xmm1 += A.load(i ,k) * b1;
2416  xmm2 += A.load(i+1UL,k) * b1;
2417  }
2418 
2419  C(i ,j) += sum( xmm1 );
2420  C(i+1UL,j) += sum( xmm2 );
2421 
2422  for( ; remainder && k<kend; ++k ) {
2423  C(i ,j) += A(i ,k) * B(k,j);
2424  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2425  }
2426  }
2427  }
2428 
2429  if( i < M )
2430  {
2431  const size_t jend( LOW ? i+1UL : N );
2432  size_t j( UPP ? i : 0UL );
2433 
2434  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
2435  {
2436  const size_t kbegin( ( IsUpper_v<MT4> )
2437  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2438  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2439  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
2440 
2441  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2442  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2443 
2444  SIMDType xmm1, xmm2, xmm3, xmm4;
2445  size_t k( kbegin );
2446 
2447  for( ; k<kpos; k+=SIMDSIZE ) {
2448  const SIMDType a1( A.load(i,k) );
2449  xmm1 += a1 * B.load(k,j );
2450  xmm2 += a1 * B.load(k,j+1UL);
2451  xmm3 += a1 * B.load(k,j+2UL);
2452  xmm4 += a1 * B.load(k,j+3UL);
2453  }
2454 
2455  C(i,j ) += sum( xmm1 );
2456  C(i,j+1UL) += sum( xmm2 );
2457  C(i,j+2UL) += sum( xmm3 );
2458  C(i,j+3UL) += sum( xmm4 );
2459 
2460  for( ; remainder && k<kend; ++k ) {
2461  C(i,j ) += A(i,k) * B(k,j );
2462  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2463  C(i,j+2UL) += A(i,k) * B(k,j+2UL);
2464  C(i,j+3UL) += A(i,k) * B(k,j+3UL);
2465  }
2466  }
2467 
2468  for( ; (j+2UL) <= jend; j+=2UL )
2469  {
2470  const size_t kbegin( ( IsUpper_v<MT4> )
2471  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2472  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2473  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
2474 
2475  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2476  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2477 
2478  SIMDType xmm1, xmm2;
2479  size_t k( kbegin );
2480 
2481  for( ; k<kpos; k+=SIMDSIZE ) {
2482  const SIMDType a1( A.load(i,k) );
2483  xmm1 += a1 * B.load(k,j );
2484  xmm2 += a1 * B.load(k,j+1UL);
2485  }
2486 
2487  C(i,j ) += sum( xmm1 );
2488  C(i,j+1UL) += sum( xmm2 );
2489 
2490  for( ; remainder && k<kend; ++k ) {
2491  C(i,j ) += A(i,k) * B(k,j );
2492  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2493  }
2494  }
2495 
2496  if( j < jend )
2497  {
2498  const size_t kbegin( ( IsUpper_v<MT4> )
2499  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2500  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2501 
2502  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
2503  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2504 
2505  SIMDType xmm1;
2506  size_t k( kbegin );
2507 
2508  for( ; k<kpos; k+=SIMDSIZE ) {
2509  xmm1 += A.load(i,k) * B.load(k,j);
2510  }
2511 
2512  C(i,j) += sum( xmm1 );
2513 
2514  for( ; remainder && k<K; ++k ) {
2515  C(i,j) += A(i,k) * B(k,j);
2516  }
2517  }
2518  }
2519  }
2521  //**********************************************************************************************
2522 
2523  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2538  template< typename MT3 // Type of the left-hand side target matrix
2539  , typename MT4 // Type of the left-hand side matrix operand
2540  , typename MT5 > // Type of the right-hand side matrix operand
2541  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2542  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2543  {
2544  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
2545 
2546  const size_t M( A.rows() );
2547  const size_t N( B.columns() );
2548  const size_t K( A.columns() );
2549 
2550  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2551 
2552  size_t i( 0UL );
2553 
2554  for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
2555  {
2556  size_t j( 0UL );
2557 
2558  for( ; (j+2UL) <= N; j+=2UL )
2559  {
2560  const size_t kbegin( ( IsUpper_v<MT4> )
2561  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2562  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2563  const size_t kend( ( IsLower_v<MT4> )
2564  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
2565  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2566 
2567  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2568  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2569 
2570  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2571  size_t k( kbegin );
2572 
2573  for( ; k<kpos; k+=SIMDSIZE ) {
2574  const SIMDType a1( A.load(i ,k) );
2575  const SIMDType a2( A.load(i+1UL,k) );
2576  const SIMDType a3( A.load(i+2UL,k) );
2577  const SIMDType a4( A.load(i+3UL,k) );
2578  const SIMDType b1( B.load(k,j ) );
2579  const SIMDType b2( B.load(k,j+1UL) );
2580  xmm1 += a1 * b1;
2581  xmm2 += a1 * b2;
2582  xmm3 += a2 * b1;
2583  xmm4 += a2 * b2;
2584  xmm5 += a3 * b1;
2585  xmm6 += a3 * b2;
2586  xmm7 += a4 * b1;
2587  xmm8 += a4 * b2;
2588  }
2589 
2590  C(i ,j ) += sum( xmm1 );
2591  C(i ,j+1UL) += sum( xmm2 );
2592  C(i+1UL,j ) += sum( xmm3 );
2593  C(i+1UL,j+1UL) += sum( xmm4 );
2594  C(i+2UL,j ) += sum( xmm5 );
2595  C(i+2UL,j+1UL) += sum( xmm6 );
2596  C(i+3UL,j ) += sum( xmm7 );
2597  C(i+3UL,j+1UL) += sum( xmm8 );
2598 
2599  for( ; remainder && k<kend; ++k ) {
2600  C(i ,j ) += A(i ,k) * B(k,j );
2601  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2602  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2603  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2604  C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2605  C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2606  C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
2607  C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
2608  }
2609  }
2610 
2611  if( j < N )
2612  {
2613  const size_t kbegin( ( IsUpper_v<MT4> )
2614  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2615  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2616  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
2617 
2618  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2619  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2620 
2621  SIMDType xmm1, xmm2, xmm3, xmm4;
2622  size_t k( kbegin );
2623 
2624  for( ; k<kpos; k+=SIMDSIZE ) {
2625  const SIMDType b1( B.load(k,j) );
2626  xmm1 += A.load(i ,k) * b1;
2627  xmm2 += A.load(i+1UL,k) * b1;
2628  xmm3 += A.load(i+2UL,k) * b1;
2629  xmm4 += A.load(i+3UL,k) * b1;
2630  }
2631 
2632  C(i ,j) += sum( xmm1 );
2633  C(i+1UL,j) += sum( xmm2 );
2634  C(i+2UL,j) += sum( xmm3 );
2635  C(i+3UL,j) += sum( xmm4 );
2636 
2637  for( ; remainder && k<kend; ++k ) {
2638  C(i ,j) += A(i ,k) * B(k,j);
2639  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2640  C(i+2UL,j) += A(i+2UL,k) * B(k,j);
2641  C(i+3UL,j) += A(i+3UL,k) * B(k,j);
2642  }
2643  }
2644  }
2645 
2646  for( ; (i+2UL) <= M; i+=2UL )
2647  {
2648  const size_t jend( LOW ? i+2UL : N );
2649  size_t j( UPP ? i : 0UL );
2650 
2651  for( ; (j+2UL) <= jend; j+=2UL )
2652  {
2653  const size_t kbegin( ( IsUpper_v<MT4> )
2654  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2655  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2656  const size_t kend( ( IsLower_v<MT4> )
2657  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
2658  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2659 
2660  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2661  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2662 
2663  SIMDType xmm1, xmm2, xmm3, xmm4;
2664  size_t k( kbegin );
2665 
2666  for( ; k<kpos; k+=SIMDSIZE ) {
2667  const SIMDType a1( A.load(i ,k) );
2668  const SIMDType a2( A.load(i+1UL,k) );
2669  const SIMDType b1( B.load(k,j ) );
2670  const SIMDType b2( B.load(k,j+1UL) );
2671  xmm1 += a1 * b1;
2672  xmm2 += a1 * b2;
2673  xmm3 += a2 * b1;
2674  xmm4 += a2 * b2;
2675  }
2676 
2677  C(i ,j ) += sum( xmm1 );
2678  C(i ,j+1UL) += sum( xmm2 );
2679  C(i+1UL,j ) += sum( xmm3 );
2680  C(i+1UL,j+1UL) += sum( xmm4 );
2681 
2682  for( ; remainder && k<kend; ++k ) {
2683  C(i ,j ) += A(i ,k) * B(k,j );
2684  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2685  C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2686  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2687  }
2688  }
2689 
2690  if( j < jend )
2691  {
2692  const size_t kbegin( ( IsUpper_v<MT4> )
2693  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2694  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2695  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
2696 
2697  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2698  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2699 
2700  SIMDType xmm1, xmm2;
2701  size_t k( kbegin );
2702 
2703  for( ; k<kpos; k+=SIMDSIZE ) {
2704  const SIMDType b1( B.load(k,j) );
2705  xmm1 += A.load(i ,k) * b1;
2706  xmm2 += A.load(i+1UL,k) * b1;
2707  }
2708 
2709  C(i ,j) += sum( xmm1 );
2710  C(i+1UL,j) += sum( xmm2 );
2711 
2712  for( ; remainder && k<kend; ++k ) {
2713  C(i ,j) += A(i ,k) * B(k,j);
2714  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2715  }
2716  }
2717  }
2718 
2719  if( i < M )
2720  {
2721  const size_t jend( LOW ? i+1UL : N );
2722  size_t j( UPP ? i : 0UL );
2723 
2724  for( ; (j+2UL) <= jend; j+=2UL )
2725  {
2726  const size_t kbegin( ( IsUpper_v<MT4> )
2727  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2728  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2729  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
2730 
2731  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
2732  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2733 
2734  SIMDType xmm1, xmm2;
2735  size_t k( kbegin );
2736 
2737  for( ; k<kpos; k+=SIMDSIZE ) {
2738  const SIMDType a1( A.load(i,k) );
2739  xmm1 += a1 * B.load(k,j );
2740  xmm2 += a1 * B.load(k,j+1UL);
2741  }
2742 
2743  C(i,j ) += sum( xmm1 );
2744  C(i,j+1UL) += sum( xmm2 );
2745 
2746  for( ; remainder && k<kend; ++k ) {
2747  C(i,j ) += A(i,k) * B(k,j );
2748  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2749  }
2750  }
2751 
2752  if( j < jend )
2753  {
2754  const size_t kbegin( ( IsUpper_v<MT4> )
2755  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
2756  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
2757 
2758  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
2759  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
2760 
2761  SIMDType xmm1;
2762  size_t k( kbegin );
2763 
2764  for( ; k<kpos; k+=SIMDSIZE ) {
2765  xmm1 += A.load(i,k) * B.load(k,j);
2766  }
2767 
2768  C(i,j) += sum( xmm1 );
2769 
2770  for( ; remainder && k<K; ++k ) {
2771  C(i,j) += A(i,k) * B(k,j);
2772  }
2773  }
2774  }
2775  }
2777  //**********************************************************************************************
2778 
2779  //**Default addition assignment to dense matrices (large matrices)******************************
2793  template< typename MT3 // Type of the left-hand side target matrix
2794  , typename MT4 // Type of the left-hand side matrix operand
2795  , typename MT5 > // Type of the right-hand side matrix operand
2796  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2797  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2798  {
2799  selectDefaultAddAssignKernel( C, A, B );
2800  }
2802  //**********************************************************************************************
2803 
2804  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
2819  template< typename MT3 // Type of the left-hand side target matrix
2820  , typename MT4 // Type of the left-hand side matrix operand
2821  , typename MT5 > // Type of the right-hand side matrix operand
2822  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2823  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2824  {
2825  if( LOW )
2826  lmmm( C, A, B, ElementType(1), ElementType(1) );
2827  else if( UPP )
2828  ummm( C, A, B, ElementType(1), ElementType(1) );
2829  else
2830  mmm( C, A, B, ElementType(1), ElementType(1) );
2831  }
2833  //**********************************************************************************************
2834 
2835  //**BLAS-based addition assignment to dense matrices (default)**********************************
2849  template< typename MT3 // Type of the left-hand side target matrix
2850  , typename MT4 // Type of the left-hand side matrix operand
2851  , typename MT5 > // Type of the right-hand side matrix operand
2852  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2853  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2854  {
2855  selectLargeAddAssignKernel( C, A, B );
2856  }
2858  //**********************************************************************************************
2859 
2860  //**BLAS-based addition assignment to dense matrices********************************************
2861 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2862 
2875  template< typename MT3 // Type of the left-hand side target matrix
2876  , typename MT4 // Type of the left-hand side matrix operand
2877  , typename MT5 > // Type of the right-hand side matrix operand
2878  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2879  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2880  {
2881  using ET = ElementType_t<MT3>;
2882 
2883  if( IsTriangular_v<MT4> ) {
2884  ResultType_t<MT3> tmp( serial( B ) );
2885  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2886  addAssign( C, tmp );
2887  }
2888  else if( IsTriangular_v<MT5> ) {
2889  ResultType_t<MT3> tmp( serial( A ) );
2890  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2891  addAssign( C, tmp );
2892  }
2893  else {
2894  gemm( C, A, B, ET(1), ET(1) );
2895  }
2896  }
2898 #endif
2899  //**********************************************************************************************
2900 
2901  //**Addition assignment to sparse matrices******************************************************
2902  // No special implementation for the addition assignment to sparse matrices.
2903  //**********************************************************************************************
2904 
2905  //**Subtraction assignment to dense matrices****************************************************
2918  template< typename MT // Type of the target dense matrix
2919  , bool SO > // Storage order of the target dense matrix
2920  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
2921  {
2923 
2924  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2925  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2926 
2927  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2928  return;
2929  }
2930 
2931  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2932  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2933 
2934  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2935  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2936  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2937  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2938  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2939  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2940 
2941  DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
2942  }
2944  //**********************************************************************************************
2945 
2946  //**Subtraction assignment to dense matrices (kernel selection)*********************************
2957  template< typename MT3 // Type of the left-hand side target matrix
2958  , typename MT4 // Type of the left-hand side matrix operand
2959  , typename MT5 > // Type of the right-hand side matrix operand
2960  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2961  {
2962  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
2963  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
2964  selectSmallSubAssignKernel( C, A, B );
2965  else
2966  selectBlasSubAssignKernel( C, A, B );
2967  }
2969  //**********************************************************************************************
2970 
2971  //**Default subtraction assignment to row-major dense matrices (general/general)****************
2985  template< typename MT3 // Type of the left-hand side target matrix
2986  , typename MT4 // Type of the left-hand side matrix operand
2987  , typename MT5 > // Type of the right-hand side matrix operand
2988  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
2989  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2990  {
2991  const size_t M( A.rows() );
2992  const size_t N( B.columns() );
2993  const size_t K( A.columns() );
2994 
2995  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2996 
2997  const size_t ibegin( ( IsStrictlyLower_v<MT4> )
2998  ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
2999  :( 0UL ) );
3000  const size_t iend( ( IsStrictlyUpper_v<MT4> )
3001  ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
3002  :( M ) );
3003  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3004 
3005  for( size_t i=ibegin; i<iend; ++i )
3006  {
3007  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
3008  ?( ( IsStrictlyUpper_v<MT4> )
3009  ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
3010  :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
3011  :( ( IsStrictlyUpper_v<MT5> )
3012  ?( UPP ? max( i, 1UL ) : 1UL )
3013  :( UPP ? i : 0UL ) ) );
3014  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
3015  ?( ( IsStrictlyLower_v<MT4> )
3016  ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
3017  :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
3018  :( ( IsStrictlyLower_v<MT5> )
3019  ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
3020  :( LOW ? i+1UL : N ) ) );
3021 
3022  if( ( LOW || UPP ) && ( jbegin > jend ) ) continue;
3023  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3024 
3025  for( size_t j=jbegin; j<jend; ++j )
3026  {
3027  const size_t kbegin( ( IsUpper_v<MT4> )
3028  ?( ( IsLower_v<MT5> )
3029  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3030  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3031  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3032  :( ( IsLower_v<MT5> )
3033  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3034  :( 0UL ) ) );
3035  const size_t kend( ( IsLower_v<MT4> )
3036  ?( ( IsUpper_v<MT5> )
3037  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
3038  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3039  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3040  :( ( IsUpper_v<MT5> )
3041  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3042  :( K ) ) );
3043  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
3044 
3045  const size_t knum( kend - kbegin );
3046  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
3047 
3048  for( size_t k=kbegin; k<kpos; k+=2UL ) {
3049  C(i,j) -= A(i,k ) * B(k ,j);
3050  C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
3051  }
3052  if( kpos < kend ) {
3053  C(i,j) -= A(i,kpos) * B(kpos,j);
3054  }
3055  }
3056  }
3057  }
3059  //**********************************************************************************************
3060 
3061  //**Default subtraction assignment to column-major dense matrices (general/general)*************
3075  template< typename MT3 // Type of the left-hand side target matrix
3076  , typename MT4 // Type of the left-hand side matrix operand
3077  , typename MT5 > // Type of the right-hand side matrix operand
3078  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3079  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3080  {
3081  const size_t M( A.rows() );
3082  const size_t N( B.columns() );
3083  const size_t K( A.columns() );
3084 
3085  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3086 
3087  const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
3088  ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
3089  :( 0UL ) );
3090  const size_t jend( ( IsStrictlyLower_v<MT5> )
3091  ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
3092  :( N ) );
3093  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3094 
3095  for( size_t j=jbegin; j<jend; ++j )
3096  {
3097  const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
3098  ?( ( IsStrictlyLower_v<MT4> )
3099  ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
3100  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3101  :( ( IsStrictlyLower_v<MT4> )
3102  ?( LOW ? max( j, 1UL ) : 1UL )
3103  :( LOW ? j : 0UL ) ) );
3104  const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
3105  ?( ( IsStrictlyUpper_v<MT4> )
3106  ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
3107  :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
3108  :( ( IsStrictlyUpper_v<MT4> )
3109  ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
3110  :( UPP ? j+1UL : M ) ) );
3111 
3112  if( ( LOW || UPP ) && ( ibegin > iend ) ) continue;
3113  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3114 
3115  for( size_t i=ibegin; i<iend; ++i )
3116  {
3117  const size_t kbegin( ( IsUpper_v<MT4> )
3118  ?( ( IsLower_v<MT5> )
3119  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3120  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3121  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3122  :( ( IsLower_v<MT5> )
3123  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3124  :( 0UL ) ) );
3125  const size_t kend( ( IsLower_v<MT4> )
3126  ?( ( IsUpper_v<MT5> )
3127  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
3128  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3129  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3130  :( ( IsUpper_v<MT5> )
3131  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3132  :( K ) ) );
3133  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
3134 
3135  const size_t knum( kend - kbegin );
3136  const size_t kpos( kbegin + ( knum & size_t(-2) ) );
3137 
3138  for( size_t k=kbegin; k<kpos; k+=2UL ) {
3139  C(i,j) -= A(i,k ) * B(k ,j);
3140  C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
3141  }
3142  if( kpos < kend ) {
3143  C(i,j) -= A(i,kpos) * B(kpos,j);
3144  }
3145  }
3146  }
3147  }
3149  //**********************************************************************************************
3150 
3151  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
3165  template< typename MT3 // Type of the left-hand side target matrix
3166  , typename MT4 // Type of the left-hand side matrix operand
3167  , typename MT5 > // Type of the right-hand side matrix operand
3168  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3169  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3170  {
3171  const size_t M( A.rows() );
3172  const size_t N( B.columns() );
3173 
3174  for( size_t i=0UL; i<M; ++i )
3175  {
3176  const size_t jbegin( ( IsUpper_v<MT4> )
3177  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3178  :( 0UL ) );
3179  const size_t jend( ( IsLower_v<MT4> )
3180  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3181  :( N ) );
3182  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3183 
3184  const size_t jnum( jend - jbegin );
3185  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3186 
3187  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3188  C(i,j ) -= A(i,j ) * B(j ,j );
3189  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
3190  }
3191  if( jpos < jend ) {
3192  C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
3193  }
3194  }
3195  }
3197  //**********************************************************************************************
3198 
3199  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
// Default kernel for C -= A * B with a column-major target, a non-diagonal A and a diagonal B.
// Performs C(i,j) -= A(i,j) * B(j,j), traversing the matrices in tiles of
// BLOCK_SIZE x BLOCK_SIZE elements.
//   C: target matrix, updated in place
//   A: left-hand side operand
//   B: right-hand side operand (only B(j,j) is accessed)
3213  template< typename MT3 // Type of the left-hand side target matrix
3214  , typename MT4 // Type of the left-hand side matrix operand
3215  , typename MT5 > // Type of the right-hand side matrix operand
3216  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3217  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3218  {
3219  constexpr size_t block( BLOCK_SIZE );
3220 
3221  const size_t M( A.rows() );
3222  const size_t N( B.columns() );
3223 
      // Outer loops walk the tiles: columns [jj,jend) and rows [ii,iend).
3224  for( size_t jj=0UL; jj<N; jj+=block ) {
3225  const size_t jend( min( N, jj+block ) );
3226  for( size_t ii=0UL; ii<M; ii+=block ) {
3227  const size_t iend( min( M, ii+block ) );
3228  for( size_t j=jj; j<jend; ++j )
3229  {
      // Row range of column j inside the current tile, additionally clamped
      // according to the lower/upper type traits of A.
3230  const size_t ibegin( ( IsLower_v<MT4> )
3231  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
3232  :( ii ) );
3233  const size_t ipos( ( IsUpper_v<MT4> )
3234  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
3235  :( iend ) );
3236 
      // The loop is simply empty if the clamped range does not intersect the tile.
3237  for( size_t i=ibegin; i<ipos; ++i ) {
3238  C(i,j) -= A(i,j) * B(j,j);
3239  }
3240  }
3241  }
3242  }
3243  }
3245  //**********************************************************************************************
3246 
3247  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
// Default kernel for C -= A * B with a row-major target, a diagonal A and a non-diagonal B.
// Performs C(i,j) -= A(i,i) * B(i,j), traversing the matrices in tiles of
// BLOCK_SIZE x BLOCK_SIZE elements.
//   C: target matrix, updated in place
//   A: left-hand side operand (only A(i,i) is accessed)
//   B: right-hand side operand
3261  template< typename MT3 // Type of the left-hand side target matrix
3262  , typename MT4 // Type of the left-hand side matrix operand
3263  , typename MT5 > // Type of the right-hand side matrix operand
3264  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3265  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3266  {
3267  constexpr size_t block( BLOCK_SIZE );
3268 
3269  const size_t M( A.rows() );
3270  const size_t N( B.columns() );
3271 
      // Outer loops walk the tiles: rows [ii,iend) and columns [jj,jend).
3272  for( size_t ii=0UL; ii<M; ii+=block ) {
3273  const size_t iend( min( M, ii+block ) );
3274  for( size_t jj=0UL; jj<N; jj+=block ) {
3275  const size_t jend( min( N, jj+block ) );
3276  for( size_t i=ii; i<iend; ++i )
3277  {
      // Column range of row i inside the current tile, additionally clamped
      // according to the upper/lower type traits of B.
3278  const size_t jbegin( ( IsUpper_v<MT5> )
3279  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
3280  :( jj ) );
3281  const size_t jpos( ( IsLower_v<MT5> )
3282  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
3283  :( jend ) );
3284 
      // The loop is simply empty if the clamped range does not intersect the tile.
3285  for( size_t j=jbegin; j<jpos; ++j ) {
3286  C(i,j) -= A(i,i) * B(i,j);
3287  }
3288  }
3289  }
3290  }
3291  }
3293  //**********************************************************************************************
3294 
3295  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
// Default kernel for C -= A * B with a column-major target, a diagonal A and a non-diagonal B.
// Only the diagonal of A is read, so every update has the form C(i,j) -= A(i,i) * B(i,j).
//   C: target matrix, updated in place
//   A: left-hand side operand (only A(i,i) is accessed)
//   B: right-hand side operand
3309  template< typename MT3 // Type of the left-hand side target matrix
3310  , typename MT4 // Type of the left-hand side matrix operand
3311  , typename MT5 > // Type of the right-hand side matrix operand
3312  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3313  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3314  {
3315  const size_t M( A.rows() );
3316  const size_t N( B.columns() );
3317 
3318  for( size_t j=0UL; j<N; ++j )
3319  {
      // Row range of column j, narrowed by the triangular type traits of B:
      // a lower B starts at the diagonal (one past it if strictly lower),
      // an upper B stops after the diagonal (at it if strictly upper).
3320  const size_t ibegin( ( IsLower_v<MT5> )
3321  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3322  :( 0UL ) );
3323  const size_t iend( ( IsUpper_v<MT5> )
3324  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3325  :( M ) );
3326  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3327 
      // Clearing the lowest bit rounds the row count down to an even number,
      // so the main loop can handle two rows per iteration.
3328  const size_t inum( iend - ibegin );
3329  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3330 
3331  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3332  C(i ,j) -= A(i ,i ) * B(i ,j);
3333  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
3334  }
      // Leftover row when the range has odd length.
3335  if( ipos < iend ) {
3336  C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
3337  }
3338  }
3339  }
3341  //**********************************************************************************************
3342 
3343  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default kernel for C -= A * B when both operands are diagonal matrices.
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) for i in [0,A.rows()).
// This overload places no constraint on the storage order of the target.
//   C: target matrix, updated in place
//   A: left-hand side operand (only A(i,i) is accessed)
//   B: right-hand side operand (only B(i,i) is accessed)
3357  template< typename MT3 // Type of the left-hand side target matrix
3358  , typename MT4 // Type of the left-hand side matrix operand
3359  , typename MT5 > // Type of the right-hand side matrix operand
3360  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3361  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3362  {
3363  for( size_t i=0UL; i<A.rows(); ++i ) {
3364  C(i,i) -= A(i,i) * B(i,i);
3365  }
3366  }
3368  //**********************************************************************************************
3369 
3370  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix kernel for C -= A * B, non-vectorized fallback.
// Participates in overload resolution only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5>
// is false, and then simply forwards to selectDefaultSubAssignKernel().
//   C: target matrix, updated in place
//   A: left-hand side operand
//   B: right-hand side operand
3384  template< typename MT3 // Type of the left-hand side target matrix
3385  , typename MT4 // Type of the left-hand side matrix operand
3386  , typename MT5 > // Type of the right-hand side matrix operand
3387  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3388  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3389  {
3390  selectDefaultSubAssignKernel( C, A, B );
3391  }
3393  //**********************************************************************************************
3394 
3395  //**Default subtraction assignment to row-major dense matrices (small matrices)*****************
// Vectorized small-matrix kernel for C -= A * B with a row-major target matrix.
// Selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds. Every element of C is
// updated with a dot product over k that is accumulated in SIMD registers and, where
// required, completed by a scalar remainder loop.
//   C: target matrix, updated in place
//   A: left-hand side operand, read via A.load(i,k) and A(i,k)
//   B: right-hand side operand, read via B.load(k,j) and B(k,j)
// Rows are processed two at a time, followed by a possible single last row. Within a
// row pass the columns are processed in groups of four, two and finally one.
// NOTE(review): the xmm accumulators are default-constructed and afterwards only updated
// with +=, so the kernel relies on SIMDType default-initializing to zero - confirm.
// NOTE(review): sum() presumably reduces a SIMD pack to a scalar - confirm.
3410  template< typename MT3 // Type of the left-hand side target matrix
3411  , typename MT4 // Type of the left-hand side matrix operand
3412  , typename MT5 > // Type of the right-hand side matrix operand
3413  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3414  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3415  {
      // A scalar tail loop over k is needed unless both operands are padded.
3416  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
3417 
3418  const size_t M( A.rows() );
3419  const size_t N( B.columns() );
3420  const size_t K( A.columns() );
3421 
3422  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3423 
3424  size_t i( 0UL );
3425 
      // Main pass: two rows of C per iteration.
3426  for( ; (i+2UL) <= M; i+=2UL )
3427  {
      // LOW limits the columns to [0,i+2), UPP starts them at column i.
3428  const size_t jend( LOW ? i+2UL : N );
3429  size_t j( UPP ? i : 0UL );
3430 
      // 2x4 tile of C; this loop is skipped entirely when both LOW and UPP are set.
3431  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
3432  {
      // k range restricted by the upper/lower type traits of A and B. The start is
      // masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE.
3433  const size_t kbegin( ( IsUpper_v<MT4> )
3434  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3435  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3436  const size_t kend( ( IsLower_v<MT4> )
3437  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
3438  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
3439 
      // End of the vectorized part: kend rounded down to a multiple of SIMDSIZE
      // if a remainder loop exists, otherwise kend itself.
3440  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3441  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3442 
3443  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3444  size_t k( kbegin );
3445 
      // Vectorized accumulation of the eight partial dot products.
3446  for( ; k<kpos; k+=SIMDSIZE ) {
3447  const SIMDType a1( A.load(i ,k) );
3448  const SIMDType a2( A.load(i+1UL,k) );
3449  const SIMDType b1( B.load(k,j ) );
3450  const SIMDType b2( B.load(k,j+1UL) );
3451  const SIMDType b3( B.load(k,j+2UL) );
3452  const SIMDType b4( B.load(k,j+3UL) );
3453  xmm1 += a1 * b1;
3454  xmm2 += a1 * b2;
3455  xmm3 += a1 * b3;
3456  xmm4 += a1 * b4;
3457  xmm5 += a2 * b1;
3458  xmm6 += a2 * b2;
3459  xmm7 += a2 * b3;
3460  xmm8 += a2 * b4;
3461  }
3462 
      // Subtract the reduced accumulators from the corresponding elements of C.
3463  C(i ,j ) -= sum( xmm1 );
3464  C(i ,j+1UL) -= sum( xmm2 );
3465  C(i ,j+2UL) -= sum( xmm3 );
3466  C(i ,j+3UL) -= sum( xmm4 );
3467  C(i+1UL,j ) -= sum( xmm5 );
3468  C(i+1UL,j+1UL) -= sum( xmm6 );
3469  C(i+1UL,j+2UL) -= sum( xmm7 );
3470  C(i+1UL,j+3UL) -= sum( xmm8 );
3471 
      // Scalar tail for the k values in [kpos,kend).
3472  for( ; remainder && k<kend; ++k ) {
3473  C(i ,j ) -= A(i ,k) * B(k,j );
3474  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3475  C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
3476  C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
3477  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3478  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3479  C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
3480  C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
3481  }
3482  }
3483 
      // 2x2 tile of C, same scheme as above.
3484  for( ; (j+2UL) <= jend; j+=2UL )
3485  {
3486  const size_t kbegin( ( IsUpper_v<MT4> )
3487  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3488  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3489  const size_t kend( ( IsLower_v<MT4> )
3490  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
3491  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3492 
3493  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3494  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3495 
3496  SIMDType xmm1, xmm2, xmm3, xmm4;
3497  size_t k( kbegin );
3498 
3499  for( ; k<kpos; k+=SIMDSIZE ) {
3500  const SIMDType a1( A.load(i ,k) );
3501  const SIMDType a2( A.load(i+1UL,k) );
3502  const SIMDType b1( B.load(k,j ) );
3503  const SIMDType b2( B.load(k,j+1UL) );
3504  xmm1 += a1 * b1;
3505  xmm2 += a1 * b2;
3506  xmm3 += a2 * b1;
3507  xmm4 += a2 * b2;
3508  }
3509 
3510  C(i ,j ) -= sum( xmm1 );
3511  C(i ,j+1UL) -= sum( xmm2 );
3512  C(i+1UL,j ) -= sum( xmm3 );
3513  C(i+1UL,j+1UL) -= sum( xmm4 );
3514 
3515  for( ; remainder && k<kend; ++k ) {
3516  C(i ,j ) -= A(i ,k) * B(k,j );
3517  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3518  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3519  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3520  }
3521  }
3522 
      // Single remaining column for the current pair of rows.
3523  if( j < jend )
3524  {
3525  const size_t kbegin( ( IsUpper_v<MT4> )
3526  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3527  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3528  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
3529 
3530  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3531  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3532 
3533  SIMDType xmm1, xmm2;
3534  size_t k( kbegin );
3535 
3536  for( ; k<kpos; k+=SIMDSIZE ) {
3537  const SIMDType b1( B.load(k,j) );
3538  xmm1 += A.load(i ,k) * b1;
3539  xmm2 += A.load(i+1UL,k) * b1;
3540  }
3541 
3542  C(i ,j) -= sum( xmm1 );
3543  C(i+1UL,j) -= sum( xmm2 );
3544 
3545  for( ; remainder && k<kend; ++k ) {
3546  C(i ,j) -= A(i ,k) * B(k,j);
3547  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3548  }
3549  }
3550  }
3551 
      // Single remaining row when M is odd.
3552  if( i < M )
3553  {
      // LOW limits the columns to [0,i+1), UPP starts them at column i.
3554  const size_t jend( LOW ? i+1UL : N );
3555  size_t j( UPP ? i : 0UL );
3556 
      // 1x4 tile of C; skipped entirely when both LOW and UPP are set.
3557  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
3558  {
3559  const size_t kbegin( ( IsUpper_v<MT4> )
3560  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3561  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3562  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
3563 
3564  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3565  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3566 
3567  SIMDType xmm1, xmm2, xmm3, xmm4;
3568  size_t k( kbegin );
3569 
3570  for( ; k<kpos; k+=SIMDSIZE ) {
3571  const SIMDType a1( A.load(i,k) );
3572  xmm1 += a1 * B.load(k,j );
3573  xmm2 += a1 * B.load(k,j+1UL);
3574  xmm3 += a1 * B.load(k,j+2UL);
3575  xmm4 += a1 * B.load(k,j+3UL);
3576  }
3577 
3578  C(i,j ) -= sum( xmm1 );
3579  C(i,j+1UL) -= sum( xmm2 );
3580  C(i,j+2UL) -= sum( xmm3 );
3581  C(i,j+3UL) -= sum( xmm4 );
3582 
3583  for( ; remainder && k<kend; ++k ) {
3584  C(i,j ) -= A(i,k) * B(k,j );
3585  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3586  C(i,j+2UL) -= A(i,k) * B(k,j+2UL);
3587  C(i,j+3UL) -= A(i,k) * B(k,j+3UL);
3588  }
3589  }
3590 
      // 1x2 tile of C.
3591  for( ; (j+2UL) <= jend; j+=2UL )
3592  {
3593  const size_t kbegin( ( IsUpper_v<MT4> )
3594  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3595  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3596  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
3597 
3598  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3599  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3600 
3601  SIMDType xmm1, xmm2;
3602  size_t k( kbegin );
3603 
3604  for( ; k<kpos; k+=SIMDSIZE ) {
3605  const SIMDType a1( A.load(i,k) );
3606  xmm1 += a1 * B.load(k,j );
3607  xmm2 += a1 * B.load(k,j+1UL);
3608  }
3609 
3610  C(i,j ) -= sum( xmm1 );
3611  C(i,j+1UL) -= sum( xmm2 );
3612 
3613  for( ; remainder && k<kend; ++k ) {
3614  C(i,j ) -= A(i,k) * B(k,j );
3615  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3616  }
3617  }
3618 
      // Last single element of C; here the k range always extends to K.
3619  if( j < jend )
3620  {
3621  const size_t kbegin( ( IsUpper_v<MT4> )
3622  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3623  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3624 
3625  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
3626  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3627 
3628  SIMDType xmm1;
3629  size_t k( kbegin );
3630 
3631  for( ; k<kpos; k+=SIMDSIZE ) {
3632  xmm1 += A.load(i,k) * B.load(k,j);
3633  }
3634 
3635  C(i,j) -= sum( xmm1 );
3636 
3637  for( ; remainder && k<K; ++k ) {
3638  C(i,j) -= A(i,k) * B(k,j);
3639  }
3640  }
3641  }
3642  }
3644  //**********************************************************************************************
3645 
3646  //**Default subtraction assignment to column-major dense matrices (small matrices)**************
// Vectorized small-matrix kernel for C -= A * B with a column-major target matrix.
// Selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds. Every element of C is
// updated with a dot product over k that is accumulated in SIMD registers and, where
// required, completed by a scalar remainder loop.
//   C: target matrix, updated in place
//   A: left-hand side operand, read via A.load(i,k) and A(i,k)
//   B: right-hand side operand, read via B.load(k,j) and B(k,j)
// Rows are processed in groups of four (only if neither LOW nor UPP is set), then two,
// then a possible single last row. Columns are processed in groups of two and one.
// NOTE(review): the xmm accumulators are default-constructed and afterwards only updated
// with +=, so the kernel relies on SIMDType default-initializing to zero - confirm.
// NOTE(review): sum() presumably reduces a SIMD pack to a scalar - confirm.
3661  template< typename MT3 // Type of the left-hand side target matrix
3662  , typename MT4 // Type of the left-hand side matrix operand
3663  , typename MT5 > // Type of the right-hand side matrix operand
3664  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3665  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3666  {
      // A scalar tail loop over k is needed unless both operands are padded.
3667  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
3668 
3669  const size_t M( A.rows() );
3670  const size_t N( B.columns() );
3671  const size_t K( A.columns() );
3672 
3673  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3674 
3675  size_t i( 0UL );
3676 
      // Four rows of C per iteration; only taken when neither LOW nor UPP is set,
      // so the full column range [0,N) is processed.
3677  for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
3678  {
3679  size_t j( 0UL );
3680 
      // 4x2 tile of C.
3681  for( ; (j+2UL) <= N; j+=2UL )
3682  {
      // k range restricted by the upper/lower type traits of A and B. The start is
      // masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE.
3683  const size_t kbegin( ( IsUpper_v<MT4> )
3684  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3685  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3686  const size_t kend( ( IsLower_v<MT4> )
3687  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
3688  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3689 
      // End of the vectorized part: kend rounded down to a multiple of SIMDSIZE
      // if a remainder loop exists, otherwise kend itself.
3690  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3691  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3692 
3693  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3694  size_t k( kbegin );
3695 
      // Vectorized accumulation of the eight partial dot products.
3696  for( ; k<kpos; k+=SIMDSIZE ) {
3697  const SIMDType a1( A.load(i ,k) );
3698  const SIMDType a2( A.load(i+1UL,k) );
3699  const SIMDType a3( A.load(i+2UL,k) );
3700  const SIMDType a4( A.load(i+3UL,k) );
3701  const SIMDType b1( B.load(k,j ) );
3702  const SIMDType b2( B.load(k,j+1UL) );
3703  xmm1 += a1 * b1;
3704  xmm2 += a1 * b2;
3705  xmm3 += a2 * b1;
3706  xmm4 += a2 * b2;
3707  xmm5 += a3 * b1;
3708  xmm6 += a3 * b2;
3709  xmm7 += a4 * b1;
3710  xmm8 += a4 * b2;
3711  }
3712 
      // Subtract the reduced accumulators from the corresponding elements of C.
3713  C(i ,j ) -= sum( xmm1 );
3714  C(i ,j+1UL) -= sum( xmm2 );
3715  C(i+1UL,j ) -= sum( xmm3 );
3716  C(i+1UL,j+1UL) -= sum( xmm4 );
3717  C(i+2UL,j ) -= sum( xmm5 );
3718  C(i+2UL,j+1UL) -= sum( xmm6 );
3719  C(i+3UL,j ) -= sum( xmm7 );
3720  C(i+3UL,j+1UL) -= sum( xmm8 );
3721 
      // Scalar tail for the k values in [kpos,kend).
3722  for( ; remainder && k<kend; ++k ) {
3723  C(i ,j ) -= A(i ,k) * B(k,j );
3724  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3725  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3726  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3727  C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3728  C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
3729  C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3730  C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
3731  }
3732  }
3733 
      // Single remaining column for the current group of four rows.
3734  if( j < N )
3735  {
3736  const size_t kbegin( ( IsUpper_v<MT4> )
3737  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3738  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3739  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
3740 
3741  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3742  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3743 
3744  SIMDType xmm1, xmm2, xmm3, xmm4;
3745  size_t k( kbegin );
3746 
3747  for( ; k<kpos; k+=SIMDSIZE ) {
3748  const SIMDType b1( B.load(k,j) );
3749  xmm1 += A.load(i ,k) * b1;
3750  xmm2 += A.load(i+1UL,k) * b1;
3751  xmm3 += A.load(i+2UL,k) * b1;
3752  xmm4 += A.load(i+3UL,k) * b1;
3753  }
3754 
3755  C(i ,j) -= sum( xmm1 );
3756  C(i+1UL,j) -= sum( xmm2 );
3757  C(i+2UL,j) -= sum( xmm3 );
3758  C(i+3UL,j) -= sum( xmm4 );
3759 
3760  for( ; remainder && k<kend; ++k ) {
3761  C(i ,j ) -= A(i ,k) * B(k,j );
3762  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3763  C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3764  C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3765  }
3766  }
3767  }
3768 
      // Two rows of C per iteration (all rows when LOW or UPP is set, otherwise
      // whatever the four-row pass left over).
3769  for( ; (i+2UL) <= M; i+=2UL )
3770  {
      // LOW limits the columns to [0,i+2), UPP starts them at column i.
3771  const size_t jend( LOW ? i+2UL : N );
3772  size_t j( UPP ? i : 0UL );
3773 
      // 2x2 tile of C.
3774  for( ; (j+2UL) <= jend; j+=2UL )
3775  {
3776  const size_t kbegin( ( IsUpper_v<MT4> )
3777  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3778  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3779  const size_t kend( ( IsLower_v<MT4> )
3780  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
3781  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3782 
3783  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3784  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3785 
3786  SIMDType xmm1, xmm2, xmm3, xmm4;
3787  size_t k( kbegin );
3788 
3789  for( ; k<kpos; k+=SIMDSIZE ) {
3790  const SIMDType a1( A.load(i ,k) );
3791  const SIMDType a2( A.load(i+1UL,k) );
3792  const SIMDType b1( B.load(k,j ) );
3793  const SIMDType b2( B.load(k,j+1UL) );
3794  xmm1 += a1 * b1;
3795  xmm2 += a1 * b2;
3796  xmm3 += a2 * b1;
3797  xmm4 += a2 * b2;
3798  }
3799 
3800  C(i ,j ) -= sum( xmm1 );
3801  C(i ,j+1UL) -= sum( xmm2 );
3802  C(i+1UL,j ) -= sum( xmm3 );
3803  C(i+1UL,j+1UL) -= sum( xmm4 );
3804 
3805  for( ; remainder && k<kend; ++k ) {
3806  C(i ,j ) -= A(i ,k) * B(k,j );
3807  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3808  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3809  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3810  }
3811  }
3812 
      // Single remaining column for the current pair of rows.
3813  if( j < jend )
3814  {
3815  const size_t kbegin( ( IsUpper_v<MT4> )
3816  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3817  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3818  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
3819 
3820  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3821  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3822 
3823  SIMDType xmm1, xmm2;
3824  size_t k( kbegin );
3825 
3826  for( ; k<kpos; k+=SIMDSIZE ) {
3827  const SIMDType b1( B.load(k,j) );
3828  xmm1 += A.load(i ,k) * b1;
3829  xmm2 += A.load(i+1UL,k) * b1;
3830  }
3831 
3832  C(i ,j) -= sum( xmm1 );
3833  C(i+1UL,j) -= sum( xmm2 );
3834 
3835  for( ; remainder && k<kend; ++k ) {
3836  C(i ,j) -= A(i ,k) * B(k,j);
3837  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3838  }
3839  }
3840  }
3841 
      // Single remaining row.
3842  if( i < M )
3843  {
      // LOW limits the columns to [0,i+1), UPP starts them at column i.
3844  const size_t jend( LOW ? i+1UL : N );
3845  size_t j( UPP ? i : 0UL );
3846 
      // 1x2 tile of C.
3847  for( ; (j+2UL) <= jend; j+=2UL )
3848  {
3849  const size_t kbegin( ( IsUpper_v<MT4> )
3850  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3851  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3852  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
3853 
3854  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
3855  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3856 
3857  SIMDType xmm1, xmm2;
3858  size_t k( kbegin );
3859 
3860  for( ; k<kpos; k+=SIMDSIZE ) {
3861  const SIMDType a1( A.load(i,k) );
3862  xmm1 += a1 * B.load(k,j );
3863  xmm2 += a1 * B.load(k,j+1UL);
3864  }
3865 
3866  C(i,j ) -= sum( xmm1 );
3867  C(i,j+1UL) -= sum( xmm2 );
3868 
3869  for( ; remainder && k<kend; ++k ) {
3870  C(i,j ) -= A(i,k) * B(k,j );
3871  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3872  }
3873  }
3874 
      // Last single element of C; here the k range always extends to K.
3875  if( j < jend )
3876  {
3877  const size_t kbegin( ( IsUpper_v<MT4> )
3878  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
3879  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
3880 
3881  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
3882  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
3883 
3884  SIMDType xmm1;
3885  size_t k( kbegin );
3886 
3887  for( ; k<kpos; k+=SIMDSIZE ) {
3888  xmm1 += A.load(i,k) * B.load(k,j);
3889  }
3890 
3891  C(i,j) -= sum( xmm1 );
3892 
3893  for( ; remainder && k<K; ++k ) {
3894  C(i,j) -= A(i,k) * B(k,j);
3895  }
3896  }
3897  }
3898  }
3900  //**********************************************************************************************
3901 
3902  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix kernel for C -= A * B, non-vectorized fallback.
// Participates in overload resolution only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5>
// is false, and then simply forwards to selectDefaultSubAssignKernel().
//   C: target matrix, updated in place
//   A: left-hand side operand
//   B: right-hand side operand
3916  template< typename MT3 // Type of the left-hand side target matrix
3917  , typename MT4 // Type of the left-hand side matrix operand
3918  , typename MT5 > // Type of the right-hand side matrix operand
3919  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3920  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3921  {
3922  selectDefaultSubAssignKernel( C, A, B );
3923  }
3925  //**********************************************************************************************
3926 
3927  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix kernel for C -= A * B, vectorized variant.
// Selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds. Dispatches to one of the
// lmmm() / ummm() / mmm() routines depending on the LOW and UPP flags, where LOW takes
// precedence over UPP.
//   C: target matrix, updated in place
//   A: left-hand side operand
//   B: right-hand side operand
// NOTE(review): the trailing arguments ElementType(-1) and ElementType(1) presumably are
// the scaling factors of the product and of the existing content of C, respectively
// (i.e. C = -1*A*B + 1*C) - confirm against the declarations of these routines.
3942  template< typename MT3 // Type of the left-hand side target matrix
3943  , typename MT4 // Type of the left-hand side matrix operand
3944  , typename MT5 > // Type of the right-hand side matrix operand
3945  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3946  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3947  {
3948  if( LOW )
3949  lmmm( C, A, B, ElementType(-1), ElementType(1) );
3950  else if( UPP )
3951  ummm( C, A, B, ElementType(-1), ElementType(1) );
3952  else
3953  mmm( C, A, B, ElementType(-1), ElementType(1) );
3954  }
3956  //**********************************************************************************************
3957 
3958  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// BLAS kernel for C -= A * B, fallback.
// Participates in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5> is false,
// and then simply forwards to selectLargeSubAssignKernel().
//   C: target matrix, updated in place
//   A: left-hand side operand
//   B: right-hand side operand
3972  template< typename MT3 // Type of the left-hand side target matrix
3973  , typename MT4 // Type of the left-hand side matrix operand
3974  , typename MT5 > // Type of the right-hand side matrix operand
3975  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3976  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3977  {
3978  selectLargeSubAssignKernel( C, A, B );
3979  }
3981  //**********************************************************************************************
3982 
3983  //**BLAS-based subtraction assignment to dense matrices*****************************************
3984 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
3985 
// BLAS kernel for C -= A * B.
// Selected when UseBlasKernel_v<MT3,MT4,MT5> holds; compiled only if the enclosing
// BLAZE_BLAS_MODE / BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION condition is active.
//   C: target matrix, updated in place
//   A: left-hand side operand
//   B: right-hand side operand
// NOTE(review): trmm() presumably overwrites its first argument with the triangular
// product, and gemm() with ET(-1), ET(1) presumably computes C = -1*A*B + 1*C - confirm
// against the wrapper declarations.
3998  template< typename MT3 // Type of the left-hand side target matrix
3999  , typename MT4 // Type of the left-hand side matrix operand
4000  , typename MT5 > // Type of the right-hand side matrix operand
4001  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4002  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4003  {
4004  using ET = ElementType_t<MT3>;
4005 
      // Triangular A: copy B into a temporary of C's result type, apply A from the
      // left via trmm(), then subtract the temporary from C.
4006  if( IsTriangular_v<MT4> ) {
4007  ResultType_t<MT3> tmp( serial( B ) );
4008  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4009  subAssign( C, tmp );
4010  }
      // Triangular B: copy A into a temporary, apply B from the right via trmm(),
      // then subtract the temporary from C.
4011  else if( IsTriangular_v<MT5> ) {
4012  ResultType_t<MT3> tmp( serial( A ) );
4013  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4014  subAssign( C, tmp );
4015  }
      // General case: a single gemm() call working directly on C.
4016  else {
4017  gemm( C, A, B, ET(-1), ET(1) );
4018  }
4019  }
4021 #endif
4022  //**********************************************************************************************
4023 
4024  //**Subtraction assignment to sparse matrices***************************************************
4025  // No special implementation for the subtraction assignment to sparse matrices.
4026  //**********************************************************************************************
4027 
4028  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of a dense matrix-transpose dense matrix multiplication
// expression to a dense matrix.
// The expression is first evaluated serially into a temporary of type ResultType;
// the temporary is then Schur-assigned to the target.
//   lhs: target dense matrix
//   rhs: multiplication expression to be evaluated
// In builds with internal assertions enabled, the dimensions of lhs and rhs must match.
4041  template< typename MT // Type of the target dense matrix
4042  , bool SO > // Storage order of the target dense matrix
4043  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4044  {
4046 
4050 
4051  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4052  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4053 
4054  const ResultType tmp( serial( rhs ) );
4055  schurAssign( ~lhs, tmp );
4056  }
4058  //**********************************************************************************************
4059 
4060  //**Schur product assignment to sparse matrices*************************************************
4061  // No special implementation for the Schur product assignment to sparse matrices.
4062  //**********************************************************************************************
4063 
4064  //**Multiplication assignment to dense matrices*************************************************
4065  // No special implementation for the multiplication assignment to dense matrices.
4066  //**********************************************************************************************
4067 
4068  //**Multiplication assignment to sparse matrices************************************************
4069  // No special implementation for the multiplication assignment to sparse matrices.
4070  //**********************************************************************************************
4071 
4072  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a dense matrix-transpose dense matrix multiplication expression to a
// dense matrix.
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> holds. Both operands are evaluated
// into objects of type LT and RT, and the assignment is then delegated to smpAssign() for
// the product of the evaluated operands.
//   lhs: target dense matrix
//   rhs: multiplication expression to be assigned
4087  template< typename MT // Type of the target dense matrix
4088  , bool SO > // Storage order of the target dense matrix
4089  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4090  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4091  {
4093 
4094  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4095  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4096 
      // Empty target: nothing to do. Empty inner dimension: the product has no
      // contributions, so the target is reset.
4097  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4098  return;
4099  }
4100  else if( rhs.lhs_.columns() == 0UL ) {
4101  reset( ~lhs );
4102  return;
4103  }
4104 
4105  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4106  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4107 
4108  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4109  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4110  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4111  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4112  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4113  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4114 
      // Delegate to the SMP assignment of the product of the evaluated operands.
4115  smpAssign( ~lhs, A * B );
4116  }
4118  //**********************************************************************************************
4119 
4120  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a dense matrix-transpose dense matrix multiplication expression to a
// sparse matrix.
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> holds. The expression is evaluated
// into a dense temporary, which is passed through the class's ForwardFunctor and then
// SMP-assigned to the target.
//   lhs: target sparse matrix
//   rhs: multiplication expression to be assigned
4135  template< typename MT // Type of the target sparse matrix
4136  , bool SO > // Storage order of the target sparse matrix
4137  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4138  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4139  {
4141 
      // Temporary type selected by the storage order flag of the target:
      // OppositeType if SO is true, ResultType otherwise.
4142  using TmpType = If_t< SO, OppositeType, ResultType >;
4143 
4150 
4151  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4152  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4153 
4154  const ForwardFunctor fwd;
4155 
4156  const TmpType tmp( rhs );
4157  smpAssign( ~lhs, fwd( tmp ) );
4158  }
4160  //**********************************************************************************************
4161 
4162  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a dense matrix-transpose dense matrix multiplication
// expression to a dense matrix.
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> holds. Both operands are evaluated
// into objects of type LT and RT, and the operation is then delegated to smpAddAssign()
// for the product of the evaluated operands.
//   lhs: target dense matrix
//   rhs: multiplication expression to be added
4178  template< typename MT // Type of the target dense matrix
4179  , bool SO > // Storage order of the target dense matrix
4180  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4181  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4182  {
4184 
4185  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4186  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4187 
      // Nothing to add if the target is empty or the inner dimension is zero.
4188  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4189  return;
4190  }
4191 
4192  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4193  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4194 
4195  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4196  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4197  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4198  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4199  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4200  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4201 
      // Delegate to the SMP addition assignment of the product of the evaluated operands.
4202  smpAddAssign( ~lhs, A * B );
4203  }
4205  //**********************************************************************************************
4206 
4207  //**SMP addition assignment to sparse matrices**************************************************
4208  // No special implementation for the SMP addition assignment to sparse matrices.
4209  //**********************************************************************************************
4210 
4211  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a dense matrix-transpose dense matrix multiplication
// expression to a dense matrix.
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> holds. Both operands are evaluated
// into objects of type LT and RT, and the operation is then delegated to smpSubAssign()
// for the product of the evaluated operands.
//   lhs: target dense matrix
//   rhs: multiplication expression to be subtracted
4227  template< typename MT // Type of the target dense matrix
4228  , bool SO > // Storage order of the target dense matrix
4229  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4230  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4231  {
4233 
4234  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4235  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4236 
      // Nothing to subtract if the target is empty or the inner dimension is zero.
4237  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4238  return;
4239  }
4240 
4241  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4242  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4243 
4244  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4245  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4246  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4247  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4248  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4249  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4250 
      // Delegate to the SMP subtraction assignment of the product of the evaluated operands.
4251  smpSubAssign( ~lhs, A * B );
4252  }
4254  //**********************************************************************************************
4255 
4256  //**SMP subtraction assignment to sparse matrices***********************************************
4257  // No special implementation for the SMP subtraction assignment to sparse matrices.
4258  //**********************************************************************************************
4259 
4260  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of a DMatTDMatMultExpr to a dense matrix. The expression is
// first evaluated into a temporary of type ResultType, which is then passed on to the
// smpSchurAssign() of the target.
//   lhs : the target dense matrix.
//   rhs : the multiplication expression for the Schur product.
// NOTE(review): the embedded listing numbers skip 4277 and 4279-4281 at the top of the body,
// so several lines are not visible in this listing -- confirm against the full header.
4273  template< typename MT // Type of the target dense matrix
4274  , bool SO > // Storage order of the target dense matrix
4275  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatTDMatMultExpr& rhs )
4276  {
4278 
4282 
// The target must already have the dimensions of the product.
4283  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4284  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4285 
// Evaluate the complete product into a temporary before the Schur product assignment.
4286  const ResultType tmp( rhs );
4287  smpSchurAssign( ~lhs, tmp );
4288  }
4290  //**********************************************************************************************
4291 
4292  //**SMP Schur product assignment to sparse matrices*********************************************
4293  // No special implementation for the SMP Schur product assignment to sparse matrices.
4294  //**********************************************************************************************
4295 
4296  //**SMP multiplication assignment to dense matrices*********************************************
4297  // No special implementation for the SMP multiplication assignment to dense matrices.
4298  //**********************************************************************************************
4299 
4300  //**SMP multiplication assignment to sparse matrices********************************************
4301  // No special implementation for the SMP multiplication assignment to sparse matrices.
4302  //**********************************************************************************************
4303 
4304  //**Compile time checks*************************************************************************
4312  //**********************************************************************************************
4313 };
4314 //*************************************************************************************************
4315 
4316 
4317 
4318 
4319 //=================================================================================================
4320 //
4321 // DMATSCALARMULTEXPR SPECIALIZATION
4322 //
4323 //=================================================================================================
4324 
4325 //*************************************************************************************************
4333 template< typename MT1 // Type of the left-hand side dense matrix
4334  , typename MT2 // Type of the right-hand side dense matrix
4335  , bool SF // Symmetry flag
4336  , bool HF // Hermitian flag
4337  , bool LF // Lower flag
4338  , bool UF // Upper flag
4339  , typename ST > // Type of the right-hand side scalar value
4340 class DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4341  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4342  , private Computation
4343 {
4344  private:
4345  //**Type definitions****************************************************************************
4347  using MMM = DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4348 
4349  using RES = ResultType_t<MMM>;
4350  using RT1 = ResultType_t<MT1>;
4351  using RT2 = ResultType_t<MT2>;
4352  using ET1 = ElementType_t<RT1>;
4353  using ET2 = ElementType_t<RT2>;
4354  using CT1 = CompositeType_t<MT1>;
4355  using CT2 = CompositeType_t<MT2>;
4356  //**********************************************************************************************
4357 
4358  //**********************************************************************************************
4360  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
4361  //**********************************************************************************************
4362 
4363  //**********************************************************************************************
4365  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
4366  //**********************************************************************************************
4367 
4368  //**********************************************************************************************
4369  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
4370  static constexpr bool HERM = ( HF && !( LF || UF ) );
4371  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4372  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
4373  //**********************************************************************************************
4374 
4375  //**********************************************************************************************
4377 
4380  template< typename T1, typename T2, typename T3 >
4381  static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
4382  //**********************************************************************************************
4383 
4384  //**********************************************************************************************
4386 
4388  template< typename T1, typename T2, typename T3, typename T4 >
4389  static constexpr bool UseBlasKernel_v =
4390  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4391  !SYM && !HERM && !LOW && !UPP &&
4392  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4393  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4394  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4395  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4396  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4397  IsBLASCompatible_v< ElementType_t<T1> > &&
4398  IsBLASCompatible_v< ElementType_t<T2> > &&
4399  IsBLASCompatible_v< ElementType_t<T3> > &&
4400  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4401  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4402  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
4403  //**********************************************************************************************
4404 
4405  //**********************************************************************************************
4407 
4409  template< typename T1, typename T2, typename T3, typename T4 >
4410  static constexpr bool UseVectorizedDefaultKernel_v =
4411  ( useOptimizedKernels &&
4412  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4413  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4414  IsSIMDCombinable_v< ElementType_t<T1>
4415  , ElementType_t<T2>
4416  , ElementType_t<T3>
4417  , T4 > &&
4418  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4419  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
4420  //**********************************************************************************************
4421 
4422  //**********************************************************************************************
4424 
4426  using ForwardFunctor = If_t< HERM
4427  , DeclHerm
4428  , If_t< SYM
4429  , DeclSym
4430  , If_t< LOW
4431  , If_t< UPP
4432  , DeclDiag
4433  , DeclLow >
4434  , If_t< UPP
4435  , DeclUpp
4436  , Noop > > > >;
4437  //**********************************************************************************************
4438 
4439  public:
4440  //**Type definitions****************************************************************************
// Public type definitions of the scaled matrix multiplication expression.
// NOTE(review): the embedded listing numbers skip between most of these declarations, so
// lines of the original header are not visible in this listing.

// Type of this DMatScalarMultExpr instance.
4442  using This = DMatScalarMultExpr<MMM,ST,false>;
4443 
// Base type of this DMatScalarMultExpr instance (the 'false' argument matches the one in the
// DenseMatrix base of the class declaration).
4445  using BaseType = DenseMatrix<This,false>;
4446 
// Result type of the expression: MultTrait of RES and ST, wrapped in the Decl*Trait that
// corresponds to the active flag (HERM, SYM, LOW and UPP together, LOW alone, UPP alone), or
// the plain MultTrait when no flag is active. The nested ::Type of the selected trait is used.
4448  using ResultType = typename If_t< HERM
4449  , DeclHermTrait< MultTrait_t<RES,ST> >
4450  , If_t< SYM
4451  , DeclSymTrait< MultTrait_t<RES,ST> >
4452  , If_t< LOW
4453  , If_t< UPP
4454  , DeclDiagTrait< MultTrait_t<RES,ST> >
4455  , DeclLowTrait< MultTrait_t<RES,ST> > >
4456  , If_t< UPP
4457  , DeclUppTrait< MultTrait_t<RES,ST> >
4458  , MultTrait<RES,ST> > > > >::Type;
4459 
// Types derived from ResultType.
4460  using OppositeType = OppositeType_t<ResultType>;
4461  using TransposeType = TransposeType_t<ResultType>;
4462  using ElementType = ElementType_t<ResultType>;
4463  using SIMDType = SIMDTrait_t<ElementType>;
// Element access returns a const value; the composite type is a const ResultType.
4464  using ReturnType = const ElementType;
4465  using CompositeType = const ResultType;
4466 
// Type of the left-hand side operand: the matrix multiplication expression itself.
4468  using LeftOperand = const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4469 
// Type of the right-hand side operand: the scalar.
4471  using RightOperand = ST;
4472 
// Type used for the left-hand side matrix operand inside the assignment functions: its const
// result type if it has to be evaluated, its composite type otherwise.
4474  using LT = If_t< evaluateLeft, const RT1, CT1 >;
4475 
// Same selection for the right-hand side matrix operand.
4477  using RT = If_t< evaluateRight, const RT2, CT2 >;
4478  //**********************************************************************************************
4479 
4480  //**Compilation flags***************************************************************************
4482  static constexpr bool simdEnabled =
4483  ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
4484  MT1::simdEnabled && MT2::simdEnabled &&
4485  IsSIMDCombinable_v<ET1,ET2,ST> &&
4486  HasSIMDAdd_v<ET1,ET2> &&
4487  HasSIMDMult_v<ET1,ET2> );
4488 
4490  static constexpr bool smpAssignable =
4491  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4492  //**********************************************************************************************
4493 
4494  //**SIMD properties*****************************************************************************
4496  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
4497  //**********************************************************************************************
4498 
4499  //**Constructor*********************************************************************************
4505  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4506  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4507  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4508  {}
4509  //**********************************************************************************************
4510 
4511  //**Access operator*****************************************************************************
4518  inline ReturnType operator()( size_t i, size_t j ) const {
4519  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4520  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4521  return matrix_(i,j) * scalar_;
4522  }
4523  //**********************************************************************************************
4524 
4525  //**At function*********************************************************************************
4533  inline ReturnType at( size_t i, size_t j ) const {
4534  if( i >= matrix_.rows() ) {
4535  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4536  }
4537  if( j >= matrix_.columns() ) {
4538  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4539  }
4540  return (*this)(i,j);
4541  }
4542  //**********************************************************************************************
4543 
4544  //**Rows function*******************************************************************************
4549  inline size_t rows() const {
4550  return matrix_.rows();
4551  }
4552  //**********************************************************************************************
4553 
4554  //**Columns function****************************************************************************
4559  inline size_t columns() const {
4560  return matrix_.columns();
4561  }
4562  //**********************************************************************************************
4563 
4564  //**Left operand access*************************************************************************
4569  inline LeftOperand leftOperand() const {
4570  return matrix_;
4571  }
4572  //**********************************************************************************************
4573 
4574  //**Right operand access************************************************************************
4579  inline RightOperand rightOperand() const {
4580  return scalar_;
4581  }
4582  //**********************************************************************************************
4583 
4584  //**********************************************************************************************
4590  template< typename T >
4591  inline bool canAlias( const T* alias ) const {
4592  return matrix_.canAlias( alias );
4593  }
4594  //**********************************************************************************************
4595 
4596  //**********************************************************************************************
4602  template< typename T >
4603  inline bool isAliased( const T* alias ) const {
4604  return matrix_.isAliased( alias );
4605  }
4606  //**********************************************************************************************
4607 
4608  //**********************************************************************************************
4613  inline bool isAligned() const {
4614  return matrix_.isAligned();
4615  }
4616  //**********************************************************************************************
4617 
4618  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments. As visible here, both of
// the following must hold:
//   - BLAS mode or the BLAS matrix/matrix multiplication is switched off, or the number of
//     elements (rows * columns) is below DMATTDMATMULT_THRESHOLD, and
//   - the number of elements is at least SMP_DMATTDMATMULT_THRESHOLD.
// NOTE(review): the embedded listing numbers skip 4626 in the middle of the first
// parenthesized condition, so one line of that condition is not visible in this listing --
// confirm against the full header before relying on the summary above.
4623  inline bool canSMPAssign() const noexcept {
4624  return ( !BLAZE_BLAS_MODE ||
4625  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4627  ( rows() * columns() < DMATTDMATMULT_THRESHOLD ) ) &&
4628  ( rows() * columns() >= SMP_DMATTDMATMULT_THRESHOLD );
4629  }
4630  //**********************************************************************************************
4631 
4632  private:
4633  //**Member variables****************************************************************************
4636  //**********************************************************************************************
4637 
4638  //**Assignment to dense matrices****************************************************************
// Assignment of a scaled dense matrix multiplication expression to a dense matrix.
//   lhs : the target dense matrix.
//   rhs : the scaled multiplication expression to be assigned.
// The two matrix operands of the wrapped multiplication are extracted and evaluated serially,
// and the work is handed to selectAssignKernel() together with the scalar.
// NOTE(review): the embedded listing numbers skip 4654 directly after the opening brace, so
// one line of the body is not visible in this listing -- confirm against the full header.
4650  template< typename MT // Type of the target dense matrix
4651  , bool SO > // Storage order of the target dense matrix
4652  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4653  {
4655 
4656  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4657  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4658 
// Operands of the wrapped matrix multiplication expression.
4659  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
4660  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
4661 
// An empty target needs no work; a zero inner dimension only requires resetting the target.
4662  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4663  return;
4664  }
4665  else if( left.columns() == 0UL ) {
4666  reset( ~lhs );
4667  return;
4668  }
4669 
4670  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4671  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4672 
// Sanity checks: the evaluated operands keep the operand dimensions and match the target.
4673  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4674  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4675  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4676  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4677  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4678  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4679 
// Kernel selection; the scalar is passed along so that the kernels can apply the scaling.
4680  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4681  }
4682  //**********************************************************************************************
4683 
4684  //**Assignment to dense matrices (kernel selection)*********************************************
4695  template< typename MT3 // Type of the left-hand side target matrix
4696  , typename MT4 // Type of the left-hand side matrix operand
4697  , typename MT5 // Type of the right-hand side matrix operand
4698  , typename ST2 > // Type of the scalar value
4699  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4700  {
4701  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
4702  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4703  selectSmallAssignKernel( C, A, B, scalar );
4704  else
4705  selectBlasAssignKernel( C, A, B, scalar );
4706  }
4707  //**********************************************************************************************
4708 
4709  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (non-vectorized) kernel for C = scalar * A * B with a row-major target and two
// non-diagonal operands (see the EnableIf_t condition).
//   C      : the target matrix.
//   A, B   : the left-hand and right-hand side multiplication operands.
//   scalar : the scaling factor applied to every computed element.
// Rows outside [ibegin,iend) and, per row, columns outside [jbegin,jend) are reset; every
// element inside is computed as a sum over k in [kbegin,kend) and then scaled. All index
// bounds are derived from the compile-time structure traits of MT4/MT5 and the class flags.
4723  template< typename MT3 // Type of the left-hand side target matrix
4724  , typename MT4 // Type of the left-hand side matrix operand
4725  , typename MT5 // Type of the right-hand side matrix operand
4726  , typename ST2 > // Type of the scalar value
4727  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4728  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4729  {
4730  const size_t M( A.rows() );
4731  const size_t N( B.columns() );
4732  const size_t K( A.columns() );
4733 
// With any structural flag active the result has to be square.
4734  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4735 
// Row range that is actually computed; rows before ibegin and from iend on are only reset.
4736  const size_t ibegin( ( IsStrictlyLower_v<MT4> )
4737  ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
4738  :( 0UL ) );
4739  const size_t iend( ( IsStrictlyUpper_v<MT4> )
4740  ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
4741  :( M ) );
4742  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4743 
// Leading rows [0,ibegin): reset completely.
4744  for( size_t i=0UL; i<ibegin; ++i ) {
4745  for( size_t j=0UL; j<N; ++j ) {
4746  reset( C(i,j) );
4747  }
4748  }
4749  for( size_t i=ibegin; i<iend; ++i )
4750  {
// Column range of row i that is computed. For SYM/HERM/UPP the range starts no earlier than
// column i; for LOW it ends no later than column i+1.
4751  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4752  ?( ( IsStrictlyUpper_v<MT4> )
4753  ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
4754  :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
4755  :( ( IsStrictlyUpper_v<MT5> )
4756  ?( SYM || HERM || UPP ? max( i, 1UL ) : 1UL )
4757  :( SYM || HERM || UPP ? i : 0UL ) ) );
4758  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
4759  ?( ( IsStrictlyLower_v<MT4> )
4760  ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
4761  :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
4762  :( ( IsStrictlyLower_v<MT5> )
4763  ?( LOW ? min(i+1UL,N-1UL) : N-1UL )
4764  :( LOW ? i+1UL : N ) ) );
4765 
// With a structural flag active the two bounds may cross; the whole row is reset in that case.
4766  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
4767  for( size_t j=0UL; j<N; ++j ) {
4768  reset( C(i,j) );
4769  }
4770  continue;
4771  }
4772 
4773  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4774 
// Reset the columns in front of jbegin. For SYM/HERM this starts at column i, since the
// columns below the diagonal are filled by the mirroring loop at the end of the function.
4775  for( size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
4776  reset( C(i,j) );
4777  }
4778  for( size_t j=jbegin; j<jend; ++j )
4779  {
// Summation range [kbegin,kend) for element (i,j), narrowed according to the
// upper/lower structure traits of A and B.
4780  const size_t kbegin( ( IsUpper_v<MT4> )
4781  ?( ( IsLower_v<MT5> )
4782  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4783  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4784  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4785  :( ( IsLower_v<MT5> )
4786  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4787  :( 0UL ) ) );
4788  const size_t kend( ( IsLower_v<MT4> )
4789  ?( ( IsUpper_v<MT5> )
4790  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4791  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4792  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4793  :( ( IsUpper_v<MT5> )
4794  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
4795  :( K ) ) );
4796  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
4797 
// The first product initializes the element, the remaining products are accumulated, and the
// finished sum is scaled once.
4798  C(i,j) = A(i,kbegin) * B(kbegin,j);
4799  for( size_t k=kbegin+1UL; k<kend; ++k ) {
4800  C(i,j) += A(i,k) * B(k,j);
4801  }
4802  C(i,j) *= scalar;
4803  }
// Reset the columns from jend on.
4804  for( size_t j=jend; j<N; ++j ) {
4805  reset( C(i,j) );
4806  }
4807  }
// Trailing rows [iend,M): reset completely.
4808  for( size_t i=iend; i<M; ++i ) {
4809  for( size_t j=0UL; j<N; ++j ) {
4810  reset( C(i,j) );
4811  }
4812  }
4813 
// SYM/HERM: fill the elements below the diagonal from their mirrored counterparts
// (conjugated for HERM).
4814  if( SYM || HERM ) {
4815  for( size_t i=1UL; i<M; ++i ) {
4816  for( size_t j=0UL; j<i; ++j ) {
4817  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
4818  }
4819  }
4820  }
4821  }
4822  //**********************************************************************************************
4823 
4824  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (non-vectorized) kernel for C = scalar * A * B with a column-major target and two
// non-diagonal operands (see the EnableIf_t condition).
//   C      : the target matrix.
//   A, B   : the left-hand and right-hand side multiplication operands.
//   scalar : the scaling factor applied to every computed element.
// Columns outside [jbegin,jend) and, per column, rows outside [ibegin,iend) are reset; every
// element inside is computed as a sum over k in [kbegin,kend) and then scaled. All index
// bounds are derived from the compile-time structure traits of MT4/MT5 and the class flags.
4838  template< typename MT3 // Type of the left-hand side target matrix
4839  , typename MT4 // Type of the left-hand side matrix operand
4840  , typename MT5 // Type of the right-hand side matrix operand
4841  , typename ST2 > // Type of the scalar value
4842  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4843  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4844  {
4845  const size_t M( A.rows() );
4846  const size_t N( B.columns() );
4847  const size_t K( A.columns() );
4848 
// With any structural flag active the result has to be square.
4849  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4850 
// Column range that is actually computed; columns before jbegin and from jend on are only reset.
4851  const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
4852  ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
4853  :( 0UL ) );
4854  const size_t jend( ( IsStrictlyLower_v<MT5> )
4855  ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
4856  :( N ) );
4857  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4858 
// Leading columns [0,jbegin): reset completely.
4859  for( size_t j=0UL; j<jbegin; ++j ) {
4860  for( size_t i=0UL; i<M; ++i ) {
4861  reset( C(i,j) );
4862  }
4863  }
4864  for( size_t j=jbegin; j<jend; ++j )
4865  {
// Row range of column j that is computed. For SYM/HERM/LOW the range starts no earlier than
// row j; for UPP it ends no later than row j+1.
4866  const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
4867  ?( ( IsStrictlyLower_v<MT4> )
4868  ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
4869  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4870  :( ( IsStrictlyLower_v<MT4> )
4871  ?( SYM || HERM || LOW ? max( j, 1UL ) : 1UL )
4872  :( SYM || HERM || LOW ? j : 0UL ) ) );
4873  const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4874  ?( ( IsStrictlyUpper_v<MT4> )
4875  ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
4876  :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
4877  :( ( IsStrictlyUpper_v<MT4> )
4878  ?( UPP ? min(j+1UL,M-1UL) : M-1UL )
4879  :( UPP ? j+1UL : M ) ) );
4880 
// With a structural flag active the two bounds may cross; the whole column is reset in that case.
4881  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
4882  for( size_t i=0UL; i<M; ++i ) {
4883  reset( C(i,j) );
4884  }
4885  continue;
4886  }
4887 
4888  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4889 
// Reset the rows in front of ibegin. For SYM/HERM this starts at row j, since the rows above
// the diagonal are filled by the mirroring loop at the end of the function.
4890  for( size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
4891  reset( C(i,j) );
4892  }
4893  for( size_t i=ibegin; i<iend; ++i )
4894  {
// Summation range [kbegin,kend) for element (i,j), narrowed according to the
// upper/lower structure traits of A and B.
4895  const size_t kbegin( ( IsUpper_v<MT4> )
4896  ?( ( IsLower_v<MT5> )
4897  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4898  , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4899  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4900  :( ( IsLower_v<MT5> )
4901  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4902  :( 0UL ) ) );
4903  const size_t kend( ( IsLower_v<MT4> )
4904  ?( ( IsUpper_v<MT5> )
4905  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4906  , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4907  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4908  :( ( IsUpper_v<MT5> )
4909  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
4910  :( K ) ) );
4911  BLAZE_INTERNAL_ASSERT( kbegin < kend, "Invalid loop indices detected" );
4912 
// The first product initializes the element, the remaining products are accumulated, and the
// finished sum is scaled once.
4913  C(i,j) = A(i,kbegin) * B(kbegin,j);
4914  for( size_t k=kbegin+1UL; k<kend; ++k ) {
4915  C(i,j) += A(i,k) * B(k,j);
4916  }
4917  C(i,j) *= scalar;
4918  }
// Reset the rows from iend on.
4919  for( size_t i=iend; i<M; ++i ) {
4920  reset( C(i,j) );
4921  }
4922  }
// Trailing columns [jend,N): reset completely.
4923  for( size_t j=jend; j<N; ++j ) {
4924  for( size_t i=0UL; i<M; ++i ) {
4925  reset( C(i,j) );
4926  }
4927  }
4928 
// SYM/HERM: fill the elements above the diagonal from their mirrored counterparts
// (conjugated for HERM).
4929  if( SYM || HERM ) {
4930  for( size_t j=1UL; j<N; ++j ) {
4931  for( size_t i=0UL; i<j; ++i ) {
4932  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
4933  }
4934  }
4935  }
4936  }
4937  //**********************************************************************************************
4938 
4939  //**Default assignment to row-major dense matrices (general/diagonal)***************************
4953  template< typename MT3 // Type of the left-hand side target matrix
4954  , typename MT4 // Type of the left-hand side matrix operand
4955  , typename MT5 // Type of the right-hand side matrix operand
4956  , typename ST2 > // Type of the scalar value
4957  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4958  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4959  {
4960  const size_t M( A.rows() );
4961  const size_t N( B.columns() );
4962 
4963  for( size_t i=0UL; i<M; ++i )
4964  {
4965  const size_t jbegin( ( IsUpper_v<MT4> )
4966  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4967  :( 0UL ) );
4968  const size_t jend( ( IsLower_v<MT4> )
4969  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
4970  :( N ) );
4971  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4972 
4973  if( IsUpper_v<MT4> ) {
4974  for( size_t j=0UL; j<jbegin; ++j ) {
4975  reset( C(i,j) );
4976  }
4977  }
4978  for( size_t j=jbegin; j<jend; ++j ) {
4979  C(i,j) = A(i,j) * B(j,j) * scalar;
4980  }
4981  if( IsLower_v<MT4> ) {
4982  for( size_t j=jend; j<N; ++j ) {
4983  reset( C(i,j) );
4984  }
4985  }
4986  }
4987  }
4988  //**********************************************************************************************
4989 
4990  //**Default assignment to column-major dense matrices (general/diagonal)************************
5004  template< typename MT3 // Type of the left-hand side target matrix
5005  , typename MT4 // Type of the left-hand side matrix operand
5006  , typename MT5 // Type of the right-hand side matrix operand
5007  , typename ST2 > // Type of the scalar value
5008  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5009  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5010  {
5011  constexpr size_t block( BLOCK_SIZE );
5012 
5013  const size_t M( A.rows() );
5014  const size_t N( B.columns() );
5015 
5016  for( size_t jj=0UL; jj<N; jj+=block ) {
5017  const size_t jend( min( N, jj+block ) );
5018  for( size_t ii=0UL; ii<M; ii+=block ) {
5019  const size_t iend( min( M, ii+block ) );
5020  for( size_t j=jj; j<jend; ++j )
5021  {
5022  const size_t ibegin( ( IsLower_v<MT4> )
5023  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
5024  :( ii ) );
5025  const size_t ipos( ( IsUpper_v<MT4> )
5026  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
5027  :( iend ) );
5028 
5029  if( IsLower_v<MT4> ) {
5030  for( size_t i=ii; i<ibegin; ++i ) {
5031  reset( C(i,j) );
5032  }
5033  }
5034  for( size_t i=ibegin; i<ipos; ++i ) {
5035  C(i,j) = A(i,j) * B(j,j) * scalar;
5036  }
5037  if( IsUpper_v<MT4> ) {
5038  for( size_t i=ipos; i<iend; ++i ) {
5039  reset( C(i,j) );
5040  }
5041  }
5042  }
5043  }
5044  }
5045  }
5046  //**********************************************************************************************
5047 
5048  //**Default assignment to row-major dense matrices (diagonal/general)***************************
5062  template< typename MT3 // Type of the left-hand side target matrix
5063  , typename MT4 // Type of the left-hand side matrix operand
5064  , typename MT5 // Type of the right-hand side matrix operand
5065  , typename ST2 > // Type of the scalar value
5066  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5067  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5068  {
5069  constexpr size_t block( BLOCK_SIZE );
5070 
5071  const size_t M( A.rows() );
5072  const size_t N( B.columns() );
5073 
5074  for( size_t ii=0UL; ii<M; ii+=block ) {
5075  const size_t iend( min( M, ii+block ) );
5076  for( size_t jj=0UL; jj<N; jj+=block ) {
5077  const size_t jend( min( N, jj+block ) );
5078  for( size_t i=ii; i<iend; ++i )
5079  {
5080  const size_t jbegin( ( IsUpper_v<MT5> )
5081  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
5082  :( jj ) );
5083  const size_t jpos( ( IsLower_v<MT5> )
5084  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
5085  :( jend ) );
5086 
5087  if( IsUpper_v<MT5> ) {
5088  for( size_t j=jj; j<jbegin; ++j ) {
5089  reset( C(i,j) );
5090  }
5091  }
5092  for( size_t j=jbegin; j<jpos; ++j ) {
5093  C(i,j) = A(i,i) * B(i,j) * scalar;
5094  }
5095  if( IsLower_v<MT5> ) {
5096  for( size_t j=jpos; j<jend; ++j ) {
5097  reset( C(i,j) );
5098  }
5099  }
5100  }
5101  }
5102  }
5103  }
5104  //**********************************************************************************************
5105 
5106  //**Default assignment to column-major dense matrices (diagonal/general)************************
5120  template< typename MT3 // Type of the left-hand side target matrix
5121  , typename MT4 // Type of the left-hand side matrix operand
5122  , typename MT5 // Type of the right-hand side matrix operand
5123  , typename ST2 > // Type of the scalar value
5124  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5125  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5126  {
5127  const size_t M( A.rows() );
5128  const size_t N( B.columns() );
5129 
5130  for( size_t j=0UL; j<N; ++j )
5131  {
5132  const size_t ibegin( ( IsLower_v<MT5> )
5133  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5134  :( 0UL ) );
5135  const size_t iend( ( IsUpper_v<MT5> )
5136  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5137  :( M ) );
5138  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5139 
5140  if( IsLower_v<MT5> ) {
5141  for( size_t i=0UL; i<ibegin; ++i ) {
5142  reset( C(i,j) );
5143  }
5144  }
5145  for( size_t i=ibegin; i<iend; ++i ) {
5146  C(i,j) = A(i,i) * B(i,j) * scalar;
5147  }
5148  if( IsUpper_v<MT5> ) {
5149  for( size_t i=iend; i<M; ++i ) {
5150  reset( C(i,j) );
5151  }
5152  }
5153  }
5154  }
5155  //**********************************************************************************************
5156 
5157  //**Default assignment to dense matrices (diagonal/diagonal)************************************
5171  template< typename MT3 // Type of the left-hand side target matrix
5172  , typename MT4 // Type of the left-hand side matrix operand
5173  , typename MT5 // Type of the right-hand side matrix operand
5174  , typename ST2 > // Type of the scalar value
5175  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5176  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5177  {
5178  reset( C );
5179 
5180  for( size_t i=0UL; i<A.rows(); ++i ) {
5181  C(i,i) = A(i,i) * B(i,i) * scalar;
5182  }
5183  }
5184  //**********************************************************************************************
5185 
5186  //**Default assignment to dense matrices (small matrices)***************************************
5200  template< typename MT3 // Type of the left-hand side target matrix
5201  , typename MT4 // Type of the left-hand side matrix operand
5202  , typename MT5 // Type of the right-hand side matrix operand
5203  , typename ST2 > // Type of the scalar value
5204  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5205  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5206  {
5207  selectDefaultAssignKernel( C, A, B, scalar );
5208  }
5209  //**********************************************************************************************
5210 
5211  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
/*!\brief Vectorized default assignment kernel for small matrices with a row-major target
//        (\f$ C=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Enabled only for row-major targets for which UseVectorizedDefaultKernel_v holds. Every
// computed element C(i,j) is the sum() of the SIMD products A.load(i,k) * B.load(k,j) over k,
// multiplied by \a scalar. Rows are processed in pairs (then singly), columns in groups of
// four, two and one. SYM, HERM, LOW and UPP are flags declared outside this function
// (presumably derived from the SF/HF/LF/UF template flags of the class -- confirm); they
// restrict which part of C is computed, the remaining part is mirrored or reset at the end.
*/
5226  template< typename MT3 // Type of the left-hand side target matrix
5227  , typename MT4 // Type of the left-hand side matrix operand
5228  , typename MT5 // Type of the right-hand side matrix operand
5229  , typename ST2 > // Type of the scalar value
5230  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5231  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5232  {
      // A scalar tail loop over k is required unless both operands are padded.
5233  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5234 
5235  const size_t M( A.rows() );
5236  const size_t N( B.columns() );
5237  const size_t K( A.columns() );
5238 
5239  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5240 
      // With both LOW and UPP set, the loops below write diagonal elements only (the row-pair
      // loop is skipped and the single-row loop visits j == i), so C is cleared up front.
5241  if( LOW && UPP ) {
5242  reset( C );
5243  }
5244 
5245  {
5246  size_t i( 0UL );
5247 
      // Two rows of C per iteration.
5248  for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
5249  {
      // Column range of this row pair: ends at i+2 if LOW, starts at i if SYM, HERM or UPP.
5250  const size_t jend( LOW ? i+2UL : N );
5251  size_t j( SYM || HERM || UPP ? i : 0UL );
5252 
      // 2x4 tile of C, eight SIMD accumulators.
5253  for( ; (j+4UL) <= jend; j+=4UL )
5254  {
      // The k range is chosen from the IsUpper_v/IsLower_v traits of the operands. kbegin is
      // masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE (assuming
      // SIMDSIZE is a power of two).
5255  const size_t kbegin( ( IsUpper_v<MT4> )
5256  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5257  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5258  const size_t kend( ( IsLower_v<MT4> )
5259  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
5260  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
5261 
      // With remainder handling the SIMD loop stops at the masked kend; otherwise it runs up
      // to kend itself in steps of SIMDSIZE.
5262  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5263  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5264 
      // NOTE(review): the accumulators are default-constructed and then only updated with +=;
      // this relies on SIMDType's default constructor producing zeros -- confirm.
5265  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5266  size_t k( kbegin );
5267 
5268  for( ; k<kpos; k+=SIMDSIZE ) {
5269  const SIMDType a1( A.load(i ,k) );
5270  const SIMDType a2( A.load(i+1UL,k) );
5271  const SIMDType b1( B.load(k,j ) );
5272  const SIMDType b2( B.load(k,j+1UL) );
5273  const SIMDType b3( B.load(k,j+2UL) );
5274  const SIMDType b4( B.load(k,j+3UL) );
5275  xmm1 += a1 * b1;
5276  xmm2 += a1 * b2;
5277  xmm3 += a1 * b3;
5278  xmm4 += a1 * b4;
5279  xmm5 += a2 * b1;
5280  xmm6 += a2 * b2;
5281  xmm7 += a2 * b3;
5282  xmm8 += a2 * b4;
5283  }
5284 
      // Reduce every accumulator with sum() and apply the scaling factor.
5285  C(i ,j ) = sum( xmm1 ) * scalar;
5286  C(i ,j+1UL) = sum( xmm2 ) * scalar;
5287  C(i ,j+2UL) = sum( xmm3 ) * scalar;
5288  C(i ,j+3UL) = sum( xmm4 ) * scalar;
5289  C(i+1UL,j ) = sum( xmm5 ) * scalar;
5290  C(i+1UL,j+1UL) = sum( xmm6 ) * scalar;
5291  C(i+1UL,j+2UL) = sum( xmm7 ) * scalar;
5292  C(i+1UL,j+3UL) = sum( xmm8 ) * scalar;
5293 
      // Scalar tail for the k values in [kpos,kend) not covered by the SIMD loop.
5294  for( ; remainder && k<kend; ++k ) {
5295  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5296  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5297  C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
5298  C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
5299  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5300  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5301  C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
5302  C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
5303  }
5304  }
5305 
      // 2x2 tile of C, same scheme with four accumulators.
5306  for( ; (j+2UL) <= jend; j+=2UL )
5307  {
5308  const size_t kbegin( ( IsUpper_v<MT4> )
5309  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5310  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5311  const size_t kend( ( IsLower_v<MT4> )
5312  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
5313  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5314 
5315  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5316  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5317 
5318  SIMDType xmm1, xmm2, xmm3, xmm4;
5319  size_t k( kbegin );
5320 
5321  for( ; k<kpos; k+=SIMDSIZE ) {
5322  const SIMDType a1( A.load(i ,k) );
5323  const SIMDType a2( A.load(i+1UL,k) );
5324  const SIMDType b1( B.load(k,j ) );
5325  const SIMDType b2( B.load(k,j+1UL) );
5326  xmm1 += a1 * b1;
5327  xmm2 += a1 * b2;
5328  xmm3 += a2 * b1;
5329  xmm4 += a2 * b2;
5330  }
5331 
5332  C(i ,j ) = sum( xmm1 ) * scalar;
5333  C(i ,j+1UL) = sum( xmm2 ) * scalar;
5334  C(i+1UL,j ) = sum( xmm3 ) * scalar;
5335  C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
5336 
5337  for( ; remainder && k<kend; ++k ) {
5338  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5339  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5340  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5341  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5342  }
5343  }
5344 
      // At most one column is left for this row pair; kend here depends on IsLower_v<MT4> only.
5345  if( j < jend )
5346  {
5347  const size_t kbegin( ( IsUpper_v<MT4> )
5348  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5349  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5350  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
5351 
5352  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5353  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5354 
5355  SIMDType xmm1, xmm2;
5356  size_t k( kbegin );
5357 
5358  for( ; k<kpos; k+=SIMDSIZE ) {
5359  const SIMDType b1( B.load(k,j) );
5360  xmm1 += A.load(i ,k) * b1;
5361  xmm2 += A.load(i+1UL,k) * b1;
5362  }
5363 
5364  C(i ,j) = sum( xmm1 ) * scalar;
5365  C(i+1UL,j) = sum( xmm2 ) * scalar;
5366 
5367  for( ; remainder && k<kend; ++k ) {
5368  C(i ,j) += A(i ,k) * B(k,j) * scalar;
5369  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5370  }
5371  }
5372  }
5373 
      // Rows not handled by the pair loop: a single trailing row, or every row if LOW && UPP.
5374  for( ; i<M; ++i )
5375  {
5376  const size_t jend( LOW ? i+1UL : N );
5377  size_t j( SYM || HERM || UPP ? i : 0UL );
5378 
      // 1x4 tile of C (never entered when LOW && UPP).
5379  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
5380  {
5381  const size_t kbegin( ( IsUpper_v<MT4> )
5382  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5383  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5384  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
5385 
5386  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5387  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5388 
5389  SIMDType xmm1, xmm2, xmm3, xmm4;
5390  size_t k( kbegin );
5391 
5392  for( ; k<kpos; k+=SIMDSIZE ) {
5393  const SIMDType a1( A.load(i,k) );
5394  xmm1 += a1 * B.load(k,j );
5395  xmm2 += a1 * B.load(k,j+1UL);
5396  xmm3 += a1 * B.load(k,j+2UL);
5397  xmm4 += a1 * B.load(k,j+3UL);
5398  }
5399 
5400  C(i,j ) = sum( xmm1 ) * scalar;
5401  C(i,j+1UL) = sum( xmm2 ) * scalar;
5402  C(i,j+2UL) = sum( xmm3 ) * scalar;
5403  C(i,j+3UL) = sum( xmm4 ) * scalar;
5404 
5405  for( ; remainder && k<kend; ++k ) {
5406  C(i,j ) += A(i,k) * B(k,j ) * scalar;
5407  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5408  C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
5409  C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
5410  }
5411  }
5412 
      // 1x2 tile of C (never entered when LOW && UPP).
5413  for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
5414  {
5415  const size_t kbegin( ( IsUpper_v<MT4> )
5416  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5417  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5418  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
5419 
5420  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5421  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5422 
5423  SIMDType xmm1, xmm2;
5424  size_t k( kbegin );
5425 
5426  for( ; k<kpos; k+=SIMDSIZE ) {
5427  const SIMDType a1( A.load(i,k) );
5428  xmm1 += a1 * B.load(k,j );
5429  xmm2 += a1 * B.load(k,j+1UL);
5430  }
5431 
5432  C(i,j ) = sum( xmm1 ) * scalar;
5433  C(i,j+1UL) = sum( xmm2 ) * scalar;
5434 
5435  for( ; remainder && k<kend; ++k ) {
5436  C(i,j ) += A(i,k) * B(k,j ) * scalar;
5437  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5438  }
5439  }
5440 
      // Single elements; the upper end of the k range is always K in this loop.
5441  for( ; j<jend; ++j )
5442  {
5443  const size_t kbegin( ( IsUpper_v<MT4> )
5444  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5445  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5446 
5447  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
5448  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5449 
5450  SIMDType xmm1;
5451  size_t k( kbegin );
5452 
5453  for( ; k<kpos; k+=SIMDSIZE ) {
5454  xmm1 += A.load(i,k) * B.load(k,j);
5455  }
5456 
5457  C(i,j) = sum( xmm1 ) * scalar;
5458 
5459  for( ; remainder && k<K; ++k ) {
5460  C(i,j) += A(i,k) * B(k,j) * scalar;
5461  }
5462  }
5463  }
5464  }
5465 
      // Post-processing of the elements C(i,j) with j < 2*(i/2), which the loops above did not
      // write when a column start of i was used: mirror them from C(j,i) (conjugated if HERM).
5466  if( SYM || HERM ) {
5467  for( size_t i=2UL; i<M; ++i ) {
5468  const size_t jend( 2UL * ( i/2UL ) );
5469  for( size_t j=0UL; j<jend; ++j ) {
5470  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5471  }
5472  }
5473  }
      // LOW only: reset the elements C(i,j) with i < 2*(j/2), i.e. those right of the row-pair
      // column limit used above.
5474  else if( LOW && !UPP ) {
5475  for( size_t j=2UL; j<N; ++j ) {
5476  const size_t iend( 2UL * ( j/2UL ) );
5477  for( size_t i=0UL; i<iend; ++i ) {
5478  reset( C(i,j) );
5479  }
5480  }
5481  }
      // UPP only: reset the elements C(i,j) with j < 2*(i/2), i.e. those left of the row-pair
      // column start used above.
5482  else if( !LOW && UPP ) {
5483  for( size_t i=2UL; i<M; ++i ) {
5484  const size_t jend( 2UL * ( i/2UL ) );
5485  for( size_t j=0UL; j<jend; ++j ) {
5486  reset( C(i,j) );
5487  }
5488  }
5489  }
5490  }
5491  //**********************************************************************************************
5492 
5493  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
/*!\brief Vectorized default assignment kernel for small matrices with a column-major target
//        (\f$ C=s*A*B \f$).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// Enabled only for column-major targets for which UseVectorizedDefaultKernel_v holds. Every
// computed element C(i,j) is the sum() of the SIMD products A.load(i,k) * B.load(k,j) over k,
// multiplied by \a scalar. Rows are processed in groups of four (unrestricted case only), two
// and one; columns in groups of two and one. SYM, HERM, LOW and UPP are flags declared outside
// this function (presumably derived from the SF/HF/LF/UF template flags of the class --
// confirm). For LOW or UPP the target is reset before the computation; for SYM or HERM the
// lower part is mirrored from the upper part afterwards.
*/
5508  template< typename MT3 // Type of the left-hand side target matrix
5509  , typename MT4 // Type of the left-hand side matrix operand
5510  , typename MT5 // Type of the right-hand side matrix operand
5511  , typename ST2 > // Type of the scalar value
5512  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5513  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5514  {
      // A scalar tail loop over k is required unless both operands are padded.
5515  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5516 
5517  const size_t M( A.rows() );
5518  const size_t N( B.columns() );
5519  const size_t K( A.columns() );
5520 
5521  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5522 
      // For LOW and/or UPP the loops below write only part of C, so the whole target is
      // cleared first (this kernel has no reset pass at the end).
5523  if( LOW || UPP ) {
5524  reset( C );
5525  }
5526 
5527  {
5528  size_t i( 0UL );
5529 
      // Four rows of C per iteration; only taken when none of SYM, HERM, LOW, UPP is set.
5530  for( ; !SYM && !HERM && !LOW && !UPP && (i+4UL) <= M; i+=4UL )
5531  {
5532  size_t j( 0UL );
5533 
      // 4x2 tile of C, eight SIMD accumulators.
5534  for( ; (j+2UL) <= N; j+=2UL )
5535  {
      // The k range is chosen from the IsUpper_v/IsLower_v traits of the operands. kbegin is
      // masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of SIMDSIZE (assuming
      // SIMDSIZE is a power of two).
5536  const size_t kbegin( ( IsUpper_v<MT4> )
5537  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5538  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5539  const size_t kend( ( IsLower_v<MT4> )
5540  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
5541  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5542 
      // With remainder handling the SIMD loop stops at the masked kend; otherwise it runs up
      // to kend itself in steps of SIMDSIZE.
5543  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5544  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5545 
      // NOTE(review): the accumulators are default-constructed and then only updated with +=;
      // this relies on SIMDType's default constructor producing zeros -- confirm.
5546  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5547  size_t k( kbegin );
5548 
5549  for( ; k<kpos; k+=SIMDSIZE ) {
5550  const SIMDType a1( A.load(i ,k) );
5551  const SIMDType a2( A.load(i+1UL,k) );
5552  const SIMDType a3( A.load(i+2UL,k) );
5553  const SIMDType a4( A.load(i+3UL,k) );
5554  const SIMDType b1( B.load(k,j ) );
5555  const SIMDType b2( B.load(k,j+1UL) );
5556  xmm1 += a1 * b1;
5557  xmm2 += a1 * b2;
5558  xmm3 += a2 * b1;
5559  xmm4 += a2 * b2;
5560  xmm5 += a3 * b1;
5561  xmm6 += a3 * b2;
5562  xmm7 += a4 * b1;
5563  xmm8 += a4 * b2;
5564  }
5565 
      // Reduce every accumulator with sum() and apply the scaling factor.
5566  C(i ,j ) = sum( xmm1 ) * scalar;
5567  C(i ,j+1UL) = sum( xmm2 ) * scalar;
5568  C(i+1UL,j ) = sum( xmm3 ) * scalar;
5569  C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
5570  C(i+2UL,j ) = sum( xmm5 ) * scalar;
5571  C(i+2UL,j+1UL) = sum( xmm6 ) * scalar;
5572  C(i+3UL,j ) = sum( xmm7 ) * scalar;
5573  C(i+3UL,j+1UL) = sum( xmm8 ) * scalar;
5574 
      // Scalar tail for the k values in [kpos,kend) not covered by the SIMD loop.
5575  for( ; remainder && k<kend; ++k ) {
5576  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5577  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5578  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5579  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5580  C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
5581  C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
5582  C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
5583  C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
5584  }
5585  }
5586 
      // At most one column is left for these four rows; kend depends on IsLower_v<MT4> only.
5587  if( j < N )
5588  {
5589  const size_t kbegin( ( IsUpper_v<MT4> )
5590  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5591  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5592  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
5593 
5594  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5595  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5596 
5597  SIMDType xmm1, xmm2, xmm3, xmm4;
5598  size_t k( kbegin );
5599 
5600  for( ; k<kpos; k+=SIMDSIZE ) {
5601  const SIMDType b1( B.load(k,j) );
5602  xmm1 += A.load(i ,k) * b1;
5603  xmm2 += A.load(i+1UL,k) * b1;
5604  xmm3 += A.load(i+2UL,k) * b1;
5605  xmm4 += A.load(i+3UL,k) * b1;
5606  }
5607 
5608  C(i ,j) = sum( xmm1 ) * scalar;
5609  C(i+1UL,j) = sum( xmm2 ) * scalar;
5610  C(i+2UL,j) = sum( xmm3 ) * scalar;
5611  C(i+3UL,j) = sum( xmm4 ) * scalar;
5612 
5613  for( ; remainder && k<kend; ++k ) {
5614  C(i ,j) += A(i ,k) * B(k,j) * scalar;
5615  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5616  C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
5617  C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
5618  }
5619  }
5620  }
5621 
      // Two rows of C per iteration; this is the first row loop taken when any flag is set.
5622  for( ; (i+2UL) <= M; i+=2UL )
5623  {
      // Column range of this row pair: ends at i+2 if LOW, starts at i if SYM, HERM or UPP.
5624  const size_t jend( LOW ? i+2UL : N );
5625  size_t j( SYM || HERM || UPP ? i : 0UL );
5626 
      // 2x2 tile of C.
5627  for( ; (j+2UL) <= jend; j+=2UL )
5628  {
5629  const size_t kbegin( ( IsUpper_v<MT4> )
5630  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5631  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5632  const size_t kend( ( IsLower_v<MT4> )
5633  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
5634  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5635 
5636  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5637  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5638 
5639  SIMDType xmm1, xmm2, xmm3, xmm4;
5640  size_t k( kbegin );
5641 
5642  for( ; k<kpos; k+=SIMDSIZE ) {
5643  const SIMDType a1( A.load(i ,k) );
5644  const SIMDType a2( A.load(i+1UL,k) );
5645  const SIMDType b1( B.load(k,j ) );
5646  const SIMDType b2( B.load(k,j+1UL) );
5647  xmm1 += a1 * b1;
5648  xmm2 += a1 * b2;
5649  xmm3 += a2 * b1;
5650  xmm4 += a2 * b2;
5651  }
5652 
5653  C(i ,j ) = sum( xmm1 ) * scalar;
5654  C(i ,j+1UL) = sum( xmm2 ) * scalar;
5655  C(i+1UL,j ) = sum( xmm3 ) * scalar;
5656  C(i+1UL,j+1UL) = sum( xmm4 ) * scalar;
5657 
5658  for( ; remainder && k<kend; ++k ) {
5659  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5660  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5661  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5662  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5663  }
5664  }
5665 
      // At most one column is left for this row pair.
5666  if( j < jend )
5667  {
5668  const size_t kbegin( ( IsUpper_v<MT4> )
5669  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5670  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5671  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
5672 
5673  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5674  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5675 
5676  SIMDType xmm1, xmm2;
5677  size_t k( kbegin );
5678 
5679  for( ; k<kpos; k+=SIMDSIZE ) {
5680  const SIMDType b1( B.load(k,j) );
5681  xmm1 += A.load(i ,k) * b1;
5682  xmm2 += A.load(i+1UL,k) * b1;
5683  }
5684 
5685  C(i ,j) = sum( xmm1 ) * scalar;
5686  C(i+1UL,j) = sum( xmm2 ) * scalar;
5687 
5688  for( ; remainder && k<kend; ++k ) {
5689  C(i ,j) += A(i ,k) * B(k,j) * scalar;
5690  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5691  }
5692  }
5693  }
5694 
      // A single trailing row remains when M is odd.
5695  if( i < M )
5696  {
5697  const size_t jend( LOW ? i+1UL : N );
5698  size_t j( SYM || HERM || UPP ? i : 0UL );
5699 
      // 1x2 tile of C.
5700  for( ; (j+2UL) <= jend; j+=2UL )
5701  {
5702  const size_t kbegin( ( IsUpper_v<MT4> )
5703  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5704  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5705  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
5706 
5707  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
5708  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5709 
5710  SIMDType xmm1, xmm2;
5711  size_t k( kbegin );
5712 
5713  for( ; k<kpos; k+=SIMDSIZE ) {
5714  const SIMDType a1( A.load(i,k) );
5715  xmm1 += a1 * B.load(k,j );
5716  xmm2 += a1 * B.load(k,j+1UL);
5717  }
5718 
5719  C(i,j ) = sum( xmm1 ) * scalar;
5720  C(i,j+1UL) = sum( xmm2 ) * scalar;
5721 
5722  for( ; remainder && k<kend; ++k ) {
5723  C(i,j ) += A(i,k) * B(k,j ) * scalar;
5724  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5725  }
5726  }
5727 
      // Last single element of the trailing row; the upper end of the k range is always K.
5728  if( j < jend )
5729  {
5730  const size_t kbegin( ( IsUpper_v<MT4> )
5731  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
5732  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
5733 
5734  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
5735  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
5736 
5737  SIMDType xmm1;
5738  size_t k( kbegin );
5739 
5740  for( ; k<kpos; k+=SIMDSIZE ) {
5741  xmm1 += A.load(i,k) * B.load(k,j);
5742  }
5743 
5744  C(i,j) = sum( xmm1 ) * scalar;
5745 
5746  for( ; remainder && k<K; ++k ) {
5747  C(i,j) += A(i,k) * B(k,j) * scalar;
5748  }
5749  }
5750  }
5751  }
5752 
      // SYM/HERM: every element below the diagonal is overwritten with its counterpart C(j,i)
      // from above the diagonal (conjugated if HERM).
5753  if( SYM || HERM ) {
5754  for( size_t j=0UL; j<N; ++j ) {
5755  for( size_t i=j+1UL; i<M; ++i ) {
5756  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5757  }
5758  }
5759  }
5760  }
5761  //**********************************************************************************************
5762 
5763  //**Default assignment to dense matrices (large matrices)***************************************
5777  template< typename MT3 // Type of the left-hand side target matrix
5778  , typename MT4 // Type of the left-hand side matrix operand
5779  , typename MT5 // Type of the right-hand side matrix operand
5780  , typename ST2 > // Type of the scalar value
5781  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5782  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5783  {
5784  selectDefaultAssignKernel( C, A, B, scalar );
5785  }
5786  //**********************************************************************************************
5787 
5788  //**Vectorized default assignment to dense matrices (large matrices)****************************
5803  template< typename MT3 // Type of the left-hand side target matrix
5804  , typename MT4 // Type of the left-hand side matrix operand
5805  , typename MT5 // Type of the right-hand side matrix operand
5806  , typename ST2 > // Type of the scalar value
5807  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5808  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5809  {
5810  if( SYM )
5811  smmm( C, A, B, scalar );
5812  else if( HERM )
5813  hmmm( C, A, B, scalar );
5814  else if( LOW )
5815  lmmm( C, A, B, scalar, ST2(0) );
5816  else if( UPP )
5817  ummm( C, A, B, scalar, ST2(0) );
5818  else
5819  mmm( C, A, B, scalar, ST2(0) );
5820  }
5821  //**********************************************************************************************
5822 
5823  //**BLAS-based assignment to dense matrices (default)*******************************************
5837  template< typename MT3 // Type of the left-hand side target matrix
5838  , typename MT4 // Type of the left-hand side matrix operand
5839  , typename MT5 // Type of the right-hand side matrix operand
5840  , typename ST2 > // Type of the scalar value
5841  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5842  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
5843  {
5844  selectLargeAssignKernel( C, A, B, scalar );
5845  }
5846  //**********************************************************************************************
5847 
5848  //**BLAS-based assignment to dense matrices*****************************************************
5849 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
5850 
5863  template< typename MT3 // Type of the left-hand side target matrix
5864  , typename MT4 // Type of the left-hand side matrix operand
5865  , typename MT5 // Type of the right-hand side matrix operand
5866  , typename ST2 > // Type of the scalar value
5867  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5868  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
5869  {
5870  using ET = ElementType_t<MT3>;
5871 
5872  if( IsTriangular_v<MT4> ) {
5873  assign( C, B );
5874  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
5875  }
5876  else if( IsTriangular_v<MT5> ) {
5877  assign( C, A );
5878  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
5879  }
5880  else {
5881  gemm( C, A, B, ET(scalar), ET(0) );
5882  }
5883  }
5884 #endif
5885  //**********************************************************************************************
5886 
5887  //**Assignment to sparse matrices***************************************************************
/*!\brief Assignment of a scaled matrix product expression to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
// \return void
//
// The expression is evaluated serially into a temporary of type TmpType, which is then passed
// through a ForwardFunctor object and assigned to the target.
//
// NOTE(review): the numbering of this listing skips lines inside the body (5903, 5906-5912);
// statements of the original source may be missing from this view.
*/
5899  template< typename MT // Type of the target sparse matrix
5900  , bool SO > // Storage order of the target sparse matrix
5901  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5902  {
5904 
      // OppositeType is chosen if SO is true, ResultType otherwise (presumably so that the
      // temporary matches the storage order of the target -- confirm).
5905  using TmpType = If_t< SO, OppositeType, ResultType >;
5906 
5913 
5914  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5915  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5916 
5917  const ForwardFunctor fwd;
5918 
      // Evaluate the complete expression once, then assign the (forwarded) temporary.
5919  const TmpType tmp( serial( rhs ) );
5920  assign( ~lhs, fwd( tmp ) );
5921  }
5922  //**********************************************************************************************
5923 
5924  //**Addition assignment to dense matrices*******************************************************
/*!\brief Addition assignment of a scaled matrix product expression to a dense matrix
//        (\f$ C+=s*A*B \f$).
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
// \return void
//
// Both operands of the wrapped multiplication are evaluated serially and, together with the
// scalar of the expression, handed to selectAddAssignKernel(). Nothing is done if the target
// has no rows or no columns, or if the left operand has no columns.
//
// NOTE(review): the numbering of this listing skips line 5940 inside the body; a statement of
// the original source may be missing from this view.
*/
5936  template< typename MT // Type of the target dense matrix
5937  , bool SO > // Storage order of the target dense matrix
5938  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5939  {
5941 
5942  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5943  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5944 
      // Operands of the matrix multiplication wrapped by this scaling expression.
5945  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
5946  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
5947 
      // Empty target or empty inner dimension: there is nothing to add.
5948  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
5949  return;
5950  }
5951 
5952  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5953  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5954 
5955  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5956  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5957  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5958  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5959  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5960  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
5961 
5962  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
5963  }
5964  //**********************************************************************************************
5965 
5966  //**Addition assignment to dense matrices (kernel selection)************************************
5977  template< typename MT3 // Type of the left-hand side target matrix
5978  , typename MT4 // Type of the left-hand side matrix operand
5979  , typename MT5 // Type of the right-hand side matrix operand
5980  , typename ST2 > // Type of the scalar value
5981  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5982  {
5983  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
5984  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
5985  selectSmallAddAssignKernel( C, A, B, scalar );
5986  else
5987  selectBlasAddAssignKernel( C, A, B, scalar );
5988  }
5989  //**********************************************************************************************
5990 
5991  //**Default addition assignment to dense matrices (general/general)*****************************
6005  template< typename MT3 // Type of the left-hand side target matrix
6006  , typename MT4 // Type of the left-hand side matrix operand
6007  , typename MT5 // Type of the right-hand side matrix operand
6008  , typename ST2 > // Type of the scalar value
6009  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6010  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6011  {
6012  const ResultType tmp( serial( A * B * scalar ) );
6013  addAssign( C, tmp );
6014  }
6015  //**********************************************************************************************
6016 
6017  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
6031  template< typename MT3 // Type of the left-hand side target matrix
6032  , typename MT4 // Type of the left-hand side matrix operand
6033  , typename MT5 // Type of the right-hand side matrix operand
6034  , typename ST2 > // Type of the scalar value
6035  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6036  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6037  {
6038  const size_t M( A.rows() );
6039  const size_t N( B.columns() );
6040 
6041  for( size_t i=0UL; i<M; ++i )
6042  {
6043  const size_t jbegin( ( IsUpper_v<MT4> )
6044  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
6045  :( 0UL ) );
6046  const size_t jend( ( IsLower_v<MT4> )
6047  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
6048  :( N ) );
6049  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6050 
6051  const size_t jnum( jend - jbegin );
6052  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6053 
6054  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6055  C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6056  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6057  }
6058  if( jpos < jend ) {
6059  C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
6060  }
6061  }
6062  }
6063  //**********************************************************************************************
6064 
6065  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
6079  template< typename MT3 // Type of the left-hand side target matrix
6080  , typename MT4 // Type of the left-hand side matrix operand
6081  , typename MT5 // Type of the right-hand side matrix operand
6082  , typename ST2 > // Type of the scalar value
6083  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6084  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6085  {
6086  constexpr size_t block( BLOCK_SIZE );
6087 
6088  const size_t M( A.rows() );
6089  const size_t N( B.columns() );
6090 
6091  for( size_t jj=0UL; jj<N; jj+=block ) {
6092  const size_t jend( min( N, jj+block ) );
6093  for( size_t ii=0UL; ii<M; ii+=block ) {
6094  const size_t iend( min( M, ii+block ) );
6095  for( size_t j=jj; j<jend; ++j )
6096  {
6097  const size_t ibegin( ( IsLower_v<MT4> )
6098  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
6099  :( ii ) );
6100  const size_t ipos( ( IsUpper_v<MT4> )
6101  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
6102  :( iend ) );
6103 
6104  for( size_t i=ibegin; i<ipos; ++i ) {
6105  C(i,j) += A(i,j) * B(j,j) * scalar;
6106  }
6107  }
6108  }
6109  }
6110  }
6111  //**********************************************************************************************
6112 
6113  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
6127  template< typename MT3 // Type of the left-hand side target matrix
6128  , typename MT4 // Type of the left-hand side matrix operand
6129  , typename MT5 // Type of the right-hand side matrix operand
6130  , typename ST2 > // Type of the scalar value
6131  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6132  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6133  {
6134  constexpr size_t block( BLOCK_SIZE );
6135 
6136  const size_t M( A.rows() );
6137  const size_t N( B.columns() );
6138 
6139  for( size_t ii=0UL; ii<M; ii+=block ) {
6140  const size_t iend( min( M, ii+block ) );
6141  for( size_t jj=0UL; jj<N; jj+=block ) {
6142  const size_t jend( min( N, jj+block ) );
6143  for( size_t i=ii; i<iend; ++i )
6144  {
6145  const size_t jbegin( ( IsUpper_v<MT5> )
6146  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
6147  :( jj ) );
6148  const size_t jpos( ( IsLower_v<MT5> )
6149  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
6150  :( jend ) );
6151 
6152  for( size_t j=jbegin; j<jpos; ++j ) {
6153  C(i,j) += A(i,i) * B(i,j) * scalar;
6154  }
6155  }
6156  }
6157  }
6158  }
6159  //**********************************************************************************************
6160 
6161  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
6175  template< typename MT3 // Type of the left-hand side target matrix
6176  , typename MT4 // Type of the left-hand side matrix operand
6177  , typename MT5 // Type of the right-hand side matrix operand
6178  , typename ST2 > // Type of the scalar value
6179  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6180  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6181  {
6182  const size_t M( A.rows() );
6183  const size_t N( B.columns() );
6184 
6185  for( size_t j=0UL; j<N; ++j )
6186  {
6187  const size_t ibegin( ( IsLower_v<MT5> )
6188  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
6189  :( 0UL ) );
6190  const size_t iend( ( IsUpper_v<MT5> )
6191  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
6192  :( M ) );
6193  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6194 
6195  const size_t inum( iend - ibegin );
6196  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6197 
6198  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6199  C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6200  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6201  }
6202  if( ipos < iend ) {
6203  C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
6204  }
6205  }
6206  }
6207  //**********************************************************************************************
6208 
6209  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
6223  template< typename MT3 // Type of the left-hand side target matrix
6224  , typename MT4 // Type of the left-hand side matrix operand
6225  , typename MT5 // Type of the right-hand side matrix operand
6226  , typename ST2 > // Type of the scalar value
6227  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6228  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6229  {
6230  for( size_t i=0UL; i<A.rows(); ++i ) {
6231  C(i,i) += A(i,i) * B(i,i) * scalar;
6232  }
6233  }
6234  //**********************************************************************************************
6235 
6236  //**Default addition assignment to dense matrices (small matrices)******************************
6250  template< typename MT3 // Type of the left-hand side target matrix
6251  , typename MT4 // Type of the left-hand side matrix operand
6252  , typename MT5 // Type of the right-hand side matrix operand
6253  , typename ST2 > // Type of the scalar value
6254  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6255  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6256  {
6257  selectDefaultAddAssignKernel( C, A, B, scalar );
6258  }
6259  //**********************************************************************************************
6260 
6261  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized small-matrix kernel performing C += ( A * B ) * scalar for a row-major target C.
// The overload is enabled only if MT3 is row-major and UseVectorizedDefaultKernel_v holds for
// the given type combination.
//
// \param C The target matrix, updated in place.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor applied to every accumulated product.
//
// Each C(i,j) receives the product of row i of A and column j of B, accumulated along k with
// SIMD loads from both operands. Rows of C are processed in pairs (plus one leftover row),
// columns in groups of four, two and one. The LOW/UPP flags restrict the visited columns per
// row; the IsLower_v/IsUpper_v traits of the operands restrict the k-range.
6276  template< typename MT3 // Type of the left-hand side target matrix
6277  , typename MT4 // Type of the left-hand side matrix operand
6278  , typename MT5 // Type of the right-hand side matrix operand
6279  , typename ST2 > // Type of the scalar value
6280  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6281  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6282  {
// A scalar clean-up loop is required unless both operands are padded.
6283  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
6284 
6285  const size_t M( A.rows() );
6286  const size_t N( B.columns() );
6287  const size_t K( A.columns() );
6288 
6289  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6290 
6291  size_t i( 0UL );
6292 
// Main loop: two rows of C per iteration.
6293  for( ; (i+2UL) <= M; i+=2UL )
6294  {
// LOW limits the columns to j < i+2, UPP lets the columns start at j = i.
6295  const size_t jend( LOW ? i+2UL : N );
6296  size_t j( UPP ? i : 0UL );
6297 
// 2x4 blocks of C; this loop is skipped entirely if both LOW and UPP are set.
6298  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
6299  {
// Start of the k-range, masked with size_t(-SIMDSIZE); this presumably rounds down to a
// multiple of SIMDSIZE (assumes SIMDSIZE is a power of two -- TODO confirm).
6300  const size_t kbegin( ( IsUpper_v<MT4> )
6301  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6302  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6303  const size_t kend( ( IsLower_v<MT4> )
6304  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
6305  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
6306 
// End of the vectorized part of the k-range; equals kend if no remainder handling is needed.
6307  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6308  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6309 
// One SIMD accumulator per element of the 2x4 block.
6310  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6311  size_t k( kbegin );
6312 
6313  for( ; k<kpos; k+=SIMDSIZE ) {
6314  const SIMDType a1( A.load(i ,k) );
6315  const SIMDType a2( A.load(i+1UL,k) );
6316  const SIMDType b1( B.load(k,j ) );
6317  const SIMDType b2( B.load(k,j+1UL) );
6318  const SIMDType b3( B.load(k,j+2UL) );
6319  const SIMDType b4( B.load(k,j+3UL) );
6320  xmm1 += a1 * b1;
6321  xmm2 += a1 * b2;
6322  xmm3 += a1 * b3;
6323  xmm4 += a1 * b4;
6324  xmm5 += a2 * b1;
6325  xmm6 += a2 * b2;
6326  xmm7 += a2 * b3;
6327  xmm8 += a2 * b4;
6328  }
6329 
// Reduce each accumulator via sum(), scale it and add it to the target element.
6330  C(i ,j ) += sum( xmm1 ) * scalar;
6331  C(i ,j+1UL) += sum( xmm2 ) * scalar;
6332  C(i ,j+2UL) += sum( xmm3 ) * scalar;
6333  C(i ,j+3UL) += sum( xmm4 ) * scalar;
6334  C(i+1UL,j ) += sum( xmm5 ) * scalar;
6335  C(i+1UL,j+1UL) += sum( xmm6 ) * scalar;
6336  C(i+1UL,j+2UL) += sum( xmm7 ) * scalar;
6337  C(i+1UL,j+3UL) += sum( xmm8 ) * scalar;
6338 
// Scalar clean-up for the k values the SIMD loop did not cover.
6339  for( ; remainder && k<kend; ++k ) {
6340  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6341  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6342  C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
6343  C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
6344  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6345  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6346  C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
6347  C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
6348  }
6349  }
6350 
// 2x2 blocks of C for the remaining column pairs.
6351  for( ; (j+2UL) <= jend; j+=2UL )
6352  {
6353  const size_t kbegin( ( IsUpper_v<MT4> )
6354  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6355  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6356  const size_t kend( ( IsLower_v<MT4> )
6357  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
6358  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6359 
6360  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6361  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6362 
6363  SIMDType xmm1, xmm2, xmm3, xmm4;
6364  size_t k( kbegin );
6365 
6366  for( ; k<kpos; k+=SIMDSIZE ) {
6367  const SIMDType a1( A.load(i ,k) );
6368  const SIMDType a2( A.load(i+1UL,k) );
6369  const SIMDType b1( B.load(k,j ) );
6370  const SIMDType b2( B.load(k,j+1UL) );
6371  xmm1 += a1 * b1;
6372  xmm2 += a1 * b2;
6373  xmm3 += a2 * b1;
6374  xmm4 += a2 * b2;
6375  }
6376 
6377  C(i ,j ) += sum( xmm1 ) * scalar;
6378  C(i ,j+1UL) += sum( xmm2 ) * scalar;
6379  C(i+1UL,j ) += sum( xmm3 ) * scalar;
6380  C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
6381 
6382  for( ; remainder && k<kend; ++k ) {
6383  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6384  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6385  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6386  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6387  }
6388  }
6389 
// Single leftover column (2x1 block of C).
6390  if( j < jend )
6391  {
6392  const size_t kbegin( ( IsUpper_v<MT4> )
6393  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6394  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6395  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
6396 
6397  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6398  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6399 
6400  SIMDType xmm1, xmm2;
6401  size_t k( kbegin );
6402 
6403  for( ; k<kpos; k+=SIMDSIZE ) {
6404  const SIMDType b1( B.load(k,j) );
6405  xmm1 += A.load(i ,k) * b1;
6406  xmm2 += A.load(i+1UL,k) * b1;
6407  }
6408 
6409  C(i ,j) += sum( xmm1 ) * scalar;
6410  C(i+1UL,j) += sum( xmm2 ) * scalar;
6411 
6412  for( ; remainder && k<kend; ++k ) {
6413  C(i ,j) += A(i ,k) * B(k,j) * scalar;
6414  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6415  }
6416  }
6417  }
6418 
// Single leftover row: the row-pair loop above stops as soon as fewer than two rows remain.
6419  if( i < M )
6420  {
6421  const size_t jend( LOW ? i+1UL : N );
6422  size_t j( UPP ? i : 0UL );
6423 
// 1x4 blocks of C; skipped entirely if both LOW and UPP are set.
6424  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
6425  {
6426  const size_t kbegin( ( IsUpper_v<MT4> )
6427  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6428  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6429  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
6430 
6431  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6432  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6433 
6434  SIMDType xmm1, xmm2, xmm3, xmm4;
6435  size_t k( kbegin );
6436 
6437  for( ; k<kpos; k+=SIMDSIZE ) {
6438  const SIMDType a1( A.load(i,k) );
6439  xmm1 += a1 * B.load(k,j );
6440  xmm2 += a1 * B.load(k,j+1UL);
6441  xmm3 += a1 * B.load(k,j+2UL);
6442  xmm4 += a1 * B.load(k,j+3UL);
6443  }
6444 
6445  C(i,j ) += sum( xmm1 ) * scalar;
6446  C(i,j+1UL) += sum( xmm2 ) * scalar;
6447  C(i,j+2UL) += sum( xmm3 ) * scalar;
6448  C(i,j+3UL) += sum( xmm4 ) * scalar;
6449 
6450  for( ; remainder && k<kend; ++k ) {
6451  C(i,j ) += A(i,k) * B(k,j ) * scalar;
6452  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6453  C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
6454  C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
6455  }
6456  }
6457 
// 1x2 blocks of C for the remaining column pairs.
6458  for( ; (j+2UL) <= jend; j+=2UL )
6459  {
6460  const size_t kbegin( ( IsUpper_v<MT4> )
6461  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6462  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6463  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
6464 
6465  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6466  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6467 
6468  SIMDType xmm1, xmm2;
6469  size_t k( kbegin );
6470 
6471  for( ; k<kpos; k+=SIMDSIZE ) {
6472  const SIMDType a1( A.load(i,k) );
6473  xmm1 += a1 * B.load(k,j );
6474  xmm2 += a1 * B.load(k,j+1UL);
6475  }
6476 
6477  C(i,j ) += sum( xmm1 ) * scalar;
6478  C(i,j+1UL) += sum( xmm2 ) * scalar;
6479 
6480  for( ; remainder && k<kend; ++k ) {
6481  C(i,j ) += A(i,k) * B(k,j ) * scalar;
6482  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6483  }
6484  }
6485 
// Final single element of C; here the k-range always extends to K.
6486  if( j < jend )
6487  {
6488  const size_t kbegin( ( IsUpper_v<MT4> )
6489  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6490  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6491 
6492  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
6493  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6494 
6495  SIMDType xmm1;
6496  size_t k( kbegin );
6497 
6498  for( ; k<kpos; k+=SIMDSIZE ) {
6499  xmm1 += A.load(i,k) * B.load(k,j);
6500  }
6501 
6502  C(i,j) += sum( xmm1 ) * scalar;
6503 
6504  for( ; remainder && k<K; ++k ) {
6505  C(i,j) += A(i,k) * B(k,j) * scalar;
6506  }
6507  }
6508  }
6509  }
6510  //**********************************************************************************************
6511 
6512  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Vectorized small-matrix kernel performing C += ( A * B ) * scalar for a column-major target C.
// The overload is enabled only if MT3 is column-major and UseVectorizedDefaultKernel_v holds
// for the given type combination.
//
// \param C The target matrix, updated in place.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor applied to every accumulated product.
//
// Each C(i,j) receives the product of row i of A and column j of B, accumulated along k with
// SIMD loads from both operands. Rows of C are processed in groups of four (only if neither
// LOW nor UPP is set), then in pairs, then a single leftover row; columns are processed in
// pairs plus one leftover column. The LOW/UPP flags restrict the visited columns per row; the
// IsLower_v/IsUpper_v traits of the operands restrict the k-range.
6527  template< typename MT3 // Type of the left-hand side target matrix
6528  , typename MT4 // Type of the left-hand side matrix operand
6529  , typename MT5 // Type of the right-hand side matrix operand
6530  , typename ST2 > // Type of the scalar value
6531  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6532  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6533  {
// A scalar clean-up loop is required unless both operands are padded.
6534  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
6535 
6536  const size_t M( A.rows() );
6537  const size_t N( B.columns() );
6538  const size_t K( A.columns() );
6539 
6540  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6541 
6542  size_t i( 0UL );
6543 
// Four rows of C per iteration; only taken if neither LOW nor UPP is set, so all N columns
// are visited.
6544  for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
6545  {
6546  size_t j( 0UL );
6547 
// 4x2 blocks of C.
6548  for( ; (j+2UL) <= N; j+=2UL )
6549  {
// Start of the k-range, masked with size_t(-SIMDSIZE); this presumably rounds down to a
// multiple of SIMDSIZE (assumes SIMDSIZE is a power of two -- TODO confirm).
6550  const size_t kbegin( ( IsUpper_v<MT4> )
6551  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6552  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6553  const size_t kend( ( IsLower_v<MT4> )
6554  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
6555  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6556 
// End of the vectorized part of the k-range; equals kend if no remainder handling is needed.
6557  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6558  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6559 
// One SIMD accumulator per element of the 4x2 block.
6560  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6561  size_t k( kbegin );
6562 
6563  for( ; k<kpos; k+=SIMDSIZE ) {
6564  const SIMDType a1( A.load(i ,k) );
6565  const SIMDType a2( A.load(i+1UL,k) );
6566  const SIMDType a3( A.load(i+2UL,k) );
6567  const SIMDType a4( A.load(i+3UL,k) );
6568  const SIMDType b1( B.load(k,j ) );
6569  const SIMDType b2( B.load(k,j+1UL) );
6570  xmm1 += a1 * b1;
6571  xmm2 += a1 * b2;
6572  xmm3 += a2 * b1;
6573  xmm4 += a2 * b2;
6574  xmm5 += a3 * b1;
6575  xmm6 += a3 * b2;
6576  xmm7 += a4 * b1;
6577  xmm8 += a4 * b2;
6578  }
6579 
// Reduce each accumulator via sum(), scale it and add it to the target element.
6580  C(i ,j ) += sum( xmm1 ) * scalar;
6581  C(i ,j+1UL) += sum( xmm2 ) * scalar;
6582  C(i+1UL,j ) += sum( xmm3 ) * scalar;
6583  C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
6584  C(i+2UL,j ) += sum( xmm5 ) * scalar;
6585  C(i+2UL,j+1UL) += sum( xmm6 ) * scalar;
6586  C(i+3UL,j ) += sum( xmm7 ) * scalar;
6587  C(i+3UL,j+1UL) += sum( xmm8 ) * scalar;
6588 
// Scalar clean-up for the k values the SIMD loop did not cover.
6589  for( ; remainder && k<kend; ++k ) {
6590  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6591  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6592  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6593  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6594  C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
6595  C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
6596  C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
6597  C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
6598  }
6599  }
6600 
// Single leftover column (4x1 block of C).
6601  if( j < N )
6602  {
6603  const size_t kbegin( ( IsUpper_v<MT4> )
6604  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6605  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6606  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
6607 
6608  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6609  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6610 
6611  SIMDType xmm1, xmm2, xmm3, xmm4;
6612  size_t k( kbegin );
6613 
6614  for( ; k<kpos; k+=SIMDSIZE ) {
6615  const SIMDType b1( B.load(k,j) );
6616  xmm1 += A.load(i ,k) * b1;
6617  xmm2 += A.load(i+1UL,k) * b1;
6618  xmm3 += A.load(i+2UL,k) * b1;
6619  xmm4 += A.load(i+3UL,k) * b1;
6620  }
6621 
6622  C(i ,j) += sum( xmm1 ) * scalar;
6623  C(i+1UL,j) += sum( xmm2 ) * scalar;
6624  C(i+2UL,j) += sum( xmm3 ) * scalar;
6625  C(i+3UL,j) += sum( xmm4 ) * scalar;
6626 
6627  for( ; remainder && k<kend; ++k ) {
6628  C(i ,j) += A(i ,k) * B(k,j) * scalar;
6629  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6630  C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
6631  C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
6632  }
6633  }
6634  }
6635 
// Two rows of C per iteration; handles all rows if LOW or UPP is set, otherwise the rows
// left over by the four-row loop above.
6636  for( ; (i+2UL) <= M; i+=2UL )
6637  {
// LOW limits the columns to j < i+2, UPP lets the columns start at j = i.
6638  const size_t jend( LOW ? i+2UL : N );
6639  size_t j( UPP ? i : 0UL );
6640 
// 2x2 blocks of C.
6641  for( ; (j+2UL) <= jend; j+=2UL )
6642  {
6643  const size_t kbegin( ( IsUpper_v<MT4> )
6644  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6645  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6646  const size_t kend( ( IsLower_v<MT4> )
6647  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
6648  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6649 
6650  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6651  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6652 
6653  SIMDType xmm1, xmm2, xmm3, xmm4;
6654  size_t k( kbegin );
6655 
6656  for( ; k<kpos; k+=SIMDSIZE ) {
6657  const SIMDType a1( A.load(i ,k) );
6658  const SIMDType a2( A.load(i+1UL,k) );
6659  const SIMDType b1( B.load(k,j ) );
6660  const SIMDType b2( B.load(k,j+1UL) );
6661  xmm1 += a1 * b1;
6662  xmm2 += a1 * b2;
6663  xmm3 += a2 * b1;
6664  xmm4 += a2 * b2;
6665  }
6666 
6667  C(i ,j ) += sum( xmm1 ) * scalar;
6668  C(i ,j+1UL) += sum( xmm2 ) * scalar;
6669  C(i+1UL,j ) += sum( xmm3 ) * scalar;
6670  C(i+1UL,j+1UL) += sum( xmm4 ) * scalar;
6671 
6672  for( ; remainder && k<kend; ++k ) {
6673  C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6674  C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6675  C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6676  C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6677  }
6678  }
6679 
// Single leftover column (2x1 block of C).
6680  if( j < jend )
6681  {
6682  const size_t kbegin( ( IsUpper_v<MT4> )
6683  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6684  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6685  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
6686 
6687  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6688  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6689 
6690  SIMDType xmm1, xmm2;
6691  size_t k( kbegin );
6692 
6693  for( ; k<kpos; k+=SIMDSIZE ) {
6694  const SIMDType b1( B.load(k,j) );
6695  xmm1 += A.load(i ,k) * b1;
6696  xmm2 += A.load(i+1UL,k) * b1;
6697  }
6698 
6699  C(i ,j) += sum( xmm1 ) * scalar;
6700  C(i+1UL,j) += sum( xmm2 ) * scalar;
6701 
6702  for( ; remainder && k<kend; ++k ) {
6703  C(i ,j) += A(i ,k) * B(k,j) * scalar;
6704  C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6705  }
6706  }
6707  }
6708 
// Single leftover row: the row-pair loop above stops as soon as fewer than two rows remain.
6709  if( i < M )
6710  {
6711  const size_t jend( LOW ? i+1UL : N );
6712  size_t j( UPP ? i : 0UL );
6713 
// 1x2 blocks of C.
6714  for( ; (j+2UL) <= jend; j+=2UL )
6715  {
6716  const size_t kbegin( ( IsUpper_v<MT4> )
6717  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6718  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6719  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
6720 
6721  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
6722  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6723 
6724  SIMDType xmm1, xmm2;
6725  size_t k( kbegin );
6726 
6727  for( ; k<kpos; k+=SIMDSIZE ) {
6728  const SIMDType a1( A.load(i,k) );
6729  xmm1 += a1 * B.load(k,j );
6730  xmm2 += a1 * B.load(k,j+1UL);
6731  }
6732 
6733  C(i,j ) += sum( xmm1 ) * scalar;
6734  C(i,j+1UL) += sum( xmm2 ) * scalar;
6735 
6736  for( ; remainder && k<kend; ++k ) {
6737  C(i,j ) += A(i,k) * B(k,j ) * scalar;
6738  C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6739  }
6740  }
6741 
// Final single element of C; here the k-range always extends to K.
6742  if( j < jend )
6743  {
6744  const size_t kbegin( ( IsUpper_v<MT4> )
6745  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
6746  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
6747 
6748  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
6749  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
6750 
6751  SIMDType xmm1;
6752  size_t k( kbegin );
6753 
6754  for( ; k<kpos; k+=SIMDSIZE ) {
6755  xmm1 += A.load(i,k) * B.load(k,j);
6756  }
6757 
6758  C(i,j) += sum( xmm1 ) * scalar;
6759 
6760  for( ; remainder && k<K; ++k ) {
6761  C(i,j) += A(i,k) * B(k,j) * scalar;
6762  }
6763  }
6764  }
6765  }
6766  //**********************************************************************************************
6767 
6768  //**Default addition assignment to dense matrices (large matrices)******************************
6782  template< typename MT3 // Type of the left-hand side target matrix
6783  , typename MT4 // Type of the left-hand side matrix operand
6784  , typename MT5 // Type of the right-hand side matrix operand
6785  , typename ST2 > // Type of the scalar value
6786  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6787  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6788  {
6789  selectDefaultAddAssignKernel( C, A, B, scalar );
6790  }
6791  //**********************************************************************************************
6792 
6793  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
6808  template< typename MT3 // Type of the left-hand side target matrix
6809  , typename MT4 // Type of the left-hand side matrix operand
6810  , typename MT5 // Type of the right-hand side matrix operand
6811  , typename ST2 > // Type of the scalar value
6812  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6813  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6814  {
6815  if( LOW )
6816  lmmm( C, A, B, scalar, ST2(1) );
6817  else if( UPP )
6818  ummm( C, A, B, scalar, ST2(1) );
6819  else
6820  mmm( C, A, B, scalar, ST2(1) );
6821  }
6822  //**********************************************************************************************
6823 
6824  //**BLAS-based addition assignment to dense matrices (default)**********************************
6838  template< typename MT3 // Type of the left-hand side target matrix
6839  , typename MT4 // Type of the left-hand side matrix operand
6840  , typename MT5 // Type of the right-hand side matrix operand
6841  , typename ST2 > // Type of the scalar value
6842  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6843  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6844  {
6845  selectLargeAddAssignKernel( C, A, B, scalar );
6846  }
6847  //**********************************************************************************************
6848 
6849  //**BLAS-based addition assignment to dense matrices********************************************
6850 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6851 
6864  template< typename MT3 // Type of the left-hand side target matrix
6865  , typename MT4 // Type of the left-hand side matrix operand
6866  , typename MT5 // Type of the right-hand side matrix operand
6867  , typename ST2 > // Type of the scalar value
6868  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6869  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6870  {
6871  using ET = ElementType_t<MT3>;
6872 
6873  if( IsTriangular_v<MT4> ) {
6874  ResultType_t<MT3> tmp( serial( B ) );
6875  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6876  addAssign( C, tmp );
6877  }
6878  else if( IsTriangular_v<MT5> ) {
6879  ResultType_t<MT3> tmp( serial( A ) );
6880  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6881  addAssign( C, tmp );
6882  }
6883  else {
6884  gemm( C, A, B, ET(scalar), ET(1) );
6885  }
6886  }
6887 #endif
6888  //**********************************************************************************************
6889 
6890  //**Addition assignment to sparse matrices******************************************************
6891  // No special implementation for the addition assignment to sparse matrices.
6892  //**********************************************************************************************
6893 
6894  //**Subtraction assignment to dense matrices****************************************************
6906  template< typename MT // Type of the target dense matrix
6907  , bool SO > // Storage order of the target dense matrix
6908  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6909  {
6911 
6912  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6913  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6914 
6915  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6916  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6917 
6918  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6919  return;
6920  }
6921 
6922  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6923  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6924 
6925  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6926  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6927  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6928  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6929  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6930  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6931 
6932  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
6933  }
6934  //**********************************************************************************************
6935 
6936  //**Subtraction assignment to dense matrices (kernel selection)*********************************
6947  template< typename MT3 // Type of the left-hand side target matrix
6948  , typename MT4 // Type of the left-hand side matrix operand
6949  , typename MT5 // Type of the right-hand side matrix operand
6950  , typename ST2 > // Type of the scalar value
6951  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6952  {
6953  if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
6954  ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
6955  selectSmallSubAssignKernel( C, A, B, scalar );
6956  else
6957  selectBlasSubAssignKernel( C, A, B, scalar );
6958  }
6959  //**********************************************************************************************
6960 
6961  //**Default subtraction assignment to dense matrices (general/general)**************************
6975  template< typename MT3 // Type of the left-hand side target matrix
6976  , typename MT4 // Type of the left-hand side matrix operand
6977  , typename MT5 // Type of the right-hand side matrix operand
6978  , typename ST2 > // Type of the scalar value
6979  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6980  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6981  {
6982  const ResultType tmp( serial( A * B * scalar ) );
6983  subAssign( C, tmp );
6984  }
6985  //**********************************************************************************************
6986 
6987  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
7001  template< typename MT3 // Type of the left-hand side target matrix
7002  , typename MT4 // Type of the left-hand side matrix operand
7003  , typename MT5 // Type of the right-hand side matrix operand
7004  , typename ST2 > // Type of the scalar value
7005  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7006  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7007  {
7008  const size_t M( A.rows() );
7009  const size_t N( B.columns() );
7010 
7011  for( size_t i=0UL; i<M; ++i )
7012  {
7013  const size_t jbegin( ( IsUpper_v<MT4> )
7014  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7015  :( 0UL ) );
7016  const size_t jend( ( IsLower_v<MT4> )
7017  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
7018  :( N ) );
7019  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7020 
7021  const size_t jnum( jend - jbegin );
7022  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
7023 
7024  for( size_t j=jbegin; j<jpos; j+=2UL ) {
7025  C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7026  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
7027  }
7028  if( jpos < jend ) {
7029  C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
7030  }
7031  }
7032  }
7033  //**********************************************************************************************
7034 
7035  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
7049  template< typename MT3 // Type of the left-hand side target matrix
7050  , typename MT4 // Type of the left-hand side matrix operand
7051  , typename MT5 // Type of the right-hand side matrix operand
7052  , typename ST2 > // Type of the scalar value
7053  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7054  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7055  {
7056  constexpr size_t block( BLOCK_SIZE );
7057 
7058  const size_t M( A.rows() );
7059  const size_t N( B.columns() );
7060 
7061  for( size_t jj=0UL; jj<N; jj+=block ) {
7062  const size_t jend( min( N, jj+block ) );
7063  for( size_t ii=0UL; ii<M; ii+=block ) {
7064  const size_t iend( min( M, ii+block ) );
7065  for( size_t j=jj; j<jend; ++j )
7066  {
7067  const size_t ibegin( ( IsLower_v<MT4> )
7068  ?( max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
7069  :( ii ) );
7070  const size_t ipos( ( IsUpper_v<MT4> )
7071  ?( min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
7072  :( iend ) );
7073 
7074  for( size_t i=ibegin; i<ipos; ++i ) {
7075  C(i,j) -= A(i,j) * B(j,j) * scalar;
7076  }
7077  }
7078  }
7079  }
7080  }
7081  //**********************************************************************************************
7082 
7083  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
7098  template< typename MT3 // Type of the left-hand side target matrix
7099  , typename MT4 // Type of the left-hand side matrix operand
7100  , typename MT5 // Type of the right-hand side matrix operand
7101  , typename ST2 > // Type of the scalar value
7102  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7103  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7104  {
7105  constexpr size_t block( BLOCK_SIZE );
7106 
7107  const size_t M( A.rows() );
7108  const size_t N( B.columns() );
7109 
7110  for( size_t ii=0UL; ii<M; ii+=block ) {
7111  const size_t iend( min( M, ii+block ) );
7112  for( size_t jj=0UL; jj<N; jj+=block ) {
7113  const size_t jend( min( N, jj+block ) );
7114  for( size_t i=ii; i<iend; ++i )
7115  {
7116  const size_t jbegin( ( IsUpper_v<MT5> )
7117  ?( max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
7118  :( jj ) );
7119  const size_t jpos( ( IsLower_v<MT5> )
7120  ?( min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
7121  :( jend ) );
7122 
7123  for( size_t j=jbegin; j<jpos; ++j ) {
7124  C(i,j) -= A(i,i) * B(i,j) * scalar;
7125  }
7126  }
7127  }
7128  }
7129  }
7130  //**********************************************************************************************
7131 
7132  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
7147  template< typename MT3 // Type of the left-hand side target matrix
7148  , typename MT4 // Type of the left-hand side matrix operand
7149  , typename MT5 // Type of the right-hand side matrix operand
7150  , typename ST2 > // Type of the scalar value
7151  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7152  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7153  {
7154  const size_t M( A.rows() );
7155  const size_t N( B.columns() );
7156 
7157  for( size_t j=0UL; j<N; ++j )
7158  {
7159  const size_t ibegin( ( IsLower_v<MT5> )
7160  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7161  :( 0UL ) );
7162  const size_t iend( ( IsUpper_v<MT5> )
7163  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7164  :( M ) );
7165  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7166 
7167  const size_t inum( iend - ibegin );
7168  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7169 
7170  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7171  C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7172  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
7173  }
7174  if( ipos < iend ) {
7175  C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
7176  }
7177  }
7178  }
7179  //**********************************************************************************************
7180 
7181  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
7195  template< typename MT3 // Type of the left-hand side target matrix
7196  , typename MT4 // Type of the left-hand side matrix operand
7197  , typename MT5 // Type of the right-hand side matrix operand
7198  , typename ST2 > // Type of the scalar value
7199  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7200  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7201  {
7202  for( size_t i=0UL; i<A.rows(); ++i ) {
7203  C(i,i) -= A(i,i) * B(i,i) * scalar;
7204  }
7205  }
7206  //**********************************************************************************************
7207 
7208  //**Default subtraction assignment to dense matrices (small matrices)***************************
7222  template< typename MT3 // Type of the left-hand side target matrix
7223  , typename MT4 // Type of the left-hand side matrix operand
7224  , typename MT5 // Type of the right-hand side matrix operand
7225  , typename ST2 > // Type of the scalar value
7226  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7227  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7228  {
7229  selectDefaultSubAssignKernel( C, A, B, scalar );
7230  }
7231  //**********************************************************************************************
7232 
7233  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Vectorized small-matrix kernel: C -= ( A * B ) * scalar for a row-major target matrix.
// Enabled when IsRowMajorMatrix_v<MT3> and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> hold.
//   C      : target matrix, updated in place
//   A      : left-hand side operand, accessed via A.load(i,k) with k advancing by SIMDSIZE
//   B      : right-hand side operand, accessed via B.load(k,j) with k advancing by SIMDSIZE
//   scalar : factor applied to every product before it is subtracted from C
// Every C(i,j) is formed as a dot product over k: SIMD accumulators collect the products of
// SIMDSIZE-wide chunks, sum() reduces each accumulator, and a scalar loop handles leftover k.
// Rows are processed in pairs (plus one trailing row), columns in tiles of 4, 2 and 1.
// LOW and UPP come from the enclosing scope (not visible in this block); they restrict the
// column range that is visited for each row.
// NOTE(review): the accumulators are declared without an initializer, so this relies on
// SIMDType default-constructing to zero -- confirm.
7248  template< typename MT3 // Type of the left-hand side target matrix
7249  , typename MT4 // Type of the left-hand side matrix operand
7250  , typename MT5 // Type of the right-hand side matrix operand
7251  , typename ST2 > // Type of the scalar value
7252  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7253  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7254  {
      // A scalar tail loop is only required if at least one operand is not padded.
7255  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
7256
7257  const size_t M( A.rows() );
7258  const size_t N( B.columns() );
7259  const size_t K( A.columns() );
7260
      // With LOW or UPP set the result is expected to be square.
7261  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7262
7263  size_t i( 0UL );
7264
      // Main loop: two rows of C per iteration.
7265  for( ; (i+2UL) <= M; i+=2UL )
7266  {
      // LOW: stop after column i+1; UPP: start at column i; otherwise the full column range.
7267  const size_t jend( LOW ? i+2UL : N );
7268  size_t j( UPP ? i : 0UL );
7269
      // 2x4 tile of C; this loop is skipped entirely when both LOW and UPP are set.
7270  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
7271  {
      // k-range narrowed according to IsUpper_v/IsLower_v of the operands (presumably to skip
      // structural zeros); the start is masked down with size_t(-SIMDSIZE).
7272  const size_t kbegin( ( IsUpper_v<MT4> )
7273  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7274  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7275  const size_t kend( ( IsLower_v<MT4> )
7276  ?( IsUpper_v<MT5> ? min( i+2UL, j+4UL ) : ( i+2UL ) )
7277  :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
7278
      // End of the SIMD range: kend rounded down to a multiple of SIMDSIZE if a tail loop exists.
7279  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7280  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7281
      // One accumulator per element of the 2x4 tile.
7282  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7283  size_t k( kbegin );
7284
7285  for( ; k<kpos; k+=SIMDSIZE ) {
7286  const SIMDType a1( A.load(i ,k) );
7287  const SIMDType a2( A.load(i+1UL,k) );
7288  const SIMDType b1( B.load(k,j ) );
7289  const SIMDType b2( B.load(k,j+1UL) );
7290  const SIMDType b3( B.load(k,j+2UL) );
7291  const SIMDType b4( B.load(k,j+3UL) );
7292  xmm1 += a1 * b1;
7293  xmm2 += a1 * b2;
7294  xmm3 += a1 * b3;
7295  xmm4 += a1 * b4;
7296  xmm5 += a2 * b1;
7297  xmm6 += a2 * b2;
7298  xmm7 += a2 * b3;
7299  xmm8 += a2 * b4;
7300  }
7301
      // Reduce each accumulator, scale it and subtract it from the matching element of C.
7302  C(i ,j ) -= sum( xmm1 ) * scalar;
7303  C(i ,j+1UL) -= sum( xmm2 ) * scalar;
7304  C(i ,j+2UL) -= sum( xmm3 ) * scalar;
7305  C(i ,j+3UL) -= sum( xmm4 ) * scalar;
7306  C(i+1UL,j ) -= sum( xmm5 ) * scalar;
7307  C(i+1UL,j+1UL) -= sum( xmm6 ) * scalar;
7308  C(i+1UL,j+2UL) -= sum( xmm7 ) * scalar;
7309  C(i+1UL,j+3UL) -= sum( xmm8 ) * scalar;
7310
      // Scalar tail for the k values left over after the SIMD loop.
7311  for( ; remainder && k<kend; ++k ) {
7312  C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7313  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7314  C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
7315  C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
7316  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7317  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7318  C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
7319  C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
7320  }
7321  }
7322
      // 2x2 tile of C for the columns left over by the 4-column loop.
7323  for( ; (j+2UL) <= jend; j+=2UL )
7324  {
7325  const size_t kbegin( ( IsUpper_v<MT4> )
7326  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7327  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7328  const size_t kend( ( IsLower_v<MT4> )
7329  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
7330  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7331
7332  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7333  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7334
7335  SIMDType xmm1, xmm2, xmm3, xmm4;
7336  size_t k( kbegin );
7337
7338  for( ; k<kpos; k+=SIMDSIZE ) {
7339  const SIMDType a1( A.load(i ,k) );
7340  const SIMDType a2( A.load(i+1UL,k) );
7341  const SIMDType b1( B.load(k,j ) );
7342  const SIMDType b2( B.load(k,j+1UL) );
7343  xmm1 += a1 * b1;
7344  xmm2 += a1 * b2;
7345  xmm3 += a2 * b1;
7346  xmm4 += a2 * b2;
7347  }
7348
7349  C(i ,j ) -= sum( xmm1 ) * scalar;
7350  C(i ,j+1UL) -= sum( xmm2 ) * scalar;
7351  C(i+1UL,j ) -= sum( xmm3 ) * scalar;
7352  C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
7353
7354  for( ; remainder && k<kend; ++k ) {
7355  C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7356  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7357  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7358  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7359  }
7360  }
7361
      // 2x1 tile: a single remaining column for the current pair of rows.
7362  if( j < jend )
7363  {
7364  const size_t kbegin( ( IsUpper_v<MT4> )
7365  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7366  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7367  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
7368
7369  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7370  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7371
7372  SIMDType xmm1, xmm2;
7373  size_t k( kbegin );
7374
7375  for( ; k<kpos; k+=SIMDSIZE ) {
7376  const SIMDType b1( B.load(k,j) );
7377  xmm1 += A.load(i ,k) * b1;
7378  xmm2 += A.load(i+1UL,k) * b1;
7379  }
7380
7381  C(i ,j) -= sum( xmm1 ) * scalar;
7382  C(i+1UL,j) -= sum( xmm2 ) * scalar;
7383
7384  for( ; remainder && k<kend; ++k ) {
7385  C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7386  C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7387  }
7388  }
7389  }
7390
      // Trailing single row (reached when the two-row loop leaves one row unprocessed).
7391  if( i < M )
7392  {
7393  const size_t jend( LOW ? i+1UL : N );
7394  size_t j( UPP ? i : 0UL );
7395
      // 1x4 tile; skipped when both LOW and UPP are set.
7396  for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
7397  {
7398  const size_t kbegin( ( IsUpper_v<MT4> )
7399  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7400  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7401  const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
7402
7403  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7404  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7405
7406  SIMDType xmm1, xmm2, xmm3, xmm4;
7407  size_t k( kbegin );
7408
7409  for( ; k<kpos; k+=SIMDSIZE ) {
7410  const SIMDType a1( A.load(i,k) );
7411  xmm1 += a1 * B.load(k,j );
7412  xmm2 += a1 * B.load(k,j+1UL);
7413  xmm3 += a1 * B.load(k,j+2UL);
7414  xmm4 += a1 * B.load(k,j+3UL);
7415  }
7416
7417  C(i,j ) -= sum( xmm1 ) * scalar;
7418  C(i,j+1UL) -= sum( xmm2 ) * scalar;
7419  C(i,j+2UL) -= sum( xmm3 ) * scalar;
7420  C(i,j+3UL) -= sum( xmm4 ) * scalar;
7421
7422  for( ; remainder && k<kend; ++k ) {
7423  C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7424  C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7425  C(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
7426  C(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
7427  }
7428  }
7429
      // 1x2 tile.
7430  for( ; (j+2UL) <= jend; j+=2UL )
7431  {
7432  const size_t kbegin( ( IsUpper_v<MT4> )
7433  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7434  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7435  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
7436
7437  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7438  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7439
7440  SIMDType xmm1, xmm2;
7441  size_t k( kbegin );
7442
7443  for( ; k<kpos; k+=SIMDSIZE ) {
7444  const SIMDType a1( A.load(i,k) );
7445  xmm1 += a1 * B.load(k,j );
7446  xmm2 += a1 * B.load(k,j+1UL);
7447  }
7448
7449  C(i,j ) -= sum( xmm1 ) * scalar;
7450  C(i,j+1UL) -= sum( xmm2 ) * scalar;
7451
7452  for( ; remainder && k<kend; ++k ) {
7453  C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7454  C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7455  }
7456  }
7457
      // Final single element; here the k-range always runs up to K.
7458  if( j < jend )
7459  {
7460  const size_t kbegin( ( IsUpper_v<MT4> )
7461  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7462  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7463
7464  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
7465  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7466
7467  SIMDType xmm1;
7468  size_t k( kbegin );
7469
7470  for( ; k<kpos; k+=SIMDSIZE ) {
7471  xmm1 += A.load(i,k) * B.load(k,j);
7472  }
7473
7474  C(i,j) -= sum( xmm1 ) * scalar;
7475
7476  for( ; remainder && k<K; ++k ) {
7477  C(i,j) -= A(i,k) * B(k,j) * scalar;
7478  }
7479  }
7480  }
7481  }
7482  //**********************************************************************************************
7483 
7484  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Vectorized small-matrix kernel: C -= ( A * B ) * scalar for a column-major target matrix.
// Enabled when IsColumnMajorMatrix_v<MT3> and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> hold.
//   C      : target matrix, updated in place
//   A      : left-hand side operand, accessed via A.load(i,k) with k advancing by SIMDSIZE
//   B      : right-hand side operand, accessed via B.load(k,j) with k advancing by SIMDSIZE
//   scalar : factor applied to every product before it is subtracted from C
// Every C(i,j) is formed as a dot product over k: SIMD accumulators collect the products of
// SIMDSIZE-wide chunks, sum() reduces each accumulator, and a scalar loop handles leftover k.
// Rows are processed in groups of 4 (only if neither LOW nor UPP is set), then 2, then 1;
// columns in tiles of 2 and 1.
// LOW and UPP come from the enclosing scope (not visible in this block); they restrict the
// column range that is visited for each row.
// NOTE(review): the accumulators are declared without an initializer, so this relies on
// SIMDType default-constructing to zero -- confirm.
7499  template< typename MT3 // Type of the left-hand side target matrix
7500  , typename MT4 // Type of the left-hand side matrix operand
7501  , typename MT5 // Type of the right-hand side matrix operand
7502  , typename ST2 > // Type of the scalar value
7503  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7504  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7505  {
      // A scalar tail loop is only required if at least one operand is not padded.
7506  constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
7507
7508  const size_t M( A.rows() );
7509  const size_t N( B.columns() );
7510  const size_t K( A.columns() );
7511
      // With LOW or UPP set the result is expected to be square.
7512  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7513
7514  size_t i( 0UL );
7515
      // Four rows of C per iteration; only taken when neither LOW nor UPP is set, so the
      // full column range [0,N) is visited.
7516  for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
7517  {
7518  size_t j( 0UL );
7519
      // 4x2 tile of C.
7520  for( ; (j+2UL) <= N; j+=2UL )
7521  {
      // k-range narrowed according to IsUpper_v/IsLower_v of the operands (presumably to skip
      // structural zeros); the start is masked down with size_t(-SIMDSIZE).
7522  const size_t kbegin( ( IsUpper_v<MT4> )
7523  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7524  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7525  const size_t kend( ( IsLower_v<MT4> )
7526  ?( IsUpper_v<MT5> ? min( i+4UL, j+2UL ) : ( i+4UL ) )
7527  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7528
      // End of the SIMD range: kend rounded down to a multiple of SIMDSIZE if a tail loop exists.
7529  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7530  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7531
      // One accumulator per element of the 4x2 tile.
7532  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7533  size_t k( kbegin );
7534
7535  for( ; k<kpos; k+=SIMDSIZE )
7536  {
7537  const SIMDType a1( A.load(i ,k) );
7538  const SIMDType a2( A.load(i+1UL,k) );
7539  const SIMDType a3( A.load(i+2UL,k) );
7540  const SIMDType a4( A.load(i+3UL,k) );
7541  const SIMDType b1( B.load(k,j ) );
7542  const SIMDType b2( B.load(k,j+1UL) );
7543  xmm1 += a1 * b1;
7544  xmm2 += a1 * b2;
7545  xmm3 += a2 * b1;
7546  xmm4 += a2 * b2;
7547  xmm5 += a3 * b1;
7548  xmm6 += a3 * b2;
7549  xmm7 += a4 * b1;
7550  xmm8 += a4 * b2;
7551  }
7552
      // Reduce each accumulator, scale it and subtract it from the matching element of C.
7553  C(i ,j ) -= sum( xmm1 ) * scalar;
7554  C(i ,j+1UL) -= sum( xmm2 ) * scalar;
7555  C(i+1UL,j ) -= sum( xmm3 ) * scalar;
7556  C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
7557  C(i+2UL,j ) -= sum( xmm5 ) * scalar;
7558  C(i+2UL,j+1UL) -= sum( xmm6 ) * scalar;
7559  C(i+3UL,j ) -= sum( xmm7 ) * scalar;
7560  C(i+3UL,j+1UL) -= sum( xmm8 ) * scalar;
7561
      // Scalar tail for the k values left over after the SIMD loop.
7562  for( ; remainder && k<kend; ++k ) {
7563  C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7564  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7565  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7566  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7567  C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
7568  C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
7569  C(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
7570  C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
7571  }
7572  }
7573
      // 4x1 tile: a single remaining column for the current four rows.
7574  if( j < N )
7575  {
7576  const size_t kbegin( ( IsUpper_v<MT4> )
7577  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7578  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7579  const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
7580
7581  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7582  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7583
7584  SIMDType xmm1, xmm2, xmm3, xmm4;
7585  size_t k( kbegin );
7586
7587  for( ; k<kpos; k+=SIMDSIZE ) {
7588  const SIMDType b1( B.load(k,j) );
7589  xmm1 += A.load(i ,k) * b1;
7590  xmm2 += A.load(i+1UL,k) * b1;
7591  xmm3 += A.load(i+2UL,k) * b1;
7592  xmm4 += A.load(i+3UL,k) * b1;
7593  }
7594
7595  C(i ,j) -= sum( xmm1 ) * scalar;
7596  C(i+1UL,j) -= sum( xmm2 ) * scalar;
7597  C(i+2UL,j) -= sum( xmm3 ) * scalar;
7598  C(i+3UL,j) -= sum( xmm4 ) * scalar;
7599
7600  for( ; remainder && k<kend; ++k ) {
7601  C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7602  C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7603  C(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
7604  C(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
7605  }
7606  }
7607  }
7608
      // Two rows of C per iteration (all rows when LOW or UPP is set, leftovers otherwise).
7609  for( ; (i+2UL) <= M; i+=2UL )
7610  {
      // LOW: stop after column i+1; UPP: start at column i; otherwise the full column range.
7611  const size_t jend( LOW ? i+2UL : N );
7612  size_t j( UPP ? i : 0UL );
7613
      // 2x2 tile of C.
7614  for( ; (j+2UL) <= jend; j+=2UL )
7615  {
7616  const size_t kbegin( ( IsUpper_v<MT4> )
7617  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7618  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7619  const size_t kend( ( IsLower_v<MT4> )
7620  ?( IsUpper_v<MT5> ? min( i+2UL, j+2UL ) : ( i+2UL ) )
7621  :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7622
7623  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7624  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7625
7626  SIMDType xmm1, xmm2, xmm3, xmm4;
7627  size_t k( kbegin );
7628
7629  for( ; k<kpos; k+=SIMDSIZE ) {
7630  const SIMDType a1( A.load(i ,k) );
7631  const SIMDType a2( A.load(i+1UL,k) );
7632  const SIMDType b1( B.load(k,j ) );
7633  const SIMDType b2( B.load(k,j+1UL) );
7634  xmm1 += a1 * b1;
7635  xmm2 += a1 * b2;
7636  xmm3 += a2 * b1;
7637  xmm4 += a2 * b2;
7638  }
7639
7640  C(i ,j ) -= sum( xmm1 ) * scalar;
7641  C(i ,j+1UL) -= sum( xmm2 ) * scalar;
7642  C(i+1UL,j ) -= sum( xmm3 ) * scalar;
7643  C(i+1UL,j+1UL) -= sum( xmm4 ) * scalar;
7644
7645  for( ; remainder && k<kend; ++k ) {
7646  C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7647  C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7648  C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7649  C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7650  }
7651  }
7652
      // 2x1 tile: a single remaining column for the current pair of rows.
7653  if( j < jend )
7654  {
7655  const size_t kbegin( ( IsUpper_v<MT4> )
7656  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7657  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7658  const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
7659
7660  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7661  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7662
7663  SIMDType xmm1, xmm2;
7664  size_t k( kbegin );
7665
7666  for( ; k<kpos; k+=SIMDSIZE ) {
7667  const SIMDType b1( B.load(k,j) );
7668  xmm1 += A.load(i ,k) * b1;
7669  xmm2 += A.load(i+1UL,k) * b1;
7670  }
7671
7672  C(i ,j) -= sum( xmm1 ) * scalar;
7673  C(i+1UL,j) -= sum( xmm2 ) * scalar;
7674
7675  for( ; remainder && k<kend; ++k ) {
7676  C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7677  C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7678  }
7679  }
7680  }
7681
      // Trailing single row (reached when the two-row loop leaves one row unprocessed).
7682  if( i < M )
7683  {
7684  const size_t jend( LOW ? i+1UL : N );
7685  size_t j( UPP ? i : 0UL );
7686
      // 1x2 tile.
7687  for( ; (j+2UL) <= jend; j+=2UL )
7688  {
7689  const size_t kbegin( ( IsUpper_v<MT4> )
7690  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7691  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7692  const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
7693
7694  const size_t kpos( remainder ? ( kend & size_t(-SIMDSIZE) ) : kend );
7695  BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7696
7697  SIMDType xmm1, xmm2;
7698  size_t k( kbegin );
7699
7700  for( ; k<kpos; k+=SIMDSIZE ) {
7701  const SIMDType a1( A.load(i,k) );
7702  xmm1 += a1 * B.load(k,j );
7703  xmm2 += a1 * B.load(k,j+1UL);
7704  }
7705
7706  C(i,j ) -= sum( xmm1 ) * scalar;
7707  C(i,j+1UL) -= sum( xmm2 ) * scalar;
7708
7709  for( ; remainder && k<kend; ++k ) {
7710  C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7711  C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7712  }
7713  }
7714
      // Final single element; here the k-range always runs up to K.
7715  if( j < jend )
7716  {
7717  const size_t kbegin( ( IsUpper_v<MT4> )
7718  ?( ( IsLower_v<MT5> ? max( i, j ) : i ) & size_t(-SIMDSIZE) )
7719  :( IsLower_v<MT5> ? ( j & size_t(-SIMDSIZE) ) : 0UL ) );
7720
7721  const size_t kpos( remainder ? ( K & size_t(-SIMDSIZE) ) : K );
7722  BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos, "Invalid end calculation" );
7723
7724  SIMDType xmm1;
7725  size_t k( kbegin );
7726
7727  for( ; k<kpos; k+=SIMDSIZE ) {
7728  xmm1 += A.load(i,k) * B.load(k,j);
7729  }
7730
7731  C(i,j) -= sum( xmm1 ) * scalar;
7732
7733  for( ; remainder && k<K; ++k ) {
7734  C(i,j) -= A(i,k) * B(k,j) * scalar;
7735  }
7736  }
7737  }
7738  }
7739  //**********************************************************************************************
7740 
7741  //**Default subtraction assignment to dense matrices (large matrices)***************************
7755  template< typename MT3 // Type of the left-hand side target matrix
7756  , typename MT4 // Type of the left-hand side matrix operand
7757  , typename MT5 // Type of the right-hand side matrix operand
7758  , typename ST2 > // Type of the scalar value
7759  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7760  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7761  {
7762  selectDefaultSubAssignKernel( C, A, B, scalar );
7763  }
7764  //**********************************************************************************************
7765 
7766  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
7781  template< typename MT3 // Type of the left-hand side target matrix
7782  , typename MT4 // Type of the left-hand side matrix operand
7783  , typename MT5 // Type of the right-hand side matrix operand
7784  , typename ST2 > // Type of the scalar value
7785  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7786  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7787  {
7788  if( LOW )
7789  lmmm( C, A, B, -scalar, ST2(1) );
7790  else if( UPP )
7791  ummm( C, A, B, -scalar, ST2(1) );
7792  else
7793  mmm( C, A, B, -scalar, ST2(1) );
7794  }
7795  //**********************************************************************************************
7796 
7797  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
7811  template< typename MT3 // Type of the left-hand side target matrix
7812  , typename MT4 // Type of the left-hand side matrix operand
7813  , typename MT5 // Type of the right-hand side matrix operand
7814  , typename ST2 > // Type of the scalar value
7815  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7816  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7817  {
7818  selectLargeSubAssignKernel( C, A, B, scalar );
7819  }
7820  //**********************************************************************************************
7821 
7822  //**BLAS-based subtraction assignment to dense matrices*****************************************
7823 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7824 
7837  template< typename MT3 // Type of the left-hand side target matrix
7838  , typename MT4 // Type of the left-hand side matrix operand
7839  , typename MT5 // Type of the right-hand side matrix operand
7840  , typename ST2 > // Type of the scalar value
7841  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7842  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7843  {
7844  using ET = ElementType_t<MT3>;
7845 
7846  if( IsTriangular_v<MT4> ) {
7847  ResultType_t<MT3> tmp( serial( B ) );
7848  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7849  subAssign( C, tmp );
7850  }
7851  else if( IsTriangular_v<MT5> ) {
7852  ResultType_t<MT3> tmp( serial( A ) );
7853  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7854  subAssign( C, tmp );
7855  }
7856  else {
7857  gemm( C, A, B, ET(-scalar), ET(1) );
7858  }
7859  }
7860 #endif
7861  //**********************************************************************************************
7862 
7863  //**Subtraction assignment to sparse matrices***************************************************
7864  // No special implementation for the subtraction assignment to sparse matrices.
7865  //**********************************************************************************************
7866 
7867  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of the scaled multiplication expression to a dense matrix.
//   lhs : the target dense matrix
//   rhs : the scaled multiplication expression
// The expression is first evaluated into a temporary of type ResultType through serial( rhs );
// the temporary is then forwarded to schurAssign() together with the target.
7879  template< typename MT // Type of the target dense matrix
7880  , bool SO > // Storage order of the target dense matrix
7881  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7882  {
7884
7888
      // Target and expression must agree in both dimensions.
7889  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7890  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7891
      // Evaluate the complete expression once, then apply the element-wise product.
7892  const ResultType tmp( serial( rhs ) );
7893  schurAssign( ~lhs, tmp );
7894  }
7895  //**********************************************************************************************
7896 
7897  //**Schur product assignment to sparse matrices*************************************************
7898  // No special implementation for the Schur product assignment to sparse matrices.
7899  //**********************************************************************************************
7900 
7901  //**Multiplication assignment to dense matrices*************************************************
7902  // No special implementation for the multiplication assignment to dense matrices.
7903  //**********************************************************************************************
7904 
7905  //**Multiplication assignment to sparse matrices************************************************
7906  // No special implementation for the multiplication assignment to sparse matrices.
7907  //**********************************************************************************************
7908 
7909  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the scaled multiplication expression to a dense matrix.
//   lhs : the target dense matrix
//   rhs : the scaled multiplication expression (operands in rhs.matrix_, factor in rhs.scalar_)
// Only available if IsEvaluationRequired_v<MT,MT1,MT2> holds. Both operands are evaluated
// into the types LT and RT, after which the assignment is restarted on the expression
// A * B * rhs.scalar_ built from the evaluated operands.
7924  template< typename MT // Type of the target dense matrix
7925  , bool SO > // Storage order of the target dense matrix
7926  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7927  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
7928  {
7930
7931  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7932  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7933
7934  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7935  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7936
      // Empty target: nothing to do. Zero inner dimension: the target is reset instead.
7937  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
7938  return;
7939  }
7940  else if( left.columns() == 0UL ) {
7941  reset( ~lhs );
7942  return;
7943  }
7944
7945  LT A( left ); // Evaluation of the left-hand side dense matrix operand
7946  RT B( right ); // Evaluation of the right-hand side dense matrix operand
7947
      // The evaluated operands must keep the dimensions of the original operands and of the target.
7948  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7949  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7950  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7951  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7952  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7953  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7954
      // Re-dispatch on the product of the evaluated operands.
7955  smpAssign( ~lhs, A * B * rhs.scalar_ );
7956  }
7957  //**********************************************************************************************
7958 
7959  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the scaled multiplication expression to a sparse matrix.
//   lhs : the target sparse matrix
//   rhs : the scaled multiplication expression
// Only available if IsEvaluationRequired_v<MT,MT1,MT2> holds. The expression is evaluated
// into a temporary, which is passed through a ForwardFunctor and then assigned to the target.
7974  template< typename MT // Type of the target sparse matrix
7975  , bool SO > // Storage order of the target sparse matrix
7976  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7977  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
7978  {
7980
      // Temporary type depends on the storage order flag of the target: OppositeType if SO is
      // set, ResultType otherwise (presumably so the temporary matches the target's storage
      // order -- confirm).
7981  using TmpType = If_t< SO, OppositeType, ResultType >;
7982
7989
7990  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7991  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7992
      // NOTE(review): ForwardFunctor is declared outside this block; its effect on tmp cannot
      // be determined from here.
7993  const ForwardFunctor fwd;
7994
7995  const TmpType tmp( rhs );
7996  smpAssign( ~lhs, fwd( tmp ) );
7997  }
7998  //**********************************************************************************************
7999 
8000  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the scaled multiplication expression to a dense matrix.
//   lhs : the target dense matrix
//   rhs : the scaled multiplication expression (operands in rhs.matrix_, factor in rhs.scalar_)
// Only available if IsEvaluationRequired_v<MT,MT1,MT2> holds. Both operands are evaluated
// into the types LT and RT, after which the addition assignment is restarted on the
// expression A * B * rhs.scalar_ built from the evaluated operands.
8015  template< typename MT // Type of the target dense matrix
8016  , bool SO > // Storage order of the target dense matrix
8017  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8018  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8019  {
8021
8022  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8023  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8024
8025  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8026  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8027
      // Empty target or zero inner dimension: the target is left untouched.
8028  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8029  return;
8030  }
8031
8032  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8033  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8034
      // The evaluated operands must keep the dimensions of the original operands and of the target.
8035  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8036  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8037  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8038  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8039  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8040  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8041
      // Re-dispatch on the product of the evaluated operands.
8042  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
8043  }
8044  //**********************************************************************************************
8045 
8046  //**SMP addition assignment to sparse matrices**************************************************
8047  // No special implementation for the SMP addition assignment to sparse matrices.
8048  //**********************************************************************************************
8049 
8050  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the scaled multiplication expression to a dense matrix.
//   lhs : the target dense matrix
//   rhs : the scaled multiplication expression (operands in rhs.matrix_, factor in rhs.scalar_)
// Only available if IsEvaluationRequired_v<MT,MT1,MT2> holds. Both operands are evaluated
// into the types LT and RT, after which the subtraction assignment is restarted on the
// expression A * B * rhs.scalar_ built from the evaluated operands.
8065  template< typename MT // Type of the target dense matrix
8066  , bool SO > // Storage order of the target dense matrix
8067  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8068  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8069  {
8071
8072  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8073  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8074
8075  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8076  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8077
      // Empty target or zero inner dimension: the target is left untouched.
8078  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8079  return;
8080  }
8081
8082  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8083  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8084
      // The evaluated operands must keep the dimensions of the original operands and of the target.
8085  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8086  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8087  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8088  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8089  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8090  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8091
      // Re-dispatch on the product of the evaluated operands.
8092  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
8093  }
8094  //**********************************************************************************************
8095 
8096  //**SMP subtraction assignment to sparse matrices***********************************************
8097  // No special implementation for the SMP subtraction assignment to sparse matrices.
8098  //**********************************************************************************************
8099 
8100  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the scaled multiplication expression to a dense matrix.
//   lhs : the target dense matrix
//   rhs : the scaled multiplication expression
// The expression is first evaluated into a temporary of type ResultType; the temporary is
// then forwarded to smpSchurAssign() together with the target.
8112  template< typename MT // Type of the target dense matrix
8113  , bool SO > // Storage order of the target dense matrix
8114  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8115  {
8117
8121
      // Target and expression must agree in both dimensions.
8122  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8123  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8124
      // Evaluate the complete expression once, then apply the element-wise product.
8125  const ResultType tmp( rhs );
8126  smpSchurAssign( ~lhs, tmp );
8127  }
8128  //**********************************************************************************************
8129 
8130  //**SMP Schur product assignment to sparse matrices*********************************************
8131  // No special implementation for the SMP Schur product assignment to sparse matrices.
8132  //**********************************************************************************************
8133 
8134  //**SMP multiplication assignment to dense matrices*********************************************
8135  // No special implementation for the SMP multiplication assignment to dense matrices.
8136  //**********************************************************************************************
8137 
8138  //**SMP multiplication assignment to sparse matrices********************************************
8139  // No special implementation for the SMP multiplication assignment to sparse matrices.
8140  //**********************************************************************************************
8141 
8142  //**Compile time checks*************************************************************************
8151  //**********************************************************************************************
8152 };
8154 //*************************************************************************************************
8155 
8156 
8157 
8158 
8159 //=================================================================================================
8160 //
8161 // GLOBAL BINARY ARITHMETIC OPERATORS
8162 //
8163 //=================================================================================================
8164 
8165 //*************************************************************************************************
// Multiplication operator for a dense matrix with storage order flag 'false' (lhs) and a
// dense matrix with storage order flag 'true' (rhs).
// NOTE(review): presumably 'false' denotes row-major and 'true' column-major storage -- confirm.
//   lhs : the left-hand side matrix of the multiplication
//   rhs : the right-hand side matrix of the multiplication
// Reports an invalid argument via BLAZE_THROW_INVALID_ARGUMENT if the number of columns of
// lhs differs from the number of rows of rhs. Otherwise returns an object of type ReturnType
// constructed from both operands.
// NOTE(review): the declaration of ReturnType is not part of this excerpt.
8195 template< typename MT1 // Type of the left-hand side dense matrix
8196  , typename MT2 > // Type of the right-hand side dense matrix
8197 inline decltype(auto)
8198  operator*( const DenseMatrix<MT1,false>& lhs, const DenseMatrix<MT2,true>& rhs )
8199 {
8201
     // The inner dimensions must agree.
8202  if( (~lhs).columns() != (~rhs).rows() ) {
8203  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
8204  }
8205
8207  return ReturnType( ~lhs, ~rhs );
8208 }
8209 //*************************************************************************************************
8210 
8211 
8212 
8213 
8214 //=================================================================================================
8215 //
8216 // GLOBAL FUNCTIONS
8217 //
8218 //=================================================================================================
8219 
8220 //*************************************************************************************************
8245 template< typename MT1 // Type of the left-hand side dense matrix
8246  , typename MT2 // Type of the right-hand side dense matrix
8247  , bool SF // Symmetry flag
8248  , bool HF // Hermitian flag
8249  , bool LF // Lower flag
8250  , bool UF > // Upper flag
8251 inline decltype(auto) declsym( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8252 {
8254 
8255  if( !isSquare( dm ) ) {
8256  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
8257  }
8258 
8259  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
8260  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8261 }
8263 //*************************************************************************************************
8264 
8265 
8266 //*************************************************************************************************
8291 template< typename MT1 // Type of the left-hand side dense matrix
8292  , typename MT2 // Type of the right-hand side dense matrix
8293  , bool SF // Symmetry flag
8294  , bool HF // Hermitian flag
8295  , bool LF // Lower flag
8296  , bool UF > // Upper flag
8297 inline decltype(auto) declherm( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8298 {
8300 
8301  if( !isSquare( dm ) ) {
8302  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
8303  }
8304 
8305  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
8306  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8307 }
8309 //*************************************************************************************************
8310 
8311 
8312 //*************************************************************************************************
8337 template< typename MT1 // Type of the left-hand side dense matrix
8338  , typename MT2 // Type of the right-hand side dense matrix
8339  , bool SF // Symmetry flag
8340  , bool HF // Hermitian flag
8341  , bool LF // Lower flag
8342  , bool UF > // Upper flag
8343 inline decltype(auto) decllow( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8344 {
8346 
8347  if( !isSquare( dm ) ) {
8348  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
8349  }
8350 
8351  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
8352  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8353 }
8355 //*************************************************************************************************
8356 
8357 
8358 //*************************************************************************************************
8383 template< typename MT1 // Type of the left-hand side dense matrix
8384  , typename MT2 // Type of the right-hand side dense matrix
8385  , bool SF // Symmetry flag
8386  , bool HF // Hermitian flag
8387  , bool LF // Lower flag
8388  , bool UF > // Upper flag
8389 inline decltype(auto) declupp( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8390 {
8392 
8393  if( !isSquare( dm ) ) {
8394  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
8395  }
8396 
8397  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
8398  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8399 }
8401 //*************************************************************************************************
8402 
8403 
8404 //*************************************************************************************************
8429 template< typename MT1 // Type of the left-hand side dense matrix
8430  , typename MT2 // Type of the right-hand side dense matrix
8431  , bool SF // Symmetry flag
8432  , bool HF // Hermitian flag
8433  , bool LF // Lower flag
8434  , bool UF > // Upper flag
8435 inline decltype(auto) decldiag( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8436 {
8438 
8439  if( !isSquare( dm ) ) {
8440  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
8441  }
8442 
8443  using ReturnType = const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
8444  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8445 }
8447 //*************************************************************************************************
8448 
8449 
8450 
8451 
8452 //=================================================================================================
8453 //
8454 // SIZE SPECIALIZATIONS
8455 //
8456 //=================================================================================================
8457 
8458 //*************************************************************************************************
8460 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
8461 struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
8462  : public Size<MT1,0UL>
8463 {};
8464 
8465 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
8466 struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
8467  : public Size<MT2,1UL>
8468 {};
8470 //*************************************************************************************************
8471 
8472 
8473 
8474 
8475 //=================================================================================================
8476 //
8477 // ISALIGNED SPECIALIZATIONS
8478 //
8479 //=================================================================================================
8480 
8481 //*************************************************************************************************
8483 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
8484 struct IsAligned< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8485  : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
8486 {};
8488 //*************************************************************************************************
8489 
8490 } // namespace blaze
8491 
8492 #endif
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatTDMatMultExpr.h:288
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:426
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Data type constraint.
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatTDMatMultExpr.h:172
Header file for basic type definitions.
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:150
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatTDMatMultExpr.h:170
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:152
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias declaration for the If class template.The If_t alias declaration provides a convenien...
Definition: If.h:109
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:154
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:271
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:532
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:374
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:266
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:591
static constexpr bool smpAssignable
Compilation flag for SMP assignments.
Definition: CompressedMatrix.h:3113
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:522
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1002
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:596
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:158
Header file for the Computation base class.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:274
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:431
Header file for the IsBLASCompatible type trait.
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.The ElementType_t alias declaration provide...
Definition: Aliases.h:170
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:476
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:390
Header file for the IsComplexDouble type trait.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:310
Constraint on the data type.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:151
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:155
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:564
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:444
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1147
Header file for the decllow trait.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2146
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatTDMatMultExpr.h:169
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr &> CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:167
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1002
Header file for the IsLower type trait.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatTDMatMultExpr.h:301
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:280
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:432
Header file for the IsAligned type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:552
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:270
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:161
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:268
Header file for the exception macros of the math module.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:477
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1179
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:604
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions.The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:468
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:454
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
Header file for the MatScalarMultExpr base class.
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:264
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:173
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:421
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:165
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:420
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:267
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.The BoolConstant class template represents ...
Definition: IntegralConstant.h:101
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:576
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatTDMatMultExpr.h:295
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:410
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:283
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatTDMatMultExpr.h:269
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3081
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:325
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatTDMatMultExpr.h:171
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:453
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:440
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:153
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:156
Expression object for dense matrix-transpose dense matrix multiplications.The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:144
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:586
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1326
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:400
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:542
If_t< IsExpression_v< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:170
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the DeclSym functor.
If_t< IsExpression_v< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:277
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:464
Header file for the TrueType type/value trait base class.
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:160
Header file for the IsExpression type trait class.
Header file for the function trace functionality.