Blaze  3.6
TDMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/dense/MMM.h>
54 #include <blaze/math/Exception.h>
67 #include <blaze/math/shims/Reset.h>
69 #include <blaze/math/SIMD.h>
100 #include <blaze/math/views/Check.h>
101 #include <blaze/system/BLAS.h>
102 #include <blaze/system/Blocking.h>
103 #include <blaze/system/Debugging.h>
105 #include <blaze/system/Thresholds.h>
108 #include <blaze/util/Assert.h>
109 #include <blaze/util/Complex.h>
112 #include <blaze/util/DisableIf.h>
113 #include <blaze/util/EnableIf.h>
116 #include <blaze/util/mpl/If.h>
117 #include <blaze/util/Types.h>
126 
127 
128 namespace blaze {
129 
130 //=================================================================================================
131 //
132 // CLASS TDMATTDMATMULTEXPR
133 //
134 //=================================================================================================
135 
136 //*************************************************************************************************
143 template< typename MT1 // Type of the left-hand side dense matrix
144  , typename MT2 // Type of the right-hand side dense matrix
145  , bool SF // Symmetry flag
146  , bool HF // Hermitian flag
147  , bool LF // Lower flag
148  , bool UF > // Upper flag
149 class TDMatTDMatMultExpr
150  : public MatMatMultExpr< DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
151  , private Computation
152 {
153  private:
154  //**Type definitions****************************************************************************
161  //**********************************************************************************************
162 
163  //**********************************************************************************************
// Compile-time flag for the left-hand side operand type MT1: true when MT1 is classified as a
// computation (IsComputation_v) or reports that it requires evaluation (RequiresEvaluation_v).
165  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
166  //**********************************************************************************************
167 
168  //**********************************************************************************************
// Compile-time flag for the right-hand side operand type MT2: true when MT2 is classified as a
// computation (IsComputation_v) or reports that it requires evaluation (RequiresEvaluation_v).
170  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
171  //**********************************************************************************************
172 
173  //**********************************************************************************************
// Normalized structure flags derived from the SF/HF/LF/UF template arguments:
//  - SYM  : SF is set and none of HF, LF, UF is set.
//  - HERM : HF is set and neither LF nor UF is set.
//  - LOW  : LF is set, or UF is combined with SF or HF.
//  - UPP  : UF is set, or LF is combined with SF or HF.
// Consequence of the last two rules: a symmetric/Hermitian flag combined with a single
// triangular flag sets both LOW and UPP (and clears SYM/HERM).
174  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
175  static constexpr bool HERM = ( HF && !( LF || UF ) );
176  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
177  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
178  //**********************************************************************************************
179 
180  //**********************************************************************************************
182 
// Compile-time predicate over three types: true when T1 is a row-major matrix type and at least
// one of T2 and T3 is a symmetric matrix type.
// NOTE(review): T1 is presumably the assignment target and T2/T3 the two operands, as in the
// other three-type predicates of this class - confirm at the use sites.
187  template< typename T1, typename T2, typename T3 >
188  static constexpr bool CanExploitSymmetry_v =
189  ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
191  //**********************************************************************************************
192 
193  //**********************************************************************************************
195 
// Compile-time predicate over three types: true when at least one of the two operands needs an
// intermediate evaluation (evaluateLeft or evaluateRight) and the symmetry-based strategy
// described by CanExploitSymmetry_v is NOT applicable to the type triple.
// The negation is essential: it keeps this predicate and CanExploitSymmetry_v mutually
// exclusive, so that for any type triple at most one of the two can be true. Without it the
// predicate could only hold in exactly the cases that CanExploitSymmetry_v already claims.
// NOTE(review): the overloads selected by these predicates are outside this part of the file;
// confirm that each strategy is guarded by exactly one of the two predicates.
199  template< typename T1, typename T2, typename T3 >
200  static constexpr bool IsEvaluationRequired_v =
201  ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
203  //**********************************************************************************************
204 
205  //**********************************************************************************************
207 
// Compile-time predicate over three types deciding whether a BLAS kernel may be used.
// It is true only when ALL of the following hold:
//  - BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled,
//  - none of the structure flags SYM, HERM, LOW, UPP is set,
//  - T1, T2 and T3 are contiguous; T1 offers mutable data access, T2 and T3 const data access,
//  - neither T2 nor T3 is diagonal,
//  - all three types are SIMD-enabled,
//  - all three element types are BLAS compatible and T2/T3 have the same element type as T1.
210  template< typename T1, typename T2, typename T3 >
211  static constexpr bool UseBlasKernel_v =
212  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
213  !SYM && !HERM && !LOW && !UPP &&
214  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
215  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
216  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
217  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
218  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
219  IsBLASCompatible_v< ElementType_t<T1> > &&
220  IsBLASCompatible_v< ElementType_t<T2> > &&
221  IsBLASCompatible_v< ElementType_t<T3> > &&
222  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
223  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
225  //**********************************************************************************************
226 
227  //**********************************************************************************************
229 
// Compile-time predicate over three types deciding whether the vectorized default kernels may
// be used. It is true only when ALL of the following hold:
//  - useOptimizedKernels is set,
//  - neither T2 nor T3 is diagonal,
//  - all three types are SIMD-enabled,
//  - the listed element types are SIMD combinable,
//  - SIMD addition and SIMD multiplication are available for the element types of T2 and T3.
// NOTE(review): listing line 238 is absent between the two IsSIMDCombinable_v arguments shown
// here; the argument list may be missing an entry - compare against the original header.
232  template< typename T1, typename T2, typename T3 >
233  static constexpr bool UseVectorizedDefaultKernel_v =
234  ( useOptimizedKernels &&
235  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
236  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
237  IsSIMDCombinable_v< ElementType_t<T1>
239  , ElementType_t<T3> > &&
240  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
241  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
243  //**********************************************************************************************
244 
245  //**********************************************************************************************
247 
// Functor type matching the structure flags, selected in this order:
//   HERM            -> DeclHerm
//   SYM             -> DeclSym
//   LOW and UPP     -> DeclDiag
//   LOW only        -> DeclLow
//   UPP only        -> DeclUpp
//   no flag         -> Noop
// An instance is applied to re-created product expressions (see the row-major vectorized
// small-matrix kernel) so that the structure flags are carried over to the new expression.
250  using ForwardFunctor = If_t< HERM
251  , DeclHerm
252  , If_t< SYM
253  , DeclSym
254  , If_t< LOW
255  , If_t< UPP
256  , DeclDiag
257  , DeclLow >
258  , If_t< UPP
259  , DeclUpp
260  , Noop > > > >;
262  //**********************************************************************************************
263 
264  public:
265  //**Type definitions****************************************************************************
268 
271 
// Result type of the expression: a nested compile-time selection keyed on HERM, SYM, LOW and
// UPP (same nesting order as ForwardFunctor); the nested member ::Type of the selected
// alternative is used. When no flag is set the alternative is MultTrait<RT1,RT2>.
// NOTE(review): listing lines 274, 276, 279, 280 and 282 are absent here, so the alternatives
// chosen for the individual flags are not visible - consult the original header for them.
273  using ResultType = typename If_t< HERM
275  , If_t< SYM
277  , If_t< LOW
278  , If_t< UPP
281  , If_t< UPP
283  , MultTrait<RT1,RT2> > > > >::Type;
284 
// ReturnType: type returned by the element access functions operator() and at(), a const
// ElementType value.
// CompositeType: alias for const ResultType.
289  using ReturnType = const ElementType;
290  using CompositeType = const ResultType;
291 
// Type under which the left-hand side operand is handed out by leftOperand(): a const copy
// when MT1 is an expression type (IsExpression_v), a const reference otherwise.
293  using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
294 
// Type under which the right-hand side operand is handed out by rightOperand(): a const copy
// when MT2 is an expression type (IsExpression_v), a const reference otherwise.
296  using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
297 
300 
303  //**********************************************************************************************
304 
305  //**Compilation flags***************************************************************************
// Compilation flag for SIMD optimization of this expression: true when MT1 is not diagonal,
// both operand types are SIMD-enabled, and SIMD addition and multiplication are available for
// ET1 and ET2.
// NOTE(review): ET1/ET2 are declared outside the visible lines; presumably the element types
// of MT1 and MT2 - confirm.
307  static constexpr bool simdEnabled =
308  ( !IsDiagonal_v<MT1> &&
309  MT1::simdEnabled && MT2::simdEnabled &&
310  HasSIMDAdd_v<ET1,ET2> &&
311  HasSIMDMult_v<ET1,ET2> );
312 
// Compilation flag for SMP assignments: true when neither operand needs an intermediate
// evaluation (evaluateLeft/evaluateRight are both false) and both operand types report
// smpAssignable.
314  static constexpr bool smpAssignable =
315  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
316  //**********************************************************************************************
317 
318  //**SIMD properties*****************************************************************************
// Size reported by SIMDTrait for ElementType. The kernels in this class use it as the row
// step width of the vectorized loops and in the small-matrix threshold SIMDSIZE*10.
320  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
321  //**********************************************************************************************
322 
323  //**Constructor*********************************************************************************
// Constructor: stores both operands of the multiplication expression.
// \param lhs The left-hand side operand.
// \param rhs The right-hand side operand.
// The inner dimensions must agree (lhs.columns() == rhs.rows()); this is only verified by an
// internal assertion, no error is reported to the caller from here.
329  explicit inline TDMatTDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
330  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
331  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
332  {
333  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
334  }
335  //**********************************************************************************************
336 
337  //**Access operator*****************************************************************************
// Unchecked element access: computes element (i,j) of the product on demand.
// \param i Row index; only verified by an internal assertion against lhs_.rows().
// \param j Column index; only verified by an internal assertion against rhs_.columns().
// \return The computed element.
// The computation is chosen from the operand types:
//  - diagonal left operand:  lhs_(i,i) * rhs_(i,j)
//  - diagonal right operand: lhs_(i,j) * rhs_(j,j)
//  - any triangular operand: row i of lhs_ times column j of rhs_, restricted to the index
//    range [begin,end) that follows from the triangular structure of the operands
//  - otherwise:              complete row i of lhs_ times complete column j of rhs_
344  inline ReturnType operator()( size_t i, size_t j ) const {
345  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
346  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
347 
348  if( IsDiagonal_v<MT1> ) {
349  return lhs_(i,i) * rhs_(i,j);
350  }
351  else if( IsDiagonal_v<MT2> ) {
352  return lhs_(i,j) * rhs_(j,j);
353  }
354  else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
     // First summation index: an upper lhs_ starts at i (i+1 if strictly upper), a lower
     // rhs_ starts at j (j+1 if strictly lower); with both restrictions the larger one wins.
355  const size_t begin( ( IsUpper_v<MT1> )
356  ?( ( IsLower_v<MT2> )
357  ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
358  , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
359  :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
360  :( ( IsLower_v<MT2> )
361  ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
362  :( 0UL ) ) );
     // One-past-last summation index: a lower lhs_ ends after i (at i if strictly lower), an
     // upper rhs_ ends after j (at j if strictly upper); with both the smaller one wins.
     // Without any restriction the full inner dimension lhs_.columns() is used.
363  const size_t end( ( IsLower_v<MT1> )
364  ?( ( IsUpper_v<MT2> )
365  ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
366  , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
367  :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
368  :( ( IsUpper_v<MT2> )
369  ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
370  :( lhs_.columns() ) ) );
371 
     // Empty index range: the result is a default-constructed element.
372  if( begin >= end ) return ElementType();
373 
374  const size_t n( end - begin );
375 
376  return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
377  subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
378  }
379  else {
380  return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
381  }
382  }
383  //**********************************************************************************************
384 
385  //**At function*********************************************************************************
// Checked element access.
// \param i Row index; must be less than lhs_.rows().
// \param j Column index; must be less than rhs_.columns().
// \return The element (i,j) as computed by operator().
// An out-of-range error is raised through BLAZE_THROW_OUT_OF_RANGE for an invalid row index
// ("Invalid row access index") or column index ("Invalid column access index"); the row index
// is checked first.
393  inline ReturnType at( size_t i, size_t j ) const {
394  if( i >= lhs_.rows() ) {
395  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
396  }
397  if( j >= rhs_.columns() ) {
398  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
399  }
400  return (*this)(i,j);
401  }
402  //**********************************************************************************************
403 
404  //**Rows function*******************************************************************************
// Returns the number of rows of the product, i.e. the number of rows of the left-hand side
// operand.
409  inline size_t rows() const noexcept {
410  return lhs_.rows();
411  }
412  //**********************************************************************************************
413 
414  //**Columns function****************************************************************************
// Returns the number of columns of the product, i.e. the number of columns of the right-hand
// side operand.
419  inline size_t columns() const noexcept {
420  return rhs_.columns();
421  }
422  //**********************************************************************************************
423 
424  //**Left operand access*************************************************************************
// Returns the stored left-hand side operand as LeftOperand (a const copy or a const reference,
// depending on whether MT1 is an expression type).
429  inline LeftOperand leftOperand() const noexcept {
430  return lhs_;
431  }
432  //**********************************************************************************************
433 
434  //**Right operand access************************************************************************
// Returns the stored right-hand side operand as RightOperand (a const copy or a const
// reference, depending on whether MT2 is an expression type).
439  inline RightOperand rightOperand() const noexcept {
440  return rhs_;
441  }
442  //**********************************************************************************************
443 
444  //**********************************************************************************************
// Returns whether the expression can alias with the given address.
// \param alias The address to be checked.
// \return true if either operand reports being aliased with \a alias, false otherwise.
// Note that the check is delegated to isAliased() of both operands, i.e. it is exactly the
// same test as performed by isAliased() of this class.
450  template< typename T >
451  inline bool canAlias( const T* alias ) const noexcept {
452  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
453  }
454  //**********************************************************************************************
455 
456  //**********************************************************************************************
// Returns whether the expression is aliased with the given address.
// \param alias The address to be checked.
// \return true if either operand reports being aliased with \a alias, false otherwise.
462  template< typename T >
463  inline bool isAliased( const T* alias ) const noexcept {
464  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
465  }
466  //**********************************************************************************************
467 
468  //**********************************************************************************************
// Returns whether the operands are aligned: true only if both the left-hand side and the
// right-hand side operand report isAligned().
473  inline bool isAligned() const noexcept {
474  return lhs_.isAligned() && rhs_.isAligned();
475  }
476  //**********************************************************************************************
477 
478  //**********************************************************************************************
// Runtime check whether the expression can be used in SMP assignments.
// \return true only when all three conditions hold:
//   1. one of the alternatives of the first parenthesized group applies: BLAS mode is off,
//      BLAS matrix/matrix multiplication is off, or rows()*columns() is below
//      TDMATTDMATMULT_THRESHOLD,
//   2. rows()*columns() is at least SMP_TDMATTDMATMULT_THRESHOLD,
//   3. neither MT1 nor MT2 is a diagonal matrix type.
// NOTE(review): listing line 486 is absent inside the first group; it may hold one more
// alternative - compare against the original header.
483  inline bool canSMPAssign() const noexcept {
484  return ( !BLAZE_BLAS_MODE ||
485  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
487  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
488  ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
489  !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
490  }
491  //**********************************************************************************************
492 
493  private:
494  //**Member variables****************************************************************************
497  //**********************************************************************************************
498 
499  //**Assignment to dense matrices****************************************************************
// Assignment of the multiplication expression to a dense matrix (friend function).
// \param lhs The target dense matrix; its size must already match the expression (only
//            verified by internal assertions).
// \param rhs The multiplication expression to be assigned.
// Steps: (1) nothing is done for a target without rows or columns, (2) for an inner dimension
// of zero the target is reset, (3) otherwise both operands are evaluated serially into A and
// B and the work is handed to selectAssignKernel().
// NOTE(review): listing lines 515 and 517 are absent (between the signature and the opening
// brace, and at the start of the body); a trailing return type restricting this overload may
// be missing from this view. LT and RT are declared outside the visible lines.
// NOTE(review): (~lhs) presumably yields the concrete target matrix behind the DenseMatrix
// base reference - confirm.
512  template< typename MT // Type of the target dense matrix
513  , bool SO > // Storage order of the target dense matrix
514  friend inline auto assign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
516  {
518 
519  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
520  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
521 
     // Empty target: nothing to compute. Empty inner dimension: every element is an empty sum.
522  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
523  return;
524  }
525  else if( rhs.lhs_.columns() == 0UL ) {
526  reset( ~lhs );
527  return;
528  }
529 
530  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
531  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
532 
533  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
534  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
535  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
536  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
537  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
538  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
539 
540  TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
541  }
543  //**********************************************************************************************
544 
545  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = A * B.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// The small-matrix kernel is chosen when the left operand type is diagonal, when (outside of
// debug mode) A has at most SIMDSIZE*10 rows, or when the number of target elements is below
// TDMATTDMATMULT_THRESHOLD. In all other cases selectBlasAssignKernel() is called, which is
// defined outside the visible part of the file.
556  template< typename MT3 // Type of the left-hand side target matrix
557  , typename MT4 // Type of the left-hand side matrix operand
558  , typename MT5 > // Type of the right-hand side matrix operand
559  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
560  {
561  if( ( IsDiagonal_v<MT4> ) ||
562  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
563  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
564  selectSmallAssignKernel( C, A, B );
565  else
566  selectBlasAssignKernel( C, A, B );
567  }
569  //**********************************************************************************************
570 
571  //**Default assignment to dense matrices (general/general)**************************************
// Default (scalar) assignment kernel C = A * B for two non-diagonal operand types; the
// overload is enabled by the trailing EnableIf_t condition.
// \param C The target matrix, written element-wise through C(i,j).
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// The result is built column by column. For column j the first contributing summation index
// (kbegin) initializes the elements by assignment, all further indices are accumulated with
// +=. Triangular operand types and the SYM/HERM/LOW/UPP flags narrow the k and i ranges; the
// elements outside the computed row range are reset where the code does so explicitly.
585  template< typename MT3 // Type of the left-hand side target matrix
586  , typename MT4 // Type of the left-hand side matrix operand
587  , typename MT5 > // Type of the right-hand side matrix operand
588  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
589  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
590  {
591  const size_t M( A.rows() );
592  const size_t N( B.columns() );
593  const size_t K( A.columns() );
594 
     // Whenever a structure flag is set the result is expected to be square.
595  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
596 
597  for( size_t j=0UL; j<N; ++j )
598  {
     // Summation range [kbegin,kend) for column j, narrowed for a lower or upper B.
599  const size_t kbegin( ( IsLower_v<MT5> )
600  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
601  :( 0UL ) );
602  const size_t kend( ( IsUpper_v<MT5> )
603  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
604  :( K ) );
605  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
606 
     // Strictly triangular B with an empty summation range: the whole column is reset.
607  if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
608  for( size_t i=0UL; i<M; ++i ) {
609  reset( C(i,j) );
610  }
611  continue;
612  }
613 
     // First contribution (k == kbegin): assigns to the rows [ibegin,iend) and resets rows
     // outside of that range as selected by the operand types and the LOW/UPP flags.
614  {
615  const size_t ibegin( ( IsLower_v<MT4> )
616  ?( ( IsStrictlyLower_v<MT4> )
617  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
618  :( LOW ? max(j,kbegin) : kbegin ) )
619  :( LOW ? j : 0UL ) );
620  const size_t iend( ( IsUpper_v<MT4> )
621  ?( ( IsStrictlyUpper_v<MT4> )
622  ?( UPP ? min(j+1UL,kbegin) : kbegin )
623  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
624  :( UPP ? j+1UL : M ) );
625 
626  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
627  for( size_t i=0UL; i<ibegin; ++i ) {
628  reset( C(i,j) );
629  }
630  }
631  else if( IsStrictlyLower_v<MT4> ) {
632  reset( C(0UL,j) );
633  }
634  for( size_t i=ibegin; i<iend; ++i ) {
635  C(i,j) = A(i,kbegin) * B(kbegin,j);
636  }
637  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
638  for( size_t i=iend; i<M; ++i ) {
639  reset( C(i,j) );
640  }
641  }
642  else if( IsStrictlyUpper_v<MT4> ) {
643  reset( C(M-1UL,j) );
644  }
645  }
646 
     // Remaining contributions (k > kbegin): accumulated onto the rows [ibegin,iend).
647  for( size_t k=kbegin+1UL; k<kend; ++k )
648  {
649  const size_t ibegin( ( IsLower_v<MT4> )
650  ?( ( IsStrictlyLower_v<MT4> )
651  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
652  :( SYM || HERM || LOW ? max( j, k ) : k ) )
653  :( SYM || HERM || LOW ? j : 0UL ) );
654  const size_t iend( ( IsUpper_v<MT4> )
655  ?( ( IsStrictlyUpper_v<MT4> )
656  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
657  :( UPP ? min(j+1UL,k) : k ) )
658  :( UPP ? j+1UL : M ) );
659 
     // With a structure flag set the row range may be empty for this k; skip it then.
660  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
661  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
662 
663  for( size_t i=ibegin; i<iend; ++i ) {
664  C(i,j) += A(i,k) * B(k,j);
665  }
     // For an upper A the element in row iend is assigned (not accumulated).
666  if( IsUpper_v<MT4> ) {
667  C(iend,j) = A(iend,k) * B(k,j);
668  }
669  }
670  }
671 
     // Symmetric/Hermitian result: every element above the diagonal (i<j) is copied from its
     // mirrored element C(j,i), conjugated in the Hermitian case.
672  if( SYM || HERM ) {
673  for( size_t j=1UL; j<N; ++j ) {
674  for( size_t i=0UL; i<j; ++i ) {
675  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
676  }
677  }
678  }
679  }
681  //**********************************************************************************************
682 
683  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default (scalar) assignment kernel C = A * B for a non-diagonal left operand type and a
// diagonal right operand type; the overload is enabled by the trailing EnableIf_t condition.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side (diagonal) operand; only its elements B(j,j) are read.
// Each element is a single product C(i,j) = A(i,j) * B(j,j). For a lower or upper A only the
// rows [ibegin,iend) of column j are computed and the remaining rows are reset.
697  template< typename MT3 // Type of the left-hand side target matrix
698  , typename MT4 // Type of the left-hand side matrix operand
699  , typename MT5 > // Type of the right-hand side matrix operand
700  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
701  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
702  {
704 
705  const size_t M( A.rows() );
706  const size_t N( B.columns() );
707 
708  for( size_t j=0UL; j<N; ++j )
709  {
     // Row range of column j, narrowed for a lower or upper A.
710  const size_t ibegin( ( IsLower_v<MT4> )
711  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
712  :( 0UL ) );
713  const size_t iend( ( IsUpper_v<MT4> )
714  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
715  :( M ) );
716  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
717 
718  if( IsLower_v<MT4> ) {
719  for( size_t i=0UL; i<ibegin; ++i ) {
720  reset( C(i,j) );
721  }
722  }
723  for( size_t i=ibegin; i<iend; ++i ) {
724  C(i,j) = A(i,j) * B(j,j);
725  }
726  if( IsUpper_v<MT4> ) {
727  for( size_t i=iend; i<M; ++i ) {
728  reset( C(i,j) );
729  }
730  }
731  }
732  }
734  //**********************************************************************************************
735 
736  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default (scalar) assignment kernel C = A * B for a diagonal left operand type and a
// non-diagonal right operand type; the overload is enabled by the trailing EnableIf_t
// condition.
// \param C The target matrix.
// \param A The left-hand side (diagonal) operand; only its elements A(i,i) are read.
// \param B The right-hand side operand.
// Each element is a single product C(i,j) = A(i,i) * B(i,j). The row range [ibegin,iend) of
// column j is derived from the structure of B (MT5).
// NOTE(review): the two reset loops are guarded by IsLower_v/IsUpper_v of MT4 (the diagonal
// operand type), not of MT5. Presumably both guards always hold for a diagonal type, in which
// case the loops are empty whenever B imposes no restriction (ibegin == 0, iend == M) -
// confirm that this is intended.
750  template< typename MT3 // Type of the left-hand side target matrix
751  , typename MT4 // Type of the left-hand side matrix operand
752  , typename MT5 > // Type of the right-hand side matrix operand
753  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
754  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
755  {
757 
758  const size_t M( A.rows() );
759  const size_t N( B.columns() );
760 
761  for( size_t j=0UL; j<N; ++j )
762  {
     // Row range of column j, narrowed for a lower or upper B.
763  const size_t ibegin( ( IsLower_v<MT5> )
764  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
765  :( 0UL ) );
766  const size_t iend( ( IsUpper_v<MT5> )
767  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
768  :( M ) );
769  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
770 
771  if( IsLower_v<MT4> ) {
772  for( size_t i=0UL; i<ibegin; ++i ) {
773  reset( C(i,j) );
774  }
775  }
776  for( size_t i=ibegin; i<iend; ++i ) {
777  C(i,j) = A(i,i) * B(i,j);
778  }
779  if( IsUpper_v<MT4> ) {
780  for( size_t i=iend; i<M; ++i ) {
781  reset( C(i,j) );
782  }
783  }
784  }
785  }
787  //**********************************************************************************************
788 
789  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default (scalar) assignment kernel C = A * B for two diagonal operand types; the overload
// is enabled by the trailing EnableIf_t condition.
// \param C The target matrix.
// \param A The left-hand side (diagonal) operand.
// \param B The right-hand side (diagonal) operand.
// The complete target is reset first, afterwards only the diagonal elements are written as
// C(i,i) = A(i,i) * B(i,i).
803  template< typename MT3 // Type of the left-hand side target matrix
804  , typename MT4 // Type of the left-hand side matrix operand
805  , typename MT5 > // Type of the right-hand side matrix operand
806  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
807  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
808  {
810 
811  reset( C );
812 
813  for( size_t i=0UL; i<A.rows(); ++i ) {
814  C(i,i) = A(i,i) * B(i,i);
815  }
816  }
818  //**********************************************************************************************
819 
820  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel for type combinations that cannot use the vectorized default
// kernel (the overload is disabled whenever UseVectorizedDefaultKernel_v holds).
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// Simply forwards to the matching selectDefaultAssignKernel() overload.
834  template< typename MT3 // Type of the left-hand side target matrix
835  , typename MT4 // Type of the left-hand side matrix operand
836  , typename MT5 > // Type of the right-hand side matrix operand
837  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
838  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
839  {
840  selectDefaultAssignKernel( C, A, B );
841  }
843  //**********************************************************************************************
844 
845  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Small-matrix assignment kernel for a row-major target when the vectorized default kernel is
// usable (both conditions are part of the trailing EnableIf_t).
// \param C The row-major target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// Instead of computing the product here, one operand is copied (serially) into a temporary of
// its opposite storage order and the product of the temporary with the other operand is
// assigned to C. The product expression is wrapped by a ForwardFunctor instance so that the
// structure flags of this expression are passed on.
// Choice of the operand to convert:
//  - A resizable, B not resizable -> B is converted,
//  - A not resizable, B resizable -> A is converted,
//  - otherwise the operand with fewer elements is converted (B when the counts are equal).
860  template< typename MT3 // Type of the left-hand side target matrix
861  , typename MT4 // Type of the left-hand side matrix operand
862  , typename MT5 > // Type of the right-hand side matrix operand
863  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
864  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
865  {
     // The opposite types of both operands are required to be row-major matrix types.
868  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
869  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
870 
871  const ForwardFunctor fwd;
872 
873  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
874  const OppositeType_t<MT5> tmp( serial( B ) );
875  assign( C, fwd( A * tmp ) );
876  }
877  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
878  const OppositeType_t<MT4> tmp( serial( A ) );
879  assign( C, fwd( tmp * B ) );
880  }
881  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
882  const OppositeType_t<MT5> tmp( serial( B ) );
883  assign( C, fwd( A * tmp ) );
884  }
885  else {
886  const OppositeType_t<MT4> tmp( serial( A ) );
887  assign( C, fwd( tmp * B ) );
888  }
889  }
891  //**********************************************************************************************
892 
893  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
908  template< typename MT3 // Type of the left-hand side target matrix
909  , typename MT4 // Type of the left-hand side matrix operand
910  , typename MT5 > // Type of the right-hand side matrix operand
911  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
912  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
913  {
914  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
915 
916  const size_t M( A.rows() );
917  const size_t N( B.columns() );
918  const size_t K( A.columns() );
919 
920  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
921 
922  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
923  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
924 
925  size_t i( 0UL );
926 
927  if( IsIntegral_v<ElementType> )
928  {
929  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
930  for( size_t j=0UL; j<N; ++j )
931  {
932  const size_t kbegin( ( IsLower_v<MT5> )
933  ?( ( IsUpper_v<MT4> )
934  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
935  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
936  :( IsUpper_v<MT4> ? i : 0UL ) );
937  const size_t kend( ( IsUpper_v<MT5> )
938  ?( ( IsLower_v<MT4> )
939  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
940  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
941  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
942 
943  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
944 
945  for( size_t k=kbegin; k<kend; ++k ) {
946  const SIMDType b1( set( B(k,j) ) );
947  xmm1 += A.load(i ,k) * b1;
948  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
949  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
950  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
951  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
952  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
953  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
954  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
955  }
956 
957  C.store( i , j, xmm1 );
958  C.store( i+SIMDSIZE , j, xmm2 );
959  C.store( i+SIMDSIZE*2UL, j, xmm3 );
960  C.store( i+SIMDSIZE*3UL, j, xmm4 );
961  C.store( i+SIMDSIZE*4UL, j, xmm5 );
962  C.store( i+SIMDSIZE*5UL, j, xmm6 );
963  C.store( i+SIMDSIZE*6UL, j, xmm7 );
964  C.store( i+SIMDSIZE*7UL, j, xmm8 );
965  }
966  }
967  }
968 
969  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
970  {
971  size_t j( 0UL );
972 
973  for( ; (j+2UL) <= N; j+=2UL )
974  {
975  const size_t kbegin( ( IsLower_v<MT5> )
976  ?( ( IsUpper_v<MT4> )
977  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
978  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
979  :( IsUpper_v<MT4> ? i : 0UL ) );
980  const size_t kend( ( IsUpper_v<MT5> )
981  ?( ( IsLower_v<MT4> )
982  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
983  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
984  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
985 
986  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
987 
988  for( size_t k=kbegin; k<kend; ++k ) {
989  const SIMDType a1( A.load(i ,k) );
990  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
991  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
992  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
993  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
994  const SIMDType b1( set( B(k,j ) ) );
995  const SIMDType b2( set( B(k,j+1UL) ) );
996  xmm1 += a1 * b1;
997  xmm2 += a2 * b1;
998  xmm3 += a3 * b1;
999  xmm4 += a4 * b1;
1000  xmm5 += a5 * b1;
1001  xmm6 += a1 * b2;
1002  xmm7 += a2 * b2;
1003  xmm8 += a3 * b2;
1004  xmm9 += a4 * b2;
1005  xmm10 += a5 * b2;
1006  }
1007 
1008  C.store( i , j , xmm1 );
1009  C.store( i+SIMDSIZE , j , xmm2 );
1010  C.store( i+SIMDSIZE*2UL, j , xmm3 );
1011  C.store( i+SIMDSIZE*3UL, j , xmm4 );
1012  C.store( i+SIMDSIZE*4UL, j , xmm5 );
1013  C.store( i , j+1UL, xmm6 );
1014  C.store( i+SIMDSIZE , j+1UL, xmm7 );
1015  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1016  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1017  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1018  }
1019 
1020  if( j < N )
1021  {
1022  const size_t kbegin( ( IsLower_v<MT5> )
1023  ?( ( IsUpper_v<MT4> )
1024  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1025  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1026  :( IsUpper_v<MT4> ? i : 0UL ) );
1027  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
1028 
1029  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1030 
1031  for( size_t k=kbegin; k<kend; ++k ) {
1032  const SIMDType b1( set( B(k,j) ) );
1033  xmm1 += A.load(i ,k) * b1;
1034  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1035  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1036  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1037  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1038  }
1039 
1040  C.store( i , j, xmm1 );
1041  C.store( i+SIMDSIZE , j, xmm2 );
1042  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1043  C.store( i+SIMDSIZE*3UL, j, xmm4 );
1044  C.store( i+SIMDSIZE*4UL, j, xmm5 );
1045  }
1046  }
1047 
1048  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1049  {
1050  const size_t jend( LOW ? min(i+SIMDSIZE*4UL,N) : N );
1051  size_t j( 0UL );
1052 
1053  if( SYM || HERM ) {
1054  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
1055  for( ; j<i; ++j ) {
1056  for( size_t ii=i; ii<iiend; ++ii ) {
1057  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
1058  }
1059  }
1060  }
1061  else if( UPP ) {
1062  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
1063  for( ; j<i; ++j ) {
1064  for( size_t ii=i; ii<iiend; ++ii ) {
1065  reset( C(ii,j) );
1066  }
1067  }
1068  }
1069 
1070  for( ; (j+2UL) <= jend; j+=2UL )
1071  {
1072  const size_t kbegin( ( IsLower_v<MT5> )
1073  ?( ( IsUpper_v<MT4> )
1074  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1075  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1076  :( IsUpper_v<MT4> ? i : 0UL ) );
1077  const size_t kend( ( IsUpper_v<MT5> )
1078  ?( ( IsLower_v<MT4> )
1079  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1080  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1081  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
1082 
1083  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1084 
1085  for( size_t k=kbegin; k<kend; ++k ) {
1086  const SIMDType a1( A.load(i ,k) );
1087  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1088  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1089  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1090  const SIMDType b1( set( B(k,j ) ) );
1091  const SIMDType b2( set( B(k,j+1UL) ) );
1092  xmm1 += a1 * b1;
1093  xmm2 += a2 * b1;
1094  xmm3 += a3 * b1;
1095  xmm4 += a4 * b1;
1096  xmm5 += a1 * b2;
1097  xmm6 += a2 * b2;
1098  xmm7 += a3 * b2;
1099  xmm8 += a4 * b2;
1100  }
1101 
1102  C.store( i , j , xmm1 );
1103  C.store( i+SIMDSIZE , j , xmm2 );
1104  C.store( i+SIMDSIZE*2UL, j , xmm3 );
1105  C.store( i+SIMDSIZE*3UL, j , xmm4 );
1106  C.store( i , j+1UL, xmm5 );
1107  C.store( i+SIMDSIZE , j+1UL, xmm6 );
1108  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1109  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1110  }
1111 
1112  if( j < jend )
1113  {
1114  const size_t kbegin( ( IsLower_v<MT5> )
1115  ?( ( IsUpper_v<MT4> )
1116  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1117  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1118  :( IsUpper_v<MT4> ? i : 0UL ) );
1119  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
1120 
1121  SIMDType xmm1, xmm2, xmm3, xmm4;
1122 
1123  for( size_t k=kbegin; k<kend; ++k ) {
1124  const SIMDType b1( set( B(k,j) ) );
1125  xmm1 += A.load(i ,k) * b1;
1126  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1127  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1128  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1129  }
1130 
1131  C.store( i , j, xmm1 );
1132  C.store( i+SIMDSIZE , j, xmm2 );
1133  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1134  C.store( i+SIMDSIZE*3UL, j, xmm4 );
1135 
1136  if( LOW ) ++j;
1137  }
1138 
1139  if( LOW ) {
1140  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
1141  for( ; j<N; ++j ) {
1142  for( size_t ii=i; ii<iiend; ++ii ) {
1143  reset( C(ii,j) );
1144  }
1145  }
1146  }
1147  }
1148 
1149  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1150  {
1151  const size_t jend( LOW ? min(i+SIMDSIZE*3UL,N) : N );
1152  size_t j( 0UL );
1153 
1154  if( SYM || HERM ) {
1155  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
1156  for( ; j<i; ++j ) {
1157  for( size_t ii=i; ii<iiend; ++ii ) {
1158  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
1159  }
1160  }
1161  }
1162  else if( UPP ) {
1163  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
1164  for( ; j<i; ++j ) {
1165  for( size_t ii=i; ii<iiend; ++ii ) {
1166  reset( C(ii,j) );
1167  }
1168  }
1169  }
1170 
1171  for( ; (j+2UL) <= jend; j+=2UL )
1172  {
1173  const size_t kbegin( ( IsLower_v<MT5> )
1174  ?( ( IsUpper_v<MT4> )
1175  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1176  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1177  :( IsUpper_v<MT4> ? i : 0UL ) );
1178  const size_t kend( ( IsUpper_v<MT5> )
1179  ?( ( IsLower_v<MT4> )
1180  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1181  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1182  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
1183 
1184  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1185 
1186  for( size_t k=kbegin; k<kend; ++k ) {
1187  const SIMDType a1( A.load(i ,k) );
1188  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1189  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1190  const SIMDType b1( set( B(k,j ) ) );
1191  const SIMDType b2( set( B(k,j+1UL) ) );
1192  xmm1 += a1 * b1;
1193  xmm2 += a2 * b1;
1194  xmm3 += a3 * b1;
1195  xmm4 += a1 * b2;
1196  xmm5 += a2 * b2;
1197  xmm6 += a3 * b2;
1198  }
1199 
1200  C.store( i , j , xmm1 );
1201  C.store( i+SIMDSIZE , j , xmm2 );
1202  C.store( i+SIMDSIZE*2UL, j , xmm3 );
1203  C.store( i , j+1UL, xmm4 );
1204  C.store( i+SIMDSIZE , j+1UL, xmm5 );
1205  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1206  }
1207 
1208  if( j < jend )
1209  {
1210  const size_t kbegin( ( IsLower_v<MT5> )
1211  ?( ( IsUpper_v<MT4> )
1212  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1213  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1214  :( IsUpper_v<MT4> ? i : 0UL ) );
1215  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
1216 
1217  SIMDType xmm1, xmm2, xmm3;
1218 
1219  for( size_t k=kbegin; k<kend; ++k ) {
1220  const SIMDType b1( set( B(k,j) ) );
1221  xmm1 += A.load(i ,k) * b1;
1222  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1223  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1224  }
1225 
1226  C.store( i , j, xmm1 );
1227  C.store( i+SIMDSIZE , j, xmm2 );
1228  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1229 
1230  if( LOW ) ++j;
1231  }
1232 
1233  if( LOW ) {
1234  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
1235  for( ; j<N; ++j ) {
1236  for( size_t ii=i; ii<iiend; ++ii ) {
1237  reset( C(ii,j) );
1238  }
1239  }
1240  }
1241  }
1242 
1243  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1244  {
1245  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
1246  size_t j( 0UL );
1247 
1248  if( SYM || HERM ) {
1249  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
1250  for( ; j<i; ++j ) {
1251  for( size_t ii=i; ii<iiend; ++ii ) {
1252  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
1253  }
1254  }
1255  }
1256  else if( UPP ) {
1257  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
1258  for( ; j<i; ++j ) {
1259  for( size_t ii=i; ii<iiend; ++ii ) {
1260  reset( C(ii,j) );
1261  }
1262  }
1263  }
1264 
1265  for( ; (j+4UL) <= jend; j+=4UL )
1266  {
1267  const size_t kbegin( ( IsLower_v<MT5> )
1268  ?( ( IsUpper_v<MT4> )
1269  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1270  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1271  :( IsUpper_v<MT4> ? i : 0UL ) );
1272  const size_t kend( ( IsUpper_v<MT5> )
1273  ?( ( IsLower_v<MT4> )
1274  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
1275  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
1276  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
1277 
1278  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1279 
1280  for( size_t k=kbegin; k<kend; ++k ) {
1281  const SIMDType a1( A.load(i ,k) );
1282  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1283  const SIMDType b1( set( B(k,j ) ) );
1284  const SIMDType b2( set( B(k,j+1UL) ) );
1285  const SIMDType b3( set( B(k,j+2UL) ) );
1286  const SIMDType b4( set( B(k,j+3UL) ) );
1287  xmm1 += a1 * b1;
1288  xmm2 += a2 * b1;
1289  xmm3 += a1 * b2;
1290  xmm4 += a2 * b2;
1291  xmm5 += a1 * b3;
1292  xmm6 += a2 * b3;
1293  xmm7 += a1 * b4;
1294  xmm8 += a2 * b4;
1295  }
1296 
1297  C.store( i , j , xmm1 );
1298  C.store( i+SIMDSIZE, j , xmm2 );
1299  C.store( i , j+1UL, xmm3 );
1300  C.store( i+SIMDSIZE, j+1UL, xmm4 );
1301  C.store( i , j+2UL, xmm5 );
1302  C.store( i+SIMDSIZE, j+2UL, xmm6 );
1303  C.store( i , j+3UL, xmm7 );
1304  C.store( i+SIMDSIZE, j+3UL, xmm8 );
1305  }
1306 
1307  for( ; (j+3UL) <= jend; j+=3UL )
1308  {
1309  const size_t kbegin( ( IsLower_v<MT5> )
1310  ?( ( IsUpper_v<MT4> )
1311  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1312  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1313  :( IsUpper_v<MT4> ? i : 0UL ) );
1314  const size_t kend( ( IsUpper_v<MT5> )
1315  ?( ( IsLower_v<MT4> )
1316  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
1317  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
1318  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
1319 
1320  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1321 
1322  for( size_t k=kbegin; k<kend; ++k ) {
1323  const SIMDType a1( A.load(i ,k) );
1324  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1325  const SIMDType b1( set( B(k,j ) ) );
1326  const SIMDType b2( set( B(k,j+1UL) ) );
1327  const SIMDType b3( set( B(k,j+2UL) ) );
1328  xmm1 += a1 * b1;
1329  xmm2 += a2 * b1;
1330  xmm3 += a1 * b2;
1331  xmm4 += a2 * b2;
1332  xmm5 += a1 * b3;
1333  xmm6 += a2 * b3;
1334  }
1335 
1336  C.store( i , j , xmm1 );
1337  C.store( i+SIMDSIZE, j , xmm2 );
1338  C.store( i , j+1UL, xmm3 );
1339  C.store( i+SIMDSIZE, j+1UL, xmm4 );
1340  C.store( i , j+2UL, xmm5 );
1341  C.store( i+SIMDSIZE, j+2UL, xmm6 );
1342  }
1343 
1344  for( ; (j+2UL) <= jend; j+=2UL )
1345  {
1346  const size_t kbegin( ( IsLower_v<MT5> )
1347  ?( ( IsUpper_v<MT4> )
1348  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1349  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1350  :( IsUpper_v<MT4> ? i : 0UL ) );
1351  const size_t kend( ( IsUpper_v<MT5> )
1352  ?( ( IsLower_v<MT4> )
1353  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1354  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1355  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
1356 
1357  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1358  size_t k( kbegin );
1359 
1360  for( ; (k+2UL) <= kend; k+=2UL ) {
1361  const SIMDType a1( A.load(i ,k ) );
1362  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
1363  const SIMDType a3( A.load(i ,k+1UL) );
1364  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
1365  const SIMDType b1( set( B(k ,j ) ) );
1366  const SIMDType b2( set( B(k ,j+1UL) ) );
1367  const SIMDType b3( set( B(k+1UL,j ) ) );
1368  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
1369  xmm1 += a1 * b1;
1370  xmm2 += a2 * b1;
1371  xmm3 += a1 * b2;
1372  xmm4 += a2 * b2;
1373  xmm5 += a3 * b3;
1374  xmm6 += a4 * b3;
1375  xmm7 += a3 * b4;
1376  xmm8 += a4 * b4;
1377  }
1378 
1379  for( ; k<kend; ++k ) {
1380  const SIMDType a1( A.load(i ,k) );
1381  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1382  const SIMDType b1( set( B(k,j ) ) );
1383  const SIMDType b2( set( B(k,j+1UL) ) );
1384  xmm1 += a1 * b1;
1385  xmm2 += a2 * b1;
1386  xmm3 += a1 * b2;
1387  xmm4 += a2 * b2;
1388  }
1389 
1390  C.store( i , j , xmm1+xmm5 );
1391  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
1392  C.store( i , j+1UL, xmm3+xmm7 );
1393  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
1394  }
1395 
1396  if( j < jend )
1397  {
1398  const size_t kbegin( ( IsLower_v<MT5> )
1399  ?( ( IsUpper_v<MT4> )
1400  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1401  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1402  :( IsUpper_v<MT4> ? i : 0UL ) );
1403  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
1404 
1405  SIMDType xmm1, xmm2, xmm3, xmm4;
1406  size_t k( kbegin );
1407 
1408  for( ; (k+2UL) <= kend; k+=2UL ) {
1409  const SIMDType b1( set( B(k ,j) ) );
1410  const SIMDType b2( set( B(k+1UL,j) ) );
1411  xmm1 += A.load(i ,k ) * b1;
1412  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
1413  xmm3 += A.load(i ,k+1UL) * b2;
1414  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
1415  }
1416 
1417  for( ; k<kend; ++k ) {
1418  const SIMDType b1( set( B(k,j) ) );
1419  xmm1 += A.load(i ,k) * b1;
1420  xmm2 += A.load(i+SIMDSIZE,k) * b1;
1421  }
1422 
1423  C.store( i , j, xmm1+xmm3 );
1424  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
1425 
1426  if( LOW ) ++j;
1427  }
1428 
1429  if( LOW ) {
1430  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
1431  for( ; j<N; ++j ) {
1432  for( size_t ii=i; ii<iiend; ++ii ) {
1433  reset( C(ii,j) );
1434  }
1435  }
1436  }
1437  }
1438 
1439  for( ; i<ipos; i+=SIMDSIZE )
1440  {
1441  const size_t jend( LOW ? min(i+SIMDSIZE,N) : N );
1442  size_t j( 0UL );
1443 
1444  if( SYM || HERM ) {
1445  const size_t iiend( min(i+SIMDSIZE,M) );
1446  for( ; j<i; ++j ) {
1447  for( size_t ii=i; ii<iiend; ++ii ) {
1448  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
1449  }
1450  }
1451  }
1452  else if( UPP ) {
1453  const size_t iiend( min(i+SIMDSIZE,M) );
1454  for( ; j<i; ++j ) {
1455  for( size_t ii=i; ii<iiend; ++ii ) {
1456  reset( C(ii,j) );
1457  }
1458  }
1459  }
1460 
1461  for( ; (j+4UL) <= jend; j+=4UL )
1462  {
1463  const size_t kbegin( ( IsLower_v<MT5> )
1464  ?( ( IsUpper_v<MT4> )
1465  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1466  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1467  :( IsUpper_v<MT4> ? i : 0UL ) );
1468  const size_t kend( ( IsUpper_v<MT5> )
1469  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
1470  :( K ) );
1471 
1472  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1473  size_t k( kbegin );
1474 
1475  for( ; (k+2UL) <= kend; k+=2UL ) {
1476  const SIMDType a1( A.load(i,k ) );
1477  const SIMDType a2( A.load(i,k+1UL) );
1478  xmm1 += a1 * set( B(k ,j ) );
1479  xmm2 += a1 * set( B(k ,j+1UL) );
1480  xmm3 += a1 * set( B(k ,j+2UL) );
1481  xmm4 += a1 * set( B(k ,j+3UL) );
1482  xmm5 += a2 * set( B(k+1UL,j ) );
1483  xmm6 += a2 * set( B(k+1UL,j+1UL) );
1484  xmm7 += a2 * set( B(k+1UL,j+2UL) );
1485  xmm8 += a2 * set( B(k+1UL,j+3UL) );
1486  }
1487 
1488  for( ; k<kend; ++k ) {
1489  const SIMDType a1( A.load(i,k) );
1490  xmm1 += a1 * set( B(k,j ) );
1491  xmm2 += a1 * set( B(k,j+1UL) );
1492  xmm3 += a1 * set( B(k,j+2UL) );
1493  xmm4 += a1 * set( B(k,j+3UL) );
1494  }
1495 
1496  C.store( i, j , xmm1+xmm5 );
1497  C.store( i, j+1UL, xmm2+xmm6 );
1498  C.store( i, j+2UL, xmm3+xmm7 );
1499  C.store( i, j+3UL, xmm4+xmm8 );
1500  }
1501 
1502  for( ; (j+3UL) <= jend; j+=3UL )
1503  {
1504  const size_t kbegin( ( IsLower_v<MT5> )
1505  ?( ( IsUpper_v<MT4> )
1506  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1507  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1508  :( IsUpper_v<MT4> ? i : 0UL ) );
1509  const size_t kend( ( IsUpper_v<MT5> )
1510  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
1511  :( K ) );
1512 
1513  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1514  size_t k( kbegin );
1515 
1516  for( ; (k+2UL) <= kend; k+=2UL ) {
1517  const SIMDType a1( A.load(i,k ) );
1518  const SIMDType a2( A.load(i,k+1UL) );
1519  xmm1 += a1 * set( B(k ,j ) );
1520  xmm2 += a1 * set( B(k ,j+1UL) );
1521  xmm3 += a1 * set( B(k ,j+2UL) );
1522  xmm4 += a2 * set( B(k+1UL,j ) );
1523  xmm5 += a2 * set( B(k+1UL,j+1UL) );
1524  xmm6 += a2 * set( B(k+1UL,j+2UL) );
1525  }
1526 
1527  for( ; k<kend; ++k ) {
1528  const SIMDType a1( A.load(i,k) );
1529  xmm1 += a1 * set( B(k,j ) );
1530  xmm2 += a1 * set( B(k,j+1UL) );
1531  xmm3 += a1 * set( B(k,j+2UL) );
1532  }
1533 
1534  C.store( i, j , xmm1+xmm4 );
1535  C.store( i, j+1UL, xmm2+xmm5 );
1536  C.store( i, j+2UL, xmm3+xmm6 );
1537  }
1538 
1539  for( ; (j+2UL) <= jend; j+=2UL )
1540  {
1541  const size_t kbegin( ( IsLower_v<MT5> )
1542  ?( ( IsUpper_v<MT4> )
1543  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1544  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1545  :( IsUpper_v<MT4> ? i : 0UL ) );
1546  const size_t kend( ( IsUpper_v<MT5> )
1547  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1548  :( K ) );
1549 
1550  SIMDType xmm1, xmm2, xmm3, xmm4;
1551  size_t k( kbegin );
1552 
1553  for( ; (k+2UL) <= kend; k+=2UL ) {
1554  const SIMDType a1( A.load(i,k ) );
1555  const SIMDType a2( A.load(i,k+1UL) );
1556  xmm1 += a1 * set( B(k ,j ) );
1557  xmm2 += a1 * set( B(k ,j+1UL) );
1558  xmm3 += a2 * set( B(k+1UL,j ) );
1559  xmm4 += a2 * set( B(k+1UL,j+1UL) );
1560  }
1561 
1562  for( ; k<kend; ++k ) {
1563  const SIMDType a1( A.load(i,k) );
1564  xmm1 += a1 * set( B(k,j ) );
1565  xmm2 += a1 * set( B(k,j+1UL) );
1566  }
1567 
1568  C.store( i, j , xmm1+xmm3 );
1569  C.store( i, j+1UL, xmm2+xmm4 );
1570  }
1571 
1572  if( j < jend )
1573  {
1574  const size_t kbegin( ( IsLower_v<MT5> )
1575  ?( ( IsUpper_v<MT4> )
1576  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1577  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1578  :( IsUpper_v<MT4> ? i : 0UL ) );
1579 
1580  SIMDType xmm1, xmm2;
1581  size_t k( kbegin );
1582 
1583  for( ; (k+2UL) <= K; k+=2UL ) {
1584  xmm1 += A.load(i,k ) * set( B(k ,j) );
1585  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
1586  }
1587 
1588  for( ; k<K; ++k ) {
1589  xmm1 += A.load(i,k) * set( B(k,j) );
1590  }
1591 
1592  C.store( i, j, xmm1+xmm2 );
1593 
1594  if( LOW ) ++j;
1595  }
1596 
1597  if( LOW ) {
1598  const size_t iiend( min(i+SIMDSIZE,M) );
1599  for( ; j<N; ++j ) {
1600  for( size_t ii=i; ii<iiend; ++ii ) {
1601  reset( C(ii,j) );
1602  }
1603  }
1604  }
1605  }
1606 
1607  for( ; remainder && i<M; ++i )
1608  {
1609  size_t j( 0UL );
1610 
1611  if( SYM || HERM ) {
1612  for( ; j<i; ++j ) {
1613  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1614  }
1615  }
1616  else if( UPP ) {
1617  for( ; j<i; ++j ) {
1618  reset( C(i,j) );
1619  }
1620  }
1621 
1622  for( ; (j+2UL) <= N; j+=2UL )
1623  {
1624  const size_t kbegin( ( IsLower_v<MT5> )
1625  ?( ( IsUpper_v<MT4> )
1626  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1627  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1628  :( IsUpper_v<MT4> ? i : 0UL ) );
1629  const size_t kend( ( IsUpper_v<MT5> )
1630  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1631  :( K ) );
1632 
1633  ElementType value1{};
1634  ElementType value2{};
1635 
1636  for( size_t k=kbegin; k<kend; ++k ) {
1637  value1 += A(i,k) * B(k,j );
1638  value2 += A(i,k) * B(k,j+1UL);
1639  }
1640 
1641  C(i,j ) = value1;
1642  C(i,j+1UL) = value2;
1643  }
1644 
1645  if( j < N )
1646  {
1647  const size_t kbegin( ( IsLower_v<MT5> )
1648  ?( ( IsUpper_v<MT4> )
1649  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1650  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1651  :( IsUpper_v<MT4> ? i : 0UL ) );
1652 
1653  ElementType value{};
1654 
1655  for( size_t k=kbegin; k<K; ++k ) {
1656  value += A(i,k) * B(k,j);
1657  }
1658 
1659  C(i,j) = value;
1660  }
1661  }
1662  }
1664  //**********************************************************************************************
1665 
1666  //**Default assignment to dense matrices (large matrices)***************************************
1680  template< typename MT3 // Type of the left-hand side target matrix
1681  , typename MT4 // Type of the left-hand side matrix operand
1682  , typename MT5 > // Type of the right-hand side matrix operand
1683  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1684  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1685  {
1686  selectDefaultAssignKernel( C, A, B );
1687  }
1689  //**********************************************************************************************
1690 
1691  //**Vectorized default assignment to dense matrices (large matrices)****************************
1706  template< typename MT3 // Type of the left-hand side target matrix
1707  , typename MT4 // Type of the left-hand side matrix operand
1708  , typename MT5 > // Type of the right-hand side matrix operand
1709  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1710  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1711  {
1712  if( SYM )
1713  smmm( C, A, B, ElementType(1) );
1714  else if( HERM )
1715  hmmm( C, A, B, ElementType(1) );
1716  else if( LOW )
1717  lmmm( C, A, B, ElementType(1), ElementType(0) );
1718  else if( UPP )
1719  ummm( C, A, B, ElementType(1), ElementType(0) );
1720  else
1721  mmm( C, A, B, ElementType(1), ElementType(0) );
1722  }
1724  //**********************************************************************************************
1725 
1726  //**BLAS-based assignment to dense matrices (default)*******************************************
1740  template< typename MT3 // Type of the left-hand side target matrix
1741  , typename MT4 // Type of the left-hand side matrix operand
1742  , typename MT5 > // Type of the right-hand side matrix operand
1743  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1744  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1745  {
1746  selectLargeAssignKernel( C, A, B );
1747  }
1749  //**********************************************************************************************
1750 
1751  //**BLAS-based assignment to dense matrices*****************************************************
1752 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1753 
1766  template< typename MT3 // Type of the left-hand side target matrix
1767  , typename MT4 // Type of the left-hand side matrix operand
1768  , typename MT5 > // Type of the right-hand side matrix operand
1769  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1770  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1771  {
1772  using ET = ElementType_t<MT3>;
1773 
1774  if( IsTriangular_v<MT4> ) {
1775  assign( C, B );
1776  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
1777  }
1778  else if( IsTriangular_v<MT5> ) {
1779  assign( C, A );
1780  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
1781  }
1782  else {
1783  gemm( C, A, B, ET(1), ET(0) );
1784  }
1785  }
1787 #endif
1788  //**********************************************************************************************
1789 
1790  //**Assignment to sparse matrices***************************************************************
1803  template< typename MT // Type of the target sparse matrix
1804  , bool SO > // Storage order of the target sparse matrix
1805  friend inline auto assign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1806  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1807  {
1809 
1810  using TmpType = If_t< SO, ResultType, OppositeType >;
1811 
1818 
1819  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1820  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1821 
1822  const ForwardFunctor fwd;
1823 
1824  const TmpType tmp( serial( rhs ) );
1825  assign( ~lhs, fwd( tmp ) );
1826  }
1828  //**********************************************************************************************
1829 
1830  //**Restructuring assignment to row-major matrices**********************************************
1845  template< typename MT > // Type of the target matrix
1846  friend inline auto assign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
1847  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1848  {
1850 
1852 
1853  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1854  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1855 
1856  const ForwardFunctor fwd;
1857 
1858  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
1859  assign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
1860  else if( IsSymmetric_v<MT1> )
1861  assign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
1862  else
1863  assign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
1864  }
1866  //**********************************************************************************************
1867 
1868  //**Addition assignment to dense matrices*******************************************************
1881  template< typename MT // Type of the target dense matrix
1882  , bool SO > // Storage order of the target dense matrix
1883  friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1884  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1885  {
1887 
1888  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1889  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1890 
1891  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1892  return;
1893  }
1894 
1895  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1896  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1897 
1898  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1899  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1900  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1901  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1902  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1903  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1904 
1905  TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1906  }
1908  //**********************************************************************************************
1909 
1910  //**Addition assignment to dense matrices (kernel selection)************************************
1921  template< typename MT3 // Type of the left-hand side target matrix
1922  , typename MT4 // Type of the left-hand side matrix operand
1923  , typename MT5 > // Type of the right-hand side matrix operand
1924  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1925  {
1926  if( ( IsDiagonal_v<MT4> ) ||
1927  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
1928  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1929  selectSmallAddAssignKernel( C, A, B );
1930  else
1931  selectBlasAddAssignKernel( C, A, B );
1932  }
1934  //**********************************************************************************************
1935 
1936  //**Default addition assignment to dense matrices (general/general)*****************************
1950  template< typename MT3 // Type of the left-hand side target matrix
1951  , typename MT4 // Type of the left-hand side matrix operand
1952  , typename MT5 > // Type of the right-hand side matrix operand
1953  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1954  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1955  {
1956  const size_t M( A.rows() );
1957  const size_t N( B.columns() );
1958  const size_t K( A.columns() );
1959 
1960  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1961 
1962  for( size_t j=0UL; j<N; ++j )
1963  {
1964  const size_t kbegin( ( IsLower_v<MT5> )
1965  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
1966  :( 0UL ) );
1967  const size_t kend( ( IsUpper_v<MT5> )
1968  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
1969  :( K ) );
1970  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1971 
1972  for( size_t k=kbegin; k<kend; ++k )
1973  {
1974  const size_t ibegin( ( IsLower_v<MT4> )
1975  ?( ( IsStrictlyLower_v<MT4> )
1976  ?( LOW ? max(j,k+1UL) : k+1UL )
1977  :( LOW ? max(j,k) : k ) )
1978  :( LOW ? j : 0UL ) );
1979  const size_t iend( ( IsUpper_v<MT4> )
1980  ?( ( IsStrictlyUpper_v<MT4> )
1981  ?( UPP ? min(j+1UL,k) : k )
1982  :( UPP ? min(j,k)+1UL : k+1UL ) )
1983  :( UPP ? j+1UL : M ) );
1984 
1985  if( ( LOW || UPP ) && ibegin >= iend ) continue;
1986  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1987 
1988  const size_t inum( iend - ibegin );
1989  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1990 
1991  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1992  C(i ,j) += A(i ,k) * B(k,j);
1993  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1994  }
1995  if( ipos < iend ) {
1996  C(ipos,j) += A(ipos,k) * B(k,j);
1997  }
1998  }
1999  }
2000  }
2002  //**********************************************************************************************
2003 
2004  //**Default addition assignment to dense matrices (general/diagonal)****************************
2018  template< typename MT3 // Type of the left-hand side target matrix
2019  , typename MT4 // Type of the left-hand side matrix operand
2020  , typename MT5 > // Type of the right-hand side matrix operand
2021  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2022  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2023  {
2025 
2026  const size_t M( A.rows() );
2027  const size_t N( B.columns() );
2028 
2029  for( size_t j=0UL; j<N; ++j )
2030  {
2031  const size_t ibegin( ( IsLower_v<MT4> )
2032  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
2033  :( 0UL ) );
2034  const size_t iend( ( IsUpper_v<MT4> )
2035  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
2036  :( M ) );
2037  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2038 
2039  const size_t inum( iend - ibegin );
2040  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2041 
2042  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2043  C(i ,j) += A(i ,j) * B(j,j);
2044  C(i+1UL,j) += A(i+1UL,j) * B(j,j);
2045  }
2046  if( ipos < iend ) {
2047  C(ipos,j) += A(ipos,j) * B(j,j);
2048  }
2049  }
2050  }
2052  //**********************************************************************************************
2053 
2054  //**Default addition assignment to dense matrices (diagonal/general)****************************
2068  template< typename MT3 // Type of the left-hand side target matrix
2069  , typename MT4 // Type of the left-hand side matrix operand
2070  , typename MT5 > // Type of the right-hand side matrix operand
2071  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2072  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2073  {
2075 
2076  const size_t M( A.rows() );
2077  const size_t N( B.columns() );
2078 
2079  for( size_t j=0UL; j<N; ++j )
2080  {
2081  const size_t ibegin( ( IsLower_v<MT5> )
2082  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2083  :( 0UL ) );
2084  const size_t iend( ( IsUpper_v<MT5> )
2085  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2086  :( M ) );
2087  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2088 
2089  const size_t inum( iend - ibegin );
2090  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2091 
2092  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2093  C(i ,j) += A(i ,i ) * B(i ,j);
2094  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
2095  }
2096  if( ipos < iend ) {
2097  C(ipos,j) += A(ipos,ipos) * B(ipos,j);
2098  }
2099  }
2100  }
2102  //**********************************************************************************************
2103 
2104  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2118  template< typename MT3 // Type of the left-hand side target matrix
2119  , typename MT4 // Type of the left-hand side matrix operand
2120  , typename MT5 > // Type of the right-hand side matrix operand
2121  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2122  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2123  {
2125 
2126  for( size_t i=0UL; i<A.rows(); ++i ) {
2127  C(i,i) += A(i,i) * B(i,i);
2128  }
2129  }
2131  //**********************************************************************************************
2132 
2133  //**Default addition assignment to dense matrices (small matrices)******************************
2147  template< typename MT3 // Type of the left-hand side target matrix
2148  , typename MT4 // Type of the left-hand side matrix operand
2149  , typename MT5 > // Type of the right-hand side matrix operand
2150  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2151  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2152  {
2153  selectDefaultAddAssignKernel( C, A, B );
2154  }
2156  //**********************************************************************************************
2157 
2158  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
2173  template< typename MT3 // Type of the left-hand side target matrix
2174  , typename MT4 // Type of the left-hand side matrix operand
2175  , typename MT5 > // Type of the right-hand side matrix operand
2176  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2177  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2178  {
2181  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
2182  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
2183 
2184  const ForwardFunctor fwd;
2185 
2186  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
2187  const OppositeType_t<MT5> tmp( serial( B ) );
2188  addAssign( C, fwd( A * tmp ) );
2189  }
2190  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
2191  const OppositeType_t<MT4> tmp( serial( A ) );
2192  addAssign( C, fwd( tmp * B ) );
2193  }
2194  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2195  const OppositeType_t<MT5> tmp( serial( B ) );
2196  addAssign( C, fwd( A * tmp ) );
2197  }
2198  else {
2199  const OppositeType_t<MT4> tmp( serial( A ) );
2200  addAssign( C, fwd( tmp * B ) );
2201  }
2202  }
2204  //**********************************************************************************************
2205 
2206  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2221  template< typename MT3 // Type of the left-hand side target matrix
2222  , typename MT4 // Type of the left-hand side matrix operand
2223  , typename MT5 > // Type of the right-hand side matrix operand
2224  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2225  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2226  {
2227  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
2228 
2229  const size_t M( A.rows() );
2230  const size_t N( B.columns() );
2231  const size_t K( A.columns() );
2232 
2233  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2234 
2235  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
2236  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
2237 
2238  size_t i( 0UL );
2239 
2240  if( IsIntegral_v<ElementType> )
2241  {
2242  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
2243  for( size_t j=0UL; j<N; ++j )
2244  {
2245  const size_t kbegin( ( IsLower_v<MT5> )
2246  ?( ( IsUpper_v<MT4> )
2247  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2248  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2249  :( IsUpper_v<MT4> ? i : 0UL ) );
2250  const size_t kend( ( IsUpper_v<MT5> )
2251  ?( ( IsLower_v<MT4> )
2252  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2253  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
2254  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
2255 
2256  SIMDType xmm1( C.load(i ,j) );
2257  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2258  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2259  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
2260  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
2261  SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
2262  SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
2263  SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
2264 
2265  for( size_t k=kbegin; k<kend; ++k ) {
2266  const SIMDType b1( set( B(k,j) ) );
2267  xmm1 += A.load(i ,k) * b1;
2268  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2269  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2270  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2271  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2272  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
2273  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
2274  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
2275  }
2276 
2277  C.store( i , j, xmm1 );
2278  C.store( i+SIMDSIZE , j, xmm2 );
2279  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2280  C.store( i+SIMDSIZE*3UL, j, xmm4 );
2281  C.store( i+SIMDSIZE*4UL, j, xmm5 );
2282  C.store( i+SIMDSIZE*5UL, j, xmm6 );
2283  C.store( i+SIMDSIZE*6UL, j, xmm7 );
2284  C.store( i+SIMDSIZE*7UL, j, xmm8 );
2285  }
2286  }
2287  }
2288 
2289  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
2290  {
2291  size_t j( 0UL );
2292 
2293  for( ; (j+2UL) <= N; j+=2UL )
2294  {
2295  const size_t kbegin( ( IsLower_v<MT5> )
2296  ?( ( IsUpper_v<MT4> )
2297  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2298  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2299  :( IsUpper_v<MT4> ? i : 0UL ) );
2300  const size_t kend( ( IsUpper_v<MT5> )
2301  ?( ( IsLower_v<MT4> )
2302  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2303  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2304  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
2305 
2306  SIMDType xmm1 ( C.load(i ,j ) );
2307  SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
2308  SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
2309  SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
2310  SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
2311  SIMDType xmm6 ( C.load(i ,j+1UL) );
2312  SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
2313  SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
2314  SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
2315  SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
2316 
2317  for( size_t k=kbegin; k<kend; ++k ) {
2318  const SIMDType a1( A.load(i ,k) );
2319  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2320  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2321  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2322  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
2323  const SIMDType b1( set( B(k,j ) ) );
2324  const SIMDType b2( set( B(k,j+1UL) ) );
2325  xmm1 += a1 * b1;
2326  xmm2 += a2 * b1;
2327  xmm3 += a3 * b1;
2328  xmm4 += a4 * b1;
2329  xmm5 += a5 * b1;
2330  xmm6 += a1 * b2;
2331  xmm7 += a2 * b2;
2332  xmm8 += a3 * b2;
2333  xmm9 += a4 * b2;
2334  xmm10 += a5 * b2;
2335  }
2336 
2337  C.store( i , j , xmm1 );
2338  C.store( i+SIMDSIZE , j , xmm2 );
2339  C.store( i+SIMDSIZE*2UL, j , xmm3 );
2340  C.store( i+SIMDSIZE*3UL, j , xmm4 );
2341  C.store( i+SIMDSIZE*4UL, j , xmm5 );
2342  C.store( i , j+1UL, xmm6 );
2343  C.store( i+SIMDSIZE , j+1UL, xmm7 );
2344  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
2345  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
2346  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
2347  }
2348 
2349  if( j < N )
2350  {
2351  const size_t kbegin( ( IsLower_v<MT5> )
2352  ?( ( IsUpper_v<MT4> )
2353  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2354  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2355  :( IsUpper_v<MT4> ? i : 0UL ) );
2356  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
2357 
2358  SIMDType xmm1( C.load(i ,j) );
2359  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2360  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2361  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
2362  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
2363 
2364  for( size_t k=kbegin; k<kend; ++k ) {
2365  const SIMDType b1( set( B(k,j) ) );
2366  xmm1 += A.load(i ,k) * b1;
2367  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2368  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2369  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2370  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2371  }
2372 
2373  C.store( i , j, xmm1 );
2374  C.store( i+SIMDSIZE , j, xmm2 );
2375  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2376  C.store( i+SIMDSIZE*3UL, j, xmm4 );
2377  C.store( i+SIMDSIZE*4UL, j, xmm5 );
2378  }
2379  }
2380 
2381  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2382  {
2383  size_t j( 0UL );
2384 
2385  for( ; (j+2UL) <= N; j+=2UL )
2386  {
2387  const size_t kbegin( ( IsLower_v<MT5> )
2388  ?( ( IsUpper_v<MT4> )
2389  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2390  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2391  :( IsUpper_v<MT4> ? i : 0UL ) );
2392  const size_t kend( ( IsUpper_v<MT5> )
2393  ?( ( IsLower_v<MT4> )
2394  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2395  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2396  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
2397 
2398  SIMDType xmm1( C.load(i ,j ) );
2399  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
2400  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
2401  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
2402  SIMDType xmm5( C.load(i ,j+1UL) );
2403  SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
2404  SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
2405  SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
2406 
2407  for( size_t k=kbegin; k<kend; ++k ) {
2408  const SIMDType a1( A.load(i ,k) );
2409  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2410  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2411  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2412  const SIMDType b1( set( B(k,j ) ) );
2413  const SIMDType b2( set( B(k,j+1UL) ) );
2414  xmm1 += a1 * b1;
2415  xmm2 += a2 * b1;
2416  xmm3 += a3 * b1;
2417  xmm4 += a4 * b1;
2418  xmm5 += a1 * b2;
2419  xmm6 += a2 * b2;
2420  xmm7 += a3 * b2;
2421  xmm8 += a4 * b2;
2422  }
2423 
2424  C.store( i , j , xmm1 );
2425  C.store( i+SIMDSIZE , j , xmm2 );
2426  C.store( i+SIMDSIZE*2UL, j , xmm3 );
2427  C.store( i+SIMDSIZE*3UL, j , xmm4 );
2428  C.store( i , j+1UL, xmm5 );
2429  C.store( i+SIMDSIZE , j+1UL, xmm6 );
2430  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2431  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
2432  }
2433 
2434  if( j < N )
2435  {
2436  const size_t kbegin( ( IsLower_v<MT5> )
2437  ?( ( IsUpper_v<MT4> )
2438  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2439  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2440  :( IsUpper_v<MT4> ? i : 0UL ) );
2441  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
2442 
2443  SIMDType xmm1( C.load(i ,j) );
2444  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2445  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2446  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
2447 
2448  for( size_t k=kbegin; k<kend; ++k ) {
2449  const SIMDType b1( set( B(k,j) ) );
2450  xmm1 += A.load(i ,k) * b1;
2451  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2452  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2453  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2454  }
2455 
2456  C.store( i , j, xmm1 );
2457  C.store( i+SIMDSIZE , j, xmm2 );
2458  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2459  C.store( i+SIMDSIZE*3UL, j, xmm4 );
2460  }
2461  }
2462 
2463  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2464  {
2465  size_t j( 0UL );
2466 
2467  for( ; (j+2UL) <= N; j+=2UL )
2468  {
2469  const size_t kbegin( ( IsLower_v<MT5> )
2470  ?( ( IsUpper_v<MT4> )
2471  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2472  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2473  :( IsUpper_v<MT4> ? i : 0UL ) );
2474  const size_t kend( ( IsUpper_v<MT5> )
2475  ?( ( IsLower_v<MT4> )
2476  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2477  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2478  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
2479 
2480  SIMDType xmm1( C.load(i ,j ) );
2481  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
2482  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
2483  SIMDType xmm4( C.load(i ,j+1UL) );
2484  SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
2485  SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
2486 
2487  for( size_t k=kbegin; k<kend; ++k ) {
2488  const SIMDType a1( A.load(i ,k) );
2489  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2490  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2491  const SIMDType b1( set( B(k,j ) ) );
2492  const SIMDType b2( set( B(k,j+1UL) ) );
2493  xmm1 += a1 * b1;
2494  xmm2 += a2 * b1;
2495  xmm3 += a3 * b1;
2496  xmm4 += a1 * b2;
2497  xmm5 += a2 * b2;
2498  xmm6 += a3 * b2;
2499  }
2500 
2501  C.store( i , j , xmm1 );
2502  C.store( i+SIMDSIZE , j , xmm2 );
2503  C.store( i+SIMDSIZE*2UL, j , xmm3 );
2504  C.store( i , j+1UL, xmm4 );
2505  C.store( i+SIMDSIZE , j+1UL, xmm5 );
2506  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2507  }
2508 
2509  if( j < N )
2510  {
2511  const size_t kbegin( ( IsLower_v<MT5> )
2512  ?( ( IsUpper_v<MT4> )
2513  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2514  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2515  :( IsUpper_v<MT4> ? i : 0UL ) );
2516  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
2517 
2518  SIMDType xmm1( C.load(i ,j) );
2519  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2520  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2521 
2522  for( size_t k=kbegin; k<kend; ++k ) {
2523  const SIMDType b1( set( B(k,j) ) );
2524  xmm1 += A.load(i ,k) * b1;
2525  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2526  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2527  }
2528 
2529  C.store( i , j, xmm1 );
2530  C.store( i+SIMDSIZE , j, xmm2 );
2531  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2532  }
2533  }
2534 
2535  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2536  {
2537  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
2538  size_t j( UPP ? i : 0UL );
2539 
2540  for( ; (j+4UL) <= jend; j+=4UL )
2541  {
2542  const size_t kbegin( ( IsLower_v<MT5> )
2543  ?( ( IsUpper_v<MT4> )
2544  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2545  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2546  :( IsUpper_v<MT4> ? i : 0UL ) );
2547  const size_t kend( ( IsUpper_v<MT5> )
2548  ?( ( IsLower_v<MT4> )
2549  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2550  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2551  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2552 
2553  SIMDType xmm1( C.load(i ,j ) );
2554  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
2555  SIMDType xmm3( C.load(i ,j+1UL) );
2556  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
2557  SIMDType xmm5( C.load(i ,j+2UL) );
2558  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
2559  SIMDType xmm7( C.load(i ,j+3UL) );
2560  SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
2561 
2562  for( size_t k=kbegin; k<kend; ++k ) {
2563  const SIMDType a1( A.load(i ,k) );
2564  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2565  const SIMDType b1( set( B(k,j ) ) );
2566  const SIMDType b2( set( B(k,j+1UL) ) );
2567  const SIMDType b3( set( B(k,j+2UL) ) );
2568  const SIMDType b4( set( B(k,j+3UL) ) );
2569  xmm1 += a1 * b1;
2570  xmm2 += a2 * b1;
2571  xmm3 += a1 * b2;
2572  xmm4 += a2 * b2;
2573  xmm5 += a1 * b3;
2574  xmm6 += a2 * b3;
2575  xmm7 += a1 * b4;
2576  xmm8 += a2 * b4;
2577  }
2578 
2579  C.store( i , j , xmm1 );
2580  C.store( i+SIMDSIZE, j , xmm2 );
2581  C.store( i , j+1UL, xmm3 );
2582  C.store( i+SIMDSIZE, j+1UL, xmm4 );
2583  C.store( i , j+2UL, xmm5 );
2584  C.store( i+SIMDSIZE, j+2UL, xmm6 );
2585  C.store( i , j+3UL, xmm7 );
2586  C.store( i+SIMDSIZE, j+3UL, xmm8 );
2587  }
2588 
2589  for( ; (j+3UL) <= jend; j+=3UL )
2590  {
2591  const size_t kbegin( ( IsLower_v<MT5> )
2592  ?( ( IsUpper_v<MT4> )
2593  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2594  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2595  :( IsUpper_v<MT4> ? i : 0UL ) );
2596  const size_t kend( ( IsUpper_v<MT5> )
2597  ?( ( IsLower_v<MT4> )
2598  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2599  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2600  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2601 
2602  SIMDType xmm1( C.load(i ,j ) );
2603  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
2604  SIMDType xmm3( C.load(i ,j+1UL) );
2605  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
2606  SIMDType xmm5( C.load(i ,j+2UL) );
2607  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
2608 
2609  for( size_t k=kbegin; k<kend; ++k ) {
2610  const SIMDType a1( A.load(i ,k) );
2611  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2612  const SIMDType b1( set( B(k,j ) ) );
2613  const SIMDType b2( set( B(k,j+1UL) ) );
2614  const SIMDType b3( set( B(k,j+2UL) ) );
2615  xmm1 += a1 * b1;
2616  xmm2 += a2 * b1;
2617  xmm3 += a1 * b2;
2618  xmm4 += a2 * b2;
2619  xmm5 += a1 * b3;
2620  xmm6 += a2 * b3;
2621  }
2622 
2623  C.store( i , j , xmm1 );
2624  C.store( i+SIMDSIZE, j , xmm2 );
2625  C.store( i , j+1UL, xmm3 );
2626  C.store( i+SIMDSIZE, j+1UL, xmm4 );
2627  C.store( i , j+2UL, xmm5 );
2628  C.store( i+SIMDSIZE, j+2UL, xmm6 );
2629  }
2630 
2631  for( ; (j+2UL) <= jend; j+=2UL )
2632  {
2633  const size_t kbegin( ( IsLower_v<MT5> )
2634  ?( ( IsUpper_v<MT4> )
2635  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2636  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2637  :( IsUpper_v<MT4> ? i : 0UL ) );
2638  const size_t kend( ( IsUpper_v<MT5> )
2639  ?( ( IsLower_v<MT4> )
2640  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2641  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2642  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2643 
2644  SIMDType xmm1( C.load(i ,j ) );
2645  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
2646  SIMDType xmm3( C.load(i ,j+1UL) );
2647  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
2648  SIMDType xmm5, xmm6, xmm7, xmm8;
2649  size_t k( kbegin );
2650 
2651  for( ; (k+2UL) < kend; k+=2UL ) {
2652  const SIMDType a1( A.load(i ,k ) );
2653  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2654  const SIMDType a3( A.load(i ,k+1UL) );
2655  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2656  const SIMDType b1( set( B(k ,j ) ) );
2657  const SIMDType b2( set( B(k ,j+1UL) ) );
2658  const SIMDType b3( set( B(k+1UL,j ) ) );
2659  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
2660  xmm1 += a1 * b1;
2661  xmm2 += a2 * b1;
2662  xmm3 += a1 * b2;
2663  xmm4 += a2 * b2;
2664  xmm5 += a3 * b3;
2665  xmm6 += a4 * b3;
2666  xmm7 += a3 * b4;
2667  xmm8 += a4 * b4;
2668  }
2669 
2670  for( ; k<kend; ++k ) {
2671  const SIMDType a1( A.load(i ,k) );
2672  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2673  const SIMDType b1( set( B(k,j ) ) );
2674  const SIMDType b2( set( B(k,j+1UL) ) );
2675  xmm1 += a1 * b1;
2676  xmm2 += a2 * b1;
2677  xmm3 += a1 * b2;
2678  xmm4 += a2 * b2;
2679  }
2680 
2681  C.store( i , j , xmm1+xmm5 );
2682  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
2683  C.store( i , j+1UL, xmm3+xmm7 );
2684  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2685  }
2686 
2687  if( j < jend )
2688  {
2689  const size_t kbegin( ( IsLower_v<MT5> )
2690  ?( ( IsUpper_v<MT4> )
2691  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2692  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2693  :( IsUpper_v<MT4> ? i : 0UL ) );
2694  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2695 
2696  SIMDType xmm1( C.load(i ,j) );
2697  SIMDType xmm2( C.load(i+SIMDSIZE,j) );
2698  SIMDType xmm3, xmm4;
2699  size_t k( kbegin );
2700 
2701  for( ; (k+2UL) <= kend; k+=2UL ) {
2702  const SIMDType b1( set( B(k ,j) ) );
2703  const SIMDType b2( set( B(k+1UL,j) ) );
2704  xmm1 += A.load(i ,k ) * b1;
2705  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2706  xmm3 += A.load(i ,k+1UL) * b2;
2707  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2708  }
2709 
2710  for( ; k<kend; ++k ) {
2711  const SIMDType b1( set( B(k,j) ) );
2712  xmm1 += A.load(i ,k) * b1;
2713  xmm2 += A.load(i+SIMDSIZE,k) * b1;
2714  }
2715 
2716  C.store( i , j, xmm1+xmm3 );
2717  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
2718  }
2719  }
2720 
2721  for( ; i<ipos; i+=SIMDSIZE )
2722  {
2723  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
2724  size_t j( UPP ? i : 0UL );
2725 
2726  for( ; (j+4UL) <= jend; j+=4UL )
2727  {
2728  const size_t kbegin( ( IsLower_v<MT5> )
2729  ?( ( IsUpper_v<MT4> )
2730  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2731  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2732  :( IsUpper_v<MT4> ? i : 0UL ) );
2733  const size_t kend( ( IsUpper_v<MT5> )
2734  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2735  :( K ) );
2736 
2737  SIMDType xmm1( C.load(i,j ) );
2738  SIMDType xmm2( C.load(i,j+1UL) );
2739  SIMDType xmm3( C.load(i,j+2UL) );
2740  SIMDType xmm4( C.load(i,j+3UL) );
2741  SIMDType xmm5, xmm6, xmm7, xmm8;
2742  size_t k( kbegin );
2743 
2744  for( ; (k+2UL) <= kend; k+=2UL ) {
2745  const SIMDType a1( A.load(i,k ) );
2746  const SIMDType a2( A.load(i,k+1UL) );
2747  xmm1 += a1 * set( B(k ,j ) );
2748  xmm2 += a1 * set( B(k ,j+1UL) );
2749  xmm3 += a1 * set( B(k ,j+2UL) );
2750  xmm4 += a1 * set( B(k ,j+3UL) );
2751  xmm5 += a2 * set( B(k+1UL,j ) );
2752  xmm6 += a2 * set( B(k+1UL,j+1UL) );
2753  xmm7 += a2 * set( B(k+1UL,j+2UL) );
2754  xmm8 += a2 * set( B(k+1UL,j+3UL) );
2755  }
2756 
2757  for( ; k<kend; ++k ) {
2758  const SIMDType a1( A.load(i,k) );
2759  xmm1 += a1 * set( B(k,j ) );
2760  xmm2 += a1 * set( B(k,j+1UL) );
2761  xmm3 += a1 * set( B(k,j+2UL) );
2762  xmm4 += a1 * set( B(k,j+3UL) );
2763  }
2764 
2765  C.store( i, j , xmm1+xmm5 );
2766  C.store( i, j+1UL, xmm2+xmm6 );
2767  C.store( i, j+2UL, xmm3+xmm7 );
2768  C.store( i, j+3UL, xmm4+xmm8 );
2769  }
2770 
2771  for( ; (j+3UL) <= jend; j+=3UL )
2772  {
2773  const size_t kbegin( ( IsLower_v<MT5> )
2774  ?( ( IsUpper_v<MT4> )
2775  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2776  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2777  :( IsUpper_v<MT4> ? i : 0UL ) );
2778  const size_t kend( ( IsUpper_v<MT5> )
2779  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2780  :( K ) );
2781 
2782  SIMDType xmm1( C.load(i,j ) );
2783  SIMDType xmm2( C.load(i,j+1UL) );
2784  SIMDType xmm3( C.load(i,j+2UL) );
2785  SIMDType xmm4, xmm5, xmm6;
2786  size_t k( kbegin );
2787 
2788  for( ; (k+2UL) <= kend; k+=2UL ) {
2789  const SIMDType a1( A.load(i,k ) );
2790  const SIMDType a2( A.load(i,k+1UL) );
2791  xmm1 += a1 * set( B(k ,j ) );
2792  xmm2 += a1 * set( B(k ,j+1UL) );
2793  xmm3 += a1 * set( B(k ,j+2UL) );
2794  xmm4 += a2 * set( B(k+1UL,j ) );
2795  xmm5 += a2 * set( B(k+1UL,j+1UL) );
2796  xmm6 += a2 * set( B(k+1UL,j+2UL) );
2797  }
2798 
2799  for( ; k<kend; ++k ) {
2800  const SIMDType a1( A.load(i,k) );
2801  xmm1 += a1 * set( B(k,j ) );
2802  xmm2 += a1 * set( B(k,j+1UL) );
2803  xmm3 += a1 * set( B(k,j+2UL) );
2804  }
2805 
2806  C.store( i, j , xmm1+xmm4 );
2807  C.store( i, j+1UL, xmm2+xmm5 );
2808  C.store( i, j+2UL, xmm3+xmm6 );
2809  }
2810 
2811  for( ; (j+2UL) <= jend; j+=2UL )
2812  {
2813  const size_t kbegin( ( IsLower_v<MT5> )
2814  ?( ( IsUpper_v<MT4> )
2815  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2816  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2817  :( IsUpper_v<MT4> ? i : 0UL ) );
2818  const size_t kend( ( IsUpper_v<MT5> )
2819  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2820  :( K ) );
2821 
2822  SIMDType xmm1( C.load(i,j ) );
2823  SIMDType xmm2( C.load(i,j+1UL) );
2824  SIMDType xmm3, xmm4;
2825  size_t k( kbegin );
2826 
2827  for( ; (k+2UL) <= kend; k+=2UL ) {
2828  const SIMDType a1( A.load(i,k ) );
2829  const SIMDType a2( A.load(i,k+1UL) );
2830  xmm1 += a1 * set( B(k ,j ) );
2831  xmm2 += a1 * set( B(k ,j+1UL) );
2832  xmm3 += a2 * set( B(k+1UL,j ) );
2833  xmm4 += a2 * set( B(k+1UL,j+1UL) );
2834  }
2835 
2836  for( ; k<kend; ++k ) {
2837  const SIMDType a1( A.load(i,k) );
2838  xmm1 += a1 * set( B(k,j ) );
2839  xmm2 += a1 * set( B(k,j+1UL) );
2840  }
2841 
2842  C.store( i, j , xmm1+xmm3 );
2843  C.store( i, j+1UL, xmm2+xmm4 );
2844  }
2845 
2846  if( j < jend )
2847  {
2848  const size_t kbegin( ( IsLower_v<MT5> )
2849  ?( ( IsUpper_v<MT4> )
2850  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2851  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2852  :( IsUpper_v<MT4> ? i : 0UL ) );
2853 
2854  SIMDType xmm1( C.load(i,j) );
2855  SIMDType xmm2;
2856  size_t k( kbegin );
2857 
2858  for( ; (k+2UL) <= K; k+=2UL ) {
2859  xmm1 += A.load(i,k ) * set( B(k ,j) );
2860  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
2861  }
2862 
2863  for( ; k<K; ++k ) {
2864  xmm1 += A.load(i,k) * set( B(k,j) );
2865  }
2866 
2867  C.store( i, j, xmm1+xmm2 );
2868  }
2869  }
2870 
2871  for( ; remainder && i<M; ++i )
2872  {
2873  const size_t jend( LOW ? i+1UL : N );
2874  size_t j( UPP ? i : 0UL );
2875 
2876  for( ; (j+2UL) <= jend; j+=2UL )
2877  {
2878  const size_t kbegin( ( IsLower_v<MT5> )
2879  ?( ( IsUpper_v<MT4> )
2880  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2881  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2882  :( IsUpper_v<MT4> ? i : 0UL ) );
2883  const size_t kend( ( IsUpper_v<MT5> )
2884  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2885  :( K ) );
2886 
2887  ElementType value1( C(i,j ) );
2888  ElementType value2( C(i,j+1UL) );
2889 
2890  for( size_t k=kbegin; k<kend; ++k ) {
2891  value1 += A(i,k) * B(k,j );
2892  value2 += A(i,k) * B(k,j+1UL);
2893  }
2894 
2895  C(i,j ) = value1;
2896  C(i,j+1UL) = value2;
2897  }
2898 
2899  if( j < jend )
2900  {
2901  const size_t kbegin( ( IsLower_v<MT5> )
2902  ?( ( IsUpper_v<MT4> )
2903  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2904  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2905  :( IsUpper_v<MT4> ? i : 0UL ) );
2906 
2907  ElementType value( C(i,j) );
2908 
2909  for( size_t k=kbegin; k<K; ++k ) {
2910  value += A(i,k) * B(k,j);
2911  }
2912 
2913  C(i,j) = value;
2914  }
2915  }
2916  }
2918  //**********************************************************************************************
2919 
2920  //**Default addition assignment to dense matrices (large matrices)******************************
/*!\brief Addition assignment kernel for large matrices when the vectorized kernel is not usable.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload participates in overload resolution only if
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t). It performs no work of its
// own and forwards all three arguments to selectDefaultAddAssignKernel().
*/
2934  template< typename MT3 // Type of the left-hand side target matrix
2935  , typename MT4 // Type of the left-hand side matrix operand
2936  , typename MT5 > // Type of the right-hand side matrix operand
2937  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2938  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2939  {
2940  selectDefaultAddAssignKernel( C, A, B );
2941  }
2943  //**********************************************************************************************
2944 
2945  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
/*!\brief Vectorized addition assignment kernel for large matrices.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload participates in overload resolution only if
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true (EnableIf_t). Depending on the LOW and UPP
// flags it dispatches to lmmm(), ummm() or mmm(); LOW takes precedence when both are set.
// Both scalar arguments are ElementType(1).
// NOTE(review): the two scalars are presumably the scaling factors for A*B and for the existing
// content of C (i.e. C = 1*A*B + 1*C) - confirm against the mmm() documentation.
*/
2960  template< typename MT3 // Type of the left-hand side target matrix
2961  , typename MT4 // Type of the left-hand side matrix operand
2962  , typename MT5 > // Type of the right-hand side matrix operand
2963  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2964  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2965  {
2966  if( LOW )
2967  lmmm( C, A, B, ElementType(1), ElementType(1) );
2968  else if( UPP )
2969  ummm( C, A, B, ElementType(1), ElementType(1) );
2970  else
2971  mmm( C, A, B, ElementType(1), ElementType(1) );
2972  }
2974  //**********************************************************************************************
2975 
2976  //**BLAS-based addition assignment to dense matrices (default)**********************************
/*!\brief Addition assignment kernel used when the BLAS kernel is not applicable.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload participates in overload resolution only if UseBlasKernel_v<MT3,MT4,MT5> is
// false (DisableIf_t). It forwards all three arguments to selectLargeAddAssignKernel().
*/
2990  template< typename MT3 // Type of the left-hand side target matrix
2991  , typename MT4 // Type of the left-hand side matrix operand
2992  , typename MT5 > // Type of the right-hand side matrix operand
2993  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2994  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2995  {
2996  selectLargeAddAssignKernel( C, A, B );
2997  }
2999  //**********************************************************************************************
3000 
3001  //**BLAS-based addition assignment to dense matrices********************************************
3002 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
3003 
/*!\brief BLAS-based addition assignment of a matrix product (C += A * B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload is only compiled if BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled, and participates in overload
// resolution only if UseBlasKernel_v<MT3,MT4,MT5> is true.
*/
3016  template< typename MT3 // Type of the left-hand side target matrix
3017  , typename MT4 // Type of the left-hand side matrix operand
3018  , typename MT5 > // Type of the right-hand side matrix operand
3019  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3020  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3021  {
3022  using ET = ElementType_t<MT3>;
3023 
      // Triangular A: copy B into a temporary, let trmm() apply A from the left to it, then
      // add the temporary to C.
3024  if( IsTriangular_v<MT4> ) {
3025  ResultType_t<MT3> tmp( serial( B ) );
3026  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
3027  addAssign( C, tmp );
3028  }
      // Triangular B: copy A into a temporary, let trmm() apply B from the right to it, then
      // add the temporary to C.
3029  else if( IsTriangular_v<MT5> ) {
3030  ResultType_t<MT3> tmp( serial( A ) );
3031  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
3032  addAssign( C, tmp );
3033  }
      // General case: a single gemm() call directly on C with both scalar arguments set to 1.
      // NOTE(review): presumably alpha=1 and beta=1, i.e. C = A*B + C - confirm against gemm().
3034  else {
3035  gemm( C, A, B, ET(1), ET(1) );
3036  }
3037  }
3039 #endif
3040  //**********************************************************************************************
3041 
3042  //**Restructuring addition assignment to row-major matrices*************************************
/*!\brief Restructuring addition assignment of the multiplication expression to a row-major
//        matrix.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side multiplication expression to be added.
// \return void
//
// This overload participates in overload resolution only if CanExploitSymmetry_v<MT,MT1,MT2>
// is true. Instead of evaluating the expression itself, it rebuilds the product with the
// symmetric operand(s) wrapped in trans() and hands the new expression, passed through the
// ForwardFunctor, to addAssign().
*/
3057  template< typename MT > // Type of the target matrix
3058  friend inline auto addAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
3059  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3060  {
3062 
3064 
3065  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3066  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3067 
3068  const ForwardFunctor fwd;
3069 
      // Both operands symmetric: transpose both. Only the left operand symmetric: transpose the
      // left one. Otherwise: transpose the right operand.
3070  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
3071  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
3072  else if( IsSymmetric_v<MT1> )
3073  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
3074  else
3075  addAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
3076  }
3078  //**********************************************************************************************
3079 
3080  //**Addition assignment to sparse matrices******************************************************
3081  // No special implementation for the addition assignment to sparse matrices.
3082  //**********************************************************************************************
3083 
3084  //**Subtraction assignment to dense matrices****************************************************
/*!\brief Subtraction assignment of the multiplication expression to a dense matrix
//        (lhs -= A * B).
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be subtracted.
// \return void
//
// This overload participates in overload resolution only if CanExploitSymmetry_v<MT,MT1,MT2>
// is false. It returns immediately if the target has no rows or columns or if the inner
// dimension (the number of columns of the left operand) is zero. Otherwise both operands are
// evaluated via serial() into objects of type LT and RT and the actual work is delegated to
// selectSubAssignKernel().
*/
3097  template< typename MT // Type of the target dense matrix
3098  , bool SO > // Storage order of the target dense matrix
3099  friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3100  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3101  {
3103 
3104  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3105  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3106 
      // Nothing to subtract for an empty target or an empty inner dimension.
3107  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3108  return;
3109  }
3110 
3111  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3112  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3113 
3114  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3115  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3116  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3117  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3118  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3119  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3120 
3121  TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3122  }
3124  //**********************************************************************************************
3125 
3126  //**Subtraction assignment to dense matrices (kernel selection)*********************************
/*!\brief Selection of the kernel for a subtraction assignment of a matrix product
//        (C -= A * B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// The small-matrix kernel (selectSmallSubAssignKernel()) is chosen if any of the following
// holds: MT4 is a diagonal matrix type; BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10
// rows; or the number of elements of C is below TDMATTDMATMULT_THRESHOLD. In all other cases
// selectBlasSubAssignKernel() is called.
*/
3137  template< typename MT3 // Type of the left-hand side target matrix
3138  , typename MT4 // Type of the left-hand side matrix operand
3139  , typename MT5 > // Type of the right-hand side matrix operand
3140  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3141  {
3142  if( ( IsDiagonal_v<MT4> ) ||
3143  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
3144  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
3145  selectSmallSubAssignKernel( C, A, B );
3146  else
3147  selectBlasSubAssignKernel( C, A, B );
3148  }
3150  //**********************************************************************************************
3151 
3152  //**Default subtraction assignment to dense matrices (general/general)**************************
/*!\brief Default (scalar) subtraction assignment of a matrix product where neither operand
//        is diagonal (C -= A * B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \return void
//
// This overload participates in overload resolution only if neither MT4 nor MT5 is a diagonal
// matrix type. The loops are ordered j (column of C), k (inner index), i (row of C), and the
// k and i ranges are narrowed at compile time according to the lower/upper structure of the
// operand types and the LOW/UPP flags of the expression.
*/
3166  template< typename MT3 // Type of the left-hand side target matrix
3167  , typename MT4 // Type of the left-hand side matrix operand
3168  , typename MT5 > // Type of the right-hand side matrix operand
3169  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3170  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3171  {
3172  const size_t M( A.rows() );
3173  const size_t N( B.columns() );
3174  const size_t K( A.columns() );
3175 
3176  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3177 
3178  for( size_t j=0UL; j<N; ++j )
3179  {
      // Range of k for column j of B, derived from the lower/upper structure of MT5.
3180  const size_t kbegin( ( IsLower_v<MT5> )
3181  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3182  :( 0UL ) );
3183  const size_t kend( ( IsUpper_v<MT5> )
3184  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3185  :( K ) );
3186  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3187 
3188  for( size_t k=kbegin; k<kend; ++k )
3189  {
      // Range of i for column k of A, derived from the lower/upper structure of MT4 and
      // additionally clipped to the lower (LOW) or upper (UPP) part of column j of C.
3190  const size_t ibegin( ( IsLower_v<MT4> )
3191  ?( ( IsStrictlyLower_v<MT4> )
3192  ?( LOW ? max(j,k+1UL) : k+1UL )
3193  :( LOW ? max(j,k) : k ) )
3194  :( LOW ? j : 0UL ) );
3195  const size_t iend( ( IsUpper_v<MT4> )
3196  ?( ( IsStrictlyUpper_v<MT4> )
3197  ?( UPP ? min(j+1UL,k) : k )
3198  :( UPP ? min(j,k)+1UL : k+1UL ) )
3199  :( UPP ? j+1UL : M ) );
3200 
      // With LOW/UPP clipping the range can be empty or inverted; skip it in that case.
3201  if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
3202  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3203 
      // ipos: end of the 2x unrolled part (inum with its lowest bit cleared, i.e. rounded down
      // to an even count).
3204  const size_t inum( iend - ibegin );
3205  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3206 
3207  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3208  C(i ,j) -= A(i ,k) * B(k,j);
3209  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3210  }
      // Odd leftover row.
3211  if( ipos < iend ) {
3212  C(ipos,j) -= A(ipos,k) * B(k,j);
3213  }
3214  }
3215  }
3216  }
3218  //**********************************************************************************************
3219 
3220  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
/*!\brief Default (scalar) subtraction assignment of a matrix product with a diagonal
//        right-hand side operand (C -= A * B).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \return void
//
// This overload participates in overload resolution only if MT4 is not and MT5 is a diagonal
// matrix type. Only the diagonal elements B(j,j) are accessed, so each column j of C is
// updated as C(i,j) -= A(i,j) * B(j,j); the row range is narrowed according to the lower/upper
// structure of MT4.
*/
3234  template< typename MT3 // Type of the left-hand side target matrix
3235  , typename MT4 // Type of the left-hand side matrix operand
3236  , typename MT5 > // Type of the right-hand side matrix operand
3237  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3238  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3239  {
3241 
3242  const size_t M( A.rows() );
3243  const size_t N( B.columns() );
3244 
3245  for( size_t j=0UL; j<N; ++j )
3246  {
      // Row range of column j of A, derived from the lower/upper structure of MT4.
3247  const size_t ibegin( ( IsLower_v<MT4> )
3248  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
3249  :( 0UL ) );
3250  const size_t iend( ( IsUpper_v<MT4> )
3251  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
3252  :( M ) );
3253  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3254 
      // ipos: end of the 2x unrolled part (inum rounded down to an even count).
3255  const size_t inum( iend - ibegin );
3256  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3257 
3258  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3259  C(i ,j) -= A(i ,j) * B(j,j);
3260  C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
3261  }
      // Odd leftover row.
3262  if( ipos < iend ) {
3263  C(ipos,j) -= A(ipos,j) * B(j,j);
3264  }
3265  }
3266  }
3268  //**********************************************************************************************
3269 
3270  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
// Default element-wise kernel for the subtraction assignment C -= A * B, selected when the
// left-hand side operand A is diagonal and the right-hand side operand B is not.
// Only the diagonal element A(i,i) is read for row i, so every touched element is updated
// as C(i,j) -= A(i,i) * B(i,j).
//
// C: target matrix, modified in place.
// A: left-hand side (diagonal) operand of the multiplication.
// B: right-hand side (non-diagonal) operand of the multiplication.
3284  template< typename MT3 // Type of the left-hand side target matrix
3285  , typename MT4 // Type of the left-hand side matrix operand
3286  , typename MT5 > // Type of the right-hand side matrix operand
3287  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3288  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3289  {
3291 
3292  const size_t M( A.rows() );
3293  const size_t N( B.columns() );
3294 
3295  for( size_t j=0UL; j<N; ++j )
3296  {
      // Row range [ibegin,iend) of column j, narrowed by the compile-time structure of B
      // (here the non-diagonal operand): lower B starts at the diagonal (one past it if
      // strictly lower), upper B ends at the diagonal (excluding it if strictly upper).
3297  const size_t ibegin( ( IsLower_v<MT5> )
3298  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3299  :( 0UL ) );
3300  const size_t iend( ( IsUpper_v<MT5> )
3301  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3302  :( M ) );
3303  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3304 
      // ipos is ibegin plus the row count rounded down to an even number (mask size_t(-2)),
      // so the main loop can process two rows per iteration.
3305  const size_t inum( iend - ibegin );
3306  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3307 
3308  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3309  C(i ,j) -= A(i ,i ) * B(i ,j);
3310  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
3311  }
      // Handles the single leftover row when the row count is odd.
3312  if( ipos < iend ) {
3313  C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
3314  }
3315  }
3316  }
3318  //**********************************************************************************************
3319 
3320  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default kernel for the subtraction assignment C -= A * B, selected when both operands are
// diagonal. Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) for every row index
// i in [0, A.rows()).
//
// C: target matrix, modified in place.
// A: left-hand side (diagonal) operand of the multiplication.
// B: right-hand side (diagonal) operand of the multiplication.
3334  template< typename MT3 // Type of the left-hand side target matrix
3335  , typename MT4 // Type of the left-hand side matrix operand
3336  , typename MT5 > // Type of the right-hand side matrix operand
3337  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3338  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3339  {
3341 
3342  for( size_t i=0UL; i<A.rows(); ++i ) {
3343  C(i,i) -= A(i,i) * B(i,i);
3344  }
3345  }
3347  //**********************************************************************************************
3348 
3349  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix kernel for the subtraction assignment C -= A * B, used when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It simply forwards to the matching
// selectDefaultSubAssignKernel() overload.
//
// C: target matrix, modified in place.
// A: left-hand side operand of the multiplication.
// B: right-hand side operand of the multiplication.
3363  template< typename MT3 // Type of the left-hand side target matrix
3364  , typename MT4 // Type of the left-hand side matrix operand
3365  , typename MT5 > // Type of the right-hand side matrix operand
3366  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3367  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3368  {
3369  selectDefaultSubAssignKernel( C, A, B );
3370  }
3372  //**********************************************************************************************
3373 
3374  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Small-matrix kernel for the subtraction assignment C -= A * B, selected for row-major
// targets when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// Instead of computing the product itself, it copies exactly one operand into its
// OppositeType_t (required to be row-major by the constraints below) and re-issues
// subAssign() on the resulting mixed-storage-order product.
//
// C: target matrix, modified in place.
// A: left-hand side operand of the multiplication.
// B: right-hand side operand of the multiplication.
3389  template< typename MT3 // Type of the left-hand side target matrix
3390  , typename MT4 // Type of the left-hand side matrix operand
3391  , typename MT5 > // Type of the right-hand side matrix operand
3392  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3393  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3394  {
3397  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
3398  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
3399 
      // NOTE(review): ForwardFunctor is declared outside this chunk; presumably it re-applies
      // the expression's declared symmetric/Hermitian/lower/upper property to the new
      // product -- confirm against the class definition.
3400  const ForwardFunctor fwd;
3401 
      // Choice of the operand to convert:
      //  - only A is resizable      -> convert B
      //  - only B is resizable      -> convert A
      //  - otherwise                -> convert the operand with fewer elements (B on a tie)
3402  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
3403  const OppositeType_t<MT5> tmp( serial( B ) );
3404  subAssign( C, fwd( A * tmp ) );
3405  }
3406  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
3407  const OppositeType_t<MT4> tmp( serial( A ) );
3408  subAssign( C, fwd( tmp * B ) );
3409  }
3410  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3411  const OppositeType_t<MT5> tmp( serial( B ) );
3412  subAssign( C, fwd( A * tmp ) );
3413  }
3414  else {
3415  const OppositeType_t<MT4> tmp( serial( A ) );
3416  subAssign( C, fwd( tmp * B ) );
3417  }
3418  }
3420  //**********************************************************************************************
3421 
3422  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Small-matrix SIMD kernel for the subtraction assignment C -= A * B, selected for
// column-major targets when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
//
// The rows of C are processed in blocks of whole SIMD vectors, loaded from C and from a
// column of A with load() and combined with single elements of B passed through set().
// The block height shrinks stage by stage: 8 vectors (integral element types only), then
// 5, 4, 3, 2 and 1 vectors, followed by a scalar loop for rows that do not fill a vector.
// Within every block the summation range [kbegin,kend) is narrowed at compile time from the
// lower/upper structure of A (MT4) and B (MT5).
//
// LOW, UPP, SIMDSIZE, SIMDType and ElementType are declared outside this chunk.
// NOTE(review): set() presumably broadcasts a scalar to all SIMD lanes -- confirm.
//
// C: target matrix, modified in place.
// A: left-hand side operand of the multiplication.
// B: right-hand side operand of the multiplication.
3437  template< typename MT3 // Type of the left-hand side target matrix
3438  , typename MT4 // Type of the left-hand side matrix operand
3439  , typename MT5 > // Type of the right-hand side matrix operand
3440  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3441  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3442  {
      // A scalar remainder loop is needed unless both C and A are padded.
3443  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
3444 
3445  const size_t M( A.rows() );
3446  const size_t N( B.columns() );
3447  const size_t K( A.columns() );
3448 
3449  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3450 
      // End of the vectorized row range: M rounded down to a multiple of SIMDSIZE when a
      // remainder loop exists (checked by the assertion below), otherwise M itself.
3451  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3452  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3453 
      // Row cursor shared by all following stages; each stage continues where the previous
      // one stopped.
3454  size_t i( 0UL );
3455 
      // Stage 1: blocks of 8 SIMD vectors x 1 column. Only for integral element types and
      // only if the result is declared neither lower nor upper.
3456  if( IsIntegral_v<ElementType> )
3457  {
3458  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3459  for( size_t j=0UL; j<N; ++j )
3460  {
      // Summation range for this block. kbegin is raised to the column index j (j+1 if
      // strictly lower) for a lower B and to the row index i for an upper A; kend is capped
      // at the last column index of the block (+1, or +0 if strictly upper) for an upper B
      // and at the end of the row block for a lower A. The same pattern recurs in every
      // stage below.
3461  const size_t kbegin( ( IsLower_v<MT5> )
3462  ?( ( IsUpper_v<MT4> )
3463  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3464  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3465  :( IsUpper_v<MT4> ? i : 0UL ) );
3466  const size_t kend( ( IsUpper_v<MT5> )
3467  ?( ( IsLower_v<MT4> )
3468  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3469  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
3470  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
3471 
      // Accumulators start from the current content of C, so the subtraction is applied
      // directly to the target values.
3472  SIMDType xmm1( C.load(i ,j) );
3473  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3474  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3475  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3476  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
3477  SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
3478  SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
3479  SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
3480 
3481  for( size_t k=kbegin; k<kend; ++k ) {
3482  const SIMDType b1( set( B(k,j) ) );
3483  xmm1 -= A.load(i ,k) * b1;
3484  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3485  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3486  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3487  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3488  xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
3489  xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
3490  xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
3491  }
3492 
3493  C.store( i , j, xmm1 );
3494  C.store( i+SIMDSIZE , j, xmm2 );
3495  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3496  C.store( i+SIMDSIZE*3UL, j, xmm4 );
3497  C.store( i+SIMDSIZE*4UL, j, xmm5 );
3498  C.store( i+SIMDSIZE*5UL, j, xmm6 );
3499  C.store( i+SIMDSIZE*6UL, j, xmm7 );
3500  C.store( i+SIMDSIZE*7UL, j, xmm8 );
3501  }
3502  }
3503  }
3504 
      // Stage 2: blocks of 5 SIMD vectors x 2 columns (plus one trailing column if N is
      // odd). Only if the result is declared neither lower nor upper.
3505  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3506  {
3507  size_t j( 0UL );
3508 
3509  for( ; (j+2UL) <= N; j+=2UL )
3510  {
3511  const size_t kbegin( ( IsLower_v<MT5> )
3512  ?( ( IsUpper_v<MT4> )
3513  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3514  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3515  :( IsUpper_v<MT4> ? i : 0UL ) );
3516  const size_t kend( ( IsUpper_v<MT5> )
3517  ?( ( IsLower_v<MT4> )
3518  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3519  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3520  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
3521 
3522  SIMDType xmm1 ( C.load(i ,j ) );
3523  SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
3524  SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
3525  SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
3526  SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
3527  SIMDType xmm6 ( C.load(i ,j+1UL) );
3528  SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
3529  SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
3530  SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
3531  SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
3532 
3533  for( size_t k=kbegin; k<kend; ++k ) {
3534  const SIMDType a1( A.load(i ,k) );
3535  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3536  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3537  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3538  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3539  const SIMDType b1( set( B(k,j ) ) );
3540  const SIMDType b2( set( B(k,j+1UL) ) );
3541  xmm1 -= a1 * b1;
3542  xmm2 -= a2 * b1;
3543  xmm3 -= a3 * b1;
3544  xmm4 -= a4 * b1;
3545  xmm5 -= a5 * b1;
3546  xmm6 -= a1 * b2;
3547  xmm7 -= a2 * b2;
3548  xmm8 -= a3 * b2;
3549  xmm9 -= a4 * b2;
3550  xmm10 -= a5 * b2;
3551  }
3552 
3553  C.store( i , j , xmm1 );
3554  C.store( i+SIMDSIZE , j , xmm2 );
3555  C.store( i+SIMDSIZE*2UL, j , xmm3 );
3556  C.store( i+SIMDSIZE*3UL, j , xmm4 );
3557  C.store( i+SIMDSIZE*4UL, j , xmm5 );
3558  C.store( i , j+1UL, xmm6 );
3559  C.store( i+SIMDSIZE , j+1UL, xmm7 );
3560  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3561  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3562  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
3563  }
3564 
      // Trailing single column (N odd). kend is not narrowed by the structure of B here.
3565  if( j < N )
3566  {
3567  const size_t kbegin( ( IsLower_v<MT5> )
3568  ?( ( IsUpper_v<MT4> )
3569  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3570  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3571  :( IsUpper_v<MT4> ? i : 0UL ) );
3572  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
3573 
3574  SIMDType xmm1( C.load(i ,j) );
3575  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3576  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3577  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3578  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
3579 
3580  for( size_t k=kbegin; k<kend; ++k ) {
3581  const SIMDType b1( set( B(k,j) ) );
3582  xmm1 -= A.load(i ,k) * b1;
3583  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3584  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3585  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3586  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3587  }
3588 
3589  C.store( i , j, xmm1 );
3590  C.store( i+SIMDSIZE , j, xmm2 );
3591  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3592  C.store( i+SIMDSIZE*3UL, j, xmm4 );
3593  C.store( i+SIMDSIZE*4UL, j, xmm5 );
3594  }
3595  }
3596 
      // Stage 3: blocks of 4 SIMD vectors x 2 columns (plus one trailing column if N is
      // odd). Only if the result is declared neither lower nor upper.
3597  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3598  {
3599  size_t j( 0UL );
3600 
3601  for( ; (j+2UL) <= N; j+=2UL )
3602  {
3603  const size_t kbegin( ( IsLower_v<MT5> )
3604  ?( ( IsUpper_v<MT4> )
3605  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3606  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3607  :( IsUpper_v<MT4> ? i : 0UL ) );
3608  const size_t kend( ( IsUpper_v<MT5> )
3609  ?( ( IsLower_v<MT4> )
3610  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3611  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3612  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
3613 
3614  SIMDType xmm1( C.load(i ,j ) );
3615  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
3616  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
3617  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
3618  SIMDType xmm5( C.load(i ,j+1UL) );
3619  SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
3620  SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
3621  SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
3622 
3623  for( size_t k=kbegin; k<kend; ++k ) {
3624  const SIMDType a1( A.load(i ,k) );
3625  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3626  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3627  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3628  const SIMDType b1( set( B(k,j ) ) );
3629  const SIMDType b2( set( B(k,j+1UL) ) );
3630  xmm1 -= a1 * b1;
3631  xmm2 -= a2 * b1;
3632  xmm3 -= a3 * b1;
3633  xmm4 -= a4 * b1;
3634  xmm5 -= a1 * b2;
3635  xmm6 -= a2 * b2;
3636  xmm7 -= a3 * b2;
3637  xmm8 -= a4 * b2;
3638  }
3639 
3640  C.store( i , j , xmm1 );
3641  C.store( i+SIMDSIZE , j , xmm2 );
3642  C.store( i+SIMDSIZE*2UL, j , xmm3 );
3643  C.store( i+SIMDSIZE*3UL, j , xmm4 );
3644  C.store( i , j+1UL, xmm5 );
3645  C.store( i+SIMDSIZE , j+1UL, xmm6 );
3646  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3647  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3648  }
3649 
3650  if( j < N )
3651  {
3652  const size_t kbegin( ( IsLower_v<MT5> )
3653  ?( ( IsUpper_v<MT4> )
3654  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3655  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3656  :( IsUpper_v<MT4> ? i : 0UL ) );
3657  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3658 
3659  SIMDType xmm1( C.load(i ,j) );
3660  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3661  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3662  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3663 
3664  for( size_t k=kbegin; k<kend; ++k ) {
3665  const SIMDType b1( set( B(k,j) ) );
3666  xmm1 -= A.load(i ,k) * b1;
3667  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3668  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3669  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3670  }
3671 
3672  C.store( i , j, xmm1 );
3673  C.store( i+SIMDSIZE , j, xmm2 );
3674  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3675  C.store( i+SIMDSIZE*3UL, j, xmm4 );
3676  }
3677  }
3678 
      // Stage 4: blocks of 3 SIMD vectors x 2 columns (plus one trailing column if N is
      // odd). Only if the result is declared neither lower nor upper.
3679  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3680  {
3681  size_t j( 0UL );
3682 
3683  for( ; (j+2UL) <= N; j+=2UL )
3684  {
3685  const size_t kbegin( ( IsLower_v<MT5> )
3686  ?( ( IsUpper_v<MT4> )
3687  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3688  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3689  :( IsUpper_v<MT4> ? i : 0UL ) );
3690  const size_t kend( ( IsUpper_v<MT5> )
3691  ?( ( IsLower_v<MT4> )
3692  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3693  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3694  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
3695 
3696  SIMDType xmm1( C.load(i ,j ) );
3697  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
3698  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
3699  SIMDType xmm4( C.load(i ,j+1UL) );
3700  SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
3701  SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
3702 
3703  for( size_t k=kbegin; k<kend; ++k ) {
3704  const SIMDType a1( A.load(i ,k) );
3705  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3706  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3707  const SIMDType b1( set( B(k,j ) ) );
3708  const SIMDType b2( set( B(k,j+1UL) ) );
3709  xmm1 -= a1 * b1;
3710  xmm2 -= a2 * b1;
3711  xmm3 -= a3 * b1;
3712  xmm4 -= a1 * b2;
3713  xmm5 -= a2 * b2;
3714  xmm6 -= a3 * b2;
3715  }
3716 
3717  C.store( i , j , xmm1 );
3718  C.store( i+SIMDSIZE , j , xmm2 );
3719  C.store( i+SIMDSIZE*2UL, j , xmm3 );
3720  C.store( i , j+1UL, xmm4 );
3721  C.store( i+SIMDSIZE , j+1UL, xmm5 );
3722  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
3723  }
3724 
3725  if( j < N )
3726  {
3727  const size_t kbegin( ( IsLower_v<MT5> )
3728  ?( ( IsUpper_v<MT4> )
3729  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3730  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3731  :( IsUpper_v<MT4> ? i : 0UL ) );
3732  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
3733 
3734  SIMDType xmm1( C.load(i ,j) );
3735  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3736  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3737 
3738  for( size_t k=kbegin; k<kend; ++k ) {
3739  const SIMDType b1( set( B(k,j) ) );
3740  xmm1 -= A.load(i ,k) * b1;
3741  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3742  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3743  }
3744 
3745  C.store( i , j, xmm1 );
3746  C.store( i+SIMDSIZE , j, xmm2 );
3747  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3748  }
3749  }
3750 
      // Stage 5: blocks of 2 SIMD vectors x 4/3/2/1 columns. Skipped only if the result is
      // declared both lower and upper. For a lower result the column range ends at the end
      // of the row block, for an upper result it starts at column i.
3751  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3752  {
3753  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
3754  size_t j( UPP ? i : 0UL );
3755 
3756  for( ; (j+4UL) <= jend; j+=4UL )
3757  {
3758  const size_t kbegin( ( IsLower_v<MT5> )
3759  ?( ( IsUpper_v<MT4> )
3760  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3761  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3762  :( IsUpper_v<MT4> ? i : 0UL ) );
3763  const size_t kend( ( IsUpper_v<MT5> )
3764  ?( ( IsLower_v<MT4> )
3765  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
3766  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
3767  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
3768 
3769  SIMDType xmm1( C.load(i ,j ) );
3770  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
3771  SIMDType xmm3( C.load(i ,j+1UL) );
3772  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
3773  SIMDType xmm5( C.load(i ,j+2UL) );
3774  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
3775  SIMDType xmm7( C.load(i ,j+3UL) );
3776  SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
3777 
3778  for( size_t k=kbegin; k<kend; ++k ) {
3779  const SIMDType a1( A.load(i ,k) );
3780  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3781  const SIMDType b1( set( B(k,j ) ) );
3782  const SIMDType b2( set( B(k,j+1UL) ) );
3783  const SIMDType b3( set( B(k,j+2UL) ) );
3784  const SIMDType b4( set( B(k,j+3UL) ) );
3785  xmm1 -= a1 * b1;
3786  xmm2 -= a2 * b1;
3787  xmm3 -= a1 * b2;
3788  xmm4 -= a2 * b2;
3789  xmm5 -= a1 * b3;
3790  xmm6 -= a2 * b3;
3791  xmm7 -= a1 * b4;
3792  xmm8 -= a2 * b4;
3793  }
3794 
3795  C.store( i , j , xmm1 );
3796  C.store( i+SIMDSIZE, j , xmm2 );
3797  C.store( i , j+1UL, xmm3 );
3798  C.store( i+SIMDSIZE, j+1UL, xmm4 );
3799  C.store( i , j+2UL, xmm5 );
3800  C.store( i+SIMDSIZE, j+2UL, xmm6 );
3801  C.store( i , j+3UL, xmm7 );
3802  C.store( i+SIMDSIZE, j+3UL, xmm8 );
3803  }
3804 
3805  for( ; (j+3UL) <= jend; j+=3UL )
3806  {
3807  const size_t kbegin( ( IsLower_v<MT5> )
3808  ?( ( IsUpper_v<MT4> )
3809  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3810  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3811  :( IsUpper_v<MT4> ? i : 0UL ) );
3812  const size_t kend( ( IsUpper_v<MT5> )
3813  ?( ( IsLower_v<MT4> )
3814  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
3815  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
3816  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
3817 
3818  SIMDType xmm1( C.load(i ,j ) );
3819  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
3820  SIMDType xmm3( C.load(i ,j+1UL) );
3821  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
3822  SIMDType xmm5( C.load(i ,j+2UL) );
3823  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
3824 
3825  for( size_t k=kbegin; k<kend; ++k ) {
3826  const SIMDType a1( A.load(i ,k) );
3827  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3828  const SIMDType b1( set( B(k,j ) ) );
3829  const SIMDType b2( set( B(k,j+1UL) ) );
3830  const SIMDType b3( set( B(k,j+2UL) ) );
3831  xmm1 -= a1 * b1;
3832  xmm2 -= a2 * b1;
3833  xmm3 -= a1 * b2;
3834  xmm4 -= a2 * b2;
3835  xmm5 -= a1 * b3;
3836  xmm6 -= a2 * b3;
3837  }
3838 
3839  C.store( i , j , xmm1 );
3840  C.store( i+SIMDSIZE, j , xmm2 );
3841  C.store( i , j+1UL, xmm3 );
3842  C.store( i+SIMDSIZE, j+1UL, xmm4 );
3843  C.store( i , j+2UL, xmm5 );
3844  C.store( i+SIMDSIZE, j+2UL, xmm6 );
3845  }
3846 
3847  for( ; (j+2UL) <= jend; j+=2UL )
3848  {
3849  const size_t kbegin( ( IsLower_v<MT5> )
3850  ?( ( IsUpper_v<MT4> )
3851  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3852  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3853  :( IsUpper_v<MT4> ? i : 0UL ) );
3854  const size_t kend( ( IsUpper_v<MT5> )
3855  ?( ( IsLower_v<MT4> )
3856  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3857  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3858  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
3859 
3860  SIMDType xmm1( C.load(i ,j ) );
3861  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
3862  SIMDType xmm3( C.load(i ,j+1UL) );
3863  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
      // Second set of accumulators for the 2-way unrolled k loop; they are added to the
      // first set when storing. NOTE(review): this relies on a default-constructed SIMDType
      // being zero -- confirm against the SIMD type definitions.
3864  SIMDType xmm5, xmm6, xmm7, xmm8;
3865  size_t k( kbegin );
3866 
3867  for( ; (k+2UL) <= kend; k+=2UL ) {
3868  const SIMDType a1( A.load(i ,k ) );
3869  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
3870  const SIMDType a3( A.load(i ,k+1UL) );
3871  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
3872  const SIMDType b1( set( B(k ,j ) ) );
3873  const SIMDType b2( set( B(k ,j+1UL) ) );
3874  const SIMDType b3( set( B(k+1UL,j ) ) );
3875  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
3876  xmm1 -= a1 * b1;
3877  xmm2 -= a2 * b1;
3878  xmm3 -= a1 * b2;
3879  xmm4 -= a2 * b2;
3880  xmm5 -= a3 * b3;
3881  xmm6 -= a4 * b3;
3882  xmm7 -= a3 * b4;
3883  xmm8 -= a4 * b4;
3884  }
3885 
      // Leftover k iteration when the summation range has odd length.
3886  for( ; k<kend; ++k ) {
3887  const SIMDType a1( A.load(i ,k) );
3888  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3889  const SIMDType b1( set( B(k,j ) ) );
3890  const SIMDType b2( set( B(k,j+1UL) ) );
3891  xmm1 -= a1 * b1;
3892  xmm2 -= a2 * b1;
3893  xmm3 -= a1 * b2;
3894  xmm4 -= a2 * b2;
3895  }
3896 
3897  C.store( i , j , xmm1+xmm5 );
3898  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
3899  C.store( i , j+1UL, xmm3+xmm7 );
3900  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
3901  }
3902 
3903  if( j < jend )
3904  {
3905  const size_t kbegin( ( IsLower_v<MT5> )
3906  ?( ( IsUpper_v<MT4> )
3907  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3908  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3909  :( IsUpper_v<MT4> ? i : 0UL ) );
3910  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
3911 
3912  SIMDType xmm1( C.load(i ,j) );
3913  SIMDType xmm2( C.load(i+SIMDSIZE,j) );
3914  SIMDType xmm3, xmm4;
3915  size_t k( kbegin );
3916 
3917  for( ; (k+2UL) <= kend; k+=2UL ) {
3918  const SIMDType b1( set( B(k ,j) ) );
3919  const SIMDType b2( set( B(k+1UL,j) ) );
3920  xmm1 -= A.load(i ,k ) * b1;
3921  xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
3922  xmm3 -= A.load(i ,k+1UL) * b2;
3923  xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
3924  }
3925 
3926  for( ; k<kend; ++k ) {
3927  const SIMDType b1( set( B(k,j) ) );
3928  xmm1 -= A.load(i ,k) * b1;
3929  xmm2 -= A.load(i+SIMDSIZE,k) * b1;
3930  }
3931 
3932  C.store( i , j, xmm1+xmm3 );
3933  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
3934  }
3935  }
3936 
      // Stage 6: blocks of 1 SIMD vector x 4/3/2/1 columns for all remaining full vectors.
      // The column range is cut at the end of the row block only if the result is declared
      // both lower and upper; for an upper result it starts at column i. In this stage kend
      // is derived from the structure of B only.
3937  for( ; i<ipos; i+=SIMDSIZE )
3938  {
3939  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
3940  size_t j( UPP ? i : 0UL );
3941 
3942  for( ; (j+4UL) <= jend; j+=4UL )
3943  {
3944  const size_t kbegin( ( IsLower_v<MT5> )
3945  ?( ( IsUpper_v<MT4> )
3946  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3947  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3948  :( IsUpper_v<MT4> ? i : 0UL ) );
3949  const size_t kend( ( IsUpper_v<MT5> )
3950  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
3951  :( K ) );
3952 
3953  SIMDType xmm1( C.load(i,j ) );
3954  SIMDType xmm2( C.load(i,j+1UL) );
3955  SIMDType xmm3( C.load(i,j+2UL) );
3956  SIMDType xmm4( C.load(i,j+3UL) );
3957  SIMDType xmm5, xmm6, xmm7, xmm8;
3958  size_t k( kbegin );
3959 
3960  for( ; (k+2UL) <= kend; k+=2UL ) {
3961  const SIMDType a1( A.load(i,k ) );
3962  const SIMDType a2( A.load(i,k+1UL) );
3963  xmm1 -= a1 * set( B(k ,j ) );
3964  xmm2 -= a1 * set( B(k ,j+1UL) );
3965  xmm3 -= a1 * set( B(k ,j+2UL) );
3966  xmm4 -= a1 * set( B(k ,j+3UL) );
3967  xmm5 -= a2 * set( B(k+1UL,j ) );
3968  xmm6 -= a2 * set( B(k+1UL,j+1UL) );
3969  xmm7 -= a2 * set( B(k+1UL,j+2UL) );
3970  xmm8 -= a2 * set( B(k+1UL,j+3UL) );
3971  }
3972 
3973  for( ; k<kend; ++k ) {
3974  const SIMDType a1( A.load(i,k) );
3975  xmm1 -= a1 * set( B(k,j ) );
3976  xmm2 -= a1 * set( B(k,j+1UL) );
3977  xmm3 -= a1 * set( B(k,j+2UL) );
3978  xmm4 -= a1 * set( B(k,j+3UL) );
3979  }
3980 
3981  C.store( i, j , xmm1+xmm5 );
3982  C.store( i, j+1UL, xmm2+xmm6 );
3983  C.store( i, j+2UL, xmm3+xmm7 );
3984  C.store( i, j+3UL, xmm4+xmm8 );
3985  }
3986 
3987  for( ; (j+3UL) <= jend; j+=3UL )
3988  {
3989  const size_t kbegin( ( IsLower_v<MT5> )
3990  ?( ( IsUpper_v<MT4> )
3991  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3992  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3993  :( IsUpper_v<MT4> ? i : 0UL ) );
3994  const size_t kend( ( IsUpper_v<MT5> )
3995  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
3996  :( K ) );
3997 
3998  SIMDType xmm1( C.load(i,j ) );
3999  SIMDType xmm2( C.load(i,j+1UL) );
4000  SIMDType xmm3( C.load(i,j+2UL) );
4001  SIMDType xmm4, xmm5, xmm6;
4002  size_t k( kbegin );
4003 
4004  for( ; (k+2UL) <= kend; k+=2UL ) {
4005  const SIMDType a1( A.load(i,k ) );
4006  const SIMDType a2( A.load(i,k+1UL) );
4007  xmm1 -= a1 * set( B(k ,j ) );
4008  xmm2 -= a1 * set( B(k ,j+1UL) );
4009  xmm3 -= a1 * set( B(k ,j+2UL) );
4010  xmm4 -= a2 * set( B(k+1UL,j ) );
4011  xmm5 -= a2 * set( B(k+1UL,j+1UL) );
4012  xmm6 -= a2 * set( B(k+1UL,j+2UL) );
4013  }
4014 
4015  for( ; k<kend; ++k ) {
4016  const SIMDType a1( A.load(i,k) );
4017  xmm1 -= a1 * set( B(k,j ) );
4018  xmm2 -= a1 * set( B(k,j+1UL) );
4019  xmm3 -= a1 * set( B(k,j+2UL) );
4020  }
4021 
4022  C.store( i, j , xmm1+xmm4 );
4023  C.store( i, j+1UL, xmm2+xmm5 );
4024  C.store( i, j+2UL, xmm3+xmm6 );
4025  }
4026 
4027  for( ; (j+2UL) <= jend; j+=2UL )
4028  {
4029  const size_t kbegin( ( IsLower_v<MT5> )
4030  ?( ( IsUpper_v<MT4> )
4031  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4032  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4033  :( IsUpper_v<MT4> ? i : 0UL ) );
4034  const size_t kend( ( IsUpper_v<MT5> )
4035  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4036  :( K ) );
4037 
4038  SIMDType xmm1( C.load(i,j ) );
4039  SIMDType xmm2( C.load(i,j+1UL) );
4040  SIMDType xmm3, xmm4;
4041  size_t k( kbegin );
4042 
4043  for( ; (k+2UL) <= kend; k+=2UL ) {
4044  const SIMDType a1( A.load(i,k ) );
4045  const SIMDType a2( A.load(i,k+1UL) );
4046  xmm1 -= a1 * set( B(k ,j ) );
4047  xmm2 -= a1 * set( B(k ,j+1UL) );
4048  xmm3 -= a2 * set( B(k+1UL,j ) );
4049  xmm4 -= a2 * set( B(k+1UL,j+1UL) );
4050  }
4051 
4052  for( ; k<kend; ++k ) {
4053  const SIMDType a1( A.load(i,k) );
4054  xmm1 -= a1 * set( B(k,j ) );
4055  xmm2 -= a1 * set( B(k,j+1UL) );
4056  }
4057 
4058  C.store( i, j , xmm1+xmm3 );
4059  C.store( i, j+1UL, xmm2+xmm4 );
4060  }
4061 
      // Trailing single column: the summation runs up to K (no upper bound from B is used).
4062  if( j < jend )
4063  {
4064  const size_t kbegin( ( IsLower_v<MT5> )
4065  ?( ( IsUpper_v<MT4> )
4066  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4067  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4068  :( IsUpper_v<MT4> ? i : 0UL ) );
4069 
4070  SIMDType xmm1( C.load(i,j) );
4071  SIMDType xmm2;
4072  size_t k( kbegin );
4073 
4074  for( ; (k+2UL) <= K; k+=2UL ) {
4075  xmm1 -= A.load(i,k ) * set( B(k ,j) );
4076  xmm2 -= A.load(i,k+1UL) * set( B(k+1UL,j) );
4077  }
4078 
4079  for( ; k<K; ++k ) {
4080  xmm1 -= A.load(i,k) * set( B(k,j) );
4081  }
4082 
4083  C.store( i, j, xmm1+xmm2 );
4084  }
4085  }
4086 
      // Stage 7: scalar loop over the rows beyond ipos (only when C or A is not padded).
      // Columns are processed in pairs, with one trailing column if the range is odd.
4087  for( ; remainder && i<M; ++i )
4088  {
4089  const size_t jend( LOW ? i+1UL : N );
4090  size_t j( UPP ? i : 0UL );
4091 
4092  for( ; (j+2UL) <= jend; j+=2UL )
4093  {
4094  const size_t kbegin( ( IsLower_v<MT5> )
4095  ?( ( IsUpper_v<MT4> )
4096  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4097  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4098  :( IsUpper_v<MT4> ? i : 0UL ) );
4099  const size_t kend( ( IsUpper_v<MT5> )
4100  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4101  :( K ) );
4102 
4103  ElementType value1( C(i,j ) );
4104  ElementType value2( C(i,j+1UL) );
4105 
4106  for( size_t k=kbegin; k<kend; ++k ) {
4107  value1 -= A(i,k) * B(k,j );
4108  value2 -= A(i,k) * B(k,j+1UL);
4109  }
4110 
4111  C(i,j ) = value1;
4112  C(i,j+1UL) = value2;
4113  }
4114 
4115  if( j < jend )
4116  {
4117  const size_t kbegin( ( IsLower_v<MT5> )
4118  ?( ( IsUpper_v<MT4> )
4119  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4120  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4121  :( IsUpper_v<MT4> ? i : 0UL ) );
4122 
4123  ElementType value( C(i,j) );
4124 
4125  for( size_t k=kbegin; k<K; ++k ) {
4126  value -= A(i,k) * B(k,j);
4127  }
4128 
4129  C(i,j) = value;
4130  }
4131  }
4132  }
4134  //**********************************************************************************************
4135 
4136  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix kernel for the subtraction assignment C -= A * B, used when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It simply forwards to the matching
// selectDefaultSubAssignKernel() overload.
//
// C: target matrix, modified in place.
// A: left-hand side operand of the multiplication.
// B: right-hand side operand of the multiplication.
4150  template< typename MT3 // Type of the left-hand side target matrix
4151  , typename MT4 // Type of the left-hand side matrix operand
4152  , typename MT5 > // Type of the right-hand side matrix operand
4153  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4154  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4155  {
4156  selectDefaultSubAssignKernel( C, A, B );
4157  }
4159  //**********************************************************************************************
4160 
4161  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix kernel for the subtraction assignment C -= A * B, used when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true. It delegates to one of the lmmm(),
// ummm() or mmm() routines, chosen by the LOW and UPP flags (LOW takes precedence).
//
// NOTE(review): the trailing arguments ElementType(-1) and ElementType(1) are presumably the
// scaling factors for A*B and for C respectively (i.e. C = -A*B + C) -- confirm against
// blaze/math/dense/MMM.h.
//
// C: target matrix, modified in place.
// A: left-hand side operand of the multiplication.
// B: right-hand side operand of the multiplication.
4176  template< typename MT3 // Type of the left-hand side target matrix
4177  , typename MT4 // Type of the left-hand side matrix operand
4178  , typename MT5 > // Type of the right-hand side matrix operand
4179  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4180  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4181  {
4182  if( LOW )
4183  lmmm( C, A, B, ElementType(-1), ElementType(1) );
4184  else if( UPP )
4185  ummm( C, A, B, ElementType(-1), ElementType(1) );
4186  else
4187  mmm( C, A, B, ElementType(-1), ElementType(1) );
4188  }
4190  //**********************************************************************************************
4191 
4192  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback for the BLAS-based subtraction assignment C -= A * B, used when
// UseBlasKernel_v<MT3,MT4,MT5> is false. It forwards to selectLargeSubAssignKernel().
//
// C: target matrix, modified in place.
// A: left-hand side operand of the multiplication.
// B: right-hand side operand of the multiplication.
4206  template< typename MT3 // Type of the left-hand side target matrix
4207  , typename MT4 // Type of the left-hand side matrix operand
4208  , typename MT5 > // Type of the right-hand side matrix operand
4209  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4210  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4211  {
4212  selectLargeSubAssignKernel( C, A, B );
4213  }
4215  //**********************************************************************************************
4216 
4217  //**BLAS-based subtraction assignment to dense matrices*****************************************
4218 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4219 
// BLAS-based kernel for the subtraction assignment C -= A * B, used when
// UseBlasKernel_v<MT3,MT4,MT5> is true (and only compiled if BLAS mode and the BLAS
// matrix/matrix multiplication are enabled, see the enclosing #if).
//
// If one operand is triangular, the other operand is copied into a temporary, trmm() is
// applied to that temporary and the temporary is subtracted from C; A is checked first.
// Otherwise gemm() is called directly on C.
//
// NOTE(review): trmm() presumably overwrites tmp with the triangular product, and the
// gemm() arguments ET(-1)/ET(1) are presumably the factors for A*B and C -- confirm against
// blaze/math/blas/trmm.h and blaze/math/blas/gemm.h.
//
// C: target matrix, modified in place.
// A: left-hand side operand of the multiplication.
// B: right-hand side operand of the multiplication.
4232  template< typename MT3 // Type of the left-hand side target matrix
4233  , typename MT4 // Type of the left-hand side matrix operand
4234  , typename MT5 > // Type of the right-hand side matrix operand
4235  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4236  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4237  {
4238  using ET = ElementType_t<MT3>;
4239 
4240  if( IsTriangular_v<MT4> ) {
      // A triangular: start from a copy of B and apply A from the left.
4241  ResultType_t<MT3> tmp( serial( B ) );
4242  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4243  subAssign( C, tmp );
4244  }
4245  else if( IsTriangular_v<MT5> ) {
      // B triangular: start from a copy of A and apply B from the right.
4246  ResultType_t<MT3> tmp( serial( A ) );
4247  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4248  subAssign( C, tmp );
4249  }
4250  else {
4251  gemm( C, A, B, ET(-1), ET(1) );
4252  }
4253  }
4255 #endif
4256  //**********************************************************************************************
4257 
4258  //**Restructuring subtraction assignment to row-major matrices**********************************
// Restructuring subtraction assignment of a transpose dense matrix/transpose dense matrix
// multiplication to a row-major matrix, enabled only if CanExploitSymmetry_v<MT,MT1,MT2>.
//
// The product is rebuilt with the symmetric operand(s) wrapped in trans() and the new
// expression is handed to subAssign() again: both operands are transposed if both are
// symmetric, only the left one if only MT1 is symmetric, otherwise only the right one.
// Transposing a symmetric matrix does not change its values.
// NOTE(review): the purpose is presumably to obtain an expression that better matches the
// row-major target -- confirm against the definition of CanExploitSymmetry_v.
//
// lhs: target row-major matrix, modified in place.
// rhs: multiplication expression to be subtracted.
4274  template< typename MT > // Type of the target matrix
4275  friend inline auto subAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4276  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4277  {
4279 
4281 
4282  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4283  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4284 
      // NOTE(review): ForwardFunctor is declared outside this chunk; presumably it re-applies
      // the expression's declared structural property to the rebuilt product -- confirm.
4285  const ForwardFunctor fwd;
4286 
4287  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4288  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4289  else if( IsSymmetric_v<MT1> )
4290  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4291  else
4292  subAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4293  }
4295  //**********************************************************************************************
4296 
4297  //**Subtraction assignment to sparse matrices***************************************************
4298  // No special implementation for the subtraction assignment to sparse matrices.
4299  //**********************************************************************************************
4300 
4301  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of the multiplication expression to a dense matrix.
//
// lhs: target dense matrix.
// rhs: multiplication expression used as the right-hand side of the Schur product.
//
// The expression is first evaluated serially into a temporary of type ResultType,
// and that temporary is then Schur-assigned to the target.
4314  template< typename MT // Type of the target dense matrix
4315  , bool SO > // Storage order of the target dense matrix
4316  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4317  {
4319 
4323 
4324  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4325  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4326 
4327  const ResultType tmp( serial( rhs ) ); // Serial evaluation of the complete product
4328  schurAssign( ~lhs, tmp );
4329  }
4331  //**********************************************************************************************
4332 
4333  //**Multiplication assignment to dense matrices*************************************************
4334  // No special implementation for the multiplication assignment to dense matrices.
4335  //**********************************************************************************************
4336 
4337  //**Multiplication assignment to sparse matrices************************************************
4338  // No special implementation for the multiplication assignment to sparse matrices.
4339  //**********************************************************************************************
4340 
4341  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the multiplication expression to a dense matrix.
// Takes part in overload resolution only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
//
// lhs: target dense matrix.
// rhs: multiplication expression to be assigned.
//
// An empty target returns immediately; a zero inner dimension resets the target.
// Otherwise both operands are evaluated into A (type LT) and B (type RT) and the
// product A * B is forwarded to smpAssign().
4357  template< typename MT // Type of the target dense matrix
4358  , bool SO > // Storage order of the target dense matrix
4359  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4360  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4361  {
4363 
4364  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4365  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4366 
4367  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) { // Nothing to assign to
4368  return;
4369  }
4370  else if( rhs.lhs_.columns() == 0UL ) { // Inner dimension is zero: the target is reset
4371  reset( ~lhs );
4372  return;
4373  }
4374 
4375  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4376  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4377 
4378  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4379  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4380  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4381  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4382  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4383  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4384 
4385  smpAssign( ~lhs, A * B );
4386  }
4388  //**********************************************************************************************
4389 
4390  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the multiplication expression to a sparse matrix.
// Takes part in overload resolution only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
//
// lhs: target sparse matrix.
// rhs: multiplication expression to be assigned.
//
// The expression is evaluated into a temporary whose type depends on the storage order
// flag SO of the target; the temporary is then wrapped in the ForwardFunctor and
// forwarded to smpAssign().
4406  template< typename MT // Type of the target sparse matrix
4407  , bool SO > // Storage order of the target sparse matrix
4408  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4409  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4410  {
4412 
4413  using TmpType = If_t< SO, ResultType, OppositeType >; // ResultType if SO is true, OppositeType otherwise
4414 
4421 
4422  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4423  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4424 
4425  const ForwardFunctor fwd;
4426 
4427  const TmpType tmp( rhs ); // Evaluation of the complete product into the temporary
4428  smpAssign( ~lhs, fwd( tmp ) );
4429  }
4431  //**********************************************************************************************
4432 
4433  //**Restructuring SMP assignment to row-major matrices******************************************
// Restructuring SMP assignment of the multiplication expression to a row-major matrix.
// Takes part in overload resolution only when CanExploitSymmetry_v<MT,MT1,MT2> is true.
//
// lhs: target row-major matrix.
// rhs: multiplication expression to be assigned.
//
// The product is rebuilt with trans() applied to one or both operands, wrapped in the
// ForwardFunctor, and handed to smpAssign() again.
4448  template< typename MT > // Type of the target matrix
4449  friend inline auto smpAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4450  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4451  {
4453 
4455 
4456  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4457  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4458 
4459  const ForwardFunctor fwd;
4460 
4461  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> ) // Both operands symmetric: transpose both
4462  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4463  else if( IsSymmetric_v<MT1> ) // Only the left operand symmetric: transpose the left one
4464  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4465  else // Remaining case: transpose the right operand (presumably the symmetric one, given the enabling condition)
4466  smpAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4467  }
4469  //**********************************************************************************************
4470 
4471  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the multiplication expression to a dense matrix.
// Takes part in overload resolution only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
//
// lhs: target dense matrix.
// rhs: multiplication expression to be added.
//
// Returns without touching the target when the target is empty or the inner dimension
// is zero. Otherwise both operands are evaluated into A (type LT) and B (type RT) and
// the product A * B is forwarded to smpAddAssign().
4487  template< typename MT // Type of the target dense matrix
4488  , bool SO > // Storage order of the target dense matrix
4489  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4490  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4491  {
4493 
4494  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4495  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4496 
4497  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) { // Empty target or zero inner dimension: nothing to add
4498  return;
4499  }
4500 
4501  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4502  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4503 
4504  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4505  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4506  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4507  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4508  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4509  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4510 
4511  smpAddAssign( ~lhs, A * B );
4512  }
4514  //**********************************************************************************************
4515 
4516  //**Restructuring SMP addition assignment to row-major matrices*********************************
// Restructuring SMP addition assignment of the multiplication expression to a row-major
// matrix. Takes part in overload resolution only when CanExploitSymmetry_v<MT,MT1,MT2>
// is true.
//
// lhs: target row-major matrix.
// rhs: multiplication expression to be added.
//
// The product is rebuilt with trans() applied to one or both operands, wrapped in the
// ForwardFunctor, and handed to smpAddAssign() again.
4532  template< typename MT > // Type of the target matrix
4533  friend inline auto smpAddAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4534  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4535  {
4537 
4539 
4540  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4541  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4542 
4543  const ForwardFunctor fwd;
4544 
4545  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> ) // Both operands symmetric: transpose both
4546  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4547  else if( IsSymmetric_v<MT1> ) // Only the left operand symmetric: transpose the left one
4548  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4549  else // Remaining case: transpose the right operand (presumably the symmetric one, given the enabling condition)
4550  smpAddAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4551  }
4553  //**********************************************************************************************
4554 
4555  //**SMP addition assignment to sparse matrices**************************************************
4556  // No special implementation for the SMP addition assignment to sparse matrices.
4557  //**********************************************************************************************
4558 
4559  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the multiplication expression to a dense matrix.
// Takes part in overload resolution only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
//
// lhs: target dense matrix.
// rhs: multiplication expression to be subtracted.
//
// Returns without touching the target when the target is empty or the inner dimension
// is zero. Otherwise both operands are evaluated into A (type LT) and B (type RT) and
// the product A * B is forwarded to smpSubAssign().
4575  template< typename MT // Type of the target dense matrix
4576  , bool SO > // Storage order of the target dense matrix
4577  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4578  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4579  {
4581 
4582  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4583  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4584 
4585  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) { // Empty target or zero inner dimension: nothing to subtract
4586  return;
4587  }
4588 
4589  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4590  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4591 
4592  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4593  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4594  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4595  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4596  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4597  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4598 
4599  smpSubAssign( ~lhs, A * B );
4600  }
4602  //**********************************************************************************************
4603 
4604  //**Restructuring SMP subtraction assignment to row-major matrices******************************
// Restructuring SMP subtraction assignment of the multiplication expression to a
// row-major matrix. Takes part in overload resolution only when
// CanExploitSymmetry_v<MT,MT1,MT2> is true.
//
// lhs: target row-major matrix.
// rhs: multiplication expression to be subtracted.
//
// The product is rebuilt with trans() applied to one or both operands, wrapped in the
// ForwardFunctor, and handed to smpSubAssign() again.
4620  template< typename MT > // Type of the target matrix
4621  friend inline auto smpSubAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4622  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4623  {
4625 
4627 
4628  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4629  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4630 
4631  const ForwardFunctor fwd;
4632 
4633  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> ) // Both operands symmetric: transpose both
4634  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4635  else if( IsSymmetric_v<MT1> ) // Only the left operand symmetric: transpose the left one
4636  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4637  else // Remaining case: transpose the right operand (presumably the symmetric one, given the enabling condition)
4638  smpSubAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4639  }
4641  //**********************************************************************************************
4642 
4643  //**SMP subtraction assignment to sparse matrices***********************************************
4644  // No special implementation for the SMP subtraction assignment to sparse matrices.
4645  //**********************************************************************************************
4646 
4647  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the multiplication expression to a dense matrix.
//
// lhs: target dense matrix.
// rhs: multiplication expression used as the right-hand side of the Schur product.
//
// The expression is first evaluated into a temporary of type ResultType, and that
// temporary is then forwarded to smpSchurAssign().
4661  template< typename MT // Type of the target dense matrix
4662  , bool SO > // Storage order of the target dense matrix
4663  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4664  {
4666 
4670 
4671  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4672  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4673 
4674  const ResultType tmp( rhs ); // Evaluation of the complete product into the temporary
4675  smpSchurAssign( ~lhs, tmp );
4676  }
4678  //**********************************************************************************************
4679 
4680  //**SMP Schur product assignment to sparse matrices*********************************************
4681  // No special implementation for the SMP Schur product assignment to sparse matrices.
4682  //**********************************************************************************************
4683 
4684  //**SMP multiplication assignment to dense matrices*********************************************
4685  // No special implementation for the SMP multiplication assignment to dense matrices.
4686  //**********************************************************************************************
4687 
4688  //**SMP multiplication assignment to sparse matrices********************************************
4689  // No special implementation for the SMP multiplication assignment to sparse matrices.
4690  //**********************************************************************************************
4691 
4692  //**Compile time checks*************************************************************************
4700  //**********************************************************************************************
4701 };
4702 //*************************************************************************************************
4703 
4704 
4705 
4706 
4707 //=================================================================================================
4708 //
4709 // DMATSCALARMULTEXPR SPECIALIZATION
4710 //
4711 //=================================================================================================
4712 
4713 //*************************************************************************************************
4721 template< typename MT1 // Type of the left-hand side dense matrix
4722  , typename MT2 // Type of the right-hand side dense matrix
4723  , bool SF // Symmetry flag
4724  , bool HF // Hermitian flag
4725  , bool LF // Lower flag
4726  , bool UF // Upper flag
4727  , typename ST > // Type of the right-hand side scalar value
4728 class DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
4729  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
4730  , private Computation
4731 {
4732  private:
4733  //**Type definitions****************************************************************************
4735  using MMM = TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>; // Type of the wrapped matrix/matrix multiplication expression
4736 
4737  using RES = ResultType_t<MMM>; // Result type of the multiplication expression
4738  using RT1 = ResultType_t<MT1>; // Result type of the left-hand side matrix operand
4739  using RT2 = ResultType_t<MT2>; // Result type of the right-hand side matrix operand
4740  using ET1 = ElementType_t<RT1>; // Element type of the left-hand side result type
4741  using ET2 = ElementType_t<RT2>; // Element type of the right-hand side result type
4742  using CT1 = CompositeType_t<MT1>; // Composite type of the left-hand side matrix operand
4743  using CT2 = CompositeType_t<MT2>; // Composite type of the right-hand side matrix operand
4744  //**********************************************************************************************
4745 
4746  //**********************************************************************************************
4748  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> ); // True if the left operand is a computation or requires evaluation
4749  //**********************************************************************************************
4750 
4751  //**********************************************************************************************
4753  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> ); // True if the right operand is a computation or requires evaluation
4754  //**********************************************************************************************
4755 
4756  //**********************************************************************************************
// Structure flags derived from the symmetry (SF), Hermitian (HF), lower (LF) and upper (UF)
// template flags. SYM and HERM are only set when no triangular flag is set; a symmetric or
// Hermitian flag combined with one triangular flag sets both LOW and UPP.
4757  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
4758  static constexpr bool HERM = ( HF && !( LF || UF ) );
4759  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4760  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
4761  //**********************************************************************************************
4762 
4763  //**********************************************************************************************
4765 
// Helper flag for the restructuring overloads: true when the target type T1 is a
// row-major matrix and at least one of the operand types T2, T3 is symmetric.
4769  template< typename T1, typename T2, typename T3 >
4770  static constexpr bool CanExploitSymmetry_v =
4771  ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
4772  //**********************************************************************************************
4773 
4774  //**********************************************************************************************
4776 
// Helper flag: true when at least one operand has to be evaluated (evaluateLeft or
// evaluateRight) and the symmetry of the operands cannot be exploited for target type T1.
4779  template< typename T1, typename T2, typename T3 >
4780  static constexpr bool IsEvaluationRequired_v =
4781  ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
4782  //**********************************************************************************************
4783 
4784  //**********************************************************************************************
4786 
// Helper flag selecting the BLAS kernels. T1 is the target type, T2 and T3 are the operand
// types, T4 is the scalar type. All of the following must hold:
//  - BLAS mode and BLAS matrix/matrix multiplication are enabled,
//  - none of the structure flags SYM, HERM, LOW, UPP is set,
//  - all three matrix types are contiguous with (mutable resp. const) data access,
//  - neither operand is diagonal,
//  - all three matrix types are SIMD enabled,
//  - all three element types are BLAS compatible and identical,
//  - the combination "builtin element type with complex scalar" is excluded.
4788  template< typename T1, typename T2, typename T3, typename T4 >
4789  static constexpr bool UseBlasKernel_v =
4790  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4791  !SYM && !HERM && !LOW && !UPP &&
4792  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4793  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4794  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4795  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4796  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4797  IsBLASCompatible_v< ElementType_t<T1> > &&
4798  IsBLASCompatible_v< ElementType_t<T2> > &&
4799  IsBLASCompatible_v< ElementType_t<T3> > &&
4800  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4801  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4802  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
4803  //**********************************************************************************************
4804 
4805  //**********************************************************************************************
4807 
// Helper flag selecting the vectorized default kernels. T1 is the target type, T2 and T3
// are the operand types, T4 is the scalar type. Requires useOptimizedKernels, a
// non-diagonal T2, SIMD-enabled matrix types, SIMD-combinable element/scalar types and
// available SIMD addition and multiplication.
// NOTE(review): HasSIMDAdd_v is instantiated with T2's element type twice and HasSIMDMult_v
// with T3's element type twice, rather than with the <T2,T3> pair -- confirm this is intended.
4809  template< typename T1, typename T2, typename T3, typename T4 >
4810  static constexpr bool UseVectorizedDefaultKernel_v =
4811  ( useOptimizedKernels &&
4812  !IsDiagonal_v<T2> &&
4813  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4814  IsSIMDCombinable_v< ElementType_t<T1>
4815  , ElementType_t<T2>
4816  , ElementType_t<T3>
4817  , T4 > &&
4818  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T2> > &&
4819  HasSIMDMult_v< ElementType_t<T3>, ElementType_t<T3> > );
4820  //**********************************************************************************************
4821 
4822  //**********************************************************************************************
4824 
// Functor type used to forward the structure flags to restructured expressions.
// Selection order: HERM -> DeclHerm, SYM -> DeclSym, LOW and UPP -> DeclDiag,
// LOW only -> DeclLow, UPP only -> DeclUpp, none of them -> Noop.
4826  using ForwardFunctor = If_t< HERM
4827  , DeclHerm
4828  , If_t< SYM
4829  , DeclSym
4830  , If_t< LOW
4831  , If_t< UPP
4832  , DeclDiag
4833  , DeclLow >
4834  , If_t< UPP
4835  , DeclUpp
4836  , Noop > > > >;
4837  //**********************************************************************************************
4838 
4839  public:
4840  //**Type definitions****************************************************************************
4842  using This = DMatScalarMultExpr<MMM,ST,true>; // Type of this DMatScalarMultExpr instance
4843 
4845  using BaseType = DenseMatrix<This,true>; // Base type of this DMatScalarMultExpr instance
4846 
// Result type for expression template evaluations: the product type MultTrait_t<RES,ST>,
// decorated according to the structure flags (Hermitian, symmetric, diagonal, lower, upper)
// in the same selection order as the ForwardFunctor, or the plain MultTrait<RES,ST> result
// if no flag is set.
4848  using ResultType = typename If_t< HERM
4849  , DeclHermTrait< MultTrait_t<RES,ST> >
4850  , If_t< SYM
4851  , DeclSymTrait< MultTrait_t<RES,ST> >
4852  , If_t< LOW
4853  , If_t< UPP
4854  , DeclDiagTrait< MultTrait_t<RES,ST> >
4855  , DeclLowTrait< MultTrait_t<RES,ST> > >
4856  , If_t< UPP
4857  , DeclUppTrait< MultTrait_t<RES,ST> >
4858  , MultTrait<RES,ST> > > > >::Type;
4859 
4860  using OppositeType = OppositeType_t<ResultType>; // Result type with opposite storage order
4861  using TransposeType = TransposeType_t<ResultType>; // Transpose type of the result
4862  using ElementType = ElementType_t<ResultType>; // Element type of the result
4863  using SIMDType = SIMDTrait_t<ElementType>; // SIMD type belonging to the element type
4864  using ReturnType = const ElementType; // Return type of the element access functions
4865  using CompositeType = const ResultType; // Type used when this expression is an operand of another expression
4866 
4868  using LeftOperand = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>; // Type of the left-hand side (matrix product) operand
4869 
4871  using RightOperand = ST; // Type of the right-hand side scalar operand
4872 
4874  using LT = If_t< evaluateLeft, const RT1, CT1 >; // Type the left matrix operand is evaluated into: RT1 if evaluation is needed, else CT1
4875 
4877  using RT = If_t< evaluateRight, const RT2, CT2 >; // Type the right matrix operand is evaluated into: RT2 if evaluation is needed, else CT2
4878  //**********************************************************************************************
4879 
4880  //**Compilation flags***************************************************************************
// Compilation switch for the SIMD evaluation of the expression: requires a non-diagonal
// left operand, SIMD-enabled operands, SIMD-combinable element/scalar types and available
// SIMD addition and multiplication for the two element types.
4882  static constexpr bool simdEnabled =
4883  ( !IsDiagonal_v<MT1> &&
4884  MT1::simdEnabled && MT2::simdEnabled &&
4885  IsSIMDCombinable_v<ET1,ET2,ST> &&
4886  HasSIMDAdd_v<ET1,ET2> &&
4887  HasSIMDMult_v<ET1,ET2> );
4888 
// Compilation switch for SMP assignment: true only if neither operand needs an
// intermediate evaluation and both operand types are themselves SMP assignable.
4890  static constexpr bool smpAssignable =
4891  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4892  //**********************************************************************************************
4893 
4894  //**SIMD properties*****************************************************************************
4896  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size; // Size value reported by SIMDTrait for ElementType
4897  //**********************************************************************************************
4898 
4899  //**Constructor*********************************************************************************
// Constructs the scaled multiplication expression.
//
// matrix: the matrix/matrix multiplication expression to be scaled.
// scalar: the scalar factor.
4905  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4906  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4907  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4908  {}
4909  //**********************************************************************************************
4910 
4911  //**Access operator*****************************************************************************
// 2D access to the elements of the scaled product.
//
// i: row index; only checked by an internal assertion against matrix_.rows().
// j: column index; only checked by an internal assertion against matrix_.columns().
// Returns matrix_(i,j) multiplied by the scalar.
4918  inline ReturnType operator()( size_t i, size_t j ) const {
4919  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4920  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4921  return matrix_(i,j) * scalar_;
4922  }
4923  //**********************************************************************************************
4924 
4925  //**At function*********************************************************************************
// Checked access to the elements of the scaled product.
//
// i: row index; must be smaller than matrix_.rows().
// j: column index; must be smaller than matrix_.columns().
// Returns the same value as operator()(i,j).
// An out-of-range index is reported through BLAZE_THROW_OUT_OF_RANGE.
4933  inline ReturnType at( size_t i, size_t j ) const {
4934  if( i >= matrix_.rows() ) {
4935  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4936  }
4937  if( j >= matrix_.columns() ) {
4938  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4939  }
4940  return (*this)(i,j);
4941  }
4942  //**********************************************************************************************
4943 
4944  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, i.e. the rows of the wrapped product.
4949  inline size_t rows() const {
4950  return matrix_.rows();
4951  }
4952  //**********************************************************************************************
4953 
4954  //**Columns function****************************************************************************
// Returns the number of columns of the expression, i.e. the columns of the wrapped product.
4959  inline size_t columns() const {
4960  return matrix_.columns();
4961  }
4962  //**********************************************************************************************
4963 
4964  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the wrapped matrix/matrix multiplication expression.
4969  inline LeftOperand leftOperand() const {
4970  return matrix_;
4971  }
4972  //**********************************************************************************************
4973 
4974  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the scalar factor.
4979  inline RightOperand rightOperand() const {
4980  return scalar_;
4981  }
4982  //**********************************************************************************************
4983 
4984  //**********************************************************************************************
// Reports whether the expression can alias with the given address.
// The query is delegated unchanged to the wrapped multiplication expression.
//
// alias: the address to be checked.
4990  template< typename T >
4991  inline bool canAlias( const T* alias ) const {
4992  return matrix_.canAlias( alias );
4993  }
4994  //**********************************************************************************************
4995 
4996  //**********************************************************************************************
// Reports whether the expression is aliased with the given address.
// The query is delegated unchanged to the wrapped multiplication expression.
//
// alias: the address to be checked.
5002  template< typename T >
5003  inline bool isAliased( const T* alias ) const {
5004  return matrix_.isAliased( alias );
5005  }
5006  //**********************************************************************************************
5007 
5008  //**********************************************************************************************
// Reports the alignment state of the expression as given by the wrapped multiplication expression.
5013  inline bool isAligned() const {
5014  return matrix_.isAligned();
5015  }
5016  //**********************************************************************************************
5017 
5018  //**********************************************************************************************
// Reports whether the expression can be used in SMP assignments.
// As visible here, the result is true when rows()*columns() reaches
// SMP_TDMATTDMATMULT_THRESHOLD and, in addition, BLAS mode or the BLAS matrix/matrix
// multiplication is disabled or rows()*columns() is below TDMATTDMATMULT_THRESHOLD.
// NOTE(review): the listing numbering jumps from 5025 to 5027, so one condition line of
// the first parenthesized group may be missing from this extract -- confirm against the
// upstream header.
5023  inline bool canSMPAssign() const noexcept {
5024  return ( !BLAZE_BLAS_MODE ||
5025  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
5027  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
5028  ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
5029  }
5030  //**********************************************************************************************
5031 
5032  private:
5033  //**Member variables****************************************************************************
5036  //**********************************************************************************************
5037 
5038  //**Assignment to dense matrices****************************************************************
// Assignment of the scaled multiplication expression to a dense matrix.
// Removed from overload resolution when CanExploitSymmetry_v<MT,MT1,MT2> is true.
//
// lhs: target dense matrix.
// rhs: scaled multiplication expression to be assigned.
//
// An empty target returns immediately; a zero inner dimension resets the target.
// Otherwise both matrix operands are evaluated serially into A (type LT) and B (type RT)
// and the work is handed to selectAssignKernel() together with the scalar.
5050  template< typename MT // Type of the target dense matrix
5051  , bool SO > // Storage order of the target dense matrix
5052  friend inline auto assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5053  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
5054  {
5056 
5057  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5058  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5059 
5060  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() ); // Left matrix operand of the wrapped product
5061  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() ); // Right matrix operand of the wrapped product
5062 
5063  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) { // Nothing to assign to
5064  return;
5065  }
5066  else if( left.columns() == 0UL ) { // Inner dimension is zero: the target is reset
5067  reset( ~lhs );
5068  return;
5069  }
5070 
5071  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5072  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5073 
5074  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5075  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5076  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5077  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5078  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5079  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
5080 
5081  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
5082  }
5083  //**********************************************************************************************
5084 
5085  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = A * B * scalar.
//
// C: target matrix.
// A: left-hand side matrix operand.
// B: right-hand side matrix operand.
// scalar: scaling factor.
//
// selectSmallAssignKernel() is used if the type of A is diagonal, or (outside debug mode)
// A has at most SIMDSIZE*10 rows, or the number of target elements is below
// TDMATTDMATMULT_THRESHOLD; otherwise selectBlasAssignKernel() is used.
5096  template< typename MT3 // Type of the left-hand side target matrix
5097  , typename MT4 // Type of the left-hand side matrix operand
5098  , typename MT5 // Type of the right-hand side matrix operand
5099  , typename ST2 > // Type of the scalar value
5100  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5101  {
5102  if( ( IsDiagonal_v<MT4> ) ||
5103  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
5104  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5105  selectSmallAssignKernel( C, A, B, scalar );
5106  else
5107  selectBlasAssignKernel( C, A, B, scalar );
5108  }
5109  //**********************************************************************************************
5110 
5111  //**Default assignment to dense matrices (general/general)**************************************
// Default (non-vectorized) kernel for the assignment C = A * B * scalar when neither
// operand type is diagonal.
//
// C: target matrix.
// A: left-hand side matrix operand.
// B: right-hand side matrix operand.
// scalar: scaling factor.
//
// The target is processed column by column. For each column j the kernel
//  1. derives the range [kbegin,kend) of inner indices from the triangular structure of B,
//  2. initializes the column with the contribution of k == kbegin and resets entries
//     outside the computed row range where the structure flags require it,
//  3. accumulates the contributions of the remaining k,
//  4. scales the computed row range by the scalar.
// For SYM/HERM results only rows i >= j are computed in steps 3 and 4; the entries above
// the diagonal are copied (HERM: conjugated) from the lower part at the end.
5125  template< typename MT3 // Type of the left-hand side target matrix
5126  , typename MT4 // Type of the left-hand side matrix operand
5127  , typename MT5 // Type of the right-hand side matrix operand
5128  , typename ST2 > // Type of the scalar value
5129  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5130  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5131  {
5132  const size_t M( A.rows() );
5133  const size_t N( B.columns() );
5134  const size_t K( A.columns() );
5135 
5136  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5137 
5138  for( size_t j=0UL; j<N; ++j )
5139  {
   // Range of inner indices k for column j, restricted by the triangular structure of B
5140  const size_t kbegin( ( IsLower_v<MT5> )
5141  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5142  :( 0UL ) );
5143  const size_t kend( ( IsUpper_v<MT5> )
5144  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5145  :( K ) );
5146  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5147 
   // Empty k-range for a strictly triangular B: the whole column is reset
5148  if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
5149  for( size_t i=0UL; i<M; ++i ) {
5150  reset( C(i,j) );
5151  }
5152  continue;
5153  }
5154 
   // First pass (k == kbegin): plain assignment instead of accumulation, plus resets
   // of the entries outside [ibegin,iend) where the structure flags require it
5155  {
5156  const size_t ibegin( ( IsLower_v<MT4> )
5157  ?( ( IsStrictlyLower_v<MT4> )
5158  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
5159  :( LOW ? max(j,kbegin) : kbegin ) )
5160  :( LOW ? j : 0UL ) );
5161  const size_t iend( ( IsUpper_v<MT4> )
5162  ?( ( IsStrictlyUpper_v<MT4> )
5163  ?( UPP ? min(j+1UL,kbegin) : kbegin )
5164  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
5165  :( UPP ? j+1UL : M ) );
5166 
5167  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5168  for( size_t i=0UL; i<ibegin; ++i ) {
5169  reset( C(i,j) );
5170  }
5171  }
5172  else if( IsStrictlyLower_v<MT4> ) {
5173  reset( C(0UL,j) );
5174  }
5175  for( size_t i=ibegin; i<iend; ++i ) {
5176  C(i,j) = A(i,kbegin) * B(kbegin,j);
5177  }
5178  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5179  for( size_t i=iend; i<M; ++i ) {
5180  reset( C(i,j) );
5181  }
5182  }
5183  else if( IsStrictlyUpper_v<MT4> ) {
5184  reset( C(M-1UL,j) );
5185  }
5186  }
5187 
   // Remaining k: accumulation into the already initialized entries
5188  for( size_t k=kbegin+1UL; k<kend; ++k )
5189  {
5190  const size_t ibegin( ( IsLower_v<MT4> )
5191  ?( ( IsStrictlyLower_v<MT4> )
5192  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
5193  :( SYM || HERM || LOW ? max( j, k ) : k ) )
5194  :( SYM || HERM || LOW ? j : 0UL ) );
5195  const size_t iend( ( IsUpper_v<MT4> )
5196  ?( ( IsStrictlyUpper_v<MT4> )
5197  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
5198  :( UPP ? min(j+1UL,k) : k ) )
5199  :( UPP ? j+1UL : M ) );
5200 
5201  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
5202  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5203 
5204  for( size_t i=ibegin; i<iend; ++i ) {
5205  C(i,j) += A(i,k) * B(k,j);
5206  }
5207  if( IsUpper_v<MT4> ) {
5208  C(iend,j) = A(iend,k) * B(k,j); // Row iend is assigned, not accumulated, in this step
5209  }
5210  }
5211 
   // Final pass for column j: scaling of the computed row range
5212  {
5213  const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
5214  ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
5215  :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
5216  const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5217  ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
5218  :( UPP ? j+1UL : M ) );
5219 
5220  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
5221  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5222 
5223  for( size_t i=ibegin; i<iend; ++i ) {
5224  C(i,j) *= scalar;
5225  }
5226  }
5227  }
5228 
   // Symmetric/Hermitian result: fill the entries above the diagonal from the lower part
5229  if( SYM || HERM ) {
5230  for( size_t j=1UL; j<N; ++j ) {
5231  for( size_t i=0UL; i<j; ++i ) {
5232  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5233  }
5234  }
5235  }
5236  }
5237  //**********************************************************************************************
5238 
5239  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default kernel for the assignment C = A * B * scalar when the type of A is not diagonal
// and the type of B is diagonal.
//
// C: target matrix.
// A: left-hand side matrix operand.
// B: right-hand side (diagonal) matrix operand.
// scalar: scaling factor.
//
// Each column j of the target only needs the single diagonal element B(j,j):
// C(i,j) = A(i,j) * B(j,j) * scalar for the rows in [ibegin,iend), which are restricted
// by the triangular structure of A. Rows outside that range are reset.
5253  template< typename MT3 // Type of the left-hand side target matrix
5254  , typename MT4 // Type of the left-hand side matrix operand
5255  , typename MT5 // Type of the right-hand side matrix operand
5256  , typename ST2 > // Type of the scalar value
5257  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5258  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5259  {
5261 
5262  const size_t M( A.rows() );
5263  const size_t N( B.columns() );
5264 
5265  for( size_t j=0UL; j<N; ++j )
5266  {
   // Row range of column j, restricted by the triangular structure of A
5267  const size_t ibegin( ( IsLower_v<MT4> )
5268  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
5269  :( 0UL ) );
5270  const size_t iend( ( IsUpper_v<MT4> )
5271  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
5272  :( M ) );
5273  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5274 
5275  if( IsLower_v<MT4> ) { // Rows above the range are reset for a lower A
5276  for( size_t i=0UL; i<ibegin; ++i ) {
5277  reset( C(i,j) );
5278  }
5279  }
5280  for( size_t i=ibegin; i<iend; ++i ) {
5281  C(i,j) = A(i,j) * B(j,j) * scalar;
5282  }
5283  if( IsUpper_v<MT4> ) { // Rows below the range are reset for an upper A
5284  for( size_t i=iend; i<M; ++i ) {
5285  reset( C(i,j) );
5286  }
5287  }
5288  }
5289  }
5290  //**********************************************************************************************
5291 
5292  //**Default assignment to dense matrices (diagonal/general)*************************************
5306  template< typename MT3 // Type of the left-hand side target matrix
5307  , typename MT4 // Type of the left-hand side matrix operand
5308  , typename MT5 // Type of the right-hand side matrix operand
5309  , typename ST2 > // Type of the scalar value
5310  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5311  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5312  {
5314 
5315  const size_t M( A.rows() );
5316  const size_t N( B.columns() );
5317 
5318  for( size_t j=0UL; j<N; ++j )
5319  {
5320  const size_t ibegin( ( IsLower_v<MT5> )
5321  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5322  :( 0UL ) );
5323  const size_t iend( ( IsUpper_v<MT5> )
5324  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5325  :( M ) );
5326  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5327 
5328  if( IsLower_v<MT4> ) {
5329  for( size_t i=0UL; i<ibegin; ++i ) {
5330  reset( C(i,j) );
5331  }
5332  }
5333  for( size_t i=ibegin; i<iend; ++i ) {
5334  C(i,j) = A(i,i) * B(i,j) * scalar;
5335  }
5336  if( IsUpper_v<MT4> ) {
5337  for( size_t i=iend; i<M; ++i ) {
5338  reset( C(i,j) );
5339  }
5340  }
5341  }
5342  }
5343  //**********************************************************************************************
5344 
5345  //**Default assignment to dense matrices (diagonal/diagonal)************************************
5359  template< typename MT3 // Type of the left-hand side target matrix
5360  , typename MT4 // Type of the left-hand side matrix operand
5361  , typename MT5 // Type of the right-hand side matrix operand
5362  , typename ST2 > // Type of the scalar value
5363  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5364  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5365  {
5367 
5368  reset( C );
5369 
5370  for( size_t i=0UL; i<A.rows(); ++i ) {
5371  C(i,i) = A(i,i) * B(i,i) * scalar;
5372  }
5373  }
5374  //**********************************************************************************************
5375 
5376  //**Default assignment to dense matrices (small matrices)***************************************
5390  template< typename MT3 // Type of the left-hand side target matrix
5391  , typename MT4 // Type of the left-hand side matrix operand
5392  , typename MT5 // Type of the right-hand side matrix operand
5393  , typename ST2 > // Type of the scalar value
5394  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5395  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5396  {
5397  selectDefaultAssignKernel( C, A, B, scalar );
5398  }
5399  //**********************************************************************************************
5400 
5401  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
5416  template< typename MT3 // Type of the left-hand side target matrix
5417  , typename MT4 // Type of the left-hand side matrix operand
5418  , typename MT5 // Type of the right-hand side matrix operand
5419  , typename ST2 > // Type of the scalar value
5420  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5421  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5422  {
5425  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
5426  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
5427 
5428  const ForwardFunctor fwd;
5429 
5430  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
5431  const OppositeType_t<MT5> tmp( serial( B ) );
5432  assign( C, fwd( A * tmp ) * scalar );
5433  }
5434  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
5435  const OppositeType_t<MT4> tmp( serial( A ) );
5436  assign( C, fwd( tmp * B ) * scalar );
5437  }
5438  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5439  const OppositeType_t<MT5> tmp( serial( B ) );
5440  assign( C, fwd( A * tmp ) * scalar );
5441  }
5442  else {
5443  const OppositeType_t<MT4> tmp( serial( A ) );
5444  assign( C, fwd( tmp * B ) * scalar );
5445  }
5446  }
5447  //**********************************************************************************************
5448 
5449  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
5464  template< typename MT3 // Type of the left-hand side target matrix
5465  , typename MT4 // Type of the left-hand side matrix operand
5466  , typename MT5 // Type of the right-hand side matrix operand
5467  , typename ST2 > // Type of the scalar value
5468  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5469  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5470  {
5471  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
5472 
5473  const size_t M( A.rows() );
5474  const size_t N( B.columns() );
5475  const size_t K( A.columns() );
5476 
5477  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5478 
5479  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
5480  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5481 
5482  const SIMDType factor( set( scalar ) );
5483 
5484  size_t i( 0UL );
5485 
5486  if( IsIntegral_v<ElementType> )
5487  {
5488  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5489  for( size_t j=0UL; j<N; ++j )
5490  {
5491  const size_t kbegin( ( IsLower_v<MT5> )
5492  ?( ( IsUpper_v<MT4> )
5493  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5494  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5495  :( IsUpper_v<MT4> ? i : 0UL ) );
5496  const size_t kend( ( IsUpper_v<MT5> )
5497  ?( ( IsLower_v<MT4> )
5498  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
5499  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
5500  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
5501 
5502  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5503 
5504  for( size_t k=kbegin; k<kend; ++k ) {
5505  const SIMDType b1( set( B(k,j) ) );
5506  xmm1 += A.load(i ,k) * b1;
5507  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5508  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5509  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5510  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5511  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
5512  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
5513  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
5514  }
5515 
5516  C.store( i , j, xmm1 * factor );
5517  C.store( i+SIMDSIZE , j, xmm2 * factor );
5518  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5519  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5520  C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5521  C.store( i+SIMDSIZE*5UL, j, xmm6 * factor );
5522  C.store( i+SIMDSIZE*6UL, j, xmm7 * factor );
5523  C.store( i+SIMDSIZE*7UL, j, xmm8 * factor );
5524  }
5525  }
5526  }
5527 
5528  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5529  {
5530  size_t j( 0UL );
5531 
5532  for( ; (j+2UL) <= N; j+=2UL )
5533  {
5534  const size_t kbegin( ( IsLower_v<MT5> )
5535  ?( ( IsUpper_v<MT4> )
5536  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5537  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5538  :( IsUpper_v<MT4> ? i : 0UL ) );
5539  const size_t kend( ( IsUpper_v<MT5> )
5540  ?( ( IsLower_v<MT4> )
5541  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5542  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5543  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
5544 
5545  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5546 
5547  for( size_t k=kbegin; k<kend; ++k ) {
5548  const SIMDType a1( A.load(i ,k) );
5549  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5550  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5551  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5552  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5553  const SIMDType b1( set( B(k,j ) ) );
5554  const SIMDType b2( set( B(k,j+1UL) ) );
5555  xmm1 += a1 * b1;
5556  xmm2 += a2 * b1;
5557  xmm3 += a3 * b1;
5558  xmm4 += a4 * b1;
5559  xmm5 += a5 * b1;
5560  xmm6 += a1 * b2;
5561  xmm7 += a2 * b2;
5562  xmm8 += a3 * b2;
5563  xmm9 += a4 * b2;
5564  xmm10 += a5 * b2;
5565  }
5566 
5567  C.store( i , j , xmm1 * factor );
5568  C.store( i+SIMDSIZE , j , xmm2 * factor );
5569  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5570  C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5571  C.store( i+SIMDSIZE*4UL, j , xmm5 * factor );
5572  C.store( i , j+1UL, xmm6 * factor );
5573  C.store( i+SIMDSIZE , j+1UL, xmm7 * factor );
5574  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
5575  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
5576  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
5577  }
5578 
5579  if( j < N )
5580  {
5581  const size_t kbegin( ( IsLower_v<MT5> )
5582  ?( ( IsUpper_v<MT4> )
5583  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5584  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5585  :( IsUpper_v<MT4> ? i : 0UL ) );
5586  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
5587 
5588  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5589 
5590  for( size_t k=kbegin; k<kend; ++k ) {
5591  const SIMDType b1( set( B(k,j) ) );
5592  xmm1 += A.load(i ,k) * b1;
5593  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5594  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5595  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5596  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5597  }
5598 
5599  C.store( i , j, xmm1 * factor );
5600  C.store( i+SIMDSIZE , j, xmm2 * factor );
5601  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5602  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5603  C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5604  }
5605  }
5606 
5607  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5608  {
5609  const size_t jend( LOW ? min(i+SIMDSIZE*4UL,N) : N );
5610  size_t j( 0UL );
5611 
5612  if( SYM || HERM ) {
5613  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
5614  for( ; j<i; ++j ) {
5615  for( size_t ii=i; ii<iiend; ++ii ) {
5616  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
5617  }
5618  }
5619  }
5620  else if( UPP ) {
5621  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
5622  for( ; j<i; ++j ) {
5623  for( size_t ii=i; ii<iiend; ++ii ) {
5624  reset( C(ii,j) );
5625  }
5626  }
5627  }
5628 
5629  for( ; (j+2UL) <= jend; j+=2UL )
5630  {
5631  const size_t kbegin( ( IsLower_v<MT5> )
5632  ?( ( IsUpper_v<MT4> )
5633  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5634  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5635  :( IsUpper_v<MT4> ? i : 0UL ) );
5636  const size_t kend( ( IsUpper_v<MT5> )
5637  ?( ( IsLower_v<MT4> )
5638  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5639  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5640  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
5641 
5642  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5643 
5644  for( size_t k=kbegin; k<kend; ++k ) {
5645  const SIMDType a1( A.load(i ,k) );
5646  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5647  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5648  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5649  const SIMDType b1( set( B(k,j ) ) );
5650  const SIMDType b2( set( B(k,j+1UL) ) );
5651  xmm1 += a1 * b1;
5652  xmm2 += a2 * b1;
5653  xmm3 += a3 * b1;
5654  xmm4 += a4 * b1;
5655  xmm5 += a1 * b2;
5656  xmm6 += a2 * b2;
5657  xmm7 += a3 * b2;
5658  xmm8 += a4 * b2;
5659  }
5660 
5661  C.store( i , j , xmm1 * factor );
5662  C.store( i+SIMDSIZE , j , xmm2 * factor );
5663  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5664  C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5665  C.store( i , j+1UL, xmm5 * factor );
5666  C.store( i+SIMDSIZE , j+1UL, xmm6 * factor );
5667  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
5668  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
5669  }
5670 
5671  if( j < jend )
5672  {
5673  const size_t kbegin( ( IsLower_v<MT5> )
5674  ?( ( IsUpper_v<MT4> )
5675  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5676  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5677  :( IsUpper_v<MT4> ? i : 0UL ) );
5678  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
5679 
5680  SIMDType xmm1, xmm2, xmm3, xmm4;
5681 
5682  for( size_t k=kbegin; k<kend; ++k ) {
5683  const SIMDType b1( set( B(k,j) ) );
5684  xmm1 += A.load(i ,k) * b1;
5685  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5686  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5687  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5688  }
5689 
5690  C.store( i , j, xmm1 * factor );
5691  C.store( i+SIMDSIZE , j, xmm2 * factor );
5692  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5693  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5694 
5695  if( LOW ) ++j;
5696  }
5697 
5698  if( LOW ) {
5699  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
5700  for( ; j<N; ++j ) {
5701  for( size_t ii=i; ii<iiend; ++ii ) {
5702  reset( C(ii,j) );
5703  }
5704  }
5705  }
5706  }
5707 
5708  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5709  {
5710  const size_t jend( LOW ? min(i+SIMDSIZE*3UL,N) : N );
5711  size_t j( 0UL );
5712 
5713  if( SYM || HERM ) {
5714  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
5715  for( ; j<i; ++j ) {
5716  for( size_t ii=i; ii<iiend; ++ii ) {
5717  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
5718  }
5719  }
5720  }
5721  else if( UPP ) {
5722  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
5723  for( ; j<i; ++j ) {
5724  for( size_t ii=i; ii<iiend; ++ii ) {
5725  reset( C(ii,j) );
5726  }
5727  }
5728  }
5729 
5730  for( ; (j+2UL) <= jend; j+=2UL )
5731  {
5732  const size_t kbegin( ( IsLower_v<MT5> )
5733  ?( ( IsUpper_v<MT4> )
5734  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5735  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5736  :( IsUpper_v<MT4> ? i : 0UL ) );
5737  const size_t kend( ( IsUpper_v<MT5> )
5738  ?( ( IsLower_v<MT4> )
5739  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5740  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5741  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
5742 
5743  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5744 
5745  for( size_t k=kbegin; k<kend; ++k ) {
5746  const SIMDType a1( A.load(i ,k) );
5747  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5748  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5749  const SIMDType b1( set( B(k,j ) ) );
5750  const SIMDType b2( set( B(k,j+1UL) ) );
5751  xmm1 += a1 * b1;
5752  xmm2 += a2 * b1;
5753  xmm3 += a3 * b1;
5754  xmm4 += a1 * b2;
5755  xmm5 += a2 * b2;
5756  xmm6 += a3 * b2;
5757  }
5758 
5759  C.store( i , j , xmm1 * factor );
5760  C.store( i+SIMDSIZE , j , xmm2 * factor );
5761  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5762  C.store( i , j+1UL, xmm4 * factor );
5763  C.store( i+SIMDSIZE , j+1UL, xmm5 * factor );
5764  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
5765  }
5766 
5767  if( j < jend )
5768  {
5769  const size_t kbegin( ( IsLower_v<MT5> )
5770  ?( ( IsUpper_v<MT4> )
5771  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5772  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5773  :( IsUpper_v<MT4> ? i : 0UL ) );
5774  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
5775 
5776  SIMDType xmm1, xmm2, xmm3;
5777 
5778  for( size_t k=kbegin; k<kend; ++k ) {
5779  const SIMDType b1( set( B(k,j) ) );
5780  xmm1 += A.load(i ,k) * b1;
5781  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5782  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5783  }
5784 
5785  C.store( i , j, xmm1 * factor );
5786  C.store( i+SIMDSIZE , j, xmm2 * factor );
5787  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5788 
5789  if( LOW ) ++j;
5790  }
5791 
5792  if( LOW ) {
5793  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
5794  for( ; j<N; ++j ) {
5795  for( size_t ii=i; ii<iiend; ++ii ) {
5796  reset( C(ii,j) );
5797  }
5798  }
5799  }
5800  }
5801 
5802  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5803  {
5804  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
5805  size_t j( 0UL );
5806 
5807  if( SYM || HERM ) {
5808  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
5809  for( ; j<i; ++j ) {
5810  for( size_t ii=i; ii<iiend; ++ii ) {
5811  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
5812  }
5813  }
5814  }
5815  else if( UPP ) {
5816  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
5817  for( ; j<i; ++j ) {
5818  for( size_t ii=i; ii<iiend; ++ii ) {
5819  reset( C(ii,j) );
5820  }
5821  }
5822  }
5823 
5824  for( ; (j+4UL) <= jend; j+=4UL )
5825  {
5826  const size_t kbegin( ( IsLower_v<MT5> )
5827  ?( ( IsUpper_v<MT4> )
5828  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5829  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5830  :( IsUpper_v<MT4> ? i : 0UL ) );
5831  const size_t kend( ( IsUpper_v<MT5> )
5832  ?( ( IsLower_v<MT4> )
5833  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
5834  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
5835  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
5836 
5837  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5838 
5839  for( size_t k=kbegin; k<kend; ++k ) {
5840  const SIMDType a1( A.load(i ,k) );
5841  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5842  const SIMDType b1( set( B(k,j ) ) );
5843  const SIMDType b2( set( B(k,j+1UL) ) );
5844  const SIMDType b3( set( B(k,j+2UL) ) );
5845  const SIMDType b4( set( B(k,j+3UL) ) );
5846  xmm1 += a1 * b1;
5847  xmm2 += a2 * b1;
5848  xmm3 += a1 * b2;
5849  xmm4 += a2 * b2;
5850  xmm5 += a1 * b3;
5851  xmm6 += a2 * b3;
5852  xmm7 += a1 * b4;
5853  xmm8 += a2 * b4;
5854  }
5855 
5856  C.store( i , j , xmm1 * factor );
5857  C.store( i+SIMDSIZE, j , xmm2 * factor );
5858  C.store( i , j+1UL, xmm3 * factor );
5859  C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5860  C.store( i , j+2UL, xmm5 * factor );
5861  C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5862  C.store( i , j+3UL, xmm7 * factor );
5863  C.store( i+SIMDSIZE, j+3UL, xmm8 * factor );
5864  }
5865 
5866  for( ; (j+3UL) <= jend; j+=3UL )
5867  {
5868  const size_t kbegin( ( IsLower_v<MT5> )
5869  ?( ( IsUpper_v<MT4> )
5870  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5871  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5872  :( IsUpper_v<MT4> ? i : 0UL ) );
5873  const size_t kend( ( IsUpper_v<MT5> )
5874  ?( ( IsLower_v<MT4> )
5875  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
5876  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
5877  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
5878 
5879  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5880 
5881  for( size_t k=kbegin; k<kend; ++k ) {
5882  const SIMDType a1( A.load(i ,k) );
5883  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5884  const SIMDType b1( set( B(k,j ) ) );
5885  const SIMDType b2( set( B(k,j+1UL) ) );
5886  const SIMDType b3( set( B(k,j+2UL) ) );
5887  xmm1 += a1 * b1;
5888  xmm2 += a2 * b1;
5889  xmm3 += a1 * b2;
5890  xmm4 += a2 * b2;
5891  xmm5 += a1 * b3;
5892  xmm6 += a2 * b3;
5893  }
5894 
5895  C.store( i , j , xmm1 * factor );
5896  C.store( i+SIMDSIZE, j , xmm2 * factor );
5897  C.store( i , j+1UL, xmm3 * factor );
5898  C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5899  C.store( i , j+2UL, xmm5 * factor );
5900  C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5901  }
5902 
5903  for( ; (j+2UL) <= jend; j+=2UL )
5904  {
5905  const size_t kbegin( ( IsLower_v<MT5> )
5906  ?( ( IsUpper_v<MT4> )
5907  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5908  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5909  :( IsUpper_v<MT4> ? i : 0UL ) );
5910  const size_t kend( ( IsUpper_v<MT5> )
5911  ?( ( IsLower_v<MT4> )
5912  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5913  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5914  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
5915 
5916  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5917  size_t k( kbegin );
5918 
5919  for( ; (k+2UL) <= kend; k+=2UL ) {
5920  const SIMDType a1( A.load(i ,k ) );
5921  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
5922  const SIMDType a3( A.load(i ,k+1UL) );
5923  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
5924  const SIMDType b1( set( B(k ,j ) ) );
5925  const SIMDType b2( set( B(k ,j+1UL) ) );
5926  const SIMDType b3( set( B(k+1UL,j ) ) );
5927  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
5928  xmm1 += a1 * b1;
5929  xmm2 += a2 * b1;
5930  xmm3 += a1 * b2;
5931  xmm4 += a2 * b2;
5932  xmm5 += a3 * b3;
5933  xmm6 += a4 * b3;
5934  xmm7 += a3 * b4;
5935  xmm8 += a4 * b4;
5936  }
5937 
5938  for( ; k<kend; ++k ) {
5939  const SIMDType a1( A.load(i ,k) );
5940  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5941  const SIMDType b1( set( B(k,j ) ) );
5942  const SIMDType b2( set( B(k,j+1UL) ) );
5943  xmm1 += a1 * b1;
5944  xmm2 += a2 * b1;
5945  xmm3 += a1 * b2;
5946  xmm4 += a2 * b2;
5947  }
5948 
5949  C.store( i , j , (xmm1+xmm5) * factor );
5950  C.store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
5951  C.store( i , j+1UL, (xmm3+xmm7) * factor );
5952  C.store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
5953  }
5954 
5955  if( j < jend )
5956  {
5957  const size_t kbegin( ( IsLower_v<MT5> )
5958  ?( ( IsUpper_v<MT4> )
5959  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5960  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5961  :( IsUpper_v<MT4> ? i : 0UL ) );
5962  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
5963 
5964  SIMDType xmm1, xmm2, xmm3, xmm4;
5965  size_t k( kbegin );
5966 
5967  for( ; (k+2UL) <= kend; k+=2UL ) {
5968  const SIMDType b1( set( B(k ,j) ) );
5969  const SIMDType b2( set( B(k+1UL,j) ) );
5970  xmm1 += A.load(i ,k ) * b1;
5971  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
5972  xmm3 += A.load(i ,k+1UL) * b2;
5973  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
5974  }
5975 
5976  for( ; k<kend; ++k ) {
5977  const SIMDType b1( set( B(k,j) ) );
5978  xmm1 += A.load(i ,k) * b1;
5979  xmm2 += A.load(i+SIMDSIZE,k) * b1;
5980  }
5981 
5982  C.store( i , j, (xmm1+xmm3) * factor );
5983  C.store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
5984 
5985  if( LOW ) ++j;
5986  }
5987 
5988  if( LOW ) {
5989  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
5990  for( ; j<N; ++j ) {
5991  for( size_t ii=i; ii<iiend; ++ii ) {
5992  reset( C(ii,j) );
5993  }
5994  }
5995  }
5996  }
5997 
5998  for( ; i<ipos; i+=SIMDSIZE )
5999  {
6000  const size_t jend( LOW ? min(i+SIMDSIZE,N) : N );
6001  size_t j( 0UL );
6002 
6003  if( SYM || HERM ) {
6004  const size_t iiend( min(i+SIMDSIZE,M) );
6005  for( ; j<i; ++j ) {
6006  for( size_t ii=i; ii<iiend; ++ii ) {
6007  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
6008  }
6009  }
6010  }
6011  else if( UPP ) {
6012  const size_t iiend( min(i+SIMDSIZE,M) );
6013  for( ; j<i; ++j ) {
6014  for( size_t ii=i; ii<iiend; ++ii ) {
6015  reset( C(ii,j) );
6016  }
6017  }
6018  }
6019 
6020  for( ; (j+4UL) <= jend; j+=4UL )
6021  {
6022  const size_t kbegin( ( IsLower_v<MT5> )
6023  ?( ( IsUpper_v<MT4> )
6024  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6025  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6026  :( IsUpper_v<MT4> ? i : 0UL ) );
6027  const size_t kend( ( IsUpper_v<MT5> )
6028  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
6029  :( K ) );
6030 
6031  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6032  size_t k( kbegin );
6033 
6034  for( ; (k+2UL) <= kend; k+=2UL ) {
6035  const SIMDType a1( A.load(i,k ) );
6036  const SIMDType a2( A.load(i,k+1UL) );
6037  xmm1 += a1 * set( B(k ,j ) );
6038  xmm2 += a1 * set( B(k ,j+1UL) );
6039  xmm3 += a1 * set( B(k ,j+2UL) );
6040  xmm4 += a1 * set( B(k ,j+3UL) );
6041  xmm5 += a2 * set( B(k+1UL,j ) );
6042  xmm6 += a2 * set( B(k+1UL,j+1UL) );
6043  xmm7 += a2 * set( B(k+1UL,j+2UL) );
6044  xmm8 += a2 * set( B(k+1UL,j+3UL) );
6045  }
6046 
6047  for( ; k<kend; ++k ) {
6048  const SIMDType a1( A.load(i,k) );
6049  xmm1 += a1 * set( B(k,j ) );
6050  xmm2 += a1 * set( B(k,j+1UL) );
6051  xmm3 += a1 * set( B(k,j+2UL) );
6052  xmm4 += a1 * set( B(k,j+3UL) );
6053  }
6054 
6055  C.store( i, j , (xmm1+xmm5) * factor );
6056  C.store( i, j+1UL, (xmm2+xmm6) * factor );
6057  C.store( i, j+2UL, (xmm3+xmm7) * factor );
6058  C.store( i, j+3UL, (xmm4+xmm8) * factor );
6059  }
6060 
6061  for( ; (j+3UL) <= jend; j+=3UL )
6062  {
6063  const size_t kbegin( ( IsLower_v<MT5> )
6064  ?( ( IsUpper_v<MT4> )
6065  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6066  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6067  :( IsUpper_v<MT4> ? i : 0UL ) );
6068  const size_t kend( ( IsUpper_v<MT5> )
6069  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
6070  :( K ) );
6071 
6072  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6073  size_t k( kbegin );
6074 
6075  for( ; (k+2UL) <= kend; k+=2UL ) {
6076  const SIMDType a1( A.load(i,k ) );
6077  const SIMDType a2( A.load(i,k+1UL) );
6078  xmm1 += a1 * set( B(k ,j ) );
6079  xmm2 += a1 * set( B(k ,j+1UL) );
6080  xmm3 += a1 * set( B(k ,j+2UL) );
6081  xmm4 += a2 * set( B(k+1UL,j ) );
6082  xmm5 += a2 * set( B(k+1UL,j+1UL) );
6083  xmm6 += a2 * set( B(k+1UL,j+2UL) );
6084  }
6085 
6086  for( ; k<kend; ++k ) {
6087  const SIMDType a1( A.load(i,k) );
6088  xmm1 += a1 * set( B(k,j ) );
6089  xmm2 += a1 * set( B(k,j+1UL) );
6090  xmm3 += a1 * set( B(k,j+2UL) );
6091  }
6092 
6093  C.store( i, j , (xmm1+xmm4) * factor );
6094  C.store( i, j+1UL, (xmm2+xmm5) * factor );
6095  C.store( i, j+2UL, (xmm3+xmm6) * factor );
6096  }
6097 
6098  for( ; (j+2UL) <= jend; j+=2UL )
6099  {
6100  const size_t kbegin( ( IsLower_v<MT5> )
6101  ?( ( IsUpper_v<MT4> )
6102  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6103  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6104  :( IsUpper_v<MT4> ? i : 0UL ) );
6105  const size_t kend( ( IsUpper_v<MT5> )
6106  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6107  :( K ) );
6108 
6109  SIMDType xmm1, xmm2, xmm3, xmm4;
6110  size_t k( kbegin );
6111 
6112  for( ; k<kend; ++k ) {
6113  const SIMDType a1( A.load(i,k) );
6114  xmm1 += a1 * set( B(k,j ) );
6115  xmm2 += a1 * set( B(k,j+1UL) );
6116  }
6117 
6118  for( ; (k+2UL) <= kend; k+=2UL ) {
6119  const SIMDType a1( A.load(i,k ) );
6120  const SIMDType a2( A.load(i,k+1UL) );
6121  xmm1 += a1 * set( B(k ,j ) );
6122  xmm2 += a1 * set( B(k ,j+1UL) );
6123  xmm3 += a2 * set( B(k+1UL,j ) );
6124  xmm4 += a2 * set( B(k+1UL,j+1UL) );
6125  }
6126 
6127  C.store( i, j , (xmm1+xmm3) * factor );
6128  C.store( i, j+1UL, (xmm2+xmm4) * factor );
6129  }
6130 
6131  if( j < jend )
6132  {
6133  const size_t kbegin( ( IsLower_v<MT5> )
6134  ?( ( IsUpper_v<MT4> )
6135  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6136  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6137  :( IsUpper_v<MT4> ? i : 0UL ) );
6138 
6139  SIMDType xmm1, xmm2;
6140  size_t k( kbegin );
6141 
6142  for( ; (k+2UL) <= K; k+=2UL ) {
6143  xmm1 += A.load(i,k ) * set( B(k ,j) );
6144  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
6145  }
6146 
6147  for( ; k<K; ++k ) {
6148  xmm1 += A.load(i,k) * set( B(k,j) );
6149  }
6150 
6151  C.store( i, j, (xmm1+xmm2) * factor );
6152 
6153  if( LOW ) ++j;
6154  }
6155 
6156  if( LOW ) {
6157  const size_t iiend( min(i+SIMDSIZE,M) );
6158  for( ; j<N; ++j ) {
6159  for( size_t ii=i; ii<iiend; ++ii ) {
6160  reset( C(ii,j) );
6161  }
6162  }
6163  }
6164  }
6165 
6166  for( ; remainder && i<M; ++i )
6167  {
6168  size_t j( 0UL );
6169 
6170  if( SYM || HERM ) {
6171  for( ; j<i; ++j ) {
6172  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
6173  }
6174  }
6175  else if( UPP ) {
6176  for( ; j<i; ++j ) {
6177  reset( C(i,j) );
6178  }
6179  }
6180 
6181  for( ; (j+2UL) <= N; j+=2UL )
6182  {
6183  const size_t kbegin( ( IsLower_v<MT5> )
6184  ?( ( IsUpper_v<MT4> )
6185  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6186  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6187  :( IsUpper_v<MT4> ? i : 0UL ) );
6188  const size_t kend( ( IsUpper_v<MT5> )
6189  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6190  :( K ) );
6191 
6192  ElementType value1{};
6193  ElementType value2{};
6194 
6195  for( size_t k=kbegin; k<kend; ++k ) {
6196  value1 += A(i,k) * B(k,j );
6197  value2 += A(i,k) * B(k,j+1UL);
6198  }
6199 
6200  C(i,j ) = value1 * scalar;
6201  C(i,j+1UL) = value2 * scalar;
6202  }
6203 
6204  if( j < N )
6205  {
6206  const size_t kbegin( ( IsLower_v<MT5> )
6207  ?( ( IsUpper_v<MT4> )
6208  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6209  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6210  :( IsUpper_v<MT4> ? i : 0UL ) );
6211 
6212  ElementType value{};
6213 
6214  for( size_t k=kbegin; k<K; ++k ) {
6215  value += A(i,k) * B(k,j);
6216  }
6217 
6218  C(i,j) = value * scalar;
6219  }
6220  }
6221  }
6222  //**********************************************************************************************
6223 
6224  //**Default assignment to dense matrices (large matrices)***************************************
6238  template< typename MT3 // Type of the left-hand side target matrix
6239  , typename MT4 // Type of the left-hand side matrix operand
6240  , typename MT5 // Type of the right-hand side matrix operand
6241  , typename ST2 > // Type of the scalar value
6242  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6243  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6244  {
6245  selectDefaultAssignKernel( C, A, B, scalar );
6246  }
6247  //**********************************************************************************************
6248 
6249  //**Vectorized default assignment to dense matrices (large matrices)****************************
6264  template< typename MT3 // Type of the left-hand side target matrix
6265  , typename MT4 // Type of the left-hand side matrix operand
6266  , typename MT5 // Type of the right-hand side matrix operand
6267  , typename ST2 > // Type of the scalar value
6268  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6269  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6270  {
6271  if( SYM )
6272  smmm( C, A, B, scalar );
6273  else if( HERM )
6274  hmmm( C, A, B, scalar );
6275  else if( LOW )
6276  lmmm( C, A, B, scalar, ST2(0) );
6277  else if( UPP )
6278  ummm( C, A, B, scalar, ST2(0) );
6279  else
6280  mmm( C, A, B, scalar, ST2(0) );
6281  }
6282  //**********************************************************************************************
6283 
6284  //**BLAS-based assignment to dense matrices (default)*******************************************
6298  template< typename MT3 // Type of the left-hand side target matrix
6299  , typename MT4 // Type of the left-hand side matrix operand
6300  , typename MT5 // Type of the right-hand side matrix operand
6301  , typename ST2 > // Type of the scalar value
6302  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6303  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6304  {
6305  selectLargeAssignKernel( C, A, B, scalar );
6306  }
6307  //**********************************************************************************************
6308 
6309  //**BLAS-based assignment to dense matrices*****************************************************
6310 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6311 
6324  template< typename MT3 // Type of the left-hand side target matrix
6325  , typename MT4 // Type of the left-hand side matrix operand
6326  , typename MT5 // Type of the right-hand side matrix operand
6327  , typename ST2 > // Type of the scalar value
6328  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6329  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6330  {
6331  using ET = ElementType_t<MT3>;
6332 
6333  if( IsTriangular_v<MT4> ) {
6334  assign( C, B );
6335  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6336  }
6337  else if( IsTriangular_v<MT5> ) {
6338  assign( C, A );
6339  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6340  }
6341  else {
6342  gemm( C, A, B, ET(scalar), ET(0) );
6343  }
6344  }
6345 #endif
6346  //**********************************************************************************************
6347 
6348  //**Assignment to sparse matrices***************************************************************
6360  template< typename MT // Type of the target sparse matrix
6361  , bool SO > // Storage order of the target sparse matrix
6362  friend inline auto assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6363  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6364  {
6366 
6367  using TmpType = If_t< SO, ResultType, OppositeType >;
6368 
6375 
6376  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6377  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6378 
6379  const ForwardFunctor fwd;
6380 
6381  const TmpType tmp( serial( rhs ) );
6382  assign( ~lhs, fwd( tmp ) );
6383  }
6384  //**********************************************************************************************
6385 
6386  //**Restructuring assignment to row-major matrices**********************************************
6400  template< typename MT > // Type of the target matrix
6401  friend inline auto assign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
6402  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6403  {
6405 
6407 
6408  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6409  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6410 
6411  const ForwardFunctor fwd;
6412 
6413  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6414  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6415 
6416  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
6417  assign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
6418  else if( IsSymmetric_v<MT1> )
6419  assign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
6420  else
6421  assign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
6422  }
6423  //**********************************************************************************************
6424 
6425  //**Addition assignment to dense matrices*******************************************************
6437  template< typename MT // Type of the target dense matrix
6438  , bool SO > // Storage order of the target dense matrix
6439  friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6440  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6441  {
6443 
6444  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6445  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6446 
6447  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6448  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6449 
6450  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6451  return;
6452  }
6453 
6454  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6455  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6456 
6457  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6458  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6459  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6460  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6461  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6462  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6463 
6464  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
6465  }
6466  //**********************************************************************************************
6467 
6468  //**Addition assignment to dense matrices (kernel selection)************************************
6479  template< typename MT3 // Type of the left-hand side target matrix
6480  , typename MT4 // Type of the left-hand side matrix operand
6481  , typename MT5 // Type of the right-hand side matrix operand
6482  , typename ST2 > // Type of the scalar value
6483  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6484  {
6485  if( ( IsDiagonal_v<MT4> ) ||
6486  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
6487  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6488  selectSmallAddAssignKernel( C, A, B, scalar );
6489  else
6490  selectBlasAddAssignKernel( C, A, B, scalar );
6491  }
6492  //**********************************************************************************************
6493 
6494  //**Default addition assignment to dense matrices (general/general)*****************************
6508  template< typename MT3 // Type of the left-hand side target matrix
6509  , typename MT4 // Type of the left-hand side matrix operand
6510  , typename MT5 // Type of the right-hand side matrix operand
6511  , typename ST2 > // Type of the scalar value
6512  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6513  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6514  {
6515  const ResultType tmp( serial( A * B * scalar ) );
6516  addAssign( C, tmp );
6517  }
6518  //**********************************************************************************************
6519 
6520  //**Default addition assignment to dense matrices (general/diagonal)****************************
6534  template< typename MT3 // Type of the left-hand side target matrix
6535  , typename MT4 // Type of the left-hand side matrix operand
6536  , typename MT5 // Type of the right-hand side matrix operand
6537  , typename ST2 > // Type of the scalar value
6538  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6539  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6540  {
6542 
6543  const size_t M( A.rows() );
6544  const size_t N( B.columns() );
6545 
6546  for( size_t j=0UL; j<N; ++j )
6547  {
6548  const size_t ibegin( ( IsLower_v<MT4> )
6549  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
6550  :( 0UL ) );
6551  const size_t iend( ( IsUpper_v<MT4> )
6552  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
6553  :( M ) );
6554  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6555 
6556  const size_t inum( iend - ibegin );
6557  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6558 
6559  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6560  C(i ,j) += A(i ,j) * B(j,j) * scalar;
6561  C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
6562  }
6563  if( ipos < iend ) {
6564  C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
6565  }
6566  }
6567  }
6568  //**********************************************************************************************
6569 
6570  //**Default addition assignment to dense matrices (diagonal/general)****************************
6584  template< typename MT3 // Type of the left-hand side target matrix
6585  , typename MT4 // Type of the left-hand side matrix operand
6586  , typename MT5 // Type of the right-hand side matrix operand
6587  , typename ST2 > // Type of the scalar value
6588  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6589  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6590  {
6592 
6593  const size_t M( A.rows() );
6594  const size_t N( B.columns() );
6595 
6596  for( size_t j=0UL; j<N; ++j )
6597  {
6598  const size_t ibegin( ( IsLower_v<MT5> )
6599  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
6600  :( 0UL ) );
6601  const size_t iend( ( IsUpper_v<MT5> )
6602  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
6603  :( M ) );
6604  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6605 
6606  const size_t inum( iend - ibegin );
6607  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6608 
6609  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6610  C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6611  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6612  }
6613  if( ipos < iend ) {
6614  C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
6615  }
6616  }
6617  }
6618  //**********************************************************************************************
6619 
6620  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
6634  template< typename MT3 // Type of the left-hand side target matrix
6635  , typename MT4 // Type of the left-hand side matrix operand
6636  , typename MT5 // Type of the right-hand side matrix operand
6637  , typename ST2 > // Type of the scalar value
6638  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6639  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6640  {
6642 
6643  for( size_t i=0UL; i<A.rows(); ++i ) {
6644  C(i,i) += A(i,i) * B(i,i) * scalar;
6645  }
6646  }
6647  //**********************************************************************************************
6648 
6649  //**Default addition assignment to dense matrices (small matrices)******************************
6663  template< typename MT3 // Type of the left-hand side target matrix
6664  , typename MT4 // Type of the left-hand side matrix operand
6665  , typename MT5 // Type of the right-hand side matrix operand
6666  , typename ST2 > // Type of the scalar value
6667  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6668  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6669  {
6670  selectDefaultAddAssignKernel( C, A, B, scalar );
6671  }
6672  //**********************************************************************************************
6673 
6674  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
6689  template< typename MT3 // Type of the left-hand side target matrix
6690  , typename MT4 // Type of the left-hand side matrix operand
6691  , typename MT5 // Type of the right-hand side matrix operand
6692  , typename ST2 > // Type of the scalar value
6693  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6694  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6695  {
6698  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
6699  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
6700 
6701  const ForwardFunctor fwd;
6702 
6703  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
6704  const OppositeType_t<MT5> tmp( serial( B ) );
6705  addAssign( C, fwd( A * tmp ) * scalar );
6706  }
6707  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
6708  const OppositeType_t<MT4> tmp( serial( A ) );
6709  addAssign( C, fwd( tmp * B ) * scalar );
6710  }
6711  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6712  const OppositeType_t<MT5> tmp( serial( B ) );
6713  addAssign( C, fwd( A * tmp ) * scalar );
6714  }
6715  else {
6716  const OppositeType_t<MT4> tmp( serial( A ) );
6717  addAssign( C, fwd( tmp * B ) * scalar );
6718  }
6719  }
6720  //**********************************************************************************************
6721 
6722  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
6737  template< typename MT3 // Type of the left-hand side target matrix
6738  , typename MT4 // Type of the left-hand side matrix operand
6739  , typename MT5 // Type of the right-hand side matrix operand
6740  , typename ST2 > // Type of the scalar value
6741  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6742  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6743  {
6744  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
6745 
6746  const size_t M( A.rows() );
6747  const size_t N( B.columns() );
6748  const size_t K( A.columns() );
6749 
6750  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6751 
6752  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
6753  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
6754 
6755  const SIMDType factor( set( scalar ) );
6756 
6757  size_t i( 0UL );
6758 
6759  if( IsIntegral_v<ElementType> )
6760  {
6761  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6762  for( size_t j=0UL; j<N; ++j )
6763  {
6764  const size_t kbegin( ( IsLower_v<MT5> )
6765  ?( ( IsUpper_v<MT4> )
6766  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6767  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6768  :( IsUpper_v<MT4> ? i : 0UL ) );
6769  const size_t kend( ( IsUpper_v<MT5> )
6770  ?( ( IsLower_v<MT4> )
6771  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
6772  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
6773  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
6774 
6775  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6776 
6777  for( size_t k=kbegin; k<kend; ++k ) {
6778  const SIMDType b1( set( B(k,j) ) );
6779  xmm1 += A.load(i ,k) * b1;
6780  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6781  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6782  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6783  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6784  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
6785  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
6786  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
6787  }
6788 
6789  C.store( i , j, C.load(i ,j) + xmm1 * factor );
6790  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
6791  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6792  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6793  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
6794  C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
6795  C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
6796  C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
6797  }
6798  }
6799  }
6800 
6801  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
6802  {
6803  size_t j( 0UL );
6804 
6805  for( ; (j+2UL) <= N; j+=2UL )
6806  {
6807  const size_t kbegin( ( IsLower_v<MT5> )
6808  ?( ( IsUpper_v<MT4> )
6809  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6810  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6811  :( IsUpper_v<MT4> ? i : 0UL ) );
6812  const size_t kend( ( IsUpper_v<MT5> )
6813  ?( ( IsLower_v<MT4> )
6814  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6815  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6816  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
6817 
6818  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6819 
6820  for( size_t k=kbegin; k<kend; ++k ) {
6821  const SIMDType a1( A.load(i ,k) );
6822  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6823  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6824  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6825  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6826  const SIMDType b1( set( B(k,j ) ) );
6827  const SIMDType b2( set( B(k,j+1UL) ) );
6828  xmm1 += a1 * b1;
6829  xmm2 += a2 * b1;
6830  xmm3 += a3 * b1;
6831  xmm4 += a4 * b1;
6832  xmm5 += a5 * b1;
6833  xmm6 += a1 * b2;
6834  xmm7 += a2 * b2;
6835  xmm8 += a3 * b2;
6836  xmm9 += a4 * b2;
6837  xmm10 += a5 * b2;
6838  }
6839 
6840  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6841  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
6842  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6843  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6844  C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
6845  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
6846  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
6847  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
6848  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
6849  C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
6850  }
6851 
6852  if( j < N )
6853  {
6854  const size_t kbegin( ( IsLower_v<MT5> )
6855  ?( ( IsUpper_v<MT4> )
6856  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6857  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6858  :( IsUpper_v<MT4> ? i : 0UL ) );
6859  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
6860 
6861  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6862 
6863  for( size_t k=kbegin; k<kend; ++k ) {
6864  const SIMDType b1( set( B(k,j) ) );
6865  xmm1 += A.load(i ,k) * b1;
6866  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6867  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6868  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6869  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6870  }
6871 
6872  C.store( i , j, C.load(i ,j) + xmm1 * factor );
6873  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
6874  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6875  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6876  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
6877  }
6878  }
6879 
6880  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6881  {
6882  size_t j( 0UL );
6883 
6884  for( ; (j+2UL) <= N; j+=2UL )
6885  {
6886  const size_t kbegin( ( IsLower_v<MT5> )
6887  ?( ( IsUpper_v<MT4> )
6888  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6889  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6890  :( IsUpper_v<MT4> ? i : 0UL ) );
6891  const size_t kend( ( IsUpper_v<MT5> )
6892  ?( ( IsLower_v<MT4> )
6893  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6894  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6895  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
6896 
6897  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6898 
6899  for( size_t k=kbegin; k<kend; ++k ) {
6900  const SIMDType a1( A.load(i ,k) );
6901  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6902  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6903  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6904  const SIMDType b1( set( B(k,j ) ) );
6905  const SIMDType b2( set( B(k,j+1UL) ) );
6906  xmm1 += a1 * b1;
6907  xmm2 += a2 * b1;
6908  xmm3 += a3 * b1;
6909  xmm4 += a4 * b1;
6910  xmm5 += a1 * b2;
6911  xmm6 += a2 * b2;
6912  xmm7 += a3 * b2;
6913  xmm8 += a4 * b2;
6914  }
6915 
6916  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6917  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
6918  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6919  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6920  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
6921  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
6922  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
6923  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
6924  }
6925 
6926  if( j < N )
6927  {
6928  const size_t kbegin( ( IsLower_v<MT5> )
6929  ?( ( IsUpper_v<MT4> )
6930  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6931  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6932  :( IsUpper_v<MT4> ? i : 0UL ) );
6933  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
6934 
6935  SIMDType xmm1, xmm2, xmm3, xmm4;
6936 
6937  for( size_t k=kbegin; k<kend; ++k ) {
6938  const SIMDType b1( set( B(k,j) ) );
6939  xmm1 += A.load(i ,k) * b1;
6940  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6941  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6942  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6943  }
6944 
6945  C.store( i , j, C.load(i ,j) + xmm1 * factor );
6946  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
6947  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6948  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6949  }
6950  }
6951 
6952  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
6953  {
6954  size_t j( 0UL );
6955 
6956  for( ; (j+2UL) <= N; j+=2UL )
6957  {
6958  const size_t kbegin( ( IsLower_v<MT5> )
6959  ?( ( IsUpper_v<MT4> )
6960  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6961  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6962  :( IsUpper_v<MT4> ? i : 0UL ) );
6963  const size_t kend( ( IsUpper_v<MT5> )
6964  ?( ( IsLower_v<MT4> )
6965  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6966  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6967  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
6968 
6969  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6970 
6971  for( size_t k=kbegin; k<kend; ++k ) {
6972  const SIMDType a1( A.load(i ,k) );
6973  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6974  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6975  const SIMDType b1( set( B(k,j ) ) );
6976  const SIMDType b2( set( B(k,j+1UL) ) );
6977  xmm1 += a1 * b1;
6978  xmm2 += a2 * b1;
6979  xmm3 += a3 * b1;
6980  xmm4 += a1 * b2;
6981  xmm5 += a2 * b2;
6982  xmm6 += a3 * b2;
6983  }
6984 
6985  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6986  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
6987  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6988  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
6989  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
6990  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
6991  }
6992 
6993  if( j < N )
6994  {
6995  const size_t kbegin( ( IsLower_v<MT5> )
6996  ?( ( IsUpper_v<MT4> )
6997  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6998  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6999  :( IsUpper_v<MT4> ? i : 0UL ) );
7000  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
7001 
7002  SIMDType xmm1, xmm2, xmm3;
7003 
7004  for( size_t k=kbegin; k<kend; ++k ) {
7005  const SIMDType b1( set( B(k,j) ) );
7006  xmm1 += A.load(i ,k) * b1;
7007  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7008  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7009  }
7010 
7011  C.store( i , j, C.load(i ,j) + xmm1 * factor );
7012  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
7013  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
7014  }
7015  }
7016 
7017  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7018  {
7019  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
7020  size_t j( UPP ? i : 0UL );
7021 
7022  for( ; (j+4UL) <= jend; j+=4UL )
7023  {
7024  const size_t kbegin( ( IsLower_v<MT5> )
7025  ?( ( IsUpper_v<MT4> )
7026  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7027  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7028  :( IsUpper_v<MT4> ? i : 0UL ) );
7029  const size_t kend( ( IsUpper_v<MT5> )
7030  ?( ( IsLower_v<MT4> )
7031  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
7032  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
7033  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
7034 
7035  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7036 
7037  for( size_t k=kbegin; k<kend; ++k ) {
7038  const SIMDType a1( A.load(i ,k) );
7039  const SIMDType a2( A.load(i+SIMDSIZE,k) );
7040  const SIMDType b1( set( B(k,j ) ) );
7041  const SIMDType b2( set( B(k,j+1UL) ) );
7042  const SIMDType b3( set( B(k,j+2UL) ) );
7043  const SIMDType b4( set( B(k,j+3UL) ) );
7044  xmm1 += a1 * b1;
7045  xmm2 += a2 * b1;
7046  xmm3 += a1 * b2;
7047  xmm4 += a2 * b2;
7048  xmm5 += a1 * b3;
7049  xmm6 += a2 * b3;
7050  xmm7 += a1 * b4;
7051  xmm8 += a2 * b4;
7052  }
7053 
7054  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7055  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
7056  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
7057  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
7058  C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
7059  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
7060  C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
7061  C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
7062  }
7063 
7064  for( ; (j+3UL) <= jend; j+=3UL )
7065  {
7066  const size_t kbegin( ( IsLower_v<MT5> )
7067  ?( ( IsUpper_v<MT4> )
7068  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7069  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7070  :( IsUpper_v<MT4> ? i : 0UL ) );
7071  const size_t kend( ( IsUpper_v<MT5> )
7072  ?( ( IsLower_v<MT4> )
7073  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
7074  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
7075  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
7076 
7077  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7078 
7079  for( size_t k=kbegin; k<kend; ++k ) {
7080  const SIMDType a1( A.load(i ,k) );
7081  const SIMDType a2( A.load(i+SIMDSIZE,k) );
7082  const SIMDType b1( set( B(k,j ) ) );
7083  const SIMDType b2( set( B(k,j+1UL) ) );
7084  const SIMDType b3( set( B(k,j+2UL) ) );
7085  xmm1 += a1 * b1;
7086  xmm2 += a2 * b1;
7087  xmm3 += a1 * b2;
7088  xmm4 += a2 * b2;
7089  xmm5 += a1 * b3;
7090  xmm6 += a2 * b3;
7091  }
7092 
7093  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7094  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
7095  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
7096  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
7097  C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
7098  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
7099  }
7100 
7101  for( ; (j+2UL) <= jend; j+=2UL )
7102  {
7103  const size_t kbegin( ( IsLower_v<MT5> )
7104  ?( ( IsUpper_v<MT4> )
7105  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7106  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7107  :( IsUpper_v<MT4> ? i : 0UL ) );
7108  const size_t kend( ( IsUpper_v<MT5> )
7109  ?( ( IsLower_v<MT4> )
7110  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7111  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7112  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
7113 
7114  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7115  size_t k( kbegin );
7116 
7117  for( ; (k+2UL) <= kend; k+=2UL ) {
7118  const SIMDType a1( A.load(i ,k ) );
7119  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
7120  const SIMDType a3( A.load(i ,k+1UL) );
7121  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
7122  const SIMDType b1( set( B(k ,j ) ) );
7123  const SIMDType b2( set( B(k ,j+1UL) ) );
7124  const SIMDType b3( set( B(k+1UL,j ) ) );
7125  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
7126  xmm1 += a1 * b1;
7127  xmm2 += a2 * b1;
7128  xmm3 += a1 * b2;
7129  xmm4 += a2 * b2;
7130  xmm5 += a3 * b3;
7131  xmm6 += a4 * b3;
7132  xmm7 += a3 * b4;
7133  xmm8 += a4 * b4;
7134  }
7135 
7136  for( ; k<kend; ++k ) {
7137  const SIMDType a1( A.load(i ,k) );
7138  const SIMDType a2( A.load(i+SIMDSIZE,k) );
7139  const SIMDType b1( set( B(k,j ) ) );
7140  const SIMDType b2( set( B(k,j+1UL) ) );
7141  xmm1 += a1 * b1;
7142  xmm2 += a2 * b1;
7143  xmm3 += a1 * b2;
7144  xmm4 += a2 * b2;
7145  }
7146 
7147  C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
7148  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
7149  C.store( i , j+1UL, C.load(i ,j+1UL) + (xmm3+xmm7) * factor );
7150  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
7151  }
7152 
7153  if( j < jend )
7154  {
7155  const size_t kbegin( ( IsLower_v<MT5> )
7156  ?( ( IsUpper_v<MT4> )
7157  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7158  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7159  :( IsUpper_v<MT4> ? i : 0UL ) );
7160  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
7161 
7162  SIMDType xmm1, xmm2, xmm3, xmm4;
7163  size_t k( kbegin );
7164 
7165  for( ; (k+2UL) <= kend; k+=2UL ) {
7166  const SIMDType b1( set( B(k ,j) ) );
7167  const SIMDType b2( set( B(k+1UL,j) ) );
7168  xmm1 += A.load(i ,k ) * b1;
7169  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
7170  xmm3 += A.load(i ,k+1UL) * b2;
7171  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
7172  }
7173 
7174  for( ; k<kend; ++k ) {
7175  const SIMDType b1( set( B(k,j) ) );
7176  xmm1 += A.load(i ,k) * b1;
7177  xmm2 += A.load(i+SIMDSIZE,k) * b1;
7178  }
7179 
7180  C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
7181  C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
7182  }
7183  }
7184 
7185  for( ; i<ipos; i+=SIMDSIZE )
7186  {
7187  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
7188  size_t j( UPP ? i : 0UL );
7189 
7190  for( ; (j+4UL) <= jend; j+=4UL )
7191  {
7192  const size_t kbegin( ( IsLower_v<MT5> )
7193  ?( ( IsUpper_v<MT4> )
7194  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7195  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7196  :( IsUpper_v<MT4> ? i : 0UL ) );
7197  const size_t kend( ( IsUpper_v<MT5> )
7198  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
7199  :( K ) );
7200 
7201  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7202  size_t k( kbegin );
7203 
7204  for( ; (k+2UL) <= kend; k+=2UL ) {
7205  const SIMDType a1( A.load(i,k ) );
7206  const SIMDType a2( A.load(i,k+1UL) );
7207  xmm1 += a1 * set( B(k ,j ) );
7208  xmm2 += a1 * set( B(k ,j+1UL) );
7209  xmm3 += a1 * set( B(k ,j+2UL) );
7210  xmm4 += a1 * set( B(k ,j+3UL) );
7211  xmm5 += a2 * set( B(k+1UL,j ) );
7212  xmm6 += a2 * set( B(k+1UL,j+1UL) );
7213  xmm7 += a2 * set( B(k+1UL,j+2UL) );
7214  xmm8 += a2 * set( B(k+1UL,j+3UL) );
7215  }
7216 
7217  for( ; k<kend; ++k ) {
7218  const SIMDType a1( A.load(i,k) );
7219  xmm1 += a1 * set( B(k,j ) );
7220  xmm2 += a1 * set( B(k,j+1UL) );
7221  xmm3 += a1 * set( B(k,j+2UL) );
7222  xmm4 += a1 * set( B(k,j+3UL) );
7223  }
7224 
7225  C.store( i, j , C.load(i,j ) + (xmm1+xmm5) * factor );
7226  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm6) * factor );
7227  C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm7) * factor );
7228  C.store( i, j+3UL, C.load(i,j+3UL) + (xmm4+xmm8) * factor );
7229  }
7230 
7231  for( ; (j+3UL) <= jend; j+=3UL )
7232  {
7233  const size_t kbegin( ( IsLower_v<MT5> )
7234  ?( ( IsUpper_v<MT4> )
7235  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7236  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7237  :( IsUpper_v<MT4> ? i : 0UL ) );
7238  const size_t kend( ( IsUpper_v<MT5> )
7239  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
7240  :( K ) );
7241 
7242  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7243  size_t k( kbegin );
7244 
7245  for( ; (k+2UL) <= kend; k+=2UL ) {
7246  const SIMDType a1( A.load(i,k ) );
7247  const SIMDType a2( A.load(i,k+1UL) );
7248  xmm1 += a1 * set( B(k ,j ) );
7249  xmm2 += a1 * set( B(k ,j+1UL) );
7250  xmm3 += a1 * set( B(k ,j+2UL) );
7251  xmm4 += a2 * set( B(k+1UL,j ) );
7252  xmm5 += a2 * set( B(k+1UL,j+1UL) );
7253  xmm6 += a2 * set( B(k+1UL,j+2UL) );
7254  }
7255 
7256  for( ; k<kend; ++k ) {
7257  const SIMDType a1( A.load(i,k) );
7258  xmm1 += a1 * set( B(k,j ) );
7259  xmm2 += a1 * set( B(k,j+1UL) );
7260  xmm3 += a1 * set( B(k,j+2UL) );
7261  }
7262 
7263  C.store( i, j , C.load(i,j ) + (xmm1+xmm4) * factor );
7264  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm5) * factor );
7265  C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm6) * factor );
7266  }
7267 
7268  for( ; (j+2UL) <= jend; j+=2UL )
7269  {
7270  const size_t kbegin( ( IsLower_v<MT5> )
7271  ?( ( IsUpper_v<MT4> )
7272  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7273  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7274  :( IsUpper_v<MT4> ? i : 0UL ) );
7275  const size_t kend( ( IsUpper_v<MT5> )
7276  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7277  :( K ) );
7278 
7279  SIMDType xmm1, xmm2, xmm3, xmm4;
7280  size_t k( kbegin );
7281 
7282  for( ; (k+2UL) <= kend; k+=2UL ) {
7283  const SIMDType a1( A.load(i,k ) );
7284  const SIMDType a2( A.load(i,k+1UL) );
7285  xmm1 += a1 * set( B(k ,j ) );
7286  xmm2 += a1 * set( B(k ,j+1UL) );
7287  xmm3 += a2 * set( B(k+1UL,j ) );
7288  xmm4 += a2 * set( B(k+1UL,j+1UL) );
7289  }
7290 
7291  for( ; k<kend; ++k ) {
7292  const SIMDType a1( A.load(i,k) );
7293  xmm1 += a1 * set( B(k,j ) );
7294  xmm2 += a1 * set( B(k,j+1UL) );
7295  }
7296 
7297  C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
7298  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm4) * factor );
7299  }
7300 
7301  if( j < jend )
7302  {
7303  const size_t kbegin( ( IsLower_v<MT5> )
7304  ?( ( IsUpper_v<MT4> )
7305  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7306  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7307  :( IsUpper_v<MT4> ? i : 0UL ) );
7308 
7309  SIMDType xmm1, xmm2;
7310  size_t k( kbegin );
7311 
7312  for( ; (k+2UL) <= K; k+=2UL ) {
7313  xmm1 += A.load(i,k ) * set( B(k ,j) );
7314  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
7315  }
7316 
7317  for( ; k<K; ++k ) {
7318  xmm1 += A.load(i,k) * set( B(k,j) );
7319  }
7320 
7321  C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
7322  }
7323  }
7324 
7325  for( ; remainder && i<M; ++i )
7326  {
7327  const size_t jend( LOW ? i+1UL : N );
7328  size_t j( UPP ? i : 0UL );
7329 
7330  for( ; (j+2UL) <= jend; j+=2UL )
7331  {
7332  const size_t kbegin( ( IsLower_v<MT5> )
7333  ?( ( IsUpper_v<MT4> )
7334  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7335  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7336  :( IsUpper_v<MT4> ? i : 0UL ) );
7337  const size_t kend( ( IsUpper_v<MT5> )
7338  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7339  :( K ) );
7340 
7341  ElementType value1{};
7342  ElementType value2{};
7343 
7344  for( size_t k=kbegin; k<kend; ++k ) {
7345  value1 += A(i,k) * B(k,j );
7346  value2 += A(i,k) * B(k,j+1UL);
7347  }
7348 
7349  C(i,j ) += value1 * scalar;
7350  C(i,j+1UL) += value2 * scalar;
7351  }
7352 
7353  if( j < jend )
7354  {
7355  const size_t kbegin( ( IsLower_v<MT5> )
7356  ?( ( IsUpper_v<MT4> )
7357  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7358  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7359  :( IsUpper_v<MT4> ? i : 0UL ) );
7360 
7361  ElementType value{};
7362 
7363  for( size_t k=kbegin; k<K; ++k ) {
7364  value += A(i,k) * B(k,j);
7365  }
7366 
7367  C(i,j) += value * scalar;
7368  }
7369  }
7370  }
7371  //**********************************************************************************************
7372 
7373  //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix addition-assignment kernel, non-vectorized overload.
//
// This overload takes part in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t). It does no work of its
// own and forwards all four arguments unchanged to selectDefaultAddAssignKernel(), which is
// defined outside this excerpt.
//
// C      : target matrix that is updated.
// A      : left-hand side matrix operand.
// B      : right-hand side matrix operand.
// scalar : scaling factor passed through to the default kernel.
7387  template< typename MT3 // Type of the left-hand side target matrix
7388  , typename MT4 // Type of the left-hand side matrix operand
7389  , typename MT5 // Type of the right-hand side matrix operand
7390  , typename ST2 > // Type of the scalar value
7391  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7392  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7393  {
7394  selectDefaultAddAssignKernel( C, A, B, scalar );
7395  }
7396  //**********************************************************************************************
7397 
7398  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Large-matrix addition-assignment kernel, vectorized overload.
//
// Enabled only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true (EnableIf_t).
// Dispatches to one of three multiplication routines depending on the LOW and UPP flags:
//   LOW        -> lmmm()
//   else UPP   -> ummm()
//   otherwise  -> mmm()
// LOW is tested first, so it wins if both flags are set.
//
// NOTE(review): LOW/UPP are defined outside this excerpt (presumably derived from the LF/UF
// template flags of the enclosing expression) - confirm.
// NOTE(review): the two trailing arguments are presumably the factor applied to A*B (scalar)
// and the factor applied to the existing content of C (ST2(1), i.e. accumulate) - confirm
// against blaze/math/dense/MMM.h.
//
// C      : target matrix that is updated.
// A      : left-hand side matrix operand.
// B      : right-hand side matrix operand.
// scalar : scaling factor.
7413  template< typename MT3 // Type of the left-hand side target matrix
7414  , typename MT4 // Type of the left-hand side matrix operand
7415  , typename MT5 // Type of the right-hand side matrix operand
7416  , typename ST2 > // Type of the scalar value
7417  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7418  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7419  {
7420  if( LOW )
7421  lmmm( C, A, B, scalar, ST2(1) );
7422  else if( UPP )
7423  ummm( C, A, B, scalar, ST2(1) );
7424  else
7425  mmm( C, A, B, scalar, ST2(1) );
7426  }
7427  //**********************************************************************************************
7428 
7429  //**BLAS-based addition assignment to dense matrices (default)**********************************
// BLAS addition-assignment kernel, fallback overload.
//
// Selected when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t). In that case no BLAS
// routine is called; the arguments are forwarded unchanged to selectLargeAddAssignKernel().
//
// C      : target matrix that is updated.
// A      : left-hand side matrix operand.
// B      : right-hand side matrix operand.
// scalar : scaling factor.
7444  template< typename MT3 // Type of the left-hand side target matrix
7445  , typename MT4 // Type of the left-hand side matrix operand
7446  , typename MT5 // Type of the right-hand side matrix operand
7447  , typename ST2 > // Type of the scalar value
7448  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7449  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7450  {
7451  selectLargeAddAssignKernel( C, A, B, scalar );
7452  }
7453  //**********************************************************************************************
7454 
7455  //**BLAS-based addition assignment to dense matrices********************************************
7456 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7457 
// BLAS addition-assignment kernel, BLAS-backed overload.
//
// Enabled when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true (EnableIf_t). The surrounding
// preprocessor guard compiles it only if BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both non-zero.
//
// Three cases, tested in this order:
//   1. A's type is triangular: B is copied into a temporary, trmm() is applied with A on the
//      left, and the temporary is added to C.
//   2. B's type is triangular: A is copied into a temporary, trmm() is applied with B on the
//      right, and the temporary is added to C.
//   3. Otherwise: gemm() is called directly on C with factors ET(scalar) and ET(1).
// If both operand types are triangular, case 1 is taken.
//
// NOTE(review): trmm() presumably overwrites its first argument in place with the scaled
// triangular product (as BLAS xTRMM does), which would explain the temporary followed by
// addAssign() - confirm against blaze/math/blas/trmm.h.
//
// C      : target matrix that is updated.
// A      : left-hand side matrix operand.
// B      : right-hand side matrix operand.
// scalar : scaling factor; converted to the element type of C before it is passed on.
7470  template< typename MT3 // Type of the left-hand side target matrix
7471  , typename MT4 // Type of the left-hand side matrix operand
7472  , typename MT5 // Type of the right-hand side matrix operand
7473  , typename ST2 > // Type of the scalar value
7474  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7475  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7476  {
      // Element type of the target; the scalar is converted to this type for the BLAS calls.
7477  using ET = ElementType_t<MT3>;
7478 
7479  if( IsTriangular_v<MT4> ) {
      // tmp starts as a copy of B; the lower/upper selector follows A's type.
7480  ResultType_t<MT3> tmp( serial( B ) );
7481  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7482  addAssign( C, tmp );
7483  }
7484  else if( IsTriangular_v<MT5> ) {
      // tmp starts as a copy of A; the lower/upper selector follows B's type.
7485  ResultType_t<MT3> tmp( serial( A ) );
7486  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7487  addAssign( C, tmp );
7488  }
7489  else {
7490  gemm( C, A, B, ET(scalar), ET(1) );
7491  }
7492  }
7493 #endif
7494  //**********************************************************************************************
7495 
7496  //**Restructuring addition assignment to row-major matrices*************************************
// Restructuring addition assignment of the scaled matrix product to a Matrix<MT,false> target.
//
// Enabled only when CanExploitSymmetry_v<MT,MT1,MT2> is true (EnableIf_t). Instead of
// evaluating the product as written, the operands declared symmetric are wrapped in trans()
// and the rewritten, scaled expression is handed to another addAssign() call:
//   MT1 and MT2 symmetric : trans(left) * trans(right)
//   only MT1 symmetric    : trans(left) * right
//   otherwise             : left * trans(right)
// Transposing a symmetric matrix leaves its values unchanged, so each form denotes the same
// product.
//
// NOTE(review): the final branch transposes the right operand without testing
// IsSymmetric_v<MT2>; presumably CanExploitSymmetry_v guarantees MT2 is symmetric whenever
// MT1 is not - confirm against its definition (outside this excerpt).
//
// lhs : target matrix that is updated.
// rhs : scaled multiplication expression; its operands are read via rhs.matrix_ and its
//       factor via rhs.scalar_.
7511  template< typename MT > // Type of the target matrix
7512  friend inline auto addAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7513  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7514  {
7516 
7518 
7519  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7520  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7521 
      // Functor applied to every rewritten product below (ForwardFunctor is declared elsewhere).
7522  const ForwardFunctor fwd;
7523 
7524  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7525  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7526 
7527  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
7528  addAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
7529  else if( IsSymmetric_v<MT1> )
7530  addAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
7531  else
7532  addAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
7533  }
7534  //**********************************************************************************************
7535 
7536  //**Addition assignment to sparse matrices******************************************************
7537  // No special implementation for the addition assignment to sparse matrices.
7538  //**********************************************************************************************
7539 
7540  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of the scaled matrix product to a dense matrix.
//
// Takes part in overload resolution only when CanExploitSymmetry_v<MT,MT1,MT2> is false
// (DisableIf_t). Steps:
//   1. Assert (internal assertion) that the target has the dimensions of the expression.
//   2. Return without touching the target if the target has no rows or no columns, or if the
//      left operand has no columns (empty inner dimension).
//   3. Evaluate both operands into objects of type LT / RT via serial().
//   4. Dispatch to selectSubAssignKernel() with the target, both evaluated operands and
//      rhs.scalar_.
//
// lhs : target dense matrix that is updated.
// rhs : scaled multiplication expression to subtract.
7552  template< typename MT // Type of the target dense matrix
7553  , bool SO > // Storage order of the target dense matrix
7554  friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7555  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7556  {
7558 
7559  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7560  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7561 
7562  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7563  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7564 
      // Nothing to subtract when any of the three dimensions involved is zero.
7565  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7566  return;
7567  }
7568 
7569  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7570  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7571 
      // The evaluated operands must keep the operand sizes and fit the target.
7572  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7573  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7574  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7575  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7576  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7577  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7578 
7579  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
7580  }
7581  //**********************************************************************************************
7582 
7583  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Chooses the kernel family for the subtraction assignment.
//
// selectSmallSubAssignKernel() is used if any of the following holds:
//   - the left operand type is diagonal (IsDiagonal_v<MT4>),
//   - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//   - the number of elements of C (rows * columns) is below TDMATTDMATMULT_THRESHOLD.
// Otherwise selectBlasSubAssignKernel() is used.
//
// C      : target matrix that is updated.
// A      : left-hand side matrix operand.
// B      : right-hand side matrix operand.
// scalar : scaling factor.
7594  template< typename MT3 // Type of the left-hand side target matrix
7595  , typename MT4 // Type of the left-hand side matrix operand
7596  , typename MT5 // Type of the right-hand side matrix operand
7597  , typename ST2 > // Type of the scalar value
7598  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7599  {
7600  if( ( IsDiagonal_v<MT4> ) ||
7601  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
7602  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
7603  selectSmallSubAssignKernel( C, A, B, scalar );
7604  else
7605  selectBlasSubAssignKernel( C, A, B, scalar );
7606  }
7607  //**********************************************************************************************
7608 
7609  //**Default subtraction assignment to dense matrices (general/general)**************************
// Default subtraction-assignment kernel for the case that neither operand type is diagonal.
//
// Enabled when !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5>. The scaled product A * B * scalar is
// first evaluated (wrapped in serial()) into a temporary of type ResultType, and that
// temporary is then subtracted from C via subAssign().
//
// C      : target matrix that is updated.
// A      : left-hand side matrix operand.
// B      : right-hand side matrix operand.
// scalar : scaling factor.
7623  template< typename MT3 // Type of the left-hand side target matrix
7624  , typename MT4 // Type of the left-hand side matrix operand
7625  , typename MT5 // Type of the right-hand side matrix operand
7626  , typename ST2 > // Type of the scalar value
7627  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7628  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7629  {
7630  const ResultType tmp( serial( A * B * scalar ) );
7631  subAssign( C, tmp );
7632  }
7633  //**********************************************************************************************
7634 
7635  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
// Default subtraction-assignment kernel for a non-diagonal left operand and a diagonal right
// operand (enabled when !IsDiagonal_v<MT4> && IsDiagonal_v<MT5>).
//
// Only the diagonal element B(j,j) of the right operand is read, so for every column j the
// update is C(i,j) -= A(i,j) * B(j,j) * scalar. The rows visited in column j are limited by
// the lower/upper type traits of A; rows outside that range are skipped (presumably because
// they are structurally zero in A).
//
// C      : target matrix that is updated.
// A      : left-hand side matrix operand.
// B      : right-hand side (diagonal) matrix operand.
// scalar : scaling factor.
7649  template< typename MT3 // Type of the left-hand side target matrix
7650  , typename MT4 // Type of the left-hand side matrix operand
7651  , typename MT5 // Type of the right-hand side matrix operand
7652  , typename ST2 > // Type of the scalar value
7653  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7654  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7655  {
7657 
7658  const size_t M( A.rows() );
7659  const size_t N( B.columns() );
7660 
7661  for( size_t j=0UL; j<N; ++j )
7662  {
      // First row of column j: j (or j+1 for strictly lower A) if A is lower, else 0.
7663  const size_t ibegin( ( IsLower_v<MT4> )
7664  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
7665  :( 0UL ) );
      // One past the last row: j+1 (or j for strictly upper A) if A is upper, else M.
7666  const size_t iend( ( IsUpper_v<MT4> )
7667  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
7668  :( M ) );
7669  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7670 
      // inum & size_t(-2) clears the lowest bit, so [ibegin,ipos) has an even number of rows
      // for the loop below, which handles two rows per iteration.
7671  const size_t inum( iend - ibegin );
7672  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7673 
7674  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7675  C(i ,j) -= A(i ,j) * B(j,j) * scalar;
7676  C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
7677  }
      // Remaining single row when the number of rows in the range is odd.
7678  if( ipos < iend ) {
7679  C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
7680  }
7681  }
7682  }
7683  //**********************************************************************************************
7684 
7685  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
// Default subtraction-assignment kernel for a diagonal left operand and a non-diagonal right
// operand (enabled when IsDiagonal_v<MT4> && !IsDiagonal_v<MT5>).
//
// Only the diagonal element A(i,i) of the left operand is read, so the update is
// C(i,j) -= A(i,i) * B(i,j) * scalar. The rows visited in column j are limited by the
// lower/upper type traits of B; rows outside that range are skipped (presumably because they
// are structurally zero in B).
//
// C      : target matrix that is updated.
// A      : left-hand side (diagonal) matrix operand.
// B      : right-hand side matrix operand.
// scalar : scaling factor.
7699  template< typename MT3 // Type of the left-hand side target matrix
7700  , typename MT4 // Type of the left-hand side matrix operand
7701  , typename MT5 // Type of the right-hand side matrix operand
7702  , typename ST2 > // Type of the scalar value
7703  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7704  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7705  {
7707 
7708  const size_t M( A.rows() );
7709  const size_t N( B.columns() );
7710 
7711  for( size_t j=0UL; j<N; ++j )
7712  {
      // First row of column j: j (or j+1 for strictly lower B) if B is lower, else 0.
7713  const size_t ibegin( ( IsLower_v<MT5> )
7714  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7715  :( 0UL ) );
      // One past the last row: j+1 (or j for strictly upper B) if B is upper, else M.
7716  const size_t iend( ( IsUpper_v<MT5> )
7717  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7718  :( M ) );
7719  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7720 
      // inum & size_t(-2) clears the lowest bit, so [ibegin,ipos) has an even number of rows
      // for the loop below, which handles two rows per iteration.
7721  const size_t inum( iend - ibegin );
7722  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7723 
7724  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7725  C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7726  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
7727  }
      // Remaining single row when the number of rows in the range is odd.
7728  if( ipos < iend ) {
7729  C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
7730  }
7731  }
7732  }
7733  //**********************************************************************************************
7734 
7735  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default subtraction-assignment kernel for two diagonal operands
// (enabled when IsDiagonal_v<MT4> && IsDiagonal_v<MT5>).
//
// Only diagonal elements are touched: for every i in [0, A.rows()) the update is
// C(i,i) -= A(i,i) * B(i,i) * scalar. Off-diagonal elements of C are left as they are.
//
// C      : target matrix that is updated.
// A      : left-hand side (diagonal) matrix operand.
// B      : right-hand side (diagonal) matrix operand.
// scalar : scaling factor.
7749  template< typename MT3 // Type of the left-hand side target matrix
7750  , typename MT4 // Type of the left-hand side matrix operand
7751  , typename MT5 // Type of the right-hand side matrix operand
7752  , typename ST2 > // Type of the scalar value
7753  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7754  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7755  {
7757 
7758  for( size_t i=0UL; i<A.rows(); ++i ) {
7759  C(i,i) -= A(i,i) * B(i,i) * scalar;
7760  }
7761  }
7762  //**********************************************************************************************
7763 
7764  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix subtraction-assignment kernel, non-vectorized overload.
//
// Selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t). It
// forwards all arguments unchanged to selectDefaultSubAssignKernel(), whose overload is then
// picked by the diagonal traits of MT4 and MT5.
//
// C      : target matrix that is updated.
// A      : left-hand side matrix operand.
// B      : right-hand side matrix operand.
// scalar : scaling factor.
7778  template< typename MT3 // Type of the left-hand side target matrix
7779  , typename MT4 // Type of the left-hand side matrix operand
7780  , typename MT5 // Type of the right-hand side matrix operand
7781  , typename ST2 > // Type of the scalar value
7782  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7783  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7784  {
7785  selectDefaultSubAssignKernel( C, A, B, scalar );
7786  }
7787  //**********************************************************************************************
7788 
7789  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Small-matrix subtraction-assignment kernel, vectorized overload for row-major targets
// (enabled when IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>).
//
// Exactly one operand is copied into a temporary of its OppositeType (constrained below to be
// a row-major type); the product of the temporary with the other, unconverted operand is then
// scaled and subtracted from C through subAssign(). The operand to convert is chosen as
// follows:
//   - MT4 resizable, MT5 not : convert B
//   - MT5 resizable, MT4 not : convert A
//   - otherwise              : convert B if it has no more elements than A, else convert A
//
// NOTE(review): the resizability rule presumably avoids creating a temporary of the
// resizable (dynamically allocated) type when the other choice exists - confirm intent.
//
// C      : target matrix that is updated.
// A      : left-hand side matrix operand.
// B      : right-hand side matrix operand.
// scalar : scaling factor.
7804  template< typename MT3 // Type of the left-hand side target matrix
7805  , typename MT4 // Type of the left-hand side matrix operand
7806  , typename MT5 // Type of the right-hand side matrix operand
7807  , typename ST2 > // Type of the scalar value
7808  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7809  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7810  {
7813  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
7814  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
7815 
      // Functor applied to every product below (ForwardFunctor is declared elsewhere).
7816  const ForwardFunctor fwd;
7817 
7818  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7819  const OppositeType_t<MT5> tmp( serial( B ) );
7820  subAssign( C, fwd( A * tmp ) * scalar );
7821  }
7822  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7823  const OppositeType_t<MT4> tmp( serial( A ) );
7824  subAssign( C, fwd( tmp * B ) * scalar );
7825  }
      // Both or neither resizable: convert the operand with fewer elements (B on a tie).
7826  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
7827  const OppositeType_t<MT5> tmp( serial( B ) );
7828  subAssign( C, fwd( A * tmp ) * scalar );
7829  }
7830  else {
7831  const OppositeType_t<MT4> tmp( serial( A ) );
7832  subAssign( C, fwd( tmp * B ) * scalar );
7833  }
7834  }
7835  //**********************************************************************************************
7836 
7837  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
7852  template< typename MT3 // Type of the left-hand side target matrix
7853  , typename MT4 // Type of the left-hand side matrix operand
7854  , typename MT5 // Type of the right-hand side matrix operand
7855  , typename ST2 > // Type of the scalar value
7856  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7857  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7858  {
7859  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
7860 
7861  const size_t M( A.rows() );
7862  const size_t N( B.columns() );
7863  const size_t K( A.columns() );
7864 
7865  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7866 
7867  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
7868  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
7869 
7870  const SIMDType factor( set( scalar ) );
7871 
7872  size_t i( 0UL );
7873 
7874  if( IsIntegral_v<ElementType> )
7875  {
7876  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
7877  for( size_t j=0UL; j<N; ++j )
7878  {
7879  const size_t kbegin( ( IsLower_v<MT5> )
7880  ?( ( IsUpper_v<MT4> )
7881  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7882  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7883  :( IsUpper_v<MT4> ? i : 0UL ) );
7884  const size_t kend( ( IsUpper_v<MT5> )
7885  ?( ( IsLower_v<MT4> )
7886  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
7887  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
7888  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
7889 
7890  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7891 
7892  for( size_t k=kbegin; k<kend; ++k ) {
7893  const SIMDType b1( set( B(k,j) ) );
7894  xmm1 += A.load(i ,k) * b1;
7895  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7896  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7897  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7898  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7899  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
7900  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
7901  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
7902  }
7903 
7904  C.store( i , j, C.load(i ,j) - xmm1 * factor );
7905  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
7906  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7907  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7908  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
7909  C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
7910  C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
7911  C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
7912  }
7913  }
7914  }
7915 
7916  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
7917  {
7918  size_t j( 0UL );
7919 
7920  for( ; (j+2UL) <= N; j+=2UL )
7921  {
7922  const size_t kbegin( ( IsLower_v<MT5> )
7923  ?( ( IsUpper_v<MT4> )
7924  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7925  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7926  :( IsUpper_v<MT4> ? i : 0UL ) );
7927  const size_t kend( ( IsUpper_v<MT5> )
7928  ?( ( IsLower_v<MT4> )
7929  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7930  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7931  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
7932 
7933  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7934 
7935  for( size_t k=kbegin; k<kend; ++k ) {
7936  const SIMDType a1( A.load(i ,k) );
7937  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7938  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7939  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7940  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
7941  const SIMDType b1( set( B(k,j ) ) );
7942  const SIMDType b2( set( B(k,j+1UL) ) );
7943  xmm1 += a1 * b1;
7944  xmm2 += a2 * b1;
7945  xmm3 += a3 * b1;
7946  xmm4 += a4 * b1;
7947  xmm5 += a5 * b1;
7948  xmm6 += a1 * b2;
7949  xmm7 += a2 * b2;
7950  xmm8 += a3 * b2;
7951  xmm9 += a4 * b2;
7952  xmm10 += a5 * b2;
7953  }
7954 
7955  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7956  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
7957  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7958  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7959  C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
7960  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
7961  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
7962  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
7963  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
7964  C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
7965  }
7966 
7967  if( j < N )
7968  {
7969  const size_t kbegin( ( IsLower_v<MT5> )
7970  ?( ( IsUpper_v<MT4> )
7971  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7972  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7973  :( IsUpper_v<MT4> ? i : 0UL ) );
7974  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
7975 
7976  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7977 
7978  for( size_t k=kbegin; k<kend; ++k ) {
7979  const SIMDType b1( set( B(k,j) ) );
7980  xmm1 += A.load(i ,k) * b1;
7981  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7982  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7983  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7984  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7985  }
7986 
7987  C.store( i , j, C.load(i ,j) - xmm1 * factor );
7988  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
7989  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7990  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7991  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
7992  }
7993  }
7994 
7995  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7996  {
7997  size_t j( 0UL );
7998 
7999  for( ; (j+2UL) <= N; j+=2UL )
8000  {
8001  const size_t kbegin( ( IsLower_v<MT5> )
8002  ?( ( IsUpper_v<MT4> )
8003  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8004  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8005  :( IsUpper_v<MT4> ? i : 0UL ) );
8006  const size_t kend( ( IsUpper_v<MT5> )
8007  ?( ( IsLower_v<MT4> )
8008  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8009  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8010  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
8011 
8012  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8013 
8014  for( size_t k=kbegin; k<kend; ++k ) {
8015  const SIMDType a1( A.load(i ,k) );
8016  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8017  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8018  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8019  const SIMDType b1( set( B(k,j ) ) );
8020  const SIMDType b2( set( B(k,j+1UL) ) );
8021  xmm1 += a1 * b1;
8022  xmm2 += a2 * b1;
8023  xmm3 += a3 * b1;
8024  xmm4 += a4 * b1;
8025  xmm5 += a1 * b2;
8026  xmm6 += a2 * b2;
8027  xmm7 += a3 * b2;
8028  xmm8 += a4 * b2;
8029  }
8030 
8031  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8032  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
8033  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
8034  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
8035  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
8036  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
8037  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
8038  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
8039  }
8040 
8041  if( j < N )
8042  {
8043  const size_t kbegin( ( IsLower_v<MT5> )
8044  ?( ( IsUpper_v<MT4> )
8045  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8046  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8047  :( IsUpper_v<MT4> ? i : 0UL ) );
8048  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
8049 
8050  SIMDType xmm1, xmm2, xmm3, xmm4;
8051 
8052  for( size_t k=kbegin; k<kend; ++k ) {
8053  const SIMDType b1( set( B(k,j) ) );
8054  xmm1 += A.load(i ,k) * b1;
8055  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8056  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8057  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8058  }
8059 
8060  C.store( i , j, C.load(i ,j) - xmm1 * factor );
8061  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
8062  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
8063  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
8064  }
8065  }
8066 
8067  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
8068  {
8069  size_t j( 0UL );
8070 
8071  for( ; (j+2UL) <= N; j+=2UL )
8072  {
8073  const size_t kbegin( ( IsLower_v<MT5> )
8074  ?( ( IsUpper_v<MT4> )
8075  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8076  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8077  :( IsUpper_v<MT4> ? i : 0UL ) );
8078  const size_t kend( ( IsUpper_v<MT5> )
8079  ?( ( IsLower_v<MT4> )
8080  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8081  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8082  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
8083 
8084  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8085 
8086  for( size_t k=kbegin; k<kend; ++k ) {
8087  const SIMDType a1( A.load(i ,k) );
8088  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8089  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8090  const SIMDType b1( set( B(k,j ) ) );
8091  const SIMDType b2( set( B(k,j+1UL) ) );
8092  xmm1 += a1 * b1;
8093  xmm2 += a2 * b1;
8094  xmm3 += a3 * b1;
8095  xmm4 += a1 * b2;
8096  xmm5 += a2 * b2;
8097  xmm6 += a3 * b2;
8098  }
8099 
8100  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8101  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
8102  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
8103  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
8104  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
8105  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
8106  }
8107 
8108  if( j < N )
8109  {
8110  const size_t kbegin( ( IsLower_v<MT5> )
8111  ?( ( IsUpper_v<MT4> )
8112  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8113  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8114  :( IsUpper_v<MT4> ? i : 0UL ) );
8115  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
8116 
8117  SIMDType xmm1, xmm2, xmm3;
8118 
8119  for( size_t k=kbegin; k<kend; ++k ) {
8120  const SIMDType b1( set( B(k,j) ) );
8121  xmm1 += A.load(i ,k) * b1;
8122  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8123  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8124  }
8125 
8126  C.store( i , j, C.load(i ,j) - xmm1 * factor );
8127  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
8128  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
8129  }
8130  }
8131 
8132  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
8133  {
8134  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
8135  size_t j( UPP ? i : 0UL );
8136 
8137  for( ; (j+4UL) <= jend; j+=4UL )
8138  {
8139  const size_t kbegin( ( IsLower_v<MT5> )
8140  ?( ( IsUpper_v<MT4> )
8141  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8142  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8143  :( IsUpper_v<MT4> ? i : 0UL ) );
8144  const size_t kend( ( IsUpper_v<MT5> )
8145  ?( ( IsLower_v<MT4> )
8146  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
8147  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
8148  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8149 
8150  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8151 
8152  for( size_t k=kbegin; k<kend; ++k ) {
8153  const SIMDType a1( A.load(i ,k) );
8154  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8155  const SIMDType b1( set( B(k,j ) ) );
8156  const SIMDType b2( set( B(k,j+1UL) ) );
8157  const SIMDType b3( set( B(k,j+2UL) ) );
8158  const SIMDType b4( set( B(k,j+3UL) ) );
8159  xmm1 += a1 * b1;
8160  xmm2 += a2 * b1;
8161  xmm3 += a1 * b2;
8162  xmm4 += a2 * b2;
8163  xmm5 += a1 * b3;
8164  xmm6 += a2 * b3;
8165  xmm7 += a1 * b4;
8166  xmm8 += a2 * b4;
8167  }
8168 
8169  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8170  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
8171  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8172  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
8173  C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
8174  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
8175  C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
8176  C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
8177  }
8178 
8179  for( ; (j+3UL) <= jend; j+=3UL )
8180  {
8181  const size_t kbegin( ( IsLower_v<MT5> )
8182  ?( ( IsUpper_v<MT4> )
8183  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8184  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8185  :( IsUpper_v<MT4> ? i : 0UL ) );
8186  const size_t kend( ( IsUpper_v<MT5> )
8187  ?( ( IsLower_v<MT4> )
8188  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
8189  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
8190  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8191 
8192  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8193 
8194  for( size_t k=kbegin; k<kend; ++k ) {
8195  const SIMDType a1( A.load(i ,k) );
8196  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8197  const SIMDType b1( set( B(k,j ) ) );
8198  const SIMDType b2( set( B(k,j+1UL) ) );
8199  const SIMDType b3( set( B(k,j+2UL) ) );
8200  xmm1 += a1 * b1;
8201  xmm2 += a2 * b1;
8202  xmm3 += a1 * b2;
8203  xmm4 += a2 * b2;
8204  xmm5 += a1 * b3;
8205  xmm6 += a2 * b3;
8206  }
8207 
8208  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8209  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
8210  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8211  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
8212  C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
8213  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
8214  }
8215 
8216  for( ; (j+2UL) <= jend; j+=2UL )
8217  {
8218  const size_t kbegin( ( IsLower_v<MT5> )
8219  ?( ( IsUpper_v<MT4> )
8220  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8221  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8222  :( IsUpper_v<MT4> ? i : 0UL ) );
8223  const size_t kend( ( IsUpper_v<MT5> )
8224  ?( ( IsLower_v<MT4> )
8225  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8226  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8227  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8228 
8229  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8230  size_t k( kbegin );
8231 
8232  for( ; (k+2UL) <= kend; k+=2UL ) {
8233  const SIMDType a1( A.load(i ,k ) );
8234  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8235  const SIMDType a3( A.load(i ,k+1UL) );
8236  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8237  const SIMDType b1( set( B(k ,j ) ) );
8238  const SIMDType b2( set( B(k ,j+1UL) ) );
8239  const SIMDType b3( set( B(k+1UL,j ) ) );
8240  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
8241  xmm1 += a1 * b1;
8242  xmm2 += a2 * b1;
8243  xmm3 += a1 * b2;
8244  xmm4 += a2 * b2;
8245  xmm5 += a3 * b3;
8246  xmm6 += a4 * b3;
8247  xmm7 += a3 * b4;
8248  xmm8 += a4 * b4;
8249  }
8250 
8251  for( ; k<kend; ++k ) {
8252  const SIMDType a1( A.load(i ,k) );
8253  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8254  const SIMDType b1( set( B(k,j ) ) );
8255  const SIMDType b2( set( B(k,j+1UL) ) );
8256  xmm1 += a1 * b1;
8257  xmm2 += a2 * b1;
8258  xmm3 += a1 * b2;
8259  xmm4 += a2 * b2;
8260  }
8261 
8262  C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
8263  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
8264  C.store( i , j+1UL, C.load(i ,j+1UL) - (xmm3+xmm7) * factor );
8265  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
8266  }
8267 
8268  if( j < jend )
8269  {
8270  const size_t kbegin( ( IsLower_v<MT5> )
8271  ?( ( IsUpper_v<MT4> )
8272  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8273  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8274  :( IsUpper_v<MT4> ? i : 0UL ) );
8275  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
8276 
8277  SIMDType xmm1, xmm2, xmm3, xmm4;
8278  size_t k( kbegin );
8279 
8280  for( ; (k+2UL) <= kend; k+=2UL ) {
8281  const SIMDType b1( set( B(k ,j) ) );
8282  const SIMDType b2( set( B(k+1UL,j) ) );
8283  xmm1 += A.load(i ,k ) * b1;
8284  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8285  xmm3 += A.load(i ,k+1UL) * b2;
8286  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8287  }
8288 
8289  for( ; k<kend; ++k ) {
8290  const SIMDType b1( set( B(k,j) ) );
8291  xmm1 += A.load(i ,k) * b1;
8292  xmm2 += A.load(i+SIMDSIZE,k) * b1;
8293  }
8294 
8295  C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
8296  C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
8297  }
8298  }
8299 
8300  for( ; i<ipos; i+=SIMDSIZE )
8301  {
8302  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
8303  size_t j( UPP ? i : 0UL );
8304 
8305  for( ; (j+4UL) <= jend; j+=4UL )
8306  {
8307  const size_t kbegin( ( IsLower_v<MT5> )
8308  ?( ( IsUpper_v<MT4> )
8309  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8310  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8311  :( IsUpper_v<MT4> ? i : 0UL ) );
8312  const size_t kend( ( IsUpper_v<MT5> )
8313  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
8314  :( K ) );
8315 
8316  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8317  size_t k( kbegin );
8318 
8319  for( ; (k+2UL) <= kend; k+=2UL ) {
8320  const SIMDType a1( A.load(i,k ) );
8321  const SIMDType a2( A.load(i,k+1UL) );
8322  xmm1 += a1 * set( B(k ,j ) );
8323  xmm2 += a1 * set( B(k ,j+1UL) );
8324  xmm3 += a1 * set( B(k ,j+2UL) );
8325  xmm4 += a1 * set( B(k ,j+3UL) );
8326  xmm5 += a2 * set( B(k+1UL,j ) );
8327  xmm6 += a2 * set( B(k+1UL,j+1UL) );
8328  xmm7 += a2 * set( B(k+1UL,j+2UL) );
8329  xmm8 += a2 * set( B(k+1UL,j+3UL) );
8330  }
8331 
8332  for( ; k<kend; ++k ) {
8333  const SIMDType a1( A.load(i,k) );
8334  xmm1 += a1 * set( B(k,j ) );
8335  xmm2 += a1 * set( B(k,j+1UL) );
8336  xmm3 += a1 * set( B(k,j+2UL) );
8337  xmm4 += a1 * set( B(k,j+3UL) );
8338  }
8339 
8340  C.store( i, j , C.load(i,j ) - (xmm1+xmm5) * factor );
8341  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm6) * factor );
8342  C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm7) * factor );
8343  C.store( i, j+3UL, C.load(i,j+3UL) - (xmm4+xmm8) * factor );
8344  }
8345 
8346  for( ; (j+3UL) <= jend; j+=3UL )
8347  {
8348  const size_t kbegin( ( IsLower_v<MT5> )
8349  ?( ( IsUpper_v<MT4> )
8350  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8351  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8352  :( IsUpper_v<MT4> ? i : 0UL ) );
8353  const size_t kend( ( IsUpper_v<MT5> )
8354  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
8355  :( K ) );
8356 
8357  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8358  size_t k( kbegin );
8359 
8360  for( ; (k+2UL) <= kend; k+=2UL ) {
8361  const SIMDType a1( A.load(i,k ) );
8362  const SIMDType a2( A.load(i,k+1UL) );
8363  xmm1 += a1 * set( B(k ,j ) );
8364  xmm2 += a1 * set( B(k ,j+1UL) );
8365  xmm3 += a1 * set( B(k ,j+2UL) );
8366  xmm4 += a2 * set( B(k+1UL,j ) );
8367  xmm5 += a2 * set( B(k+1UL,j+1UL) );
8368  xmm6 += a2 * set( B(k+1UL,j+2UL) );
8369  }
8370 
8371  for( ; k<kend; ++k ) {
8372  const SIMDType a1( A.load(i,k) );
8373  xmm1 += a1 * set( B(k,j ) );
8374  xmm2 += a1 * set( B(k,j+1UL) );
8375  xmm3 += a1 * set( B(k,j+2UL) );
8376  }
8377 
8378  C.store( i, j , C.load(i,j ) - (xmm1+xmm4) * factor );
8379  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm5) * factor );
8380  C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm6) * factor );
8381  }
8382 
8383  for( ; (j+2UL) <= jend; j+=2UL )
8384  {
8385  const size_t kbegin( ( IsLower_v<MT5> )
8386  ?( ( IsUpper_v<MT4> )
8387  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8388  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8389  :( IsUpper_v<MT4> ? i : 0UL ) );
8390  const size_t kend( ( IsUpper_v<MT5> )
8391  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
8392  :( K ) );
8393 
8394  SIMDType xmm1, xmm2, xmm3, xmm4;
8395  size_t k( kbegin );
8396 
8397  for( ; (k+2UL) <= kend; k+=2UL ) {
8398  const SIMDType a1( A.load(i,k ) );
8399  const SIMDType a2( A.load(i,k+1UL) );
8400  xmm1 += a1 * set( B(k ,j ) );
8401  xmm2 += a1 * set( B(k ,j+1UL) );
8402  xmm3 += a2 * set( B(k+1UL,j ) );
8403  xmm4 += a2 * set( B(k+1UL,j+1UL) );
8404  }
8405 
8406  for( ; k<kend; ++k ) {
8407  const SIMDType a1( A.load(i,k) );
8408  xmm1 += a1 * set( B(k,j ) );
8409  xmm2 += a1 * set( B(k,j+1UL) );
8410  }
8411 
8412  C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
8413  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm4) * factor );
8414  }
8415 
8416  if( j < jend )
8417  {
8418  const size_t kbegin( ( IsLower_v<MT5> )
8419  ?( ( IsUpper_v<MT4> )
8420  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8421  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8422  :( IsUpper_v<MT4> ? i : 0UL ) );
8423 
8424  SIMDType xmm1, xmm2;
8425  size_t k( kbegin );
8426 
8427  for( ; (k+2UL) <= K; k+=2UL ) {
8428  xmm1 += A.load(i,k ) * set( B(k ,j) );
8429  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
8430  }
8431 
8432  for( ; k<K; ++k ) {
8433  xmm1 += A.load(i,k) * set( B(k,j) );
8434  }
8435 
8436  C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
8437  }
8438  }
8439 
8440  for( ; remainder && i<M; ++i )
8441  {
8442  const size_t jend( LOW ? i+1UL : N );
8443  size_t j( UPP ? i : 0UL );
8444 
8445  for( ; (j+2UL) <= jend; j+=2UL )
8446  {
8447  const size_t kbegin( ( IsLower_v<MT5> )
8448  ?( ( IsUpper_v<MT4> )
8449  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8450  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8451  :( IsUpper_v<MT4> ? i : 0UL ) );
8452  const size_t kend( ( IsUpper_v<MT5> )
8453  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
8454  :( K ) );
8455 
8456  ElementType value1{};
8457  ElementType value2{};
8458 
8459  for( size_t k=kbegin; k<kend; ++k ) {
8460  value1 += A(i,k) * B(k,j );
8461  value2 += A(i,k) * B(k,j+1UL);
8462  }
8463 
8464  C(i,j ) -= value1 * scalar;
8465  C(i,j+1UL) -= value2 * scalar;
8466  }
8467 
8468  if( j < jend )
8469  {
8470  const size_t kbegin( ( IsLower_v<MT5> )
8471  ?( ( IsUpper_v<MT4> )
8472  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8473  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8474  :( IsUpper_v<MT4> ? i : 0UL ) );
8475 
8476  ElementType value{};
8477 
8478  for( size_t k=kbegin; k<K; ++k ) {
8479  value += A(i,k) * B(k,j);
8480  }
8481 
8482  C(i,j) -= value * scalar;
8483  }
8484  }
8485  }
8486  //**********************************************************************************************
8487 
8488  //**Default subtraction assignment to dense matrices (large matrices)***************************
/*!\brief Subtraction-assignment kernel for large matrices (non-vectorized fallback).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload participates in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t). It does no work of its
// own: all arguments are forwarded unchanged to selectDefaultSubAssignKernel().
*/
8502  template< typename MT3 // Type of the left-hand side target matrix
8503  , typename MT4 // Type of the left-hand side matrix operand
8504  , typename MT5 // Type of the right-hand side matrix operand
8505  , typename ST2 > // Type of the scalar value
8506  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8507  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8508  {
8509  selectDefaultSubAssignKernel( C, A, B, scalar );
8510  }
8511  //**********************************************************************************************
8512 
8513  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
/*!\brief Subtraction-assignment kernel for large matrices (vectorized variant).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload participates in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true (EnableIf_t). Depending on the LOW and
// UPP flags (defined outside this excerpt) it calls lmmm(), ummm() or mmm(). LOW is tested
// first, so it takes precedence when both flags are set.
//
// NOTE(review): the negated scalar and the ST2(1) argument presumably make the kernels compute
// C = (-scalar)*A*B + 1*C, i.e. C -= scalar*A*B -- confirm against <blaze/math/dense/MMM.h>.
*/
8528  template< typename MT3 // Type of the left-hand side target matrix
8529  , typename MT4 // Type of the left-hand side matrix operand
8530  , typename MT5 // Type of the right-hand side matrix operand
8531  , typename ST2 > // Type of the scalar value
8532  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8533  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8534  {
8535  if( LOW )
8536  lmmm( C, A, B, -scalar, ST2(1) );
8537  else if( UPP )
8538  ummm( C, A, B, -scalar, ST2(1) );
8539  else
8540  mmm( C, A, B, -scalar, ST2(1) );
8541  }
8542  //**********************************************************************************************
8543 
8544  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
/*!\brief Subtraction-assignment kernel selection when no BLAS kernel is applicable.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload participates in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5,ST2>
// is false (DisableIf_t). It forwards all arguments unchanged to selectLargeSubAssignKernel().
*/
8559  template< typename MT3 // Type of the left-hand side target matrix
8560  , typename MT4 // Type of the left-hand side matrix operand
8561  , typename MT5 // Type of the right-hand side matrix operand
8562  , typename ST2 > // Type of the scalar value
8563  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8564  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8565  {
8566  selectLargeSubAssignKernel( C, A, B, scalar );
8567  }
8568  //**********************************************************************************************
8569 
8570  //**BLAS-based subtraction assignment to dense matrices*****************************************
8571 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
8572 
/*!\brief BLAS-based subtraction assignment of a scaled matrix product.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload participates in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5,ST2>
// is true (EnableIf_t), and it is only compiled inside the surrounding
// BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION guard. Three cases are
// distinguished at compile-time-constant conditions:
//  - A is triangular: B is copied into a temporary, trmm() is applied with A on the left side,
//    and the temporary is subtracted from C.
//  - B is triangular: A is copied into a temporary, trmm() is applied with B on the right side,
//    and the temporary is subtracted from C.
//  - otherwise: gemm() is called directly on C.
*/
8585  template< typename MT3 // Type of the left-hand side target matrix
8586  , typename MT4 // Type of the left-hand side matrix operand
8587  , typename MT5 // Type of the right-hand side matrix operand
8588  , typename ST2 > // Type of the scalar value
8589  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8590  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8591  {
      // The scalar is converted to the element type of the target before it is handed to BLAS.
8592  using ET = ElementType_t<MT3>;
8593 
8594  if( IsTriangular_v<MT4> ) {
      // NOTE(review): trmm presumably overwrites tmp in place with scalar*A*tmp -- confirm
      // against <blaze/math/blas/trmm.h>. The scalar stays positive here because the sign is
      // applied by the subAssign() below.
8595  ResultType_t<MT3> tmp( serial( B ) );
8596  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
8597  subAssign( C, tmp );
8598  }
8599  else if( IsTriangular_v<MT5> ) {
      // Mirror image of the branch above: the triangular operand B is applied from the right.
8600  ResultType_t<MT3> tmp( serial( A ) );
8601  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
8602  subAssign( C, tmp );
8603  }
8604  else {
      // No temporary is used here; the subtraction is expressed through the negated scalar.
      // NOTE(review): the trailing ET(1) is presumably the factor applied to the existing C.
8605  gemm( C, A, B, ET(-scalar), ET(1) );
8606  }
8607  }
8608 #endif
8609  //**********************************************************************************************
8610 
8611  //**Restructuring subtraction assignment to row-major matrices**********************************
/*!\brief Restructuring subtraction assignment of a scaled matrix product to a row-major matrix.
//
// \param lhs The target left-hand side matrix (storage-order flag \c false).
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
//
// This overload participates in overload resolution only when CanExploitSymmetry_v<MT,MT1,MT2>
// is true. It rebuilds the product with the symmetric operand(s) wrapped in trans(), passes the
// new product through the ForwardFunctor (defined outside this excerpt), rescales it with
// rhs.scalar_ and calls subAssign() again with the restructured expression.
*/
8625  template< typename MT > // Type of the target matrix
8626  friend inline auto subAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8627  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8628  {
8630 
8632 
8633  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8634  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8635 
8636  const ForwardFunctor fwd;
8637 
8638  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8639  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8640 
      // Both operands symmetric: transpose both. Only the left one symmetric: transpose the left.
      // Otherwise the right operand is transposed.
      // NOTE(review): the final branch presumably relies on CanExploitSymmetry_v guaranteeing
      // that MT2 is symmetric whenever MT1 is not -- confirm against the trait's definition.
8641  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8642  subAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8643  else if( IsSymmetric_v<MT1> )
8644  subAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8645  else
8646  subAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8647  }
8648  //**********************************************************************************************
8649 
8650  //**Subtraction assignment to sparse matrices***************************************************
8651  // No special implementation for the subtraction assignment to sparse matrices.
8652  //**********************************************************************************************
8653 
8654  //**Schur product assignment to dense matrices**************************************************
/*!\brief Schur product assignment of a scaled matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression.
//
// The expression is first evaluated via serial() into a temporary of type ResultType; the
// temporary is then passed to schurAssign() together with the target.
*/
8666  template< typename MT // Type of the target dense matrix
8667  , bool SO > // Storage order of the target dense matrix
8668  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8669  {
8671 
8675 
8676  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8677  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8678 
8679  const ResultType tmp( serial( rhs ) );
8680  schurAssign( ~lhs, tmp );
8681  }
8682  //**********************************************************************************************
8683 
8684  //**Schur product assignment to sparse matrices*************************************************
8685  // No special implementation for the Schur product assignment to sparse matrices.
8686  //**********************************************************************************************
8687 
8688  //**Multiplication assignment to dense matrices*************************************************
8689  // No special implementation for the multiplication assignment to dense matrices.
8690  //**********************************************************************************************
8691 
8692  //**Multiplication assignment to sparse matrices************************************************
8693  // No special implementation for the multiplication assignment to sparse matrices.
8694  //**********************************************************************************************
8695 
8696  //**SMP assignment to dense matrices************************************************************
/*!\brief SMP assignment of a scaled matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// This overload participates in overload resolution only when IsEvaluationRequired_v<MT,MT1,MT2>
// is true. Both operands of the product are evaluated into local objects of type LT and RT
// (types defined outside this excerpt) before the product is rebuilt from them and handed to
// smpAssign().
*/
8711  template< typename MT // Type of the target dense matrix
8712  , bool SO > // Storage order of the target dense matrix
8713  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8714  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8715  {
8717 
8718  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8719  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8720 
8721  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8722  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8723 
      // An empty target needs no work. A zero inner dimension (left.columns() == 0) means there
      // are no terms to sum, so the target is only reset and no product is evaluated.
8724  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
8725  return;
8726  }
8727  else if( left.columns() == 0UL ) {
8728  reset( ~lhs );
8729  return;
8730  }
8731 
8732  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8733  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8734 
8735  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8736  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8737  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8738  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8739  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8740  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8741 
8742  smpAssign( ~lhs, A * B * rhs.scalar_ );
8743  }
8744  //**********************************************************************************************
8745 
8746  //**SMP assignment to sparse matrices***********************************************************
/*!\brief SMP assignment of a scaled matrix product to a sparse matrix.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// This overload participates in overload resolution only when IsEvaluationRequired_v<MT,MT1,MT2>
// is true. The expression is evaluated into a temporary, which is passed through the
// ForwardFunctor (defined outside this excerpt) and then assigned via smpAssign().
*/
8761  template< typename MT // Type of the target sparse matrix
8762  , bool SO > // Storage order of the target sparse matrix
8763  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8764  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8765  {
8767 
      // The temporary type depends on the storage-order flag of the target: ResultType when SO
      // is true, OppositeType otherwise.
8768  using TmpType = If_t< SO, ResultType, OppositeType >;
8769 
8776 
8777  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8778  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8779 
8780  const ForwardFunctor fwd;
8781 
8782  const TmpType tmp( rhs );
8783  smpAssign( ~lhs, fwd( tmp ) );
8784  }
8785  //**********************************************************************************************
8786 
8787  //**Restructuring SMP assignment to row-major matrices******************************************
/*!\brief Restructuring SMP assignment of a scaled matrix product to a row-major matrix.
//
// \param lhs The target left-hand side matrix (storage-order flag \c false).
// \param rhs The right-hand side scaled multiplication expression to be assigned.
//
// This overload participates in overload resolution only when CanExploitSymmetry_v<MT,MT1,MT2>
// is true. It rebuilds the product with the symmetric operand(s) wrapped in trans(), passes the
// new product through the ForwardFunctor (defined outside this excerpt), rescales it with
// rhs.scalar_ and calls smpAssign() again with the restructured expression.
*/
8801  template< typename MT > // Type of the target matrix
8802  friend inline auto smpAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8803  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8804  {
8806 
8808 
8809  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8810  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8811 
8812  const ForwardFunctor fwd;
8813 
8814  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8815  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8816 
      // Both operands symmetric: transpose both. Only the left one symmetric: transpose the left.
      // Otherwise the right operand is transposed.
      // NOTE(review): the final branch presumably relies on CanExploitSymmetry_v guaranteeing
      // that MT2 is symmetric whenever MT1 is not -- confirm against the trait's definition.
8817  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8818  smpAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8819  else if( IsSymmetric_v<MT1> )
8820  smpAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8821  else
8822  smpAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8823  }
8824  //**********************************************************************************************
8825 
8826  //**SMP addition assignment to dense matrices***************************************************
/*!\brief SMP addition assignment of a scaled matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// This overload participates in overload resolution only when IsEvaluationRequired_v<MT,MT1,MT2>
// is true. Both operands of the product are evaluated into local objects of type LT and RT
// (types defined outside this excerpt) before the product is rebuilt from them and handed to
// smpAddAssign().
*/
8841  template< typename MT // Type of the target dense matrix
8842  , bool SO > // Storage order of the target dense matrix
8843  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8844  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8845  {
8847 
8848  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8849  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8850 
8851  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8852  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8853 
      // Nothing is added when the target is empty or the inner dimension is zero, so the target
      // is left untouched in these cases.
8854  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8855  return;
8856  }
8857 
8858  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8859  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8860 
8861  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8862  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8863  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8864  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8865  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8866  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8867 
8868  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
8869  }
8870  //**********************************************************************************************
8871 
8872  //**Restructuring SMP addition assignment to row-major matrices*********************************
/*!\brief Restructuring SMP addition assignment of a scaled matrix product to a row-major matrix.
//
// \param lhs The target left-hand side matrix (storage-order flag \c false).
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// This overload participates in overload resolution only when CanExploitSymmetry_v<MT,MT1,MT2>
// is true. It rebuilds the product with the symmetric operand(s) wrapped in trans(), passes the
// new product through the ForwardFunctor (defined outside this excerpt), rescales it with
// rhs.scalar_ and calls smpAddAssign() again with the restructured expression.
*/
8887  template< typename MT > // Type of the target matrix
8888  friend inline auto smpAddAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8889  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8890  {
8892 
8894 
8895  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8896  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8897 
8898  const ForwardFunctor fwd;
8899 
8900  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8901  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8902 
      // Both operands symmetric: transpose both. Only the left one symmetric: transpose the left.
      // Otherwise the right operand is transposed.
      // NOTE(review): the final branch presumably relies on CanExploitSymmetry_v guaranteeing
      // that MT2 is symmetric whenever MT1 is not -- confirm against the trait's definition.
8903  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8904  smpAddAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8905  else if( IsSymmetric_v<MT1> )
8906  smpAddAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8907  else
8908  smpAddAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8909  }
8910  //**********************************************************************************************
8911 
8912  //**SMP addition assignment to sparse matrices**************************************************
8913  // No special implementation for the SMP addition assignment to sparse matrices.
8914  //**********************************************************************************************
8915 
8916  //**SMP subtraction assignment to dense matrices************************************************
/*!\brief SMP subtraction assignment of a scaled matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
//
// This overload participates in overload resolution only when IsEvaluationRequired_v<MT,MT1,MT2>
// is true. Both operands of the product are evaluated into local objects of type LT and RT
// (types defined outside this excerpt) before the product is rebuilt from them and handed to
// smpSubAssign().
*/
8931  template< typename MT // Type of the target dense matrix
8932  , bool SO > // Storage order of the target dense matrix
8933  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8934  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8935  {
8937 
8938  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8939  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8940 
8941  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8942  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8943 
      // Nothing is subtracted when the target is empty or the inner dimension is zero, so the
      // target is left untouched in these cases.
8944  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8945  return;
8946  }
8947 
8948  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8949  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8950 
8951  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8952  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8953  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8954  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8955  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8956  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8957 
8958  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
8959  }
8960  //**********************************************************************************************
8961 
8962  //**Restructuring SMP subtraction assignment to row-major matrices******************************
/*!\brief Restructuring SMP subtraction assignment of a scaled matrix product to a row-major matrix.
//
// \param lhs The target left-hand side matrix (storage-order flag \c false).
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
//
// This overload participates in overload resolution only when CanExploitSymmetry_v<MT,MT1,MT2>
// is true. It rebuilds the product with the symmetric operand(s) wrapped in trans(), passes the
// new product through the ForwardFunctor (defined outside this excerpt), rescales it with
// rhs.scalar_ and calls smpSubAssign() again with the restructured expression.
*/
8977  template< typename MT > // Type of the target matrix
8978  friend inline auto smpSubAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8979  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8980  {
8982 
8984 
8985  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8986  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8987 
8988  const ForwardFunctor fwd;
8989 
8990  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8991  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8992 
      // Both operands symmetric: transpose both. Only the left one symmetric: transpose the left.
      // Otherwise the right operand is transposed.
      // NOTE(review): the final branch presumably relies on CanExploitSymmetry_v guaranteeing
      // that MT2 is symmetric whenever MT1 is not -- confirm against the trait's definition.
8993  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8994  smpSubAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8995  else if( IsSymmetric_v<MT1> )
8996  smpSubAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8997  else
8998  smpSubAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8999  }
9000  //**********************************************************************************************
9001 
9002  //**SMP subtraction assignment to sparse matrices***********************************************
9003  // No special implementation for the SMP subtraction assignment to sparse matrices.
9004  //**********************************************************************************************
9005 
9006  //**SMP Schur product assignment to dense matrices**********************************************
/*!\brief SMP Schur product assignment of a scaled matrix product to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression.
//
// The expression is first evaluated into a temporary of type ResultType; the temporary is then
// passed to smpSchurAssign() together with the target. Unlike the non-SMP schurAssign() of this
// class, the temporary is constructed from \a rhs directly, without a serial() wrapper.
*/
9018  template< typename MT // Type of the target dense matrix
9019  , bool SO > // Storage order of the target dense matrix
9020  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9021  {
9023 
9027 
9028  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9029  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9030 
9031  const ResultType tmp( rhs );
9032  smpSchurAssign( ~lhs, tmp );
9033  }
9034  //**********************************************************************************************
9035 
9036  //**SMP Schur product assignment to sparse matrices*********************************************
9037  // No special implementation for the SMP Schur product assignment to sparse matrices.
9038  //**********************************************************************************************
9039 
9040  //**SMP multiplication assignment to dense matrices*********************************************
9041  // No special implementation for the SMP multiplication assignment to dense matrices.
9042  //**********************************************************************************************
9043 
9044  //**SMP multiplication assignment to sparse matrices********************************************
9045  // No special implementation for the SMP multiplication assignment to sparse matrices.
9046  //**********************************************************************************************
9047 
9048  //**Compile time checks*************************************************************************
9057  //**********************************************************************************************
9058 };
9060 //*************************************************************************************************
9061 
9062 
9063 
9064 
9065 //=================================================================================================
9066 //
9067 // GLOBAL BINARY ARITHMETIC OPERATORS
9068 //
9069 //=================================================================================================
9070 
9071 //*************************************************************************************************
/*!\brief Multiplication operator for two dense matrices with storage-order flag \c true.
//
// \param lhs The left-hand side matrix for the multiplication.
// \param rhs The right-hand side matrix for the multiplication.
// \return A TDMatTDMatMultExpr expression object wrapping both operands.
// \exception std::invalid_argument Matrix sizes do not match.
//
// The operator only checks that the number of columns of \a lhs equals the number of rows of
// \a rhs and then builds the expression object with all four flags (SF, HF, LF, UF) set to
// \c false. No multiplication is carried out in this function itself.
*/
9098 template< typename MT1 // Type of the left-hand side dense matrix
9099  , typename MT2 > // Type of the right-hand side dense matrix
9100 inline decltype(auto)
9101  operator*( const DenseMatrix<MT1,true>& lhs, const DenseMatrix<MT2,true>& rhs )
9102 {
9104 
9105  if( (~lhs).columns() != (~rhs).rows() ) {
9106  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
9107  }
9108 
9109  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,false,false,false,false>;
9110  return ReturnType( ~lhs, ~rhs );
9111 }
9112 //*************************************************************************************************
9113 
9114 
9115 
9116 
9117 //=================================================================================================
9118 //
9119 // GLOBAL FUNCTIONS
9120 //
9121 //=================================================================================================
9122 
9123 //*************************************************************************************************
/*!\brief Declares the given matrix multiplication expression as symmetric.
//
// \param dm The input matrix multiplication expression.
// \return A new multiplication expression over the same operands with the symmetry flag set.
// \exception std::invalid_argument Invalid symmetric matrix specification.
//
// The function throws if \a dm is not square. Otherwise it rebuilds the expression from the
// operands of \a dm with SF forced to \c true; the HF, LF and UF flags are carried over.
*/
9146 template< typename MT1 // Type of the left-hand side dense matrix
9147  , typename MT2 // Type of the right-hand side dense matrix
9148  , bool SF // Symmetry flag
9149  , bool HF // Hermitian flag
9150  , bool LF // Lower flag
9151  , bool UF > // Upper flag
9152 inline decltype(auto) declsym( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9153 {
9155 
9156  if( !isSquare( dm ) ) {
9157  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
9158  }
9159 
9160  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
9161  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9162 }
9164 //*************************************************************************************************
9165 
9166 
9167 //*************************************************************************************************
/*!\brief Declares the given matrix multiplication expression as Hermitian.
//
// \param dm The input matrix multiplication expression.
// \return A new multiplication expression over the same operands with the Hermitian flag set.
// \exception std::invalid_argument Invalid Hermitian matrix specification.
//
// The function throws if \a dm is not square. Otherwise it rebuilds the expression from the
// operands of \a dm with HF forced to \c true; the SF, LF and UF flags are carried over.
*/
9190 template< typename MT1 // Type of the left-hand side dense matrix
9191  , typename MT2 // Type of the right-hand side dense matrix
9192  , bool SF // Symmetry flag
9193  , bool HF // Hermitian flag
9194  , bool LF // Lower flag
9195  , bool UF > // Upper flag
9196 inline decltype(auto) declherm( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9197 {
9199 
9200  if( !isSquare( dm ) ) {
9201  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
9202  }
9203 
9204  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9205  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9206 }
9208 //*************************************************************************************************
9209 
9210 
9211 //*************************************************************************************************
/*!\brief Declares the given matrix multiplication expression as lower.
//
// \param dm The input matrix multiplication expression.
// \return A new multiplication expression over the same operands with the lower flag set.
// \exception std::invalid_argument Invalid lower matrix specification.
//
// The function throws if \a dm is not square. Otherwise it rebuilds the expression from the
// operands of \a dm with LF forced to \c true; the SF, HF and UF flags are carried over.
*/
9234 template< typename MT1 // Type of the left-hand side dense matrix
9235  , typename MT2 // Type of the right-hand side dense matrix
9236  , bool SF // Symmetry flag
9237  , bool HF // Hermitian flag
9238  , bool LF // Lower flag
9239  , bool UF > // Upper flag
9240 inline decltype(auto) decllow( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9241 {
9243 
9244  if( !isSquare( dm ) ) {
9245  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9246  }
9247 
9248  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9249  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9250 }
9252 //*************************************************************************************************
9253 
9254 
9255 //*************************************************************************************************
/*!\brief Declares the given matrix multiplication expression as upper.
//
// \param dm The input matrix multiplication expression.
// \return A new multiplication expression over the same operands with the upper flag set.
// \exception std::invalid_argument Invalid upper matrix specification.
//
// The function throws if \a dm is not square. Otherwise it rebuilds the expression from the
// operands of \a dm with UF forced to \c true; the SF, HF and LF flags are carried over.
*/
9278 template< typename MT1 // Type of the left-hand side dense matrix
9279  , typename MT2 // Type of the right-hand side dense matrix
9280  , bool SF // Symmetry flag
9281  , bool HF // Hermitian flag
9282  , bool LF // Lower flag
9283  , bool UF > // Upper flag
9284 inline decltype(auto) declupp( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9285 {
9287 
9288  if( !isSquare( dm ) ) {
9289  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9290  }
9291 
9292  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9293  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9294 }
9296 //*************************************************************************************************
9297 
9298 
9299 //*************************************************************************************************
/*!\brief Declares the given matrix multiplication expression as diagonal.
//
// \param dm The input matrix multiplication expression.
// \return A new multiplication expression over the same operands with both the lower and the
//         upper flag set.
// \exception std::invalid_argument Invalid diagonal matrix specification.
//
// The function throws if \a dm is not square. Otherwise it rebuilds the expression from the
// operands of \a dm with LF and UF both forced to \c true; the SF and HF flags are carried over.
*/
9322 template< typename MT1 // Type of the left-hand side dense matrix
9323  , typename MT2 // Type of the right-hand side dense matrix
9324  , bool SF // Symmetry flag
9325  , bool HF // Hermitian flag
9326  , bool LF // Lower flag
9327  , bool UF > // Upper flag
9328 inline decltype(auto) decldiag( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9329 {
9331 
9332  if( !isSquare( dm ) ) {
9333  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
9334  }
9335 
9336  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
9337  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9338 }
9340 //*************************************************************************************************
9341 
9342 
9343 
9344 
9345 //=================================================================================================
9346 //
9347 // SIZE SPECIALIZATIONS
9348 //
9349 //=================================================================================================
9350 
9351 //*************************************************************************************************
// Size specialization for dimension index 0 of a TDMatTDMatMultExpr: the value is inherited
// from Size<MT1,0UL>, i.e. from the left-hand side operand.
// NOTE(review): index 0 presumably denotes the number of rows -- confirm against the Size trait.
9353 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9354 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
9355  : public Size<MT1,0UL>
9356 {};
9357 
// Size specialization for dimension index 1 of a TDMatTDMatMultExpr: the value is inherited
// from Size<MT2,1UL>, i.e. from the right-hand side operand.
// NOTE(review): index 1 presumably denotes the number of columns -- confirm against the Size trait.
9358 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9359 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
9360  : public Size<MT2,1UL>
9361 {};
9363 //*************************************************************************************************
9364 
9365 
9366 
9367 
9368 //=================================================================================================
9369 //
9370 // ISALIGNED SPECIALIZATIONS
9371 //
9372 //=================================================================================================
9373 
9374 //*************************************************************************************************
// IsAligned specialization for TDMatTDMatMultExpr: the expression is reported as aligned only
// if both operand types MT1 and MT2 are aligned (logical AND of IsAligned_v<MT1> and
// IsAligned_v<MT2>). The SF/HF/LF/UF flags do not influence the result.
9376 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9377 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9378  : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
9379 {};
9381 //*************************************************************************************************
9382 
9383 } // namespace blaze
9384 
9385 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:427
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Data type constraint.
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:495
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias template for the If class template. The If_t alias template provides a convenient shor...
Definition: If.h:109
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:483
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatTDMatMultExpr.h:307
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:533
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type,...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:299
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:606
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:595
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:287
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:523
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
Header file for the IsIntegral type trait.
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:329
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:154
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1001
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:597
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Expression object for transpose dense matrix-transpose dense matrix multiplications....
Definition: Forward.h:173
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:432
Header file for the IsBLASCompatible type trait.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:156
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:429
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes....
Definition: DenseMatrix.h:81
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:155
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:285
Header file for the IsComplexDouble type trait.
static constexpr bool UPP
Flag for upper matrices.
Definition: TDMatTDMatMultExpr.h:177
Constraint on the data type.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatTDMatMultExpr.h:320
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:170
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:565
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:439
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:59
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1162
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:463
Header file for the decllow trait.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:168
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1001
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Header file for the IsStrictlyTriangular type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:553
Generic wrapper for the null function.
Definition: Noop.h:60
Header file for the IsTriangular type trait.
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:162
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:158
Header file for the exception macros of the math module.
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1198
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
Constraint on the data type.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:496
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:469
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:293
Header file for the IsSIMDCombinable type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:419
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type,...
Definition: Symmetric.h:79
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:302
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:161
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:283
Header file for the MatScalarMultExpr base class.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:174
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
static constexpr bool LOW
Flag for lower matrices.
Definition: TDMatTDMatMultExpr.h:176
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatTDMatMultExpr.h:314
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:157
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:422
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:159
static constexpr bool SYM
Flag for symmetric matrices.
Definition: TDMatTDMatMultExpr.h:174
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:165
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:393
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Header file for all forward declarations for expression class templates.
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:158
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:59
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:105
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant alias template represents ...
Definition: IntegralConstant.h:110
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:577
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:286
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: TDMatTDMatMultExpr.h:175
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:59
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:765
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode....
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:454
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:441
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:409
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:451
Header file for BLAS general matrix/matrix multiplication functions (gemm)
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:288
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:344
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:59
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:587
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:107
Header file for the IsUpper type trait.
typename DisableIf< Condition, T >::Type DisableIf_t
Auxiliary type for the DisableIf class template. The DisableIf_t alias declaration provides a convenie...
Definition: DisableIf.h:138
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1324
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:473
Generic wrapper for the declsym() function.
Definition: DeclSym.h:59
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:296
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
Header file for the IsResizable type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:543
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:171
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:290
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression,...
Definition: Assert.h:101
Header file for the DeclSym functor.
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:160
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:289
Header file for the IsExpression type trait class.
Header file for the function trace functionality.