TDMatTDMatMultExpr.h
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/dense/MMM.h>
54 #include <blaze/math/Exception.h>
67 #include <blaze/math/shims/Reset.h>
69 #include <blaze/math/SIMD.h>
100 #include <blaze/math/views/Check.h>
101 #include <blaze/system/BLAS.h>
102 #include <blaze/system/Blocking.h>
103 #include <blaze/system/Debugging.h>
105 #include <blaze/system/Thresholds.h>
108 #include <blaze/util/Assert.h>
109 #include <blaze/util/Complex.h>
112 #include <blaze/util/DisableIf.h>
113 #include <blaze/util/EnableIf.h>
116 #include <blaze/util/mpl/If.h>
117 #include <blaze/util/TrueType.h>
118 #include <blaze/util/Types.h>
127 
128 
129 namespace blaze {
130 
131 //=================================================================================================
132 //
133 // CLASS TDMATTDMATMULTEXPR
134 //
135 //=================================================================================================
136 
137 //*************************************************************************************************
144 template< typename MT1 // Type of the left-hand side dense matrix
145  , typename MT2 // Type of the right-hand side dense matrix
146  , bool SF // Symmetry flag
147  , bool HF // Hermitian flag
148  , bool LF // Lower flag
149  , bool UF > // Upper flag
150 class TDMatTDMatMultExpr
151  : public MatMatMultExpr< DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
152  , private Computation
153 {
154  private:
155  //**Type definitions****************************************************************************
156  using RT1 = ResultType_t<MT1>;     //!< Result type of the left-hand side dense matrix expression.
157  using RT2 = ResultType_t<MT2>;     //!< Result type of the right-hand side dense matrix expression.
158  using ET1 = ElementType_t<RT1>;    //!< Element type of the left-hand side dense matrix expression.
159  using ET2 = ElementType_t<RT2>;    //!< Element type of the right-hand side dense matrix expression.
160  using CT1 = CompositeType_t<MT1>;  //!< Composite type of the left-hand side dense matrix expression.
161  using CT2 = CompositeType_t<MT2>;  //!< Composite type of the right-hand side dense matrix expression.
162  //**********************************************************************************************
163 
164  //**********************************************************************************************
166  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
167  //**********************************************************************************************
168 
169  //**********************************************************************************************
171  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
172  //**********************************************************************************************
173 
174  //**********************************************************************************************
175  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
176  static constexpr bool HERM = ( HF && !( LF || UF ) );
177  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
178  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
179  //**********************************************************************************************
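 //
 // Usage sketch (assuming blaze::DynamicMatrix operands and the declsym()/declherm()/
 // decllow()/declupp() adaptors from <blaze/Math.h>) of how these compile-time flags are
 // typically set:
 //
 //    \code
 //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 16UL, 16UL ), B( 16UL, 16UL ), C;
 //
 //    C = declsym ( A * B );  // SF=true  -> SYM : result declared symmetric
 //    C = declherm( A * B );  // HF=true  -> HERM: result declared Hermitian
 //    C = decllow ( A * B );  // LF=true  -> LOW : result declared lower triangular
 //    C = declupp ( A * B );  // UF=true  -> UPP : result declared upper triangular
 //    \endcode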
180 
181  //**********************************************************************************************
183 
188  template< typename T1, typename T2, typename T3 >
189  static constexpr bool CanExploitSymmetry_v =
190  ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
192  //**********************************************************************************************
193 
194  //**********************************************************************************************
196 
200  template< typename T1, typename T2, typename T3 >
201  static constexpr bool IsEvaluationRequired_v =
 202  ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
204  //**********************************************************************************************
205 
206  //**********************************************************************************************
208 
211  template< typename T1, typename T2, typename T3 >
212  static constexpr bool UseBlasKernel_v =
213  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
214  !SYM && !HERM && !LOW && !UPP &&
215  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
216  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
217  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
218  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
219  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
220  IsBLASCompatible_v< ElementType_t<T1> > &&
221  IsBLASCompatible_v< ElementType_t<T2> > &&
222  IsBLASCompatible_v< ElementType_t<T3> > &&
223  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
224  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
226  //**********************************************************************************************
227 
228  //**********************************************************************************************
230 
233  template< typename T1, typename T2, typename T3 >
234  static constexpr bool UseVectorizedDefaultKernel_v =
235  ( useOptimizedKernels &&
236  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
237  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
 238  IsSIMDCombinable_v< ElementType_t<T1>
 239  , ElementType_t<T2>
 240  , ElementType_t<T3> > &&
241  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
242  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
244  //**********************************************************************************************
245 
246  //**********************************************************************************************
248 
251  using ForwardFunctor = If_t< HERM
252  , DeclHerm
253  , If_t< SYM
254  , DeclSym
255  , If_t< LOW
256  , If_t< UPP
257  , DeclDiag
258  , DeclLow >
259  , If_t< UPP
260  , DeclUpp
261  , Noop > > > >;
263  //**********************************************************************************************
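 //
 // Loosely speaking, fwd( x ) re-attaches the declared structure of the product to a
 // restructured or temporary subexpression (sketch, with fwd an instance of ForwardFunctor):
 //
 //    \code
 //    const ForwardFunctor fwd;
 //    assign( C, fwd( A * tmp ) );  // applies declherm(), declsym(), decllow(), declupp(),
 //                                  // decldiag() or nothing, depending on HERM/SYM/LOW/UPP
 //    \endcode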
264 
265  public:
266  //**Type definitions****************************************************************************
269 
272 
 274  using ResultType = typename If_t< HERM
 275  , DeclHermTrait< MultTrait_t<RT1,RT2> >
 276  , If_t< SYM
 277  , DeclSymTrait< MultTrait_t<RT1,RT2> >
 278  , If_t< LOW
 279  , If_t< UPP
 280  , DeclDiagTrait< MultTrait_t<RT1,RT2> >
 281  , DeclLowTrait< MultTrait_t<RT1,RT2> > >
 282  , If_t< UPP
 283  , DeclUppTrait< MultTrait_t<RT1,RT2> >
 284  , MultTrait<RT1,RT2> > > > >::Type;
 285 
 286  using OppositeType = OppositeType_t<ResultType>;
 287  using TransposeType = TransposeType_t<ResultType>;
 288  using ElementType = ElementType_t<ResultType>;
 289  using SIMDType = SIMDTrait_t<ElementType>;
290  using ReturnType = const ElementType;
291  using CompositeType = const ResultType;
292 
294  using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
295 
297  using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
298 
 300  using LT = If_t< evaluateLeft, const RT1, CT1 >;
 301 
 303  using RT = If_t< evaluateRight, const RT2, CT2 >;
304  //**********************************************************************************************
305 
306  //**Compilation flags***************************************************************************
308  static constexpr bool simdEnabled =
309  ( !IsDiagonal_v<MT1> &&
310  MT1::simdEnabled && MT2::simdEnabled &&
311  HasSIMDAdd_v<ET1,ET2> &&
312  HasSIMDMult_v<ET1,ET2> );
313 
 315  static constexpr bool smpAssignable =
 316  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
 317  //**********************************************************************************************
318 
319  //**SIMD properties*****************************************************************************
321  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
322  //**********************************************************************************************
323 
324  //**Constructor*********************************************************************************
330  explicit inline TDMatTDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
331  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
332  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
333  {
334  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
335  }
336  //**********************************************************************************************
337 
338  //**Access operator*****************************************************************************
345  inline ReturnType operator()( size_t i, size_t j ) const {
346  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
347  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
348 
349  if( IsDiagonal_v<MT1> ) {
350  return lhs_(i,i) * rhs_(i,j);
351  }
352  else if( IsDiagonal_v<MT2> ) {
353  return lhs_(i,j) * rhs_(j,j);
354  }
355  else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
356  const size_t begin( ( IsUpper_v<MT1> )
357  ?( ( IsLower_v<MT2> )
358  ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
359  , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
360  :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
361  :( ( IsLower_v<MT2> )
362  ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
363  :( 0UL ) ) );
364  const size_t end( ( IsLower_v<MT1> )
365  ?( ( IsUpper_v<MT2> )
366  ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
367  , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
368  :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
369  :( ( IsUpper_v<MT2> )
370  ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
371  :( lhs_.columns() ) ) );
372 
373  if( begin >= end ) return ElementType();
374 
375  const size_t n( end - begin );
376 
377  return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
378  subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
379  }
380  else {
381  return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
382  }
383  }
384  //**********************************************************************************************
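 //
 // For general (non-diagonal, non-triangular) operands the returned element is the scalar
 // product of row i of the left-hand side and column j of the right-hand side operand.
 // A small numeric sketch (assuming initializer-list construction of blaze::DynamicMatrix):
 //
 //    \code
 //    blaze::DynamicMatrix<double,blaze::columnMajor> A{ { 1.0, 2.0 }, { 3.0, 4.0 } };
 //    blaze::DynamicMatrix<double,blaze::columnMajor> B{ { 5.0, 6.0 }, { 7.0, 8.0 } };
 //
 //    const auto expr = A * B;       // expression object, not yet evaluated
 //    const double e01 = expr(0,1);  // 1*6 + 2*8 = 22
 //    \endcode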
385 
386  //**At function*********************************************************************************
394  inline ReturnType at( size_t i, size_t j ) const {
395  if( i >= lhs_.rows() ) {
396  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
397  }
398  if( j >= rhs_.columns() ) {
399  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
400  }
401  return (*this)(i,j);
402  }
403  //**********************************************************************************************
404 
405  //**Rows function*******************************************************************************
410  inline size_t rows() const noexcept {
411  return lhs_.rows();
412  }
413  //**********************************************************************************************
414 
415  //**Columns function****************************************************************************
420  inline size_t columns() const noexcept {
421  return rhs_.columns();
422  }
423  //**********************************************************************************************
424 
425  //**Left operand access*************************************************************************
430  inline LeftOperand leftOperand() const noexcept {
431  return lhs_;
432  }
433  //**********************************************************************************************
434 
435  //**Right operand access************************************************************************
440  inline RightOperand rightOperand() const noexcept {
441  return rhs_;
442  }
443  //**********************************************************************************************
444 
445  //**********************************************************************************************
451  template< typename T >
452  inline bool canAlias( const T* alias ) const noexcept {
453  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
454  }
455  //**********************************************************************************************
456 
457  //**********************************************************************************************
463  template< typename T >
464  inline bool isAliased( const T* alias ) const noexcept {
465  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
466  }
467  //**********************************************************************************************
468 
469  //**********************************************************************************************
474  inline bool isAligned() const noexcept {
475  return lhs_.isAligned() && rhs_.isAligned();
476  }
477  //**********************************************************************************************
478 
479  //**********************************************************************************************
484  inline bool canSMPAssign() const noexcept {
 485  return ( !BLAZE_BLAS_MODE ||
 486  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
 487  !BLAZE_BLAS_IS_PARALLEL ||
 488  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
489  ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
490  !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
491  }
492  //**********************************************************************************************
493 
494  private:
495  //**Member variables****************************************************************************
 496  LeftOperand  lhs_;  //!< Left-hand side dense matrix of the multiplication expression.
 497  RightOperand rhs_;  //!< Right-hand side dense matrix of the multiplication expression.
 498  //**********************************************************************************************
499 
500  //**Assignment to dense matrices****************************************************************
513  template< typename MT // Type of the target dense matrix
514  , bool SO > // Storage order of the target dense matrix
 515  friend inline auto assign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
 516  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
 517  {
519 
520  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
521  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
522 
523  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
524  return;
525  }
526  else if( rhs.lhs_.columns() == 0UL ) {
527  reset( ~lhs );
528  return;
529  }
530 
531  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
532  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
533 
534  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
535  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
536  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
537  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
538  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
539  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
540 
541  TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
542  }
544  //**********************************************************************************************
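 //
 // This overload is reached (conceptually) whenever the product is assigned to a dense
 // matrix that cannot exploit symmetry of the operands, e.g. (sketch):
 //
 //    \code
 //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 64UL, 64UL ), B( 64UL, 64UL ), C;
 //    // ... initialize A and B ...
 //    C = A * B;  // evaluates the operands (if necessary) and calls selectAssignKernel()
 //    \endcode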
545 
546  //**Assignment to dense matrices (kernel selection)*********************************************
557  template< typename MT3 // Type of the left-hand side target matrix
558  , typename MT4 // Type of the left-hand side matrix operand
559  , typename MT5 > // Type of the right-hand side matrix operand
560  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
561  {
562  if( ( IsDiagonal_v<MT4> ) ||
563  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
564  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
565  selectSmallAssignKernel( C, A, B );
566  else
567  selectBlasAssignKernel( C, A, B );
568  }
570  //**********************************************************************************************
571 
572  //**Default assignment to dense matrices (general/general)**************************************
586  template< typename MT3 // Type of the left-hand side target matrix
587  , typename MT4 // Type of the left-hand side matrix operand
588  , typename MT5 > // Type of the right-hand side matrix operand
589  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
590  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
591  {
592  const size_t M( A.rows() );
593  const size_t N( B.columns() );
594  const size_t K( A.columns() );
595 
596  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
597 
598  for( size_t j=0UL; j<N; ++j )
599  {
600  const size_t kbegin( ( IsLower_v<MT5> )
601  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
602  :( 0UL ) );
603  const size_t kend( ( IsUpper_v<MT5> )
604  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
605  :( K ) );
606  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
607 
608  if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
609  for( size_t i=0UL; i<M; ++i ) {
610  reset( C(i,j) );
611  }
612  continue;
613  }
614 
615  {
616  const size_t ibegin( ( IsLower_v<MT4> )
617  ?( ( IsStrictlyLower_v<MT4> )
618  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
619  :( LOW ? max(j,kbegin) : kbegin ) )
620  :( LOW ? j : 0UL ) );
621  const size_t iend( ( IsUpper_v<MT4> )
622  ?( ( IsStrictlyUpper_v<MT4> )
623  ?( UPP ? min(j+1UL,kbegin) : kbegin )
624  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
625  :( UPP ? j+1UL : M ) );
626 
627  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
628  for( size_t i=0UL; i<ibegin; ++i ) {
629  reset( C(i,j) );
630  }
631  }
632  else if( IsStrictlyLower_v<MT4> ) {
633  reset( C(0UL,j) );
634  }
635  for( size_t i=ibegin; i<iend; ++i ) {
636  C(i,j) = A(i,kbegin) * B(kbegin,j);
637  }
638  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
639  for( size_t i=iend; i<M; ++i ) {
640  reset( C(i,j) );
641  }
642  }
643  else if( IsStrictlyUpper_v<MT4> ) {
644  reset( C(M-1UL,j) );
645  }
646  }
647 
648  for( size_t k=kbegin+1UL; k<kend; ++k )
649  {
650  const size_t ibegin( ( IsLower_v<MT4> )
651  ?( ( IsStrictlyLower_v<MT4> )
652  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
653  :( SYM || HERM || LOW ? max( j, k ) : k ) )
654  :( SYM || HERM || LOW ? j : 0UL ) );
655  const size_t iend( ( IsUpper_v<MT4> )
656  ?( ( IsStrictlyUpper_v<MT4> )
657  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
658  :( UPP ? min(j+1UL,k) : k ) )
659  :( UPP ? j+1UL : M ) );
660 
661  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
662  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
663 
664  for( size_t i=ibegin; i<iend; ++i ) {
665  C(i,j) += A(i,k) * B(k,j);
666  }
667  if( IsUpper_v<MT4> ) {
668  C(iend,j) = A(iend,k) * B(k,j);
669  }
670  }
671  }
672 
673  if( SYM || HERM ) {
674  for( size_t j=1UL; j<N; ++j ) {
675  for( size_t i=0UL; i<j; ++i ) {
676  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
677  }
678  }
679  }
680  }
682  //**********************************************************************************************
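 //
 // Stripped of the triangular and symmetry bookkeeping, the kernel performs the classic
 // column-major friendly update (simplified sketch of the same loop structure, K >= 1):
 //
 //    \code
 //    for( size_t j=0UL; j<N; ++j ) {
 //       for( size_t i=0UL; i<M; ++i )
 //          C(i,j) = A(i,0UL) * B(0UL,j);     // first k-iteration initializes the column
 //       for( size_t k=1UL; k<K; ++k )
 //          for( size_t i=0UL; i<M; ++i )
 //             C(i,j) += A(i,k) * B(k,j);     // remaining k-iterations accumulate
 //    }
 //    \endcode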
683 
684  //**Default assignment to dense matrices (general/diagonal)*************************************
698  template< typename MT3 // Type of the left-hand side target matrix
699  , typename MT4 // Type of the left-hand side matrix operand
700  , typename MT5 > // Type of the right-hand side matrix operand
701  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
702  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
703  {
705 
706  const size_t M( A.rows() );
707  const size_t N( B.columns() );
708 
709  for( size_t j=0UL; j<N; ++j )
710  {
711  const size_t ibegin( ( IsLower_v<MT4> )
712  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
713  :( 0UL ) );
714  const size_t iend( ( IsUpper_v<MT4> )
715  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
716  :( M ) );
717  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
718 
719  if( IsLower_v<MT4> ) {
720  for( size_t i=0UL; i<ibegin; ++i ) {
721  reset( C(i,j) );
722  }
723  }
724  for( size_t i=ibegin; i<iend; ++i ) {
725  C(i,j) = A(i,j) * B(j,j);
726  }
727  if( IsUpper_v<MT4> ) {
728  for( size_t i=iend; i<M; ++i ) {
729  reset( C(i,j) );
730  }
731  }
732  }
733  }
735  //**********************************************************************************************
736 
737  //**Default assignment to dense matrices (diagonal/general)*************************************
751  template< typename MT3 // Type of the left-hand side target matrix
752  , typename MT4 // Type of the left-hand side matrix operand
753  , typename MT5 > // Type of the right-hand side matrix operand
754  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
755  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
756  {
758 
759  const size_t M( A.rows() );
760  const size_t N( B.columns() );
761 
762  for( size_t j=0UL; j<N; ++j )
763  {
764  const size_t ibegin( ( IsLower_v<MT5> )
765  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
766  :( 0UL ) );
767  const size_t iend( ( IsUpper_v<MT5> )
768  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
769  :( M ) );
770  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
771 
 772  if( IsLower_v<MT5> ) {
773  for( size_t i=0UL; i<ibegin; ++i ) {
774  reset( C(i,j) );
775  }
776  }
777  for( size_t i=ibegin; i<iend; ++i ) {
778  C(i,j) = A(i,i) * B(i,j);
779  }
 780  if( IsUpper_v<MT5> ) {
781  for( size_t i=iend; i<M; ++i ) {
782  reset( C(i,j) );
783  }
784  }
785  }
786  }
788  //**********************************************************************************************
789 
790  //**Default assignment to dense matrices (diagonal/diagonal)************************************
804  template< typename MT3 // Type of the left-hand side target matrix
805  , typename MT4 // Type of the left-hand side matrix operand
806  , typename MT5 > // Type of the right-hand side matrix operand
807  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
808  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
809  {
811 
812  reset( C );
813 
814  for( size_t i=0UL; i<A.rows(); ++i ) {
815  C(i,i) = A(i,i) * B(i,i);
816  }
817  }
819  //**********************************************************************************************
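 //
 // Since the product of two diagonal matrices is diagonal, only C(i,i) = A(i,i) * B(i,i)
 // is computed after resetting C, e.g. (sketch using the DiagonalMatrix adaptor):
 //
 //    \code
 //    blaze::DiagonalMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > A( 3UL ), B( 3UL );
 //    blaze::DynamicMatrix<double,blaze::columnMajor> C;
 //
 //    C = A * B;  // only C(0,0), C(1,1) and C(2,2) are non-zero
 //    \endcode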
820 
821  //**Default assignment to dense matrices (small matrices)***************************************
835  template< typename MT3 // Type of the left-hand side target matrix
836  , typename MT4 // Type of the left-hand side matrix operand
837  , typename MT5 > // Type of the right-hand side matrix operand
838  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
839  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
840  {
841  selectDefaultAssignKernel( C, A, B );
842  }
844  //**********************************************************************************************
845 
846  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
861  template< typename MT3 // Type of the left-hand side target matrix
862  , typename MT4 // Type of the left-hand side matrix operand
863  , typename MT5 > // Type of the right-hand side matrix operand
864  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
865  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
866  {
869  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
870  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
871 
872  const ForwardFunctor fwd;
873 
874  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
875  const OppositeType_t<MT5> tmp( serial( B ) );
876  assign( C, fwd( A * tmp ) );
877  }
878  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
879  const OppositeType_t<MT4> tmp( serial( A ) );
880  assign( C, fwd( tmp * B ) );
881  }
882  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
883  const OppositeType_t<MT5> tmp( serial( B ) );
884  assign( C, fwd( A * tmp ) );
885  }
886  else {
887  const OppositeType_t<MT4> tmp( serial( A ) );
888  assign( C, fwd( tmp * B ) );
889  }
890  }
892  //**********************************************************************************************
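 //
 // Rationale (sketch): for a row-major target the two column-major operands are not
 // multiplied directly; instead one operand is copied into its row-major OppositeType so
 // that the better suited mixed storage-order kernels can be reused. The copy is made of
 // the non-resizable or smaller operand, since the cost of the temporary is proportional
 // to that operand's size:
 //
 //    \code
 //    const OppositeType_t<MT5> tmp( serial( B ) );  // column-major B -> row-major copy
 //    assign( C, fwd( A * tmp ) );                   // dispatches to a mixed storage-order kernel
 //    \endcode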
893 
894  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
909  template< typename MT3 // Type of the left-hand side target matrix
910  , typename MT4 // Type of the left-hand side matrix operand
911  , typename MT5 > // Type of the right-hand side matrix operand
912  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
913  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
914  {
915  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
916 
917  const size_t M( A.rows() );
918  const size_t N( B.columns() );
919  const size_t K( A.columns() );
920 
921  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
922 
923  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
924  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
925 
926  if( LOW && UPP && M > SIMDSIZE*3UL ) {
927  reset( C );
928  }
929 
930  {
931  size_t i( 0UL );
932 
933  if( IsIntegral_v<ElementType> )
934  {
935  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
936  for( size_t j=0UL; j<N; ++j )
937  {
938  const size_t kbegin( ( IsLower_v<MT5> )
939  ?( ( IsUpper_v<MT4> )
940  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
941  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
942  :( IsUpper_v<MT4> ? i : 0UL ) );
943  const size_t kend( ( IsUpper_v<MT5> )
944  ?( ( IsLower_v<MT4> )
945  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
946  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
947  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
948 
949  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
950 
951  for( size_t k=kbegin; k<kend; ++k ) {
952  const SIMDType b1( set( B(k,j) ) );
953  xmm1 += A.load(i ,k) * b1;
954  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
955  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
956  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
957  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
958  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
959  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
960  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
961  }
962 
963  C.store( i , j, xmm1 );
964  C.store( i+SIMDSIZE , j, xmm2 );
965  C.store( i+SIMDSIZE*2UL, j, xmm3 );
966  C.store( i+SIMDSIZE*3UL, j, xmm4 );
967  C.store( i+SIMDSIZE*4UL, j, xmm5 );
968  C.store( i+SIMDSIZE*5UL, j, xmm6 );
969  C.store( i+SIMDSIZE*6UL, j, xmm7 );
970  C.store( i+SIMDSIZE*7UL, j, xmm8 );
971  }
972  }
973  }
974 
975  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
976  {
977  size_t j( 0UL );
978 
979  for( ; (j+2UL) <= N; j+=2UL )
980  {
981  const size_t kbegin( ( IsLower_v<MT5> )
982  ?( ( IsUpper_v<MT4> )
983  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
984  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
985  :( IsUpper_v<MT4> ? i : 0UL ) );
986  const size_t kend( ( IsUpper_v<MT5> )
987  ?( ( IsLower_v<MT4> )
988  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
989  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
990  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
991 
992  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
993 
994  for( size_t k=kbegin; k<kend; ++k ) {
995  const SIMDType a1( A.load(i ,k) );
996  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
997  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
998  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
999  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1000  const SIMDType b1( set( B(k,j ) ) );
1001  const SIMDType b2( set( B(k,j+1UL) ) );
1002  xmm1 += a1 * b1;
1003  xmm2 += a2 * b1;
1004  xmm3 += a3 * b1;
1005  xmm4 += a4 * b1;
1006  xmm5 += a5 * b1;
1007  xmm6 += a1 * b2;
1008  xmm7 += a2 * b2;
1009  xmm8 += a3 * b2;
1010  xmm9 += a4 * b2;
1011  xmm10 += a5 * b2;
1012  }
1013 
1014  C.store( i , j , xmm1 );
1015  C.store( i+SIMDSIZE , j , xmm2 );
1016  C.store( i+SIMDSIZE*2UL, j , xmm3 );
1017  C.store( i+SIMDSIZE*3UL, j , xmm4 );
1018  C.store( i+SIMDSIZE*4UL, j , xmm5 );
1019  C.store( i , j+1UL, xmm6 );
1020  C.store( i+SIMDSIZE , j+1UL, xmm7 );
1021  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1022  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1023  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1024  }
1025 
1026  if( j < N )
1027  {
1028  const size_t kbegin( ( IsLower_v<MT5> )
1029  ?( ( IsUpper_v<MT4> )
1030  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1031  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1032  :( IsUpper_v<MT4> ? i : 0UL ) );
1033  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
1034 
1035  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1036 
1037  for( size_t k=kbegin; k<kend; ++k ) {
1038  const SIMDType b1( set( B(k,j) ) );
1039  xmm1 += A.load(i ,k) * b1;
1040  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1041  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1042  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1043  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1044  }
1045 
1046  C.store( i , j, xmm1 );
1047  C.store( i+SIMDSIZE , j, xmm2 );
1048  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1049  C.store( i+SIMDSIZE*3UL, j, xmm4 );
1050  C.store( i+SIMDSIZE*4UL, j, xmm5 );
1051  }
1052  }
1053 
1054  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1055  {
1056  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
1057  size_t j( UPP ? i : 0UL );
1058 
1059  for( ; (j+2UL) <= jend; j+=2UL )
1060  {
1061  const size_t kbegin( ( IsLower_v<MT5> )
1062  ?( ( IsUpper_v<MT4> )
1063  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1064  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1065  :( IsUpper_v<MT4> ? i : 0UL ) );
1066  const size_t kend( ( IsUpper_v<MT5> )
1067  ?( ( IsLower_v<MT4> )
1068  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1069  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1070  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
1071 
1072  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1073 
1074  for( size_t k=kbegin; k<kend; ++k ) {
1075  const SIMDType a1( A.load(i ,k) );
1076  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1077  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1078  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1079  const SIMDType b1( set( B(k,j ) ) );
1080  const SIMDType b2( set( B(k,j+1UL) ) );
1081  xmm1 += a1 * b1;
1082  xmm2 += a2 * b1;
1083  xmm3 += a3 * b1;
1084  xmm4 += a4 * b1;
1085  xmm5 += a1 * b2;
1086  xmm6 += a2 * b2;
1087  xmm7 += a3 * b2;
1088  xmm8 += a4 * b2;
1089  }
1090 
1091  C.store( i , j , xmm1 );
1092  C.store( i+SIMDSIZE , j , xmm2 );
1093  C.store( i+SIMDSIZE*2UL, j , xmm3 );
1094  C.store( i+SIMDSIZE*3UL, j , xmm4 );
1095  C.store( i , j+1UL, xmm5 );
1096  C.store( i+SIMDSIZE , j+1UL, xmm6 );
1097  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1098  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1099  }
1100 
1101  if( j < jend )
1102  {
1103  const size_t kbegin( ( IsLower_v<MT5> )
1104  ?( ( IsUpper_v<MT4> )
1105  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1106  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1107  :( IsUpper_v<MT4> ? i : 0UL ) );
1108  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
1109 
1110  SIMDType xmm1, xmm2, xmm3, xmm4;
1111 
1112  for( size_t k=kbegin; k<kend; ++k ) {
1113  const SIMDType b1( set( B(k,j) ) );
1114  xmm1 += A.load(i ,k) * b1;
1115  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1116  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1117  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1118  }
1119 
1120  C.store( i , j, xmm1 );
1121  C.store( i+SIMDSIZE , j, xmm2 );
1122  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1123  C.store( i+SIMDSIZE*3UL, j, xmm4 );
1124  }
1125  }
1126 
1127  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1128  {
1129  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
1130  size_t j( UPP ? i : 0UL );
1131 
1132  for( ; (j+2UL) <= jend; j+=2UL )
1133  {
1134  const size_t kbegin( ( IsLower_v<MT5> )
1135  ?( ( IsUpper_v<MT4> )
1136  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1137  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1138  :( IsUpper_v<MT4> ? i : 0UL ) );
1139  const size_t kend( ( IsUpper_v<MT5> )
1140  ?( ( IsLower_v<MT4> )
1141  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1142  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1143  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
1144 
1145  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1146 
1147  for( size_t k=kbegin; k<kend; ++k ) {
1148  const SIMDType a1( A.load(i ,k) );
1149  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1150  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1151  const SIMDType b1( set( B(k,j ) ) );
1152  const SIMDType b2( set( B(k,j+1UL) ) );
1153  xmm1 += a1 * b1;
1154  xmm2 += a2 * b1;
1155  xmm3 += a3 * b1;
1156  xmm4 += a1 * b2;
1157  xmm5 += a2 * b2;
1158  xmm6 += a3 * b2;
1159  }
1160 
1161  C.store( i , j , xmm1 );
1162  C.store( i+SIMDSIZE , j , xmm2 );
1163  C.store( i+SIMDSIZE*2UL, j , xmm3 );
1164  C.store( i , j+1UL, xmm4 );
1165  C.store( i+SIMDSIZE , j+1UL, xmm5 );
1166  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1167  }
1168 
1169  if( j < jend )
1170  {
1171  const size_t kbegin( ( IsLower_v<MT5> )
1172  ?( ( IsUpper_v<MT4> )
1173  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1174  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1175  :( IsUpper_v<MT4> ? i : 0UL ) );
1176  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
1177 
1178  SIMDType xmm1, xmm2, xmm3;
1179 
1180  for( size_t k=kbegin; k<kend; ++k ) {
1181  const SIMDType b1( set( B(k,j) ) );
1182  xmm1 += A.load(i ,k) * b1;
1183  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1184  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1185  }
1186 
1187  C.store( i , j, xmm1 );
1188  C.store( i+SIMDSIZE , j, xmm2 );
1189  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1190  }
1191  }
1192 
1193  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1194  {
1195  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
1196  size_t j( UPP ? i : 0UL );
1197 
1198  for( ; (j+4UL) <= jend; j+=4UL )
1199  {
1200  const size_t kbegin( ( IsLower_v<MT5> )
1201  ?( ( IsUpper_v<MT4> )
1202  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1203  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1204  :( IsUpper_v<MT4> ? i : 0UL ) );
1205  const size_t kend( ( IsUpper_v<MT5> )
1206  ?( ( IsLower_v<MT4> )
1207  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
1208  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
1209  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
1210 
1211  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1212 
1213  for( size_t k=kbegin; k<kend; ++k ) {
1214  const SIMDType a1( A.load(i ,k) );
1215  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1216  const SIMDType b1( set( B(k,j ) ) );
1217  const SIMDType b2( set( B(k,j+1UL) ) );
1218  const SIMDType b3( set( B(k,j+2UL) ) );
1219  const SIMDType b4( set( B(k,j+3UL) ) );
1220  xmm1 += a1 * b1;
1221  xmm2 += a2 * b1;
1222  xmm3 += a1 * b2;
1223  xmm4 += a2 * b2;
1224  xmm5 += a1 * b3;
1225  xmm6 += a2 * b3;
1226  xmm7 += a1 * b4;
1227  xmm8 += a2 * b4;
1228  }
1229 
1230  C.store( i , j , xmm1 );
1231  C.store( i+SIMDSIZE, j , xmm2 );
1232  C.store( i , j+1UL, xmm3 );
1233  C.store( i+SIMDSIZE, j+1UL, xmm4 );
1234  C.store( i , j+2UL, xmm5 );
1235  C.store( i+SIMDSIZE, j+2UL, xmm6 );
1236  C.store( i , j+3UL, xmm7 );
1237  C.store( i+SIMDSIZE, j+3UL, xmm8 );
1238  }
1239 
1240  for( ; (j+3UL) <= jend; j+=3UL )
1241  {
1242  const size_t kbegin( ( IsLower_v<MT5> )
1243  ?( ( IsUpper_v<MT4> )
1244  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1245  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1246  :( IsUpper_v<MT4> ? i : 0UL ) );
1247  const size_t kend( ( IsUpper_v<MT5> )
1248  ?( ( IsLower_v<MT4> )
1249  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
1250  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
1251  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
1252 
1253  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1254 
1255  for( size_t k=kbegin; k<kend; ++k ) {
1256  const SIMDType a1( A.load(i ,k) );
1257  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1258  const SIMDType b1( set( B(k,j ) ) );
1259  const SIMDType b2( set( B(k,j+1UL) ) );
1260  const SIMDType b3( set( B(k,j+2UL) ) );
1261  xmm1 += a1 * b1;
1262  xmm2 += a2 * b1;
1263  xmm3 += a1 * b2;
1264  xmm4 += a2 * b2;
1265  xmm5 += a1 * b3;
1266  xmm6 += a2 * b3;
1267  }
1268 
1269  C.store( i , j , xmm1 );
1270  C.store( i+SIMDSIZE, j , xmm2 );
1271  C.store( i , j+1UL, xmm3 );
1272  C.store( i+SIMDSIZE, j+1UL, xmm4 );
1273  C.store( i , j+2UL, xmm5 );
1274  C.store( i+SIMDSIZE, j+2UL, xmm6 );
1275  }
1276 
1277  for( ; (j+2UL) <= jend; j+=2UL )
1278  {
1279  const size_t kbegin( ( IsLower_v<MT5> )
1280  ?( ( IsUpper_v<MT4> )
1281  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1282  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1283  :( IsUpper_v<MT4> ? i : 0UL ) );
1284  const size_t kend( ( IsUpper_v<MT5> )
1285  ?( ( IsLower_v<MT4> )
1286  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1287  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1288  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
1289 
1290  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1291  size_t k( kbegin );
1292 
1293  for( ; (k+2UL) <= kend; k+=2UL ) {
1294  const SIMDType a1( A.load(i ,k ) );
1295  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
1296  const SIMDType a3( A.load(i ,k+1UL) );
1297  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
1298  const SIMDType b1( set( B(k ,j ) ) );
1299  const SIMDType b2( set( B(k ,j+1UL) ) );
1300  const SIMDType b3( set( B(k+1UL,j ) ) );
1301  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
1302  xmm1 += a1 * b1;
1303  xmm2 += a2 * b1;
1304  xmm3 += a1 * b2;
1305  xmm4 += a2 * b2;
1306  xmm5 += a3 * b3;
1307  xmm6 += a4 * b3;
1308  xmm7 += a3 * b4;
1309  xmm8 += a4 * b4;
1310  }
1311 
1312  for( ; k<kend; ++k ) {
1313  const SIMDType a1( A.load(i ,k) );
1314  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1315  const SIMDType b1( set( B(k,j ) ) );
1316  const SIMDType b2( set( B(k,j+1UL) ) );
1317  xmm1 += a1 * b1;
1318  xmm2 += a2 * b1;
1319  xmm3 += a1 * b2;
1320  xmm4 += a2 * b2;
1321  }
1322 
1323  C.store( i , j , xmm1+xmm5 );
1324  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
1325  C.store( i , j+1UL, xmm3+xmm7 );
1326  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
1327  }
1328 
1329  if( j < jend )
1330  {
1331  const size_t kbegin( ( IsLower_v<MT5> )
1332  ?( ( IsUpper_v<MT4> )
1333  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1334  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1335  :( IsUpper_v<MT4> ? i : 0UL ) );
1336  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
1337 
1338  SIMDType xmm1, xmm2, xmm3, xmm4;
1339  size_t k( kbegin );
1340 
1341  for( ; (k+2UL) <= kend; k+=2UL ) {
1342  const SIMDType b1( set( B(k ,j) ) );
1343  const SIMDType b2( set( B(k+1UL,j) ) );
1344  xmm1 += A.load(i ,k ) * b1;
1345  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
1346  xmm3 += A.load(i ,k+1UL) * b2;
1347  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
1348  }
1349 
1350  for( ; k<kend; ++k ) {
1351  const SIMDType b1( set( B(k,j) ) );
1352  xmm1 += A.load(i ,k) * b1;
1353  xmm2 += A.load(i+SIMDSIZE,k) * b1;
1354  }
1355 
1356  C.store( i , j, xmm1+xmm3 );
1357  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
1358  }
1359  }
1360 
1361  for( ; i<ipos; i+=SIMDSIZE )
1362  {
1363  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
1364  size_t j( UPP ? i : 0UL );
1365 
1366  for( ; (j+4UL) <= jend; j+=4UL )
1367  {
1368  const size_t kbegin( ( IsLower_v<MT5> )
1369  ?( ( IsUpper_v<MT4> )
1370  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1371  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1372  :( IsUpper_v<MT4> ? i : 0UL ) );
1373  const size_t kend( ( IsUpper_v<MT5> )
1374  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
1375  :( K ) );
1376 
1377  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1378  size_t k( kbegin );
1379 
1380  for( ; (k+2UL) <= kend; k+=2UL ) {
1381  const SIMDType a1( A.load(i,k ) );
1382  const SIMDType a2( A.load(i,k+1UL) );
1383  xmm1 += a1 * set( B(k ,j ) );
1384  xmm2 += a1 * set( B(k ,j+1UL) );
1385  xmm3 += a1 * set( B(k ,j+2UL) );
1386  xmm4 += a1 * set( B(k ,j+3UL) );
1387  xmm5 += a2 * set( B(k+1UL,j ) );
1388  xmm6 += a2 * set( B(k+1UL,j+1UL) );
1389  xmm7 += a2 * set( B(k+1UL,j+2UL) );
1390  xmm8 += a2 * set( B(k+1UL,j+3UL) );
1391  }
1392 
1393  for( ; k<kend; ++k ) {
1394  const SIMDType a1( A.load(i,k) );
1395  xmm1 += a1 * set( B(k,j ) );
1396  xmm2 += a1 * set( B(k,j+1UL) );
1397  xmm3 += a1 * set( B(k,j+2UL) );
1398  xmm4 += a1 * set( B(k,j+3UL) );
1399  }
1400 
1401  C.store( i, j , xmm1+xmm5 );
1402  C.store( i, j+1UL, xmm2+xmm6 );
1403  C.store( i, j+2UL, xmm3+xmm7 );
1404  C.store( i, j+3UL, xmm4+xmm8 );
1405  }
1406 
1407  for( ; (j+3UL) <= jend; j+=3UL )
1408  {
1409  const size_t kbegin( ( IsLower_v<MT5> )
1410  ?( ( IsUpper_v<MT4> )
1411  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1412  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1413  :( IsUpper_v<MT4> ? i : 0UL ) );
1414  const size_t kend( ( IsUpper_v<MT5> )
1415  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
1416  :( K ) );
1417 
1418  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1419  size_t k( kbegin );
1420 
1421  for( ; (k+2UL) <= kend; k+=2UL ) {
1422  const SIMDType a1( A.load(i,k ) );
1423  const SIMDType a2( A.load(i,k+1UL) );
1424  xmm1 += a1 * set( B(k ,j ) );
1425  xmm2 += a1 * set( B(k ,j+1UL) );
1426  xmm3 += a1 * set( B(k ,j+2UL) );
1427  xmm4 += a2 * set( B(k+1UL,j ) );
1428  xmm5 += a2 * set( B(k+1UL,j+1UL) );
1429  xmm6 += a2 * set( B(k+1UL,j+2UL) );
1430  }
1431 
1432  for( ; k<kend; ++k ) {
1433  const SIMDType a1( A.load(i,k) );
1434  xmm1 += a1 * set( B(k,j ) );
1435  xmm2 += a1 * set( B(k,j+1UL) );
1436  xmm3 += a1 * set( B(k,j+2UL) );
1437  }
1438 
1439  C.store( i, j , xmm1+xmm4 );
1440  C.store( i, j+1UL, xmm2+xmm5 );
1441  C.store( i, j+2UL, xmm3+xmm6 );
1442  }
1443 
1444  for( ; (j+2UL) <= jend; j+=2UL )
1445  {
1446  const size_t kbegin( ( IsLower_v<MT5> )
1447  ?( ( IsUpper_v<MT4> )
1448  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1449  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1450  :( IsUpper_v<MT4> ? i : 0UL ) );
1451  const size_t kend( ( IsUpper_v<MT5> )
1452  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1453  :( K ) );
1454 
1455  SIMDType xmm1, xmm2, xmm3, xmm4;
1456  size_t k( kbegin );
1457 
1458  for( ; (k+2UL) <= kend; k+=2UL ) {
1459  const SIMDType a1( A.load(i,k ) );
1460  const SIMDType a2( A.load(i,k+1UL) );
1461  xmm1 += a1 * set( B(k ,j ) );
1462  xmm2 += a1 * set( B(k ,j+1UL) );
1463  xmm3 += a2 * set( B(k+1UL,j ) );
1464  xmm4 += a2 * set( B(k+1UL,j+1UL) );
1465  }
1466 
1467  for( ; k<kend; ++k ) {
1468  const SIMDType a1( A.load(i,k) );
1469  xmm1 += a1 * set( B(k,j ) );
1470  xmm2 += a1 * set( B(k,j+1UL) );
1471  }
1472 
1473  C.store( i, j , xmm1+xmm3 );
1474  C.store( i, j+1UL, xmm2+xmm4 );
1475  }
1476 
1477  if( j < jend )
1478  {
1479  const size_t kbegin( ( IsLower_v<MT5> )
1480  ?( ( IsUpper_v<MT4> )
1481  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1482  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1483  :( IsUpper_v<MT4> ? i : 0UL ) );
1484 
1485  SIMDType xmm1, xmm2;
1486  size_t k( kbegin );
1487 
1488  for( ; (k+2UL) <= K; k+=2UL ) {
1489  xmm1 += A.load(i,k ) * set( B(k ,j) );
1490  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
1491  }
1492 
1493  for( ; k<K; ++k ) {
1494  xmm1 += A.load(i,k) * set( B(k,j) );
1495  }
1496 
1497  C.store( i, j, xmm1+xmm2 );
1498  }
1499  }
1500 
1501  for( ; remainder && i<M; ++i )
1502  {
1503  size_t j( LOW && UPP ? i : 0UL );
1504 
1505  for( ; (j+2UL) <= N; j+=2UL )
1506  {
1507  const size_t kbegin( ( IsLower_v<MT5> )
1508  ?( ( IsUpper_v<MT4> )
1509  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1510  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1511  :( IsUpper_v<MT4> ? i : 0UL ) );
1512  const size_t kend( ( IsUpper_v<MT5> )
1513  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1514  :( K ) );
1515 
1516  ElementType value1{};
1517  ElementType value2{};
1518 
1519  for( size_t k=kbegin; k<kend; ++k ) {
1520  value1 += A(i,k) * B(k,j );
1521  value2 += A(i,k) * B(k,j+1UL);
1522  }
1523 
1524  C(i,j ) = value1;
1525  C(i,j+1UL) = value2;
1526  }
1527 
1528  if( j < N )
1529  {
1530  const size_t kbegin( ( IsLower_v<MT5> )
1531  ?( ( IsUpper_v<MT4> )
1532  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1533  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1534  :( IsUpper_v<MT4> ? i : 0UL ) );
1535 
1536  ElementType value{};
1537 
1538  for( size_t k=kbegin; k<K; ++k ) {
1539  value += A(i,k) * B(k,j);
1540  }
1541 
1542  C(i,j) = value;
1543  }
1544  }
1545  }
1546 
1547  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
1548  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1549  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1550  for( size_t i=0UL; i<iend; ++i ) {
1551  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1552  }
1553  }
1554  }
1555  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
1556  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1557  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1558  for( size_t i=0UL; i<iend; ++i ) {
1559  reset( C(i,j) );
1560  }
1561  }
1562  }
1563  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
1564  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1565  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1566  for( size_t j=0UL; j<jend; ++j ) {
1567  reset( C(i,j) );
1568  }
1569  }
1570  }
1571  }
1573  //**********************************************************************************************
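 //
 // The kernel is register blocked: each step keeps several SIMDSIZE-wide column fragments
 // of C in SIMD registers while streaming once through k. A minimal single-register
 // micro-kernel sketch of the same idea (corresponding to the final SIMD i-loop above):
 //
 //    \code
 //    SIMDType acc;                            // accumulates C(i..i+SIMDSIZE-1,j)
 //    for( size_t k=kbegin; k<kend; ++k )
 //       acc += A.load(i,k) * set( B(k,j) );   // SIMD column of A times broadcast B(k,j)
 //    C.store( i, j, acc );
 //    \endcode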
1574 
1575  //**Default assignment to dense matrices (large matrices)***************************************
1589  template< typename MT3 // Type of the left-hand side target matrix
1590  , typename MT4 // Type of the left-hand side matrix operand
1591  , typename MT5 > // Type of the right-hand side matrix operand
1592  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1593  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1594  {
1595  selectDefaultAssignKernel( C, A, B );
1596  }
1598  //**********************************************************************************************
1599 
1600  //**Vectorized default assignment to dense matrices (large matrices)****************************
1615  template< typename MT3 // Type of the left-hand side target matrix
1616  , typename MT4 // Type of the left-hand side matrix operand
1617  , typename MT5 > // Type of the right-hand side matrix operand
1618  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1619  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1620  {
1621  if( SYM )
1622  smmm( C, A, B, ElementType(1) );
1623  else if( HERM )
1624  hmmm( C, A, B, ElementType(1) );
1625  else if( LOW )
1626  lmmm( C, A, B, ElementType(1), ElementType(0) );
1627  else if( UPP )
1628  ummm( C, A, B, ElementType(1), ElementType(0) );
1629  else
1630  mmm( C, A, B, ElementType(1), ElementType(0) );
1631  }
1633  //**********************************************************************************************
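 //
 // The cache-blocked kernels from <blaze/math/dense/MMM.h> compute C = alpha*A*B + beta*C
 // (here with alpha = 1 and beta = 0); the structured variants smmm/hmmm/lmmm/ummm only
 // compute the half of C that is needed for a symmetric, Hermitian, lower or upper result:
 //
 //    \code
 //    mmm( C, A, B, ElementType(1), ElementType(0) );  // plain C = A * B
 //    \endcode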
1634 
1635  //**BLAS-based assignment to dense matrices (default)*******************************************
1649  template< typename MT3 // Type of the left-hand side target matrix
1650  , typename MT4 // Type of the left-hand side matrix operand
1651  , typename MT5 > // Type of the right-hand side matrix operand
1652  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1653  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1654  {
1655  selectLargeAssignKernel( C, A, B );
1656  }
1658  //**********************************************************************************************
1659 
1660  //**BLAS-based assignment to dense matrices*****************************************************
1661 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1662 
1675  template< typename MT3 // Type of the left-hand side target matrix
1676  , typename MT4 // Type of the left-hand side matrix operand
1677  , typename MT5 > // Type of the right-hand side matrix operand
1678  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1679  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1680  {
1681  using ET = ElementType_t<MT3>;
1682 
1683  if( IsTriangular_v<MT4> ) {
1684  assign( C, B );
1685  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
1686  }
1687  else if( IsTriangular_v<MT5> ) {
1688  assign( C, A );
1689  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
1690  }
1691  else {
1692  gemm( C, A, B, ET(1), ET(0) );
1693  }
1694  }
1696 #endif
1697  //**********************************************************************************************
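 //
 // For non-triangular double precision operands the gemm() wrapper corresponds roughly to
 // the following raw CBLAS call (sketch; spacing() denotes the padded leading dimension):
 //
 //    \code
 //    cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
 //                 C.rows(), C.columns(), A.columns(),
 //                 1.0, A.data(), A.spacing(), B.data(), B.spacing(),
 //                 0.0, C.data(), C.spacing() );
 //    \endcode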
1698 
1699  //**Assignment to sparse matrices***************************************************************
1712  template< typename MT // Type of the target sparse matrix
1713  , bool SO > // Storage order of the target sparse matrix
1714  friend inline auto assign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1715  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1716  {
1718 
1719  using TmpType = If_t< SO, ResultType, OppositeType >;
1720 
1727 
1728  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1729  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1730 
1731  const ForwardFunctor fwd;
1732 
1733  const TmpType tmp( serial( rhs ) );
1734  assign( ~lhs, fwd( tmp ) );
1735  }
1737  //**********************************************************************************************
1738 
1739  //**Restructuring assignment to row-major matrices**********************************************
1754  template< typename MT > // Type of the target matrix
1755  friend inline auto assign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
1756  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1757  {
1759 
1761 
1762  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1763  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1764 
1765  const ForwardFunctor fwd;
1766 
1767  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
1768  assign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
1769  else if( IsSymmetric_v<MT1> )
1770  assign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
1771  else
1772  assign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
1773  }
1775  //**********************************************************************************************
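 //
 // Restructuring sketch: for a row-major target and a symmetric left-hand side operand,
 // A == trans( A ), so the product is evaluated with a transposed (row-major) operand and
 // the symmetry-blind default path is avoided:
 //
 //    \code
 //    blaze::SymmetricMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > S( 64UL );
 //    blaze::DynamicMatrix<double,blaze::columnMajor> B( 64UL, 64UL );
 //    blaze::DynamicMatrix<double,blaze::rowMajor> C( 64UL, 64UL );
 //
 //    C = S * B;  // assigned as trans( S ) * B
 //    \endcode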
1776 
1777  //**Addition assignment to dense matrices*******************************************************
1790  template< typename MT // Type of the target dense matrix
1791  , bool SO > // Storage order of the target dense matrix
1792  friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1793  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1794  {
1796 
1797  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1798  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1799 
1800  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1801  return;
1802  }
1803 
1804  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1805  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1806 
1807  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1808  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1809  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1810  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1811  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1812  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1813 
1814  TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1815  }
1817  //**********************************************************************************************
1818 
1819  //**Addition assignment to dense matrices (kernel selection)************************************
1830  template< typename MT3 // Type of the left-hand side target matrix
1831  , typename MT4 // Type of the left-hand side matrix operand
1832  , typename MT5 > // Type of the right-hand side matrix operand
1833  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1834  {
1835  if( ( IsDiagonal_v<MT4> ) ||
1836  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
1837  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1838  selectSmallAddAssignKernel( C, A, B );
1839  else
1840  selectBlasAddAssignKernel( C, A, B );
1841  }
1843  //**********************************************************************************************
1844 
1845  //**Default addition assignment to dense matrices (general/general)*****************************
1859  template< typename MT3 // Type of the left-hand side target matrix
1860  , typename MT4 // Type of the left-hand side matrix operand
1861  , typename MT5 > // Type of the right-hand side matrix operand
1862  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1863  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1864  {
1865  const size_t M( A.rows() );
1866  const size_t N( B.columns() );
1867  const size_t K( A.columns() );
1868 
1869  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1870 
1871  for( size_t j=0UL; j<N; ++j )
1872  {
1873  const size_t kbegin( ( IsLower_v<MT5> )
1874  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
1875  :( 0UL ) );
1876  const size_t kend( ( IsUpper_v<MT5> )
1877  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
1878  :( K ) );
1879  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1880 
1881  for( size_t k=kbegin; k<kend; ++k )
1882  {
1883  const size_t ibegin( ( IsLower_v<MT4> )
1884  ?( ( IsStrictlyLower_v<MT4> )
1885  ?( LOW ? max(j,k+1UL) : k+1UL )
1886  :( LOW ? max(j,k) : k ) )
1887  :( LOW ? j : 0UL ) );
1888  const size_t iend( ( IsUpper_v<MT4> )
1889  ?( ( IsStrictlyUpper_v<MT4> )
1890  ?( UPP ? min(j+1UL,k) : k )
1891  :( UPP ? min(j,k)+1UL : k+1UL ) )
1892  :( UPP ? j+1UL : M ) );
1893 
1894  if( ( LOW || UPP ) && ibegin >= iend ) continue;
1895  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1896 
1897  const size_t inum( iend - ibegin );
1898  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1899 
1900  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1901  C(i ,j) += A(i ,k) * B(k,j);
1902  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1903  }
1904  if( ipos < iend ) {
1905  C(ipos,j) += A(ipos,k) * B(k,j);
1906  }
1907  }
1908  }
1909  }
1911  //**********************************************************************************************
1912 
1913  //**Default addition assignment to dense matrices (general/diagonal)****************************
1927  template< typename MT3 // Type of the left-hand side target matrix
1928  , typename MT4 // Type of the left-hand side matrix operand
1929  , typename MT5 > // Type of the right-hand side matrix operand
1930  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1931  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1932  {
1934 
1935  const size_t M( A.rows() );
1936  const size_t N( B.columns() );
1937 
1938  for( size_t j=0UL; j<N; ++j )
1939  {
1940  const size_t ibegin( ( IsLower_v<MT4> )
1941  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
1942  :( 0UL ) );
1943  const size_t iend( ( IsUpper_v<MT4> )
1944  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
1945  :( M ) );
1946  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1947 
1948  const size_t inum( iend - ibegin );
1949  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1950 
1951  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1952  C(i ,j) += A(i ,j) * B(j,j);
1953  C(i+1UL,j) += A(i+1UL,j) * B(j,j);
1954  }
1955  if( ipos < iend ) {
1956  C(ipos,j) += A(ipos,j) * B(j,j);
1957  }
1958  }
1959  }
1961  //**********************************************************************************************
1962 
1963  //**Default addition assignment to dense matrices (diagonal/general)****************************
1977  template< typename MT3 // Type of the left-hand side target matrix
1978  , typename MT4 // Type of the left-hand side matrix operand
1979  , typename MT5 > // Type of the right-hand side matrix operand
1980  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1981  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1982  {
1984 
1985  const size_t M( A.rows() );
1986  const size_t N( B.columns() );
1987 
1988  for( size_t j=0UL; j<N; ++j )
1989  {
1990  const size_t ibegin( ( IsLower_v<MT5> )
1991  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
1992  :( 0UL ) );
1993  const size_t iend( ( IsUpper_v<MT5> )
1994  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
1995  :( M ) );
1996  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1997 
1998  const size_t inum( iend - ibegin );
1999  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2000 
2001  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2002  C(i ,j) += A(i ,i ) * B(i ,j);
2003  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
2004  }
2005  if( ipos < iend ) {
2006  C(ipos,j) += A(ipos,ipos) * B(ipos,j);
2007  }
2008  }
2009  }
2011  //**********************************************************************************************
2012 
2013  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2027  template< typename MT3 // Type of the left-hand side target matrix
2028  , typename MT4 // Type of the left-hand side matrix operand
2029  , typename MT5 > // Type of the right-hand side matrix operand
2030  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2031  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2032  {
2034 
2035  for( size_t i=0UL; i<A.rows(); ++i ) {
2036  C(i,i) += A(i,i) * B(i,i);
2037  }
2038  }
2040  //**********************************************************************************************
2041 
2042  //**Default addition assignment to dense matrices (small matrices)******************************
2056  template< typename MT3 // Type of the left-hand side target matrix
2057  , typename MT4 // Type of the left-hand side matrix operand
2058  , typename MT5 > // Type of the right-hand side matrix operand
2059  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2060  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2061  {
2062  selectDefaultAddAssignKernel( C, A, B );
2063  }
2065  //**********************************************************************************************
2066 
2067  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
2082  template< typename MT3 // Type of the left-hand side target matrix
2083  , typename MT4 // Type of the left-hand side matrix operand
2084  , typename MT5 > // Type of the right-hand side matrix operand
2085  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2086  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2087  {
2090  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
2091  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
2092 
2093  const ForwardFunctor fwd;
2094 
2095  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
2096  const OppositeType_t<MT5> tmp( serial( B ) );
2097  addAssign( C, fwd( A * tmp ) );
2098  }
2099  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
2100  const OppositeType_t<MT4> tmp( serial( A ) );
2101  addAssign( C, fwd( tmp * B ) );
2102  }
2103  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2104  const OppositeType_t<MT5> tmp( serial( B ) );
2105  addAssign( C, fwd( A * tmp ) );
2106  }
2107  else {
2108  const OppositeType_t<MT4> tmp( serial( A ) );
2109  addAssign( C, fwd( tmp * B ) );
2110  }
2111  }
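 //
 //  For a row-major target this kernel does not multiply the two column-major
 //  operands directly; it creates a row-major copy of one operand (preferring the
 //  non-resizable or the smaller one) and restructures the expression so that the
 //  existing mixed storage-order kernel performs the update. A hedged usage sketch,
 //  assuming standard Blaze matrix types not defined in this file:
 //
 //     blaze::DynamicMatrix<double,blaze::rowMajor>    C( 64UL, 64UL );
 //     blaze::DynamicMatrix<double,blaze::columnMajor> A( 64UL, 64UL ), B( 64UL, 64UL );
 //     C += A * B;  // may reach this kernel as addAssign( C, A * tmp ) with a row-major tmp of B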
2113  //**********************************************************************************************
2114 
2115  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2130  template< typename MT3 // Type of the left-hand side target matrix
2131  , typename MT4 // Type of the left-hand side matrix operand
2132  , typename MT5 > // Type of the right-hand side matrix operand
2133  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2134  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2135  {
2136  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
2137 
2138  const size_t M( A.rows() );
2139  const size_t N( B.columns() );
2140  const size_t K( A.columns() );
2141 
2142  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2143 
2144  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
2145  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
2146 
2147  size_t i( 0UL );
2148 
2149  if( IsIntegral_v<ElementType> )
2150  {
2151  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
2152  for( size_t j=0UL; j<N; ++j )
2153  {
2154  const size_t kbegin( ( IsLower_v<MT5> )
2155  ?( ( IsUpper_v<MT4> )
2156  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2157  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2158  :( IsUpper_v<MT4> ? i : 0UL ) );
2159  const size_t kend( ( IsUpper_v<MT5> )
2160  ?( ( IsLower_v<MT4> )
2161  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2162  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
2163  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
2164 
2165  SIMDType xmm1( C.load(i ,j) );
2166  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2167  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2168  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
2169  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
2170  SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
2171  SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
2172  SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
2173 
2174  for( size_t k=kbegin; k<kend; ++k ) {
2175  const SIMDType b1( set( B(k,j) ) );
2176  xmm1 += A.load(i ,k) * b1;
2177  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2178  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2179  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2180  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2181  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
2182  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
2183  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
2184  }
2185 
2186  C.store( i , j, xmm1 );
2187  C.store( i+SIMDSIZE , j, xmm2 );
2188  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2189  C.store( i+SIMDSIZE*3UL, j, xmm4 );
2190  C.store( i+SIMDSIZE*4UL, j, xmm5 );
2191  C.store( i+SIMDSIZE*5UL, j, xmm6 );
2192  C.store( i+SIMDSIZE*6UL, j, xmm7 );
2193  C.store( i+SIMDSIZE*7UL, j, xmm8 );
2194  }
2195  }
2196  }
2197 
2198  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
2199  {
2200  size_t j( 0UL );
2201 
2202  for( ; (j+2UL) <= N; j+=2UL )
2203  {
2204  const size_t kbegin( ( IsLower_v<MT5> )
2205  ?( ( IsUpper_v<MT4> )
2206  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2207  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2208  :( IsUpper_v<MT4> ? i : 0UL ) );
2209  const size_t kend( ( IsUpper_v<MT5> )
2210  ?( ( IsLower_v<MT4> )
2211  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2212  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2213  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
2214 
2215  SIMDType xmm1 ( C.load(i ,j ) );
2216  SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
2217  SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
2218  SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
2219  SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
2220  SIMDType xmm6 ( C.load(i ,j+1UL) );
2221  SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
2222  SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
2223  SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
2224  SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
2225 
2226  for( size_t k=kbegin; k<kend; ++k ) {
2227  const SIMDType a1( A.load(i ,k) );
2228  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2229  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2230  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2231  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
2232  const SIMDType b1( set( B(k,j ) ) );
2233  const SIMDType b2( set( B(k,j+1UL) ) );
2234  xmm1 += a1 * b1;
2235  xmm2 += a2 * b1;
2236  xmm3 += a3 * b1;
2237  xmm4 += a4 * b1;
2238  xmm5 += a5 * b1;
2239  xmm6 += a1 * b2;
2240  xmm7 += a2 * b2;
2241  xmm8 += a3 * b2;
2242  xmm9 += a4 * b2;
2243  xmm10 += a5 * b2;
2244  }
2245 
2246  C.store( i , j , xmm1 );
2247  C.store( i+SIMDSIZE , j , xmm2 );
2248  C.store( i+SIMDSIZE*2UL, j , xmm3 );
2249  C.store( i+SIMDSIZE*3UL, j , xmm4 );
2250  C.store( i+SIMDSIZE*4UL, j , xmm5 );
2251  C.store( i , j+1UL, xmm6 );
2252  C.store( i+SIMDSIZE , j+1UL, xmm7 );
2253  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
2254  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
2255  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
2256  }
2257 
2258  if( j < N )
2259  {
2260  const size_t kbegin( ( IsLower_v<MT5> )
2261  ?( ( IsUpper_v<MT4> )
2262  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2263  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2264  :( IsUpper_v<MT4> ? i : 0UL ) );
2265  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
2266 
2267  SIMDType xmm1( C.load(i ,j) );
2268  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2269  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2270  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
2271  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
2272 
2273  for( size_t k=kbegin; k<kend; ++k ) {
2274  const SIMDType b1( set( B(k,j) ) );
2275  xmm1 += A.load(i ,k) * b1;
2276  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2277  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2278  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2279  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2280  }
2281 
2282  C.store( i , j, xmm1 );
2283  C.store( i+SIMDSIZE , j, xmm2 );
2284  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2285  C.store( i+SIMDSIZE*3UL, j, xmm4 );
2286  C.store( i+SIMDSIZE*4UL, j, xmm5 );
2287  }
2288  }
2289 
2290  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2291  {
2292  size_t j( 0UL );
2293 
2294  for( ; (j+2UL) <= N; j+=2UL )
2295  {
2296  const size_t kbegin( ( IsLower_v<MT5> )
2297  ?( ( IsUpper_v<MT4> )
2298  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2299  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2300  :( IsUpper_v<MT4> ? i : 0UL ) );
2301  const size_t kend( ( IsUpper_v<MT5> )
2302  ?( ( IsLower_v<MT4> )
2303  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2304  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2305  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
2306 
2307  SIMDType xmm1( C.load(i ,j ) );
2308  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
2309  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
2310  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
2311  SIMDType xmm5( C.load(i ,j+1UL) );
2312  SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
2313  SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
2314  SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
2315 
2316  for( size_t k=kbegin; k<kend; ++k ) {
2317  const SIMDType a1( A.load(i ,k) );
2318  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2319  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2320  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2321  const SIMDType b1( set( B(k,j ) ) );
2322  const SIMDType b2( set( B(k,j+1UL) ) );
2323  xmm1 += a1 * b1;
2324  xmm2 += a2 * b1;
2325  xmm3 += a3 * b1;
2326  xmm4 += a4 * b1;
2327  xmm5 += a1 * b2;
2328  xmm6 += a2 * b2;
2329  xmm7 += a3 * b2;
2330  xmm8 += a4 * b2;
2331  }
2332 
2333  C.store( i , j , xmm1 );
2334  C.store( i+SIMDSIZE , j , xmm2 );
2335  C.store( i+SIMDSIZE*2UL, j , xmm3 );
2336  C.store( i+SIMDSIZE*3UL, j , xmm4 );
2337  C.store( i , j+1UL, xmm5 );
2338  C.store( i+SIMDSIZE , j+1UL, xmm6 );
2339  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2340  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
2341  }
2342 
2343  if( j < N )
2344  {
2345  const size_t kbegin( ( IsLower_v<MT5> )
2346  ?( ( IsUpper_v<MT4> )
2347  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2348  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2349  :( IsUpper_v<MT4> ? i : 0UL ) );
2350  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
2351 
2352  SIMDType xmm1( C.load(i ,j) );
2353  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2354  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2355  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
2356 
2357  for( size_t k=kbegin; k<kend; ++k ) {
2358  const SIMDType b1( set( B(k,j) ) );
2359  xmm1 += A.load(i ,k) * b1;
2360  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2361  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2362  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2363  }
2364 
2365  C.store( i , j, xmm1 );
2366  C.store( i+SIMDSIZE , j, xmm2 );
2367  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2368  C.store( i+SIMDSIZE*3UL, j, xmm4 );
2369  }
2370  }
2371 
2372  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2373  {
2374  size_t j( 0UL );
2375 
2376  for( ; (j+2UL) <= N; j+=2UL )
2377  {
2378  const size_t kbegin( ( IsLower_v<MT5> )
2379  ?( ( IsUpper_v<MT4> )
2380  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2381  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2382  :( IsUpper_v<MT4> ? i : 0UL ) );
2383  const size_t kend( ( IsUpper_v<MT5> )
2384  ?( ( IsLower_v<MT4> )
2385  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2386  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2387  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
2388 
2389  SIMDType xmm1( C.load(i ,j ) );
2390  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
2391  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
2392  SIMDType xmm4( C.load(i ,j+1UL) );
2393  SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
2394  SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
2395 
2396  for( size_t k=kbegin; k<kend; ++k ) {
2397  const SIMDType a1( A.load(i ,k) );
2398  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2399  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2400  const SIMDType b1( set( B(k,j ) ) );
2401  const SIMDType b2( set( B(k,j+1UL) ) );
2402  xmm1 += a1 * b1;
2403  xmm2 += a2 * b1;
2404  xmm3 += a3 * b1;
2405  xmm4 += a1 * b2;
2406  xmm5 += a2 * b2;
2407  xmm6 += a3 * b2;
2408  }
2409 
2410  C.store( i , j , xmm1 );
2411  C.store( i+SIMDSIZE , j , xmm2 );
2412  C.store( i+SIMDSIZE*2UL, j , xmm3 );
2413  C.store( i , j+1UL, xmm4 );
2414  C.store( i+SIMDSIZE , j+1UL, xmm5 );
2415  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2416  }
2417 
2418  if( j < N )
2419  {
2420  const size_t kbegin( ( IsLower_v<MT5> )
2421  ?( ( IsUpper_v<MT4> )
2422  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2423  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2424  :( IsUpper_v<MT4> ? i : 0UL ) );
2425  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
2426 
2427  SIMDType xmm1( C.load(i ,j) );
2428  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
2429  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
2430 
2431  for( size_t k=kbegin; k<kend; ++k ) {
2432  const SIMDType b1( set( B(k,j) ) );
2433  xmm1 += A.load(i ,k) * b1;
2434  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2435  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2436  }
2437 
2438  C.store( i , j, xmm1 );
2439  C.store( i+SIMDSIZE , j, xmm2 );
2440  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2441  }
2442  }
2443 
2444  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2445  {
2446  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
2447  size_t j( UPP ? i : 0UL );
2448 
2449  for( ; (j+4UL) <= jend; j+=4UL )
2450  {
2451  const size_t kbegin( ( IsLower_v<MT5> )
2452  ?( ( IsUpper_v<MT4> )
2453  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2454  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2455  :( IsUpper_v<MT4> ? i : 0UL ) );
2456  const size_t kend( ( IsUpper_v<MT5> )
2457  ?( ( IsLower_v<MT4> )
2458  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2459  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2460  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2461 
2462  SIMDType xmm1( C.load(i ,j ) );
2463  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
2464  SIMDType xmm3( C.load(i ,j+1UL) );
2465  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
2466  SIMDType xmm5( C.load(i ,j+2UL) );
2467  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
2468  SIMDType xmm7( C.load(i ,j+3UL) );
2469  SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
2470 
2471  for( size_t k=kbegin; k<kend; ++k ) {
2472  const SIMDType a1( A.load(i ,k) );
2473  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2474  const SIMDType b1( set( B(k,j ) ) );
2475  const SIMDType b2( set( B(k,j+1UL) ) );
2476  const SIMDType b3( set( B(k,j+2UL) ) );
2477  const SIMDType b4( set( B(k,j+3UL) ) );
2478  xmm1 += a1 * b1;
2479  xmm2 += a2 * b1;
2480  xmm3 += a1 * b2;
2481  xmm4 += a2 * b2;
2482  xmm5 += a1 * b3;
2483  xmm6 += a2 * b3;
2484  xmm7 += a1 * b4;
2485  xmm8 += a2 * b4;
2486  }
2487 
2488  C.store( i , j , xmm1 );
2489  C.store( i+SIMDSIZE, j , xmm2 );
2490  C.store( i , j+1UL, xmm3 );
2491  C.store( i+SIMDSIZE, j+1UL, xmm4 );
2492  C.store( i , j+2UL, xmm5 );
2493  C.store( i+SIMDSIZE, j+2UL, xmm6 );
2494  C.store( i , j+3UL, xmm7 );
2495  C.store( i+SIMDSIZE, j+3UL, xmm8 );
2496  }
2497 
2498  for( ; (j+3UL) <= jend; j+=3UL )
2499  {
2500  const size_t kbegin( ( IsLower_v<MT5> )
2501  ?( ( IsUpper_v<MT4> )
2502  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2503  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2504  :( IsUpper_v<MT4> ? i : 0UL ) );
2505  const size_t kend( ( IsUpper_v<MT5> )
2506  ?( ( IsLower_v<MT4> )
2507  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2508  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2509  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2510 
2511  SIMDType xmm1( C.load(i ,j ) );
2512  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
2513  SIMDType xmm3( C.load(i ,j+1UL) );
2514  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
2515  SIMDType xmm5( C.load(i ,j+2UL) );
2516  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
2517 
2518  for( size_t k=kbegin; k<kend; ++k ) {
2519  const SIMDType a1( A.load(i ,k) );
2520  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2521  const SIMDType b1( set( B(k,j ) ) );
2522  const SIMDType b2( set( B(k,j+1UL) ) );
2523  const SIMDType b3( set( B(k,j+2UL) ) );
2524  xmm1 += a1 * b1;
2525  xmm2 += a2 * b1;
2526  xmm3 += a1 * b2;
2527  xmm4 += a2 * b2;
2528  xmm5 += a1 * b3;
2529  xmm6 += a2 * b3;
2530  }
2531 
2532  C.store( i , j , xmm1 );
2533  C.store( i+SIMDSIZE, j , xmm2 );
2534  C.store( i , j+1UL, xmm3 );
2535  C.store( i+SIMDSIZE, j+1UL, xmm4 );
2536  C.store( i , j+2UL, xmm5 );
2537  C.store( i+SIMDSIZE, j+2UL, xmm6 );
2538  }
2539 
2540  for( ; (j+2UL) <= jend; j+=2UL )
2541  {
2542  const size_t kbegin( ( IsLower_v<MT5> )
2543  ?( ( IsUpper_v<MT4> )
2544  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2545  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2546  :( IsUpper_v<MT4> ? i : 0UL ) );
2547  const size_t kend( ( IsUpper_v<MT5> )
2548  ?( ( IsLower_v<MT4> )
2549  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2550  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2551  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2552 
2553  SIMDType xmm1( C.load(i ,j ) );
2554  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
2555  SIMDType xmm3( C.load(i ,j+1UL) );
2556  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
2557  SIMDType xmm5, xmm6, xmm7, xmm8;
2558  size_t k( kbegin );
2559 
 2560  for( ; (k+2UL) <= kend; k+=2UL ) {
2561  const SIMDType a1( A.load(i ,k ) );
2562  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2563  const SIMDType a3( A.load(i ,k+1UL) );
2564  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2565  const SIMDType b1( set( B(k ,j ) ) );
2566  const SIMDType b2( set( B(k ,j+1UL) ) );
2567  const SIMDType b3( set( B(k+1UL,j ) ) );
2568  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
2569  xmm1 += a1 * b1;
2570  xmm2 += a2 * b1;
2571  xmm3 += a1 * b2;
2572  xmm4 += a2 * b2;
2573  xmm5 += a3 * b3;
2574  xmm6 += a4 * b3;
2575  xmm7 += a3 * b4;
2576  xmm8 += a4 * b4;
2577  }
2578 
2579  for( ; k<kend; ++k ) {
2580  const SIMDType a1( A.load(i ,k) );
2581  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2582  const SIMDType b1( set( B(k,j ) ) );
2583  const SIMDType b2( set( B(k,j+1UL) ) );
2584  xmm1 += a1 * b1;
2585  xmm2 += a2 * b1;
2586  xmm3 += a1 * b2;
2587  xmm4 += a2 * b2;
2588  }
2589 
2590  C.store( i , j , xmm1+xmm5 );
2591  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
2592  C.store( i , j+1UL, xmm3+xmm7 );
2593  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2594  }
2595 
2596  if( j < jend )
2597  {
2598  const size_t kbegin( ( IsLower_v<MT5> )
2599  ?( ( IsUpper_v<MT4> )
2600  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2601  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2602  :( IsUpper_v<MT4> ? i : 0UL ) );
2603  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2604 
2605  SIMDType xmm1( C.load(i ,j) );
2606  SIMDType xmm2( C.load(i+SIMDSIZE,j) );
2607  SIMDType xmm3, xmm4;
2608  size_t k( kbegin );
2609 
2610  for( ; (k+2UL) <= kend; k+=2UL ) {
2611  const SIMDType b1( set( B(k ,j) ) );
2612  const SIMDType b2( set( B(k+1UL,j) ) );
2613  xmm1 += A.load(i ,k ) * b1;
2614  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2615  xmm3 += A.load(i ,k+1UL) * b2;
2616  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2617  }
2618 
2619  for( ; k<kend; ++k ) {
2620  const SIMDType b1( set( B(k,j) ) );
2621  xmm1 += A.load(i ,k) * b1;
2622  xmm2 += A.load(i+SIMDSIZE,k) * b1;
2623  }
2624 
2625  C.store( i , j, xmm1+xmm3 );
2626  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
2627  }
2628  }
2629 
2630  for( ; i<ipos; i+=SIMDSIZE )
2631  {
2632  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
2633  size_t j( UPP ? i : 0UL );
2634 
2635  for( ; (j+4UL) <= jend; j+=4UL )
2636  {
2637  const size_t kbegin( ( IsLower_v<MT5> )
2638  ?( ( IsUpper_v<MT4> )
2639  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2640  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2641  :( IsUpper_v<MT4> ? i : 0UL ) );
2642  const size_t kend( ( IsUpper_v<MT5> )
2643  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2644  :( K ) );
2645 
2646  SIMDType xmm1( C.load(i,j ) );
2647  SIMDType xmm2( C.load(i,j+1UL) );
2648  SIMDType xmm3( C.load(i,j+2UL) );
2649  SIMDType xmm4( C.load(i,j+3UL) );
2650  SIMDType xmm5, xmm6, xmm7, xmm8;
2651  size_t k( kbegin );
2652 
2653  for( ; (k+2UL) <= kend; k+=2UL ) {
2654  const SIMDType a1( A.load(i,k ) );
2655  const SIMDType a2( A.load(i,k+1UL) );
2656  xmm1 += a1 * set( B(k ,j ) );
2657  xmm2 += a1 * set( B(k ,j+1UL) );
2658  xmm3 += a1 * set( B(k ,j+2UL) );
2659  xmm4 += a1 * set( B(k ,j+3UL) );
2660  xmm5 += a2 * set( B(k+1UL,j ) );
2661  xmm6 += a2 * set( B(k+1UL,j+1UL) );
2662  xmm7 += a2 * set( B(k+1UL,j+2UL) );
2663  xmm8 += a2 * set( B(k+1UL,j+3UL) );
2664  }
2665 
2666  for( ; k<kend; ++k ) {
2667  const SIMDType a1( A.load(i,k) );
2668  xmm1 += a1 * set( B(k,j ) );
2669  xmm2 += a1 * set( B(k,j+1UL) );
2670  xmm3 += a1 * set( B(k,j+2UL) );
2671  xmm4 += a1 * set( B(k,j+3UL) );
2672  }
2673 
2674  C.store( i, j , xmm1+xmm5 );
2675  C.store( i, j+1UL, xmm2+xmm6 );
2676  C.store( i, j+2UL, xmm3+xmm7 );
2677  C.store( i, j+3UL, xmm4+xmm8 );
2678  }
2679 
2680  for( ; (j+3UL) <= jend; j+=3UL )
2681  {
2682  const size_t kbegin( ( IsLower_v<MT5> )
2683  ?( ( IsUpper_v<MT4> )
2684  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2685  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2686  :( IsUpper_v<MT4> ? i : 0UL ) );
2687  const size_t kend( ( IsUpper_v<MT5> )
2688  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2689  :( K ) );
2690 
2691  SIMDType xmm1( C.load(i,j ) );
2692  SIMDType xmm2( C.load(i,j+1UL) );
2693  SIMDType xmm3( C.load(i,j+2UL) );
2694  SIMDType xmm4, xmm5, xmm6;
2695  size_t k( kbegin );
2696 
2697  for( ; (k+2UL) <= kend; k+=2UL ) {
2698  const SIMDType a1( A.load(i,k ) );
2699  const SIMDType a2( A.load(i,k+1UL) );
2700  xmm1 += a1 * set( B(k ,j ) );
2701  xmm2 += a1 * set( B(k ,j+1UL) );
2702  xmm3 += a1 * set( B(k ,j+2UL) );
2703  xmm4 += a2 * set( B(k+1UL,j ) );
2704  xmm5 += a2 * set( B(k+1UL,j+1UL) );
2705  xmm6 += a2 * set( B(k+1UL,j+2UL) );
2706  }
2707 
2708  for( ; k<kend; ++k ) {
2709  const SIMDType a1( A.load(i,k) );
2710  xmm1 += a1 * set( B(k,j ) );
2711  xmm2 += a1 * set( B(k,j+1UL) );
2712  xmm3 += a1 * set( B(k,j+2UL) );
2713  }
2714 
2715  C.store( i, j , xmm1+xmm4 );
2716  C.store( i, j+1UL, xmm2+xmm5 );
2717  C.store( i, j+2UL, xmm3+xmm6 );
2718  }
2719 
2720  for( ; (j+2UL) <= jend; j+=2UL )
2721  {
2722  const size_t kbegin( ( IsLower_v<MT5> )
2723  ?( ( IsUpper_v<MT4> )
2724  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2725  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2726  :( IsUpper_v<MT4> ? i : 0UL ) );
2727  const size_t kend( ( IsUpper_v<MT5> )
2728  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2729  :( K ) );
2730 
2731  SIMDType xmm1( C.load(i,j ) );
2732  SIMDType xmm2( C.load(i,j+1UL) );
2733  SIMDType xmm3, xmm4;
2734  size_t k( kbegin );
2735 
2736  for( ; (k+2UL) <= kend; k+=2UL ) {
2737  const SIMDType a1( A.load(i,k ) );
2738  const SIMDType a2( A.load(i,k+1UL) );
2739  xmm1 += a1 * set( B(k ,j ) );
2740  xmm2 += a1 * set( B(k ,j+1UL) );
2741  xmm3 += a2 * set( B(k+1UL,j ) );
2742  xmm4 += a2 * set( B(k+1UL,j+1UL) );
2743  }
2744 
2745  for( ; k<kend; ++k ) {
2746  const SIMDType a1( A.load(i,k) );
2747  xmm1 += a1 * set( B(k,j ) );
2748  xmm2 += a1 * set( B(k,j+1UL) );
2749  }
2750 
2751  C.store( i, j , xmm1+xmm3 );
2752  C.store( i, j+1UL, xmm2+xmm4 );
2753  }
2754 
2755  if( j < jend )
2756  {
2757  const size_t kbegin( ( IsLower_v<MT5> )
2758  ?( ( IsUpper_v<MT4> )
2759  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2760  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2761  :( IsUpper_v<MT4> ? i : 0UL ) );
2762 
2763  SIMDType xmm1( C.load(i,j) );
2764  SIMDType xmm2;
2765  size_t k( kbegin );
2766 
2767  for( ; (k+2UL) <= K; k+=2UL ) {
2768  xmm1 += A.load(i,k ) * set( B(k ,j) );
2769  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
2770  }
2771 
2772  for( ; k<K; ++k ) {
2773  xmm1 += A.load(i,k) * set( B(k,j) );
2774  }
2775 
2776  C.store( i, j, xmm1+xmm2 );
2777  }
2778  }
2779 
2780  for( ; remainder && i<M; ++i )
2781  {
2782  const size_t jend( LOW ? i+1UL : N );
2783  size_t j( UPP ? i : 0UL );
2784 
2785  for( ; (j+2UL) <= jend; j+=2UL )
2786  {
2787  const size_t kbegin( ( IsLower_v<MT5> )
2788  ?( ( IsUpper_v<MT4> )
2789  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2790  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2791  :( IsUpper_v<MT4> ? i : 0UL ) );
2792  const size_t kend( ( IsUpper_v<MT5> )
2793  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2794  :( K ) );
2795 
2796  ElementType value1( C(i,j ) );
2797  ElementType value2( C(i,j+1UL) );
2798 
2799  for( size_t k=kbegin; k<kend; ++k ) {
2800  value1 += A(i,k) * B(k,j );
2801  value2 += A(i,k) * B(k,j+1UL);
2802  }
2803 
2804  C(i,j ) = value1;
2805  C(i,j+1UL) = value2;
2806  }
2807 
2808  if( j < jend )
2809  {
2810  const size_t kbegin( ( IsLower_v<MT5> )
2811  ?( ( IsUpper_v<MT4> )
2812  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2813  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2814  :( IsUpper_v<MT4> ? i : 0UL ) );
2815 
2816  ElementType value( C(i,j) );
2817 
2818  for( size_t k=kbegin; k<K; ++k ) {
2819  value += A(i,k) * B(k,j);
2820  }
2821 
2822  C(i,j) = value;
2823  }
2824  }
2825  }
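 //
 //  The i-loop above is unrolled in decreasing panel widths (8, 5, 4, 3, 2 and
 //  finally 1 SIMD vector per column of C), keeping as many partial results as
 //  possible in registers before they are written back via C.store(). Each panel
 //  follows the same innermost pattern, sketched here for one accumulator xmm_rc:
 //
 //     xmm_rc += A.load( i + r*SIMDSIZE, k ) * set( B( k, j+c ) );
 //
 //  The trailing scalar loop handles the remaining rows whenever the operands are
 //  not padded to a full SIMD boundary.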
2827  //**********************************************************************************************
2828 
2829  //**Default addition assignment to dense matrices (large matrices)******************************
2843  template< typename MT3 // Type of the left-hand side target matrix
2844  , typename MT4 // Type of the left-hand side matrix operand
2845  , typename MT5 > // Type of the right-hand side matrix operand
2846  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2847  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2848  {
2849  selectDefaultAddAssignKernel( C, A, B );
2850  }
2852  //**********************************************************************************************
2853 
2854  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
2869  template< typename MT3 // Type of the left-hand side target matrix
2870  , typename MT4 // Type of the left-hand side matrix operand
2871  , typename MT5 > // Type of the right-hand side matrix operand
2872  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2873  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2874  {
2875  if( LOW )
2876  lmmm( C, A, B, ElementType(1), ElementType(1) );
2877  else if( UPP )
2878  ummm( C, A, B, ElementType(1), ElementType(1) );
2879  else
2880  mmm( C, A, B, ElementType(1), ElementType(1) );
2881  }
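 //
 //  mmm(), lmmm() and ummm() are Blaze's blocked multiplication helpers; with both
 //  scale factors set to ElementType(1) the call amounts to C += A*B, and the
 //  lower/upper variants restrict the update to the triangular part of C that can
 //  be non-zero in the result.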
2883  //**********************************************************************************************
2884 
2885  //**BLAS-based addition assignment to dense matrices (default)**********************************
2899  template< typename MT3 // Type of the left-hand side target matrix
2900  , typename MT4 // Type of the left-hand side matrix operand
2901  , typename MT5 > // Type of the right-hand side matrix operand
2902  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2903  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2904  {
2905  selectLargeAddAssignKernel( C, A, B );
2906  }
2908  //**********************************************************************************************
2909 
2910  //**BLAS-based addition assignment to dense matrices********************************************
2911 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2912 
2925  template< typename MT3 // Type of the left-hand side target matrix
2926  , typename MT4 // Type of the left-hand side matrix operand
2927  , typename MT5 > // Type of the right-hand side matrix operand
2928  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2929  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2930  {
2931  using ET = ElementType_t<MT3>;
2932 
2933  if( IsTriangular_v<MT4> ) {
2934  ResultType_t<MT3> tmp( serial( B ) );
2935  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2936  addAssign( C, tmp );
2937  }
2938  else if( IsTriangular_v<MT5> ) {
2939  ResultType_t<MT3> tmp( serial( A ) );
2940  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2941  addAssign( C, tmp );
2942  }
2943  else {
2944  gemm( C, A, B, ET(1), ET(1) );
2945  }
2946  }
2948 #endif
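 //
 //  The BLAS path maps plain operands to gemm() with alpha = beta = ET(1), i.e.
 //  C = C + A*B. Triangular operands are routed to trmm() instead, which multiplies
 //  in place; hence the explicit temporary. Sketch for a lower triangular A:
 //
 //     tmp = B;  trmm( tmp, A, CblasLeft, CblasLower, ET(1) );  // tmp == A*B
 //     addAssign( C, tmp );                                     // C  += A*B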
2949  //**********************************************************************************************
2950 
2951  //**Restructuring addition assignment to row-major matrices*************************************
2966  template< typename MT > // Type of the target matrix
2967  friend inline auto addAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
2968  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2969  {
2971 
2973 
2974  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2975  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2976 
2977  const ForwardFunctor fwd;
2978 
2979  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
2980  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
2981  else if( IsSymmetric_v<MT1> )
2982  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
2983  else
2984  addAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
2985  }
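 //
 //  This overload only participates when symmetry can be exploited. Since
 //  trans(S) == S for a symmetric matrix S, the column-major operand can be
 //  replaced by its transpose view, producing a multiplication variant that maps
 //  better onto a row-major target. A hedged usage sketch, assuming standard Blaze
 //  adaptor types not defined in this file:
 //
 //     blaze::SymmetricMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > S( 50UL );
 //     blaze::DynamicMatrix<double,blaze::columnMajor> D( 50UL, 50UL );
 //     blaze::DynamicMatrix<double,blaze::rowMajor>    C( 50UL, 50UL );
 //     C += S * D;  // may be restructured into addAssign( C, trans(S) * D )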
2987  //**********************************************************************************************
2988 
2989  //**Addition assignment to sparse matrices******************************************************
2990  // No special implementation for the addition assignment to sparse matrices.
2991  //**********************************************************************************************
2992 
2993  //**Subtraction assignment to dense matrices****************************************************
3006  template< typename MT // Type of the target dense matrix
3007  , bool SO > // Storage order of the target dense matrix
3008  friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3009  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3010  {
3012 
3013  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3014  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3015 
3016  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3017  return;
3018  }
3019 
3020  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3021  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3022 
3023  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3024  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3025  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3026  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3027  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3028  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3029 
3030  TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3031  }
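 //
 //  Structurally identical to the corresponding addAssign() overload: both operands
 //  are evaluated into the LT/RT temporaries if they are computations or otherwise
 //  require evaluation, and the work is delegated to the kernel selection below.
 //  A statement such as C -= A * B with column-major dense operands ends up here.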
3033  //**********************************************************************************************
3034 
3035  //**Subtraction assignment to dense matrices (kernel selection)*********************************
3046  template< typename MT3 // Type of the left-hand side target matrix
3047  , typename MT4 // Type of the left-hand side matrix operand
3048  , typename MT5 > // Type of the right-hand side matrix operand
3049  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3050  {
3051  if( ( IsDiagonal_v<MT4> ) ||
3052  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
3053  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
3054  selectSmallSubAssignKernel( C, A, B );
3055  else
3056  selectBlasSubAssignKernel( C, A, B );
3057  }
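 //
 //  The dispatch mirrors the addition assignment: a diagonal left-hand side operand
 //  or a small product (in release builds, at most SIMDSIZE*10 rows, or fewer target
 //  elements than TDMATTDMATMULT_THRESHOLD) selects the small kernel; everything else
 //  is forwarded to the BLAS kernel, which falls back to the blocked large kernel when
 //  no suitable BLAS routine is available.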
3059  //**********************************************************************************************
3060 
3061  //**Default subtraction assignment to dense matrices (general/general)**************************
3075  template< typename MT3 // Type of the left-hand side target matrix
3076  , typename MT4 // Type of the left-hand side matrix operand
3077  , typename MT5 > // Type of the right-hand side matrix operand
3078  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3079  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3080  {
3081  const size_t M( A.rows() );
3082  const size_t N( B.columns() );
3083  const size_t K( A.columns() );
3084 
3085  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3086 
3087  for( size_t j=0UL; j<N; ++j )
3088  {
3089  const size_t kbegin( ( IsLower_v<MT5> )
3090  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3091  :( 0UL ) );
3092  const size_t kend( ( IsUpper_v<MT5> )
3093  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3094  :( K ) );
3095  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3096 
3097  for( size_t k=kbegin; k<kend; ++k )
3098  {
3099  const size_t ibegin( ( IsLower_v<MT4> )
3100  ?( ( IsStrictlyLower_v<MT4> )
3101  ?( LOW ? max(j,k+1UL) : k+1UL )
3102  :( LOW ? max(j,k) : k ) )
3103  :( LOW ? j : 0UL ) );
3104  const size_t iend( ( IsUpper_v<MT4> )
3105  ?( ( IsStrictlyUpper_v<MT4> )
3106  ?( UPP ? min(j+1UL,k) : k )
3107  :( UPP ? min(j,k)+1UL : k+1UL ) )
3108  :( UPP ? j+1UL : M ) );
3109 
3110  if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
3111  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3112 
3113  const size_t inum( iend - ibegin );
3114  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3115 
3116  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3117  C(i ,j) -= A(i ,k) * B(k,j);
3118  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3119  }
3120  if( ipos < iend ) {
3121  C(ipos,j) -= A(ipos,k) * B(k,j);
3122  }
3123  }
3124  }
3125  }
3127  //**********************************************************************************************
3128 
3129  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
3143  template< typename MT3 // Type of the left-hand side target matrix
3144  , typename MT4 // Type of the left-hand side matrix operand
3145  , typename MT5 > // Type of the right-hand side matrix operand
3146  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3147  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3148  {
3150 
3151  const size_t M( A.rows() );
3152  const size_t N( B.columns() );
3153 
3154  for( size_t j=0UL; j<N; ++j )
3155  {
3156  const size_t ibegin( ( IsLower_v<MT4> )
3157  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
3158  :( 0UL ) );
3159  const size_t iend( ( IsUpper_v<MT4> )
3160  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
3161  :( M ) );
3162  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3163 
3164  const size_t inum( iend - ibegin );
3165  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3166 
3167  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3168  C(i ,j) -= A(i ,j) * B(j,j);
3169  C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
3170  }
3171  if( ipos < iend ) {
3172  C(ipos,j) -= A(ipos,j) * B(j,j);
3173  }
3174  }
3175  }
3177  //**********************************************************************************************
3178 
3179  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
3193  template< typename MT3 // Type of the left-hand side target matrix
3194  , typename MT4 // Type of the left-hand side matrix operand
3195  , typename MT5 > // Type of the right-hand side matrix operand
3196  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3197  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3198  {
3200 
3201  const size_t M( A.rows() );
3202  const size_t N( B.columns() );
3203 
3204  for( size_t j=0UL; j<N; ++j )
3205  {
3206  const size_t ibegin( ( IsLower_v<MT5> )
3207  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3208  :( 0UL ) );
3209  const size_t iend( ( IsUpper_v<MT5> )
3210  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3211  :( M ) );
3212  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3213 
3214  const size_t inum( iend - ibegin );
3215  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3216 
3217  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3218  C(i ,j) -= A(i ,i ) * B(i ,j);
3219  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
3220  }
3221  if( ipos < iend ) {
3222  C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
3223  }
3224  }
3225  }
3227  //**********************************************************************************************
3228 
3229  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
3243  template< typename MT3 // Type of the left-hand side target matrix
3244  , typename MT4 // Type of the left-hand side matrix operand
3245  , typename MT5 > // Type of the right-hand side matrix operand
3246  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3247  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3248  {
3250 
3251  for( size_t i=0UL; i<A.rows(); ++i ) {
3252  C(i,i) -= A(i,i) * B(i,i);
3253  }
3254  }
3256  //**********************************************************************************************
3257 
3258  //**Default subtraction assignment to dense matrices (small matrices)***************************
3272  template< typename MT3 // Type of the left-hand side target matrix
3273  , typename MT4 // Type of the left-hand side matrix operand
3274  , typename MT5 > // Type of the right-hand side matrix operand
3275  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3276  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3277  {
3278  selectDefaultSubAssignKernel( C, A, B );
3279  }
3281  //**********************************************************************************************
3282 
3283  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
3298  template< typename MT3 // Type of the left-hand side target matrix
3299  , typename MT4 // Type of the left-hand side matrix operand
3300  , typename MT5 > // Type of the right-hand side matrix operand
3301  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3302  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3303  {
3306  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
3307  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
3308 
3309  const ForwardFunctor fwd;
3310 
3311  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
3312  const OppositeType_t<MT5> tmp( serial( B ) );
3313  subAssign( C, fwd( A * tmp ) );
3314  }
3315  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
3316  const OppositeType_t<MT4> tmp( serial( A ) );
3317  subAssign( C, fwd( tmp * B ) );
3318  }
3319  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3320  const OppositeType_t<MT5> tmp( serial( B ) );
3321  subAssign( C, fwd( A * tmp ) );
3322  }
3323  else {
3324  const OppositeType_t<MT4> tmp( serial( A ) );
3325  subAssign( C, fwd( tmp * B ) );
3326  }
3327  }
3329  //**********************************************************************************************
3330 
3331  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
3346  template< typename MT3 // Type of the left-hand side target matrix
3347  , typename MT4 // Type of the left-hand side matrix operand
3348  , typename MT5 > // Type of the right-hand side matrix operand
3349  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3350  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3351  {
3352  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
3353 
3354  const size_t M( A.rows() );
3355  const size_t N( B.columns() );
3356  const size_t K( A.columns() );
3357 
3358  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3359 
3360  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3361  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3362 
3363  size_t i( 0UL );
3364 
3365  if( IsIntegral_v<ElementType> )
3366  {
3367  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3368  for( size_t j=0UL; j<N; ++j )
3369  {
3370  const size_t kbegin( ( IsLower_v<MT5> )
3371  ?( ( IsUpper_v<MT4> )
3372  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3373  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3374  :( IsUpper_v<MT4> ? i : 0UL ) );
3375  const size_t kend( ( IsUpper_v<MT5> )
3376  ?( ( IsLower_v<MT4> )
3377  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3378  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
3379  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
3380 
3381  SIMDType xmm1( C.load(i ,j) );
3382  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3383  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3384  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3385  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
3386  SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
3387  SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
3388  SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
3389 
3390  for( size_t k=kbegin; k<kend; ++k ) {
3391  const SIMDType b1( set( B(k,j) ) );
3392  xmm1 -= A.load(i ,k) * b1;
3393  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3394  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3395  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3396  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3397  xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
3398  xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
3399  xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
3400  }
3401 
3402  C.store( i , j, xmm1 );
3403  C.store( i+SIMDSIZE , j, xmm2 );
3404  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3405  C.store( i+SIMDSIZE*3UL, j, xmm4 );
3406  C.store( i+SIMDSIZE*4UL, j, xmm5 );
3407  C.store( i+SIMDSIZE*5UL, j, xmm6 );
3408  C.store( i+SIMDSIZE*6UL, j, xmm7 );
3409  C.store( i+SIMDSIZE*7UL, j, xmm8 );
3410  }
3411  }
3412  }
3413 
3414  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3415  {
3416  size_t j( 0UL );
3417 
3418  for( ; (j+2UL) <= N; j+=2UL )
3419  {
3420  const size_t kbegin( ( IsLower_v<MT5> )
3421  ?( ( IsUpper_v<MT4> )
3422  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3423  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3424  :( IsUpper_v<MT4> ? i : 0UL ) );
3425  const size_t kend( ( IsUpper_v<MT5> )
3426  ?( ( IsLower_v<MT4> )
3427  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3428  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3429  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
3430 
3431  SIMDType xmm1 ( C.load(i ,j ) );
3432  SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
3433  SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
3434  SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
3435  SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
3436  SIMDType xmm6 ( C.load(i ,j+1UL) );
3437  SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
3438  SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
3439  SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
3440  SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
3441 
3442  for( size_t k=kbegin; k<kend; ++k ) {
3443  const SIMDType a1( A.load(i ,k) );
3444  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3445  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3446  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3447  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3448  const SIMDType b1( set( B(k,j ) ) );
3449  const SIMDType b2( set( B(k,j+1UL) ) );
3450  xmm1 -= a1 * b1;
3451  xmm2 -= a2 * b1;
3452  xmm3 -= a3 * b1;
3453  xmm4 -= a4 * b1;
3454  xmm5 -= a5 * b1;
3455  xmm6 -= a1 * b2;
3456  xmm7 -= a2 * b2;
3457  xmm8 -= a3 * b2;
3458  xmm9 -= a4 * b2;
3459  xmm10 -= a5 * b2;
3460  }
3461 
3462  C.store( i , j , xmm1 );
3463  C.store( i+SIMDSIZE , j , xmm2 );
3464  C.store( i+SIMDSIZE*2UL, j , xmm3 );
3465  C.store( i+SIMDSIZE*3UL, j , xmm4 );
3466  C.store( i+SIMDSIZE*4UL, j , xmm5 );
3467  C.store( i , j+1UL, xmm6 );
3468  C.store( i+SIMDSIZE , j+1UL, xmm7 );
3469  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3470  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3471  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
3472  }
3473 
3474  if( j < N )
3475  {
3476  const size_t kbegin( ( IsLower_v<MT5> )
3477  ?( ( IsUpper_v<MT4> )
3478  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3479  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3480  :( IsUpper_v<MT4> ? i : 0UL ) );
3481  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
3482 
3483  SIMDType xmm1( C.load(i ,j) );
3484  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3485  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3486  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3487  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
3488 
3489  for( size_t k=kbegin; k<kend; ++k ) {
3490  const SIMDType b1( set( B(k,j) ) );
3491  xmm1 -= A.load(i ,k) * b1;
3492  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3493  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3494  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3495  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3496  }
3497 
3498  C.store( i , j, xmm1 );
3499  C.store( i+SIMDSIZE , j, xmm2 );
3500  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3501  C.store( i+SIMDSIZE*3UL, j, xmm4 );
3502  C.store( i+SIMDSIZE*4UL, j, xmm5 );
3503  }
3504  }
3505 
3506  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3507  {
3508  size_t j( 0UL );
3509 
3510  for( ; (j+2UL) <= N; j+=2UL )
3511  {
3512  const size_t kbegin( ( IsLower_v<MT5> )
3513  ?( ( IsUpper_v<MT4> )
3514  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3515  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3516  :( IsUpper_v<MT4> ? i : 0UL ) );
3517  const size_t kend( ( IsUpper_v<MT5> )
3518  ?( ( IsLower_v<MT4> )
3519  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3520  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3521  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
3522 
3523  SIMDType xmm1( C.load(i ,j ) );
3524  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
3525  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
3526  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
3527  SIMDType xmm5( C.load(i ,j+1UL) );
3528  SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
3529  SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
3530  SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
3531 
3532  for( size_t k=kbegin; k<kend; ++k ) {
3533  const SIMDType a1( A.load(i ,k) );
3534  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3535  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3536  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3537  const SIMDType b1( set( B(k,j ) ) );
3538  const SIMDType b2( set( B(k,j+1UL) ) );
3539  xmm1 -= a1 * b1;
3540  xmm2 -= a2 * b1;
3541  xmm3 -= a3 * b1;
3542  xmm4 -= a4 * b1;
3543  xmm5 -= a1 * b2;
3544  xmm6 -= a2 * b2;
3545  xmm7 -= a3 * b2;
3546  xmm8 -= a4 * b2;
3547  }
3548 
3549  C.store( i , j , xmm1 );
3550  C.store( i+SIMDSIZE , j , xmm2 );
3551  C.store( i+SIMDSIZE*2UL, j , xmm3 );
3552  C.store( i+SIMDSIZE*3UL, j , xmm4 );
3553  C.store( i , j+1UL, xmm5 );
3554  C.store( i+SIMDSIZE , j+1UL, xmm6 );
3555  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3556  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3557  }
3558 
3559  if( j < N )
3560  {
3561  const size_t kbegin( ( IsLower_v<MT5> )
3562  ?( ( IsUpper_v<MT4> )
3563  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3564  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3565  :( IsUpper_v<MT4> ? i : 0UL ) );
3566  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3567 
3568  SIMDType xmm1( C.load(i ,j) );
3569  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3570  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3571  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3572 
3573  for( size_t k=kbegin; k<kend; ++k ) {
3574  const SIMDType b1( set( B(k,j) ) );
3575  xmm1 -= A.load(i ,k) * b1;
3576  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3577  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3578  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3579  }
3580 
3581  C.store( i , j, xmm1 );
3582  C.store( i+SIMDSIZE , j, xmm2 );
3583  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3584  C.store( i+SIMDSIZE*3UL, j, xmm4 );
3585  }
3586  }
3587 
3588  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3589  {
3590  size_t j( 0UL );
3591 
3592  for( ; (j+2UL) <= N; j+=2UL )
3593  {
3594  const size_t kbegin( ( IsLower_v<MT5> )
3595  ?( ( IsUpper_v<MT4> )
3596  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3597  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3598  :( IsUpper_v<MT4> ? i : 0UL ) );
3599  const size_t kend( ( IsUpper_v<MT5> )
3600  ?( ( IsLower_v<MT4> )
3601  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3602  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3603  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
3604 
3605  SIMDType xmm1( C.load(i ,j ) );
3606  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
3607  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
3608  SIMDType xmm4( C.load(i ,j+1UL) );
3609  SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
3610  SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
3611 
3612  for( size_t k=kbegin; k<kend; ++k ) {
3613  const SIMDType a1( A.load(i ,k) );
3614  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3615  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3616  const SIMDType b1( set( B(k,j ) ) );
3617  const SIMDType b2( set( B(k,j+1UL) ) );
3618  xmm1 -= a1 * b1;
3619  xmm2 -= a2 * b1;
3620  xmm3 -= a3 * b1;
3621  xmm4 -= a1 * b2;
3622  xmm5 -= a2 * b2;
3623  xmm6 -= a3 * b2;
3624  }
3625 
3626  C.store( i , j , xmm1 );
3627  C.store( i+SIMDSIZE , j , xmm2 );
3628  C.store( i+SIMDSIZE*2UL, j , xmm3 );
3629  C.store( i , j+1UL, xmm4 );
3630  C.store( i+SIMDSIZE , j+1UL, xmm5 );
3631  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
3632  }
3633 
3634  if( j < N )
3635  {
3636  const size_t kbegin( ( IsLower_v<MT5> )
3637  ?( ( IsUpper_v<MT4> )
3638  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3639  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3640  :( IsUpper_v<MT4> ? i : 0UL ) );
3641  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
3642 
3643  SIMDType xmm1( C.load(i ,j) );
3644  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3645  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3646 
3647  for( size_t k=kbegin; k<kend; ++k ) {
3648  const SIMDType b1( set( B(k,j) ) );
3649  xmm1 -= A.load(i ,k) * b1;
3650  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3651  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3652  }
3653 
3654  C.store( i , j, xmm1 );
3655  C.store( i+SIMDSIZE , j, xmm2 );
3656  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3657  }
3658  }
3659 
3660  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3661  {
3662  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
3663  size_t j( UPP ? i : 0UL );
3664 
3665  for( ; (j+4UL) <= jend; j+=4UL )
3666  {
3667  const size_t kbegin( ( IsLower_v<MT5> )
3668  ?( ( IsUpper_v<MT4> )
3669  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3670  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3671  :( IsUpper_v<MT4> ? i : 0UL ) );
3672  const size_t kend( ( IsUpper_v<MT5> )
3673  ?( ( IsLower_v<MT4> )
3674  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
3675  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
3676  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
3677 
3678  SIMDType xmm1( C.load(i ,j ) );
3679  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
3680  SIMDType xmm3( C.load(i ,j+1UL) );
3681  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
3682  SIMDType xmm5( C.load(i ,j+2UL) );
3683  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
3684  SIMDType xmm7( C.load(i ,j+3UL) );
3685  SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
3686 
3687  for( size_t k=kbegin; k<kend; ++k ) {
3688  const SIMDType a1( A.load(i ,k) );
3689  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3690  const SIMDType b1( set( B(k,j ) ) );
3691  const SIMDType b2( set( B(k,j+1UL) ) );
3692  const SIMDType b3( set( B(k,j+2UL) ) );
3693  const SIMDType b4( set( B(k,j+3UL) ) );
3694  xmm1 -= a1 * b1;
3695  xmm2 -= a2 * b1;
3696  xmm3 -= a1 * b2;
3697  xmm4 -= a2 * b2;
3698  xmm5 -= a1 * b3;
3699  xmm6 -= a2 * b3;
3700  xmm7 -= a1 * b4;
3701  xmm8 -= a2 * b4;
3702  }
3703 
3704  C.store( i , j , xmm1 );
3705  C.store( i+SIMDSIZE, j , xmm2 );
3706  C.store( i , j+1UL, xmm3 );
3707  C.store( i+SIMDSIZE, j+1UL, xmm4 );
3708  C.store( i , j+2UL, xmm5 );
3709  C.store( i+SIMDSIZE, j+2UL, xmm6 );
3710  C.store( i , j+3UL, xmm7 );
3711  C.store( i+SIMDSIZE, j+3UL, xmm8 );
3712  }
3713 
3714  for( ; (j+3UL) <= jend; j+=3UL )
3715  {
3716  const size_t kbegin( ( IsLower_v<MT5> )
3717  ?( ( IsUpper_v<MT4> )
3718  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3719  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3720  :( IsUpper_v<MT4> ? i : 0UL ) );
3721  const size_t kend( ( IsUpper_v<MT5> )
3722  ?( ( IsLower_v<MT4> )
3723  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
3724  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
3725  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
3726 
3727  SIMDType xmm1( C.load(i ,j ) );
3728  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
3729  SIMDType xmm3( C.load(i ,j+1UL) );
3730  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
3731  SIMDType xmm5( C.load(i ,j+2UL) );
3732  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
3733 
3734  for( size_t k=kbegin; k<kend; ++k ) {
3735  const SIMDType a1( A.load(i ,k) );
3736  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3737  const SIMDType b1( set( B(k,j ) ) );
3738  const SIMDType b2( set( B(k,j+1UL) ) );
3739  const SIMDType b3( set( B(k,j+2UL) ) );
3740  xmm1 -= a1 * b1;
3741  xmm2 -= a2 * b1;
3742  xmm3 -= a1 * b2;
3743  xmm4 -= a2 * b2;
3744  xmm5 -= a1 * b3;
3745  xmm6 -= a2 * b3;
3746  }
3747 
3748  C.store( i , j , xmm1 );
3749  C.store( i+SIMDSIZE, j , xmm2 );
3750  C.store( i , j+1UL, xmm3 );
3751  C.store( i+SIMDSIZE, j+1UL, xmm4 );
3752  C.store( i , j+2UL, xmm5 );
3753  C.store( i+SIMDSIZE, j+2UL, xmm6 );
3754  }
3755 
3756  for( ; (j+2UL) <= jend; j+=2UL )
3757  {
3758  const size_t kbegin( ( IsLower_v<MT5> )
3759  ?( ( IsUpper_v<MT4> )
3760  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3761  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3762  :( IsUpper_v<MT4> ? i : 0UL ) );
3763  const size_t kend( ( IsUpper_v<MT5> )
3764  ?( ( IsLower_v<MT4> )
3765  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3766  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3767  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
3768 
3769  SIMDType xmm1( C.load(i ,j ) );
3770  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
3771  SIMDType xmm3( C.load(i ,j+1UL) );
3772  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
3773  SIMDType xmm5, xmm6, xmm7, xmm8;
3774  size_t k( kbegin );
3775 
3776  for( ; (k+2UL) <= kend; k+=2UL ) {
3777  const SIMDType a1( A.load(i ,k ) );
3778  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
3779  const SIMDType a3( A.load(i ,k+1UL) );
3780  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
3781  const SIMDType b1( set( B(k ,j ) ) );
3782  const SIMDType b2( set( B(k ,j+1UL) ) );
3783  const SIMDType b3( set( B(k+1UL,j ) ) );
3784  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
3785  xmm1 -= a1 * b1;
3786  xmm2 -= a2 * b1;
3787  xmm3 -= a1 * b2;
3788  xmm4 -= a2 * b2;
3789  xmm5 -= a3 * b3;
3790  xmm6 -= a4 * b3;
3791  xmm7 -= a3 * b4;
3792  xmm8 -= a4 * b4;
3793  }
3794 
3795  for( ; k<kend; ++k ) {
3796  const SIMDType a1( A.load(i ,k) );
3797  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3798  const SIMDType b1( set( B(k,j ) ) );
3799  const SIMDType b2( set( B(k,j+1UL) ) );
3800  xmm1 -= a1 * b1;
3801  xmm2 -= a2 * b1;
3802  xmm3 -= a1 * b2;
3803  xmm4 -= a2 * b2;
3804  }
3805 
3806  C.store( i , j , xmm1+xmm5 );
3807  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
3808  C.store( i , j+1UL, xmm3+xmm7 );
3809  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
3810  }
3811 
3812  if( j < jend )
3813  {
3814  const size_t kbegin( ( IsLower_v<MT5> )
3815  ?( ( IsUpper_v<MT4> )
3816  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3817  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3818  :( IsUpper_v<MT4> ? i : 0UL ) );
3819  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
3820 
3821  SIMDType xmm1( C.load(i ,j) );
3822  SIMDType xmm2( C.load(i+SIMDSIZE,j) );
3823  SIMDType xmm3, xmm4;
3824  size_t k( kbegin );
3825 
3826  for( ; (k+2UL) <= kend; k+=2UL ) {
3827  const SIMDType b1( set( B(k ,j) ) );
3828  const SIMDType b2( set( B(k+1UL,j) ) );
3829  xmm1 -= A.load(i ,k ) * b1;
3830  xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
3831  xmm3 -= A.load(i ,k+1UL) * b2;
3832  xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
3833  }
3834 
3835  for( ; k<kend; ++k ) {
3836  const SIMDType b1( set( B(k,j) ) );
3837  xmm1 -= A.load(i ,k) * b1;
3838  xmm2 -= A.load(i+SIMDSIZE,k) * b1;
3839  }
3840 
3841  C.store( i , j, xmm1+xmm3 );
3842  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
3843  }
3844  }
3845 
3846  for( ; i<ipos; i+=SIMDSIZE )
3847  {
3848  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
3849  size_t j( UPP ? i : 0UL );
3850 
3851  for( ; (j+4UL) <= jend; j+=4UL )
3852  {
3853  const size_t kbegin( ( IsLower_v<MT5> )
3854  ?( ( IsUpper_v<MT4> )
3855  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3856  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3857  :( IsUpper_v<MT4> ? i : 0UL ) );
3858  const size_t kend( ( IsUpper_v<MT5> )
3859  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
3860  :( K ) );
3861 
3862  SIMDType xmm1( C.load(i,j ) );
3863  SIMDType xmm2( C.load(i,j+1UL) );
3864  SIMDType xmm3( C.load(i,j+2UL) );
3865  SIMDType xmm4( C.load(i,j+3UL) );
3866  SIMDType xmm5, xmm6, xmm7, xmm8;
3867  size_t k( kbegin );
3868 
3869  for( ; (k+2UL) <= kend; k+=2UL ) {
3870  const SIMDType a1( A.load(i,k ) );
3871  const SIMDType a2( A.load(i,k+1UL) );
3872  xmm1 -= a1 * set( B(k ,j ) );
3873  xmm2 -= a1 * set( B(k ,j+1UL) );
3874  xmm3 -= a1 * set( B(k ,j+2UL) );
3875  xmm4 -= a1 * set( B(k ,j+3UL) );
3876  xmm5 -= a2 * set( B(k+1UL,j ) );
3877  xmm6 -= a2 * set( B(k+1UL,j+1UL) );
3878  xmm7 -= a2 * set( B(k+1UL,j+2UL) );
3879  xmm8 -= a2 * set( B(k+1UL,j+3UL) );
3880  }
3881 
3882  for( ; k<kend; ++k ) {
3883  const SIMDType a1( A.load(i,k) );
3884  xmm1 -= a1 * set( B(k,j ) );
3885  xmm2 -= a1 * set( B(k,j+1UL) );
3886  xmm3 -= a1 * set( B(k,j+2UL) );
3887  xmm4 -= a1 * set( B(k,j+3UL) );
3888  }
3889 
3890  C.store( i, j , xmm1+xmm5 );
3891  C.store( i, j+1UL, xmm2+xmm6 );
3892  C.store( i, j+2UL, xmm3+xmm7 );
3893  C.store( i, j+3UL, xmm4+xmm8 );
3894  }
3895 
3896  for( ; (j+3UL) <= jend; j+=3UL )
3897  {
3898  const size_t kbegin( ( IsLower_v<MT5> )
3899  ?( ( IsUpper_v<MT4> )
3900  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3901  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3902  :( IsUpper_v<MT4> ? i : 0UL ) );
3903  const size_t kend( ( IsUpper_v<MT5> )
3904  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
3905  :( K ) );
3906 
3907  SIMDType xmm1( C.load(i,j ) );
3908  SIMDType xmm2( C.load(i,j+1UL) );
3909  SIMDType xmm3( C.load(i,j+2UL) );
3910  SIMDType xmm4, xmm5, xmm6;
3911  size_t k( kbegin );
3912 
3913  for( ; (k+2UL) <= kend; k+=2UL ) {
3914  const SIMDType a1( A.load(i,k ) );
3915  const SIMDType a2( A.load(i,k+1UL) );
3916  xmm1 -= a1 * set( B(k ,j ) );
3917  xmm2 -= a1 * set( B(k ,j+1UL) );
3918  xmm3 -= a1 * set( B(k ,j+2UL) );
3919  xmm4 -= a2 * set( B(k+1UL,j ) );
3920  xmm5 -= a2 * set( B(k+1UL,j+1UL) );
3921  xmm6 -= a2 * set( B(k+1UL,j+2UL) );
3922  }
3923 
3924  for( ; k<kend; ++k ) {
3925  const SIMDType a1( A.load(i,k) );
3926  xmm1 -= a1 * set( B(k,j ) );
3927  xmm2 -= a1 * set( B(k,j+1UL) );
3928  xmm3 -= a1 * set( B(k,j+2UL) );
3929  }
3930 
3931  C.store( i, j , xmm1+xmm4 );
3932  C.store( i, j+1UL, xmm2+xmm5 );
3933  C.store( i, j+2UL, xmm3+xmm6 );
3934  }
3935 
3936  for( ; (j+2UL) <= jend; j+=2UL )
3937  {
3938  const size_t kbegin( ( IsLower_v<MT5> )
3939  ?( ( IsUpper_v<MT4> )
3940  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3941  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3942  :( IsUpper_v<MT4> ? i : 0UL ) );
3943  const size_t kend( ( IsUpper_v<MT5> )
3944  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
3945  :( K ) );
3946 
3947  SIMDType xmm1( C.load(i,j ) );
3948  SIMDType xmm2( C.load(i,j+1UL) );
3949  SIMDType xmm3, xmm4;
3950  size_t k( kbegin );
3951 
3952  for( ; (k+2UL) <= kend; k+=2UL ) {
3953  const SIMDType a1( A.load(i,k ) );
3954  const SIMDType a2( A.load(i,k+1UL) );
3955  xmm1 -= a1 * set( B(k ,j ) );
3956  xmm2 -= a1 * set( B(k ,j+1UL) );
3957  xmm3 -= a2 * set( B(k+1UL,j ) );
3958  xmm4 -= a2 * set( B(k+1UL,j+1UL) );
3959  }
3960 
3961  for( ; k<kend; ++k ) {
3962  const SIMDType a1( A.load(i,k) );
3963  xmm1 -= a1 * set( B(k,j ) );
3964  xmm2 -= a1 * set( B(k,j+1UL) );
3965  }
3966 
3967  C.store( i, j , xmm1+xmm3 );
3968  C.store( i, j+1UL, xmm2+xmm4 );
3969  }
3970 
3971  if( j < jend )
3972  {
3973  const size_t kbegin( ( IsLower_v<MT5> )
3974  ?( ( IsUpper_v<MT4> )
3975  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3976  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3977  :( IsUpper_v<MT4> ? i : 0UL ) );
3978 
3979  SIMDType xmm1( C.load(i,j) );
3980  SIMDType xmm2;
3981  size_t k( kbegin );
3982 
3983  for( ; (k+2UL) <= K; k+=2UL ) {
3984  xmm1 -= A.load(i,k ) * set( B(k ,j) );
3985  xmm2 -= A.load(i,k+1UL) * set( B(k+1UL,j) );
3986  }
3987 
3988  for( ; k<K; ++k ) {
3989  xmm1 -= A.load(i,k) * set( B(k,j) );
3990  }
3991 
3992  C.store( i, j, xmm1+xmm2 );
3993  }
3994  }
3995 
3996  for( ; remainder && i<M; ++i )
3997  {
3998  const size_t jend( LOW ? i+1UL : N );
3999  size_t j( UPP ? i : 0UL );
4000 
4001  for( ; (j+2UL) <= jend; j+=2UL )
4002  {
4003  const size_t kbegin( ( IsLower_v<MT5> )
4004  ?( ( IsUpper_v<MT4> )
4005  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4006  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4007  :( IsUpper_v<MT4> ? i : 0UL ) );
4008  const size_t kend( ( IsUpper_v<MT5> )
4009  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4010  :( K ) );
4011 
4012  ElementType value1( C(i,j ) );
4013  ElementType value2( C(i,j+1UL) );
4014 
4015  for( size_t k=kbegin; k<kend; ++k ) {
4016  value1 -= A(i,k) * B(k,j );
4017  value2 -= A(i,k) * B(k,j+1UL);
4018  }
4019 
4020  C(i,j ) = value1;
4021  C(i,j+1UL) = value2;
4022  }
4023 
4024  if( j < jend )
4025  {
4026  const size_t kbegin( ( IsLower_v<MT5> )
4027  ?( ( IsUpper_v<MT4> )
4028  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4029  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4030  :( IsUpper_v<MT4> ? i : 0UL ) );
4031 
4032  ElementType value( C(i,j) );
4033 
4034  for( size_t k=kbegin; k<K; ++k ) {
4035  value -= A(i,k) * B(k,j);
4036  }
4037 
4038  C(i,j) = value;
4039  }
4040  }
4041  }
4043  //**********************************************************************************************
4044 
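   //**Illustrative sketch: structure of the small subtraction assignment kernel******************
   // The vectorized loops above update the target in register blocks (e.g. two SIMD vectors times
   // up to four columns). Ignoring the triangular index bounds, each block performs the following
   // scalar-equivalent update, where blockRows is a hypothetical name for the 2*SIMDSIZE rows
   // covered by one block:
   //
   // \code
   //    for( size_t ii=i; ii<i+blockRows; ++ii )
   //       for( size_t jj=j; jj<j+4UL; ++jj )
   //          for( size_t k=kbegin; k<kend; ++k )
   //             C(ii,jj) -= A(ii,k) * B(k,jj);
   // \endcode
   //**********************************************************************************************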
4045  //**Default subtraction assignment to dense matrices (large matrices)***************************
4059  template< typename MT3 // Type of the left-hand side target matrix
4060  , typename MT4 // Type of the left-hand side matrix operand
4061  , typename MT5 > // Type of the right-hand side matrix operand
4062  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4063  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4064  {
4065  selectDefaultSubAssignKernel( C, A, B );
4066  }
4068  //**********************************************************************************************
4069 
4070  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
4085  template< typename MT3 // Type of the left-hand side target matrix
4086  , typename MT4 // Type of the left-hand side matrix operand
4087  , typename MT5 > // Type of the right-hand side matrix operand
4088  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4089  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4090  {
4091  if( LOW )
4092  lmmm( C, A, B, ElementType(-1), ElementType(1) );
4093  else if( UPP )
4094  ummm( C, A, B, ElementType(-1), ElementType(1) );
4095  else
4096  mmm( C, A, B, ElementType(-1), ElementType(1) );
4097  }
4099  //**********************************************************************************************
4100 
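   //**Illustrative sketch: semantics of the mmm/lmmm/ummm calls**********************************
   // The blocked kernels mmm(), lmmm(), and ummm() evaluate C = alpha*A*B + beta*C (lmmm/ummm
   // restricted to the lower/upper part of C). With alpha = ElementType(-1) and beta =
   // ElementType(1) the calls above therefore realize the subtraction assignment C -= A*B.
   // A scalar reference of the same update (hypothetical helper, assuming matching sizes):
   //
   // \code
   //    template< typename MT3, typename MT4, typename MT5, typename T >
   //    void mmmReference( MT3& C, const MT4& A, const MT5& B, T alpha, T beta ) {
   //       for( size_t i=0UL; i<C.rows(); ++i )
   //          for( size_t j=0UL; j<C.columns(); ++j ) {
   //             T sum{};
   //             for( size_t k=0UL; k<A.columns(); ++k )
   //                sum += A(i,k) * B(k,j);
   //             C(i,j) = alpha*sum + beta*C(i,j);
   //          }
   //    }
   // \endcode
   //**********************************************************************************************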
4101  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
4115  template< typename MT3 // Type of the left-hand side target matrix
4116  , typename MT4 // Type of the left-hand side matrix operand
4117  , typename MT5 > // Type of the right-hand side matrix operand
4118  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4119  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4120  {
4121  selectLargeSubAssignKernel( C, A, B );
4122  }
4124  //**********************************************************************************************
4125 
 4126  //**BLAS-based subtraction assignment to dense matrices*****************************************
4127 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4128 
4141  template< typename MT3 // Type of the left-hand side target matrix
4142  , typename MT4 // Type of the left-hand side matrix operand
4143  , typename MT5 > // Type of the right-hand side matrix operand
4144  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4145  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4146  {
4147  using ET = ElementType_t<MT3>;
4148 
4149  if( IsTriangular_v<MT4> ) {
4150  ResultType_t<MT3> tmp( serial( B ) );
4151  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4152  subAssign( C, tmp );
4153  }
4154  else if( IsTriangular_v<MT5> ) {
4155  ResultType_t<MT3> tmp( serial( A ) );
4156  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4157  subAssign( C, tmp );
4158  }
4159  else {
4160  gemm( C, A, B, ET(-1), ET(1) );
4161  }
4162  }
4164 #endif
4165  //**********************************************************************************************
4166 
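   //**Illustrative sketch: effect of the BLAS-based subtraction assignment***********************
   // For BLAS-compatible element types the kernel above maps the subtraction assignment onto a
   // single gemm() call with alpha = -1 and beta = 1, i.e. C := C - A*B. If one operand is
   // triangular, the product is instead formed via trmm() into a temporary (since TRMM scales its
   // target in place) and subsequently subtracted from C. From the user's perspective both paths
   // are triggered by the same expression (sizes are assumed to match):
   //
   // \code
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 256UL, 256UL ), B( 256UL, 256UL ), C( 256UL, 256UL );
   //    // ... initialize A, B, and C ...
   //    C -= A * B;  // large operands with BLAS mode enabled end up in the kernel above
   // \endcode
   //**********************************************************************************************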
4167  //**Restructuring subtraction assignment to row-major matrices**********************************
4183  template< typename MT > // Type of the target matrix
4184  friend inline auto subAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4185  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4186  {
4188 
4190 
4191  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4192  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4193 
4194  const ForwardFunctor fwd;
4195 
4196  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4197  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4198  else if( IsSymmetric_v<MT1> )
4199  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4200  else
4201  subAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4202  }
4204  //**********************************************************************************************
4205 
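   //**Illustrative sketch: symmetry-based restructuring******************************************
   // The restructuring above exploits the identity trans(S) == S for a symmetric operand S: the
   // column-major/column-major product is rewritten so that the storage orders suit the
   // row-major target, avoiding an explicit transpose evaluation. For example:
   //
   // \code
   //    blaze::SymmetricMatrix< blaze::DynamicMatrix<double,blaze::columnMajor> > S( 64UL );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> B( 64UL, 64UL );
   //    blaze::DynamicMatrix<double,blaze::rowMajor>    C( 64UL, 64UL );
   //    // ... initialize S, B, and C ...
   //    C -= S * B;  // internally evaluated as C -= trans(S) * B
   // \endcode
   //**********************************************************************************************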
4206  //**Subtraction assignment to sparse matrices***************************************************
4207  // No special implementation for the subtraction assignment to sparse matrices.
4208  //**********************************************************************************************
4209 
4210  //**Schur product assignment to dense matrices**************************************************
4223  template< typename MT // Type of the target dense matrix
4224  , bool SO > // Storage order of the target dense matrix
4225  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4226  {
4228 
4232 
4233  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4234  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4235 
4236  const ResultType tmp( serial( rhs ) );
4237  schurAssign( ~lhs, tmp );
4238  }
4240  //**********************************************************************************************
4241 
4242  //**Multiplication assignment to dense matrices*************************************************
4243  // No special implementation for the multiplication assignment to dense matrices.
4244  //**********************************************************************************************
4245 
4246  //**Multiplication assignment to sparse matrices************************************************
4247  // No special implementation for the multiplication assignment to sparse matrices.
4248  //**********************************************************************************************
4249 
4250  //**SMP assignment to dense matrices************************************************************
4266  template< typename MT // Type of the target dense matrix
4267  , bool SO > // Storage order of the target dense matrix
4268  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4269  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4270  {
4272 
4273  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4274  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4275 
4276  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4277  return;
4278  }
4279  else if( rhs.lhs_.columns() == 0UL ) {
4280  reset( ~lhs );
4281  return;
4282  }
4283 
4284  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4285  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4286 
4287  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4288  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4289  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4290  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4291  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4292  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4293 
4294  smpAssign( ~lhs, A * B );
4295  }
4297  //**********************************************************************************************
4298 
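   //**Illustrative sketch: SMP dispatch***********************************************************
   // The SMP assignment above first materializes both operands and then forwards the plain
   // product A*B to smpAssign(), which partitions the target matrix among the available threads.
   // Whether that forwarded assignment actually runs in parallel is decided by the expression's
   // canSMPAssign() predicate, which compares the target size against the SMP threshold; roughly:
   //
   // \code
   //    if( (~lhs).rows() * (~lhs).columns() >= SMP_TDMATTDMATMULT_THRESHOLD )
   //       smpAssign( ~lhs, A * B );  // parallel evaluation of the product
   //    else
   //       assign( ~lhs, A * B );     // serial evaluation
   // \endcode
   //**********************************************************************************************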
4299  //**SMP assignment to sparse matrices***********************************************************
4315  template< typename MT // Type of the target sparse matrix
4316  , bool SO > // Storage order of the target sparse matrix
4317  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4318  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4319  {
4321 
4322  using TmpType = If_t< SO, ResultType, OppositeType >;
4323 
4330 
4331  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4332  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4333 
4334  const ForwardFunctor fwd;
4335 
4336  const TmpType tmp( rhs );
4337  smpAssign( ~lhs, fwd( tmp ) );
4338  }
4340  //**********************************************************************************************
4341 
4342  //**Restructuring SMP assignment to row-major matrices******************************************
4357  template< typename MT > // Type of the target matrix
4358  friend inline auto smpAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4359  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4360  {
4362 
4364 
4365  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4366  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4367 
4368  const ForwardFunctor fwd;
4369 
4370  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4371  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4372  else if( IsSymmetric_v<MT1> )
4373  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4374  else
4375  smpAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4376  }
4378  //**********************************************************************************************
4379 
4380  //**SMP addition assignment to dense matrices***************************************************
4396  template< typename MT // Type of the target dense matrix
4397  , bool SO > // Storage order of the target dense matrix
4398  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4399  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4400  {
4402 
4403  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4404  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4405 
4406  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4407  return;
4408  }
4409 
4410  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4411  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4412 
4413  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4414  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4415  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4416  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4417  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4418  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4419 
4420  smpAddAssign( ~lhs, A * B );
4421  }
4423  //**********************************************************************************************
4424 
4425  //**Restructuring SMP addition assignment to row-major matrices*********************************
4441  template< typename MT > // Type of the target matrix
4442  friend inline auto smpAddAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4443  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4444  {
4446 
4448 
4449  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4450  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4451 
4452  const ForwardFunctor fwd;
4453 
4454  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4455  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4456  else if( IsSymmetric_v<MT1> )
4457  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4458  else
4459  smpAddAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4460  }
4462  //**********************************************************************************************
4463 
4464  //**SMP addition assignment to sparse matrices**************************************************
4465  // No special implementation for the SMP addition assignment to sparse matrices.
4466  //**********************************************************************************************
4467 
4468  //**SMP subtraction assignment to dense matrices************************************************
4484  template< typename MT // Type of the target dense matrix
4485  , bool SO > // Storage order of the target dense matrix
4486  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4487  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4488  {
4490 
4491  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4492  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4493 
4494  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4495  return;
4496  }
4497 
4498  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4499  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4500 
4501  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4502  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4503  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4504  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4505  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4506  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4507 
4508  smpSubAssign( ~lhs, A * B );
4509  }
4511  //**********************************************************************************************
4512 
4513  //**Restructuring SMP subtraction assignment to row-major matrices******************************
4529  template< typename MT > // Type of the target matrix
4530  friend inline auto smpSubAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4531  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4532  {
4534 
4536 
4537  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4538  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4539 
4540  const ForwardFunctor fwd;
4541 
4542  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4543  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4544  else if( IsSymmetric_v<MT1> )
4545  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4546  else
4547  smpSubAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4548  }
4550  //**********************************************************************************************
4551 
4552  //**SMP subtraction assignment to sparse matrices***********************************************
4553  // No special implementation for the SMP subtraction assignment to sparse matrices.
4554  //**********************************************************************************************
4555 
4556  //**SMP Schur product assignment to dense matrices**********************************************
4570  template< typename MT // Type of the target dense matrix
4571  , bool SO > // Storage order of the target dense matrix
4572  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4573  {
4575 
4579 
4580  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4581  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4582 
4583  const ResultType tmp( rhs );
4584  smpSchurAssign( ~lhs, tmp );
4585  }
4587  //**********************************************************************************************
4588 
4589  //**SMP Schur product assignment to sparse matrices*********************************************
4590  // No special implementation for the SMP Schur product assignment to sparse matrices.
4591  //**********************************************************************************************
4592 
4593  //**SMP multiplication assignment to dense matrices*********************************************
4594  // No special implementation for the SMP multiplication assignment to dense matrices.
4595  //**********************************************************************************************
4596 
4597  //**SMP multiplication assignment to sparse matrices********************************************
4598  // No special implementation for the SMP multiplication assignment to sparse matrices.
4599  //**********************************************************************************************
4600 
4601  //**Compile time checks*************************************************************************
4609  //**********************************************************************************************
4610 };
4611 //*************************************************************************************************
4612 
4613 
4614 
4615 
4616 //=================================================================================================
4617 //
4618 // DMATSCALARMULTEXPR SPECIALIZATION
4619 //
4620 //=================================================================================================
4621 
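//*************************************************************************************************
// Illustrative usage sketch (not part of the original sources): the specialization below is
// selected whenever a scaled product of two column-major dense matrices is assigned, which allows
// the scalar to be folded directly into the multiplication kernels instead of requiring a second
// pass over the result. For example:
//
// \code
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 128UL, 64UL ), B( 64UL, 96UL ), C( 128UL, 96UL );
//    // ... initialize A and B ...
//    C = 2.5 * ( A * B );  // evaluated via DMatScalarMultExpr< TDMatTDMatMultExpr<...>, double, true >
// \endcode
//*************************************************************************************************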
4622 //*************************************************************************************************
4630 template< typename MT1 // Type of the left-hand side dense matrix
4631  , typename MT2 // Type of the right-hand side dense matrix
4632  , bool SF // Symmetry flag
4633  , bool HF // Hermitian flag
4634  , bool LF // Lower flag
4635  , bool UF // Upper flag
4636  , typename ST > // Type of the right-hand side scalar value
4637 class DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
4638  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
4639  , private Computation
4640 {
4641  private:
4642  //**Type definitions****************************************************************************
4644  using MMM = TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4645 
4646  using RES = ResultType_t<MMM>;
4647  using RT1 = ResultType_t<MT1>;
4648  using RT2 = ResultType_t<MT2>;
4649  using ET1 = ElementType_t<RT1>;
4650  using ET2 = ElementType_t<RT2>;
4651  using CT1 = CompositeType_t<MT1>;
4652  using CT2 = CompositeType_t<MT2>;
4653  //**********************************************************************************************
4654 
4655  //**********************************************************************************************
4657  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
4658  //**********************************************************************************************
4659 
4660  //**********************************************************************************************
4662  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
4663  //**********************************************************************************************
4664 
4665  //**********************************************************************************************
4666  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
4667  static constexpr bool HERM = ( HF && !( LF || UF ) );
4668  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4669  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
4670  //**********************************************************************************************
4671 
4672  //**********************************************************************************************
4674 
4678  template< typename T1, typename T2, typename T3 >
4679  static constexpr bool CanExploitSymmetry_v =
4680  ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
4681  //**********************************************************************************************
4682 
4683  //**********************************************************************************************
4685 
4688  template< typename T1, typename T2, typename T3 >
4689  static constexpr bool IsEvaluationRequired_v =
4690  ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
4691  //**********************************************************************************************
4692 
4693  //**********************************************************************************************
4695 
4697  template< typename T1, typename T2, typename T3, typename T4 >
4698  static constexpr bool UseBlasKernel_v =
4699  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4700  !SYM && !HERM && !LOW && !UPP &&
4701  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4702  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4703  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4704  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4705  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4706  IsBLASCompatible_v< ElementType_t<T1> > &&
4707  IsBLASCompatible_v< ElementType_t<T2> > &&
4708  IsBLASCompatible_v< ElementType_t<T3> > &&
4709  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4710  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4711  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
4712  //**********************************************************************************************
4713 
4714  //**********************************************************************************************
4716 
4718  template< typename T1, typename T2, typename T3, typename T4 >
4719  static constexpr bool UseVectorizedDefaultKernel_v =
4720  ( useOptimizedKernels &&
4721  !IsDiagonal_v<T2> &&
4722  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4723  IsSIMDCombinable_v< ElementType_t<T1>
4724  , ElementType_t<T2>
4725  , ElementType_t<T3>
4726  , T4 > &&
 4727  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
 4728  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
4729  //**********************************************************************************************
4730 
4731  //**********************************************************************************************
4733 
4735  using ForwardFunctor = If_t< HERM
4736  , DeclHerm
4737  , If_t< SYM
4738  , DeclSym
4739  , If_t< LOW
4740  , If_t< UPP
4741  , DeclDiag
4742  , DeclLow >
4743  , If_t< UPP
4744  , DeclUpp
4745  , Noop > > > >;
4746  //**********************************************************************************************
4747 
4748  public:
4749  //**Type definitions****************************************************************************
4751  using This = DMatScalarMultExpr<MMM,ST,true>;
4752 
4754  using BaseType = DenseMatrix<This,true>;
4755 
4757  using ResultType = typename If_t< HERM
4758  , DeclHermTrait< MultTrait_t<RES,ST> >
4759  , If_t< SYM
4760  , DeclSymTrait< MultTrait_t<RES,ST> >
4761  , If_t< LOW
4762  , If_t< UPP
4763  , DeclDiagTrait< MultTrait_t<RES,ST> >
4764  , DeclLowTrait< MultTrait_t<RES,ST> > >
4765  , If_t< UPP
4766  , DeclUppTrait< MultTrait_t<RES,ST> >
4767  , MultTrait<RES,ST> > > > >::Type;
4768 
4769  using OppositeType = OppositeType_t<ResultType>;
4770  using TransposeType = TransposeType_t<ResultType>;
4771  using ElementType = ElementType_t<ResultType>;
4772  using SIMDType = SIMDTrait_t<ElementType>;
4773  using ReturnType = const ElementType;
4774  using CompositeType = const ResultType;
4775 
4777  using LeftOperand = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4778 
4780  using RightOperand = ST;
4781 
4783  using LT = If_t< evaluateLeft, const RT1, CT1 >;
4784 
4786  using RT = If_t< evaluateRight, const RT2, CT2 >;
4787  //**********************************************************************************************
4788 
4789  //**Compilation flags***************************************************************************
4791  static constexpr bool simdEnabled =
4792  ( !IsDiagonal_v<MT1> &&
4793  MT1::simdEnabled && MT2::simdEnabled &&
4794  IsSIMDCombinable_v<ET1,ET2,ST> &&
4795  HasSIMDAdd_v<ET1,ET2> &&
4796  HasSIMDMult_v<ET1,ET2> );
4797 
4799  static constexpr bool smpAssignable =
4800  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4801  //**********************************************************************************************
4802 
4803  //**SIMD properties*****************************************************************************
4805  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
4806  //**********************************************************************************************
4807 
4808  //**Constructor*********************************************************************************
4814  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4815  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4816  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4817  {}
4818  //**********************************************************************************************
4819 
4820  //**Access operator*****************************************************************************
4827  inline ReturnType operator()( size_t i, size_t j ) const {
4828  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4829  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4830  return matrix_(i,j) * scalar_;
4831  }
4832  //**********************************************************************************************
4833 
4834  //**At function*********************************************************************************
4842  inline ReturnType at( size_t i, size_t j ) const {
4843  if( i >= matrix_.rows() ) {
4844  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4845  }
4846  if( j >= matrix_.columns() ) {
4847  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4848  }
4849  return (*this)(i,j);
4850  }
4851  //**********************************************************************************************
4852 
4853  //**Rows function*******************************************************************************
4858  inline size_t rows() const {
4859  return matrix_.rows();
4860  }
4861  //**********************************************************************************************
4862 
4863  //**Columns function****************************************************************************
4868  inline size_t columns() const {
4869  return matrix_.columns();
4870  }
4871  //**********************************************************************************************
4872 
4873  //**Left operand access*************************************************************************
4878  inline LeftOperand leftOperand() const {
4879  return matrix_;
4880  }
4881  //**********************************************************************************************
4882 
4883  //**Right operand access************************************************************************
4888  inline RightOperand rightOperand() const {
4889  return scalar_;
4890  }
4891  //**********************************************************************************************
4892 
4893  //**********************************************************************************************
4899  template< typename T >
4900  inline bool canAlias( const T* alias ) const {
4901  return matrix_.canAlias( alias );
4902  }
4903  //**********************************************************************************************
4904 
4905  //**********************************************************************************************
4911  template< typename T >
4912  inline bool isAliased( const T* alias ) const {
4913  return matrix_.isAliased( alias );
4914  }
4915  //**********************************************************************************************
4916 
4917  //**********************************************************************************************
4922  inline bool isAligned() const {
4923  return matrix_.isAligned();
4924  }
4925  //**********************************************************************************************
4926 
4927  //**********************************************************************************************
4932  inline bool canSMPAssign() const noexcept {
4933  return ( !BLAZE_BLAS_MODE ||
4934  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4936  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4937  ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
4938  }
4939  //**********************************************************************************************
4940 
4941  private:
4942  //**Member variables****************************************************************************
4945  //**********************************************************************************************
4946 
4947  //**Assignment to dense matrices****************************************************************
4959  template< typename MT // Type of the target dense matrix
4960  , bool SO > // Storage order of the target dense matrix
4961  friend inline auto assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4962  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4963  {
4965 
4966  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4967  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4968 
4969  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
4970  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
4971 
4972  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4973  return;
4974  }
4975  else if( left.columns() == 0UL ) {
4976  reset( ~lhs );
4977  return;
4978  }
4979 
4980  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4981  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4982 
4983  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4984  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4985  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4986  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4987  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4988  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4989 
4990  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4991  }
4992  //**********************************************************************************************
4993 
4994  //**Assignment to dense matrices (kernel selection)*********************************************
5005  template< typename MT3 // Type of the left-hand side target matrix
5006  , typename MT4 // Type of the left-hand side matrix operand
5007  , typename MT5 // Type of the right-hand side matrix operand
5008  , typename ST2 > // Type of the scalar value
5009  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5010  {
5011  if( ( IsDiagonal_v<MT4> ) ||
5012  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
5013  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5014  selectSmallAssignKernel( C, A, B, scalar );
5015  else
5016  selectBlasAssignKernel( C, A, B, scalar );
5017  }
5018  //**********************************************************************************************
5019 
5020  //**Default assignment to dense matrices (general/general)**************************************
5034  template< typename MT3 // Type of the left-hand side target matrix
5035  , typename MT4 // Type of the left-hand side matrix operand
5036  , typename MT5 // Type of the right-hand side matrix operand
5037  , typename ST2 > // Type of the scalar value
5038  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5039  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5040  {
5041  const size_t M( A.rows() );
5042  const size_t N( B.columns() );
5043  const size_t K( A.columns() );
5044 
5045  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5046 
5047  for( size_t j=0UL; j<N; ++j )
5048  {
5049  const size_t kbegin( ( IsLower_v<MT5> )
5050  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5051  :( 0UL ) );
5052  const size_t kend( ( IsUpper_v<MT5> )
5053  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5054  :( K ) );
5055  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5056 
5057  if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
5058  for( size_t i=0UL; i<M; ++i ) {
5059  reset( C(i,j) );
5060  }
5061  continue;
5062  }
5063 
5064  {
5065  const size_t ibegin( ( IsLower_v<MT4> )
5066  ?( ( IsStrictlyLower_v<MT4> )
5067  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
5068  :( LOW ? max(j,kbegin) : kbegin ) )
5069  :( LOW ? j : 0UL ) );
5070  const size_t iend( ( IsUpper_v<MT4> )
5071  ?( ( IsStrictlyUpper_v<MT4> )
5072  ?( UPP ? min(j+1UL,kbegin) : kbegin )
5073  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
5074  :( UPP ? j+1UL : M ) );
5075 
5076  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5077  for( size_t i=0UL; i<ibegin; ++i ) {
5078  reset( C(i,j) );
5079  }
5080  }
5081  else if( IsStrictlyLower_v<MT4> ) {
5082  reset( C(0UL,j) );
5083  }
5084  for( size_t i=ibegin; i<iend; ++i ) {
5085  C(i,j) = A(i,kbegin) * B(kbegin,j);
5086  }
5087  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5088  for( size_t i=iend; i<M; ++i ) {
5089  reset( C(i,j) );
5090  }
5091  }
5092  else if( IsStrictlyUpper_v<MT4> ) {
5093  reset( C(M-1UL,j) );
5094  }
5095  }
5096 
5097  for( size_t k=kbegin+1UL; k<kend; ++k )
5098  {
5099  const size_t ibegin( ( IsLower_v<MT4> )
5100  ?( ( IsStrictlyLower_v<MT4> )
5101  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
5102  :( SYM || HERM || LOW ? max( j, k ) : k ) )
5103  :( SYM || HERM || LOW ? j : 0UL ) );
5104  const size_t iend( ( IsUpper_v<MT4> )
5105  ?( ( IsStrictlyUpper_v<MT4> )
5106  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
5107  :( UPP ? min(j+1UL,k) : k ) )
5108  :( UPP ? j+1UL : M ) );
5109 
5110  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
5111  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5112 
5113  for( size_t i=ibegin; i<iend; ++i ) {
5114  C(i,j) += A(i,k) * B(k,j);
5115  }
5116  if( IsUpper_v<MT4> ) {
5117  C(iend,j) = A(iend,k) * B(k,j);
5118  }
5119  }
5120 
5121  {
5122  const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
5123  ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
5124  :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
5125  const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5126  ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
5127  :( UPP ? j+1UL : M ) );
5128 
5129  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
5130  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5131 
5132  for( size_t i=ibegin; i<iend; ++i ) {
5133  C(i,j) *= scalar;
5134  }
5135  }
5136  }
5137 
5138  if( SYM || HERM ) {
5139  for( size_t j=1UL; j<N; ++j ) {
5140  for( size_t i=0UL; i<j; ++i ) {
5141  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5142  }
5143  }
5144  }
5145  }
5146  //**********************************************************************************************
5147 
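   //**Illustrative sketch: scalar handling in the general/general kernel*************************
   // Ignoring the triangular index bounds and the symmetry post-processing, the kernel above is
   // equivalent to the following reference loop: the dot product is accumulated first and the
   // scalar is applied exactly once per touched element afterwards.
   //
   // \code
   //    for( size_t j=0UL; j<N; ++j )
   //       for( size_t i=0UL; i<M; ++i ) {
   //          ElementType sum{};
   //          for( size_t k=0UL; k<K; ++k )
   //             sum += A(i,k) * B(k,j);
   //          C(i,j) = sum * scalar;
   //       }
   // \endcode
   //**********************************************************************************************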
5148  //**Default assignment to dense matrices (general/diagonal)*************************************
5162  template< typename MT3 // Type of the left-hand side target matrix
5163  , typename MT4 // Type of the left-hand side matrix operand
5164  , typename MT5 // Type of the right-hand side matrix operand
5165  , typename ST2 > // Type of the scalar value
5166  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5167  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5168  {
5170 
5171  const size_t M( A.rows() );
5172  const size_t N( B.columns() );
5173 
5174  for( size_t j=0UL; j<N; ++j )
5175  {
5176  const size_t ibegin( ( IsLower_v<MT4> )
5177  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
5178  :( 0UL ) );
5179  const size_t iend( ( IsUpper_v<MT4> )
5180  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
5181  :( M ) );
5182  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5183 
5184  if( IsLower_v<MT4> ) {
5185  for( size_t i=0UL; i<ibegin; ++i ) {
5186  reset( C(i,j) );
5187  }
5188  }
5189  for( size_t i=ibegin; i<iend; ++i ) {
5190  C(i,j) = A(i,j) * B(j,j) * scalar;
5191  }
5192  if( IsUpper_v<MT4> ) {
5193  for( size_t i=iend; i<M; ++i ) {
5194  reset( C(i,j) );
5195  }
5196  }
5197  }
5198  }
5199  //**********************************************************************************************
5200 
5201  //**Default assignment to dense matrices (diagonal/general)*************************************
5215  template< typename MT3 // Type of the left-hand side target matrix
5216  , typename MT4 // Type of the left-hand side matrix operand
5217  , typename MT5 // Type of the right-hand side matrix operand
5218  , typename ST2 > // Type of the scalar value
5219  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5220  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5221  {
5223 
5224  const size_t M( A.rows() );
5225  const size_t N( B.columns() );
5226 
5227  for( size_t j=0UL; j<N; ++j )
5228  {
5229  const size_t ibegin( ( IsLower_v<MT5> )
5230  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5231  :( 0UL ) );
5232  const size_t iend( ( IsUpper_v<MT5> )
5233  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5234  :( M ) );
5235  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5236 
5237  if( IsLower_v<MT4> ) {
5238  for( size_t i=0UL; i<ibegin; ++i ) {
5239  reset( C(i,j) );
5240  }
5241  }
5242  for( size_t i=ibegin; i<iend; ++i ) {
5243  C(i,j) = A(i,i) * B(i,j) * scalar;
5244  }
5245  if( IsUpper_v<MT4> ) {
5246  for( size_t i=iend; i<M; ++i ) {
5247  reset( C(i,j) );
5248  }
5249  }
5250  }
5251  }
5252  //**********************************************************************************************
5253 
5254  //**Default assignment to dense matrices (diagonal/diagonal)************************************
5268  template< typename MT3 // Type of the left-hand side target matrix
5269  , typename MT4 // Type of the left-hand side matrix operand
5270  , typename MT5 // Type of the right-hand side matrix operand
5271  , typename ST2 > // Type of the scalar value
5272  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5273  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5274  {
5276 
5277  reset( C );
5278 
5279  for( size_t i=0UL; i<A.rows(); ++i ) {
5280  C(i,i) = A(i,i) * B(i,i) * scalar;
5281  }
5282  }
5283  //**********************************************************************************************
5284 
5285  //**Default assignment to dense matrices (small matrices)***************************************
5299  template< typename MT3 // Type of the left-hand side target matrix
5300  , typename MT4 // Type of the left-hand side matrix operand
5301  , typename MT5 // Type of the right-hand side matrix operand
5302  , typename ST2 > // Type of the scalar value
5303  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5304  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5305  {
5306  selectDefaultAssignKernel( C, A, B, scalar );
5307  }
5308  //**********************************************************************************************
5309 
5310  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
5325  template< typename MT3 // Type of the left-hand side target matrix
5326  , typename MT4 // Type of the left-hand side matrix operand
5327  , typename MT5 // Type of the right-hand side matrix operand
5328  , typename ST2 > // Type of the scalar value
5329  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5330  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5331  {
5334  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
5335  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
5336 
5337  const ForwardFunctor fwd;
5338 
5339  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
5340  const OppositeType_t<MT5> tmp( serial( B ) );
5341  assign( C, fwd( A * tmp ) * scalar );
5342  }
5343  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
5344  const OppositeType_t<MT4> tmp( serial( A ) );
5345  assign( C, fwd( tmp * B ) * scalar );
5346  }
5347  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5348  const OppositeType_t<MT5> tmp( serial( B ) );
5349  assign( C, fwd( A * tmp ) * scalar );
5350  }
5351  else {
5352  const OppositeType_t<MT4> tmp( serial( A ) );
5353  assign( C, fwd( tmp * B ) * scalar );
5354  }
5355  }
5356  //**********************************************************************************************
5357 
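   //**Illustrative sketch: opposite storage order strategy***************************************
   // The row-major kernel above does not multiply the two column-major operands directly.
   // Instead it evaluates one operand (preferably a non-resizable or the smaller one) into its
   // opposite, row-major storage order and forwards to the corresponding mixed storage-order
   // kernel, re-applying the declared matrix property via the ForwardFunctor:
   //
   // \code
   //    const OppositeType_t<MT5> tmp( serial( B ) );  // row-major copy of the right operand
   //    assign( C, fwd( A * tmp ) * scalar );          // column-major * row-major kernel
   // \endcode
   //**********************************************************************************************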
5358  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
5373  template< typename MT3 // Type of the left-hand side target matrix
5374  , typename MT4 // Type of the left-hand side matrix operand
5375  , typename MT5 // Type of the right-hand side matrix operand
5376  , typename ST2 > // Type of the scalar value
5377  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5378  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5379  {
5380  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
5381 
5382  const size_t M( A.rows() );
5383  const size_t N( B.columns() );
5384  const size_t K( A.columns() );
5385 
5386  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5387 
5388  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
5389  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5390 
5391  const SIMDType factor( set( scalar ) );
5392 
5393  if( LOW && UPP && M > SIMDSIZE*3UL ) {
5394  reset( C );
5395  }
5396 
5397  {
5398  size_t i( 0UL );
5399 
5400  if( IsIntegral_v<ElementType> )
5401  {
5402  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5403  for( size_t j=0UL; j<N; ++j )
5404  {
5405  const size_t kbegin( ( IsLower_v<MT5> )
5406  ?( ( IsUpper_v<MT4> )
5407  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5408  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5409  :( IsUpper_v<MT4> ? i : 0UL ) );
5410  const size_t kend( ( IsUpper_v<MT5> )
5411  ?( ( IsLower_v<MT4> )
5412  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
5413  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
5414  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
5415 
5416  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5417 
5418  for( size_t k=kbegin; k<kend; ++k ) {
5419  const SIMDType b1( set( B(k,j) ) );
5420  xmm1 += A.load(i ,k) * b1;
5421  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5422  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5423  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5424  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5425  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
5426  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
5427  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
5428  }
5429 
5430  C.store( i , j, xmm1 * factor );
5431  C.store( i+SIMDSIZE , j, xmm2 * factor );
5432  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5433  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5434  C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5435  C.store( i+SIMDSIZE*5UL, j, xmm6 * factor );
5436  C.store( i+SIMDSIZE*6UL, j, xmm7 * factor );
5437  C.store( i+SIMDSIZE*7UL, j, xmm8 * factor );
5438  }
5439  }
5440  }
5441 
5442  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5443  {
5444  size_t j( 0UL );
5445 
5446  for( ; (j+2UL) <= N; j+=2UL )
5447  {
5448  const size_t kbegin( ( IsLower_v<MT5> )
5449  ?( ( IsUpper_v<MT4> )
5450  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5451  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5452  :( IsUpper_v<MT4> ? i : 0UL ) );
5453  const size_t kend( ( IsUpper_v<MT5> )
5454  ?( ( IsLower_v<MT4> )
5455  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5456  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5457  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
5458 
5459  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5460 
5461  for( size_t k=kbegin; k<kend; ++k ) {
5462  const SIMDType a1( A.load(i ,k) );
5463  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5464  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5465  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5466  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5467  const SIMDType b1( set( B(k,j ) ) );
5468  const SIMDType b2( set( B(k,j+1UL) ) );
5469  xmm1 += a1 * b1;
5470  xmm2 += a2 * b1;
5471  xmm3 += a3 * b1;
5472  xmm4 += a4 * b1;
5473  xmm5 += a5 * b1;
5474  xmm6 += a1 * b2;
5475  xmm7 += a2 * b2;
5476  xmm8 += a3 * b2;
5477  xmm9 += a4 * b2;
5478  xmm10 += a5 * b2;
5479  }
5480 
5481  C.store( i , j , xmm1 * factor );
5482  C.store( i+SIMDSIZE , j , xmm2 * factor );
5483  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5484  C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5485  C.store( i+SIMDSIZE*4UL, j , xmm5 * factor );
5486  C.store( i , j+1UL, xmm6 * factor );
5487  C.store( i+SIMDSIZE , j+1UL, xmm7 * factor );
5488  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
5489  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
5490  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
5491  }
5492 
5493  if( j < N )
5494  {
5495  const size_t kbegin( ( IsLower_v<MT5> )
5496  ?( ( IsUpper_v<MT4> )
5497  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5498  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5499  :( IsUpper_v<MT4> ? i : 0UL ) );
5500  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
5501 
5502  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5503 
5504  for( size_t k=kbegin; k<kend; ++k ) {
5505  const SIMDType b1( set( B(k,j) ) );
5506  xmm1 += A.load(i ,k) * b1;
5507  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5508  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5509  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5510  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5511  }
5512 
5513  C.store( i , j, xmm1 * factor );
5514  C.store( i+SIMDSIZE , j, xmm2 * factor );
5515  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5516  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5517  C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5518  }
5519  }
5520 
5521  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5522  {
5523  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
5524  size_t j( UPP ? i : 0UL );
5525 
5526  for( ; (j+2UL) <= jend; j+=2UL )
5527  {
5528  const size_t kbegin( ( IsLower_v<MT5> )
5529  ?( ( IsUpper_v<MT4> )
5530  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5531  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5532  :( IsUpper_v<MT4> ? i : 0UL ) );
5533  const size_t kend( ( IsUpper_v<MT5> )
5534  ?( ( IsLower_v<MT4> )
5535  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5536  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5537  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
5538 
5539  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5540 
5541  for( size_t k=kbegin; k<kend; ++k ) {
5542  const SIMDType a1( A.load(i ,k) );
5543  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5544  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5545  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5546  const SIMDType b1( set( B(k,j ) ) );
5547  const SIMDType b2( set( B(k,j+1UL) ) );
5548  xmm1 += a1 * b1;
5549  xmm2 += a2 * b1;
5550  xmm3 += a3 * b1;
5551  xmm4 += a4 * b1;
5552  xmm5 += a1 * b2;
5553  xmm6 += a2 * b2;
5554  xmm7 += a3 * b2;
5555  xmm8 += a4 * b2;
5556  }
5557 
5558  C.store( i , j , xmm1 * factor );
5559  C.store( i+SIMDSIZE , j , xmm2 * factor );
5560  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5561  C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5562  C.store( i , j+1UL, xmm5 * factor );
5563  C.store( i+SIMDSIZE , j+1UL, xmm6 * factor );
5564  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
5565  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
5566  }
5567 
5568  if( j < jend )
5569  {
5570  const size_t kbegin( ( IsLower_v<MT5> )
5571  ?( ( IsUpper_v<MT4> )
5572  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5573  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5574  :( IsUpper_v<MT4> ? i : 0UL ) );
5575  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
5576 
5577  SIMDType xmm1, xmm2, xmm3, xmm4;
5578 
5579  for( size_t k=kbegin; k<kend; ++k ) {
5580  const SIMDType b1( set( B(k,j) ) );
5581  xmm1 += A.load(i ,k) * b1;
5582  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5583  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5584  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5585  }
5586 
5587  C.store( i , j, xmm1 * factor );
5588  C.store( i+SIMDSIZE , j, xmm2 * factor );
5589  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5590  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5591  }
5592  }
5593 
5594  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5595  {
5596  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
5597  size_t j( UPP ? i : 0UL );
5598 
5599  for( ; (j+2UL) <= jend; j+=2UL )
5600  {
5601  const size_t kbegin( ( IsLower_v<MT5> )
5602  ?( ( IsUpper_v<MT4> )
5603  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5604  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5605  :( IsUpper_v<MT4> ? i : 0UL ) );
5606  const size_t kend( ( IsUpper_v<MT5> )
5607  ?( ( IsLower_v<MT4> )
5608  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5609  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5610  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
5611 
5612  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5613 
5614  for( size_t k=kbegin; k<kend; ++k ) {
5615  const SIMDType a1( A.load(i ,k) );
5616  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5617  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5618  const SIMDType b1( set( B(k,j ) ) );
5619  const SIMDType b2( set( B(k,j+1UL) ) );
5620  xmm1 += a1 * b1;
5621  xmm2 += a2 * b1;
5622  xmm3 += a3 * b1;
5623  xmm4 += a1 * b2;
5624  xmm5 += a2 * b2;
5625  xmm6 += a3 * b2;
5626  }
5627 
5628  C.store( i , j , xmm1 * factor );
5629  C.store( i+SIMDSIZE , j , xmm2 * factor );
5630  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5631  C.store( i , j+1UL, xmm4 * factor );
5632  C.store( i+SIMDSIZE , j+1UL, xmm5 * factor );
5633  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
5634  }
5635 
5636  if( j < jend )
5637  {
5638  const size_t kbegin( ( IsLower_v<MT5> )
5639  ?( ( IsUpper_v<MT4> )
5640  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5641  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5642  :( IsUpper_v<MT4> ? i : 0UL ) );
5643  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
5644 
5645  SIMDType xmm1, xmm2, xmm3;
5646 
5647  for( size_t k=kbegin; k<kend; ++k ) {
5648  const SIMDType b1( set( B(k,j) ) );
5649  xmm1 += A.load(i ,k) * b1;
5650  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5651  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5652  }
5653 
5654  C.store( i , j, xmm1 * factor );
5655  C.store( i+SIMDSIZE , j, xmm2 * factor );
5656  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5657  }
5658  }
5659 
5660  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5661  {
5662  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
5663  size_t j( UPP ? i : 0UL );
5664 
5665  for( ; (j+4UL) <= jend; j+=4UL )
5666  {
5667  const size_t kbegin( ( IsLower_v<MT5> )
5668  ?( ( IsUpper_v<MT4> )
5669  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5670  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5671  :( IsUpper_v<MT4> ? i : 0UL ) );
5672  const size_t kend( ( IsUpper_v<MT5> )
5673  ?( ( IsLower_v<MT4> )
5674  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
5675  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
5676  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
5677 
5678  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5679 
5680  for( size_t k=kbegin; k<kend; ++k ) {
5681  const SIMDType a1( A.load(i ,k) );
5682  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5683  const SIMDType b1( set( B(k,j ) ) );
5684  const SIMDType b2( set( B(k,j+1UL) ) );
5685  const SIMDType b3( set( B(k,j+2UL) ) );
5686  const SIMDType b4( set( B(k,j+3UL) ) );
5687  xmm1 += a1 * b1;
5688  xmm2 += a2 * b1;
5689  xmm3 += a1 * b2;
5690  xmm4 += a2 * b2;
5691  xmm5 += a1 * b3;
5692  xmm6 += a2 * b3;
5693  xmm7 += a1 * b4;
5694  xmm8 += a2 * b4;
5695  }
5696 
5697  C.store( i , j , xmm1 * factor );
5698  C.store( i+SIMDSIZE, j , xmm2 * factor );
5699  C.store( i , j+1UL, xmm3 * factor );
5700  C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5701  C.store( i , j+2UL, xmm5 * factor );
5702  C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5703  C.store( i , j+3UL, xmm7 * factor );
5704  C.store( i+SIMDSIZE, j+3UL, xmm8 * factor );
5705  }
5706 
5707  for( ; (j+3UL) <= jend; j+=3UL )
5708  {
5709  const size_t kbegin( ( IsLower_v<MT5> )
5710  ?( ( IsUpper_v<MT4> )
5711  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5712  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5713  :( IsUpper_v<MT4> ? i : 0UL ) );
5714  const size_t kend( ( IsUpper_v<MT5> )
5715  ?( ( IsLower_v<MT4> )
5716  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
5717  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
5718  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
5719 
5720  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5721 
5722  for( size_t k=kbegin; k<kend; ++k ) {
5723  const SIMDType a1( A.load(i ,k) );
5724  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5725  const SIMDType b1( set( B(k,j ) ) );
5726  const SIMDType b2( set( B(k,j+1UL) ) );
5727  const SIMDType b3( set( B(k,j+2UL) ) );
5728  xmm1 += a1 * b1;
5729  xmm2 += a2 * b1;
5730  xmm3 += a1 * b2;
5731  xmm4 += a2 * b2;
5732  xmm5 += a1 * b3;
5733  xmm6 += a2 * b3;
5734  }
5735 
5736  C.store( i , j , xmm1 * factor );
5737  C.store( i+SIMDSIZE, j , xmm2 * factor );
5738  C.store( i , j+1UL, xmm3 * factor );
5739  C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5740  C.store( i , j+2UL, xmm5 * factor );
5741  C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5742  }
5743 
5744  for( ; (j+2UL) <= jend; j+=2UL )
5745  {
5746  const size_t kbegin( ( IsLower_v<MT5> )
5747  ?( ( IsUpper_v<MT4> )
5748  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5749  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5750  :( IsUpper_v<MT4> ? i : 0UL ) );
5751  const size_t kend( ( IsUpper_v<MT5> )
5752  ?( ( IsLower_v<MT4> )
5753  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5754  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5755  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
5756 
5757  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5758  size_t k( kbegin );
5759 
5760  for( ; (k+2UL) <= kend; k+=2UL ) {
5761  const SIMDType a1( A.load(i ,k ) );
5762  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
5763  const SIMDType a3( A.load(i ,k+1UL) );
5764  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
5765  const SIMDType b1( set( B(k ,j ) ) );
5766  const SIMDType b2( set( B(k ,j+1UL) ) );
5767  const SIMDType b3( set( B(k+1UL,j ) ) );
5768  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
5769  xmm1 += a1 * b1;
5770  xmm2 += a2 * b1;
5771  xmm3 += a1 * b2;
5772  xmm4 += a2 * b2;
5773  xmm5 += a3 * b3;
5774  xmm6 += a4 * b3;
5775  xmm7 += a3 * b4;
5776  xmm8 += a4 * b4;
5777  }
5778 
5779  for( ; k<kend; ++k ) {
5780  const SIMDType a1( A.load(i ,k) );
5781  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5782  const SIMDType b1( set( B(k,j ) ) );
5783  const SIMDType b2( set( B(k,j+1UL) ) );
5784  xmm1 += a1 * b1;
5785  xmm2 += a2 * b1;
5786  xmm3 += a1 * b2;
5787  xmm4 += a2 * b2;
5788  }
5789 
5790  C.store( i , j , (xmm1+xmm5) * factor );
5791  C.store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
5792  C.store( i , j+1UL, (xmm3+xmm7) * factor );
5793  C.store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
5794  }
5795 
5796  if( j < jend )
5797  {
5798  const size_t kbegin( ( IsLower_v<MT5> )
5799  ?( ( IsUpper_v<MT4> )
5800  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5801  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5802  :( IsUpper_v<MT4> ? i : 0UL ) );
5803  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
5804 
5805  SIMDType xmm1, xmm2, xmm3, xmm4;
5806  size_t k( kbegin );
5807 
5808  for( ; (k+2UL) <= kend; k+=2UL ) {
5809  const SIMDType b1( set( B(k ,j) ) );
5810  const SIMDType b2( set( B(k+1UL,j) ) );
5811  xmm1 += A.load(i ,k ) * b1;
5812  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
5813  xmm3 += A.load(i ,k+1UL) * b2;
5814  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
5815  }
5816 
5817  for( ; k<kend; ++k ) {
5818  const SIMDType b1( set( B(k,j) ) );
5819  xmm1 += A.load(i ,k) * b1;
5820  xmm2 += A.load(i+SIMDSIZE,k) * b1;
5821  }
5822 
5823  C.store( i , j, (xmm1+xmm3) * factor );
5824  C.store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
5825  }
5826  }
5827 
5828  for( ; i<ipos; i+=SIMDSIZE )
5829  {
5830  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
5831  size_t j( UPP ? i : 0UL );
5832 
5833  for( ; (j+4UL) <= jend; j+=4UL )
5834  {
5835  const size_t kbegin( ( IsLower_v<MT5> )
5836  ?( ( IsUpper_v<MT4> )
5837  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5838  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5839  :( IsUpper_v<MT4> ? i : 0UL ) );
5840  const size_t kend( ( IsUpper_v<MT5> )
5841  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
5842  :( K ) );
5843 
5844  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5845  size_t k( kbegin );
5846 
5847  for( ; (k+2UL) <= kend; k+=2UL ) {
5848  const SIMDType a1( A.load(i,k ) );
5849  const SIMDType a2( A.load(i,k+1UL) );
5850  xmm1 += a1 * set( B(k ,j ) );
5851  xmm2 += a1 * set( B(k ,j+1UL) );
5852  xmm3 += a1 * set( B(k ,j+2UL) );
5853  xmm4 += a1 * set( B(k ,j+3UL) );
5854  xmm5 += a2 * set( B(k+1UL,j ) );
5855  xmm6 += a2 * set( B(k+1UL,j+1UL) );
5856  xmm7 += a2 * set( B(k+1UL,j+2UL) );
5857  xmm8 += a2 * set( B(k+1UL,j+3UL) );
5858  }
5859 
5860  for( ; k<kend; ++k ) {
5861  const SIMDType a1( A.load(i,k) );
5862  xmm1 += a1 * set( B(k,j ) );
5863  xmm2 += a1 * set( B(k,j+1UL) );
5864  xmm3 += a1 * set( B(k,j+2UL) );
5865  xmm4 += a1 * set( B(k,j+3UL) );
5866  }
5867 
5868  C.store( i, j , (xmm1+xmm5) * factor );
5869  C.store( i, j+1UL, (xmm2+xmm6) * factor );
5870  C.store( i, j+2UL, (xmm3+xmm7) * factor );
5871  C.store( i, j+3UL, (xmm4+xmm8) * factor );
5872  }
5873 
5874  for( ; (j+3UL) <= jend; j+=3UL )
5875  {
5876  const size_t kbegin( ( IsLower_v<MT5> )
5877  ?( ( IsUpper_v<MT4> )
5878  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5879  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5880  :( IsUpper_v<MT4> ? i : 0UL ) );
5881  const size_t kend( ( IsUpper_v<MT5> )
5882  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
5883  :( K ) );
5884 
5885  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5886  size_t k( kbegin );
5887 
5888  for( ; (k+2UL) <= kend; k+=2UL ) {
5889  const SIMDType a1( A.load(i,k ) );
5890  const SIMDType a2( A.load(i,k+1UL) );
5891  xmm1 += a1 * set( B(k ,j ) );
5892  xmm2 += a1 * set( B(k ,j+1UL) );
5893  xmm3 += a1 * set( B(k ,j+2UL) );
5894  xmm4 += a2 * set( B(k+1UL,j ) );
5895  xmm5 += a2 * set( B(k+1UL,j+1UL) );
5896  xmm6 += a2 * set( B(k+1UL,j+2UL) );
5897  }
5898 
5899  for( ; k<kend; ++k ) {
5900  const SIMDType a1( A.load(i,k) );
5901  xmm1 += a1 * set( B(k,j ) );
5902  xmm2 += a1 * set( B(k,j+1UL) );
5903  xmm3 += a1 * set( B(k,j+2UL) );
5904  }
5905 
5906  C.store( i, j , (xmm1+xmm4) * factor );
5907  C.store( i, j+1UL, (xmm2+xmm5) * factor );
5908  C.store( i, j+2UL, (xmm3+xmm6) * factor );
5909  }
5910 
5911  for( ; (j+2UL) <= jend; j+=2UL )
5912  {
5913  const size_t kbegin( ( IsLower_v<MT5> )
5914  ?( ( IsUpper_v<MT4> )
5915  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5916  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5917  :( IsUpper_v<MT4> ? i : 0UL ) );
5918  const size_t kend( ( IsUpper_v<MT5> )
5919  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
5920  :( K ) );
5921 
5922  SIMDType xmm1, xmm2, xmm3, xmm4;
5923  size_t k( kbegin );
5924 
5925  for( ; (k+2UL) <= kend; k+=2UL ) {
5926  const SIMDType a1( A.load(i,k ) );
5927  const SIMDType a2( A.load(i,k+1UL) );
5928  xmm1 += a1 * set( B(k ,j ) );
5929  xmm2 += a1 * set( B(k ,j+1UL) );
5930  xmm3 += a2 * set( B(k+1UL,j ) );
5931  xmm4 += a2 * set( B(k+1UL,j+1UL) );
5932  }
5933 
5934  for( ; k<kend; ++k ) {
5935  const SIMDType a1( A.load(i,k) );
5936  xmm1 += a1 * set( B(k,j ) );
5937  xmm2 += a1 * set( B(k,j+1UL) );
5938  }
5939 
5940  C.store( i, j , (xmm1+xmm3) * factor );
5941  C.store( i, j+1UL, (xmm2+xmm4) * factor );
5942  }
5943 
5944  if( j < jend )
5945  {
5946  const size_t kbegin( ( IsLower_v<MT5> )
5947  ?( ( IsUpper_v<MT4> )
5948  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5949  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5950  :( IsUpper_v<MT4> ? i : 0UL ) );
5951 
5952  SIMDType xmm1, xmm2;
5953  size_t k( kbegin );
5954 
5955  for( ; (k+2UL) <= K; k+=2UL ) {
5956  xmm1 += A.load(i,k ) * set( B(k ,j) );
5957  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
5958  }
5959 
5960  for( ; k<K; ++k ) {
5961  xmm1 += A.load(i,k) * set( B(k,j) );
5962  }
5963 
5964  C.store( i, j, (xmm1+xmm2) * factor );
5965  }
5966  }
5967 
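   // Scalar epilogue: the trailing rows that do not fill a complete SIMD register (only
   // reached for unpadded matrices, see the 'remainder' flag) are processed element-wise.
   // As before, kbegin/kend restrict each inner product to the structurally non-zero part
   // of triangular operands.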
5968  for( ; remainder && i<M; ++i )
5969  {
5970  size_t j( LOW && UPP ? i : 0UL );
5971 
5972  for( ; (j+2UL) <= N; j+=2UL )
5973  {
5974  const size_t kbegin( ( IsLower_v<MT5> )
5975  ?( ( IsUpper_v<MT4> )
5976  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5977  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5978  :( IsUpper_v<MT4> ? i : 0UL ) );
5979  const size_t kend( ( IsUpper_v<MT5> )
5980  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
5981  :( K ) );
5982 
5983  ElementType value1{};
5984  ElementType value2{};
5985 
5986  for( size_t k=kbegin; k<kend; ++k ) {
5987  value1 += A(i,k) * B(k,j );
5988  value2 += A(i,k) * B(k,j+1UL);
5989  }
5990 
5991  C(i,j ) = value1 * scalar;
5992  C(i,j+1UL) = value2 * scalar;
5993  }
5994 
5995  if( j < N )
5996  {
5997  const size_t kbegin( ( IsLower_v<MT5> )
5998  ?( ( IsUpper_v<MT4> )
5999  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6000  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6001  :( IsUpper_v<MT4> ? i : 0UL ) );
6002 
6003  ElementType value{};
6004 
6005  for( size_t k=kbegin; k<K; ++k ) {
6006  value += A(i,k) * B(k,j);
6007  }
6008 
6009  C(i,j) = value * scalar;
6010  }
6011  }
6012  }
6013 
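   // Post-processing: for a symmetric or Hermitian result only the block-lower part has
   // been computed above, so the remaining upper elements are mirrored (and conjugated in
   // the Hermitian case); for purely lower/upper results the untouched triangle is reset.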
6014  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
6015  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
6016  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6017  for( size_t i=0UL; i<iend; ++i ) {
6018  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
6019  }
6020  }
6021  }
6022  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
6023  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
6024  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6025  for( size_t i=0UL; i<iend; ++i ) {
6026  reset( C(i,j) );
6027  }
6028  }
6029  }
6030  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
6031  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
6032  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
6033  for( size_t j=0UL; j<jend; ++j ) {
6034  reset( C(i,j) );
6035  }
6036  }
6037  }
6038  }
6039  //**********************************************************************************************
6040 
6041  //**Default assignment to dense matrices (large matrices)***************************************
6055  template< typename MT3 // Type of the left-hand side target matrix
6056  , typename MT4 // Type of the left-hand side matrix operand
6057  , typename MT5 // Type of the right-hand side matrix operand
6058  , typename ST2 > // Type of the scalar value
6059  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6060  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6061  {
6062  selectDefaultAssignKernel( C, A, B, scalar );
6063  }
6064  //**********************************************************************************************
6065 
6066  //**Vectorized default assignment to dense matrices (large matrices)****************************
6081  template< typename MT3 // Type of the left-hand side target matrix
6082  , typename MT4 // Type of the left-hand side matrix operand
6083  , typename MT5 // Type of the right-hand side matrix operand
6084  , typename ST2 > // Type of the scalar value
6085  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6086  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6087  {
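   // Dispatch to the blocked multiplication kernels: smmm/hmmm exploit a symmetric or
   // Hermitian result, lmmm/ummm a lower or upper result, and mmm handles the general
   // case. The trailing ST2(0) of lmmm/ummm/mmm presumably acts as the "beta" factor,
   // i.e. the target matrix C is overwritten.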
6088  if( SYM )
6089  smmm( C, A, B, scalar );
6090  else if( HERM )
6091  hmmm( C, A, B, scalar );
6092  else if( LOW )
6093  lmmm( C, A, B, scalar, ST2(0) );
6094  else if( UPP )
6095  ummm( C, A, B, scalar, ST2(0) );
6096  else
6097  mmm( C, A, B, scalar, ST2(0) );
6098  }
6099  //**********************************************************************************************
6100 
6101  //**BLAS-based assignment to dense matrices (default)*******************************************
6115  template< typename MT3 // Type of the left-hand side target matrix
6116  , typename MT4 // Type of the left-hand side matrix operand
6117  , typename MT5 // Type of the right-hand side matrix operand
6118  , typename ST2 > // Type of the scalar value
6119  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6120  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6121  {
6122  selectLargeAssignKernel( C, A, B, scalar );
6123  }
6124  //**********************************************************************************************
6125 
6126  //**BLAS-based assignment to dense matrices*****************************************************
6127 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6128 
6141  template< typename MT3 // Type of the left-hand side target matrix
6142  , typename MT4 // Type of the left-hand side matrix operand
6143  , typename MT5 // Type of the right-hand side matrix operand
6144  , typename ST2 > // Type of the scalar value
6145  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6146  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6147  {
6148  using ET = ElementType_t<MT3>;
6149 
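   // If one operand is triangular, the other operand is first assigned to C and the
   // product is then formed in place via the BLAS trmm() kernel; otherwise a single
   // gemm() call with beta = ET(0) overwrites C.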
6150  if( IsTriangular_v<MT4> ) {
6151  assign( C, B );
6152  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6153  }
6154  else if( IsTriangular_v<MT5> ) {
6155  assign( C, A );
6156  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6157  }
6158  else {
6159  gemm( C, A, B, ET(scalar), ET(0) );
6160  }
6161  }
6162 #endif
6163  //**********************************************************************************************
6164 
6165  //**Assignment to sparse matrices***************************************************************
6177  template< typename MT // Type of the target sparse matrix
6178  , bool SO > // Storage order of the target sparse matrix
6179  friend inline auto assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6180  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6181  {
6183 
6184  using TmpType = If_t< SO, ResultType, OppositeType >;
6185 
6192 
6193  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6194  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6195 
6196  const ForwardFunctor fwd;
6197 
6198  const TmpType tmp( serial( rhs ) );
6199  assign( ~lhs, fwd( tmp ) );
6200  }
6201  //**********************************************************************************************
6202 
6203  //**Restructuring assignment to row-major matrices**********************************************
6217  template< typename MT > // Type of the target matrix
6218  friend inline auto assign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
6219  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6220  {
6222 
6224 
6225  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6226  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6227 
6228  const ForwardFunctor fwd;
6229 
6230  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6231  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6232 
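   // At least one operand is known to be symmetric here, so it may be replaced by its
   // transpose; the restructured expression is presumably better suited for assignment
   // to the row-major target than the original column-major/column-major product.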
6233  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
6234  assign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
6235  else if( IsSymmetric_v<MT1> )
6236  assign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
6237  else
6238  assign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
6239  }
6240  //**********************************************************************************************
6241 
6242  //**Addition assignment to dense matrices*******************************************************
6254  template< typename MT // Type of the target dense matrix
6255  , bool SO > // Storage order of the target dense matrix
6256  friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6257  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6258  {
6260 
6261  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6262  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6263 
6264  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6265  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6266 
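   // Nothing to add if the target is empty or the inner dimension of the product is zero.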
6267  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6268  return;
6269  }
6270 
6271  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6272  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6273 
6274  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6275  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6276  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6277  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6278  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6279  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6280 
6281  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
6282  }
6283  //**********************************************************************************************
6284 
6285  //**Addition assignment to dense matrices (kernel selection)************************************
6296  template< typename MT3 // Type of the left-hand side target matrix
6297  , typename MT4 // Type of the left-hand side matrix operand
6298  , typename MT5 // Type of the right-hand side matrix operand
6299  , typename ST2 > // Type of the scalar value
6300  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6301  {
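   // Kernel selection heuristic: diagonal left-hand operands, small matrices (at most
   // SIMDSIZE*10 rows outside of debug mode), and products below TDMATTDMATMULT_THRESHOLD
   // are handled by the small kernel; everything else is forwarded to the BLAS-based
   // kernel, which itself falls back to the large default kernel if BLAS is unavailable.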
6302  if( ( IsDiagonal_v<MT4> ) ||
6303  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
6304  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6305  selectSmallAddAssignKernel( C, A, B, scalar );
6306  else
6307  selectBlasAddAssignKernel( C, A, B, scalar );
6308  }
6309  //**********************************************************************************************
6310 
6311  //**Default addition assignment to dense matrices (general/general)*****************************
6325  template< typename MT3 // Type of the left-hand side target matrix
6326  , typename MT4 // Type of the left-hand side matrix operand
6327  , typename MT5 // Type of the right-hand side matrix operand
6328  , typename ST2 > // Type of the scalar value
6329  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6330  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6331  {
6332  const ResultType tmp( serial( A * B * scalar ) );
6333  addAssign( C, tmp );
6334  }
6335  //**********************************************************************************************
6336 
6337  //**Default addition assignment to dense matrices (general/diagonal)****************************
6351  template< typename MT3 // Type of the left-hand side target matrix
6352  , typename MT4 // Type of the left-hand side matrix operand
6353  , typename MT5 // Type of the right-hand side matrix operand
6354  , typename ST2 > // Type of the scalar value
6355  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6356  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6357  {
6359 
6360  const size_t M( A.rows() );
6361  const size_t N( B.columns() );
6362 
6363  for( size_t j=0UL; j<N; ++j )
6364  {
6365  const size_t ibegin( ( IsLower_v<MT4> )
6366  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
6367  :( 0UL ) );
6368  const size_t iend( ( IsUpper_v<MT4> )
6369  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
6370  :( M ) );
6371  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6372 
6373  const size_t inum( iend - ibegin );
6374  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
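   // ipos rounds the row range down to an even count so that the following loop can be
   // unrolled by two; a possible single remaining row is handled separately afterwards.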
6375 
6376  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6377  C(i ,j) += A(i ,j) * B(j,j) * scalar;
6378  C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
6379  }
6380  if( ipos < iend ) {
6381  C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
6382  }
6383  }
6384  }
6385  //**********************************************************************************************
6386 
6387  //**Default addition assignment to dense matrices (diagonal/general)****************************
6401  template< typename MT3 // Type of the left-hand side target matrix
6402  , typename MT4 // Type of the left-hand side matrix operand
6403  , typename MT5 // Type of the right-hand side matrix operand
6404  , typename ST2 > // Type of the scalar value
6405  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6406  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6407  {
6409 
6410  const size_t M( A.rows() );
6411  const size_t N( B.columns() );
6412 
6413  for( size_t j=0UL; j<N; ++j )
6414  {
6415  const size_t ibegin( ( IsLower_v<MT5> )
6416  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
6417  :( 0UL ) );
6418  const size_t iend( ( IsUpper_v<MT5> )
6419  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
6420  :( M ) );
6421  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6422 
6423  const size_t inum( iend - ibegin );
6424  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6425 
6426  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6427  C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6428  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6429  }
6430  if( ipos < iend ) {
6431  C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
6432  }
6433  }
6434  }
6435  //**********************************************************************************************
6436 
6437  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
6451  template< typename MT3 // Type of the left-hand side target matrix
6452  , typename MT4 // Type of the left-hand side matrix operand
6453  , typename MT5 // Type of the right-hand side matrix operand
6454  , typename ST2 > // Type of the scalar value
6455  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6456  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6457  {
6459 
6460  for( size_t i=0UL; i<A.rows(); ++i ) {
6461  C(i,i) += A(i,i) * B(i,i) * scalar;
6462  }
6463  }
6464  //**********************************************************************************************
6465 
6466  //**Default addition assignment to dense matrices (small matrices)******************************
6480  template< typename MT3 // Type of the left-hand side target matrix
6481  , typename MT4 // Type of the left-hand side matrix operand
6482  , typename MT5 // Type of the right-hand side matrix operand
6483  , typename ST2 > // Type of the scalar value
6484  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6485  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6486  {
6487  selectDefaultAddAssignKernel( C, A, B, scalar );
6488  }
6489  //**********************************************************************************************
6490 
6491  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
6506  template< typename MT3 // Type of the left-hand side target matrix
6507  , typename MT4 // Type of the left-hand side matrix operand
6508  , typename MT5 // Type of the right-hand side matrix operand
6509  , typename ST2 > // Type of the scalar value
6510  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6511  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6512  {
6515  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
6516  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
6517 
6518  const ForwardFunctor fwd;
6519 
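   // One of the column-major operands is evaluated into its row-major opposite storage
   // order; preferring the non-resizable or smaller operand presumably keeps the temporary
   // cheap. The resulting mixed-major product is then added to the row-major target.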
6520  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
6521  const OppositeType_t<MT5> tmp( serial( B ) );
6522  addAssign( C, fwd( A * tmp ) * scalar );
6523  }
6524  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
6525  const OppositeType_t<MT4> tmp( serial( A ) );
6526  addAssign( C, fwd( tmp * B ) * scalar );
6527  }
6528  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6529  const OppositeType_t<MT5> tmp( serial( B ) );
6530  addAssign( C, fwd( A * tmp ) * scalar );
6531  }
6532  else {
6533  const OppositeType_t<MT4> tmp( serial( A ) );
6534  addAssign( C, fwd( tmp * B ) * scalar );
6535  }
6536  }
6537  //**********************************************************************************************
6538 
6539  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
6554  template< typename MT3 // Type of the left-hand side target matrix
6555  , typename MT4 // Type of the left-hand side matrix operand
6556  , typename MT5 // Type of the right-hand side matrix operand
6557  , typename ST2 > // Type of the scalar value
6558  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6559  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6560  {
6561  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
6562 
6563  const size_t M( A.rows() );
6564  const size_t N( B.columns() );
6565  const size_t K( A.columns() );
6566 
6567  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6568 
6569  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
6570  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
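   // Without padding, only complete SIMD blocks of rows can be processed vectorized (up to
   // ipos); the remaining rows are handled by the scalar epilogue at the end of the kernel.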
6571 
6572  const SIMDType factor( set( scalar ) );
6573 
6574  size_t i( 0UL );
6575 
6576  if( IsIntegral_v<ElementType> )
6577  {
6578  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6579  for( size_t j=0UL; j<N; ++j )
6580  {
6581  const size_t kbegin( ( IsLower_v<MT5> )
6582  ?( ( IsUpper_v<MT4> )
6583  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6584  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6585  :( IsUpper_v<MT4> ? i : 0UL ) );
6586  const size_t kend( ( IsUpper_v<MT5> )
6587  ?( ( IsLower_v<MT4> )
6588  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
6589  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
6590  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
6591 
6592  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6593 
6594  for( size_t k=kbegin; k<kend; ++k ) {
6595  const SIMDType b1( set( B(k,j) ) );
6596  xmm1 += A.load(i ,k) * b1;
6597  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6598  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6599  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6600  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6601  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
6602  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
6603  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
6604  }
6605 
6606  C.store( i , j, C.load(i ,j) + xmm1 * factor );
6607  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
6608  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6609  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6610  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
6611  C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
6612  C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
6613  C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
6614  }
6615  }
6616  }
6617 
6618  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
6619  {
6620  size_t j( 0UL );
6621 
6622  for( ; (j+2UL) <= N; j+=2UL )
6623  {
6624  const size_t kbegin( ( IsLower_v<MT5> )
6625  ?( ( IsUpper_v<MT4> )
6626  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6627  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6628  :( IsUpper_v<MT4> ? i : 0UL ) );
6629  const size_t kend( ( IsUpper_v<MT5> )
6630  ?( ( IsLower_v<MT4> )
6631  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6632  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6633  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
6634 
6635  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6636 
6637  for( size_t k=kbegin; k<kend; ++k ) {
6638  const SIMDType a1( A.load(i ,k) );
6639  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6640  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6641  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6642  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6643  const SIMDType b1( set( B(k,j ) ) );
6644  const SIMDType b2( set( B(k,j+1UL) ) );
6645  xmm1 += a1 * b1;
6646  xmm2 += a2 * b1;
6647  xmm3 += a3 * b1;
6648  xmm4 += a4 * b1;
6649  xmm5 += a5 * b1;
6650  xmm6 += a1 * b2;
6651  xmm7 += a2 * b2;
6652  xmm8 += a3 * b2;
6653  xmm9 += a4 * b2;
6654  xmm10 += a5 * b2;
6655  }
6656 
6657  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6658  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
6659  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6660  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6661  C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
6662  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
6663  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
6664  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
6665  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
6666  C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
6667  }
6668 
6669  if( j < N )
6670  {
6671  const size_t kbegin( ( IsLower_v<MT5> )
6672  ?( ( IsUpper_v<MT4> )
6673  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6674  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6675  :( IsUpper_v<MT4> ? i : 0UL ) );
6676  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
6677 
6678  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6679 
6680  for( size_t k=kbegin; k<kend; ++k ) {
6681  const SIMDType b1( set( B(k,j) ) );
6682  xmm1 += A.load(i ,k) * b1;
6683  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6684  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6685  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6686  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6687  }
6688 
6689  C.store( i , j, C.load(i ,j) + xmm1 * factor );
6690  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
6691  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6692  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6693  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
6694  }
6695  }
6696 
6697  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6698  {
6699  size_t j( 0UL );
6700 
6701  for( ; (j+2UL) <= N; j+=2UL )
6702  {
6703  const size_t kbegin( ( IsLower_v<MT5> )
6704  ?( ( IsUpper_v<MT4> )
6705  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6706  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6707  :( IsUpper_v<MT4> ? i : 0UL ) );
6708  const size_t kend( ( IsUpper_v<MT5> )
6709  ?( ( IsLower_v<MT4> )
6710  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6711  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6712  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
6713 
6714  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6715 
6716  for( size_t k=kbegin; k<kend; ++k ) {
6717  const SIMDType a1( A.load(i ,k) );
6718  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6719  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6720  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6721  const SIMDType b1( set( B(k,j ) ) );
6722  const SIMDType b2( set( B(k,j+1UL) ) );
6723  xmm1 += a1 * b1;
6724  xmm2 += a2 * b1;
6725  xmm3 += a3 * b1;
6726  xmm4 += a4 * b1;
6727  xmm5 += a1 * b2;
6728  xmm6 += a2 * b2;
6729  xmm7 += a3 * b2;
6730  xmm8 += a4 * b2;
6731  }
6732 
6733  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6734  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
6735  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6736  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6737  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
6738  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
6739  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
6740  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
6741  }
6742 
6743  if( j < N )
6744  {
6745  const size_t kbegin( ( IsLower_v<MT5> )
6746  ?( ( IsUpper_v<MT4> )
6747  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6748  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6749  :( IsUpper_v<MT4> ? i : 0UL ) );
6750  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
6751 
6752  SIMDType xmm1, xmm2, xmm3, xmm4;
6753 
6754  for( size_t k=kbegin; k<kend; ++k ) {
6755  const SIMDType b1( set( B(k,j) ) );
6756  xmm1 += A.load(i ,k) * b1;
6757  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6758  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6759  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6760  }
6761 
6762  C.store( i , j, C.load(i ,j) + xmm1 * factor );
6763  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
6764  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6765  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6766  }
6767  }
6768 
6769  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
6770  {
6771  size_t j( 0UL );
6772 
6773  for( ; (j+2UL) <= N; j+=2UL )
6774  {
6775  const size_t kbegin( ( IsLower_v<MT5> )
6776  ?( ( IsUpper_v<MT4> )
6777  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6778  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6779  :( IsUpper_v<MT4> ? i : 0UL ) );
6780  const size_t kend( ( IsUpper_v<MT5> )
6781  ?( ( IsLower_v<MT4> )
6782  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6783  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6784  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
6785 
6786  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6787 
6788  for( size_t k=kbegin; k<kend; ++k ) {
6789  const SIMDType a1( A.load(i ,k) );
6790  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6791  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6792  const SIMDType b1( set( B(k,j ) ) );
6793  const SIMDType b2( set( B(k,j+1UL) ) );
6794  xmm1 += a1 * b1;
6795  xmm2 += a2 * b1;
6796  xmm3 += a3 * b1;
6797  xmm4 += a1 * b2;
6798  xmm5 += a2 * b2;
6799  xmm6 += a3 * b2;
6800  }
6801 
6802  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6803  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
6804  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6805  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
6806  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
6807  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
6808  }
6809 
6810  if( j < N )
6811  {
6812  const size_t kbegin( ( IsLower_v<MT5> )
6813  ?( ( IsUpper_v<MT4> )
6814  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6815  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6816  :( IsUpper_v<MT4> ? i : 0UL ) );
6817  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
6818 
6819  SIMDType xmm1, xmm2, xmm3;
6820 
6821  for( size_t k=kbegin; k<kend; ++k ) {
6822  const SIMDType b1( set( B(k,j) ) );
6823  xmm1 += A.load(i ,k) * b1;
6824  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6825  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6826  }
6827 
6828  C.store( i , j, C.load(i ,j) + xmm1 * factor );
6829  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
6830  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6831  }
6832  }
6833 
6834  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6835  {
6836  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
6837  size_t j( UPP ? i : 0UL );
6838 
6839  for( ; (j+4UL) <= jend; j+=4UL )
6840  {
6841  const size_t kbegin( ( IsLower_v<MT5> )
6842  ?( ( IsUpper_v<MT4> )
6843  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6844  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6845  :( IsUpper_v<MT4> ? i : 0UL ) );
6846  const size_t kend( ( IsUpper_v<MT5> )
6847  ?( ( IsLower_v<MT4> )
6848  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
6849  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
6850  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6851 
6852  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6853 
6854  for( size_t k=kbegin; k<kend; ++k ) {
6855  const SIMDType a1( A.load(i ,k) );
6856  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6857  const SIMDType b1( set( B(k,j ) ) );
6858  const SIMDType b2( set( B(k,j+1UL) ) );
6859  const SIMDType b3( set( B(k,j+2UL) ) );
6860  const SIMDType b4( set( B(k,j+3UL) ) );
6861  xmm1 += a1 * b1;
6862  xmm2 += a2 * b1;
6863  xmm3 += a1 * b2;
6864  xmm4 += a2 * b2;
6865  xmm5 += a1 * b3;
6866  xmm6 += a2 * b3;
6867  xmm7 += a1 * b4;
6868  xmm8 += a2 * b4;
6869  }
6870 
6871  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6872  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
6873  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
6874  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
6875  C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
6876  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
6877  C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
6878  C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
6879  }
6880 
6881  for( ; (j+3UL) <= jend; j+=3UL )
6882  {
6883  const size_t kbegin( ( IsLower_v<MT5> )
6884  ?( ( IsUpper_v<MT4> )
6885  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6886  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6887  :( IsUpper_v<MT4> ? i : 0UL ) );
6888  const size_t kend( ( IsUpper_v<MT5> )
6889  ?( ( IsLower_v<MT4> )
6890  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
6891  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
6892  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6893 
6894  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6895 
6896  for( size_t k=kbegin; k<kend; ++k ) {
6897  const SIMDType a1( A.load(i ,k) );
6898  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6899  const SIMDType b1( set( B(k,j ) ) );
6900  const SIMDType b2( set( B(k,j+1UL) ) );
6901  const SIMDType b3( set( B(k,j+2UL) ) );
6902  xmm1 += a1 * b1;
6903  xmm2 += a2 * b1;
6904  xmm3 += a1 * b2;
6905  xmm4 += a2 * b2;
6906  xmm5 += a1 * b3;
6907  xmm6 += a2 * b3;
6908  }
6909 
6910  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6911  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
6912  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
6913  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
6914  C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
6915  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
6916  }
6917 
6918  for( ; (j+2UL) <= jend; j+=2UL )
6919  {
6920  const size_t kbegin( ( IsLower_v<MT5> )
6921  ?( ( IsUpper_v<MT4> )
6922  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6923  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6924  :( IsUpper_v<MT4> ? i : 0UL ) );
6925  const size_t kend( ( IsUpper_v<MT5> )
6926  ?( ( IsLower_v<MT4> )
6927  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6928  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6929  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6930 
6931  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6932  size_t k( kbegin );
6933 
6934  for( ; (k+2UL) <= kend; k+=2UL ) {
6935  const SIMDType a1( A.load(i ,k ) );
6936  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6937  const SIMDType a3( A.load(i ,k+1UL) );
6938  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6939  const SIMDType b1( set( B(k ,j ) ) );
6940  const SIMDType b2( set( B(k ,j+1UL) ) );
6941  const SIMDType b3( set( B(k+1UL,j ) ) );
6942  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
6943  xmm1 += a1 * b1;
6944  xmm2 += a2 * b1;
6945  xmm3 += a1 * b2;
6946  xmm4 += a2 * b2;
6947  xmm5 += a3 * b3;
6948  xmm6 += a4 * b3;
6949  xmm7 += a3 * b4;
6950  xmm8 += a4 * b4;
6951  }
6952 
6953  for( ; k<kend; ++k ) {
6954  const SIMDType a1( A.load(i ,k) );
6955  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6956  const SIMDType b1( set( B(k,j ) ) );
6957  const SIMDType b2( set( B(k,j+1UL) ) );
6958  xmm1 += a1 * b1;
6959  xmm2 += a2 * b1;
6960  xmm3 += a1 * b2;
6961  xmm4 += a2 * b2;
6962  }
6963 
6964  C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
6965  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
6966  C.store( i , j+1UL, C.load(i ,j+1UL) + (xmm3+xmm7) * factor );
6967  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
6968  }
6969 
6970  if( j < jend )
6971  {
6972  const size_t kbegin( ( IsLower_v<MT5> )
6973  ?( ( IsUpper_v<MT4> )
6974  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6975  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6976  :( IsUpper_v<MT4> ? i : 0UL ) );
6977  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
6978 
6979  SIMDType xmm1, xmm2, xmm3, xmm4;
6980  size_t k( kbegin );
6981 
6982  for( ; (k+2UL) <= kend; k+=2UL ) {
6983  const SIMDType b1( set( B(k ,j) ) );
6984  const SIMDType b2( set( B(k+1UL,j) ) );
6985  xmm1 += A.load(i ,k ) * b1;
6986  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
6987  xmm3 += A.load(i ,k+1UL) * b2;
6988  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
6989  }
6990 
6991  for( ; k<kend; ++k ) {
6992  const SIMDType b1( set( B(k,j) ) );
6993  xmm1 += A.load(i ,k) * b1;
6994  xmm2 += A.load(i+SIMDSIZE,k) * b1;
6995  }
6996 
6997  C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
6998  C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
6999  }
7000  }
7001 
7002  for( ; i<ipos; i+=SIMDSIZE )
7003  {
7004  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
7005  size_t j( UPP ? i : 0UL );
7006 
7007  for( ; (j+4UL) <= jend; j+=4UL )
7008  {
7009  const size_t kbegin( ( IsLower_v<MT5> )
7010  ?( ( IsUpper_v<MT4> )
7011  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7012  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7013  :( IsUpper_v<MT4> ? i : 0UL ) );
7014  const size_t kend( ( IsUpper_v<MT5> )
7015  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
7016  :( K ) );
7017 
7018  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7019  size_t k( kbegin );
7020 
7021  for( ; (k+2UL) <= kend; k+=2UL ) {
7022  const SIMDType a1( A.load(i,k ) );
7023  const SIMDType a2( A.load(i,k+1UL) );
7024  xmm1 += a1 * set( B(k ,j ) );
7025  xmm2 += a1 * set( B(k ,j+1UL) );
7026  xmm3 += a1 * set( B(k ,j+2UL) );
7027  xmm4 += a1 * set( B(k ,j+3UL) );
7028  xmm5 += a2 * set( B(k+1UL,j ) );
7029  xmm6 += a2 * set( B(k+1UL,j+1UL) );
7030  xmm7 += a2 * set( B(k+1UL,j+2UL) );
7031  xmm8 += a2 * set( B(k+1UL,j+3UL) );
7032  }
7033 
7034  for( ; k<kend; ++k ) {
7035  const SIMDType a1( A.load(i,k) );
7036  xmm1 += a1 * set( B(k,j ) );
7037  xmm2 += a1 * set( B(k,j+1UL) );
7038  xmm3 += a1 * set( B(k,j+2UL) );
7039  xmm4 += a1 * set( B(k,j+3UL) );
7040  }
7041 
7042  C.store( i, j , C.load(i,j ) + (xmm1+xmm5) * factor );
7043  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm6) * factor );
7044  C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm7) * factor );
7045  C.store( i, j+3UL, C.load(i,j+3UL) + (xmm4+xmm8) * factor );
7046  }
7047 
7048  for( ; (j+3UL) <= jend; j+=3UL )
7049  {
7050  const size_t kbegin( ( IsLower_v<MT5> )
7051  ?( ( IsUpper_v<MT4> )
7052  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7053  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7054  :( IsUpper_v<MT4> ? i : 0UL ) );
7055  const size_t kend( ( IsUpper_v<MT5> )
7056  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
7057  :( K ) );
7058 
7059  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7060  size_t k( kbegin );
7061 
7062  for( ; (k+2UL) <= kend; k+=2UL ) {
7063  const SIMDType a1( A.load(i,k ) );
7064  const SIMDType a2( A.load(i,k+1UL) );
7065  xmm1 += a1 * set( B(k ,j ) );
7066  xmm2 += a1 * set( B(k ,j+1UL) );
7067  xmm3 += a1 * set( B(k ,j+2UL) );
7068  xmm4 += a2 * set( B(k+1UL,j ) );
7069  xmm5 += a2 * set( B(k+1UL,j+1UL) );
7070  xmm6 += a2 * set( B(k+1UL,j+2UL) );
7071  }
7072 
7073  for( ; k<kend; ++k ) {
7074  const SIMDType a1( A.load(i,k) );
7075  xmm1 += a1 * set( B(k,j ) );
7076  xmm2 += a1 * set( B(k,j+1UL) );
7077  xmm3 += a1 * set( B(k,j+2UL) );
7078  }
7079 
7080  C.store( i, j , C.load(i,j ) + (xmm1+xmm4) * factor );
7081  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm5) * factor );
7082  C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm6) * factor );
7083  }
7084 
7085  for( ; (j+2UL) <= jend; j+=2UL )
7086  {
7087  const size_t kbegin( ( IsLower_v<MT5> )
7088  ?( ( IsUpper_v<MT4> )
7089  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7090  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7091  :( IsUpper_v<MT4> ? i : 0UL ) );
7092  const size_t kend( ( IsUpper_v<MT5> )
7093  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7094  :( K ) );
7095 
7096  SIMDType xmm1, xmm2, xmm3, xmm4;
7097  size_t k( kbegin );
7098 
7099  for( ; (k+2UL) <= kend; k+=2UL ) {
7100  const SIMDType a1( A.load(i,k ) );
7101  const SIMDType a2( A.load(i,k+1UL) );
7102  xmm1 += a1 * set( B(k ,j ) );
7103  xmm2 += a1 * set( B(k ,j+1UL) );
7104  xmm3 += a2 * set( B(k+1UL,j ) );
7105  xmm4 += a2 * set( B(k+1UL,j+1UL) );
7106  }
7107 
7108  for( ; k<kend; ++k ) {
7109  const SIMDType a1( A.load(i,k) );
7110  xmm1 += a1 * set( B(k,j ) );
7111  xmm2 += a1 * set( B(k,j+1UL) );
7112  }
7113 
7114  C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
7115  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm4) * factor );
7116  }
7117 
7118  if( j < jend )
7119  {
7120  const size_t kbegin( ( IsLower_v<MT5> )
7121  ?( ( IsUpper_v<MT4> )
7122  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7123  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7124  :( IsUpper_v<MT4> ? i : 0UL ) );
7125 
7126  SIMDType xmm1, xmm2;
7127  size_t k( kbegin );
7128 
7129  for( ; (k+2UL) <= K; k+=2UL ) {
7130  xmm1 += A.load(i,k ) * set( B(k ,j) );
7131  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
7132  }
7133 
7134  for( ; k<K; ++k ) {
7135  xmm1 += A.load(i,k) * set( B(k,j) );
7136  }
7137 
7138  C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
7139  }
7140  }
7141 
7142  for( ; remainder && i<M; ++i )
7143  {
7144  const size_t jend( LOW ? i+1UL : N );
7145  size_t j( UPP ? i : 0UL );
7146 
7147  for( ; (j+2UL) <= jend; j+=2UL )
7148  {
7149  const size_t kbegin( ( IsLower_v<MT5> )
7150  ?( ( IsUpper_v<MT4> )
7151  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7152  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7153  :( IsUpper_v<MT4> ? i : 0UL ) );
7154  const size_t kend( ( IsUpper_v<MT5> )
7155  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7156  :( K ) );
7157 
7158  ElementType value1{};
7159  ElementType value2{};
7160 
7161  for( size_t k=kbegin; k<kend; ++k ) {
7162  value1 += A(i,k) * B(k,j );
7163  value2 += A(i,k) * B(k,j+1UL);
7164  }
7165 
7166  C(i,j ) += value1 * scalar;
7167  C(i,j+1UL) += value2 * scalar;
7168  }
7169 
7170  if( j < jend )
7171  {
7172  const size_t kbegin( ( IsLower_v<MT5> )
7173  ?( ( IsUpper_v<MT4> )
7174  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7175  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7176  :( IsUpper_v<MT4> ? i : 0UL ) );
7177 
7178  ElementType value{};
7179 
7180  for( size_t k=kbegin; k<K; ++k ) {
7181  value += A(i,k) * B(k,j);
7182  }
7183 
7184  C(i,j) += value * scalar;
7185  }
7186  }
7187  }
7188  //**********************************************************************************************
7189 
7190  //**Default addition assignment to dense matrices (large matrices)******************************
7204  template< typename MT3 // Type of the left-hand side target matrix
7205  , typename MT4 // Type of the left-hand side matrix operand
7206  , typename MT5 // Type of the right-hand side matrix operand
7207  , typename ST2 > // Type of the scalar value
7208  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7209  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7210  {
7211  selectDefaultAddAssignKernel( C, A, B, scalar );
7212  }
7213  //**********************************************************************************************
7214 
7215  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
7230  template< typename MT3 // Type of the left-hand side target matrix
7231  , typename MT4 // Type of the left-hand side matrix operand
7232  , typename MT5 // Type of the right-hand side matrix operand
7233  , typename ST2 > // Type of the scalar value
7234  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7235  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7236  {
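   // Same blocked kernels as in the assignment case, but with ST2(1) as the "beta" factor
   // so that the scaled product is accumulated onto the existing values of C.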
7237  if( LOW )
7238  lmmm( C, A, B, scalar, ST2(1) );
7239  else if( UPP )
7240  ummm( C, A, B, scalar, ST2(1) );
7241  else
7242  mmm( C, A, B, scalar, ST2(1) );
7243  }
7244  //**********************************************************************************************
7245 
7246  //**BLAS-based addition assignment to dense matrices (default)**********************************
7261  template< typename MT3 // Type of the left-hand side target matrix
7262  , typename MT4 // Type of the left-hand side matrix operand
7263  , typename MT5 // Type of the right-hand side matrix operand
7264  , typename ST2 > // Type of the scalar value
7265  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7266  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7267  {
7268  selectLargeAddAssignKernel( C, A, B, scalar );
7269  }
7270  //**********************************************************************************************
7271 
7272  //**BLAS-based addition assignment to dense matrices********************************************
7273 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7274 
7287  template< typename MT3 // Type of the left-hand side target matrix
7288  , typename MT4 // Type of the left-hand side matrix operand
7289  , typename MT5 // Type of the right-hand side matrix operand
7290  , typename ST2 > // Type of the scalar value
7291  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7292  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7293  {
7294  using ET = ElementType_t<MT3>;
7295 
7296  if( IsTriangular_v<MT4> ) {
7297  ResultType_t<MT3> tmp( serial( B ) );
7298  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7299  addAssign( C, tmp );
7300  }
7301  else if( IsTriangular_v<MT5> ) {
7302  ResultType_t<MT3> tmp( serial( A ) );
7303  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7304  addAssign( C, tmp );
7305  }
7306  else {
7307  gemm( C, A, B, ET(scalar), ET(1) );
7308  }
7309  }
7310 #endif
7311  //**********************************************************************************************
7312 
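 // Illustrative usage (annotation; assumes <blaze/Math.h>): an addition assignment of a
 // scaled column-major matrix product, e.g.
 //
 //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 50UL, 20UL ), B( 20UL, 30UL ), C( 50UL, 30UL );
 //    C += ( A * B ) * 2.0;
 //
 // is represented by this DMatScalarMultExpr specialization; the addAssign() machinery above
 // then selects the small, large, or BLAS-based kernel depending on operand types and sizes.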
7313  //**Restructuring addition assignment to row-major matrices*************************************
7328  template< typename MT > // Type of the target matrix
7329  friend inline auto addAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7330  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7331  {
7333 
7335 
7336  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7337  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7338 
7339  const ForwardFunctor fwd;
7340 
7341  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7342  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7343 
7344  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
7345  addAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
7346  else if( IsSymmetric_v<MT1> )
7347  addAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
7348  else
7349  addAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
7350  }
7351  //**********************************************************************************************
7352 
7353  //**Addition assignment to sparse matrices******************************************************
7354  // No special implementation for the addition assignment to sparse matrices.
7355  //**********************************************************************************************
7356 
7357  //**Subtraction assignment to dense matrices****************************************************
7369  template< typename MT // Type of the target dense matrix
7370  , bool SO > // Storage order of the target dense matrix
7371  friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7372  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7373  {
7375 
7376  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7377  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7378 
7379  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7380  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7381 
7382  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7383  return;
7384  }
7385 
7386  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7387  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7388 
7389  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7390  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7391  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7392  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7393  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7394  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7395 
7396  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
7397  }
7398  //**********************************************************************************************
7399 
7400  //**Subtraction assignment to dense matrices (kernel selection)*********************************
7411  template< typename MT3 // Type of the left-hand side target matrix
7412  , typename MT4 // Type of the left-hand side matrix operand
7413  , typename MT5 // Type of the right-hand side matrix operand
7414  , typename ST2 > // Type of the scalar value
7415  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7416  {
7417  if( ( IsDiagonal_v<MT4> ) ||
7418  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
7419  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
7420  selectSmallSubAssignKernel( C, A, B, scalar );
7421  else
7422  selectBlasSubAssignKernel( C, A, B, scalar );
7423  }
7424  //**********************************************************************************************
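 // Annotation: the dispatch above mirrors the addition assignment case. Diagonal left
 // operands, small operands (A.rows() <= SIMDSIZE*10 in release mode), and products below
 // TDMATTDMATMULT_THRESHOLD are handled by the small kernel; everything else goes to
 // selectBlasSubAssignKernel(), which falls back to the large blocked kernel whenever no
 // suitable BLAS kernel is available.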
7425 
7426  //**Default subtraction assignment to dense matrices (general/general)**************************
7440  template< typename MT3 // Type of the left-hand side target matrix
7441  , typename MT4 // Type of the left-hand side matrix operand
7442  , typename MT5 // Type of the right-hand side matrix operand
7443  , typename ST2 > // Type of the scalar value
7444  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7445  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7446  {
7447  const ResultType tmp( serial( A * B * scalar ) );
7448  subAssign( C, tmp );
7449  }
7450  //**********************************************************************************************
7451 
7452  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
7466  template< typename MT3 // Type of the left-hand side target matrix
7467  , typename MT4 // Type of the left-hand side matrix operand
7468  , typename MT5 // Type of the right-hand side matrix operand
7469  , typename ST2 > // Type of the scalar value
7470  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7471  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7472  {
7474 
7475  const size_t M( A.rows() );
7476  const size_t N( B.columns() );
7477 
7478  for( size_t j=0UL; j<N; ++j )
7479  {
7480  const size_t ibegin( ( IsLower_v<MT4> )
7481  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
7482  :( 0UL ) );
7483  const size_t iend( ( IsUpper_v<MT4> )
7484  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
7485  :( M ) );
7486  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7487 
7488  const size_t inum( iend - ibegin );
7489  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7490 
7491  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7492  C(i ,j) -= A(i ,j) * B(j,j) * scalar;
7493  C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
7494  }
7495  if( ipos < iend ) {
7496  C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
7497  }
7498  }
7499  }
7500  //**********************************************************************************************
7501 
7502  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
7516  template< typename MT3 // Type of the left-hand side target matrix
7517  , typename MT4 // Type of the left-hand side matrix operand
7518  , typename MT5 // Type of the right-hand side matrix operand
7519  , typename ST2 > // Type of the scalar value
7520  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7521  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7522  {
7524 
7525  const size_t M( A.rows() );
7526  const size_t N( B.columns() );
7527 
7528  for( size_t j=0UL; j<N; ++j )
7529  {
7530  const size_t ibegin( ( IsLower_v<MT5> )
7531  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7532  :( 0UL ) );
7533  const size_t iend( ( IsUpper_v<MT5> )
7534  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7535  :( M ) );
7536  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7537 
7538  const size_t inum( iend - ibegin );
7539  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7540 
7541  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7542  C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7543  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
7544  }
7545  if( ipos < iend ) {
7546  C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
7547  }
7548  }
7549  }
7550  //**********************************************************************************************
7551 
7552  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
7566  template< typename MT3 // Type of the left-hand side target matrix
7567  , typename MT4 // Type of the left-hand side matrix operand
7568  , typename MT5 // Type of the right-hand side matrix operand
7569  , typename ST2 > // Type of the scalar value
7570  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7571  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7572  {
7574 
7575  for( size_t i=0UL; i<A.rows(); ++i ) {
7576  C(i,i) -= A(i,i) * B(i,i) * scalar;
7577  }
7578  }
7579  //**********************************************************************************************
7580 
7581  //**Default subtraction assignment to dense matrices (small matrices)***************************
7595  template< typename MT3 // Type of the left-hand side target matrix
7596  , typename MT4 // Type of the left-hand side matrix operand
7597  , typename MT5 // Type of the right-hand side matrix operand
7598  , typename ST2 > // Type of the scalar value
7599  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7600  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7601  {
7602  selectDefaultSubAssignKernel( C, A, B, scalar );
7603  }
7604  //**********************************************************************************************
7605 
7606  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
7621  template< typename MT3 // Type of the left-hand side target matrix
7622  , typename MT4 // Type of the left-hand side matrix operand
7623  , typename MT5 // Type of the right-hand side matrix operand
7624  , typename ST2 > // Type of the scalar value
7625  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7626  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7627  {
7630  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT4> );
7631  BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE( OppositeType_t<MT5> );
7632 
7633  const ForwardFunctor fwd;
7634 
7635  if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7636  const OppositeType_t<MT5> tmp( serial( B ) );
7637  subAssign( C, fwd( A * tmp ) * scalar );
7638  }
7639  else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7640  const OppositeType_t<MT4> tmp( serial( A ) );
7641  subAssign( C, fwd( tmp * B ) * scalar );
7642  }
7643  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
7644  const OppositeType_t<MT5> tmp( serial( B ) );
7645  subAssign( C, fwd( A * tmp ) * scalar );
7646  }
7647  else {
7648  const OppositeType_t<MT4> tmp( serial( A ) );
7649  subAssign( C, fwd( tmp * B ) * scalar );
7650  }
7651  }
7652  //**********************************************************************************************
7653 
7654  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
7669  template< typename MT3 // Type of the left-hand side target matrix
7670  , typename MT4 // Type of the left-hand side matrix operand
7671  , typename MT5 // Type of the right-hand side matrix operand
7672  , typename ST2 > // Type of the scalar value
7673  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7674  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7675  {
7676  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
7677 
7678  const size_t M( A.rows() );
7679  const size_t N( B.columns() );
7680  const size_t K( A.columns() );
7681 
7682  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7683 
7684  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
7685  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
7686 
7687  const SIMDType factor( set( scalar ) );
7688 
7689  size_t i( 0UL );
7690 
7691  if( IsIntegral_v<ElementType> )
7692  {
7693  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
7694  for( size_t j=0UL; j<N; ++j )
7695  {
7696  const size_t kbegin( ( IsLower_v<MT5> )
7697  ?( ( IsUpper_v<MT4> )
7698  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7699  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7700  :( IsUpper_v<MT4> ? i : 0UL ) );
7701  const size_t kend( ( IsUpper_v<MT5> )
7702  ?( ( IsLower_v<MT4> )
7703  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
7704  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
7705  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
7706 
7707  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7708 
7709  for( size_t k=kbegin; k<kend; ++k ) {
7710  const SIMDType b1( set( B(k,j) ) );
7711  xmm1 += A.load(i ,k) * b1;
7712  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7713  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7714  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7715  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7716  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
7717  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
7718  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
7719  }
7720 
7721  C.store( i , j, C.load(i ,j) - xmm1 * factor );
7722  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
7723  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7724  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7725  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
7726  C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
7727  C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
7728  C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
7729  }
7730  }
7731  }
7732 
7733  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
7734  {
7735  size_t j( 0UL );
7736 
7737  for( ; (j+2UL) <= N; j+=2UL )
7738  {
7739  const size_t kbegin( ( IsLower_v<MT5> )
7740  ?( ( IsUpper_v<MT4> )
7741  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7742  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7743  :( IsUpper_v<MT4> ? i : 0UL ) );
7744  const size_t kend( ( IsUpper_v<MT5> )
7745  ?( ( IsLower_v<MT4> )
7746  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7747  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7748  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
7749 
7750  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7751 
7752  for( size_t k=kbegin; k<kend; ++k ) {
7753  const SIMDType a1( A.load(i ,k) );
7754  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7755  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7756  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7757  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
7758  const SIMDType b1( set( B(k,j ) ) );
7759  const SIMDType b2( set( B(k,j+1UL) ) );
7760  xmm1 += a1 * b1;
7761  xmm2 += a2 * b1;
7762  xmm3 += a3 * b1;
7763  xmm4 += a4 * b1;
7764  xmm5 += a5 * b1;
7765  xmm6 += a1 * b2;
7766  xmm7 += a2 * b2;
7767  xmm8 += a3 * b2;
7768  xmm9 += a4 * b2;
7769  xmm10 += a5 * b2;
7770  }
7771 
7772  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7773  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
7774  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7775  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7776  C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
7777  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
7778  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
7779  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
7780  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
7781  C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
7782  }
7783 
7784  if( j < N )
7785  {
7786  const size_t kbegin( ( IsLower_v<MT5> )
7787  ?( ( IsUpper_v<MT4> )
7788  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7789  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7790  :( IsUpper_v<MT4> ? i : 0UL ) );
7791  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
7792 
7793  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7794 
7795  for( size_t k=kbegin; k<kend; ++k ) {
7796  const SIMDType b1( set( B(k,j) ) );
7797  xmm1 += A.load(i ,k) * b1;
7798  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7799  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7800  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7801  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7802  }
7803 
7804  C.store( i , j, C.load(i ,j) - xmm1 * factor );
7805  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
7806  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7807  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7808  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
7809  }
7810  }
7811 
7812  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7813  {
7814  size_t j( 0UL );
7815 
7816  for( ; (j+2UL) <= N; j+=2UL )
7817  {
7818  const size_t kbegin( ( IsLower_v<MT5> )
7819  ?( ( IsUpper_v<MT4> )
7820  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7821  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7822  :( IsUpper_v<MT4> ? i : 0UL ) );
7823  const size_t kend( ( IsUpper_v<MT5> )
7824  ?( ( IsLower_v<MT4> )
7825  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7826  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7827  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
7828 
7829  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7830 
7831  for( size_t k=kbegin; k<kend; ++k ) {
7832  const SIMDType a1( A.load(i ,k) );
7833  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7834  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7835  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7836  const SIMDType b1( set( B(k,j ) ) );
7837  const SIMDType b2( set( B(k,j+1UL) ) );
7838  xmm1 += a1 * b1;
7839  xmm2 += a2 * b1;
7840  xmm3 += a3 * b1;
7841  xmm4 += a4 * b1;
7842  xmm5 += a1 * b2;
7843  xmm6 += a2 * b2;
7844  xmm7 += a3 * b2;
7845  xmm8 += a4 * b2;
7846  }
7847 
7848  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7849  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
7850  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7851  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7852  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
7853  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
7854  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
7855  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
7856  }
7857 
7858  if( j < N )
7859  {
7860  const size_t kbegin( ( IsLower_v<MT5> )
7861  ?( ( IsUpper_v<MT4> )
7862  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7863  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7864  :( IsUpper_v<MT4> ? i : 0UL ) );
7865  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
7866 
7867  SIMDType xmm1, xmm2, xmm3, xmm4;
7868 
7869  for( size_t k=kbegin; k<kend; ++k ) {
7870  const SIMDType b1( set( B(k,j) ) );
7871  xmm1 += A.load(i ,k) * b1;
7872  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7873  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7874  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7875  }
7876 
7877  C.store( i , j, C.load(i ,j) - xmm1 * factor );
7878  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
7879  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7880  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7881  }
7882  }
7883 
7884  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
7885  {
7886  size_t j( 0UL );
7887 
7888  for( ; (j+2UL) <= N; j+=2UL )
7889  {
7890  const size_t kbegin( ( IsLower_v<MT5> )
7891  ?( ( IsUpper_v<MT4> )
7892  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7893  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7894  :( IsUpper_v<MT4> ? i : 0UL ) );
7895  const size_t kend( ( IsUpper_v<MT5> )
7896  ?( ( IsLower_v<MT4> )
7897  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7898  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7899  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
7900 
7901  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7902 
7903  for( size_t k=kbegin; k<kend; ++k ) {
7904  const SIMDType a1( A.load(i ,k) );
7905  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7906  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7907  const SIMDType b1( set( B(k,j ) ) );
7908  const SIMDType b2( set( B(k,j+1UL) ) );
7909  xmm1 += a1 * b1;
7910  xmm2 += a2 * b1;
7911  xmm3 += a3 * b1;
7912  xmm4 += a1 * b2;
7913  xmm5 += a2 * b2;
7914  xmm6 += a3 * b2;
7915  }
7916 
7917  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7918  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
7919  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7920  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
7921  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
7922  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
7923  }
7924 
7925  if( j < N )
7926  {
7927  const size_t kbegin( ( IsLower_v<MT5> )
7928  ?( ( IsUpper_v<MT4> )
7929  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7930  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7931  :( IsUpper_v<MT4> ? i : 0UL ) );
7932  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
7933 
7934  SIMDType xmm1, xmm2, xmm3;
7935 
7936  for( size_t k=kbegin; k<kend; ++k ) {
7937  const SIMDType b1( set( B(k,j) ) );
7938  xmm1 += A.load(i ,k) * b1;
7939  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7940  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7941  }
7942 
7943  C.store( i , j, C.load(i ,j) - xmm1 * factor );
7944  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
7945  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7946  }
7947  }
7948 
7949  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7950  {
7951  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
7952  size_t j( UPP ? i : 0UL );
7953 
7954  for( ; (j+4UL) <= jend; j+=4UL )
7955  {
7956  const size_t kbegin( ( IsLower_v<MT5> )
7957  ?( ( IsUpper_v<MT4> )
7958  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7959  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7960  :( IsUpper_v<MT4> ? i : 0UL ) );
7961  const size_t kend( ( IsUpper_v<MT5> )
7962  ?( ( IsLower_v<MT4> )
7963  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
7964  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
7965  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
7966 
7967  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7968 
7969  for( size_t k=kbegin; k<kend; ++k ) {
7970  const SIMDType a1( A.load(i ,k) );
7971  const SIMDType a2( A.load(i+SIMDSIZE,k) );
7972  const SIMDType b1( set( B(k,j ) ) );
7973  const SIMDType b2( set( B(k,j+1UL) ) );
7974  const SIMDType b3( set( B(k,j+2UL) ) );
7975  const SIMDType b4( set( B(k,j+3UL) ) );
7976  xmm1 += a1 * b1;
7977  xmm2 += a2 * b1;
7978  xmm3 += a1 * b2;
7979  xmm4 += a2 * b2;
7980  xmm5 += a1 * b3;
7981  xmm6 += a2 * b3;
7982  xmm7 += a1 * b4;
7983  xmm8 += a2 * b4;
7984  }
7985 
7986  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7987  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
7988  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
7989  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
7990  C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
7991  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
7992  C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
7993  C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
7994  }
7995 
7996  for( ; (j+3UL) <= jend; j+=3UL )
7997  {
7998  const size_t kbegin( ( IsLower_v<MT5> )
7999  ?( ( IsUpper_v<MT4> )
8000  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8001  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8002  :( IsUpper_v<MT4> ? i : 0UL ) );
8003  const size_t kend( ( IsUpper_v<MT5> )
8004  ?( ( IsLower_v<MT4> )
8005  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
8006  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
8007  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8008 
8009  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8010 
8011  for( size_t k=kbegin; k<kend; ++k ) {
8012  const SIMDType a1( A.load(i ,k) );
8013  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8014  const SIMDType b1( set( B(k,j ) ) );
8015  const SIMDType b2( set( B(k,j+1UL) ) );
8016  const SIMDType b3( set( B(k,j+2UL) ) );
8017  xmm1 += a1 * b1;
8018  xmm2 += a2 * b1;
8019  xmm3 += a1 * b2;
8020  xmm4 += a2 * b2;
8021  xmm5 += a1 * b3;
8022  xmm6 += a2 * b3;
8023  }
8024 
8025  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8026  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
8027  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8028  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
8029  C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
8030  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
8031  }
8032 
8033  for( ; (j+2UL) <= jend; j+=2UL )
8034  {
8035  const size_t kbegin( ( IsLower_v<MT5> )
8036  ?( ( IsUpper_v<MT4> )
8037  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8038  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8039  :( IsUpper_v<MT4> ? i : 0UL ) );
8040  const size_t kend( ( IsUpper_v<MT5> )
8041  ?( ( IsLower_v<MT4> )
8042  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8043  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8044  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8045 
8046  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8047  size_t k( kbegin );
8048 
8049  for( ; (k+2UL) <= kend; k+=2UL ) {
8050  const SIMDType a1( A.load(i ,k ) );
8051  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8052  const SIMDType a3( A.load(i ,k+1UL) );
8053  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8054  const SIMDType b1( set( B(k ,j ) ) );
8055  const SIMDType b2( set( B(k ,j+1UL) ) );
8056  const SIMDType b3( set( B(k+1UL,j ) ) );
8057  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
8058  xmm1 += a1 * b1;
8059  xmm2 += a2 * b1;
8060  xmm3 += a1 * b2;
8061  xmm4 += a2 * b2;
8062  xmm5 += a3 * b3;
8063  xmm6 += a4 * b3;
8064  xmm7 += a3 * b4;
8065  xmm8 += a4 * b4;
8066  }
8067 
8068  for( ; k<kend; ++k ) {
8069  const SIMDType a1( A.load(i ,k) );
8070  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8071  const SIMDType b1( set( B(k,j ) ) );
8072  const SIMDType b2( set( B(k,j+1UL) ) );
8073  xmm1 += a1 * b1;
8074  xmm2 += a2 * b1;
8075  xmm3 += a1 * b2;
8076  xmm4 += a2 * b2;
8077  }
8078 
8079  C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
8080  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
8081  C.store( i , j+1UL, C.load(i ,j+1UL) - (xmm3+xmm7) * factor );
8082  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
8083  }
8084 
8085  if( j < jend )
8086  {
8087  const size_t kbegin( ( IsLower_v<MT5> )
8088  ?( ( IsUpper_v<MT4> )
8089  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8090  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8091  :( IsUpper_v<MT4> ? i : 0UL ) );
8092  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
8093 
8094  SIMDType xmm1, xmm2, xmm3, xmm4;
8095  size_t k( kbegin );
8096 
8097  for( ; (k+2UL) <= kend; k+=2UL ) {
8098  const SIMDType b1( set( B(k ,j) ) );
8099  const SIMDType b2( set( B(k+1UL,j) ) );
8100  xmm1 += A.load(i ,k ) * b1;
8101  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8102  xmm3 += A.load(i ,k+1UL) * b2;
8103  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8104  }
8105 
8106  for( ; k<kend; ++k ) {
8107  const SIMDType b1( set( B(k,j) ) );
8108  xmm1 += A.load(i ,k) * b1;
8109  xmm2 += A.load(i+SIMDSIZE,k) * b1;
8110  }
8111 
8112  C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
8113  C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
8114  }
8115  }
8116 
8117  for( ; i<ipos; i+=SIMDSIZE )
8118  {
8119  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
8120  size_t j( UPP ? i : 0UL );
8121 
8122  for( ; (j+4UL) <= jend; j+=4UL )
8123  {
8124  const size_t kbegin( ( IsLower_v<MT5> )
8125  ?( ( IsUpper_v<MT4> )
8126  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8127  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8128  :( IsUpper_v<MT4> ? i : 0UL ) );
8129  const size_t kend( ( IsUpper_v<MT5> )
8130  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
8131  :( K ) );
8132 
8133  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8134  size_t k( kbegin );
8135 
8136  for( ; (k+2UL) <= kend; k+=2UL ) {
8137  const SIMDType a1( A.load(i,k ) );
8138  const SIMDType a2( A.load(i,k+1UL) );
8139  xmm1 += a1 * set( B(k ,j ) );
8140  xmm2 += a1 * set( B(k ,j+1UL) );
8141  xmm3 += a1 * set( B(k ,j+2UL) );
8142  xmm4 += a1 * set( B(k ,j+3UL) );
8143  xmm5 += a2 * set( B(k+1UL,j ) );
8144  xmm6 += a2 * set( B(k+1UL,j+1UL) );
8145  xmm7 += a2 * set( B(k+1UL,j+2UL) );
8146  xmm8 += a2 * set( B(k+1UL,j+3UL) );
8147  }
8148 
8149  for( ; k<kend; ++k ) {
8150  const SIMDType a1( A.load(i,k) );
8151  xmm1 += a1 * set( B(k,j ) );
8152  xmm2 += a1 * set( B(k,j+1UL) );
8153  xmm3 += a1 * set( B(k,j+2UL) );
8154  xmm4 += a1 * set( B(k,j+3UL) );
8155  }
8156 
8157  C.store( i, j , C.load(i,j ) - (xmm1+xmm5) * factor );
8158  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm6) * factor );
8159  C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm7) * factor );
8160  C.store( i, j+3UL, C.load(i,j+3UL) - (xmm4+xmm8) * factor );
8161  }
8162 
8163  for( ; (j+3UL) <= jend; j+=3UL )
8164  {
8165  const size_t kbegin( ( IsLower_v<MT5> )
8166  ?( ( IsUpper_v<MT4> )
8167  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8168  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8169  :( IsUpper_v<MT4> ? i : 0UL ) );
8170  const size_t kend( ( IsUpper_v<MT5> )
8171  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
8172  :( K ) );
8173 
8174  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8175  size_t k( kbegin );
8176 
8177  for( ; (k+2UL) <= kend; k+=2UL ) {
8178  const SIMDType a1( A.load(i,k ) );
8179  const SIMDType a2( A.load(i,k+1UL) );
8180  xmm1 += a1 * set( B(k ,j ) );
8181  xmm2 += a1 * set( B(k ,j+1UL) );
8182  xmm3 += a1 * set( B(k ,j+2UL) );
8183  xmm4 += a2 * set( B(k+1UL,j ) );
8184  xmm5 += a2 * set( B(k+1UL,j+1UL) );
8185  xmm6 += a2 * set( B(k+1UL,j+2UL) );
8186  }
8187 
8188  for( ; k<kend; ++k ) {
8189  const SIMDType a1( A.load(i,k) );
8190  xmm1 += a1 * set( B(k,j ) );
8191  xmm2 += a1 * set( B(k,j+1UL) );
8192  xmm3 += a1 * set( B(k,j+2UL) );
8193  }
8194 
8195  C.store( i, j , C.load(i,j ) - (xmm1+xmm4) * factor );
8196  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm5) * factor );
8197  C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm6) * factor );
8198  }
8199 
8200  for( ; (j+2UL) <= jend; j+=2UL )
8201  {
8202  const size_t kbegin( ( IsLower_v<MT5> )
8203  ?( ( IsUpper_v<MT4> )
8204  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8205  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8206  :( IsUpper_v<MT4> ? i : 0UL ) );
8207  const size_t kend( ( IsUpper_v<MT5> )
8208  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
8209  :( K ) );
8210 
8211  SIMDType xmm1, xmm2, xmm3, xmm4;
8212  size_t k( kbegin );
8213 
8214  for( ; (k+2UL) <= kend; k+=2UL ) {
8215  const SIMDType a1( A.load(i,k ) );
8216  const SIMDType a2( A.load(i,k+1UL) );
8217  xmm1 += a1 * set( B(k ,j ) );
8218  xmm2 += a1 * set( B(k ,j+1UL) );
8219  xmm3 += a2 * set( B(k+1UL,j ) );
8220  xmm4 += a2 * set( B(k+1UL,j+1UL) );
8221  }
8222 
8223  for( ; k<kend; ++k ) {
8224  const SIMDType a1( A.load(i,k) );
8225  xmm1 += a1 * set( B(k,j ) );
8226  xmm2 += a1 * set( B(k,j+1UL) );
8227  }
8228 
8229  C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
8230  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm4) * factor );
8231  }
8232 
8233  if( j < jend )
8234  {
8235  const size_t kbegin( ( IsLower_v<MT5> )
8236  ?( ( IsUpper_v<MT4> )
8237  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8238  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8239  :( IsUpper_v<MT4> ? i : 0UL ) );
8240 
8241  SIMDType xmm1, xmm2;
8242  size_t k( kbegin );
8243 
8244  for( ; (k+2UL) <= K; k+=2UL ) {
8245  xmm1 += A.load(i,k ) * set( B(k ,j) );
8246  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
8247  }
8248 
8249  for( ; k<K; ++k ) {
8250  xmm1 += A.load(i,k) * set( B(k,j) );
8251  }
8252 
8253  C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
8254  }
8255  }
8256 
8257  for( ; remainder && i<M; ++i )
8258  {
8259  const size_t jend( LOW ? i+1UL : N );
8260  size_t j( UPP ? i : 0UL );
8261 
8262  for( ; (j+2UL) <= jend; j+=2UL )
8263  {
8264  const size_t kbegin( ( IsLower_v<MT5> )
8265  ?( ( IsUpper_v<MT4> )
8266  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8267  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8268  :( IsUpper_v<MT4> ? i : 0UL ) );
8269  const size_t kend( ( IsUpper_v<MT5> )
8270  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
8271  :( K ) );
8272 
8273  ElementType value1{};
8274  ElementType value2{};
8275 
8276  for( size_t k=kbegin; k<kend; ++k ) {
8277  value1 += A(i,k) * B(k,j );
8278  value2 += A(i,k) * B(k,j+1UL);
8279  }
8280 
8281  C(i,j ) -= value1 * scalar;
8282  C(i,j+1UL) -= value2 * scalar;
8283  }
8284 
8285  if( j < jend )
8286  {
8287  const size_t kbegin( ( IsLower_v<MT5> )
8288  ?( ( IsUpper_v<MT4> )
8289  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8290  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8291  :( IsUpper_v<MT4> ? i : 0UL ) );
8292 
8293  ElementType value{};
8294 
8295  for( size_t k=kbegin; k<K; ++k ) {
8296  value += A(i,k) * B(k,j);
8297  }
8298 
8299  C(i,j) -= value * scalar;
8300  }
8301  }
8302  }
8303  //**********************************************************************************************
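 // Annotation: throughout the vectorized kernels, kbegin and kend restrict the inner product
 // to the index range that can contain non-zero contributions. kbegin skips the leading zeros
 // implied by a lower triangular B (MT5) and/or an upper triangular A (MT4), while kend cuts
 // off the trailing zeros implied by an upper triangular B and/or a lower triangular A, so
 // triangular operands are multiplied without touching their zero parts.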
8304 
8305  //**Default subtraction assignment to dense matrices (large matrices)***************************
8319  template< typename MT3 // Type of the left-hand side target matrix
8320  , typename MT4 // Type of the left-hand side matrix operand
8321  , typename MT5 // Type of the right-hand side matrix operand
8322  , typename ST2 > // Type of the scalar value
8323  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8324  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8325  {
8326  selectDefaultSubAssignKernel( C, A, B, scalar );
8327  }
8328  //**********************************************************************************************
8329 
8330  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
8345  template< typename MT3 // Type of the left-hand side target matrix
8346  , typename MT4 // Type of the left-hand side matrix operand
8347  , typename MT5 // Type of the right-hand side matrix operand
8348  , typename ST2 > // Type of the scalar value
8349  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8350  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8351  {
8352  if( LOW )
8353  lmmm( C, A, B, -scalar, ST2(1) );
8354  else if( UPP )
8355  ummm( C, A, B, -scalar, ST2(1) );
8356  else
8357  mmm( C, A, B, -scalar, ST2(1) );
8358  }
8359  //**********************************************************************************************
8360 
8361  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
8376  template< typename MT3 // Type of the left-hand side target matrix
8377  , typename MT4 // Type of the left-hand side matrix operand
8378  , typename MT5 // Type of the right-hand side matrix operand
8379  , typename ST2 > // Type of the scalar value
8380  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8381  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8382  {
8383  selectLargeSubAssignKernel( C, A, B, scalar );
8384  }
8385  //**********************************************************************************************
8386 
 8387  //**BLAS-based subtraction assignment to dense matrices******************************************
8388 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
8389 
8402  template< typename MT3 // Type of the left-hand side target matrix
8403  , typename MT4 // Type of the left-hand side matrix operand
8404  , typename MT5 // Type of the right-hand side matrix operand
8405  , typename ST2 > // Type of the scalar value
8406  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8407  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8408  {
8409  using ET = ElementType_t<MT3>;
8410 
8411  if( IsTriangular_v<MT4> ) {
8412  ResultType_t<MT3> tmp( serial( B ) );
8413  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
8414  subAssign( C, tmp );
8415  }
8416  else if( IsTriangular_v<MT5> ) {
8417  ResultType_t<MT3> tmp( serial( A ) );
8418  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
8419  subAssign( C, tmp );
8420  }
8421  else {
8422  gemm( C, A, B, ET(-scalar), ET(1) );
8423  }
8424  }
8425 #endif
8426  //**********************************************************************************************
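 // Annotation: with standard BLAS semantics gemm() computes C = alpha*A*B + beta*C, so
 // passing ET(-scalar) as alpha and ET(1) as beta realizes the subtraction assignment. For
 // triangular operands trmm() scales and multiplies the evaluated counterpart in place
 // (tmp = scalar*A*tmp or tmp = scalar*tmp*B) before the result is subtracted from C.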
8427 
8428  //**Restructuring subtraction assignment to row-major matrices**********************************
8442  template< typename MT > // Type of the target matrix
8443  friend inline auto subAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8444  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8445  {
8447 
8449 
8450  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8451  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8452 
8453  const ForwardFunctor fwd;
8454 
8455  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8456  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8457 
8458  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8459  subAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8460  else if( IsSymmetric_v<MT1> )
8461  subAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8462  else
8463  subAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8464  }
8465  //**********************************************************************************************
8466 
8467  //**Subtraction assignment to sparse matrices***************************************************
8468  // No special implementation for the subtraction assignment to sparse matrices.
8469  //**********************************************************************************************
8470 
8471  //**Schur product assignment to dense matrices**************************************************
8483  template< typename MT // Type of the target dense matrix
8484  , bool SO > // Storage order of the target dense matrix
8485  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8486  {
8488 
8492 
8493  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8494  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8495 
8496  const ResultType tmp( serial( rhs ) );
8497  schurAssign( ~lhs, tmp );
8498  }
8499  //**********************************************************************************************
8500 
8501  //**Schur product assignment to sparse matrices*************************************************
8502  // No special implementation for the Schur product assignment to sparse matrices.
8503  //**********************************************************************************************
8504 
8505  //**Multiplication assignment to dense matrices*************************************************
8506  // No special implementation for the multiplication assignment to dense matrices.
8507  //**********************************************************************************************
8508 
8509  //**Multiplication assignment to sparse matrices************************************************
8510  // No special implementation for the multiplication assignment to sparse matrices.
8511  //**********************************************************************************************
8512 
8513  //**SMP assignment to dense matrices************************************************************
8528  template< typename MT // Type of the target dense matrix
8529  , bool SO > // Storage order of the target dense matrix
8530  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8531  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8532  {
8534 
8535  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8536  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8537 
8538  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8539  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8540 
8541  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
8542  return;
8543  }
8544  else if( left.columns() == 0UL ) {
8545  reset( ~lhs );
8546  return;
8547  }
8548 
8549  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8550  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8551 
8552  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8553  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8554  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8555  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8556  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8557  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8558 
8559  smpAssign( ~lhs, A * B * rhs.scalar_ );
8560  }
8561  //**********************************************************************************************
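 // Annotation: this smpAssign() overload and the following SMP addition/subtraction variants
 // are selected by Blaze's SMP assignment framework. They merely evaluate the two operands
 // and forward the scaled product A * B * rhs.scalar_ to the SMP assignment of the resulting
 // expression, which performs the actual parallelization.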
8562 
8563  //**SMP assignment to sparse matrices***********************************************************
8578  template< typename MT // Type of the target sparse matrix
8579  , bool SO > // Storage order of the target sparse matrix
8580  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8581  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8582  {
8584 
8585  using TmpType = If_t< SO, ResultType, OppositeType >;
8586 
8593 
8594  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8595  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8596 
8597  const ForwardFunctor fwd;
8598 
8599  const TmpType tmp( rhs );
8600  smpAssign( ~lhs, fwd( tmp ) );
8601  }
8602  //**********************************************************************************************
8603 
8604  //**Restructuring SMP assignment to row-major matrices******************************************
8618  template< typename MT > // Type of the target matrix
8619  friend inline auto smpAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8620  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8621  {
8623 
8625 
8626  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8627  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8628 
8629  const ForwardFunctor fwd;
8630 
8631  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8632  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8633 
8634  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8635  smpAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8636  else if( IsSymmetric_v<MT1> )
8637  smpAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8638  else
8639  smpAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8640  }
8641  //**********************************************************************************************
8642 
8643  //**SMP addition assignment to dense matrices***************************************************
8658  template< typename MT // Type of the target dense matrix
8659  , bool SO > // Storage order of the target dense matrix
8660  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8661  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8662  {
8664 
8665  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8666  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8667 
8668  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8669  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8670 
8671  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8672  return;
8673  }
8674 
8675  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8676  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8677 
8678  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8679  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8680  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8681  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8682  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8683  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8684 
8685  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
8686  }
8687  //**********************************************************************************************
8688 
8689  //**Restructuring SMP addition assignment to row-major matrices*********************************
8704  template< typename MT > // Type of the target matrix
8705  friend inline auto smpAddAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8706  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8707  {
8709 
8711 
8712  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8713  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8714 
8715  const ForwardFunctor fwd;
8716 
8717  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8718  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8719 
8720  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8721  smpAddAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8722  else if( IsSymmetric_v<MT1> )
8723  smpAddAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8724  else
8725  smpAddAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8726  }
8727  //**********************************************************************************************
8728 
8729  //**SMP addition assignment to sparse matrices**************************************************
8730  // No special implementation for the SMP addition assignment to sparse matrices.
8731  //**********************************************************************************************
8732 
8733  //**SMP subtraction assignment to dense matrices************************************************
8748  template< typename MT // Type of the target dense matrix
8749  , bool SO > // Storage order of the target dense matrix
8750  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8751  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8752  {
8754 
8755  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8756  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8757 
8758  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8759  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8760 
8761  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8762  return;
8763  }
8764 
8765  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8766  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8767 
8768  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8769  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8770  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8771  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8772  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8773  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8774 
8775  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
8776  }
8777  //**********************************************************************************************
8778 
8779  //**Restructuring SMP subtraction assignment to row-major matrices******************************
8794  template< typename MT > // Type of the target matrix
8795  friend inline auto smpSubAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8796  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8797  {
8799 
8801 
8802  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8803  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8804 
8805  const ForwardFunctor fwd;
8806 
8807  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8808  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8809 
8810  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8811  smpSubAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8812  else if( IsSymmetric_v<MT1> )
8813  smpSubAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8814  else
8815  smpSubAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8816  }
8817  //**********************************************************************************************
8818 
8819  //**SMP subtraction assignment to sparse matrices***********************************************
8820  // No special implementation for the SMP subtraction assignment to sparse matrices.
8821  //**********************************************************************************************
8822 
8823  //**SMP Schur product assignment to dense matrices**********************************************
8835  template< typename MT // Type of the target dense matrix
8836  , bool SO > // Storage order of the target dense matrix
8837  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8838  {
8840 
8844 
8845  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8846  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8847 
8848  const ResultType tmp( rhs );
8849  smpSchurAssign( ~lhs, tmp );
8850  }
8851  //**********************************************************************************************
8852 
8853  //**SMP Schur product assignment to sparse matrices*********************************************
8854  // No special implementation for the SMP Schur product assignment to sparse matrices.
8855  //**********************************************************************************************
8856 
8857  //**SMP multiplication assignment to dense matrices*********************************************
8858  // No special implementation for the SMP multiplication assignment to dense matrices.
8859  //**********************************************************************************************
8860 
8861  //**SMP multiplication assignment to sparse matrices********************************************
8862  // No special implementation for the SMP multiplication assignment to sparse matrices.
8863  //**********************************************************************************************
8864 
8865  //**Compile time checks*************************************************************************
8874  //**********************************************************************************************
8875 };
8877 //*************************************************************************************************
8878 
8879 
8880 
8881 
8882 //=================================================================================================
8883 //
8884 // GLOBAL BINARY ARITHMETIC OPERATORS
8885 //
8886 //=================================================================================================
8887 
8888 //*************************************************************************************************
8915 template< typename MT1 // Type of the left-hand side dense matrix
8916  , typename MT2 > // Type of the right-hand side dense matrix
8917 inline decltype(auto)
8918  operator*( const DenseMatrix<MT1,true>& lhs, const DenseMatrix<MT2,true>& rhs )
8919 {
8921 
8922  if( (~lhs).columns() != (~rhs).rows() ) {
8923  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
8924  }
8925 
 8926  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,false,false,false,false>;
 8927  return ReturnType( ~lhs, ~rhs );
8928 }
8929 //*************************************************************************************************
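// Illustrative example (annotation; assumes <blaze/Math.h>): multiplying two column-major
// dense matrices creates a TDMatTDMatMultExpr, which is evaluated lazily on assignment.
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 50UL, 70UL ), B( 70UL, 30UL ), C;
//    // ... initialize A and B ...
//    C = A * B;  // kernel selection (small/large/BLAS) happens at this assignment
//
// A std::invalid_argument exception is thrown if the number of columns of the left-hand side
// matrix does not match the number of rows of the right-hand side matrix.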
8930 
8931 
8932 
8933 
8934 //=================================================================================================
8935 //
8936 // GLOBAL FUNCTIONS
8937 //
8938 //=================================================================================================
8939 
8940 //*************************************************************************************************
8963 template< typename MT1 // Type of the left-hand side dense matrix
8964  , typename MT2 // Type of the right-hand side dense matrix
8965  , bool SF // Symmetry flag
8966  , bool HF // Hermitian flag
8967  , bool LF // Lower flag
8968  , bool UF > // Upper flag
8969 inline decltype(auto) declsym( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8970 {
8972 
8973  if( !isSquare( dm ) ) {
8974  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
8975  }
8976 
8977  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
8978  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8979 }
8981 //*************************************************************************************************
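// Illustrative use of declsym() (annotation): only square products are accepted, otherwise a
// std::invalid_argument exception is thrown, and the caller is responsible for the product
// actually being symmetric.
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 10UL, 10UL ), B( 10UL, 10UL ), S;
//    // ... initialize A and B such that A*B is symmetric ...
//    S = declsym( A * B );  // evaluates the product with the symmetry flag (SF) set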
8982 
8983 
8984 //*************************************************************************************************
9007 template< typename MT1 // Type of the left-hand side dense matrix
9008  , typename MT2 // Type of the right-hand side dense matrix
9009  , bool SF // Symmetry flag
9010  , bool HF // Hermitian flag
9011  , bool LF // Lower flag
9012  , bool UF > // Upper flag
9013 inline decltype(auto) declherm( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9014 {
9016 
9017  if( !isSquare( dm ) ) {
9018  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
9019  }
9020 
9021  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9022  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9023 }
9025 //*************************************************************************************************
9026 
9027 
9028 //*************************************************************************************************
9051 template< typename MT1 // Type of the left-hand side dense matrix
9052  , typename MT2 // Type of the right-hand side dense matrix
9053  , bool SF // Symmetry flag
9054  , bool HF // Hermitian flag
9055  , bool LF // Lower flag
9056  , bool UF > // Upper flag
9057 inline decltype(auto) decllow( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9058 {
9060 
9061  if( !isSquare( dm ) ) {
9062  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9063  }
9064 
9065  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9066  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9067 }
9069 //*************************************************************************************************
9070 
9071 
9072 //*************************************************************************************************
9095 template< typename MT1 // Type of the left-hand side dense matrix
9096  , typename MT2 // Type of the right-hand side dense matrix
9097  , bool SF // Symmetry flag
9098  , bool HF // Hermitian flag
9099  , bool LF // Lower flag
9100  , bool UF > // Upper flag
9101 inline decltype(auto) declupp( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9102 {
9104 
9105  if( !isSquare( dm ) ) {
9106  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9107  }
9108 
9109  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9110  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9111 }
9113 //*************************************************************************************************
9114 
9115 
9116 //*************************************************************************************************
9139 template< typename MT1 // Type of the left-hand side dense matrix
9140  , typename MT2 // Type of the right-hand side dense matrix
9141  , bool SF // Symmetry flag
9142  , bool HF // Hermitian flag
9143  , bool LF // Lower flag
9144  , bool UF > // Upper flag
9145 inline decltype(auto) decldiag( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9146 {
9148 
9149  if( !isSquare( dm ) ) {
9150  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
9151  }
9152 
9153  using ReturnType = const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
9154  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9155 }
9157 //*************************************************************************************************
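The remaining decl functions follow the same pattern. A brief, hedged sketch (matrix names are illustrative; each declaration is a user assertion about the structure of the product):

   blaze::DynamicMatrix<double,blaze::columnMajor> L1( 5UL, 5UL ), L2( 5UL, 5UL );
   blaze::DynamicMatrix<double,blaze::columnMajor> U1( 5UL, 5UL ), U2( 5UL, 5UL );
   blaze::DynamicMatrix<double,blaze::columnMajor> D1( 5UL, 5UL ), D2( 5UL, 5UL );
   blaze::DynamicMatrix<double,blaze::columnMajor> X;
   // ... fill the operands with lower-/upper-/diagonal-structured data ...

   X = decllow ( L1 * L2 );  // the product of two lower matrices is lower
   X = declupp ( U1 * U2 );  // the product of two upper matrices is upper
   X = decldiag( D1 * D2 );  // the product of two diagonal matrices is diagonal

declherm() is used analogously for products known to be Hermitian; as with declsym(), a non-square operand triggers a std::invalid_argument exception.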
9158 
9159 
9160 
9161 
9162 //=================================================================================================
9163 //
9164 // SIZE SPECIALIZATIONS
9165 //
9166 //=================================================================================================
9167 
9168 //*************************************************************************************************
9170 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9171 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
9172  : public Size<MT1,0UL>
9173 {};
9174 
9175 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9176 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
9177  : public Size<MT2,1UL>
9178 {};
9180 //*************************************************************************************************
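A hypothetical illustration of the effect of these Size specializations, assuming blaze::StaticMatrix operands and the Size_v variable template of this Blaze version: the expression inherits the compile-time row count of its left operand and the compile-time column count of its right operand.

   blaze::StaticMatrix<double,3UL,4UL,blaze::columnMajor> A;
   blaze::StaticMatrix<double,4UL,2UL,blaze::columnMajor> B;

   using Expr = decltype( A * B );  // a TDMatTDMatMultExpr specialization

   static_assert( blaze::Size_v< Expr, 0UL > == 3L, "rows taken from MT1" );
   static_assert( blaze::Size_v< Expr, 1UL > == 2L, "columns taken from MT2" );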
9181 
9182 
9183 
9184 
9185 //=================================================================================================
9186 //
9187 // ISALIGNED SPECIALIZATIONS
9188 //
9189 //=================================================================================================
9190 
9191 //*************************************************************************************************
9193 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9194 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9195  : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
9196 {};
9198 //*************************************************************************************************
9199 
9200 } // namespace blaze
9201 
9202 #endif