Blaze  3.6
DMatDMatMultExpr.h
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/dense/MMM.h>
54 #include <blaze/math/Exception.h>
69 #include <blaze/math/shims/Reset.h>
71 #include <blaze/math/SIMD.h>
102 #include <blaze/math/views/Check.h>
103 #include <blaze/system/BLAS.h>
104 #include <blaze/system/Blocking.h>
105 #include <blaze/system/Debugging.h>
107 #include <blaze/system/Thresholds.h>
110 #include <blaze/util/Assert.h>
111 #include <blaze/util/Complex.h>
114 #include <blaze/util/DisableIf.h>
115 #include <blaze/util/EnableIf.h>
118 #include <blaze/util/mpl/If.h>
119 #include <blaze/util/Types.h>
128 
129 
130 namespace blaze {
131 
132 //=================================================================================================
133 //
134 // CLASS DMATDMATMULTEXPR
135 //
136 //=================================================================================================
137 
138 //*************************************************************************************************
145 template< typename MT1 // Type of the left-hand side dense matrix
146  , typename MT2 // Type of the right-hand side dense matrix
147  , bool SF // Symmetry flag
148  , bool HF // Hermitian flag
149  , bool LF // Lower flag
150  , bool UF > // Upper flag
151 class DMatDMatMultExpr
152  : public MatMatMultExpr< DenseMatrix< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
153  , private Computation
154 {
155  private:
156  //**Type definitions****************************************************************************
157  using RT1 = ResultType_t<MT1>;     //!< Result type of the left-hand side dense matrix expression.
158  using RT2 = ResultType_t<MT2>;     //!< Result type of the right-hand side dense matrix expression.
159  using ET1 = ElementType_t<RT1>;    //!< Element type of the left-hand side dense matrix expression.
160  using ET2 = ElementType_t<RT2>;    //!< Element type of the right-hand side dense matrix expression.
161  using CT1 = CompositeType_t<MT1>;  //!< Composite type of the left-hand side dense matrix expression.
162  using CT2 = CompositeType_t<MT2>;  //!< Composite type of the right-hand side dense matrix expression.
163  //**********************************************************************************************
164 
165  //**********************************************************************************************
167  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
168  //**********************************************************************************************
169 
170  //**********************************************************************************************
172  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
173  //**********************************************************************************************
174 
175  //**********************************************************************************************
176  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
177  static constexpr bool HERM = ( HF && !( LF || UF ) );
178  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
179  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
180  //**********************************************************************************************
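Editor's note: the four derived flags above interact; declaring a product both symmetric and upper, for instance, forces a lower-and-upper, i.e. diagonal, result. A minimal standalone sketch of just this boolean logic (hypothetical helper struct, not part of Blaze):

#include <iostream>

// Mirrors the SYM/HERM/LOW/UPP expressions from the class above.
template< bool SF, bool HF, bool LF, bool UF >
struct MultFlags {
   static constexpr bool SYM  = ( SF && !( HF || LF || UF ) );
   static constexpr bool HERM = ( HF && !( LF || UF ) );
   static constexpr bool LOW  = ( LF || ( ( SF || HF ) && UF ) );
   static constexpr bool UPP  = ( UF || ( ( SF || HF ) && LF ) );
};

int main()
{
   using F = MultFlags<true,false,false,true>;  // symmetry flag plus upper flag
   std::cout << F::SYM << F::HERM << F::LOW << F::UPP << '\n';  // prints 0011: LOW and UPP, i.e. diagonal
}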
181 
182  //**********************************************************************************************
184 
189  template< typename T1, typename T2, typename T3 >
190  static constexpr bool CanExploitSymmetry_v =
191  ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
193  //**********************************************************************************************
194 
195  //**********************************************************************************************
197 
201  template< typename T1, typename T2, typename T3 >
202  static constexpr bool IsEvaluationRequired_v =
203  ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
205  //**********************************************************************************************
206 
207  //**********************************************************************************************
209 
212  template< typename T1, typename T2, typename T3 >
213  static constexpr bool UseBlasKernel_v =
214  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
215  !SYM && !HERM && !LOW && !UPP &&
216  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
217  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
218  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
219  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
220  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
221  IsBLASCompatible_v< ElementType_t<T1> > &&
222  IsBLASCompatible_v< ElementType_t<T2> > &&
223  IsBLASCompatible_v< ElementType_t<T3> > &&
224  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
225  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
227  //**********************************************************************************************
228 
229  //**********************************************************************************************
231 
234  template< typename T1, typename T2, typename T3 >
235  static constexpr bool UseVectorizedDefaultKernel_v =
236  ( useOptimizedKernels &&
237  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
238  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
239  IsSIMDCombinable_v< ElementType_t<T1>
240  , ElementType_t<T2>
241  , ElementType_t<T3> > &&
242  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
243  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
245  //**********************************************************************************************
246 
247  //**********************************************************************************************
249 
252  using ForwardFunctor = If_t< HERM
253  , DeclHerm
254  , If_t< SYM
255  , DeclSym
256  , If_t< LOW
257  , If_t< UPP
258  , DeclDiag
259  , DeclLow >
260  , If_t< UPP
261  , DeclUpp
262  , Noop > > > >;
264  //**********************************************************************************************
265 
266  public:
267  //**Type definitions****************************************************************************
270 
273 
275  using ResultType = typename If_t< HERM
276  , DeclHermTrait< MultTrait_t<RT1,RT2> >
277  , If_t< SYM
278  , DeclSymTrait< MultTrait_t<RT1,RT2> >
279  , If_t< LOW
280  , If_t< UPP
281  , DeclDiagTrait< MultTrait_t<RT1,RT2> >
282  , DeclLowTrait< MultTrait_t<RT1,RT2> > >
283  , If_t< UPP
284  , DeclUppTrait< MultTrait_t<RT1,RT2> >
285  , MultTrait<RT1,RT2> > > > >::Type;
286 
287  using OppositeType  = OppositeType_t<ResultType>;   //!< Result type with opposite storage order.
288  using TransposeType = TransposeType_t<ResultType>;  //!< Transpose type for expression template evaluations.
289  using ElementType   = ElementType_t<ResultType>;    //!< Resulting element type.
290  using SIMDType      = SIMDTrait_t<ElementType>;     //!< Resulting SIMD element type.
291  using ReturnType = const ElementType;
292  using CompositeType = const ResultType;
293 
295  using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
296 
298  using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
299 
300  //! Type for the assignment of the left-hand side dense matrix operand.
301  using LT = If_t< evaluateLeft, const RT1, CT1 >;
302 
303  //! Type for the assignment of the right-hand side dense matrix operand.
304  using RT = If_t< evaluateRight, const RT2, CT2 >;
305  //**********************************************************************************************
306 
307  //**Compilation flags***************************************************************************
309  static constexpr bool simdEnabled =
310  ( !IsDiagonal_v<MT2> &&
311  MT1::simdEnabled && MT2::simdEnabled &&
312  HasSIMDAdd_v<ET1,ET2> &&
313  HasSIMDMult_v<ET1,ET2> );
314 
316  static constexpr bool smpAssignable =
317  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
318  //**********************************************************************************************
319 
320  //**SIMD properties*****************************************************************************
322  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
323  //**********************************************************************************************
324 
325  //**Constructor*********************************************************************************
331  explicit inline DMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
332  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
333  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
334  {
335  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
336  }
337  //**********************************************************************************************
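Editor's note: the expression class is normally created implicitly by operator* on two row-major dense matrices and only evaluated on assignment. A minimal usage sketch relying solely on the public Blaze API:

#include <blaze/Math.h>
#include <iostream>

int main()
{
   blaze::DynamicMatrix<double> A{ { 1.0, 2.0 }, { 3.0, 4.0 } };
   blaze::DynamicMatrix<double> B{ { 5.0, 6.0 }, { 7.0, 8.0 } };

   // A * B yields a DMatDMatMultExpr; no work happens until the assignment
   // below, which triggers the assign() machinery further down in this file.
   blaze::DynamicMatrix<double> C = A * B;

   std::cout << C << '\n';  // prints the 2x2 result ( 19 22 ) ( 43 50 )
}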
338 
339  //**Access operator*****************************************************************************
346  inline ReturnType operator()( size_t i, size_t j ) const {
347  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
348  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
349 
350  if( IsDiagonal_v<MT1> ) {
351  return lhs_(i,i) * rhs_(i,j);
352  }
353  else if( IsDiagonal_v<MT2> ) {
354  return lhs_(i,j) * rhs_(j,j);
355  }
356  else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
357  const size_t begin( ( IsUpper_v<MT1> )
358  ?( ( IsLower_v<MT2> )
359  ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
360  , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
361  :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
362  :( ( IsLower_v<MT2> )
363  ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
364  :( 0UL ) ) );
365  const size_t end( ( IsLower_v<MT1> )
366  ?( ( IsUpper_v<MT2> )
367  ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
368  , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
369  :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
370  :( ( IsUpper_v<MT2> )
371  ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
372  :( lhs_.columns() ) ) );
373 
374  if( begin >= end ) return ElementType();
375 
376  const size_t n( end - begin );
377 
378  return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
379  subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
380  }
381  else {
382  return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
383  }
384  }
385  //**********************************************************************************************
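Editor's note: the access operator above shrinks the dot product to the structurally non-zero range when an operand is triangular. A scalar sketch of the same index clipping for the concrete case "A strictly upper triangular, B general" (hypothetical free function, standard library only):

#include <cstddef>
#include <vector>

// Single-element computation of (A*B)(i,j) for a strictly upper triangular A:
// only k in [i+1, K) can contribute, mirroring the 'begin' computation above.
double multElement( const std::vector<std::vector<double>>& A,
                    const std::vector<std::vector<double>>& B,
                    std::size_t i, std::size_t j )
{
   const std::size_t K = A[i].size();  // A is assumed to be M x K
   double sum = 0.0;
   for( std::size_t k = i+1UL; k < K; ++k )  // begin = i+1, end = K
      sum += A[i][k] * B[k][j];
   return sum;
}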
386 
387  //**At function*********************************************************************************
395  inline ReturnType at( size_t i, size_t j ) const {
396  if( i >= lhs_.rows() ) {
397  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
398  }
399  if( j >= rhs_.columns() ) {
400  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
401  }
402  return (*this)(i,j);
403  }
404  //**********************************************************************************************
405 
406  //**Rows function*******************************************************************************
411  inline size_t rows() const noexcept {
412  return lhs_.rows();
413  }
414  //**********************************************************************************************
415 
416  //**Columns function****************************************************************************
421  inline size_t columns() const noexcept {
422  return rhs_.columns();
423  }
424  //**********************************************************************************************
425 
426  //**Left operand access*************************************************************************
431  inline LeftOperand leftOperand() const noexcept {
432  return lhs_;
433  }
434  //**********************************************************************************************
435 
436  //**Right operand access************************************************************************
441  inline RightOperand rightOperand() const noexcept {
442  return rhs_;
443  }
444  //**********************************************************************************************
445 
446  //**********************************************************************************************
452  template< typename T >
453  inline bool canAlias( const T* alias ) const noexcept {
454  return ( lhs_.canAlias( alias ) || rhs_.canAlias( alias ) );
455  }
456  //**********************************************************************************************
457 
458  //**********************************************************************************************
464  template< typename T >
465  inline bool isAliased( const T* alias ) const noexcept {
466  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
467  }
468  //**********************************************************************************************
469 
470  //**********************************************************************************************
475  inline bool isAligned() const noexcept {
476  return lhs_.isAligned() && rhs_.isAligned();
477  }
478  //**********************************************************************************************
479 
480  //**********************************************************************************************
485  inline bool canSMPAssign() const noexcept {
486  return ( !BLAZE_BLAS_MODE ||
487  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
488  !BLAZE_BLAS_IS_PARALLEL ||
489  ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
490  ( rows() * columns() >= SMP_DMATDMATMULT_THRESHOLD ) &&
491  !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
492  }
493  //**********************************************************************************************
494 
495  private:
496  //**Member variables****************************************************************************
497  LeftOperand  lhs_;  //!< Left-hand side dense matrix of the multiplication expression.
498  RightOperand rhs_;  //!< Right-hand side dense matrix of the multiplication expression.
499  //**********************************************************************************************
500 
501  //**Assignment to dense matrices****************************************************************
514  template< typename MT // Type of the target dense matrix
515  , bool SO > // Storage order of the target dense matrix
516  friend inline auto assign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
517  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
518  {
520 
521  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
522  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
523 
524  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
525  return;
526  }
527  else if( rhs.lhs_.columns() == 0UL ) {
528  reset( ~lhs );
529  return;
530  }
531 
532  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
533  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
534 
535  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
536  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
537  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
538  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
539  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
540  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
541 
542  DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
543  }
545  //**********************************************************************************************
546 
547  //**Assignment to dense matrices (kernel selection)*********************************************
558  template< typename MT3 // Type of the left-hand side target matrix
559  , typename MT4 // Type of the left-hand side matrix operand
560  , typename MT5 > // Type of the right-hand side matrix operand
561  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
562  {
563  if( ( IsDiagonal_v<MT5> ) ||
564  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
565  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
566  selectSmallAssignKernel( C, A, B );
567  else
568  selectBlasAssignKernel( C, A, B );
569  }
571  //**********************************************************************************************
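Editor's note: the dispatch above prefers the small (SIMD) kernel for diagonal right-hand sides, narrow B, or products below the threshold, and otherwise falls through to the BLAS/large path. A scalar sketch of the same decision, ignoring the debug-mode switch and with made-up constants standing in for SIMDSIZE and DMATDMATMULT_THRESHOLD:

#include <cstddef>

enum class Kernel { Small, Blas };

constexpr std::size_t kSimdSize  = 8UL;     // stand-in for SIMDSIZE
constexpr std::size_t kThreshold = 3025UL;  // stand-in for DMATDMATMULT_THRESHOLD

Kernel selectAssignKernelSketch( std::size_t rows, std::size_t columns,
                                 std::size_t bColumns, bool bIsDiagonal )
{
   if( bIsDiagonal ||
       bColumns <= kSimdSize*10UL ||
       rows * columns < kThreshold )
      return Kernel::Small;
   return Kernel::Blas;
}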
572 
573  //**Default assignment to dense matrices (general/general)**************************************
587  template< typename MT3 // Type of the left-hand side target matrix
588  , typename MT4 // Type of the left-hand side matrix operand
589  , typename MT5 > // Type of the right-hand side matrix operand
590  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
591  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
592  {
593  const size_t M( A.rows() );
594  const size_t N( B.columns() );
595  const size_t K( A.columns() );
596 
597  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
598 
599  for( size_t i=0UL; i<M; ++i )
600  {
601  const size_t kbegin( ( IsUpper_v<MT4> )
602  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
603  :( 0UL ) );
604  const size_t kend( ( IsLower_v<MT4> )
605  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
606  :( K ) );
607  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
608 
609  if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
610  for( size_t j=0UL; j<N; ++j ) {
611  reset( C(i,j) );
612  }
613  continue;
614  }
615 
616  {
617  const size_t jbegin( ( IsUpper_v<MT5> )
618  ?( ( IsStrictlyUpper_v<MT5> )
619  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
620  :( UPP ? max(i,kbegin) : kbegin ) )
621  :( UPP ? i : 0UL ) );
622  const size_t jend( ( IsLower_v<MT5> )
623  ?( ( IsStrictlyLower_v<MT5> )
624  ?( LOW ? min(i+1UL,kbegin) : kbegin )
625  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
626  :( LOW ? i+1UL : N ) );
627 
628  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
629  for( size_t j=0UL; j<jbegin; ++j ) {
630  reset( C(i,j) );
631  }
632  }
633  else if( IsStrictlyUpper_v<MT5> ) {
634  reset( C(i,0UL) );
635  }
636  for( size_t j=jbegin; j<jend; ++j ) {
637  C(i,j) = A(i,kbegin) * B(kbegin,j);
638  }
639  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
640  for( size_t j=jend; j<N; ++j ) {
641  reset( C(i,j) );
642  }
643  }
644  else if( IsStrictlyLower_v<MT5> ) {
645  reset( C(i,N-1UL) );
646  }
647  }
648 
649  for( size_t k=kbegin+1UL; k<kend; ++k )
650  {
651  const size_t jbegin( ( IsUpper_v<MT5> )
652  ?( ( IsStrictlyUpper_v<MT5> )
653  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
654  :( SYM || HERM || UPP ? max( i, k ) : k ) )
655  :( SYM || HERM || UPP ? i : 0UL ) );
656  const size_t jend( ( IsLower_v<MT5> )
657  ?( ( IsStrictlyLower_v<MT5> )
658  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
659  :( LOW ? min(i+1UL,k) : k ) )
660  :( LOW ? i+1UL : N ) );
661 
662  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
663  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
664 
665  for( size_t j=jbegin; j<jend; ++j ) {
666  C(i,j) += A(i,k) * B(k,j);
667  }
668  if( IsLower_v<MT5> ) {
669  C(i,jend) = A(i,k) * B(k,jend);
670  }
671  }
672  }
673 
674  if( SYM || HERM ) {
675  for( size_t i=1UL; i<M; ++i ) {
676  for( size_t j=0UL; j<i; ++j ) {
677  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
678  }
679  }
680  }
681  }
683  //**********************************************************************************************
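Editor's note: stripped of the triangular bookkeeping and the symmetry fix-up, the kernel above is an i-k-j ordered multiplication that initializes each row of C with the k = kbegin term and accumulates the remaining k values. A plain scalar equivalent for fully general operands (assumes K >= 1 and consistently sized containers):

#include <cstddef>
#include <vector>

using Matrix = std::vector<std::vector<double>>;

// C = A * B with the same i-k-j loop order as selectDefaultAssignKernel.
void multiply( Matrix& C, const Matrix& A, const Matrix& B )
{
   const std::size_t M = A.size();
   const std::size_t K = B.size();
   const std::size_t N = B[0].size();

   for( std::size_t i = 0; i < M; ++i ) {
      for( std::size_t j = 0; j < N; ++j )   // k == 0 initializes row i of C
         C[i][j] = A[i][0] * B[0][j];
      for( std::size_t k = 1; k < K; ++k )   // remaining k values accumulate
         for( std::size_t j = 0; j < N; ++j )
            C[i][j] += A[i][k] * B[k][j];
   }
}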
684 
685  //**Default assignment to dense matrices (general/diagonal)*************************************
699  template< typename MT3 // Type of the left-hand side target matrix
700  , typename MT4 // Type of the left-hand side matrix operand
701  , typename MT5 > // Type of the right-hand side matrix operand
702  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
703  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
704  {
706 
707  const size_t M( A.rows() );
708  const size_t N( B.columns() );
709 
710  for( size_t i=0UL; i<M; ++i )
711  {
712  const size_t jbegin( ( IsUpper_v<MT4> )
713  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
714  :( 0UL ) );
715  const size_t jend( ( IsLower_v<MT4> )
716  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
717  :( N ) );
718  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
719 
720  if( IsUpper_v<MT4> ) {
721  for( size_t j=0UL; j<jbegin; ++j ) {
722  reset( C(i,j) );
723  }
724  }
725  for( size_t j=jbegin; j<jend; ++j ) {
726  C(i,j) = A(i,j) * B(j,j);
727  }
728  if( IsLower_v<MT4> ) {
729  for( size_t j=jend; j<N; ++j ) {
730  reset( C(i,j) );
731  }
732  }
733  }
734  }
736  //**********************************************************************************************
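Editor's note: when B is diagonal the kernel above reduces to scaling column j of A by B(j,j). A scalar sketch of that special case:

#include <cstddef>
#include <vector>

// C = A * diag(d): column j of C is column j of A scaled by d[j].
void multiplyDiagonalRhs( std::vector<std::vector<double>>& C,
                          const std::vector<std::vector<double>>& A,
                          const std::vector<double>& d )
{
   for( std::size_t i = 0; i < A.size(); ++i )
      for( std::size_t j = 0; j < d.size(); ++j )
         C[i][j] = A[i][j] * d[j];
}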
737 
738  //**Default assignment to dense matrices (diagonal/general)*************************************
752  template< typename MT3 // Type of the left-hand side target matrix
753  , typename MT4 // Type of the left-hand side matrix operand
754  , typename MT5 > // Type of the right-hand side matrix operand
755  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
756  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
757  {
759 
760  const size_t M( A.rows() );
761  const size_t N( B.columns() );
762 
763  for( size_t i=0UL; i<M; ++i )
764  {
765  const size_t jbegin( ( IsUpper_v<MT5> )
766  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
767  :( 0UL ) );
768  const size_t jend( ( IsLower_v<MT5> )
769  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
770  :( N ) );
771  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
772 
773  if( IsUpper_v<MT5> ) {
774  for( size_t j=0UL; j<jbegin; ++j ) {
775  reset( C(i,j) );
776  }
777  }
778  for( size_t j=jbegin; j<jend; ++j ) {
779  C(i,j) = A(i,i) * B(i,j);
780  }
781  if( IsLower_v<MT5> ) {
782  for( size_t j=jend; j<N; ++j ) {
783  reset( C(i,j) );
784  }
785  }
786  }
787  }
789  //**********************************************************************************************
790 
791  //**Default assignment to dense matrices (diagonal/diagonal)************************************
805  template< typename MT3 // Type of the left-hand side target matrix
806  , typename MT4 // Type of the left-hand side matrix operand
807  , typename MT5 > // Type of the right-hand side matrix operand
808  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
809  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
810  {
812 
813  reset( C );
814 
815  for( size_t i=0UL; i<A.rows(); ++i ) {
816  C(i,i) = A(i,i) * B(i,i);
817  }
818  }
820  //**********************************************************************************************
821 
822  //**Default assignment to dense matrices (small matrices)***************************************
835  template< typename MT3 // Type of the left-hand side target matrix
836  , typename MT4 // Type of the left-hand side matrix operand
837  , typename MT5 > // Type of the right-hand side matrix operand
838  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
839  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
840  {
841  selectDefaultAssignKernel( C, A, B );
842  }
844  //**********************************************************************************************
845 
846  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
861  template< typename MT3 // Type of the left-hand side target matrix
862  , typename MT4 // Type of the left-hand side matrix operand
863  , typename MT5 > // Type of the right-hand side matrix operand
864  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
865  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
866  {
867  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
868 
869  const size_t M( A.rows() );
870  const size_t N( B.columns() );
871  const size_t K( A.columns() );
872 
873  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
874 
875  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
876  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
877 
878  size_t j( 0UL );
879 
880  if( IsIntegral_v<ElementType> )
881  {
882  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
883  for( size_t i=0UL; i<M; ++i )
884  {
885  const size_t kbegin( ( IsUpper_v<MT4> )
886  ?( ( IsLower_v<MT5> )
887  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
888  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
889  :( IsLower_v<MT5> ? j : 0UL ) );
890  const size_t kend( ( IsLower_v<MT4> )
891  ?( ( IsUpper_v<MT5> )
892  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
893  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
894  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
895 
896  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
897 
898  for( size_t k=kbegin; k<kend; ++k ) {
899  const SIMDType a1( set( A(i,k) ) );
900  xmm1 += a1 * B.load(k,j );
901  xmm2 += a1 * B.load(k,j+SIMDSIZE );
902  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
903  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
904  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
905  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
906  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
907  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
908  }
909 
910  C.store( i, j , xmm1 );
911  C.store( i, j+SIMDSIZE , xmm2 );
912  C.store( i, j+SIMDSIZE*2UL, xmm3 );
913  C.store( i, j+SIMDSIZE*3UL, xmm4 );
914  C.store( i, j+SIMDSIZE*4UL, xmm5 );
915  C.store( i, j+SIMDSIZE*5UL, xmm6 );
916  C.store( i, j+SIMDSIZE*6UL, xmm7 );
917  C.store( i, j+SIMDSIZE*7UL, xmm8 );
918  }
919  }
920  }
921 
922  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
923  {
924  size_t i( 0UL );
925 
926  for( ; (i+2UL) <= M; i+=2UL )
927  {
928  const size_t kbegin( ( IsUpper_v<MT4> )
929  ?( ( IsLower_v<MT5> )
930  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
931  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
932  :( IsLower_v<MT5> ? j : 0UL ) );
933  const size_t kend( ( IsLower_v<MT4> )
934  ?( ( IsUpper_v<MT5> )
935  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
936  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
937  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
938 
939  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
940 
941  for( size_t k=kbegin; k<kend; ++k ) {
942  const SIMDType a1( set( A(i ,k) ) );
943  const SIMDType a2( set( A(i+1UL,k) ) );
944  const SIMDType b1( B.load(k,j ) );
945  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
946  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
947  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
948  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
949  xmm1 += a1 * b1;
950  xmm2 += a1 * b2;
951  xmm3 += a1 * b3;
952  xmm4 += a1 * b4;
953  xmm5 += a1 * b5;
954  xmm6 += a2 * b1;
955  xmm7 += a2 * b2;
956  xmm8 += a2 * b3;
957  xmm9 += a2 * b4;
958  xmm10 += a2 * b5;
959  }
960 
961  C.store( i , j , xmm1 );
962  C.store( i , j+SIMDSIZE , xmm2 );
963  C.store( i , j+SIMDSIZE*2UL, xmm3 );
964  C.store( i , j+SIMDSIZE*3UL, xmm4 );
965  C.store( i , j+SIMDSIZE*4UL, xmm5 );
966  C.store( i+1UL, j , xmm6 );
967  C.store( i+1UL, j+SIMDSIZE , xmm7 );
968  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
969  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
970  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
971  }
972 
973  if( i < M )
974  {
975  const size_t kbegin( ( IsUpper_v<MT4> )
976  ?( ( IsLower_v<MT5> )
977  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
978  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
979  :( IsLower_v<MT5> ? j : 0UL ) );
980  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
981 
982  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
983 
984  for( size_t k=kbegin; k<kend; ++k ) {
985  const SIMDType a1( set( A(i,k) ) );
986  xmm1 += a1 * B.load(k,j );
987  xmm2 += a1 * B.load(k,j+SIMDSIZE );
988  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
989  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
990  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
991  }
992 
993  C.store( i, j , xmm1 );
994  C.store( i, j+SIMDSIZE , xmm2 );
995  C.store( i, j+SIMDSIZE*2UL, xmm3 );
996  C.store( i, j+SIMDSIZE*3UL, xmm4 );
997  C.store( i, j+SIMDSIZE*4UL, xmm5 );
998  }
999  }
1000 
1001  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1002  {
1003  const size_t iend( UPP ? min(j+SIMDSIZE*4UL,M) : M );
1004  size_t i( 0UL );
1005 
1006  if( SYM || HERM ) {
1007  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1008  for( ; i<j; ++i ) {
1009  for( size_t jj=j; jj<jjend; ++jj ) {
1010  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1011  }
1012  }
1013  }
1014  else if( LOW ) {
1015  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1016  for( ; i<j; ++i ) {
1017  for( size_t jj=j; jj<jjend; ++jj ) {
1018  reset( C(i,jj) );
1019  }
1020  }
1021  }
1022 
1023  for( ; (i+2UL) <= iend; i+=2UL )
1024  {
1025  const size_t kbegin( ( IsUpper_v<MT4> )
1026  ?( ( IsLower_v<MT5> )
1027  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1028  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1029  :( IsLower_v<MT5> ? j : 0UL ) );
1030  const size_t kend( ( IsLower_v<MT4> )
1031  ?( ( IsUpper_v<MT5> )
1032  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1033  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1034  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
1035 
1036  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1037 
1038  for( size_t k=kbegin; k<kend; ++k ) {
1039  const SIMDType a1( set( A(i ,k) ) );
1040  const SIMDType a2( set( A(i+1UL,k) ) );
1041  const SIMDType b1( B.load(k,j ) );
1042  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1043  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1044  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1045  xmm1 += a1 * b1;
1046  xmm2 += a1 * b2;
1047  xmm3 += a1 * b3;
1048  xmm4 += a1 * b4;
1049  xmm5 += a2 * b1;
1050  xmm6 += a2 * b2;
1051  xmm7 += a2 * b3;
1052  xmm8 += a2 * b4;
1053  }
1054 
1055  C.store( i , j , xmm1 );
1056  C.store( i , j+SIMDSIZE , xmm2 );
1057  C.store( i , j+SIMDSIZE*2UL, xmm3 );
1058  C.store( i , j+SIMDSIZE*3UL, xmm4 );
1059  C.store( i+1UL, j , xmm5 );
1060  C.store( i+1UL, j+SIMDSIZE , xmm6 );
1061  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1062  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1063  }
1064 
1065  if( i < iend )
1066  {
1067  const size_t kbegin( ( IsUpper_v<MT4> )
1068  ?( ( IsLower_v<MT5> )
1069  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1070  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1071  :( IsLower_v<MT5> ? j : 0UL ) );
1072  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1073 
1074  SIMDType xmm1, xmm2, xmm3, xmm4;
1075 
1076  for( size_t k=kbegin; k<kend; ++k ) {
1077  const SIMDType a1( set( A(i,k) ) );
1078  xmm1 += a1 * B.load(k,j );
1079  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1080  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1081  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1082  }
1083 
1084  C.store( i, j , xmm1 );
1085  C.store( i, j+SIMDSIZE , xmm2 );
1086  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1087  C.store( i, j+SIMDSIZE*3UL, xmm4 );
1088 
1089  if( UPP ) ++i;
1090  }
1091 
1092  if( UPP ) {
1093  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1094  for( ; i<M; ++i ) {
1095  for( size_t jj=j; jj<jjend; ++jj ) {
1096  reset( C(i,jj) );
1097  }
1098  }
1099  }
1100  }
1101 
1102  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1103  {
1104  const size_t iend( UPP ? min(j+SIMDSIZE*3UL,M) : M );
1105  size_t i( 0UL );
1106 
1107  if( SYM || HERM ) {
1108  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1109  for( ; i<j; ++i ) {
1110  for( size_t jj=j; jj<jjend; ++jj ) {
1111  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1112  }
1113  }
1114  }
1115  else if( LOW ) {
1116  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1117  for( ; i<j; ++i ) {
1118  for( size_t jj=j; jj<jjend; ++jj ) {
1119  reset( C(i,jj) );
1120  }
1121  }
1122  }
1123 
1124  for( ; (i+2UL) <= iend; i+=2UL )
1125  {
1126  const size_t kbegin( ( IsUpper_v<MT4> )
1127  ?( ( IsLower_v<MT5> )
1128  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1129  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1130  :( IsLower_v<MT5> ? j : 0UL ) );
1131  const size_t kend( ( IsLower_v<MT4> )
1132  ?( ( IsUpper_v<MT5> )
1133  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
1134  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1135  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
1136 
1137  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1138 
1139  for( size_t k=kbegin; k<kend; ++k ) {
1140  const SIMDType a1( set( A(i ,k) ) );
1141  const SIMDType a2( set( A(i+1UL,k) ) );
1142  const SIMDType b1( B.load(k,j ) );
1143  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1144  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1145  xmm1 += a1 * b1;
1146  xmm2 += a1 * b2;
1147  xmm3 += a1 * b3;
1148  xmm4 += a2 * b1;
1149  xmm5 += a2 * b2;
1150  xmm6 += a2 * b3;
1151  }
1152 
1153  C.store( i , j , xmm1 );
1154  C.store( i , j+SIMDSIZE , xmm2 );
1155  C.store( i , j+SIMDSIZE*2UL, xmm3 );
1156  C.store( i+1UL, j , xmm4 );
1157  C.store( i+1UL, j+SIMDSIZE , xmm5 );
1158  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1159  }
1160 
1161  if( i < iend )
1162  {
1163  const size_t kbegin( ( IsUpper_v<MT4> )
1164  ?( ( IsLower_v<MT5> )
1165  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1166  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1167  :( IsLower_v<MT5> ? j : 0UL ) );
1168  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
1169 
1170  SIMDType xmm1, xmm2, xmm3;
1171 
1172  for( size_t k=kbegin; k<kend; ++k ) {
1173  const SIMDType a1( set( A(i,k) ) );
1174  xmm1 += a1 * B.load(k,j );
1175  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1176  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1177  }
1178 
1179  C.store( i, j , xmm1 );
1180  C.store( i, j+SIMDSIZE , xmm2 );
1181  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1182 
1183  if( UPP ) ++i;
1184  }
1185 
1186  if( UPP ) {
1187  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1188  for( ; i<M; ++i ) {
1189  for( size_t jj=j; jj<jjend; ++jj ) {
1190  reset( C(i,jj) );
1191  }
1192  }
1193  }
1194  }
1195 
1196  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1197  {
1198  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
1199  size_t i( 0UL );
1200 
1201  if( SYM || HERM ) {
1202  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1203  for( ; i<j; ++i ) {
1204  for( size_t jj=j; jj<jjend; ++jj ) {
1205  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1206  }
1207  }
1208  }
1209  else if( LOW ) {
1210  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1211  for( ; i<j; ++i ) {
1212  for( size_t jj=j; jj<jjend; ++jj ) {
1213  reset( C(i,jj) );
1214  }
1215  }
1216  }
1217 
1218  for( ; (i+4UL) <= iend; i+=4UL )
1219  {
1220  const size_t kbegin( ( IsUpper_v<MT4> )
1221  ?( ( IsLower_v<MT5> )
1222  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1223  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1224  :( IsLower_v<MT5> ? j : 0UL ) );
1225  const size_t kend( ( IsLower_v<MT4> )
1226  ?( ( IsUpper_v<MT5> )
1227  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
1228  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1229  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1230 
1231  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1232 
1233  for( size_t k=kbegin; k<kend; ++k ) {
1234  const SIMDType a1( set( A(i ,k) ) );
1235  const SIMDType a2( set( A(i+1UL,k) ) );
1236  const SIMDType a3( set( A(i+2UL,k) ) );
1237  const SIMDType a4( set( A(i+3UL,k) ) );
1238  const SIMDType b1( B.load(k,j ) );
1239  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1240  xmm1 += a1 * b1;
1241  xmm2 += a1 * b2;
1242  xmm3 += a2 * b1;
1243  xmm4 += a2 * b2;
1244  xmm5 += a3 * b1;
1245  xmm6 += a3 * b2;
1246  xmm7 += a4 * b1;
1247  xmm8 += a4 * b2;
1248  }
1249 
1250  C.store( i , j , xmm1 );
1251  C.store( i , j+SIMDSIZE, xmm2 );
1252  C.store( i+1UL, j , xmm3 );
1253  C.store( i+1UL, j+SIMDSIZE, xmm4 );
1254  C.store( i+2UL, j , xmm5 );
1255  C.store( i+2UL, j+SIMDSIZE, xmm6 );
1256  C.store( i+3UL, j , xmm7 );
1257  C.store( i+3UL, j+SIMDSIZE, xmm8 );
1258  }
1259 
1260  for( ; (i+3UL) <= iend; i+=3UL )
1261  {
1262  const size_t kbegin( ( IsUpper_v<MT4> )
1263  ?( ( IsLower_v<MT5> )
1264  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1265  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1266  :( IsLower_v<MT5> ? j : 0UL ) );
1267  const size_t kend( ( IsLower_v<MT4> )
1268  ?( ( IsUpper_v<MT5> )
1269  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
1270  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1271  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1272 
1273  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1274 
1275  for( size_t k=kbegin; k<kend; ++k ) {
1276  const SIMDType a1( set( A(i ,k) ) );
1277  const SIMDType a2( set( A(i+1UL,k) ) );
1278  const SIMDType a3( set( A(i+2UL,k) ) );
1279  const SIMDType b1( B.load(k,j ) );
1280  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1281  xmm1 += a1 * b1;
1282  xmm2 += a1 * b2;
1283  xmm3 += a2 * b1;
1284  xmm4 += a2 * b2;
1285  xmm5 += a3 * b1;
1286  xmm6 += a3 * b2;
1287  }
1288 
1289  C.store( i , j , xmm1 );
1290  C.store( i , j+SIMDSIZE, xmm2 );
1291  C.store( i+1UL, j , xmm3 );
1292  C.store( i+1UL, j+SIMDSIZE, xmm4 );
1293  C.store( i+2UL, j , xmm5 );
1294  C.store( i+2UL, j+SIMDSIZE, xmm6 );
1295  }
1296 
1297  for( ; (i+2UL) <= iend; i+=2UL )
1298  {
1299  const size_t kbegin( ( IsUpper_v<MT4> )
1300  ?( ( IsLower_v<MT5> )
1301  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1302  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1303  :( IsLower_v<MT5> ? j : 0UL ) );
1304  const size_t kend( ( IsLower_v<MT4> )
1305  ?( ( IsUpper_v<MT5> )
1306  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1307  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1308  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1309 
1310  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1311  size_t k( kbegin );
1312 
1313  for( ; (k+2UL) <= kend; k+=2UL ) {
1314  const SIMDType a1( set( A(i ,k ) ) );
1315  const SIMDType a2( set( A(i+1UL,k ) ) );
1316  const SIMDType a3( set( A(i ,k+1UL) ) );
1317  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
1318  const SIMDType b1( B.load(k ,j ) );
1319  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
1320  const SIMDType b3( B.load(k+1UL,j ) );
1321  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
1322  xmm1 += a1 * b1;
1323  xmm2 += a1 * b2;
1324  xmm3 += a2 * b1;
1325  xmm4 += a2 * b2;
1326  xmm5 += a3 * b3;
1327  xmm6 += a3 * b4;
1328  xmm7 += a4 * b3;
1329  xmm8 += a4 * b4;
1330  }
1331 
1332  for( ; k<kend; ++k ) {
1333  const SIMDType a1( set( A(i ,k) ) );
1334  const SIMDType a2( set( A(i+1UL,k) ) );
1335  const SIMDType b1( B.load(k,j ) );
1336  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1337  xmm1 += a1 * b1;
1338  xmm2 += a1 * b2;
1339  xmm3 += a2 * b1;
1340  xmm4 += a2 * b2;
1341  }
1342 
1343  C.store( i , j , xmm1+xmm5 );
1344  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
1345  C.store( i+1UL, j , xmm3+xmm7 );
1346  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
1347  }
1348 
1349  if( i < iend )
1350  {
1351  const size_t kbegin( ( IsUpper_v<MT4> )
1352  ?( ( IsLower_v<MT5> )
1353  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1354  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1355  :( IsLower_v<MT5> ? j : 0UL ) );
1356  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1357 
1358  SIMDType xmm1, xmm2, xmm3, xmm4;
1359  size_t k( kbegin );
1360 
1361  for( ; (k+2UL) <= kend; k+=2UL ) {
1362  const SIMDType a1( set( A(i,k ) ) );
1363  const SIMDType a2( set( A(i,k+1UL) ) );
1364  xmm1 += a1 * B.load(k ,j );
1365  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
1366  xmm3 += a2 * B.load(k+1UL,j );
1367  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
1368  }
1369 
1370  for( ; k<kend; ++k ) {
1371  const SIMDType a1( set( A(i,k) ) );
1372  xmm1 += a1 * B.load(k,j );
1373  xmm2 += a1 * B.load(k,j+SIMDSIZE);
1374  }
1375 
1376  C.store( i, j , xmm1+xmm3 );
1377  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
1378 
1379  if( UPP ) ++i;
1380  }
1381 
1382  if( UPP ) {
1383  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1384  for( ; i<M; ++i ) {
1385  for( size_t jj=j; jj<jjend; ++jj ) {
1386  reset( C(i,jj) );
1387  }
1388  }
1389  }
1390  }
1391 
1392  for( ; j<jpos; j+=SIMDSIZE )
1393  {
1394  const size_t iend( UPP ? min(j+SIMDSIZE,M) : M );
1395  size_t i( 0UL );
1396 
1397  if( SYM || HERM ) {
1398  const size_t jjend( min(j+SIMDSIZE,N) );
1399  for( ; i<j; ++i ) {
1400  for( size_t jj=j; jj<jjend; ++jj ) {
1401  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1402  }
1403  }
1404  }
1405  else if( LOW ) {
1406  const size_t jjend( min(j+SIMDSIZE,N) );
1407  for( ; i<j; ++i ) {
1408  for( size_t jj=j; jj<jjend; ++jj ) {
1409  reset( C(i,jj) );
1410  }
1411  }
1412  }
1413 
1414  for( ; (i+4UL) <= iend; i+=4UL )
1415  {
1416  const size_t kbegin( ( IsUpper_v<MT4> )
1417  ?( ( IsLower_v<MT5> )
1418  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1419  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1420  :( IsLower_v<MT5> ? j : 0UL ) );
1421  const size_t kend( ( IsLower_v<MT4> )
1422  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1423  :( K ) );
1424 
1425  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1426  size_t k( kbegin );
1427 
1428  for( ; (k+2UL) <= kend; k+=2UL ) {
1429  const SIMDType b1( B.load(k ,j) );
1430  const SIMDType b2( B.load(k+1UL,j) );
1431  xmm1 += set( A(i ,k ) ) * b1;
1432  xmm2 += set( A(i+1UL,k ) ) * b1;
1433  xmm3 += set( A(i+2UL,k ) ) * b1;
1434  xmm4 += set( A(i+3UL,k ) ) * b1;
1435  xmm5 += set( A(i ,k+1UL) ) * b2;
1436  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
1437  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
1438  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
1439  }
1440 
1441  for( ; k<kend; ++k ) {
1442  const SIMDType b1( B.load(k,j) );
1443  xmm1 += set( A(i ,k) ) * b1;
1444  xmm2 += set( A(i+1UL,k) ) * b1;
1445  xmm3 += set( A(i+2UL,k) ) * b1;
1446  xmm4 += set( A(i+3UL,k) ) * b1;
1447  }
1448 
1449  C.store( i , j, xmm1+xmm5 );
1450  C.store( i+1UL, j, xmm2+xmm6 );
1451  C.store( i+2UL, j, xmm3+xmm7 );
1452  C.store( i+3UL, j, xmm4+xmm8 );
1453  }
1454 
1455  for( ; (i+3UL) <= iend; i+=3UL )
1456  {
1457  const size_t kbegin( ( IsUpper_v<MT4> )
1458  ?( ( IsLower_v<MT5> )
1459  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1460  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1461  :( IsLower_v<MT5> ? j : 0UL ) );
1462  const size_t kend( ( IsLower_v<MT4> )
1463  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1464  :( K ) );
1465 
1466  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1467  size_t k( kbegin );
1468 
1469  for( ; (k+2UL) <= kend; k+=2UL ) {
1470  const SIMDType b1( B.load(k ,j) );
1471  const SIMDType b2( B.load(k+1UL,j) );
1472  xmm1 += set( A(i ,k ) ) * b1;
1473  xmm2 += set( A(i+1UL,k ) ) * b1;
1474  xmm3 += set( A(i+2UL,k ) ) * b1;
1475  xmm4 += set( A(i ,k+1UL) ) * b2;
1476  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
1477  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
1478  }
1479 
1480  for( ; k<kend; ++k ) {
1481  const SIMDType b1( B.load(k,j) );
1482  xmm1 += set( A(i ,k) ) * b1;
1483  xmm2 += set( A(i+1UL,k) ) * b1;
1484  xmm3 += set( A(i+2UL,k) ) * b1;
1485  }
1486 
1487  C.store( i , j, xmm1+xmm4 );
1488  C.store( i+1UL, j, xmm2+xmm5 );
1489  C.store( i+2UL, j, xmm3+xmm6 );
1490  }
1491 
1492  for( ; (i+2UL) <= iend; i+=2UL )
1493  {
1494  const size_t kbegin( ( IsUpper_v<MT4> )
1495  ?( ( IsLower_v<MT5> )
1496  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1497  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1498  :( IsLower_v<MT5> ? j : 0UL ) );
1499  const size_t kend( ( IsLower_v<MT4> )
1500  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1501  :( K ) );
1502 
1503  SIMDType xmm1, xmm2, xmm3, xmm4;
1504  size_t k( kbegin );
1505 
1506  for( ; (k+2UL) <= kend; k+=2UL ) {
1507  const SIMDType b1( B.load(k ,j) );
1508  const SIMDType b2( B.load(k+1UL,j) );
1509  xmm1 += set( A(i ,k ) ) * b1;
1510  xmm2 += set( A(i+1UL,k ) ) * b1;
1511  xmm3 += set( A(i ,k+1UL) ) * b2;
1512  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
1513  }
1514 
1515  for( ; k<kend; ++k ) {
1516  const SIMDType b1( B.load(k,j) );
1517  xmm1 += set( A(i ,k) ) * b1;
1518  xmm2 += set( A(i+1UL,k) ) * b1;
1519  }
1520 
1521  C.store( i , j, xmm1+xmm3 );
1522  C.store( i+1UL, j, xmm2+xmm4 );
1523  }
1524 
1525  if( i < iend )
1526  {
1527  const size_t kbegin( ( IsUpper_v<MT4> )
1528  ?( ( IsLower_v<MT5> )
1529  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1530  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1531  :( IsLower_v<MT5> ? j : 0UL ) );
1532 
1533  SIMDType xmm1, xmm2;
1534  size_t k( kbegin );
1535 
1536  for( ; (k+2UL) <= K; k+=2UL ) {
1537  xmm1 += set( A(i,k ) ) * B.load(k ,j);
1538  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
1539  }
1540 
1541  for( ; k<K; ++k ) {
1542  xmm1 += set( A(i,k) ) * B.load(k,j);
1543  }
1544 
1545  C.store( i, j, xmm1+xmm2 );
1546 
1547  if( UPP ) ++i;
1548  }
1549 
1550  if( UPP ) {
1551  const size_t jjend( min(j+SIMDSIZE,N) );
1552  for( ; i<M; ++i ) {
1553  for( size_t jj=j; jj<jjend; ++jj ) {
1554  reset( C(i,jj) );
1555  }
1556  }
1557  }
1558  }
1559 
1560  for( ; remainder && j<N; ++j )
1561  {
1562  size_t i( 0UL );
1563 
1564  if( SYM || HERM ) {
1565  for( ; i<j; ++i ) {
1566  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1567  }
1568  }
1569  else if( LOW ) {
1570  for( ; i<j; ++i ) {
1571  reset( C(i,j) );
1572  }
1573  }
1574 
1575  for( ; (i+2UL) <= M; i+=2UL )
1576  {
1577  const size_t kbegin( ( IsUpper_v<MT4> )
1578  ?( ( IsLower_v<MT5> )
1579  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1580  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1581  :( IsLower_v<MT5> ? j : 0UL ) );
1582  const size_t kend( ( IsLower_v<MT4> )
1583  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1584  :( K ) );
1585 
1586  ElementType value1{};
1587  ElementType value2{};
1588 
1589  for( size_t k=kbegin; k<kend; ++k ) {
1590  value1 += A(i ,k) * B(k,j);
1591  value2 += A(i+1UL,k) * B(k,j);
1592  }
1593 
1594  C(i ,j) = value1;
1595  C(i+1UL,j) = value2;
1596  }
1597 
1598  if( i < M )
1599  {
1600  const size_t kbegin( ( IsUpper_v<MT4> )
1601  ?( ( IsLower_v<MT5> )
1602  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1603  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1604  :( IsLower_v<MT5> ? j : 0UL ) );
1605 
1606  ElementType value{};
1607 
1608  for( size_t k=kbegin; k<K; ++k ) {
1609  value += A(i,k) * B(k,j);
1610  }
1611 
1612  C(i,j) = value;
1613  }
1614  }
1615  }
1617  //**********************************************************************************************
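Editor's note: the vectorized kernel above keeps a small block of accumulators (a few rows by several SIMD-wide column panels) in registers and streams through k once per block. A scalar sketch of the 2x2 accumulator idea, ignoring the triangular clipping and remainder handling (assumes M and N are even):

#include <cstddef>
#include <vector>

using Matrix = std::vector<std::vector<double>>;

// 2x2 register blocking: four accumulators per (i,j) block, one pass over k.
void multiplyBlocked2x2( Matrix& C, const Matrix& A, const Matrix& B )
{
   const std::size_t M = A.size(), K = B.size(), N = B[0].size();

   for( std::size_t j = 0; j < N; j += 2 ) {
      for( std::size_t i = 0; i < M; i += 2 ) {
         double c00 = 0.0, c01 = 0.0, c10 = 0.0, c11 = 0.0;
         for( std::size_t k = 0; k < K; ++k ) {
            const double a0 = A[i][k], a1 = A[i+1][k];
            const double b0 = B[k][j], b1 = B[k][j+1];
            c00 += a0*b0;  c01 += a0*b1;
            c10 += a1*b0;  c11 += a1*b1;
         }
         C[i][j]   = c00;  C[i][j+1]   = c01;
         C[i+1][j] = c10;  C[i+1][j+1] = c11;
      }
   }
}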
1618 
1619  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
1634  template< typename MT3 // Type of the left-hand side target matrix
1635  , typename MT4 // Type of the left-hand side matrix operand
1636  , typename MT5 > // Type of the right-hand side matrix operand
1637  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1638  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1639  {
1644 
1645  const ForwardFunctor fwd;
1646 
1647  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
1648  const OppositeType_t<MT4> tmp( serial( A ) );
1649  assign( C, fwd( tmp * B ) );
1650  }
1651  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
1652  const OppositeType_t<MT5> tmp( serial( B ) );
1653  assign( C, fwd( A * tmp ) );
1654  }
1655  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1656  const OppositeType_t<MT4> tmp( serial( A ) );
1657  assign( C, fwd( tmp * B ) );
1658  }
1659  else {
1660  const OppositeType_t<MT5> tmp( serial( B ) );
1661  assign( C, fwd( A * tmp ) );
1662  }
1663  }
1665  //**********************************************************************************************
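Editor's note: the column-major variant above does not multiply directly; it copies one operand into the opposite storage order and reassigns the rebuilt product, so the actual work lands in a kernel better suited to the target. From the caller's side, mixing storage orders is transparent; a usage-level sketch with the public Blaze API:

#include <blaze/Math.h>

int main()
{
   blaze::DynamicMatrix<double,blaze::rowMajor>    A( 64UL, 64UL, 1.0 );
   blaze::DynamicMatrix<double,blaze::rowMajor>    B( 64UL, 64UL, 2.0 );
   blaze::DynamicMatrix<double,blaze::columnMajor> C;

   // Assigning a row-major product to a column-major target ends up in the
   // column-major kernel selection shown above; the storage-order conversion
   // of one operand happens internally.
   C = A * B;
}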
1666 
1667  //**Default assignment to dense matrices (large matrices)***************************************
1680  template< typename MT3 // Type of the left-hand side target matrix
1681  , typename MT4 // Type of the left-hand side matrix operand
1682  , typename MT5 > // Type of the right-hand side matrix operand
1683  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1684  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1685  {
1686  selectDefaultAssignKernel( C, A, B );
1687  }
1689  //**********************************************************************************************
1690 
1691  //**Vectorized default assignment to dense matrices (large matrices)****************************
1705  template< typename MT3 // Type of the left-hand side target matrix
1706  , typename MT4 // Type of the left-hand side matrix operand
1707  , typename MT5 > // Type of the right-hand side matrix operand
1708  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1709  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1710  {
1711  if( SYM )
1712  smmm( C, A, B, ElementType(1) );
1713  else if( HERM )
1714  hmmm( C, A, B, ElementType(1) );
1715  else if( LOW )
1716  lmmm( C, A, B, ElementType(1), ElementType(0) );
1717  else if( UPP )
1718  ummm( C, A, B, ElementType(1), ElementType(0) );
1719  else
1720  mmm( C, A, B, ElementType(1), ElementType(0) );
1721  }
1723  //**********************************************************************************************
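Editor's note: the large-matrix path forwards to the blocked mmm()/smmm()/hmmm()/lmmm()/ummm() helpers, which tile the operands so that panels stay in cache. A simplified cache-blocked scalar sketch of the C = alpha*A*B + beta*C contract used by those calls (the tile size 64 is an arbitrary assumption for illustration):

#include <algorithm>
#include <cstddef>
#include <vector>

using Matrix = std::vector<std::vector<double>>;

// Blocked C = alpha*A*B + beta*C.
void mmmSketch( Matrix& C, const Matrix& A, const Matrix& B, double alpha, double beta )
{
   const std::size_t M = A.size(), K = B.size(), N = B[0].size(), BS = 64;

   for( auto& row : C )           // apply beta once up front
      for( auto& v : row ) v *= beta;

   for( std::size_t ii = 0; ii < M; ii += BS )
      for( std::size_t kk = 0; kk < K; kk += BS )
         for( std::size_t jj = 0; jj < N; jj += BS )
            for( std::size_t i = ii; i < std::min(ii+BS,M); ++i )
               for( std::size_t k = kk; k < std::min(kk+BS,K); ++k )
                  for( std::size_t j = jj; j < std::min(jj+BS,N); ++j )
                     C[i][j] += alpha * A[i][k] * B[k][j];
}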
1724 
1725  //**BLAS-based assignment to dense matrices (default)*******************************************
1738  template< typename MT3 // Type of the left-hand side target matrix
1739  , typename MT4 // Type of the left-hand side matrix operand
1740  , typename MT5 > // Type of the right-hand side matrix operand
1741  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1742  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1743  {
1744  selectLargeAssignKernel( C, A, B );
1745  }
1747  //**********************************************************************************************
1748 
1749  //**BLAS-based assignment to dense matrices*****************************************************
1750 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1751 
1763  template< typename MT3 // Type of the left-hand side target matrix
1764  , typename MT4 // Type of the left-hand side matrix operand
1765  , typename MT5 > // Type of the right-hand side matrix operand
1766  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1767  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1768  {
1769  using ET = ElementType_t<MT3>;
1770 
1771  if( IsTriangular_v<MT4> ) {
1772  assign( C, B );
1773  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
1774  }
1775  else if( IsTriangular_v<MT5> ) {
1776  assign( C, A );
1777  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
1778  }
1779  else {
1780  gemm( C, A, B, ET(1), ET(0) );
1781  }
1782  }
1784 #endif
1785  //**********************************************************************************************
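Editor's note: on the BLAS path, triangular operands go through trmm() and everything else through gemm(). A hedged sketch of the equivalent direct CBLAS call for the plain row-major double case (assumes a CBLAS header is available; Blaze's own gemm() wrapper handles this internally):

#include <cblas.h>
#include <vector>

// C (MxN, row-major) = 1.0 * A (MxK) * B (KxN) + 0.0 * C
void gemmSketch( std::vector<double>& C, const std::vector<double>& A,
                 const std::vector<double>& B, int M, int N, int K )
{
   cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans,
                M, N, K,
                1.0, A.data(), K,
                     B.data(), N,
                0.0, C.data(), N );
}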
1786 
1787  //**Assignment to sparse matrices***************************************************************
1800  template< typename MT // Type of the target sparse matrix
1801  , bool SO > // Storage order of the target sparse matrix
1802  friend inline auto assign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
1803  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1804  {
1806 
1807  using TmpType = If_t< SO, OppositeType, ResultType >;
1808 
1815 
1816  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1817  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1818 
1819  const ForwardFunctor fwd;
1820 
1821  const TmpType tmp( serial( rhs ) );
1822  assign( ~lhs, fwd( tmp ) );
1823  }
1825  //**********************************************************************************************
1826 
1827  //**Restructuring assignment to column-major matrices*******************************************
1842  template< typename MT > // Type of the target matrix
1843  friend inline auto assign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
1844  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1845  {
1847 
1849 
1850  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1851  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1852 
1853  const ForwardFunctor fwd;
1854 
1855  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
1856  assign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
1857  else if( IsSymmetric_v<MT1> )
1858  assign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
1859  else
1860  assign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
1861  }
1863  //**********************************************************************************************
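Editor's note: the restructuring above relies on trans(S) == S for a symmetric operand, so the product can be rewritten with transposed (and therefore storage-order-flipped) factors that better match a column-major target. A small numeric check of that identity using the public API, with a SymmetricMatrix standing in for a symmetric MT1:

#include <blaze/Math.h>
#include <cassert>

int main()
{
   blaze::SymmetricMatrix< blaze::DynamicMatrix<double> > S( 3UL );
   S(0,0) = 1.0; S(0,1) = 2.0; S(0,2) = 3.0;
   S(1,1) = 4.0; S(1,2) = 5.0; S(2,2) = 6.0;

   blaze::DynamicMatrix<double> B( 3UL, 3UL, 1.5 );

   const blaze::DynamicMatrix<double> C1( S * B );
   const blaze::DynamicMatrix<double> C2( trans( S ) * B );  // same result, since trans(S) == S

   assert( C1 == C2 );
}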
1864 
1865  //**Addition assignment to dense matrices*******************************************************
1878  template< typename MT // Type of the target dense matrix
1879  , bool SO > // Storage order of the target dense matrix
1880  friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
1881  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1882  {
1884 
1885  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1886  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1887 
1888  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1889  return;
1890  }
1891 
1892  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1893  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1894 
1895  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1896  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1897  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1898  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1899  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1900  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1901 
1902  DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1903  }
1905  //**********************************************************************************************
1906 
1907  //**Addition assignment to dense matrices (kernel selection)************************************
1918  template< typename MT3 // Type of the left-hand side target matrix
1919  , typename MT4 // Type of the left-hand side matrix operand
1920  , typename MT5 > // Type of the right-hand side matrix operand
1921  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1922  {
1923  if( ( IsDiagonal_v<MT5> ) ||
1924  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
1925  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
1926  selectSmallAddAssignKernel( C, A, B );
1927  else
1928  selectBlasAddAssignKernel( C, A, B );
1929  }
1931  //**********************************************************************************************
1932 
1933  //**Default addition assignment to dense matrices (general/general)*****************************
1947  template< typename MT3 // Type of the left-hand side target matrix
1948  , typename MT4 // Type of the left-hand side matrix operand
1949  , typename MT5 > // Type of the right-hand side matrix operand
1950  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1951  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1952  {
1953  const size_t M( A.rows() );
1954  const size_t N( B.columns() );
1955  const size_t K( A.columns() );
1956 
1957  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1958 
1959  for( size_t i=0UL; i<M; ++i )
1960  {
1961  const size_t kbegin( ( IsUpper_v<MT4> )
1962  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1963  :( 0UL ) );
1964  const size_t kend( ( IsLower_v<MT4> )
1965  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
1966  :( K ) );
1967  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1968 
1969  for( size_t k=kbegin; k<kend; ++k )
1970  {
1971  const size_t jbegin( ( IsUpper_v<MT5> )
1972  ?( ( IsStrictlyUpper_v<MT5> )
1973  ?( UPP ? max(i,k+1UL) : k+1UL )
1974  :( UPP ? max(i,k) : k ) )
1975  :( UPP ? i : 0UL ) );
1976  const size_t jend( ( IsLower_v<MT5> )
1977  ?( ( IsStrictlyLower_v<MT5> )
1978  ?( LOW ? min(i+1UL,k) : k )
1979  :( LOW ? min(i,k)+1UL : k+1UL ) )
1980  :( LOW ? i+1UL : N ) );
1981 
1982  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
1983  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1984 
1985  const size_t jnum( jend - jbegin );
1986  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1987 
1988  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1989  C(i,j ) += A(i,k) * B(k,j );
1990  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1991  }
1992  if( jpos < jend ) {
1993  C(i,jpos) += A(i,k) * B(k,jpos);
1994  }
1995  }
1996  }
1997  }
1999  //**********************************************************************************************
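Editor's note: the addition kernel above unrolls the inner j loop by two; jpos = jbegin + ( jnum & size_t(-2) ) rounds the trip count down to an even number so that at most one scalar tail iteration remains. A tiny self-contained illustration of that masking trick:

#include <cstddef>
#include <iostream>

int main()
{
   for( std::size_t jnum : { 0UL, 1UL, 5UL, 8UL } ) {
      const std::size_t even = jnum & std::size_t(-2);  // largest even value <= jnum
      std::cout << jnum << " -> " << even << '\n';      // 0->0, 1->0, 5->4, 8->8
   }
}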
2000 
2001  //**Default addition assignment to dense matrices (general/diagonal)****************************
2015  template< typename MT3 // Type of the left-hand side target matrix
2016  , typename MT4 // Type of the left-hand side matrix operand
2017  , typename MT5 > // Type of the right-hand side matrix operand
2018  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2019  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2020  {
2022 
2023  const size_t M( A.rows() );
2024  const size_t N( B.columns() );
2025 
2026  for( size_t i=0UL; i<M; ++i )
2027  {
2028  const size_t jbegin( ( IsUpper_v<MT4> )
2029  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2030  :( 0UL ) );
2031  const size_t jend( ( IsLower_v<MT4> )
2032  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2033  :( N ) );
2034  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2035 
2036  const size_t jnum( jend - jbegin );
2037  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2038 
2039  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2040  C(i,j ) += A(i,j ) * B(j ,j );
2041  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
2042  }
2043  if( jpos < jend ) {
2044  C(i,jpos) += A(i,jpos) * B(jpos,jpos);
2045  }
2046  }
2047  }
2049  //**********************************************************************************************
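   // Since B is diagonal, the sum over k collapses to the single term k == j, i.e. the kernel
   // above adds column j of A scaled by the diagonal element B(j,j) to column j of C:
   //
   //    C(i,j) += A(i,j) * B(j,j);   // for all (i,j) within the band admitted by MT4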
2050 
2051  //**Default addition assignment to dense matrices (diagonal/general)****************************
2065  template< typename MT3 // Type of the left-hand side target matrix
2066  , typename MT4 // Type of the left-hand side matrix operand
2067  , typename MT5 > // Type of the right-hand side matrix operand
2068  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2069  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2070  {
2072 
2073  const size_t M( A.rows() );
2074  const size_t N( B.columns() );
2075 
2076  for( size_t i=0UL; i<M; ++i )
2077  {
2078  const size_t jbegin( ( IsUpper_v<MT5> )
2079  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
2080  :( 0UL ) );
2081  const size_t jend( ( IsLower_v<MT5> )
2082  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
2083  :( N ) );
2084  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2085 
2086  const size_t jnum( jend - jbegin );
2087  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2088 
2089  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2090  C(i,j ) += A(i,i) * B(i,j );
2091  C(i,j+1UL) += A(i,i) * B(i,j+1UL);
2092  }
2093  if( jpos < jend ) {
2094  C(i,jpos) += A(i,i) * B(i,jpos);
2095  }
2096  }
2097  }
2099  //**********************************************************************************************
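   // Since A is diagonal, only the term k == i contributes, i.e. the kernel above adds row i of
   // B scaled by the diagonal element A(i,i) to row i of C:
   //
   //    C(i,j) += A(i,i) * B(i,j);   // for all (i,j) within the band admitted by MT5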
2100 
2101  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2115  template< typename MT3 // Type of the left-hand side target matrix
2116  , typename MT4 // Type of the left-hand side matrix operand
2117  , typename MT5 > // Type of the right-hand side matrix operand
2118  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2119  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2120  {
2122 
2123  for( size_t i=0UL; i<A.rows(); ++i ) {
2124  C(i,i) += A(i,i) * B(i,i);
2125  }
2126  }
2128  //**********************************************************************************************
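   // The product of two diagonal matrices is itself diagonal, so the kernel above updates only
   // the diagonal elements of C; all off-diagonal elements remain untouched.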
2129 
2130  //**Default addition assignment to dense matrices (small matrices)******************************
2144  template< typename MT3 // Type of the left-hand side target matrix
2145  , typename MT4 // Type of the left-hand side matrix operand
2146  , typename MT5 > // Type of the right-hand side matrix operand
2147  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2148  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2149  {
2150  selectDefaultAddAssignKernel( C, A, B );
2151  }
2153  //**********************************************************************************************
2154 
2155  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
2170  template< typename MT3 // Type of the left-hand side target matrix
2171  , typename MT4 // Type of the left-hand side matrix operand
2172  , typename MT5 > // Type of the right-hand side matrix operand
2173  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2174  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2175  {
2176  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
2177 
2178  const size_t M( A.rows() );
2179  const size_t N( B.columns() );
2180  const size_t K( A.columns() );
2181 
2182  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2183 
2184  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2185  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2186 
2187  size_t j( 0UL );
2188 
2189  if( IsIntegral_v<ElementType> )
2190  {
2191  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2192  for( size_t i=0UL; i<M; ++i )
2193  {
2194  const size_t kbegin( ( IsUpper_v<MT4> )
2195  ?( ( IsLower_v<MT5> )
2196  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2197  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2198  :( IsLower_v<MT5> ? j : 0UL ) );
2199  const size_t kend( ( IsLower_v<MT4> )
2200  ?( ( IsUpper_v<MT5> )
2201  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
2202  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2203  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
2204 
2205  SIMDType xmm1( C.load(i,j ) );
2206  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2207  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2208  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
2209  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
2210  SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
2211  SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
2212  SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
2213 
2214  for( size_t k=kbegin; k<kend; ++k ) {
2215  const SIMDType a1( set( A(i,k) ) );
2216  xmm1 += a1 * B.load(k,j );
2217  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2218  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2219  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2220  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2221  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
2222  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
2223  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
2224  }
2225 
2226  C.store( i, j , xmm1 );
2227  C.store( i, j+SIMDSIZE , xmm2 );
2228  C.store( i, j+SIMDSIZE*2UL, xmm3 );
2229  C.store( i, j+SIMDSIZE*3UL, xmm4 );
2230  C.store( i, j+SIMDSIZE*4UL, xmm5 );
2231  C.store( i, j+SIMDSIZE*5UL, xmm6 );
2232  C.store( i, j+SIMDSIZE*6UL, xmm7 );
2233  C.store( i, j+SIMDSIZE*7UL, xmm8 );
2234  }
2235  }
2236  }
2237 
2238  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
2239  {
2240  size_t i( 0UL );
2241 
2242  for( ; (i+2UL) <= M; i+=2UL )
2243  {
2244  const size_t kbegin( ( IsUpper_v<MT4> )
2245  ?( ( IsLower_v<MT5> )
2246  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2247  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2248  :( IsLower_v<MT5> ? j : 0UL ) );
2249  const size_t kend( ( IsLower_v<MT4> )
2250  ?( ( IsUpper_v<MT5> )
2251  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
2252  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2253  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
2254 
2255  SIMDType xmm1 ( C.load(i ,j ) );
2256  SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
2257  SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
2258  SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
2259  SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
2260  SIMDType xmm6 ( C.load(i+1UL,j ) );
2261  SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
2262  SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
2263  SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
2264  SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
2265 
2266  for( size_t k=kbegin; k<kend; ++k ) {
2267  const SIMDType a1( set( A(i ,k) ) );
2268  const SIMDType a2( set( A(i+1UL,k) ) );
2269  const SIMDType b1( B.load(k,j ) );
2270  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2271  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2272  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2273  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
2274  xmm1 += a1 * b1;
2275  xmm2 += a1 * b2;
2276  xmm3 += a1 * b3;
2277  xmm4 += a1 * b4;
2278  xmm5 += a1 * b5;
2279  xmm6 += a2 * b1;
2280  xmm7 += a2 * b2;
2281  xmm8 += a2 * b3;
2282  xmm9 += a2 * b4;
2283  xmm10 += a2 * b5;
2284  }
2285 
2286  C.store( i , j , xmm1 );
2287  C.store( i , j+SIMDSIZE , xmm2 );
2288  C.store( i , j+SIMDSIZE*2UL, xmm3 );
2289  C.store( i , j+SIMDSIZE*3UL, xmm4 );
2290  C.store( i , j+SIMDSIZE*4UL, xmm5 );
2291  C.store( i+1UL, j , xmm6 );
2292  C.store( i+1UL, j+SIMDSIZE , xmm7 );
2293  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
2294  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
2295  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
2296  }
2297 
2298  if( i < M )
2299  {
2300  const size_t kbegin( ( IsUpper_v<MT4> )
2301  ?( ( IsLower_v<MT5> )
2302  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2303  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2304  :( IsLower_v<MT5> ? j : 0UL ) );
2305  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
2306 
2307  SIMDType xmm1( C.load(i,j ) );
2308  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2309  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2310  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
2311  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
2312 
2313  for( size_t k=kbegin; k<kend; ++k ) {
2314  const SIMDType a1( set( A(i,k) ) );
2315  xmm1 += a1 * B.load(k,j );
2316  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2317  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2318  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2319  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2320  }
2321 
2322  C.store( i, j , xmm1 );
2323  C.store( i, j+SIMDSIZE , xmm2 );
2324  C.store( i, j+SIMDSIZE*2UL, xmm3 );
2325  C.store( i, j+SIMDSIZE*3UL, xmm4 );
2326  C.store( i, j+SIMDSIZE*4UL, xmm5 );
2327  }
2328  }
2329 
2330  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2331  {
2332  size_t i( 0UL );
2333 
2334  for( ; (i+2UL) <= M; i+=2UL )
2335  {
2336  const size_t kbegin( ( IsUpper_v<MT4> )
2337  ?( ( IsLower_v<MT5> )
2338  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2339  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2340  :( IsLower_v<MT5> ? j : 0UL ) );
2341  const size_t kend( ( IsLower_v<MT4> )
2342  ?( ( IsUpper_v<MT5> )
2343  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
2344  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2345  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
2346 
2347  SIMDType xmm1( C.load(i ,j ) );
2348  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
2349  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
2350  SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
2351  SIMDType xmm5( C.load(i+1UL,j ) );
2352  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
2353  SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
2354  SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
2355 
2356  for( size_t k=kbegin; k<kend; ++k ) {
2357  const SIMDType a1( set( A(i ,k) ) );
2358  const SIMDType a2( set( A(i+1UL,k) ) );
2359  const SIMDType b1( B.load(k,j ) );
2360  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2361  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2362  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2363  xmm1 += a1 * b1;
2364  xmm2 += a1 * b2;
2365  xmm3 += a1 * b3;
2366  xmm4 += a1 * b4;
2367  xmm5 += a2 * b1;
2368  xmm6 += a2 * b2;
2369  xmm7 += a2 * b3;
2370  xmm8 += a2 * b4;
2371  }
2372 
2373  C.store( i , j , xmm1 );
2374  C.store( i , j+SIMDSIZE , xmm2 );
2375  C.store( i , j+SIMDSIZE*2UL, xmm3 );
2376  C.store( i , j+SIMDSIZE*3UL, xmm4 );
2377  C.store( i+1UL, j , xmm5 );
2378  C.store( i+1UL, j+SIMDSIZE , xmm6 );
2379  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
2380  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
2381  }
2382 
2383  if( i < M )
2384  {
2385  const size_t kbegin( ( IsUpper_v<MT4> )
2386  ?( ( IsLower_v<MT5> )
2387  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2388  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2389  :( IsLower_v<MT5> ? j : 0UL ) );
2390  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
2391 
2392  SIMDType xmm1( C.load(i,j ) );
2393  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2394  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2395  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
2396 
2397  for( size_t k=kbegin; k<kend; ++k ) {
2398  const SIMDType a1( set( A(i,k) ) );
2399  xmm1 += a1 * B.load(k,j );
2400  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2401  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2402  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2403  }
2404 
2405  C.store( i, j , xmm1 );
2406  C.store( i, j+SIMDSIZE , xmm2 );
2407  C.store( i, j+SIMDSIZE*2UL, xmm3 );
2408  C.store( i, j+SIMDSIZE*3UL, xmm4 );
2409  }
2410  }
2411 
2412  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2413  {
2414  size_t i( 0UL );
2415 
2416  for( ; (i+2UL) <= M; i+=2UL )
2417  {
2418  const size_t kbegin( ( IsUpper_v<MT4> )
2419  ?( ( IsLower_v<MT5> )
2420  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2421  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2422  :( IsLower_v<MT5> ? j : 0UL ) );
2423  const size_t kend( ( IsLower_v<MT4> )
2424  ?( ( IsUpper_v<MT5> )
2425  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
2426  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2427  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
2428 
2429  SIMDType xmm1( C.load(i ,j ) );
2430  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
2431  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
2432  SIMDType xmm4( C.load(i+1UL,j ) );
2433  SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
2434  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
2435 
2436  for( size_t k=kbegin; k<kend; ++k ) {
2437  const SIMDType a1( set( A(i ,k) ) );
2438  const SIMDType a2( set( A(i+1UL,k) ) );
2439  const SIMDType b1( B.load(k,j ) );
2440  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2441  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2442  xmm1 += a1 * b1;
2443  xmm2 += a1 * b2;
2444  xmm3 += a1 * b3;
2445  xmm4 += a2 * b1;
2446  xmm5 += a2 * b2;
2447  xmm6 += a2 * b3;
2448  }
2449 
2450  C.store( i , j , xmm1 );
2451  C.store( i , j+SIMDSIZE , xmm2 );
2452  C.store( i , j+SIMDSIZE*2UL, xmm3 );
2453  C.store( i+1UL, j , xmm4 );
2454  C.store( i+1UL, j+SIMDSIZE , xmm5 );
2455  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
2456  }
2457 
2458  if( i < M )
2459  {
2460  const size_t kbegin( ( IsUpper_v<MT4> )
2461  ?( ( IsLower_v<MT5> )
2462  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2463  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2464  :( IsLower_v<MT5> ? j : 0UL ) );
2465  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
2466 
2467  SIMDType xmm1( C.load(i,j ) );
2468  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2469  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2470 
2471  for( size_t k=kbegin; k<kend; ++k ) {
2472  const SIMDType a1( set( A(i,k) ) );
2473  xmm1 += a1 * B.load(k,j );
2474  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2475  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2476  }
2477 
2478  C.store( i, j , xmm1 );
2479  C.store( i, j+SIMDSIZE , xmm2 );
2480  C.store( i, j+SIMDSIZE*2UL, xmm3 );
2481  }
2482  }
2483 
2484  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2485  {
2486  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
2487  size_t i( LOW ? j : 0UL );
2488 
2489  for( ; (i+4UL) <= iend; i+=4UL )
2490  {
2491  const size_t kbegin( ( IsUpper_v<MT4> )
2492  ?( ( IsLower_v<MT5> )
2493  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2494  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2495  :( IsLower_v<MT5> ? j : 0UL ) );
2496  const size_t kend( ( IsLower_v<MT4> )
2497  ?( ( IsUpper_v<MT5> )
2498  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
2499  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
2500  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
2501 
2502  SIMDType xmm1( C.load(i ,j ) );
2503  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
2504  SIMDType xmm3( C.load(i+1UL,j ) );
2505  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
2506  SIMDType xmm5( C.load(i+2UL,j ) );
2507  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
2508  SIMDType xmm7( C.load(i+3UL,j ) );
2509  SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
2510 
2511  for( size_t k=kbegin; k<kend; ++k ) {
2512  const SIMDType a1( set( A(i ,k) ) );
2513  const SIMDType a2( set( A(i+1UL,k) ) );
2514  const SIMDType a3( set( A(i+2UL,k) ) );
2515  const SIMDType a4( set( A(i+3UL,k) ) );
2516  const SIMDType b1( B.load(k,j ) );
2517  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2518  xmm1 += a1 * b1;
2519  xmm2 += a1 * b2;
2520  xmm3 += a2 * b1;
2521  xmm4 += a2 * b2;
2522  xmm5 += a3 * b1;
2523  xmm6 += a3 * b2;
2524  xmm7 += a4 * b1;
2525  xmm8 += a4 * b2;
2526  }
2527 
2528  C.store( i , j , xmm1 );
2529  C.store( i , j+SIMDSIZE, xmm2 );
2530  C.store( i+1UL, j , xmm3 );
2531  C.store( i+1UL, j+SIMDSIZE, xmm4 );
2532  C.store( i+2UL, j , xmm5 );
2533  C.store( i+2UL, j+SIMDSIZE, xmm6 );
2534  C.store( i+3UL, j , xmm7 );
2535  C.store( i+3UL, j+SIMDSIZE, xmm8 );
2536  }
2537 
2538  for( ; (i+3UL) <= iend; i+=3UL )
2539  {
2540  const size_t kbegin( ( IsUpper_v<MT4> )
2541  ?( ( IsLower_v<MT5> )
2542  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2543  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2544  :( IsLower_v<MT5> ? j : 0UL ) );
2545  const size_t kend( ( IsLower_v<MT4> )
2546  ?( ( IsUpper_v<MT5> )
2547  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
2548  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
2549  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
2550 
2551  SIMDType xmm1( C.load(i ,j ) );
2552  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
2553  SIMDType xmm3( C.load(i+1UL,j ) );
2554  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
2555  SIMDType xmm5( C.load(i+2UL,j ) );
2556  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
2557 
2558  for( size_t k=kbegin; k<kend; ++k ) {
2559  const SIMDType a1( set( A(i ,k) ) );
2560  const SIMDType a2( set( A(i+1UL,k) ) );
2561  const SIMDType a3( set( A(i+2UL,k) ) );
2562  const SIMDType b1( B.load(k,j ) );
2563  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2564  xmm1 += a1 * b1;
2565  xmm2 += a1 * b2;
2566  xmm3 += a2 * b1;
2567  xmm4 += a2 * b2;
2568  xmm5 += a3 * b1;
2569  xmm6 += a3 * b2;
2570  }
2571 
2572  C.store( i , j , xmm1 );
2573  C.store( i , j+SIMDSIZE, xmm2 );
2574  C.store( i+1UL, j , xmm3 );
2575  C.store( i+1UL, j+SIMDSIZE, xmm4 );
2576  C.store( i+2UL, j , xmm5 );
2577  C.store( i+2UL, j+SIMDSIZE, xmm6 );
2578  }
2579 
2580  for( ; (i+2UL) <= iend; i+=2UL )
2581  {
2582  const size_t kbegin( ( IsUpper_v<MT4> )
2583  ?( ( IsLower_v<MT5> )
2584  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2585  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2586  :( IsLower_v<MT5> ? j : 0UL ) );
2587  const size_t kend( ( IsLower_v<MT4> )
2588  ?( ( IsUpper_v<MT5> )
2589  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
2590  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2591  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
2592 
2593  SIMDType xmm1( C.load(i ,j ) );
2594  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
2595  SIMDType xmm3( C.load(i+1UL,j ) );
2596  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
2597  SIMDType xmm5, xmm6, xmm7, xmm8;
2598  size_t k( kbegin );
2599 
2600  for( ; (k+2UL) <= kend; k+=2UL ) {
2601  const SIMDType a1( set( A(i ,k ) ) );
2602  const SIMDType a2( set( A(i+1UL,k ) ) );
2603  const SIMDType a3( set( A(i ,k+1UL) ) );
2604  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
2605  const SIMDType b1( B.load(k ,j ) );
2606  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
2607  const SIMDType b3( B.load(k+1UL,j ) );
2608  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
2609  xmm1 += a1 * b1;
2610  xmm2 += a1 * b2;
2611  xmm3 += a2 * b1;
2612  xmm4 += a2 * b2;
2613  xmm5 += a3 * b3;
2614  xmm6 += a3 * b4;
2615  xmm7 += a4 * b3;
2616  xmm8 += a4 * b4;
2617  }
2618 
2619  for( ; k<kend; ++k ) {
2620  const SIMDType a1( set( A(i ,k) ) );
2621  const SIMDType a2( set( A(i+1UL,k) ) );
2622  const SIMDType b1( B.load(k,j ) );
2623  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2624  xmm1 += a1 * b1;
2625  xmm2 += a1 * b2;
2626  xmm3 += a2 * b1;
2627  xmm4 += a2 * b2;
2628  }
2629 
2630  C.store( i , j , xmm1+xmm5 );
2631  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
2632  C.store( i+1UL, j , xmm3+xmm7 );
2633  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
2634  }
2635 
2636  if( i < iend )
2637  {
2638  const size_t kbegin( ( IsUpper_v<MT4> )
2639  ?( ( IsLower_v<MT5> )
2640  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2641  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2642  :( IsLower_v<MT5> ? j : 0UL ) );
2643  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
2644 
2645  SIMDType xmm1( C.load(i,j ) );
2646  SIMDType xmm2( C.load(i,j+SIMDSIZE) );
2647  SIMDType xmm3, xmm4;
2648  size_t k( kbegin );
2649 
2650  for( ; (k+2UL) <= kend; k+=2UL ) {
2651  const SIMDType a1( set( A(i,k ) ) );
2652  const SIMDType a2( set( A(i,k+1UL) ) );
2653  xmm1 += a1 * B.load(k ,j );
2654  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
2655  xmm3 += a2 * B.load(k+1UL,j );
2656  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
2657  }
2658 
2659  for( ; k<kend; ++k ) {
2660  const SIMDType a1( set( A(i,k) ) );
2661  xmm1 += a1 * B.load(k,j );
2662  xmm2 += a1 * B.load(k,j+SIMDSIZE);
2663  }
2664 
2665  C.store( i, j , xmm1+xmm3 );
2666  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
2667  }
2668  }
2669 
2670  for( ; j<jpos; j+=SIMDSIZE )
2671  {
2672  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
2673  size_t i( LOW ? j : 0UL );
2674 
2675  for( ; (i+4UL) <= iend; i+=4UL )
2676  {
2677  const size_t kbegin( ( IsUpper_v<MT4> )
2678  ?( ( IsLower_v<MT5> )
2679  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2680  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2681  :( IsLower_v<MT5> ? j : 0UL ) );
2682  const size_t kend( ( IsLower_v<MT4> )
2683  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
2684  :( K ) );
2685 
2686  SIMDType xmm1( C.load(i ,j) );
2687  SIMDType xmm2( C.load(i+1UL,j) );
2688  SIMDType xmm3( C.load(i+2UL,j) );
2689  SIMDType xmm4( C.load(i+3UL,j) );
2690  SIMDType xmm5, xmm6, xmm7, xmm8;
2691  size_t k( kbegin );
2692 
2693  for( ; (k+2UL) <= kend; k+=2UL ) {
2694  const SIMDType b1( B.load(k ,j) );
2695  const SIMDType b2( B.load(k+1UL,j) );
2696  xmm1 += set( A(i ,k ) ) * b1;
2697  xmm2 += set( A(i+1UL,k ) ) * b1;
2698  xmm3 += set( A(i+2UL,k ) ) * b1;
2699  xmm4 += set( A(i+3UL,k ) ) * b1;
2700  xmm5 += set( A(i ,k+1UL) ) * b2;
2701  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
2702  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
2703  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
2704  }
2705 
2706  for( ; k<kend; ++k ) {
2707  const SIMDType b1( B.load(k,j) );
2708  xmm1 += set( A(i ,k) ) * b1;
2709  xmm2 += set( A(i+1UL,k) ) * b1;
2710  xmm3 += set( A(i+2UL,k) ) * b1;
2711  xmm4 += set( A(i+3UL,k) ) * b1;
2712  }
2713 
2714  C.store( i , j, xmm1+xmm5 );
2715  C.store( i+1UL, j, xmm2+xmm6 );
2716  C.store( i+2UL, j, xmm3+xmm7 );
2717  C.store( i+3UL, j, xmm4+xmm8 );
2718  }
2719 
2720  for( ; (i+3UL) <= iend; i+=3UL )
2721  {
2722  const size_t kbegin( ( IsUpper_v<MT4> )
2723  ?( ( IsLower_v<MT5> )
2724  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2725  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2726  :( IsLower_v<MT5> ? j : 0UL ) );
2727  const size_t kend( ( IsLower_v<MT4> )
2728  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
2729  :( K ) );
2730 
2731  SIMDType xmm1( C.load(i ,j) );
2732  SIMDType xmm2( C.load(i+1UL,j) );
2733  SIMDType xmm3( C.load(i+2UL,j) );
2734  SIMDType xmm4, xmm5, xmm6;
2735  size_t k( kbegin );
2736 
2737  for( ; (k+2UL) <= kend; k+=2UL ) {
2738  const SIMDType b1( B.load(k ,j) );
2739  const SIMDType b2( B.load(k+1UL,j) );
2740  xmm1 += set( A(i ,k ) ) * b1;
2741  xmm2 += set( A(i+1UL,k ) ) * b1;
2742  xmm3 += set( A(i+2UL,k ) ) * b1;
2743  xmm4 += set( A(i ,k+1UL) ) * b2;
2744  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
2745  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
2746  }
2747 
2748  for( ; k<kend; ++k ) {
2749  const SIMDType b1( B.load(k,j) );
2750  xmm1 += set( A(i ,k) ) * b1;
2751  xmm2 += set( A(i+1UL,k) ) * b1;
2752  xmm3 += set( A(i+2UL,k) ) * b1;
2753  }
2754 
2755  C.store( i , j, xmm1+xmm4 );
2756  C.store( i+1UL, j, xmm2+xmm5 );
2757  C.store( i+2UL, j, xmm3+xmm6 );
2758  }
2759 
2760  for( ; (i+2UL) <= iend; i+=2UL )
2761  {
2762  const size_t kbegin( ( IsUpper_v<MT4> )
2763  ?( ( IsLower_v<MT5> )
2764  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2765  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2766  :( IsLower_v<MT5> ? j : 0UL ) );
2767  const size_t kend( ( IsLower_v<MT4> )
2768  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2769  :( K ) );
2770 
2771  SIMDType xmm1( C.load(i ,j) );
2772  SIMDType xmm2( C.load(i+1UL,j) );
2773  SIMDType xmm3, xmm4;
2774  size_t k( kbegin );
2775 
2776  for( ; (k+2UL) <= kend; k+=2UL ) {
2777  const SIMDType b1( B.load(k ,j) );
2778  const SIMDType b2( B.load(k+1UL,j) );
2779  xmm1 += set( A(i ,k ) ) * b1;
2780  xmm2 += set( A(i+1UL,k ) ) * b1;
2781  xmm3 += set( A(i ,k+1UL) ) * b2;
2782  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
2783  }
2784 
2785  for( ; k<kend; ++k ) {
2786  const SIMDType b1( B.load(k,j) );
2787  xmm1 += set( A(i ,k) ) * b1;
2788  xmm2 += set( A(i+1UL,k) ) * b1;
2789  }
2790 
2791  C.store( i , j, xmm1+xmm3 );
2792  C.store( i+1UL, j, xmm2+xmm4 );
2793  }
2794 
2795  if( i < iend )
2796  {
2797  const size_t kbegin( ( IsUpper_v<MT4> )
2798  ?( ( IsLower_v<MT5> )
2799  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2800  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2801  :( IsLower_v<MT5> ? j : 0UL ) );
2802 
2803  SIMDType xmm1( C.load(i,j) );
2804  SIMDType xmm2;
2805  size_t k( kbegin );
2806 
2807  for( ; (k+2UL) <= K; k+=2UL ) {
2808  xmm1 += set( A(i,k ) ) * B.load(k ,j);
2809  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
2810  }
2811 
2812  for( ; k<K; ++k ) {
2813  xmm1 += set( A(i,k) ) * B.load(k,j);
2814  }
2815 
2816  C.store( i, j, xmm1+xmm2 );
2817  }
2818  }
2819 
2820  for( ; remainder && j<N; ++j )
2821  {
2822  const size_t iend( UPP ? j+1UL : M );
2823  size_t i( LOW ? j : 0UL );
2824 
2825  for( ; (i+2UL) <= iend; i+=2UL )
2826  {
2827  const size_t kbegin( ( IsUpper_v<MT4> )
2828  ?( ( IsLower_v<MT5> )
2829  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2830  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2831  :( IsLower_v<MT5> ? j : 0UL ) );
2832  const size_t kend( ( IsLower_v<MT4> )
2833  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2834  :( K ) );
2835 
2836  ElementType value1( C(i ,j) );
2837  ElementType value2( C(i+1UL,j) );
2838 
2839  for( size_t k=kbegin; k<kend; ++k ) {
2840  value1 += A(i ,k) * B(k,j);
2841  value2 += A(i+1UL,k) * B(k,j);
2842  }
2843 
2844  C(i ,j) = value1;
2845  C(i+1UL,j) = value2;
2846  }
2847 
2848  if( i < iend )
2849  {
2850  const size_t kbegin( ( IsUpper_v<MT4> )
2851  ?( ( IsLower_v<MT5> )
2852  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2853  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2854  :( IsLower_v<MT5> ? j : 0UL ) );
2855 
2856  ElementType value( C(i,j) );
2857 
2858  for( size_t k=kbegin; k<K; ++k ) {
2859  value += A(i,k) * B(k,j);
2860  }
2861 
2862  C(i,j) = value;
2863  }
2864  }
2865  }
2867  //**********************************************************************************************
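   // The vectorized kernel above performs register blocking: the column range of C is processed
   // in panels of eight (integral element types only), five, four, three, two and finally one
   // SIMD vector, with up to four rows handled at a time. The affected part of C is loaded into
   // SIMD accumulators, updated with broadcast values of A times SIMD loads of B, and written
   // back; the 'remainder' flag enables the trailing scalar loop for unpadded targets/operands.
   // Reduced to a single row and a single SIMD-wide column, the core pattern is:
   //
   //    SIMDType acc( C.load(i,j) );              // load SIMDSIZE elements of row i of C
   //    for( size_t k=kbegin; k<kend; ++k ) {
   //       acc += set( A(i,k) ) * B.load(k,j);    // broadcast A(i,k), multiply a chunk of row k of B
   //    }
   //    C.store( i, j, acc );                     // store the accumulated chunk back into C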
2868 
2869  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2884  template< typename MT3 // Type of the left-hand side target matrix
2885  , typename MT4 // Type of the left-hand side matrix operand
2886  , typename MT5 > // Type of the right-hand side matrix operand
2887  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2888  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2889  {
2894 
2895  const ForwardFunctor fwd;
2896 
2897  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
2898  const OppositeType_t<MT4> tmp( serial( A ) );
2899  addAssign( C, fwd( tmp * B ) );
2900  }
2901  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
2902  const OppositeType_t<MT5> tmp( serial( B ) );
2903  addAssign( C, fwd( A * tmp ) );
2904  }
2905  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2906  const OppositeType_t<MT4> tmp( serial( A ) );
2907  addAssign( C, fwd( tmp * B ) );
2908  }
2909  else {
2910  const OppositeType_t<MT5> tmp( serial( B ) );
2911  addAssign( C, fwd( A * tmp ) );
2912  }
2913  }
2915  //**********************************************************************************************
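   // For a column-major target the product is not vectorized directly. Instead, the kernel
   // above evaluates one operand into its opposite storage order (preferring a fixed-size,
   // i.e. non-resizable, operand, otherwise the smaller one) and forwards the addition
   // assignment to the resulting mixed storage-order multiplication expression, which provides
   // its own optimized kernels. Usage sketch (illustrative sizes):
   //
   //    blaze::DynamicMatrix<double,blaze::columnMajor> C( 16UL, 16UL, 0.0 );
   //    blaze::DynamicMatrix<double,blaze::rowMajor>    A( 16UL, 16UL, 1.0 ), B( 16UL, 16UL, 2.0 );
   //
   //    C += A * B;  // may end up in this kernel; one operand is copied into a column-major
   //                 // temporary and the assignment is restarted on the restructured product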
2916 
2917  //**Default addition assignment to dense matrices (large matrices)******************************
2931  template< typename MT3 // Type of the left-hand side target matrix
2932  , typename MT4 // Type of the left-hand side matrix operand
2933  , typename MT5 > // Type of the right-hand side matrix operand
2934  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2935  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2936  {
2937  selectDefaultAddAssignKernel( C, A, B );
2938  }
2940  //**********************************************************************************************
2941 
2942  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
2957  template< typename MT3 // Type of the left-hand side target matrix
2958  , typename MT4 // Type of the left-hand side matrix operand
2959  , typename MT5 > // Type of the right-hand side matrix operand
2960  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2961  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2962  {
2963  if( LOW )
2964  lmmm( C, A, B, ElementType(1), ElementType(1) );
2965  else if( UPP )
2966  ummm( C, A, B, ElementType(1), ElementType(1) );
2967  else
2968  mmm( C, A, B, ElementType(1), ElementType(1) );
2969  }
2971  //**********************************************************************************************
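   // The large-matrix path delegates to the cache-blocked kernels from <blaze/math/dense/MMM.h>.
   // The two trailing scalars play the role of the alpha and beta factors of an update of the
   // form C = alpha*A*B + beta*C, so passing (1,1) accumulates the product onto C; lmmm() and
   // ummm() perform the same update restricted to the lower, respectively upper, part of C.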
2972 
2973  //**BLAS-based addition assignment to dense matrices (default)**********************************
2987  template< typename MT3 // Type of the left-hand side target matrix
2988  , typename MT4 // Type of the left-hand side matrix operand
2989  , typename MT5 > // Type of the right-hand side matrix operand
2990  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2991  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2992  {
2993  selectLargeAddAssignKernel( C, A, B );
2994  }
2996  //**********************************************************************************************
2997 
2998  //**BLAS-based addition assignment to dense matrices********************************************
2999 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
3000 
3013  template< typename MT3 // Type of the left-hand side target matrix
3014  , typename MT4 // Type of the left-hand side matrix operand
3015  , typename MT5 > // Type of the right-hand side matrix operand
3016  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3017  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3018  {
3019  using ET = ElementType_t<MT3>;
3020 
3021  if( IsTriangular_v<MT4> ) {
3022  ResultType_t<MT3> tmp( serial( B ) );
3023  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
3024  addAssign( C, tmp );
3025  }
3026  else if( IsTriangular_v<MT5> ) {
3027  ResultType_t<MT3> tmp( serial( A ) );
3028  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
3029  addAssign( C, tmp );
3030  }
3031  else {
3032  gemm( C, A, B, ET(1), ET(1) );
3033  }
3034  }
3036 #endif
3037  //**********************************************************************************************
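   // BLAS path note: a triangular A or B is handled by forming the product in a temporary via
   // trmm() (which wraps the cblas_?trmm family) and adding that temporary to C; the general
   // case maps directly onto gemm() with alpha = beta = 1, i.e. an in-place update of the form
   //
   //    C = ET(1) * A * B + ET(1) * C
   //
   // carried out by the cblas_?gemm family for BLAS-compatible element types.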
3038 
3039  //**Restructuring addition assignment to column-major matrices**********************************
3054  template< typename MT > // Type of the target matrix
3055  friend inline auto addAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
3056  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3057  {
3059 
3061 
3062  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3063  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3064 
3065  const ForwardFunctor fwd;
3066 
3067  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
3068  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
3069  else if( IsSymmetric_v<MT1> )
3070  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
3071  else
3072  addAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
3073  }
3075  //**********************************************************************************************
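   // Restructuring note: for a symmetric operand S the identity trans( S ) == S holds, so the
   // branches above may replace a symmetric factor by its transpose at no cost. This turns the
   // row-major/row-major product into a mixed storage-order expression that can be added to the
   // column-major target without an explicit transposition of the result.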
3076 
3077  //**Addition assignment to sparse matrices******************************************************
3078  // No special implementation for the addition assignment to sparse matrices.
3079  //**********************************************************************************************
3080 
3081  //**Subtraction assignment to dense matrices****************************************************
3094  template< typename MT // Type of the target dense matrix
3095  , bool SO > // Storage order of the target dense matrix
3096  friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3097  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3098  {
3100 
3101  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3102  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3103 
3104  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3105  return;
3106  }
3107 
3108  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3109  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3110 
3111  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3112  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3113  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3114  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3115  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3116  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3117 
3118  DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3119  }
3121  //**********************************************************************************************
3122 
3123  //**Subtraction assignment to dense matrices (kernel selection)*********************************
3134  template< typename MT3 // Type of the left-hand side target matrix
3135  , typename MT4 // Type of the left-hand side matrix operand
3136  , typename MT5 > // Type of the right-hand side matrix operand
3137  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3138  {
3139  if( ( IsDiagonal_v<MT5> ) ||
3140  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
3141  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
3142  selectSmallSubAssignKernel( C, A, B );
3143  else
3144  selectBlasSubAssignKernel( C, A, B );
3145  }
3147  //**********************************************************************************************
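   // The subtraction assignment path mirrors the addition assignment above: the same size
   // heuristics select between the small, large and BLAS-based kernels, with every fused update
   // using -= instead of +=. Usage sketch (assuming row-major blaze::DynamicMatrix operands):
   //
   //    C -= A * B;  // subAssign() evaluates the operands and calls selectSubAssignKernel()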
3148 
3149  //**Default subtraction assignment to dense matrices (general/general)**************************
3163  template< typename MT3 // Type of the left-hand side target matrix
3164  , typename MT4 // Type of the left-hand side matrix operand
3165  , typename MT5 > // Type of the right-hand side matrix operand
3166  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3167  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3168  {
3169  const size_t M( A.rows() );
3170  const size_t N( B.columns() );
3171  const size_t K( A.columns() );
3172 
3173  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3174 
3175  for( size_t i=0UL; i<M; ++i )
3176  {
3177  const size_t kbegin( ( IsUpper_v<MT4> )
3178  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3179  :( 0UL ) );
3180  const size_t kend( ( IsLower_v<MT4> )
3181  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3182  :( K ) );
3183  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3184 
3185  for( size_t k=kbegin; k<kend; ++k )
3186  {
3187  const size_t jbegin( ( IsUpper_v<MT5> )
3188  ?( ( IsStrictlyUpper_v<MT5> )
3189  ?( UPP ? max(i,k+1UL) : k+1UL )
3190  :( UPP ? max(i,k) : k ) )
3191  :( UPP ? i : 0UL ) );
3192  const size_t jend( ( IsLower_v<MT5> )
3193  ?( ( IsStrictlyLower_v<MT5> )
3194  ?( LOW ? min(i+1UL,k) : k )
3195  :( LOW ? min(i,k)+1UL : k+1UL ) )
3196  :( LOW ? i+1UL : N ) );
3197 
3198  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
3199  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3200 
3201  const size_t jnum( jend - jbegin );
3202  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3203 
3204  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3205  C(i,j ) -= A(i,k) * B(k,j );
3206  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3207  }
3208  if( jpos < jend ) {
3209  C(i,jpos) -= A(i,k) * B(k,jpos);
3210  }
3211  }
3212  }
3213  }
3215  //**********************************************************************************************
3216 
3217  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
3231  template< typename MT3 // Type of the left-hand side target matrix
3232  , typename MT4 // Type of the left-hand side matrix operand
3233  , typename MT5 > // Type of the right-hand side matrix operand
3234  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3235  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3236  {
3238 
3239  const size_t M( A.rows() );
3240  const size_t N( B.columns() );
3241 
3242  for( size_t i=0UL; i<M; ++i )
3243  {
3244  const size_t jbegin( ( IsUpper_v<MT4> )
3245  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3246  :( 0UL ) );
3247  const size_t jend( ( IsLower_v<MT4> )
3248  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3249  :( N ) );
3250  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3251 
3252  const size_t jnum( jend - jbegin );
3253  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3254 
3255  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3256  C(i,j ) -= A(i,j ) * B(j ,j );
3257  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
3258  }
3259  if( jpos < jend ) {
3260  C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
3261  }
3262  }
3263  }
3265  //**********************************************************************************************
3266 
3267  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
3281  template< typename MT3 // Type of the left-hand side target matrix
3282  , typename MT4 // Type of the left-hand side matrix operand
3283  , typename MT5 > // Type of the right-hand side matrix operand
3284  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3285  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3286  {
3288 
3289  const size_t M( A.rows() );
3290  const size_t N( B.columns() );
3291 
3292  for( size_t i=0UL; i<M; ++i )
3293  {
3294  const size_t jbegin( ( IsUpper_v<MT5> )
3295  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
3296  :( 0UL ) );
3297  const size_t jend( ( IsLower_v<MT5> )
3298  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
3299  :( N ) );
3300  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3301 
3302  const size_t jnum( jend - jbegin );
3303  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3304 
3305  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3306  C(i,j ) -= A(i,i) * B(i,j );
3307  C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
3308  }
3309  if( jpos < jend ) {
3310  C(i,jpos) -= A(i,i) * B(i,jpos);
3311  }
3312  }
3313  }
3315  //**********************************************************************************************
3316 
3317  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
3331  template< typename MT3 // Type of the left-hand side target matrix
3332  , typename MT4 // Type of the left-hand side matrix operand
3333  , typename MT5 > // Type of the right-hand side matrix operand
3334  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3335  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3336  {
3338 
3339  for( size_t i=0UL; i<A.rows(); ++i ) {
3340  C(i,i) -= A(i,i) * B(i,i);
3341  }
3342  }
3344  //**********************************************************************************************
3345 
3346  //**Default subtraction assignment to dense matrices (small matrices)***************************
3360  template< typename MT3 // Type of the left-hand side target matrix
3361  , typename MT4 // Type of the left-hand side matrix operand
3362  , typename MT5 > // Type of the right-hand side matrix operand
3363  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3364  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3365  {
3366  selectDefaultSubAssignKernel( C, A, B );
3367  }
3369  //**********************************************************************************************
3370 
3371  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
3386  template< typename MT3 // Type of the left-hand side target matrix
3387  , typename MT4 // Type of the left-hand side matrix operand
3388  , typename MT5 > // Type of the right-hand side matrix operand
3389  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3390  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3391  {
3392  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3393 
3394  const size_t M( A.rows() );
3395  const size_t N( B.columns() );
3396  const size_t K( A.columns() );
3397 
3398  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3399 
3400  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3401  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3402 
3403  size_t j( 0UL );
3404 
3405  if( IsIntegral_v<ElementType> )
3406  {
3407  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3408  for( size_t i=0UL; i<M; ++i )
3409  {
3410  const size_t kbegin( ( IsUpper_v<MT4> )
3411  ?( ( IsLower_v<MT5> )
3412  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3413  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3414  :( IsLower_v<MT5> ? j : 0UL ) );
3415  const size_t kend( ( IsLower_v<MT4> )
3416  ?( ( IsUpper_v<MT5> )
3417  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
3418  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3419  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
3420 
3421  SIMDType xmm1( C.load(i,j ) );
3422  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3423  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3424  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3425  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3426  SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
3427  SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
3428  SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
3429 
3430  for( size_t k=kbegin; k<kend; ++k ) {
3431  const SIMDType a1( set( A(i,k) ) );
3432  xmm1 -= a1 * B.load(k,j );
3433  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3434  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3435  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3436  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3437  xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
3438  xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
3439  xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
3440  }
3441 
3442  C.store( i, j , xmm1 );
3443  C.store( i, j+SIMDSIZE , xmm2 );
3444  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3445  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3446  C.store( i, j+SIMDSIZE*4UL, xmm5 );
3447  C.store( i, j+SIMDSIZE*5UL, xmm6 );
3448  C.store( i, j+SIMDSIZE*6UL, xmm7 );
3449  C.store( i, j+SIMDSIZE*7UL, xmm8 );
3450  }
3451  }
3452  }
3453 
3454  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3455  {
3456  size_t i( 0UL );
3457 
3458  for( ; (i+2UL) <= M; i+=2UL )
3459  {
3460  const size_t kbegin( ( IsUpper_v<MT4> )
3461  ?( ( IsLower_v<MT5> )
3462  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3463  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3464  :( IsLower_v<MT5> ? j : 0UL ) );
3465  const size_t kend( ( IsLower_v<MT4> )
3466  ?( ( IsUpper_v<MT5> )
3467  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
3468  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3469  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
3470 
3471  SIMDType xmm1 ( C.load(i ,j ) );
3472  SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
3473  SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
3474  SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
3475  SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
3476  SIMDType xmm6 ( C.load(i+1UL,j ) );
3477  SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
3478  SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
3479  SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
3480  SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
3481 
3482  for( size_t k=kbegin; k<kend; ++k ) {
3483  const SIMDType a1( set( A(i ,k) ) );
3484  const SIMDType a2( set( A(i+1UL,k) ) );
3485  const SIMDType b1( B.load(k,j ) );
3486  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3487  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3488  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3489  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3490  xmm1 -= a1 * b1;
3491  xmm2 -= a1 * b2;
3492  xmm3 -= a1 * b3;
3493  xmm4 -= a1 * b4;
3494  xmm5 -= a1 * b5;
3495  xmm6 -= a2 * b1;
3496  xmm7 -= a2 * b2;
3497  xmm8 -= a2 * b3;
3498  xmm9 -= a2 * b4;
3499  xmm10 -= a2 * b5;
3500  }
3501 
3502  C.store( i , j , xmm1 );
3503  C.store( i , j+SIMDSIZE , xmm2 );
3504  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3505  C.store( i , j+SIMDSIZE*3UL, xmm4 );
3506  C.store( i , j+SIMDSIZE*4UL, xmm5 );
3507  C.store( i+1UL, j , xmm6 );
3508  C.store( i+1UL, j+SIMDSIZE , xmm7 );
3509  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3510  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3511  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
3512  }
3513 
3514  if( i < M )
3515  {
3516  const size_t kbegin( ( IsUpper_v<MT4> )
3517  ?( ( IsLower_v<MT5> )
3518  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3519  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3520  :( IsLower_v<MT5> ? j : 0UL ) );
3521  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
3522 
3523  SIMDType xmm1( C.load(i,j ) );
3524  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3525  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3526  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3527  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3528 
3529  for( size_t k=kbegin; k<kend; ++k ) {
3530  const SIMDType a1( set( A(i,k) ) );
3531  xmm1 -= a1 * B.load(k,j );
3532  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3533  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3534  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3535  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3536  }
3537 
3538  C.store( i, j , xmm1 );
3539  C.store( i, j+SIMDSIZE , xmm2 );
3540  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3541  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3542  C.store( i, j+SIMDSIZE*4UL, xmm5 );
3543  }
3544  }
3545 
3546  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3547  {
3548  size_t i( 0UL );
3549 
3550  for( ; (i+2UL) <= M; i+=2UL )
3551  {
3552  const size_t kbegin( ( IsUpper_v<MT4> )
3553  ?( ( IsLower_v<MT5> )
3554  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3555  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3556  :( IsLower_v<MT5> ? j : 0UL ) );
3557  const size_t kend( ( IsLower_v<MT4> )
3558  ?( ( IsUpper_v<MT5> )
3559  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3560  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3561  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
3562 
3563  SIMDType xmm1( C.load(i ,j ) );
3564  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3565  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3566  SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
3567  SIMDType xmm5( C.load(i+1UL,j ) );
3568  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
3569  SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
3570  SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
3571 
3572  for( size_t k=kbegin; k<kend; ++k ) {
3573  const SIMDType a1( set( A(i ,k) ) );
3574  const SIMDType a2( set( A(i+1UL,k) ) );
3575  const SIMDType b1( B.load(k,j ) );
3576  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3577  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3578  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3579  xmm1 -= a1 * b1;
3580  xmm2 -= a1 * b2;
3581  xmm3 -= a1 * b3;
3582  xmm4 -= a1 * b4;
3583  xmm5 -= a2 * b1;
3584  xmm6 -= a2 * b2;
3585  xmm7 -= a2 * b3;
3586  xmm8 -= a2 * b4;
3587  }
3588 
3589  C.store( i , j , xmm1 );
3590  C.store( i , j+SIMDSIZE , xmm2 );
3591  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3592  C.store( i , j+SIMDSIZE*3UL, xmm4 );
3593  C.store( i+1UL, j , xmm5 );
3594  C.store( i+1UL, j+SIMDSIZE , xmm6 );
3595  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3596  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3597  }
3598 
3599  if( i < M )
3600  {
3601  const size_t kbegin( ( IsUpper_v<MT4> )
3602  ?( ( IsLower_v<MT5> )
3603  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3604  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3605  :( IsLower_v<MT5> ? j : 0UL ) );
3606  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
3607 
3608  SIMDType xmm1( C.load(i,j ) );
3609  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3610  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3611  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3612 
3613  for( size_t k=kbegin; k<kend; ++k ) {
3614  const SIMDType a1( set( A(i,k) ) );
3615  xmm1 -= a1 * B.load(k,j );
3616  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3617  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3618  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3619  }
3620 
3621  C.store( i, j , xmm1 );
3622  C.store( i, j+SIMDSIZE , xmm2 );
3623  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3624  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3625  }
3626  }
3627 
3628  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3629  {
3630  size_t i( 0UL );
3631 
3632  for( ; (i+2UL) <= M; i+=2UL )
3633  {
3634  const size_t kbegin( ( IsUpper_v<MT4> )
3635  ?( ( IsLower_v<MT5> )
3636  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3637  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3638  :( IsLower_v<MT5> ? j : 0UL ) );
3639  const size_t kend( ( IsLower_v<MT4> )
3640  ?( ( IsUpper_v<MT5> )
3641  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
3642  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3643  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
3644 
3645  SIMDType xmm1( C.load(i ,j ) );
3646  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3647  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3648  SIMDType xmm4( C.load(i+1UL,j ) );
3649  SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
3650  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
3651 
3652  for( size_t k=kbegin; k<kend; ++k ) {
3653  const SIMDType a1( set( A(i ,k) ) );
3654  const SIMDType a2( set( A(i+1UL,k) ) );
3655  const SIMDType b1( B.load(k,j ) );
3656  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3657  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3658  xmm1 -= a1 * b1;
3659  xmm2 -= a1 * b2;
3660  xmm3 -= a1 * b3;
3661  xmm4 -= a2 * b1;
3662  xmm5 -= a2 * b2;
3663  xmm6 -= a2 * b3;
3664  }
3665 
3666  C.store( i , j , xmm1 );
3667  C.store( i , j+SIMDSIZE , xmm2 );
3668  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3669  C.store( i+1UL, j , xmm4 );
3670  C.store( i+1UL, j+SIMDSIZE , xmm5 );
3671  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
3672  }
3673 
3674  if( i < M )
3675  {
3676  const size_t kbegin( ( IsUpper_v<MT4> )
3677  ?( ( IsLower_v<MT5> )
3678  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3679  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3680  :( IsLower_v<MT5> ? j : 0UL ) );
3681  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
3682 
3683  SIMDType xmm1( C.load(i,j ) );
3684  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3685  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3686 
3687  for( size_t k=kbegin; k<kend; ++k ) {
3688  const SIMDType a1( set( A(i,k) ) );
3689  xmm1 -= a1 * B.load(k,j );
3690  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3691  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3692  }
3693 
3694  C.store( i, j , xmm1 );
3695  C.store( i, j+SIMDSIZE , xmm2 );
3696  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3697  }
3698  }
3699 
3700  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3701  {
3702  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
3703  size_t i( LOW ? j : 0UL );
3704 
3705  for( ; (i+4UL) <= iend; i+=4UL )
3706  {
3707  const size_t kbegin( ( IsUpper_v<MT4> )
3708  ?( ( IsLower_v<MT5> )
3709  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3710  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3711  :( IsLower_v<MT5> ? j : 0UL ) );
3712  const size_t kend( ( IsLower_v<MT4> )
3713  ?( ( IsUpper_v<MT5> )
3714  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
3715  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
3716  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3717 
3718  SIMDType xmm1( C.load(i ,j ) );
3719  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3720  SIMDType xmm3( C.load(i+1UL,j ) );
3721  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3722  SIMDType xmm5( C.load(i+2UL,j ) );
3723  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
3724  SIMDType xmm7( C.load(i+3UL,j ) );
3725  SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
3726 
3727  for( size_t k=kbegin; k<kend; ++k ) {
3728  const SIMDType a1( set( A(i ,k) ) );
3729  const SIMDType a2( set( A(i+1UL,k) ) );
3730  const SIMDType a3( set( A(i+2UL,k) ) );
3731  const SIMDType a4( set( A(i+3UL,k) ) );
3732  const SIMDType b1( B.load(k,j ) );
3733  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3734  xmm1 -= a1 * b1;
3735  xmm2 -= a1 * b2;
3736  xmm3 -= a2 * b1;
3737  xmm4 -= a2 * b2;
3738  xmm5 -= a3 * b1;
3739  xmm6 -= a3 * b2;
3740  xmm7 -= a4 * b1;
3741  xmm8 -= a4 * b2;
3742  }
3743 
3744  C.store( i , j , xmm1 );
3745  C.store( i , j+SIMDSIZE, xmm2 );
3746  C.store( i+1UL, j , xmm3 );
3747  C.store( i+1UL, j+SIMDSIZE, xmm4 );
3748  C.store( i+2UL, j , xmm5 );
3749  C.store( i+2UL, j+SIMDSIZE, xmm6 );
3750  C.store( i+3UL, j , xmm7 );
3751  C.store( i+3UL, j+SIMDSIZE, xmm8 );
3752  }
3753 
3754  for( ; (i+3UL) <= iend; i+=3UL )
3755  {
3756  const size_t kbegin( ( IsUpper_v<MT4> )
3757  ?( ( IsLower_v<MT5> )
3758  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3759  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3760  :( IsLower_v<MT5> ? j : 0UL ) );
3761  const size_t kend( ( IsLower_v<MT4> )
3762  ?( ( IsUpper_v<MT5> )
3763  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
3764  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
3765  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3766 
3767  SIMDType xmm1( C.load(i ,j ) );
3768  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3769  SIMDType xmm3( C.load(i+1UL,j ) );
3770  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3771  SIMDType xmm5( C.load(i+2UL,j ) );
3772  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
3773 
3774  for( size_t k=kbegin; k<kend; ++k ) {
3775  const SIMDType a1( set( A(i ,k) ) );
3776  const SIMDType a2( set( A(i+1UL,k) ) );
3777  const SIMDType a3( set( A(i+2UL,k) ) );
3778  const SIMDType b1( B.load(k,j ) );
3779  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3780  xmm1 -= a1 * b1;
3781  xmm2 -= a1 * b2;
3782  xmm3 -= a2 * b1;
3783  xmm4 -= a2 * b2;
3784  xmm5 -= a3 * b1;
3785  xmm6 -= a3 * b2;
3786  }
3787 
3788  C.store( i , j , xmm1 );
3789  C.store( i , j+SIMDSIZE, xmm2 );
3790  C.store( i+1UL, j , xmm3 );
3791  C.store( i+1UL, j+SIMDSIZE, xmm4 );
3792  C.store( i+2UL, j , xmm5 );
3793  C.store( i+2UL, j+SIMDSIZE, xmm6 );
3794  }
3795 
3796  for( ; (i+2UL) <= iend; i+=2UL )
3797  {
3798  const size_t kbegin( ( IsUpper_v<MT4> )
3799  ?( ( IsLower_v<MT5> )
3800  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3801  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3802  :( IsLower_v<MT5> ? j : 0UL ) );
3803  const size_t kend( ( IsLower_v<MT4> )
3804  ?( ( IsUpper_v<MT5> )
3805  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
3806  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3807  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3808 
3809  SIMDType xmm1( C.load(i ,j ) );
3810  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3811  SIMDType xmm3( C.load(i+1UL,j ) );
3812  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3813  SIMDType xmm5, xmm6, xmm7, xmm8;
3814  size_t k( kbegin );
3815 
3816  for( ; (k+2UL) <= kend; k+=2UL ) {
3817  const SIMDType a1( set( A(i ,k ) ) );
3818  const SIMDType a2( set( A(i+1UL,k ) ) );
3819  const SIMDType a3( set( A(i ,k+1UL) ) );
3820  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
3821  const SIMDType b1( B.load(k ,j ) );
3822  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
3823  const SIMDType b3( B.load(k+1UL,j ) );
3824  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
3825  xmm1 -= a1 * b1;
3826  xmm2 -= a1 * b2;
3827  xmm3 -= a2 * b1;
3828  xmm4 -= a2 * b2;
3829  xmm5 -= a3 * b3;
3830  xmm6 -= a3 * b4;
3831  xmm7 -= a4 * b3;
3832  xmm8 -= a4 * b4;
3833  }
3834 
3835  for( ; k<kend; ++k ) {
3836  const SIMDType a1( set( A(i ,k) ) );
3837  const SIMDType a2( set( A(i+1UL,k) ) );
3838  const SIMDType b1( B.load(k,j ) );
3839  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3840  xmm1 -= a1 * b1;
3841  xmm2 -= a1 * b2;
3842  xmm3 -= a2 * b1;
3843  xmm4 -= a2 * b2;
3844  }
3845 
3846  C.store( i , j , xmm1+xmm5 );
3847  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
3848  C.store( i+1UL, j , xmm3+xmm7 );
3849  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
3850  }
3851 
3852  if( i < iend )
3853  {
3854  const size_t kbegin( ( IsUpper_v<MT4> )
3855  ?( ( IsLower_v<MT5> )
3856  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3857  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3858  :( IsLower_v<MT5> ? j : 0UL ) );
3859  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
3860 
3861  SIMDType xmm1( C.load(i,j ) );
3862  SIMDType xmm2( C.load(i,j+SIMDSIZE) );
3863  SIMDType xmm3, xmm4;
3864  size_t k( kbegin );
3865 
3866  for( ; (k+2UL) <= kend; k+=2UL ) {
3867  const SIMDType a1( set( A(i,k ) ) );
3868  const SIMDType a2( set( A(i,k+1UL) ) );
3869  xmm1 -= a1 * B.load(k ,j );
3870  xmm2 -= a1 * B.load(k ,j+SIMDSIZE);
3871  xmm3 -= a2 * B.load(k+1UL,j );
3872  xmm4 -= a2 * B.load(k+1UL,j+SIMDSIZE);
3873  }
3874 
3875  for( ; k<kend; ++k ) {
3876  const SIMDType a1( set( A(i,k) ) );
3877  xmm1 -= a1 * B.load(k,j );
3878  xmm2 -= a1 * B.load(k,j+SIMDSIZE);
3879  }
3880 
3881  C.store( i, j , xmm1+xmm3 );
3882  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
3883  }
3884  }
3885 
3886  for( ; j<jpos; j+=SIMDSIZE )
3887  {
3888  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
3889  size_t i( LOW ? j : 0UL );
3890 
3891  for( ; (i+4UL) <= iend; i+=4UL )
3892  {
3893  const size_t kbegin( ( IsUpper_v<MT4> )
3894  ?( ( IsLower_v<MT5> )
3895  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3896  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3897  :( IsLower_v<MT5> ? j : 0UL ) );
3898  const size_t kend( ( IsLower_v<MT4> )
3899  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
3900  :( K ) );
3901 
3902  SIMDType xmm1( C.load(i ,j) );
3903  SIMDType xmm2( C.load(i+1UL,j) );
3904  SIMDType xmm3( C.load(i+2UL,j) );
3905  SIMDType xmm4( C.load(i+3UL,j) );
3906  SIMDType xmm5, xmm6, xmm7, xmm8;
3907  size_t k( kbegin );
3908 
3909  for( ; (k+2UL) <= kend; k+=2UL ) {
3910  const SIMDType b1( B.load(k ,j) );
3911  const SIMDType b2( B.load(k+1UL,j) );
3912  xmm1 -= set( A(i ,k ) ) * b1;
3913  xmm2 -= set( A(i+1UL,k ) ) * b1;
3914  xmm3 -= set( A(i+2UL,k ) ) * b1;
3915  xmm4 -= set( A(i+3UL,k ) ) * b1;
3916  xmm5 -= set( A(i ,k+1UL) ) * b2;
3917  xmm6 -= set( A(i+1UL,k+1UL) ) * b2;
3918  xmm7 -= set( A(i+2UL,k+1UL) ) * b2;
3919  xmm8 -= set( A(i+3UL,k+1UL) ) * b2;
3920  }
3921 
3922  for( ; k<kend; ++k ) {
3923  const SIMDType b1( B.load(k,j) );
3924  xmm1 -= set( A(i ,k) ) * b1;
3925  xmm2 -= set( A(i+1UL,k) ) * b1;
3926  xmm3 -= set( A(i+2UL,k) ) * b1;
3927  xmm4 -= set( A(i+3UL,k) ) * b1;
3928  }
3929 
3930  C.store( i , j, xmm1+xmm5 );
3931  C.store( i+1UL, j, xmm2+xmm6 );
3932  C.store( i+2UL, j, xmm3+xmm7 );
3933  C.store( i+3UL, j, xmm4+xmm8 );
3934  }
3935 
3936  for( ; (i+3UL) <= iend; i+=3UL )
3937  {
3938  const size_t kbegin( ( IsUpper_v<MT4> )
3939  ?( ( IsLower_v<MT5> )
3940  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3941  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3942  :( IsLower_v<MT5> ? j : 0UL ) );
3943  const size_t kend( ( IsLower_v<MT4> )
3944  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
3945  :( K ) );
3946 
3947  SIMDType xmm1( C.load(i ,j) );
3948  SIMDType xmm2( C.load(i+1UL,j) );
3949  SIMDType xmm3( C.load(i+2UL,j) );
3950  SIMDType xmm4, xmm5, xmm6;
3951  size_t k( kbegin );
3952 
3953  for( ; (k+2UL) <= kend; k+=2UL ) {
3954  const SIMDType b1( B.load(k ,j) );
3955  const SIMDType b2( B.load(k+1UL,j) );
3956  xmm1 -= set( A(i ,k ) ) * b1;
3957  xmm2 -= set( A(i+1UL,k ) ) * b1;
3958  xmm3 -= set( A(i+2UL,k ) ) * b1;
3959  xmm4 -= set( A(i ,k+1UL) ) * b2;
3960  xmm5 -= set( A(i+1UL,k+1UL) ) * b2;
3961  xmm6 -= set( A(i+2UL,k+1UL) ) * b2;
3962  }
3963 
3964  for( ; k<kend; ++k ) {
3965  const SIMDType b1( B.load(k,j) );
3966  xmm1 -= set( A(i ,k) ) * b1;
3967  xmm2 -= set( A(i+1UL,k) ) * b1;
3968  xmm3 -= set( A(i+2UL,k) ) * b1;
3969  }
3970 
3971  C.store( i , j, xmm1+xmm4 );
3972  C.store( i+1UL, j, xmm2+xmm5 );
3973  C.store( i+2UL, j, xmm3+xmm6 );
3974  }
3975 
3976  for( ; (i+2UL) <= iend; i+=2UL )
3977  {
3978  const size_t kbegin( ( IsUpper_v<MT4> )
3979  ?( ( IsLower_v<MT5> )
3980  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3981  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3982  :( IsLower_v<MT5> ? j : 0UL ) );
3983  const size_t kend( ( IsLower_v<MT4> )
3984  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3985  :( K ) );
3986 
3987  SIMDType xmm1( C.load(i ,j) );
3988  SIMDType xmm2( C.load(i+1UL,j) );
3989  SIMDType xmm3, xmm4;
3990  size_t k( kbegin );
3991 
3992  for( ; (k+2UL) <= kend; k+=2UL ) {
3993  const SIMDType b1( B.load(k ,j) );
3994  const SIMDType b2( B.load(k+1UL,j) );
3995  xmm1 -= set( A(i ,k ) ) * b1;
3996  xmm2 -= set( A(i+1UL,k ) ) * b1;
3997  xmm3 -= set( A(i ,k+1UL) ) * b2;
3998  xmm4 -= set( A(i+1UL,k+1UL) ) * b2;
3999  }
4000 
4001  for( ; k<kend; ++k ) {
4002  const SIMDType b1( B.load(k,j) );
4003  xmm1 -= set( A(i ,k) ) * b1;
4004  xmm2 -= set( A(i+1UL,k) ) * b1;
4005  }
4006 
4007  C.store( i , j, xmm1+xmm3 );
4008  C.store( i+1UL, j, xmm2+xmm4 );
4009  }
4010 
4011  if( i < iend )
4012  {
4013  const size_t kbegin( ( IsUpper_v<MT4> )
4014  ?( ( IsLower_v<MT5> )
4015  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4016  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4017  :( IsLower_v<MT5> ? j : 0UL ) );
4018 
4019  SIMDType xmm1( C.load(i,j) );
4020  SIMDType xmm2;
4021  size_t k( kbegin );
4022 
4023  for( ; (k+2UL) <= K; k+=2UL ) {
4024  xmm1 -= set( A(i,k ) ) * B.load(k ,j);
4025  xmm2 -= set( A(i,k+1UL) ) * B.load(k+1UL,j);
4026  }
4027 
4028  for( ; k<K; ++k ) {
4029  xmm1 -= set( A(i,k) ) * B.load(k,j);
4030  }
4031 
4032  C.store( i, j, xmm1+xmm2 );
4033  }
4034  }
4035 
4036  for( ; remainder && j<N; ++j )
4037  {
4038  const size_t iend( UPP ? j+1UL : M );
4039  size_t i( LOW ? j : 0UL );
4040 
4041  for( ; (i+2UL) <= iend; i+=2UL )
4042  {
4043  const size_t kbegin( ( IsUpper_v<MT4> )
4044  ?( ( IsLower_v<MT5> )
4045  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4046  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4047  :( IsLower_v<MT5> ? j : 0UL ) );
4048  const size_t kend( ( IsLower_v<MT4> )
4049  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
4050  :( K ) );
4051 
4052  ElementType value1( C(i ,j) );
4053  ElementType value2( C(i+1UL,j) );
4054 
4055  for( size_t k=kbegin; k<kend; ++k ) {
4056  value1 -= A(i ,k) * B(k,j);
4057  value2 -= A(i+1UL,k) * B(k,j);
4058  }
4059 
4060  C(i ,j) = value1;
4061  C(i+1UL,j) = value2;
4062  }
4063 
4064  if( i < iend )
4065  {
4066  const size_t kbegin( ( IsUpper_v<MT4> )
4067  ?( ( IsLower_v<MT5> )
4068  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4069  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4070  :( IsLower_v<MT5> ? j : 0UL ) );
4071 
4072  ElementType value( C(i,j) );
4073 
4074  for( size_t k=kbegin; k<K; ++k ) {
4075  value -= A(i,k) * B(k,j);
4076  }
4077 
4078  C(i,j) = value;
4079  }
4080  }
4081  }
4083  //**********************************************************************************************
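  // Usage sketch (illustrative; the element type, the sizes and the default Blaze configuration
  // are assumptions of this note). A plain subtraction assignment of a dense matrix product is
  // what selects among the kernels above:
  //
  // \code
  //    blaze::DynamicMatrix<double> A( 64UL, 64UL ), B( 64UL, 64UL ), C( 64UL, 64UL );
  //    // ... initialization of A, B and C ...
  //    C -= A * B;  // Dispatched to the small, large or BLAS-based subtraction assignment
  //                 // kernel, depending on the operand sizes and the BLAS configuration
  // \endcode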
4084 
4085  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
4100  template< typename MT3 // Type of the left-hand side target matrix
4101  , typename MT4 // Type of the left-hand side matrix operand
4102  , typename MT5 > // Type of the right-hand side matrix operand
4103  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4104  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4105  {
4110 
4111  const ForwardFunctor fwd;
4112 
4113  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
4114  const OppositeType_t<MT4> tmp( serial( A ) );
4115  subAssign( C, fwd( tmp * B ) );
4116  }
4117  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
4118  const OppositeType_t<MT5> tmp( serial( B ) );
4119  subAssign( C, fwd( A * tmp ) );
4120  }
4121  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
4122  const OppositeType_t<MT4> tmp( serial( A ) );
4123  subAssign( C, fwd( tmp * B ) );
4124  }
4125  else {
4126  const OppositeType_t<MT5> tmp( serial( B ) );
4127  subAssign( C, fwd( A * tmp ) );
4128  }
4129  }
4131  //**********************************************************************************************
4132 
4133  //**Default subtraction assignment to dense matrices (large matrices)***************************
4147  template< typename MT3 // Type of the left-hand side target matrix
4148  , typename MT4 // Type of the left-hand side matrix operand
4149  , typename MT5 > // Type of the right-hand side matrix operand
4150  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4151  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4152  {
4153  selectDefaultSubAssignKernel( C, A, B );
4154  }
4156  //**********************************************************************************************
4157 
4158  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
4173  template< typename MT3 // Type of the left-hand side target matrix
4174  , typename MT4 // Type of the left-hand side matrix operand
4175  , typename MT5 > // Type of the right-hand side matrix operand
4176  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4177  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4178  {
4179  if( LOW )
4180  lmmm( C, A, B, ElementType(-1), ElementType(1) );
4181  else if( UPP )
4182  ummm( C, A, B, ElementType(-1), ElementType(1) );
4183  else
4184  mmm( C, A, B, ElementType(-1), ElementType(1) );
4185  }
4187  //**********************************************************************************************
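  // Note: lmmm(), ummm() and mmm() follow the usual gemm-style update convention; with the
  // scaling factors ElementType(-1) and ElementType(1) used above the target is updated as
  // C = C - A*B, where lmmm() and ummm() restrict the work to the lower, respectively upper,
  // part of a triangular result.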
4188 
4189  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
4203  template< typename MT3 // Type of the left-hand side target matrix
4204  , typename MT4 // Type of the left-hand side matrix operand
4205  , typename MT5 > // Type of the right-hand side matrix operand
4206  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4207  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4208  {
4209  selectLargeSubAssignKernel( C, A, B );
4210  }
4212  //**********************************************************************************************
4213 
4214 //**BLAS-based subtraction assignment to dense matrices*****************************************
4215 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4216 
4229  template< typename MT3 // Type of the left-hand side target matrix
4230  , typename MT4 // Type of the left-hand side matrix operand
4231  , typename MT5 > // Type of the right-hand side matrix operand
4232  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4233  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4234  {
4235  using ET = ElementType_t<MT3>;
4236 
4237  if( IsTriangular_v<MT4> ) {
4238  ResultType_t<MT3> tmp( serial( B ) );
4239  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4240  subAssign( C, tmp );
4241  }
4242  else if( IsTriangular_v<MT5> ) {
4243  ResultType_t<MT3> tmp( serial( A ) );
4244  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4245  subAssign( C, tmp );
4246  }
4247  else {
4248  gemm( C, A, B, ET(-1), ET(1) );
4249  }
4250  }
4252 #endif
4253  //**********************************************************************************************
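  // For non-triangular operands the gemm() call above performs the standard BLAS update
  // C = alpha*A*B + beta*C with alpha = -1 and beta = 1. A rough CBLAS analogue for contiguous,
  // row-major, double-precision operands (an illustration only, not the actual call made here):
  //
  // \code
  //    cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
  //                 -1.0, A.data(), A.spacing(), B.data(), B.spacing(),
  //                  1.0, C.data(), C.spacing() );
  // \endcode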
4254 
4255  //**Restructuring subtraction assignment to column-major matrices*******************************
4270  template< typename MT > // Type of the target matrix
4271  friend inline auto subAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4272  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4273  {
4275 
4277 
4278  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4279  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4280 
4281  const ForwardFunctor fwd;
4282 
4283  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4284  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4285  else if( IsSymmetric_v<MT1> )
4286  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4287  else
4288  subAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4289  }
4291  //**********************************************************************************************
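  // Restructuring sketch (the concrete types and sizes are assumptions of this illustration):
  // a column-major target combined with a symmetric operand allows the product to be rewritten
  // in terms of the operand's transpose, which better matches the target's storage order.
  //
  // \code
  //    blaze::SymmetricMatrix< blaze::DynamicMatrix<double> > S( 64UL );
  //    blaze::DynamicMatrix<double,blaze::rowMajor>    B( 64UL, 64UL );
  //    blaze::DynamicMatrix<double,blaze::columnMajor> C( 64UL, 64UL );
  //    // ... initialization of S, B and C ...
  //    C -= S * B;  // Internally restructured to C -= trans(S) * B (second branch above)
  // \endcode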
4292 
4293  //**Subtraction assignment to sparse matrices***************************************************
4294  // No special implementation for the subtraction assignment to sparse matrices.
4295  //**********************************************************************************************
4296 
4297  //**Schur product assignment to dense matrices**************************************************
4310  template< typename MT // Type of the target dense matrix
4311  , bool SO > // Storage order of the target dense matrix
4312  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4313  {
4315 
4319 
4320  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4321  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4322 
4323  const ResultType tmp( serial( rhs ) );
4324  schurAssign( ~lhs, tmp );
4325  }
4327  //**********************************************************************************************
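  // As shown above, the Schur product assignment first evaluates the complete matrix product
  // into a temporary and then forwards that temporary, since the element-wise product is not
  // merged into the multiplication kernels. Sketch (types and sizes are illustrative):
  //
  // \code
  //    blaze::DynamicMatrix<double> A( 64UL, 64UL ), B( 64UL, 64UL ), C( 64UL, 64UL );
  //    // ... initialization of A, B and C ...
  //    C %= A * B;  // Evaluates tmp = A*B, then performs the element-wise update C = C % tmp
  // \endcode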
4328 
4329  //**Schur product assignment to sparse matrices*************************************************
4330  // No special implementation for the Schur product assignment to sparse matrices.
4331  //**********************************************************************************************
4332 
4333  //**Multiplication assignment to dense matrices*************************************************
4334  // No special implementation for the multiplication assignment to dense matrices.
4335  //**********************************************************************************************
4336 
4337  //**Multiplication assignment to sparse matrices************************************************
4338  // No special implementation for the multiplication assignment to sparse matrices.
4339  //**********************************************************************************************
4340 
4341  //**SMP assignment to dense matrices************************************************************
4356  template< typename MT // Type of the target dense matrix
4357  , bool SO > // Storage order of the target dense matrix
4358  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4359  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4360  {
4362 
4363  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4364  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4365 
4366  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4367  return;
4368  }
4369  else if( rhs.lhs_.columns() == 0UL ) {
4370  reset( ~lhs );
4371  return;
4372  }
4373 
4374  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4375  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4376 
4377  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4378  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4379  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4380  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4381  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4382  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4383 
4384  smpAssign( ~lhs, A * B );
4385  }
4387  //**********************************************************************************************
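  // SMP sketch (assuming a shared-memory parallelization backend, e.g. OpenMP or C++11 threads,
  // has been enabled at compile time; the sizes are illustrative):
  //
  // \code
  //    blaze::DynamicMatrix<double> A( 2048UL, 2048UL ), B( 2048UL, 2048UL ), C( 2048UL, 2048UL );
  //    // ... initialization of A and B ...
  //    C = A * B;  // For operands requiring evaluation, this overload evaluates A and B once
  //                // and then delegates the parallel multiplication to smpAssign( C, A * B )
  // \endcode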
4388 
4389  //**SMP assignment to sparse matrices***********************************************************
4404  template< typename MT // Type of the target sparse matrix
4405  , bool SO > // Storage order of the target sparse matrix
4406  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4407  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4408  {
4410 
4411  using TmpType = If_t< SO, OppositeType, ResultType >;
4412 
4419 
4420  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4421  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4422 
4423  const ForwardFunctor fwd;
4424 
4425  const TmpType tmp( rhs );
4426  smpAssign( ~lhs, fwd( tmp ) );
4427  }
4429  //**********************************************************************************************
4430 
4431  //**Restructuring SMP assignment to column-major matrices***************************************
4446  template< typename MT > // Type of the target matrix
4447  friend inline auto smpAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4448  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4449  {
4451 
4453 
4454  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4455  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4456 
4457  const ForwardFunctor fwd;
4458 
4459  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4460  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4461  else if( IsSymmetric_v<MT1> )
4462  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4463  else
4464  smpAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4465  }
4467  //**********************************************************************************************
4468 
4469  //**SMP addition assignment to dense matrices***************************************************
4485  template< typename MT // Type of the target dense matrix
4486  , bool SO > // Storage order of the target dense matrix
4487  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4488  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4489  {
4491 
4492  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4493  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4494 
4495  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4496  return;
4497  }
4498 
4499  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4500  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4501 
4502  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4503  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4504  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4505  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4506  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4507  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4508 
4509  smpAddAssign( ~lhs, A * B );
4510  }
4512  //**********************************************************************************************
4513 
4514  //**Restructuring SMP addition assignment to column-major matrices******************************
4529  template< typename MT > // Type of the target matrix
4530  friend inline auto smpAddAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4531  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4532  {
4534 
4536 
4537  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4538  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4539 
4540  const ForwardFunctor fwd;
4541 
4542  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4543  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4544  else if( IsSymmetric_v<MT1> )
4545  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4546  else
4547  smpAddAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4548  }
4550  //**********************************************************************************************
4551 
4552  //**SMP addition assignment to sparse matrices**************************************************
4553  // No special implementation for the SMP addition assignment to sparse matrices.
4554  //**********************************************************************************************
4555 
4556  //**SMP subtraction assignment to dense matrices************************************************
4572  template< typename MT // Type of the target dense matrix
4573  , bool SO > // Storage order of the target dense matrix
4574  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4575  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4576  {
4578 
4579  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4580  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4581 
4582  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4583  return;
4584  }
4585 
4586  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4587  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4588 
4589  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4590  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4591  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4592  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4593  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4594  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4595 
4596  smpSubAssign( ~lhs, A * B );
4597  }
4599  //**********************************************************************************************
4600 
4601  //**Restructuring SMP subtraction assignment to column-major matrices***************************
4616  template< typename MT > // Type of the target matrix
4617  friend inline auto smpSubAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4618  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4619  {
4621 
4623 
4624  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4625  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4626 
4627  const ForwardFunctor fwd;
4628 
4629  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4630  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4631  else if( IsSymmetric_v<MT1> )
4632  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4633  else
4634  smpSubAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4635  }
4637  //**********************************************************************************************
4638 
4639  //**SMP subtraction assignment to sparse matrices***********************************************
4640  // No special implementation for the SMP subtraction assignment to sparse matrices.
4641  //**********************************************************************************************
4642 
4643  //**SMP Schur product assignment to dense matrices**********************************************
4656  template< typename MT // Type of the target dense matrix
4657  , bool SO > // Storage order of the target dense matrix
4658  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4659  {
4661 
4665 
4666  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4667  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4668 
4669  const ResultType tmp( rhs );
4670  smpSchurAssign( ~lhs, tmp );
4671  }
4673  //**********************************************************************************************
4674 
4675  //**SMP Schur product assignment to sparse matrices*********************************************
4676  // No special implementation for the SMP Schur product assignment to sparse matrices.
4677  //**********************************************************************************************
4678 
4679  //**SMP multiplication assignment to dense matrices*********************************************
4680  // No special implementation for the SMP multiplication assignment to dense matrices.
4681  //**********************************************************************************************
4682 
4683  //**SMP multiplication assignment to sparse matrices********************************************
4684  // No special implementation for the SMP multiplication assignment to sparse matrices.
4685  //**********************************************************************************************
4686 
4687  //**Compile time checks*************************************************************************
4695  //**********************************************************************************************
4696 };
4697 //*************************************************************************************************
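// The specialization below captures expressions in which a dense matrix product is scaled by a
// scalar value, so that the scalar can be folded directly into the multiplication kernels
// instead of scaling a temporary result afterwards. Sketch (types and sizes are illustrative):
//
// \code
//    blaze::DynamicMatrix<double> A( 64UL, 64UL ), B( 64UL, 64UL ), D( 64UL, 64UL );
//    // ... initialization of A and B ...
//    D = 2.0 * ( A * B );  // Handled by DMatScalarMultExpr< DMatDMatMultExpr<...>, double, false >
// \endcode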
4698 
4699 
4700 
4701 
4702 //=================================================================================================
4703 //
4704 // DMATSCALARMULTEXPR SPECIALIZATION
4705 //
4706 //=================================================================================================
4707 
4708 //*************************************************************************************************
4716 template< typename MT1 // Type of the left-hand side dense matrix
4717  , typename MT2 // Type of the right-hand side dense matrix
4718  , bool SF // Symmetry flag
4719  , bool HF // Hermitian flag
4720  , bool LF // Lower flag
4721  , bool UF // Upper flag
4722  , typename ST > // Type of the right-hand side scalar value
4723 class DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4724  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4725  , private Computation
4726 {
4727  private:
4728  //**Type definitions****************************************************************************
4730  using MMM = DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4731 
4732  using RES = ResultType_t<MMM>;
4733  using RT1 = ResultType_t<MT1>;
4734  using RT2 = ResultType_t<MT2>;
4735  using ET1 = ElementType_t<RT1>;
4736  using ET2 = ElementType_t<RT2>;
4737  using CT1 = CompositeType_t<MT1>;
4738  using CT2 = CompositeType_t<MT2>;
4739  //**********************************************************************************************
4740 
4741  //**********************************************************************************************
4743  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
4744  //**********************************************************************************************
4745 
4746  //**********************************************************************************************
4748  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
4749  //**********************************************************************************************
4750 
4751  //**********************************************************************************************
4752  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
4753  static constexpr bool HERM = ( HF && !( LF || UF ) );
4754  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4755  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
4756  //**********************************************************************************************
4757 
4758  //**********************************************************************************************
4760 
4764  template< typename T1, typename T2, typename T3 >
4765  static constexpr bool CanExploitSymmetry_v =
4766  ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
4767  //**********************************************************************************************
4768 
4769  //**********************************************************************************************
4771 
4774  template< typename T1, typename T2, typename T3 >
4775  static constexpr bool IsEvaluationRequired_v =
4776  ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
4777  //**********************************************************************************************
4778 
4779  //**********************************************************************************************
4781 
4783  template< typename T1, typename T2, typename T3, typename T4 >
4784  static constexpr bool UseBlasKernel_v =
4785  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4786  !SYM && !HERM && !LOW && !UPP &&
4787  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4788  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4789  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4790  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4791  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4792  IsBLASCompatible_v< ElementType_t<T1> > &&
4793  IsBLASCompatible_v< ElementType_t<T2> > &&
4794  IsBLASCompatible_v< ElementType_t<T3> > &&
4795  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4796  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4797  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
4798  //**********************************************************************************************
4799 
4800  //**********************************************************************************************
4802 
4804  template< typename T1, typename T2, typename T3, typename T4 >
4805  static constexpr bool UseVectorizedDefaultKernel_v =
4806  ( useOptimizedKernels &&
4807  !IsDiagonal_v<T3> &&
4808  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4809  IsSIMDCombinable_v< ElementType_t<T1>
4810  , ElementType_t<T2>
4811  , ElementType_t<T3>
4812  , T4 > &&
4813  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4814  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
4815  //**********************************************************************************************
4816 
4817  //**********************************************************************************************
4819 
4821  using ForwardFunctor = If_t< HERM
4822  , DeclHerm
4823  , If_t< SYM
4824  , DeclSym
4825  , If_t< LOW
4826  , If_t< UPP
4827  , DeclDiag
4828  , DeclLow >
4829  , If_t< UPP
4830  , DeclUpp
4831  , Noop > > > >;
4832  //**********************************************************************************************
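  // For example, with SF set and HF, LF, UF unset the forwarding functor is DeclSym, so a
  // restructured expression expr is forwarded as declsym( expr ) and the declared symmetry of
  // the result is preserved; if none of the flags is set, Noop forwards the expression unchanged.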
4833 
4834  public:
4835  //**Type definitions****************************************************************************
4837  using This = DMatScalarMultExpr<MMM,ST,false>;
4838 
4840  using BaseType = DenseMatrix<This,false>;
4841 
4843  using ResultType = typename If_t< HERM
4844  , DeclHermTrait< MultTrait_t<RES,ST> >
4845  , If_t< SYM
4846  , DeclSymTrait< MultTrait_t<RES,ST> >
4847  , If_t< LOW
4848  , If_t< UPP
4849  , DeclDiagTrait< MultTrait_t<RES,ST> >
4850  , DeclLowTrait< MultTrait_t<RES,ST> > >
4851  , If_t< UPP
4852  , DeclUppTrait< MultTrait_t<RES,ST> >
4853  , MultTrait<RES,ST> > > > >::Type;
4854 
4855  using OppositeType = OppositeType_t<ResultType>;
4856  using TransposeType = TransposeType_t<ResultType>;
4857  using ElementType = ElementType_t<ResultType>;
4858  using SIMDType = SIMDTrait_t<ElementType>;
4859  using ReturnType = const ElementType;
4860  using CompositeType = const ResultType;
4861 
4863  using LeftOperand = const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4864 
4866  using RightOperand = ST;
4867 
4869  using LT = If_t< evaluateLeft, const RT1, CT1 >;
4870 
4872  using RT = If_t< evaluateRight, const RT2, CT2 >;
4873  //**********************************************************************************************
4874 
4875  //**Compilation flags***************************************************************************
4877  static constexpr bool simdEnabled =
4878  ( !IsDiagonal_v<MT2> &&
4879  MT1::simdEnabled && MT2::simdEnabled &&
4880  IsSIMDCombinable_v<ET1,ET2,ST> &&
4881  HasSIMDAdd_v<ET1,ET2> &&
4882  HasSIMDMult_v<ET1,ET2> );
4883 
4885  static constexpr bool smpAssignable =
4886  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4887  //**********************************************************************************************
4888 
4889  //**SIMD properties*****************************************************************************
4891  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
4892  //**********************************************************************************************
4893 
4894  //**Constructor*********************************************************************************
4900  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4901  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4902  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4903  {}
4904  //**********************************************************************************************
4905 
4906  //**Access operator*****************************************************************************
4913  inline ReturnType operator()( size_t i, size_t j ) const {
4914  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4915  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4916  return matrix_(i,j) * scalar_;
4917  }
4918  //**********************************************************************************************
4919 
4920  //**At function*********************************************************************************
4928  inline ReturnType at( size_t i, size_t j ) const {
4929  if( i >= matrix_.rows() ) {
4930  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4931  }
4932  if( j >= matrix_.columns() ) {
4933  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4934  }
4935  return (*this)(i,j);
4936  }
4937  //**********************************************************************************************
4938 
4939  //**Rows function*******************************************************************************
4944  inline size_t rows() const {
4945  return matrix_.rows();
4946  }
4947  //**********************************************************************************************
4948 
4949  //**Columns function****************************************************************************
4954  inline size_t columns() const {
4955  return matrix_.columns();
4956  }
4957  //**********************************************************************************************
4958 
4959  //**Left operand access*************************************************************************
4964  inline LeftOperand leftOperand() const {
4965  return matrix_;
4966  }
4967  //**********************************************************************************************
4968 
4969  //**Right operand access************************************************************************
4974  inline RightOperand rightOperand() const {
4975  return scalar_;
4976  }
4977  //**********************************************************************************************
4978 
4979  //**********************************************************************************************
4985  template< typename T >
4986  inline bool canAlias( const T* alias ) const {
4987  return matrix_.canAlias( alias );
4988  }
4989  //**********************************************************************************************
4990 
4991  //**********************************************************************************************
4997  template< typename T >
4998  inline bool isAliased( const T* alias ) const {
4999  return matrix_.isAliased( alias );
5000  }
5001  //**********************************************************************************************
5002 
5003  //**********************************************************************************************
5008  inline bool isAligned() const {
5009  return matrix_.isAligned();
5010  }
5011  //**********************************************************************************************
5012 
5013  //**********************************************************************************************
5018  inline bool canSMPAssign() const noexcept {
5019  return ( !BLAZE_BLAS_MODE ||
5020  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
5022  ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
5023  ( rows() * columns() >= SMP_DMATDMATMULT_THRESHOLD );
5024  }
5025  //**********************************************************************************************
5026 
5027  private:
5028  //**Member variables****************************************************************************
5029  LeftOperand  matrix_;  //!< Left-hand side dense matrix of the multiplication expression.
5030  RightOperand scalar_;  //!< Right-hand side scalar of the multiplication expression.
5031  //**********************************************************************************************
5032 
5033  //**Assignment to dense matrices****************************************************************
5045  template< typename MT // Type of the target dense matrix
5046  , bool SO > // Storage order of the target dense matrix
5047  friend inline auto assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5048  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
5049  {
5051 
5052  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5053  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5054 
5055  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
5056  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
5057 
5058  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
5059  return;
5060  }
5061  else if( left.columns() == 0UL ) {
5062  reset( ~lhs );
5063  return;
5064  }
5065 
5066  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5067  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5068 
5069  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5070  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5071  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5072  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5073  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5074  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
5075 
5076  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
5077  }
5078  //**********************************************************************************************
5079 
5080  //**Assignment to dense matrices (kernel selection)*********************************************
5091  template< typename MT3 // Type of the left-hand side target matrix
5092  , typename MT4 // Type of the left-hand side matrix operand
5093  , typename MT5 // Type of the right-hand side matrix operand
5094  , typename ST2 > // Type of the scalar value
5095  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5096  {
5097  if( ( IsDiagonal_v<MT5> ) ||
5098  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
5099  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5100  selectSmallAssignKernel( C, A, B, scalar );
5101  else
5102  selectBlasAssignKernel( C, A, B, scalar );
5103  }
5104  //**********************************************************************************************
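  // That is, the scaled product uses the small (vectorized default) kernel whenever B is
  // diagonal, B has no more than SIMDSIZE*10 columns (outside of debug mode), or the target
  // holds fewer than DMATDMATMULT_THRESHOLD elements; otherwise selectBlasAssignKernel() is
  // chosen, which mirrors the subtraction-assignment dispatch above and falls back to the
  // large default kernel when no suitable BLAS kernel is available.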
5105 
5106  //**Default assignment to dense matrices (general/general)**************************************
5120  template< typename MT3 // Type of the left-hand side target matrix
5121  , typename MT4 // Type of the left-hand side matrix operand
5122  , typename MT5 // Type of the right-hand side matrix operand
5123  , typename ST2 > // Type of the scalar value
5124  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5125  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5126  {
5127  const size_t M( A.rows() );
5128  const size_t N( B.columns() );
5129  const size_t K( A.columns() );
5130 
5131  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5132 
5133  for( size_t i=0UL; i<M; ++i )
5134  {
5135  const size_t kbegin( ( IsUpper_v<MT4> )
5136  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5137  :( 0UL ) );
5138  const size_t kend( ( IsLower_v<MT4> )
5139  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
5140  :( K ) );
5141  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5142 
5143  if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
5144  for( size_t j=0UL; j<N; ++j ) {
5145  reset( C(i,j) );
5146  }
5147  continue;
5148  }
5149 
5150  {
5151  const size_t jbegin( ( IsUpper_v<MT5> )
5152  ?( ( IsStrictlyUpper_v<MT5> )
5153  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
5154  :( UPP ? max(i,kbegin) : kbegin ) )
5155  :( UPP ? i : 0UL ) );
5156  const size_t jend( ( IsLower_v<MT5> )
5157  ?( ( IsStrictlyLower_v<MT5> )
5158  ?( LOW ? min(i+1UL,kbegin) : kbegin )
5159  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
5160  :( LOW ? i+1UL : N ) );
5161 
5162  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5163  for( size_t j=0UL; j<jbegin; ++j ) {
5164  reset( C(i,j) );
5165  }
5166  }
5167  else if( IsStrictlyUpper_v<MT5> ) {
5168  reset( C(i,0UL) );
5169  }
5170  for( size_t j=jbegin; j<jend; ++j ) {
5171  C(i,j) = A(i,kbegin) * B(kbegin,j);
5172  }
5173  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5174  for( size_t j=jend; j<N; ++j ) {
5175  reset( C(i,j) );
5176  }
5177  }
5178  else if( IsStrictlyLower_v<MT5> ) {
5179  reset( C(i,N-1UL) );
5180  }
5181  }
5182 
5183  for( size_t k=kbegin+1UL; k<kend; ++k )
5184  {
5185  const size_t jbegin( ( IsUpper_v<MT5> )
5186  ?( ( IsStrictlyUpper_v<MT5> )
5187  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
5188  :( SYM || HERM || UPP ? max( i, k ) : k ) )
5189  :( SYM || HERM || UPP ? i : 0UL ) );
5190  const size_t jend( ( IsLower_v<MT5> )
5191  ?( ( IsStrictlyLower_v<MT5> )
5192  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
5193  :( LOW ? min(i+1UL,k) : k ) )
5194  :( LOW ? i+1UL : N ) );
5195 
5196  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
5197  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5198 
5199  for( size_t j=jbegin; j<jend; ++j ) {
5200  C(i,j) += A(i,k) * B(k,j);
5201  }
5202  if( IsLower_v<MT5> ) {
5203  C(i,jend) = A(i,k) * B(k,jend);
5204  }
5205  }
5206 
5207  {
5208  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5209  ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
5210  :( SYM || HERM || UPP ? i : 0UL ) );
5211  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
5212  ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
5213  :( LOW ? i+1UL : N ) );
5214 
5215  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
5216  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5217 
5218  for( size_t j=jbegin; j<jend; ++j ) {
5219  C(i,j) *= scalar;
5220  }
5221  }
5222  }
5223 
5224  if( SYM || HERM ) {
5225  for( size_t i=1UL; i<M; ++i ) {
5226  for( size_t j=0UL; j<i; ++j ) {
5227  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5228  }
5229  }
5230  }
5231  }
5232  //**********************************************************************************************
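  // Per element the kernel above computes C(i,j) = scalar * sum_k A(i,k) * B(k,j): the first
  // block initializes C(i,j) with the k = kbegin term, the middle loop accumulates the remaining
  // terms, and the final block multiplies each element by the scalar once, rather than scaling
  // every individual term. For symmetric/Hermitian results only the upper part is computed and
  // the lower part is mirrored (or conjugated) afterwards.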
5233 
5234  //**Default assignment to dense matrices (general/diagonal)*************************************
5248  template< typename MT3 // Type of the left-hand side target matrix
5249  , typename MT4 // Type of the left-hand side matrix operand
5250  , typename MT5 // Type of the right-hand side matrix operand
5251  , typename ST2 > // Type of the scalar value
5252  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5253  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5254  {
5256 
5257  const size_t M( A.rows() );
5258  const size_t N( B.columns() );
5259 
5260  for( size_t i=0UL; i<M; ++i )
5261  {
5262  const size_t jbegin( ( IsUpper_v<MT4> )
5263  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5264  :( 0UL ) );
5265  const size_t jend( ( IsLower_v<MT4> )
5266  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
5267  :( N ) );
5268  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5269 
5270  if( IsUpper_v<MT4> ) {
5271  for( size_t j=0UL; j<jbegin; ++j ) {
5272  reset( C(i,j) );
5273  }
5274  }
5275  for( size_t j=jbegin; j<jend; ++j ) {
5276  C(i,j) = A(i,j) * B(j,j) * scalar;
5277  }
5278  if( IsLower_v<MT4> ) {
5279  for( size_t j=jend; j<N; ++j ) {
5280  reset( C(i,j) );
5281  }
5282  }
5283  }
5284  }
5285  //**********************************************************************************************
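  // With a diagonal right-hand side operand the product collapses to a column scaling:
  // C(i,j) = A(i,j) * B(j,j) * scalar within the (possibly triangular) band of A, and the
  // remaining elements are reset, which is exactly what the single loop nest above implements.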
5286 
5287  //**Default assignment to dense matrices (diagonal/general)*************************************
5301  template< typename MT3 // Type of the left-hand side target matrix
5302  , typename MT4 // Type of the left-hand side matrix operand
5303  , typename MT5 // Type of the right-hand side matrix operand
5304  , typename ST2 > // Type of the scalar value
5305  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5306  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5307  {
5309 
5310  const size_t M( A.rows() );
5311  const size_t N( B.columns() );
5312 
5313  for( size_t i=0UL; i<M; ++i )
5314  {
5315  const size_t jbegin( ( IsUpper_v<MT5> )
5316  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
5317  :( 0UL ) );
5318  const size_t jend( ( IsLower_v<MT5> )
5319  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
5320  :( N ) );
5321  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5322 
5323  if( IsUpper_v<MT5> ) {
5324  for( size_t j=0UL; j<jbegin; ++j ) {
5325  reset( C(i,j) );
5326  }
5327  }
5328  for( size_t j=jbegin; j<jend; ++j ) {
5329  C(i,j) = A(i,i) * B(i,j) * scalar;
5330  }
5331  if( IsLower_v<MT5> ) {
5332  for( size_t j=jend; j<N; ++j ) {
5333  reset( C(i,j) );
5334  }
5335  }
5336  }
5337  }
5338  //**********************************************************************************************
5339 
5340  //**Default assignment to dense matrices (diagonal/diagonal)************************************
5354  template< typename MT3 // Type of the left-hand side target matrix
5355  , typename MT4 // Type of the left-hand side matrix operand
5356  , typename MT5 // Type of the right-hand side matrix operand
5357  , typename ST2 > // Type of the scalar value
5358  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5359  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5360  {
5362 
5363  reset( C );
5364 
5365  for( size_t i=0UL; i<A.rows(); ++i ) {
5366  C(i,i) = A(i,i) * B(i,i) * scalar;
5367  }
5368  }
5369  //**********************************************************************************************
5370 
5371  //**Default assignment to dense matrices (small matrices)***************************************
5385  template< typename MT3 // Type of the left-hand side target matrix
5386  , typename MT4 // Type of the left-hand side matrix operand
5387  , typename MT5 // Type of the right-hand side matrix operand
5388  , typename ST2 > // Type of the scalar value
5389  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5390  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5391  {
5392  selectDefaultAssignKernel( C, A, B, scalar );
5393  }
5394  //**********************************************************************************************
5395 
5396  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
5411  template< typename MT3 // Type of the left-hand side target matrix
5412  , typename MT4 // Type of the left-hand side matrix operand
5413  , typename MT5 // Type of the right-hand side matrix operand
5414  , typename ST2 > // Type of the scalar value
5415  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5416  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5417  {
5418  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5419 
5420  const size_t M( A.rows() );
5421  const size_t N( B.columns() );
5422  const size_t K( A.columns() );
5423 
5424  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5425 
5426  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
5427  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
5428 
5429  const SIMDType factor( set( scalar ) );
5430 
5431  size_t j( 0UL );
5432 
5433  if( IsIntegral_v<ElementType> )
5434  {
5435  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5436  for( size_t i=0UL; i<M; ++i )
5437  {
5438  const size_t kbegin( ( IsUpper_v<MT4> )
5439  ?( ( IsLower_v<MT5> )
5440  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5441  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5442  :( IsLower_v<MT5> ? j : 0UL ) );
5443  const size_t kend( ( IsLower_v<MT4> )
5444  ?( ( IsUpper_v<MT5> )
5445  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5446  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5447  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
5448 
5449  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5450 
5451  for( size_t k=kbegin; k<kend; ++k ) {
5452  const SIMDType a1( set( A(i,k) ) );
5453  xmm1 += a1 * B.load(k,j );
5454  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5455  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5456  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5457  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5458  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
5459  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
5460  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
5461  }
5462 
5463  C.store( i, j , xmm1 * factor );
5464  C.store( i, j+SIMDSIZE , xmm2 * factor );
5465  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5466  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5467  C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
5468  C.store( i, j+SIMDSIZE*5UL, xmm6 * factor );
5469  C.store( i, j+SIMDSIZE*6UL, xmm7 * factor );
5470  C.store( i, j+SIMDSIZE*7UL, xmm8 * factor );
5471  }
5472  }
5473  }
5474 
5475  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5476  {
5477  size_t i( 0UL );
5478 
5479  for( ; (i+2UL) <= M; i+=2UL )
5480  {
5481  const size_t kbegin( ( IsUpper_v<MT4> )
5482  ?( ( IsLower_v<MT5> )
5483  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5484  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5485  :( IsLower_v<MT5> ? j : 0UL ) );
5486  const size_t kend( ( IsLower_v<MT4> )
5487  ?( ( IsUpper_v<MT5> )
5488  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
5489  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5490  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
5491 
5492  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5493 
5494  for( size_t k=kbegin; k<kend; ++k ) {
5495  const SIMDType a1( set( A(i ,k) ) );
5496  const SIMDType a2( set( A(i+1UL,k) ) );
5497  const SIMDType b1( B.load(k,j ) );
5498  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5499  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5500  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5501  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5502  xmm1 += a1 * b1;
5503  xmm2 += a1 * b2;
5504  xmm3 += a1 * b3;
5505  xmm4 += a1 * b4;
5506  xmm5 += a1 * b5;
5507  xmm6 += a2 * b1;
5508  xmm7 += a2 * b2;
5509  xmm8 += a2 * b3;
5510  xmm9 += a2 * b4;
5511  xmm10 += a2 * b5;
5512  }
5513 
5514  C.store( i , j , xmm1 * factor );
5515  C.store( i , j+SIMDSIZE , xmm2 * factor );
5516  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5517  C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
5518  C.store( i , j+SIMDSIZE*4UL, xmm5 * factor );
5519  C.store( i+1UL, j , xmm6 * factor );
5520  C.store( i+1UL, j+SIMDSIZE , xmm7 * factor );
5521  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
5522  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
5523  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
5524  }
5525 
5526  if( i < M )
5527  {
5528  const size_t kbegin( ( IsUpper_v<MT4> )
5529  ?( ( IsLower_v<MT5> )
5530  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5531  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5532  :( IsLower_v<MT5> ? j : 0UL ) );
5533  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
5534 
5535  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5536 
5537  for( size_t k=kbegin; k<kend; ++k ) {
5538  const SIMDType a1( set( A(i,k) ) );
5539  xmm1 += a1 * B.load(k,j );
5540  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5541  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5542  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5543  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5544  }
5545 
5546  C.store( i, j , xmm1 * factor );
5547  C.store( i, j+SIMDSIZE , xmm2 * factor );
5548  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5549  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5550  C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
5551  }
5552  }
5553 
5554  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5555  {
5556  const size_t iend( UPP ? min(j+SIMDSIZE*4UL,M) : M );
5557  size_t i( 0UL );
5558 
5559  if( SYM || HERM ) {
5560  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
5561  for( ; i<j; ++i ) {
5562  for( size_t jj=j; jj<jjend; ++jj ) {
5563  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
5564  }
5565  }
5566  }
5567  else if( LOW ) {
5568  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
5569  for( ; i<j; ++i ) {
5570  for( size_t jj=j; jj<jjend; ++jj ) {
5571  reset( C(i,jj) );
5572  }
5573  }
5574  }
5575 
5576  for( ; (i+2UL) <= iend; i+=2UL )
5577  {
5578  const size_t kbegin( ( IsUpper_v<MT4> )
5579  ?( ( IsLower_v<MT5> )
5580  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5581  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5582  :( IsLower_v<MT5> ? j : 0UL ) );
5583  const size_t kend( ( IsLower_v<MT4> )
5584  ?( ( IsUpper_v<MT5> )
5585  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5586  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5587  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
5588 
5589  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5590 
5591  for( size_t k=kbegin; k<kend; ++k ) {
5592  const SIMDType a1( set( A(i ,k) ) );
5593  const SIMDType a2( set( A(i+1UL,k) ) );
5594  const SIMDType b1( B.load(k,j ) );
5595  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5596  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5597  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5598  xmm1 += a1 * b1;
5599  xmm2 += a1 * b2;
5600  xmm3 += a1 * b3;
5601  xmm4 += a1 * b4;
5602  xmm5 += a2 * b1;
5603  xmm6 += a2 * b2;
5604  xmm7 += a2 * b3;
5605  xmm8 += a2 * b4;
5606  }
5607 
5608  C.store( i , j , xmm1 * factor );
5609  C.store( i , j+SIMDSIZE , xmm2 * factor );
5610  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5611  C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
5612  C.store( i+1UL, j , xmm5 * factor );
5613  C.store( i+1UL, j+SIMDSIZE , xmm6 * factor );
5614  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
5615  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
5616  }
5617 
5618  if( i < iend )
5619  {
5620  const size_t kbegin( ( IsUpper_v<MT4> )
5621  ?( ( IsLower_v<MT5> )
5622  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5623  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5624  :( IsLower_v<MT5> ? j : 0UL ) );
5625  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
5626 
5627  SIMDType xmm1, xmm2, xmm3, xmm4;
5628 
5629  for( size_t k=kbegin; k<kend; ++k ) {
5630  const SIMDType a1( set( A(i,k) ) );
5631  xmm1 += a1 * B.load(k,j );
5632  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5633  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5634  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5635  }
5636 
5637  C.store( i, j , xmm1 * factor );
5638  C.store( i, j+SIMDSIZE , xmm2 * factor );
5639  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5640  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5641 
5642  if( UPP ) ++i;
5643  }
5644 
5645  if( UPP ) {
5646  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
5647  for( ; i<M; ++i ) {
5648  for( size_t jj=j; jj<jjend; ++jj ) {
5649  reset( C(i,jj) );
5650  }
5651  }
5652  }
5653  }
5654 
5655  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5656  {
5657  const size_t iend( UPP ? min(j+SIMDSIZE*3UL,M) : M );
5658  size_t i( 0UL );
5659 
5660  if( SYM || HERM ) {
5661  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
5662  for( ; i<j; ++i ) {
5663  for( size_t jj=j; jj<jjend; ++jj ) {
5664  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
5665  }
5666  }
5667  }
5668  else if( LOW ) {
5669  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
5670  for( ; i<j; ++i ) {
5671  for( size_t jj=j; jj<jjend; ++jj ) {
5672  reset( C(i,jj) );
5673  }
5674  }
5675  }
5676 
5677  for( ; (i+2UL) <= iend; i+=2UL )
5678  {
5679  const size_t kbegin( ( IsUpper_v<MT4> )
5680  ?( ( IsLower_v<MT5> )
5681  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5682  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5683  :( IsLower_v<MT5> ? j : 0UL ) );
5684  const size_t kend( ( IsLower_v<MT4> )
5685  ?( ( IsUpper_v<MT5> )
5686  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
5687  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5688  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
5689 
5690  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5691 
5692  for( size_t k=kbegin; k<kend; ++k ) {
5693  const SIMDType a1( set( A(i ,k) ) );
5694  const SIMDType a2( set( A(i+1UL,k) ) );
5695  const SIMDType b1( B.load(k,j ) );
5696  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5697  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5698  xmm1 += a1 * b1;
5699  xmm2 += a1 * b2;
5700  xmm3 += a1 * b3;
5701  xmm4 += a2 * b1;
5702  xmm5 += a2 * b2;
5703  xmm6 += a2 * b3;
5704  }
5705 
5706  C.store( i , j , xmm1 * factor );
5707  C.store( i , j+SIMDSIZE , xmm2 * factor );
5708  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5709  C.store( i+1UL, j , xmm4 * factor );
5710  C.store( i+1UL, j+SIMDSIZE , xmm5 * factor );
5711  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
5712  }
5713 
5714  if( i < iend )
5715  {
5716  const size_t kbegin( ( IsUpper_v<MT4> )
5717  ?( ( IsLower_v<MT5> )
5718  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5719  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5720  :( IsLower_v<MT5> ? j : 0UL ) );
5721  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
5722 
5723  SIMDType xmm1, xmm2, xmm3;
5724 
5725  for( size_t k=kbegin; k<kend; ++k ) {
5726  const SIMDType a1( set( A(i,k) ) );
5727  xmm1 += a1 * B.load(k,j );
5728  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5729  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5730  }
5731 
5732  C.store( i, j , xmm1 * factor );
5733  C.store( i, j+SIMDSIZE , xmm2 * factor );
5734  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5735 
5736  if( UPP ) ++i;
5737  }
5738 
5739  if( UPP ) {
5740  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
5741  for( ; i<M; ++i ) {
5742  for( size_t jj=j; jj<jjend; ++jj ) {
5743  reset( C(i,jj) );
5744  }
5745  }
5746  }
5747  }
5748 
5749  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5750  {
5751  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
5752  size_t i( 0UL );
5753 
5754  if( SYM || HERM ) {
5755  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
5756  for( ; i<j; ++i ) {
5757  for( size_t jj=j; jj<jjend; ++jj ) {
5758  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
5759  }
5760  }
5761  }
5762  else if( LOW ) {
5763  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
5764  for( ; i<j; ++i ) {
5765  for( size_t jj=j; jj<jjend; ++jj ) {
5766  reset( C(i,jj) );
5767  }
5768  }
5769  }
5770 
5771  for( ; (i+4UL) <= iend; i+=4UL )
5772  {
5773  const size_t kbegin( ( IsUpper_v<MT4> )
5774  ?( ( IsLower_v<MT5> )
5775  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5776  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5777  :( IsLower_v<MT5> ? j : 0UL ) );
5778  const size_t kend( ( IsLower_v<MT4> )
5779  ?( ( IsUpper_v<MT5> )
5780  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
5781  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
5782  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5783 
5784  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5785 
5786  for( size_t k=kbegin; k<kend; ++k ) {
5787  const SIMDType a1( set( A(i ,k) ) );
5788  const SIMDType a2( set( A(i+1UL,k) ) );
5789  const SIMDType a3( set( A(i+2UL,k) ) );
5790  const SIMDType a4( set( A(i+3UL,k) ) );
5791  const SIMDType b1( B.load(k,j ) );
5792  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5793  xmm1 += a1 * b1;
5794  xmm2 += a1 * b2;
5795  xmm3 += a2 * b1;
5796  xmm4 += a2 * b2;
5797  xmm5 += a3 * b1;
5798  xmm6 += a3 * b2;
5799  xmm7 += a4 * b1;
5800  xmm8 += a4 * b2;
5801  }
5802 
5803  C.store( i , j , xmm1 * factor );
5804  C.store( i , j+SIMDSIZE, xmm2 * factor );
5805  C.store( i+1UL, j , xmm3 * factor );
5806  C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
5807  C.store( i+2UL, j , xmm5 * factor );
5808  C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
5809  C.store( i+3UL, j , xmm7 * factor );
5810  C.store( i+3UL, j+SIMDSIZE, xmm8 * factor );
5811  }
5812 
5813  for( ; (i+3UL) <= iend; i+=3UL )
5814  {
5815  const size_t kbegin( ( IsUpper_v<MT4> )
5816  ?( ( IsLower_v<MT5> )
5817  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5818  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5819  :( IsLower_v<MT5> ? j : 0UL ) );
5820  const size_t kend( ( IsLower_v<MT4> )
5821  ?( ( IsUpper_v<MT5> )
5822  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
5823  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
5824  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5825 
5826  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5827 
5828  for( size_t k=kbegin; k<kend; ++k ) {
5829  const SIMDType a1( set( A(i ,k) ) );
5830  const SIMDType a2( set( A(i+1UL,k) ) );
5831  const SIMDType a3( set( A(i+2UL,k) ) );
5832  const SIMDType b1( B.load(k,j ) );
5833  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5834  xmm1 += a1 * b1;
5835  xmm2 += a1 * b2;
5836  xmm3 += a2 * b1;
5837  xmm4 += a2 * b2;
5838  xmm5 += a3 * b1;
5839  xmm6 += a3 * b2;
5840  }
5841 
5842  C.store( i , j , xmm1 * factor );
5843  C.store( i , j+SIMDSIZE, xmm2 * factor );
5844  C.store( i+1UL, j , xmm3 * factor );
5845  C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
5846  C.store( i+2UL, j , xmm5 * factor );
5847  C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
5848  }
5849 
5850  for( ; (i+2UL) <= iend; i+=2UL )
5851  {
5852  const size_t kbegin( ( IsUpper_v<MT4> )
5853  ?( ( IsLower_v<MT5> )
5854  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5855  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5856  :( IsLower_v<MT5> ? j : 0UL ) );
5857  const size_t kend( ( IsLower_v<MT4> )
5858  ?( ( IsUpper_v<MT5> )
5859  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
5860  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5861  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5862 
5863  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5864  size_t k( kbegin );
5865 
5866  for( ; (k+2UL) <= kend; k+=2UL ) {
5867  const SIMDType a1( set( A(i ,k ) ) );
5868  const SIMDType a2( set( A(i+1UL,k ) ) );
5869  const SIMDType a3( set( A(i ,k+1UL) ) );
5870  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
5871  const SIMDType b1( B.load(k ,j ) );
5872  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
5873  const SIMDType b3( B.load(k+1UL,j ) );
5874  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
5875  xmm1 += a1 * b1;
5876  xmm2 += a1 * b2;
5877  xmm3 += a2 * b1;
5878  xmm4 += a2 * b2;
5879  xmm5 += a3 * b3;
5880  xmm6 += a3 * b4;
5881  xmm7 += a4 * b3;
5882  xmm8 += a4 * b4;
5883  }
5884 
5885  for( ; k<kend; ++k ) {
5886  const SIMDType a1( set( A(i ,k) ) );
5887  const SIMDType a2( set( A(i+1UL,k) ) );
5888  const SIMDType b1( B.load(k,j ) );
5889  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5890  xmm1 += a1 * b1;
5891  xmm2 += a1 * b2;
5892  xmm3 += a2 * b1;
5893  xmm4 += a2 * b2;
5894  }
5895 
5896  C.store( i , j , (xmm1+xmm5) * factor );
5897  C.store( i , j+SIMDSIZE, (xmm2+xmm6) * factor );
5898  C.store( i+1UL, j , (xmm3+xmm7) * factor );
5899  C.store( i+1UL, j+SIMDSIZE, (xmm4+xmm8) * factor );
5900  }
5901 
5902  if( i < iend )
5903  {
5904  const size_t kbegin( ( IsUpper_v<MT4> )
5905  ?( ( IsLower_v<MT5> )
5906  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5907  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5908  :( IsLower_v<MT5> ? j : 0UL ) );
5909  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
5910 
5911  SIMDType xmm1, xmm2, xmm3, xmm4;
5912  size_t k( kbegin );
5913 
5914  for( ; (k+2UL) <= kend; k+=2UL ) {
5915  const SIMDType a1( set( A(i,k ) ) );
5916  const SIMDType a2( set( A(i,k+1UL) ) );
5917  xmm1 += a1 * B.load(k ,j );
5918  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
5919  xmm3 += a2 * B.load(k+1UL,j );
5920  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
5921  }
5922 
5923  for( ; k<kend; ++k ) {
5924  const SIMDType a1( set( A(i,k) ) );
5925  xmm1 += a1 * B.load(k,j );
5926  xmm2 += a1 * B.load(k,j+SIMDSIZE);
5927  }
5928 
5929  C.store( i, j , (xmm1+xmm3) * factor );
5930  C.store( i, j+SIMDSIZE, (xmm2+xmm4) * factor );
5931 
5932  if( UPP ) ++i;
5933  }
5934 
5935  if( UPP ) {
5936  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
5937  for( ; i<M; ++i ) {
5938  for( size_t jj=j; jj<jjend; ++jj ) {
5939  reset( C(i,jj) );
5940  }
5941  }
5942  }
5943  }
5944 
5945  for( ; j<jpos; j+=SIMDSIZE )
5946  {
5947  const size_t iend( UPP ? min(j+SIMDSIZE,M) : M );
5948  size_t i( 0UL );
5949 
5950  if( SYM || HERM ) {
5951  const size_t jjend( min(j+SIMDSIZE,N) );
5952  for( ; i<j; ++i ) {
5953  for( size_t jj=j; jj<jjend; ++jj ) {
5954  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
5955  }
5956  }
5957  }
5958  else if( LOW ) {
5959  const size_t jjend( min(j+SIMDSIZE,N) );
5960  for( ; i<j; ++i ) {
5961  for( size_t jj=j; jj<jjend; ++jj ) {
5962  reset( C(i,jj) );
5963  }
5964  }
5965  }
5966 
5967  for( ; (i+4UL) <= iend; i+=4UL )
5968  {
5969  const size_t kbegin( ( IsUpper_v<MT4> )
5970  ?( ( IsLower_v<MT5> )
5971  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5972  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5973  :( IsLower_v<MT5> ? j : 0UL ) );
5974  const size_t kend( ( IsLower_v<MT4> )
5975  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
5976  :( K ) );
5977 
5978  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5979  size_t k( kbegin );
5980 
5981  for( ; (k+2UL) <= kend; k+=2UL ) {
5982  const SIMDType b1( B.load(k ,j) );
5983  const SIMDType b2( B.load(k+1UL,j) );
5984  xmm1 += set( A(i ,k ) ) * b1;
5985  xmm2 += set( A(i+1UL,k ) ) * b1;
5986  xmm3 += set( A(i+2UL,k ) ) * b1;
5987  xmm4 += set( A(i+3UL,k ) ) * b1;
5988  xmm5 += set( A(i ,k+1UL) ) * b2;
5989  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
5990  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
5991  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
5992  }
5993 
5994  for( ; k<kend; ++k ) {
5995  const SIMDType b1( B.load(k,j) );
5996  xmm1 += set( A(i ,k) ) * b1;
5997  xmm2 += set( A(i+1UL,k) ) * b1;
5998  xmm3 += set( A(i+2UL,k) ) * b1;
5999  xmm4 += set( A(i+3UL,k) ) * b1;
6000  }
6001 
6002  C.store( i , j, (xmm1+xmm5) * factor );
6003  C.store( i+1UL, j, (xmm2+xmm6) * factor );
6004  C.store( i+2UL, j, (xmm3+xmm7) * factor );
6005  C.store( i+3UL, j, (xmm4+xmm8) * factor );
6006  }
6007 
6008  for( ; (i+3UL) <= iend; i+=3UL )
6009  {
6010  const size_t kbegin( ( IsUpper_v<MT4> )
6011  ?( ( IsLower_v<MT5> )
6012  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6013  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6014  :( IsLower_v<MT5> ? j : 0UL ) );
6015  const size_t kend( ( IsLower_v<MT4> )
6016  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
6017  :( K ) );
6018 
6019  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6020  size_t k( kbegin );
6021 
6022  for( ; (k+2UL) <= kend; k+=2UL ) {
6023  const SIMDType b1( B.load(k ,j) );
6024  const SIMDType b2( B.load(k+1UL,j) );
6025  xmm1 += set( A(i ,k ) ) * b1;
6026  xmm2 += set( A(i+1UL,k ) ) * b1;
6027  xmm3 += set( A(i+2UL,k ) ) * b1;
6028  xmm4 += set( A(i ,k+1UL) ) * b2;
6029  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
6030  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
6031  }
6032 
6033  for( ; k<kend; ++k ) {
6034  const SIMDType b1( B.load(k,j) );
6035  xmm1 += set( A(i ,k) ) * b1;
6036  xmm2 += set( A(i+1UL,k) ) * b1;
6037  xmm3 += set( A(i+2UL,k) ) * b1;
6038  }
6039 
6040  C.store( i , j, (xmm1+xmm4) * factor );
6041  C.store( i+1UL, j, (xmm2+xmm5) * factor );
6042  C.store( i+2UL, j, (xmm3+xmm6) * factor );
6043  }
6044 
6045  for( ; (i+2UL) <= iend; i+=2UL )
6046  {
6047  const size_t kbegin( ( IsUpper_v<MT4> )
6048  ?( ( IsLower_v<MT5> )
6049  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6050  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6051  :( IsLower_v<MT5> ? j : 0UL ) );
6052  const size_t kend( ( IsLower_v<MT4> )
6053  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6054  :( K ) );
6055 
6056  SIMDType xmm1, xmm2, xmm3, xmm4;
6057  size_t k( kbegin );
6058 
6059  for( ; (k+2UL) <= kend; k+=2UL ) {
6060  const SIMDType b1( B.load(k ,j) );
6061  const SIMDType b2( B.load(k+1UL,j) );
6062  xmm1 += set( A(i ,k ) ) * b1;
6063  xmm2 += set( A(i+1UL,k ) ) * b1;
6064  xmm3 += set( A(i ,k+1UL) ) * b2;
6065  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
6066  }
6067 
6068  for( ; k<kend; ++k ) {
6069  const SIMDType b1( B.load(k,j) );
6070  xmm1 += set( A(i ,k) ) * b1;
6071  xmm2 += set( A(i+1UL,k) ) * b1;
6072  }
6073 
6074  C.store( i , j, (xmm1+xmm3) * factor );
6075  C.store( i+1UL, j, (xmm2+xmm4) * factor );
6076  }
6077 
6078  if( i < iend )
6079  {
6080  const size_t kbegin( ( IsUpper_v<MT4> )
6081  ?( ( IsLower_v<MT5> )
6082  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6083  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6084  :( IsLower_v<MT5> ? j : 0UL ) );
6085 
6086  SIMDType xmm1, xmm2;
6087  size_t k( kbegin );
6088 
6089  for( ; (k+2UL) <= K; k+=2UL ) {
6090  xmm1 += set( A(i,k ) ) * B.load(k ,j);
6091  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
6092  }
6093 
6094  for( ; k<K; ++k ) {
6095  xmm1 += set( A(i,k) ) * B.load(k,j);
6096  }
6097 
6098  C.store( i, j, (xmm1+xmm2) * factor );
6099 
6100  if( UPP ) ++i;
6101  }
6102 
6103  if( UPP ) {
6104  const size_t jjend( min(j+SIMDSIZE,N) );
6105  for( ; i<M; ++i ) {
6106  for( size_t jj=j; jj<jjend; ++jj ) {
6107  reset( C(i,jj) );
6108  }
6109  }
6110  }
6111  }
6112 
6113  for( ; remainder && j<N; ++j )
6114  {
6115  size_t i( 0UL );
6116 
6117  if( SYM || HERM ) {
6118  for( ; i<j; ++i ) {
6119  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
6120  }
6121  }
6122  else if( LOW ) {
6123  for( ; i<j; ++i ) {
6124  reset( C(i,j) );
6125  }
6126  }
6127 
6128  for( ; (i+2UL) <= M; i+=2UL )
6129  {
6130  const size_t kbegin( ( IsUpper_v<MT4> )
6131  ?( ( IsLower_v<MT5> )
6132  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6133  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6134  :( IsLower_v<MT5> ? j : 0UL ) );
6135  const size_t kend( ( IsLower_v<MT4> )
6136  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6137  :( K ) );
6138 
6139  ElementType value1{};
6140  ElementType value2{};
6141 
6142  for( size_t k=kbegin; k<kend; ++k ) {
6143  value1 += A(i ,k) * B(k,j);
6144  value2 += A(i+1UL,k) * B(k,j);
6145  }
6146 
6147  C(i ,j) = value1 * scalar;
6148  C(i+1UL,j) = value2 * scalar;
6149  }
6150 
6151  if( i < M )
6152  {
6153  const size_t kbegin( ( IsUpper_v<MT4> )
6154  ?( ( IsLower_v<MT5> )
6155  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6156  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6157  :( IsLower_v<MT5> ? j : 0UL ) );
6158 
6159  ElementType value{};
6160 
6161  for( size_t k=kbegin; k<K; ++k ) {
6162  value += A(i,k) * B(k,j);
6163  }
6164 
6165  C(i,j) = value * scalar;
6166  }
6167  }
6168  }
6169  //**********************************************************************************************
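  // Note: the vectorized small-matrix kernel above tiles C into register blocks
  // (2 rows x 5 SIMD lanes, then 2x4, 2x3, 4x2/3x2/2x2 and 4x1/3x1/2x1 panels plus a
  // scalar remainder), broadcasting single elements of A via set() and streaming SIMD
  // lanes of B via load(). For SYM/HERM/LOW/UPP results only one triangle is computed;
  // the other is mirrored (with conj() in the Hermitian case) or reset. A minimal usage
  // sketch that routes through kernels of this kind, assuming the default configuration
  // thresholds (sizes are illustrative only):
  //
  //    #include <blaze/Math.h>
  //
  //    blaze::DynamicMatrix<double,blaze::rowMajor> A( 64UL, 48UL, 1.0 );
  //    blaze::DynamicMatrix<double,blaze::rowMajor> B( 48UL, 64UL, 2.0 );
  //    blaze::DynamicMatrix<double,blaze::rowMajor> C( 64UL, 64UL );
  //
  //    C = 3.0 * ( A * B );  // scaled dense matrix/dense matrix multiplication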
6170 
6171  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
6186  template< typename MT3 // Type of the left-hand side target matrix
6187  , typename MT4 // Type of the left-hand side matrix operand
6188  , typename MT5 // Type of the right-hand side matrix operand
6189  , typename ST2 > // Type of the scalar value
6190  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6191  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6192  {
6197 
6198  const ForwardFunctor fwd;
6199 
6200  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
6201  const OppositeType_t<MT4> tmp( serial( A ) );
6202  assign( C, fwd( tmp * B ) * scalar );
6203  }
6204  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
6205  const OppositeType_t<MT5> tmp( serial( B ) );
6206  assign( C, fwd( A * tmp ) * scalar );
6207  }
6208  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6209  const OppositeType_t<MT4> tmp( serial( A ) );
6210  assign( C, fwd( tmp * B ) * scalar );
6211  }
6212  else {
6213  const OppositeType_t<MT5> tmp( serial( B ) );
6214  assign( C, fwd( A * tmp ) * scalar );
6215  }
6216  }
6217  //**********************************************************************************************
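  // Note: there is no dedicated column-major SIMD kernel at this point; the overload
  // above converts one operand into its opposite storage order (OppositeType_t),
  // preferring a statically sized operand and otherwise the smaller one, and forwards
  // the restructured product so that operand and target storage orders match. The
  // ForwardFunctor fwd re-applies the decl{sym,herm,low,upp}() decoration implied by
  // the SYM/HERM/LOW/UPP flags. Illustrative trigger with a column-major target,
  // reusing A and B from the previous sketch:
  //
  //    blaze::DynamicMatrix<double,blaze::columnMajor> Ccol( 64UL, 64UL );
  //    Ccol = 3.0 * ( A * B );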
6218 
6219  //**Default assignment to dense matrices (large matrices)***************************************
6233  template< typename MT3 // Type of the left-hand side target matrix
6234  , typename MT4 // Type of the left-hand side matrix operand
6235  , typename MT5 // Type of the right-hand side matrix operand
6236  , typename ST2 > // Type of the scalar value
6237  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6238  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6239  {
6240  selectDefaultAssignKernel( C, A, B, scalar );
6241  }
6242  //**********************************************************************************************
6243 
6244  //**Vectorized default assignment to dense matrices (large matrices)****************************
6259  template< typename MT3 // Type of the left-hand side target matrix
6260  , typename MT4 // Type of the left-hand side matrix operand
6261  , typename MT5 // Type of the right-hand side matrix operand
6262  , typename ST2 > // Type of the scalar value
6263  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6264  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6265  {
6266  if( SYM )
6267  smmm( C, A, B, scalar );
6268  else if( HERM )
6269  hmmm( C, A, B, scalar );
6270  else if( LOW )
6271  lmmm( C, A, B, scalar, ST2(0) );
6272  else if( UPP )
6273  ummm( C, A, B, scalar, ST2(0) );
6274  else
6275  mmm( C, A, B, scalar, ST2(0) );
6276  }
6277  //**********************************************************************************************
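  // Note: smmm/hmmm/lmmm/ummm/mmm are the cache-blocked macro kernels from
  // <blaze/math/dense/MMM.h>; the trailing argument is the beta scaling factor, so
  // ST2(0) means the target is overwritten. A minimal, unblocked reference of what
  // mmm( C, A, B, alpha, beta ) computes (sketch only; the actual implementation
  // additionally packs panels of A and B and vectorizes the innermost loops):
  //
  //    for( size_t i=0UL; i<C.rows(); ++i )
  //       for( size_t j=0UL; j<C.columns(); ++j ) {
  //          ElementType_t<MT3> sum( beta * C(i,j) );
  //          for( size_t k=0UL; k<A.columns(); ++k )
  //             sum += alpha * A(i,k) * B(k,j);
  //          C(i,j) = sum;
  //       }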
6278 
6279  //**BLAS-based assignment to dense matrices (default)*******************************************
6293  template< typename MT3 // Type of the left-hand side target matrix
6294  , typename MT4 // Type of the left-hand side matrix operand
6295  , typename MT5 // Type of the right-hand side matrix operand
6296  , typename ST2 > // Type of the scalar value
6297  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6298  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6299  {
6300  selectLargeAssignKernel( C, A, B, scalar );
6301  }
6302  //**********************************************************************************************
6303 
6304  //**BLAS-based assignment to dense matrices*****************************************************
6305 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6306 
6319  template< typename MT3 // Type of the left-hand side target matrix
6320  , typename MT4 // Type of the left-hand side matrix operand
6321  , typename MT5 // Type of the right-hand side matrix operand
6322  , typename ST2 > // Type of the scalar value
6323  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6324  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6325  {
6326  using ET = ElementType_t<MT3>;
6327 
6328  if( IsTriangular_v<MT4> ) {
6329  assign( C, B );
6330  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6331  }
6332  else if( IsTriangular_v<MT5> ) {
6333  assign( C, A );
6334  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
6335  }
6336  else {
6337  gemm( C, A, B, ET(scalar), ET(0) );
6338  }
6339  }
6340 #endif
6341  //**********************************************************************************************
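  // Note: with a triangular operand the kernel first assigns the general operand to C
  // and then applies an in-place xTRMM update; otherwise it forwards to xGEMM with
  // alpha = scalar and beta = 0. For row-major double-precision matrices the gemm()
  // wrapper corresponds roughly to the following raw CBLAS call (shown for illustration
  // only; blaze::gemm() handles the layout and stride bookkeeping):
  //
  //    cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans,
  //                 M, N, K, alpha, a, lda, b, ldb, 0.0, c, ldc );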
6342 
6343  //**Assignment to sparse matrices***************************************************************
6355  template< typename MT // Type of the target sparse matrix
6356  , bool SO > // Storage order of the target sparse matrix
6357  friend inline auto assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6358  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6359  {
6361 
6362  using TmpType = If_t< SO, OppositeType, ResultType >;
6363 
6370 
6371  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6372  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6373 
6374  const ForwardFunctor fwd;
6375 
6376  const TmpType tmp( serial( rhs ) );
6377  assign( ~lhs, fwd( tmp ) );
6378  }
6379  //**********************************************************************************************
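  // Note: a sparse target cannot be filled by the SIMD kernels directly, so the
  // expression is first evaluated into a dense temporary whose storage order matches
  // the target (TmpType) and that temporary is then assigned to the sparse matrix.
  // Illustrative usage, reusing A and B from the earlier sketch:
  //
  //    blaze::CompressedMatrix<double,blaze::rowMajor> S( 64UL, 64UL );
  //    S = 3.0 * ( A * B );  // dense temporary first, then element-wise insertion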
6380 
6381  //**Restructuring assignment to column-major matrices*******************************************
6395  template< typename MT > // Type of the target matrix
6396  friend inline auto assign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
6397  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6398  {
6400 
6402 
6403  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6404  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6405 
6406  const ForwardFunctor fwd;
6407 
6408  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6409  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6410 
6411  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
6412  assign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
6413  else if( IsSymmetric_v<MT1> )
6414  assign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
6415  else
6416  assign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
6417  }
6418  //**********************************************************************************************
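  // Note: this overload exploits the identity A * B == trans(A) * B for a symmetric A
  // (and analogously for B): replacing a symmetric operand by its transpose flips the
  // effective storage order, so a column-major target can be assigned without
  // transposing the whole result. Illustrative trigger (hypothetical sizes):
  //
  //    blaze::SymmetricMatrix< blaze::DynamicMatrix<double,blaze::rowMajor> > Sym( 64UL );
  //    blaze::DynamicMatrix<double,blaze::rowMajor> D( 64UL, 64UL, 2.0 );
  //    blaze::DynamicMatrix<double,blaze::columnMajor> Ccol( 64UL, 64UL );
  //
  //    Ccol = 3.0 * ( Sym * D );  // internally restructured to trans(Sym) * D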
6419 
6420  //**Addition assignment to dense matrices*******************************************************
6432  template< typename MT // Type of the target dense matrix
6433  , bool SO > // Storage order of the target dense matrix
6434  friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6435  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6436  {
6438 
6439  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6440  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6441 
6442  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6443  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6444 
6445  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6446  return;
6447  }
6448 
6449  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6450  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6451 
6452  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6453  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6454  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6455  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6456  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6457  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6458 
6459  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
6460  }
6461  //**********************************************************************************************
6462 
6463  //**Addition assignment to dense matrices (kernel selection)************************************
6474  template< typename MT3 // Type of the left-hand side target matrix
6475  , typename MT4 // Type of the left-hand side matrix operand
6476  , typename MT5 // Type of the right-hand side matrix operand
6477  , typename ST2 > // Type of the scalar value
6478  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6479  {
6480  if( ( IsDiagonal_v<MT5> ) ||
6481  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
6482  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6483  selectSmallAddAssignKernel( C, A, B, scalar );
6484  else
6485  selectBlasAddAssignKernel( C, A, B, scalar );
6486  }
6487  //**********************************************************************************************
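  // Note: the heuristic above prefers the small (register-blocked) kernel whenever B is
  // diagonal, B is narrow (at most SIMDSIZE*10 columns, release builds only), or the
  // target is smaller than the DMATDMATMULT_THRESHOLD limit; everything else is
  // forwarded to the BLAS kernel, which itself falls back to the large blocked kernel
  // when no BLAS backend is enabled. The threshold is a configurable system setting
  // (see <blaze/system/Thresholds.h>, included above).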
6488 
6489  //**Default addition assignment to dense matrices (general/general)*****************************
6503  template< typename MT3 // Type of the left-hand side target matrix
6504  , typename MT4 // Type of the left-hand side matrix operand
6505  , typename MT5 // Type of the right-hand side matrix operand
6506  , typename ST2 > // Type of the scalar value
6507  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6508  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6509  {
6510  const ResultType tmp( serial( A * B * scalar ) );
6511  addAssign( C, tmp );
6512  }
6513  //**********************************************************************************************
6514 
6515  //**Default addition assignment to dense matrices (general/diagonal)****************************
6529  template< typename MT3 // Type of the left-hand side target matrix
6530  , typename MT4 // Type of the left-hand side matrix operand
6531  , typename MT5 // Type of the right-hand side matrix operand
6532  , typename ST2 > // Type of the scalar value
6533  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6534  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6535  {
6537 
6538  const size_t M( A.rows() );
6539  const size_t N( B.columns() );
6540 
6541  for( size_t i=0UL; i<M; ++i )
6542  {
6543  const size_t jbegin( ( IsUpper_v<MT4> )
6544  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
6545  :( 0UL ) );
6546  const size_t jend( ( IsLower_v<MT4> )
6547  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
6548  :( N ) );
6549  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6550 
6551  const size_t jnum( jend - jbegin );
6552  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6553 
6554  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6555  C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6556  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6557  }
6558  if( jpos < jend ) {
6559  C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
6560  }
6561  }
6562  }
6563  //**********************************************************************************************
6564 
6565  //**Default addition assignment to dense matrices (diagonal/general)****************************
6579  template< typename MT3 // Type of the left-hand side target matrix
6580  , typename MT4 // Type of the left-hand side matrix operand
6581  , typename MT5 // Type of the right-hand side matrix operand
6582  , typename ST2 > // Type of the scalar value
6583  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6584  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6585  {
6587 
6588  const size_t M( A.rows() );
6589  const size_t N( B.columns() );
6590 
6591  for( size_t i=0UL; i<M; ++i )
6592  {
6593  const size_t jbegin( ( IsUpper_v<MT5> )
6594  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
6595  :( 0UL ) );
6596  const size_t jend( ( IsLower_v<MT5> )
6597  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
6598  :( N ) );
6599  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6600 
6601  const size_t jnum( jend - jbegin );
6602  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6603 
6604  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6605  C(i,j ) += A(i,i) * B(i,j ) * scalar;
6606  C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
6607  }
6608  if( jpos < jend ) {
6609  C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
6610  }
6611  }
6612  }
6613  //**********************************************************************************************
6614 
6615  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
6629  template< typename MT3 // Type of the left-hand side target matrix
6630  , typename MT4 // Type of the left-hand side matrix operand
6631  , typename MT5 // Type of the right-hand side matrix operand
6632  , typename ST2 > // Type of the scalar value
6633  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6634  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6635  {
6637 
6638  for( size_t i=0UL; i<A.rows(); ++i ) {
6639  C(i,i) += A(i,i) * B(i,i) * scalar;
6640  }
6641  }
6642  //**********************************************************************************************
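  // Note: the three diagonal specializations above collapse the inner k loop to a single
  // term per element:
  //
  //    general/diagonal  :  C(i,j) += A(i,j) * B(j,j) * scalar
  //    diagonal/general  :  C(i,j) += A(i,i) * B(i,j) * scalar
  //    diagonal/diagonal :  C(i,i) += A(i,i) * B(i,i) * scalar
  //
  // In the first two kernels the j loop is unrolled by two; jpos = jbegin + ( jnum &
  // size_t(-2) ) marks the end of the unrolled part, followed by at most one scalar
  // tail update.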
6643 
6644  //**Default addition assignment to dense matrices (small matrices)******************************
6658  template< typename MT3 // Type of the left-hand side target matrix
6659  , typename MT4 // Type of the left-hand side matrix operand
6660  , typename MT5 // Type of the right-hand side matrix operand
6661  , typename ST2 > // Type of the scalar value
6662  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6663  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6664  {
6665  selectDefaultAddAssignKernel( C, A, B, scalar );
6666  }
6667  //**********************************************************************************************
6668 
6669  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
6684  template< typename MT3 // Type of the left-hand side target matrix
6685  , typename MT4 // Type of the left-hand side matrix operand
6686  , typename MT5 // Type of the right-hand side matrix operand
6687  , typename ST2 > // Type of the scalar value
6688  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6689  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6690  {
6691  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
6692 
6693  const size_t M( A.rows() );
6694  const size_t N( B.columns() );
6695  const size_t K( A.columns() );
6696 
6697  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6698 
6699  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
6700  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
6701 
6702  const SIMDType factor( set( scalar ) );
6703 
6704  size_t j( 0UL );
6705 
6706  if( IsIntegral_v<ElementType> )
6707  {
6708  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6709  for( size_t i=0UL; i<M; ++i )
6710  {
6711  const size_t kbegin( ( IsUpper_v<MT4> )
6712  ?( ( IsLower_v<MT5> )
6713  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6714  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6715  :( IsLower_v<MT5> ? j : 0UL ) );
6716  const size_t kend( ( IsLower_v<MT4> )
6717  ?( ( IsUpper_v<MT5> )
6718  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
6719  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
6720  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
6721 
6722  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6723 
6724  for( size_t k=kbegin; k<kend; ++k ) {
6725  const SIMDType a1( set( A(i,k) ) );
6726  xmm1 += a1 * B.load(k,j );
6727  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6728  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6729  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6730  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6731  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
6732  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
6733  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
6734  }
6735 
6736  C.store( i, j , C.load(i,j ) + xmm1 * factor );
6737  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
6738  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6739  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6740  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
6741  C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
6742  C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
6743  C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
6744  }
6745  }
6746  }
6747 
6748  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
6749  {
6750  size_t i( 0UL );
6751 
6752  for( ; (i+2UL) <= M; i+=2UL )
6753  {
6754  const size_t kbegin( ( IsUpper_v<MT4> )
6755  ?( ( IsLower_v<MT5> )
6756  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6757  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6758  :( IsLower_v<MT5> ? j : 0UL ) );
6759  const size_t kend( ( IsLower_v<MT4> )
6760  ?( ( IsUpper_v<MT5> )
6761  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
6762  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6763  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
6764 
6765  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6766 
6767  for( size_t k=kbegin; k<kend; ++k ) {
6768  const SIMDType a1( set( A(i ,k) ) );
6769  const SIMDType a2( set( A(i+1UL,k) ) );
6770  const SIMDType b1( B.load(k,j ) );
6771  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6772  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6773  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6774  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
6775  xmm1 += a1 * b1;
6776  xmm2 += a1 * b2;
6777  xmm3 += a1 * b3;
6778  xmm4 += a1 * b4;
6779  xmm5 += a1 * b5;
6780  xmm6 += a2 * b1;
6781  xmm7 += a2 * b2;
6782  xmm8 += a2 * b3;
6783  xmm9 += a2 * b4;
6784  xmm10 += a2 * b5;
6785  }
6786 
6787  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6788  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
6789  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6790  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
6791  C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
6792  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
6793  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
6794  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
6795  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
6796  C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
6797  }
6798 
6799  if( i < M )
6800  {
6801  const size_t kbegin( ( IsUpper_v<MT4> )
6802  ?( ( IsLower_v<MT5> )
6803  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6804  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6805  :( IsLower_v<MT5> ? j : 0UL ) );
6806  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
6807 
6808  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6809 
6810  for( size_t k=kbegin; k<kend; ++k ) {
6811  const SIMDType a1( set( A(i,k) ) );
6812  xmm1 += a1 * B.load(k,j );
6813  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6814  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6815  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6816  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6817  }
6818 
6819  C.store( i, j , C.load(i,j ) + xmm1 * factor );
6820  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
6821  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6822  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6823  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
6824  }
6825  }
6826 
6827  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6828  {
6829  size_t i( 0UL );
6830 
6831  for( ; (i+2UL) <= M; i+=2UL )
6832  {
6833  const size_t kbegin( ( IsUpper_v<MT4> )
6834  ?( ( IsLower_v<MT5> )
6835  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6836  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6837  :( IsLower_v<MT5> ? j : 0UL ) );
6838  const size_t kend( ( IsLower_v<MT4> )
6839  ?( ( IsUpper_v<MT5> )
6840  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
6841  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6842  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
6843 
6844  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6845 
6846  for( size_t k=kbegin; k<kend; ++k ) {
6847  const SIMDType a1( set( A(i ,k) ) );
6848  const SIMDType a2( set( A(i+1UL,k) ) );
6849  const SIMDType b1( B.load(k,j ) );
6850  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6851  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6852  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6853  xmm1 += a1 * b1;
6854  xmm2 += a1 * b2;
6855  xmm3 += a1 * b3;
6856  xmm4 += a1 * b4;
6857  xmm5 += a2 * b1;
6858  xmm6 += a2 * b2;
6859  xmm7 += a2 * b3;
6860  xmm8 += a2 * b4;
6861  }
6862 
6863  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6864  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
6865  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6866  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
6867  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
6868  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
6869  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
6870  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
6871  }
6872 
6873  if( i < M )
6874  {
6875  const size_t kbegin( ( IsUpper_v<MT4> )
6876  ?( ( IsLower_v<MT5> )
6877  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6878  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6879  :( IsLower_v<MT5> ? j : 0UL ) );
6880  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
6881 
6882  SIMDType xmm1, xmm2, xmm3, xmm4;
6883 
6884  for( size_t k=kbegin; k<kend; ++k ) {
6885  const SIMDType a1( set( A(i,k) ) );
6886  xmm1 += a1 * B.load(k,j );
6887  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6888  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6889  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6890  }
6891 
6892  C.store( i, j , C.load(i,j ) + xmm1 * factor );
6893  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
6894  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6895  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6896  }
6897  }
6898 
6899  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
6900  {
6901  size_t i( 0UL );
6902 
6903  for( ; (i+2UL) <= M; i+=2UL )
6904  {
6905  const size_t kbegin( ( IsUpper_v<MT4> )
6906  ?( ( IsLower_v<MT5> )
6907  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6908  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6909  :( IsLower_v<MT5> ? j : 0UL ) );
6910  const size_t kend( ( IsLower_v<MT4> )
6911  ?( ( IsUpper_v<MT5> )
6912  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
6913  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6914  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
6915 
6916  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6917 
6918  for( size_t k=kbegin; k<kend; ++k ) {
6919  const SIMDType a1( set( A(i ,k) ) );
6920  const SIMDType a2( set( A(i+1UL,k) ) );
6921  const SIMDType b1( B.load(k,j ) );
6922  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6923  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6924  xmm1 += a1 * b1;
6925  xmm2 += a1 * b2;
6926  xmm3 += a1 * b3;
6927  xmm4 += a2 * b1;
6928  xmm5 += a2 * b2;
6929  xmm6 += a2 * b3;
6930  }
6931 
6932  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6933  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
6934  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6935  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
6936  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
6937  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
6938  }
6939 
6940  if( i < M )
6941  {
6942  const size_t kbegin( ( IsUpper_v<MT4> )
6943  ?( ( IsLower_v<MT5> )
6944  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6945  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6946  :( IsLower_v<MT5> ? j : 0UL ) );
6947  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
6948 
6949  SIMDType xmm1, xmm2, xmm3;
6950 
6951  for( size_t k=kbegin; k<kend; ++k ) {
6952  const SIMDType a1( set( A(i,k) ) );
6953  xmm1 += a1 * B.load(k,j );
6954  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6955  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6956  }
6957 
6958  C.store( i, j , C.load(i,j ) + xmm1 * factor );
6959  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
6960  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6961  }
6962  }
6963 
6964  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6965  {
6966  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
6967  size_t i( LOW ? j : 0UL );
6968 
6969  for( ; (i+4UL) <= iend; i+=4UL )
6970  {
6971  const size_t kbegin( ( IsUpper_v<MT4> )
6972  ?( ( IsLower_v<MT5> )
6973  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6974  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6975  :( IsLower_v<MT5> ? j : 0UL ) );
6976  const size_t kend( ( IsLower_v<MT4> )
6977  ?( ( IsUpper_v<MT5> )
6978  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
6979  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
6980  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
6981 
6982  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6983 
6984  for( size_t k=kbegin; k<kend; ++k ) {
6985  const SIMDType a1( set( A(i ,k) ) );
6986  const SIMDType a2( set( A(i+1UL,k) ) );
6987  const SIMDType a3( set( A(i+2UL,k) ) );
6988  const SIMDType a4( set( A(i+3UL,k) ) );
6989  const SIMDType b1( B.load(k,j ) );
6990  const SIMDType b2( B.load(k,j+SIMDSIZE) );
6991  xmm1 += a1 * b1;
6992  xmm2 += a1 * b2;
6993  xmm3 += a2 * b1;
6994  xmm4 += a2 * b2;
6995  xmm5 += a3 * b1;
6996  xmm6 += a3 * b2;
6997  xmm7 += a4 * b1;
6998  xmm8 += a4 * b2;
6999  }
7000 
7001  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7002  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
7003  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
7004  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
7005  C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
7006  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
7007  C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
7008  C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
7009  }
7010 
7011  for( ; (i+3UL) <= iend; i+=3UL )
7012  {
7013  const size_t kbegin( ( IsUpper_v<MT4> )
7014  ?( ( IsLower_v<MT5> )
7015  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7016  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7017  :( IsLower_v<MT5> ? j : 0UL ) );
7018  const size_t kend( ( IsLower_v<MT4> )
7019  ?( ( IsUpper_v<MT5> )
7020  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
7021  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
7022  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
7023 
7024  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7025 
7026  for( size_t k=kbegin; k<kend; ++k ) {
7027  const SIMDType a1( set( A(i ,k) ) );
7028  const SIMDType a2( set( A(i+1UL,k) ) );
7029  const SIMDType a3( set( A(i+2UL,k) ) );
7030  const SIMDType b1( B.load(k,j ) );
7031  const SIMDType b2( B.load(k,j+SIMDSIZE) );
7032  xmm1 += a1 * b1;
7033  xmm2 += a1 * b2;
7034  xmm3 += a2 * b1;
7035  xmm4 += a2 * b2;
7036  xmm5 += a3 * b1;
7037  xmm6 += a3 * b2;
7038  }
7039 
7040  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7041  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
7042  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
7043  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
7044  C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
7045  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
7046  }
7047 
7048  for( ; (i+2UL) <= iend; i+=2UL )
7049  {
7050  const size_t kbegin( ( IsUpper_v<MT4> )
7051  ?( ( IsLower_v<MT5> )
7052  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7053  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7054  :( IsLower_v<MT5> ? j : 0UL ) );
7055  const size_t kend( ( IsLower_v<MT4> )
7056  ?( ( IsUpper_v<MT5> )
7057  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
7058  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7059  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
7060 
7061  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7062  size_t k( kbegin );
7063 
7064  for( ; (k+2UL) <= kend; k+=2UL ) {
7065  const SIMDType a1( set( A(i ,k ) ) );
7066  const SIMDType a2( set( A(i+1UL,k ) ) );
7067  const SIMDType a3( set( A(i ,k+1UL) ) );
7068  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
7069  const SIMDType b1( B.load(k ,j ) );
7070  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
7071  const SIMDType b3( B.load(k+1UL,j ) );
7072  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
7073  xmm1 += a1 * b1;
7074  xmm2 += a1 * b2;
7075  xmm3 += a2 * b1;
7076  xmm4 += a2 * b2;
7077  xmm5 += a3 * b3;
7078  xmm6 += a3 * b4;
7079  xmm7 += a4 * b3;
7080  xmm8 += a4 * b4;
7081  }
7082 
7083  for( ; k<kend; ++k ) {
7084  const SIMDType a1( set( A(i ,k) ) );
7085  const SIMDType a2( set( A(i+1UL,k) ) );
7086  const SIMDType b1( B.load(k,j ) );
7087  const SIMDType b2( B.load(k,j+SIMDSIZE) );
7088  xmm1 += a1 * b1;
7089  xmm2 += a1 * b2;
7090  xmm3 += a2 * b1;
7091  xmm4 += a2 * b2;
7092  }
7093 
7094  C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
7095  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + (xmm2+xmm6) * factor );
7096  C.store( i+1UL, j , C.load(i+1UL,j ) + (xmm3+xmm7) * factor );
7097  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + (xmm4+xmm8) * factor );
7098  }
7099 
7100  if( i < iend )
7101  {
7102  const size_t kbegin( ( IsUpper_v<MT4> )
7103  ?( ( IsLower_v<MT5> )
7104  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7105  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7106  :( IsLower_v<MT5> ? j : 0UL ) );
7107  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
7108 
7109  SIMDType xmm1, xmm2, xmm3, xmm4;
7110  size_t k( kbegin );
7111 
7112  for( ; (k+2UL) <= kend; k+=2UL ) {
7113  const SIMDType a1( set( A(i,k ) ) );
7114  const SIMDType a2( set( A(i,k+1UL) ) );
7115  xmm1 += a1 * B.load(k ,j );
7116  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
7117  xmm3 += a2 * B.load(k+1UL,j );
7118  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
7119  }
7120 
7121  for( ; k<kend; ++k ) {
7122  const SIMDType a1( set( A(i,k) ) );
7123  xmm1 += a1 * B.load(k,j );
7124  xmm2 += a1 * B.load(k,j+SIMDSIZE);
7125  }
7126 
7127  C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
7128  C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) + (xmm2+xmm4) * factor );
7129  }
7130  }
7131 
7132  for( ; j<jpos; j+=SIMDSIZE )
7133  {
7134  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
7135  size_t i( LOW ? j : 0UL );
7136 
7137  for( ; (i+4UL) <= iend; i+=4UL )
7138  {
7139  const size_t kbegin( ( IsUpper_v<MT4> )
7140  ?( ( IsLower_v<MT5> )
7141  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7142  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7143  :( IsLower_v<MT5> ? j : 0UL ) );
7144  const size_t kend( ( IsLower_v<MT4> )
7145  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
7146  :( K ) );
7147 
7148  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7149  size_t k( kbegin );
7150 
7151  for( ; (k+2UL) <= kend; k+=2UL ) {
7152  const SIMDType b1( B.load(k ,j) );
7153  const SIMDType b2( B.load(k+1UL,j) );
7154  xmm1 += set( A(i ,k ) ) * b1;
7155  xmm2 += set( A(i+1UL,k ) ) * b1;
7156  xmm3 += set( A(i+2UL,k ) ) * b1;
7157  xmm4 += set( A(i+3UL,k ) ) * b1;
7158  xmm5 += set( A(i ,k+1UL) ) * b2;
7159  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
7160  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
7161  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
7162  }
7163 
7164  for( ; k<kend; ++k ) {
7165  const SIMDType b1( B.load(k,j) );
7166  xmm1 += set( A(i ,k) ) * b1;
7167  xmm2 += set( A(i+1UL,k) ) * b1;
7168  xmm3 += set( A(i+2UL,k) ) * b1;
7169  xmm4 += set( A(i+3UL,k) ) * b1;
7170  }
7171 
7172  C.store( i , j, C.load(i ,j) + (xmm1+xmm5) * factor );
7173  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm6) * factor );
7174  C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm7) * factor );
7175  C.store( i+3UL, j, C.load(i+3UL,j) + (xmm4+xmm8) * factor );
7176  }
7177 
7178  for( ; (i+3UL) <= iend; i+=3UL )
7179  {
7180  const size_t kbegin( ( IsUpper_v<MT4> )
7181  ?( ( IsLower_v<MT5> )
7182  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7183  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7184  :( IsLower_v<MT5> ? j : 0UL ) );
7185  const size_t kend( ( IsLower_v<MT4> )
7186  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
7187  :( K ) );
7188 
7189  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7190  size_t k( kbegin );
7191 
7192  for( ; (k+2UL) <= kend; k+=2UL ) {
7193  const SIMDType b1( B.load(k ,j) );
7194  const SIMDType b2( B.load(k+1UL,j) );
7195  xmm1 += set( A(i ,k ) ) * b1;
7196  xmm2 += set( A(i+1UL,k ) ) * b1;
7197  xmm3 += set( A(i+2UL,k ) ) * b1;
7198  xmm4 += set( A(i ,k+1UL) ) * b2;
7199  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
7200  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
7201  }
7202 
7203  for( ; k<kend; ++k ) {
7204  const SIMDType b1( B.load(k,j) );
7205  xmm1 += set( A(i ,k) ) * b1;
7206  xmm2 += set( A(i+1UL,k) ) * b1;
7207  xmm3 += set( A(i+2UL,k) ) * b1;
7208  }
7209 
7210  C.store( i , j, C.load(i ,j) + (xmm1+xmm4) * factor );
7211  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm5) * factor );
7212  C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm6) * factor );
7213  }
7214 
7215  for( ; (i+2UL) <= iend; i+=2UL )
7216  {
7217  const size_t kbegin( ( IsUpper_v<MT4> )
7218  ?( ( IsLower_v<MT5> )
7219  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7220  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7221  :( IsLower_v<MT5> ? j : 0UL ) );
7222  const size_t kend( ( IsLower_v<MT4> )
7223  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7224  :( K ) );
7225 
7226  SIMDType xmm1, xmm2, xmm3, xmm4;
7227  size_t k( kbegin );
7228 
7229  for( ; (k+2UL) <= kend; k+=2UL ) {
7230  const SIMDType b1( B.load(k ,j) );
7231  const SIMDType b2( B.load(k+1UL,j) );
7232  xmm1 += set( A(i ,k ) ) * b1;
7233  xmm2 += set( A(i+1UL,k ) ) * b1;
7234  xmm3 += set( A(i ,k+1UL) ) * b2;
7235  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
7236  }
7237 
7238  for( ; k<kend; ++k ) {
7239  const SIMDType b1( B.load(k,j) );
7240  xmm1 += set( A(i ,k) ) * b1;
7241  xmm2 += set( A(i+1UL,k) ) * b1;
7242  }
7243 
7244  C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
7245  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm4) * factor );
7246  }
7247 
7248  if( i < iend )
7249  {
7250  const size_t kbegin( ( IsUpper_v<MT4> )
7251  ?( ( IsLower_v<MT5> )
7252  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7253  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7254  :( IsLower_v<MT5> ? j : 0UL ) );
7255 
7256  SIMDType xmm1, xmm2;
7257  size_t k( kbegin );
7258 
7259  for( ; (k+2UL) <= K; k+=2UL ) {
7260  xmm1 += set( A(i,k ) ) * B.load(k ,j);
7261  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
7262  }
7263 
7264  for( ; k<K; ++k ) {
7265  xmm1 += set( A(i,k) ) * B.load(k,j);
7266  }
7267 
7268  C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
7269  }
7270  }
7271 
7272  for( ; remainder && j<N; ++j )
7273  {
7274  const size_t iend( UPP ? j+1UL : M );
7275  size_t i( LOW ? j : 0UL );
7276 
7277  for( ; (i+2UL) <= iend; i+=2UL )
7278  {
7279  const size_t kbegin( ( IsUpper_v<MT4> )
7280  ?( ( IsLower_v<MT5> )
7281  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7282  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7283  :( IsLower_v<MT5> ? j : 0UL ) );
7284  const size_t kend( ( IsLower_v<MT4> )
7285  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7286  :( K ) );
7287 
7288  ElementType value1{};
7289  ElementType value2{};
7290 
7291  for( size_t k=kbegin; k<kend; ++k ) {
7292  value1 += A(i ,k) * B(k,j);
7293  value2 += A(i+1UL,k) * B(k,j);
7294  }
7295 
7296  C(i ,j) += value1 * scalar;
7297  C(i+1UL,j) += value2 * scalar;
7298  }
7299 
7300  if( i < iend )
7301  {
7302  const size_t kbegin( ( IsUpper_v<MT4> )
7303  ?( ( IsLower_v<MT5> )
7304  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7305  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7306  :( IsLower_v<MT5> ? j : 0UL ) );
7307 
7308  ElementType value{};
7309 
7310  for( size_t k=kbegin; k<K; ++k ) {
7311  value += A(i,k) * B(k,j);
7312  }
7313 
7314  C(i,j) += value * scalar;
7315  }
7316  }
7317  }
7318  //**********************************************************************************************
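  // Note: the row-major addition assignment kernel above mirrors the plain assignment
  // kernel but accumulates via C.load(...) + xmm * factor instead of overwriting, i.e.
  // per element it performs
  //
  //    C(i,j) += scalar * sum_k( A(i,k) * B(k,j) );
  //
  // For LOW/UPP targets only the stored triangle is visited; in contrast to the
  // assignment kernel no mirroring or resetting of the opposite triangle is required,
  // since the existing elements of C must be preserved.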
7319 
7320  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
7335  template< typename MT3 // Type of the left-hand side target matrix
7336  , typename MT4 // Type of the left-hand side matrix operand
7337  , typename MT5 // Type of the right-hand side matrix operand
7338  , typename ST2 > // Type of the scalar value
7339  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7340  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7341  {
7346 
7347  const ForwardFunctor fwd;
7348 
7349  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7350  const OppositeType_t<MT4> tmp( serial( A ) );
7351  addAssign( C, fwd( tmp * B ) * scalar );
7352  }
7353  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7354  const OppositeType_t<MT5> tmp( serial( B ) );
7355  addAssign( C, fwd( A * tmp ) * scalar );
7356  }
7357  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7358  const OppositeType_t<MT4> tmp( serial( A ) );
7359  addAssign( C, fwd( tmp * B ) * scalar );
7360  }
7361  else {
7362  const OppositeType_t<MT5> tmp( serial( B ) );
7363  addAssign( C, fwd( A * tmp ) * scalar );
7364  }
7365  }
7366  //**********************************************************************************************
7367 
7368  //**Default addition assignment to dense matrices (large matrices)******************************
7382  template< typename MT3 // Type of the left-hand side target matrix
7383  , typename MT4 // Type of the left-hand side matrix operand
7384  , typename MT5 // Type of the right-hand side matrix operand
7385  , typename ST2 > // Type of the scalar value
7386  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7387  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7388  {
7389  selectDefaultAddAssignKernel( C, A, B, scalar );
7390  }
7391  //**********************************************************************************************
7392 
7393  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
7408  template< typename MT3 // Type of the left-hand side target matrix
7409  , typename MT4 // Type of the left-hand side matrix operand
7410  , typename MT5 // Type of the right-hand side matrix operand
7411  , typename ST2 > // Type of the scalar value
7412  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7413  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7414  {
7415  if( LOW )
7416  lmmm( C, A, B, scalar, ST2(1) );
7417  else if( UPP )
7418  ummm( C, A, B, scalar, ST2(1) );
7419  else
7420  mmm( C, A, B, scalar, ST2(1) );
7421  }
7422  //**********************************************************************************************
7423 
7424  //**BLAS-based addition assignment to dense matrices (default)**********************************
7438  template< typename MT3 // Type of the left-hand side target matrix
7439  , typename MT4 // Type of the left-hand side matrix operand
7440  , typename MT5 // Type of the right-hand side matrix operand
7441  , typename ST2 > // Type of the scalar value
7442  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7443  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7444  {
7445  selectLargeAddAssignKernel( C, A, B, scalar );
7446  }
7447  //**********************************************************************************************
7448 
7449  //**BLAS-based addition assignment to dense matrices********************************************
7450 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7451 
7464  template< typename MT3 // Type of the left-hand side target matrix
7465  , typename MT4 // Type of the left-hand side matrix operand
7466  , typename MT5 // Type of the right-hand side matrix operand
7467  , typename ST2 > // Type of the scalar value
7468  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7469  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7470  {
7471  using ET = ElementType_t<MT3>;
7472 
7473  if( IsTriangular_v<MT4> ) {
7474  ResultType_t<MT3> tmp( serial( B ) );
7475  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7476  addAssign( C, tmp );
7477  }
7478  else if( IsTriangular_v<MT5> ) {
7479  ResultType_t<MT3> tmp( serial( A ) );
7480  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
7481  addAssign( C, tmp );
7482  }
7483  else {
7484  gemm( C, A, B, ET(scalar), ET(1) );
7485  }
7486  }
7487 #endif
7488  //**********************************************************************************************
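For orientation, a minimal usage sketch (illustrative only; it assumes <blaze/Math.h> is included, BLAS mode is enabled, and the operands are large enough to pass the kernel-selection thresholds). An addition assignment of a scaled dense product is then realized by a single gemm() call with alpha = scalar and beta = 1:

   blaze::DynamicMatrix<double> A( 256UL, 256UL ), B( 256UL, 256UL ), C( 256UL, 256UL );
   // ... initialize A, B, and C ...
   C += 2.0 * ( A * B );  // may be dispatched to the gemm()-based addition assignment kernel above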
7489 
7490  //**Restructuring addition assignment to column-major matrices**********************************
7504  template< typename MT > // Type of the target matrix
7505  friend inline auto addAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7506  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7507  {
7509 
7511 
7512  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7513  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7514 
7515  const ForwardFunctor fwd;
7516 
7517  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7518  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7519 
7520  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
7521  addAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
7522  else if( IsSymmetric_v<MT1> )
7523  addAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
7524  else
7525  addAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
7526  }
7527  //**********************************************************************************************
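A sketch of an expression that can take this restructuring path (illustrative only; it assumes <blaze/Math.h> is included). With a column-major target and symmetric operands, the product is rewritten in terms of transposed operands so that the row-major kernels can be reused:

   blaze::SymmetricMatrix< blaze::DynamicMatrix<double> > S1( 100UL ), S2( 100UL );
   blaze::DynamicMatrix<double,blaze::columnMajor> C( 100UL, 100UL );
   // ... initialize S1, S2, and C ...
   C += 2.0 * ( S1 * S2 );  // restructured as trans(S1) * trans(S2) by the overload above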
7528 
7529  //**Addition assignment to sparse matrices******************************************************
7530  // No special implementation for the addition assignment to sparse matrices.
7531  //**********************************************************************************************
7532 
7533  //**Subtraction assignment to dense matrices****************************************************
7545  template< typename MT // Type of the target dense matrix
7546  , bool SO > // Storage order of the target dense matrix
7547  friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7548  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7549  {
7551 
7552  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7553  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7554 
7555  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7556  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7557 
7558  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7559  return;
7560  }
7561 
7562  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7563  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7564 
7565  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7566  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7567  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7568  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7569  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7570  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7571 
7572  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
7573  }
7574  //**********************************************************************************************
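As a quick illustration (illustrative only; it assumes <blaze/Math.h> is included), the following subtraction assignment of a scaled dense product is handled by the subAssign() overload above once both operands have been evaluated:

   blaze::DynamicMatrix<double> A( 32UL, 32UL ), B( 32UL, 32UL ), C( 32UL, 32UL );
   // ... initialize A, B, and C ...
   C -= 2.0 * ( A * B );  // evaluated via selectSubAssignKernel()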
7575 
7576  //**Subtraction assignment to dense matrices (kernel selection)*********************************
7587  template< typename MT3 // Type of the left-hand side target matrix
7588  , typename MT4 // Type of the left-hand side matrix operand
7589  , typename MT5 // Type of the right-hand side matrix operand
7590  , typename ST2 > // Type of the scalar value
7591  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7592  {
7593  if( ( IsDiagonal_v<MT5> ) ||
7594  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
7595  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
7596  selectSmallSubAssignKernel( C, A, B, scalar );
7597  else
7598  selectBlasSubAssignKernel( C, A, B, scalar );
7599  }
7600  //**********************************************************************************************
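The heuristic above selects the small-matrix kernel whenever the right-hand side operand is diagonal, B has no more than SIMDSIZE*10 columns (in non-debug builds), or the total number of elements of the target falls below DMATDMATMULT_THRESHOLD; otherwise the BLAS-based kernel is chosen, which itself falls back to the large-matrix kernel if no suitable BLAS routine is available.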
7601 
7602  //**Default subtraction assignment to dense matrices (general/general)**************************
7616  template< typename MT3 // Type of the left-hand side target matrix
7617  , typename MT4 // Type of the left-hand side matrix operand
7618  , typename MT5 // Type of the right-hand side matrix operand
7619  , typename ST2 > // Type of the scalar value
7620  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7621  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7622  {
7623  const ResultType tmp( serial( A * B * scalar ) );
7624  subAssign( C, tmp );
7625  }
7626  //**********************************************************************************************
7627 
7628  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
7642  template< typename MT3 // Type of the left-hand side target matrix
7643  , typename MT4 // Type of the left-hand side matrix operand
7644  , typename MT5 // Type of the right-hand side matrix operand
7645  , typename ST2 > // Type of the scalar value
7646  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7647  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7648  {
7650 
7651  const size_t M( A.rows() );
7652  const size_t N( B.columns() );
7653 
7654  for( size_t i=0UL; i<M; ++i )
7655  {
7656  const size_t jbegin( ( IsUpper_v<MT4> )
7657  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7658  :( 0UL ) );
7659  const size_t jend( ( IsLower_v<MT4> )
7660  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
7661  :( N ) );
7662  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7663 
7664  const size_t jnum( jend - jbegin );
7665  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
7666 
7667  for( size_t j=jbegin; j<jpos; j+=2UL ) {
7668  C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7669  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
7670  }
7671  if( jpos < jend ) {
7672  C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
7673  }
7674  }
7675  }
7676  //**********************************************************************************************
7677 
7678  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
7692  template< typename MT3 // Type of the left-hand side target matrix
7693  , typename MT4 // Type of the left-hand side matrix operand
7694  , typename MT5 // Type of the right-hand side matrix operand
7695  , typename ST2 > // Type of the scalar value
7696  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7697  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7698  {
7700 
7701  const size_t M( A.rows() );
7702  const size_t N( B.columns() );
7703 
7704  for( size_t i=0UL; i<M; ++i )
7705  {
7706  const size_t jbegin( ( IsUpper_v<MT5> )
7707  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
7708  :( 0UL ) );
7709  const size_t jend( ( IsLower_v<MT5> )
7710  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
7711  :( N ) );
7712  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7713 
7714  const size_t jnum( jend - jbegin );
7715  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
7716 
7717  for( size_t j=jbegin; j<jpos; j+=2UL ) {
7718  C(i,j ) -= A(i,i) * B(i,j ) * scalar;
7719  C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
7720  }
7721  if( jpos < jend ) {
7722  C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
7723  }
7724  }
7725  }
7726  //**********************************************************************************************
7727 
7728  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
7742  template< typename MT3 // Type of the left-hand side target matrix
7743  , typename MT4 // Type of the left-hand side matrix operand
7744  , typename MT5 // Type of the right-hand side matrix operand
7745  , typename ST2 > // Type of the scalar value
7746  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7747  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7748  {
7750 
7751  for( size_t i=0UL; i<A.rows(); ++i ) {
7752  C(i,i) -= A(i,i) * B(i,i) * scalar;
7753  }
7754  }
7755  //**********************************************************************************************
7756 
7757  //**Default subtraction assignment to dense matrices (small matrices)***************************
7771  template< typename MT3 // Type of the left-hand side target matrix
7772  , typename MT4 // Type of the left-hand side matrix operand
7773  , typename MT5 // Type of the right-hand side matrix operand
7774  , typename ST2 > // Type of the scalar value
7775  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7776  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7777  {
7778  selectDefaultSubAssignKernel( C, A, B, scalar );
7779  }
7780  //**********************************************************************************************
7781 
7782  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
7797  template< typename MT3 // Type of the left-hand side target matrix
7798  , typename MT4 // Type of the left-hand side matrix operand
7799  , typename MT5 // Type of the right-hand side matrix operand
7800  , typename ST2 > // Type of the scalar value
7801  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7802  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7803  {
7804  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
7805 
7806  const size_t M( A.rows() );
7807  const size_t N( B.columns() );
7808  const size_t K( A.columns() );
7809 
7810  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7811 
7812  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
7813  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
7814 
7815  const SIMDType factor( set( scalar ) );
7816 
7817  size_t j( 0UL );
7818 
7819  if( IsIntegral_v<ElementType> )
7820  {
7821  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7822  for( size_t i=0UL; i<M; ++i )
7823  {
7824  const size_t kbegin( ( IsUpper_v<MT4> )
7825  ?( ( IsLower_v<MT5> )
7826  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7827  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7828  :( IsLower_v<MT5> ? j : 0UL ) );
7829  const size_t kend( ( IsLower_v<MT4> )
7830  ?( ( IsUpper_v<MT5> )
7831  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
7832  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7833  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
7834 
7835  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7836 
7837  for( size_t k=kbegin; k<kend; ++k ) {
7838  const SIMDType a1( set( A(i,k) ) );
7839  xmm1 += a1 * B.load(k,j );
7840  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7841  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7842  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7843  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7844  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7845  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7846  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
7847  }
7848 
7849  C.store( i, j , C.load(i,j ) - xmm1 * factor );
7850  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
7851  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7852  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7853  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
7854  C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
7855  C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
7856  C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
7857  }
7858  }
7859  }
7860 
7861  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7862  {
7863  size_t i( 0UL );
7864 
7865  for( ; (i+2UL) <= M; i+=2UL )
7866  {
7867  const size_t kbegin( ( IsUpper_v<MT4> )
7868  ?( ( IsLower_v<MT5> )
7869  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7870  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7871  :( IsLower_v<MT5> ? j : 0UL ) );
7872  const size_t kend( ( IsLower_v<MT4> )
7873  ?( ( IsUpper_v<MT5> )
7874  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
7875  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7876  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
7877 
7878  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7879 
7880  for( size_t k=kbegin; k<kend; ++k ) {
7881  const SIMDType a1( set( A(i ,k) ) );
7882  const SIMDType a2( set( A(i+1UL,k) ) );
7883  const SIMDType b1( B.load(k,j ) );
7884  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7885  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7886  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7887  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7888  xmm1 += a1 * b1;
7889  xmm2 += a1 * b2;
7890  xmm3 += a1 * b3;
7891  xmm4 += a1 * b4;
7892  xmm5 += a1 * b5;
7893  xmm6 += a2 * b1;
7894  xmm7 += a2 * b2;
7895  xmm8 += a2 * b3;
7896  xmm9 += a2 * b4;
7897  xmm10 += a2 * b5;
7898  }
7899 
7900  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7901  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
7902  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7903  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
7904  C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
7905  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
7906  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
7907  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
7908  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
7909  C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
7910  }
7911 
7912  if( i < M )
7913  {
7914  const size_t kbegin( ( IsUpper_v<MT4> )
7915  ?( ( IsLower_v<MT5> )
7916  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7917  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7918  :( IsLower_v<MT5> ? j : 0UL ) );
7919  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
7920 
7921  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7922 
7923  for( size_t k=kbegin; k<kend; ++k ) {
7924  const SIMDType a1( set( A(i,k) ) );
7925  xmm1 += a1 * B.load(k,j );
7926  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7927  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7928  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7929  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7930  }
7931 
7932  C.store( i, j , C.load(i,j ) - xmm1 * factor );
7933  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
7934  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7935  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7936  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
7937  }
7938  }
7939 
7940  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7941  {
7942  size_t i( 0UL );
7943 
7944  for( ; (i+2UL) <= M; i+=2UL )
7945  {
7946  const size_t kbegin( ( IsUpper_v<MT4> )
7947  ?( ( IsLower_v<MT5> )
7948  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7949  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7950  :( IsLower_v<MT5> ? j : 0UL ) );
7951  const size_t kend( ( IsLower_v<MT4> )
7952  ?( ( IsUpper_v<MT5> )
7953  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
7954  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7955  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
7956 
7957  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7958 
7959  for( size_t k=kbegin; k<kend; ++k ) {
7960  const SIMDType a1( set( A(i ,k) ) );
7961  const SIMDType a2( set( A(i+1UL,k) ) );
7962  const SIMDType b1( B.load(k,j ) );
7963  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7964  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7965  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7966  xmm1 += a1 * b1;
7967  xmm2 += a1 * b2;
7968  xmm3 += a1 * b3;
7969  xmm4 += a1 * b4;
7970  xmm5 += a2 * b1;
7971  xmm6 += a2 * b2;
7972  xmm7 += a2 * b3;
7973  xmm8 += a2 * b4;
7974  }
7975 
7976  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7977  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
7978  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7979  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
7980  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
7981  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
7982  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
7983  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
7984  }
7985 
7986  if( i < M )
7987  {
7988  const size_t kbegin( ( IsUpper_v<MT4> )
7989  ?( ( IsLower_v<MT5> )
7990  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7991  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7992  :( IsLower_v<MT5> ? j : 0UL ) );
7993  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
7994 
7995  SIMDType xmm1, xmm2, xmm3, xmm4;
7996 
7997  for( size_t k=kbegin; k<kend; ++k ) {
7998  const SIMDType a1( set( A(i,k) ) );
7999  xmm1 += a1 * B.load(k,j );
8000  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8001  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8002  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8003  }
8004 
8005  C.store( i, j , C.load(i,j ) - xmm1 * factor );
8006  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
8007  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
8008  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
8009  }
8010  }
8011 
8012  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8013  {
8014  size_t i( 0UL );
8015 
8016  for( ; (i+2UL) <= M; i+=2UL )
8017  {
8018  const size_t kbegin( ( IsUpper_v<MT4> )
8019  ?( ( IsLower_v<MT5> )
8020  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8021  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8022  :( IsLower_v<MT5> ? j : 0UL ) );
8023  const size_t kend( ( IsLower_v<MT4> )
8024  ?( ( IsUpper_v<MT5> )
8025  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
8026  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8027  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
8028 
8029  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8030 
8031  for( size_t k=kbegin; k<kend; ++k ) {
8032  const SIMDType a1( set( A(i ,k) ) );
8033  const SIMDType a2( set( A(i+1UL,k) ) );
8034  const SIMDType b1( B.load(k,j ) );
8035  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8036  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8037  xmm1 += a1 * b1;
8038  xmm2 += a1 * b2;
8039  xmm3 += a1 * b3;
8040  xmm4 += a2 * b1;
8041  xmm5 += a2 * b2;
8042  xmm6 += a2 * b3;
8043  }
8044 
8045  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8046  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
8047  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
8048  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
8049  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
8050  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
8051  }
8052 
8053  if( i < M )
8054  {
8055  const size_t kbegin( ( IsUpper_v<MT4> )
8056  ?( ( IsLower_v<MT5> )
8057  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8058  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8059  :( IsLower_v<MT5> ? j : 0UL ) );
8060  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
8061 
8062  SIMDType xmm1, xmm2, xmm3;
8063 
8064  for( size_t k=kbegin; k<kend; ++k ) {
8065  const SIMDType a1( set( A(i,k) ) );
8066  xmm1 += a1 * B.load(k,j );
8067  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8068  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8069  }
8070 
8071  C.store( i, j , C.load(i,j ) - xmm1 * factor );
8072  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
8073  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
8074  }
8075  }
8076 
8077  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8078  {
8079  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
8080  size_t i( LOW ? j : 0UL );
8081 
8082  for( ; (i+4UL) <= iend; i+=4UL )
8083  {
8084  const size_t kbegin( ( IsUpper_v<MT4> )
8085  ?( ( IsLower_v<MT5> )
8086  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8087  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8088  :( IsLower_v<MT5> ? j : 0UL ) );
8089  const size_t kend( ( IsLower_v<MT4> )
8090  ?( ( IsUpper_v<MT5> )
8091  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
8092  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
8093  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8094 
8095  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8096 
8097  for( size_t k=kbegin; k<kend; ++k ) {
8098  const SIMDType a1( set( A(i ,k) ) );
8099  const SIMDType a2( set( A(i+1UL,k) ) );
8100  const SIMDType a3( set( A(i+2UL,k) ) );
8101  const SIMDType a4( set( A(i+3UL,k) ) );
8102  const SIMDType b1( B.load(k,j ) );
8103  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8104  xmm1 += a1 * b1;
8105  xmm2 += a1 * b2;
8106  xmm3 += a2 * b1;
8107  xmm4 += a2 * b2;
8108  xmm5 += a3 * b1;
8109  xmm6 += a3 * b2;
8110  xmm7 += a4 * b1;
8111  xmm8 += a4 * b2;
8112  }
8113 
8114  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8115  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
8116  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
8117  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
8118  C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
8119  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
8120  C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
8121  C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
8122  }
8123 
8124  for( ; (i+3UL) <= iend; i+=3UL )
8125  {
8126  const size_t kbegin( ( IsUpper_v<MT4> )
8127  ?( ( IsLower_v<MT5> )
8128  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8129  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8130  :( IsLower_v<MT5> ? j : 0UL ) );
8131  const size_t kend( ( IsLower_v<MT4> )
8132  ?( ( IsUpper_v<MT5> )
8133  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
8134  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
8135  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8136 
8137  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8138 
8139  for( size_t k=kbegin; k<kend; ++k ) {
8140  const SIMDType a1( set( A(i ,k) ) );
8141  const SIMDType a2( set( A(i+1UL,k) ) );
8142  const SIMDType a3( set( A(i+2UL,k) ) );
8143  const SIMDType b1( B.load(k,j ) );
8144  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8145  xmm1 += a1 * b1;
8146  xmm2 += a1 * b2;
8147  xmm3 += a2 * b1;
8148  xmm4 += a2 * b2;
8149  xmm5 += a3 * b1;
8150  xmm6 += a3 * b2;
8151  }
8152 
8153  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8154  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
8155  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
8156  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
8157  C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
8158  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
8159  }
8160 
8161  for( ; (i+2UL) <= iend; i+=2UL )
8162  {
8163  const size_t kbegin( ( IsUpper_v<MT4> )
8164  ?( ( IsLower_v<MT5> )
8165  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8166  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8167  :( IsLower_v<MT5> ? j : 0UL ) );
8168  const size_t kend( ( IsLower_v<MT4> )
8169  ?( ( IsUpper_v<MT5> )
8170  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
8171  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8172  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8173 
8174  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8175  size_t k( kbegin );
8176 
8177  for( ; (k+2UL) <= kend; k+=2UL ) {
8178  const SIMDType a1( set( A(i ,k ) ) );
8179  const SIMDType a2( set( A(i+1UL,k ) ) );
8180  const SIMDType a3( set( A(i ,k+1UL) ) );
8181  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
8182  const SIMDType b1( B.load(k ,j ) );
8183  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
8184  const SIMDType b3( B.load(k+1UL,j ) );
8185  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
8186  xmm1 += a1 * b1;
8187  xmm2 += a1 * b2;
8188  xmm3 += a2 * b1;
8189  xmm4 += a2 * b2;
8190  xmm5 += a3 * b3;
8191  xmm6 += a3 * b4;
8192  xmm7 += a4 * b3;
8193  xmm8 += a4 * b4;
8194  }
8195 
8196  for( ; k<kend; ++k ) {
8197  const SIMDType a1( set( A(i ,k) ) );
8198  const SIMDType a2( set( A(i+1UL,k) ) );
8199  const SIMDType b1( B.load(k,j ) );
8200  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8201  xmm1 += a1 * b1;
8202  xmm2 += a1 * b2;
8203  xmm3 += a2 * b1;
8204  xmm4 += a2 * b2;
8205  }
8206 
8207  C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
8208  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - (xmm2+xmm6) * factor );
8209  C.store( i+1UL, j , C.load(i+1UL,j ) - (xmm3+xmm7) * factor );
8210  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - (xmm4+xmm8) * factor );
8211  }
8212 
8213  if( i < iend )
8214  {
8215  const size_t kbegin( ( IsUpper_v<MT4> )
8216  ?( ( IsLower_v<MT5> )
8217  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8218  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8219  :( IsLower_v<MT5> ? j : 0UL ) );
8220  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8221 
8222  SIMDType xmm1, xmm2, xmm3, xmm4;
8223  size_t k( kbegin );
8224 
8225  for( ; (k+2UL) <= kend; k+=2UL ) {
8226  const SIMDType a1( set( A(i,k ) ) );
8227  const SIMDType a2( set( A(i,k+1UL) ) );
8228  xmm1 += a1 * B.load(k ,j );
8229  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
8230  xmm3 += a2 * B.load(k+1UL,j );
8231  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
8232  }
8233 
8234  for( ; k<kend; ++k ) {
8235  const SIMDType a1( set( A(i,k) ) );
8236  xmm1 += a1 * B.load(k,j );
8237  xmm2 += a1 * B.load(k,j+SIMDSIZE);
8238  }
8239 
8240  C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
8241  C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) - (xmm2+xmm4) * factor );
8242  }
8243  }
8244 
8245  for( ; j<jpos; j+=SIMDSIZE )
8246  {
8247  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
8248  size_t i( LOW ? j : 0UL );
8249 
8250  for( ; (i+4UL) <= iend; i+=4UL )
8251  {
8252  const size_t kbegin( ( IsUpper_v<MT4> )
8253  ?( ( IsLower_v<MT5> )
8254  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8255  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8256  :( IsLower_v<MT5> ? j : 0UL ) );
8257  const size_t kend( ( IsLower_v<MT4> )
8258  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8259  :( K ) );
8260 
8261  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8262  size_t k( kbegin );
8263 
8264  for( ; (k+2UL) <= kend; k+=2UL ) {
8265  const SIMDType b1( B.load(k ,j) );
8266  const SIMDType b2( B.load(k+1UL,j) );
8267  xmm1 += set( A(i ,k ) ) * b1;
8268  xmm2 += set( A(i+1UL,k ) ) * b1;
8269  xmm3 += set( A(i+2UL,k ) ) * b1;
8270  xmm4 += set( A(i+3UL,k ) ) * b1;
8271  xmm5 += set( A(i ,k+1UL) ) * b2;
8272  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
8273  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
8274  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
8275  }
8276 
8277  for( ; k<kend; ++k ) {
8278  const SIMDType b1( B.load(k,j) );
8279  xmm1 += set( A(i ,k) ) * b1;
8280  xmm2 += set( A(i+1UL,k) ) * b1;
8281  xmm3 += set( A(i+2UL,k) ) * b1;
8282  xmm4 += set( A(i+3UL,k) ) * b1;
8283  }
8284 
8285  C.store( i , j, C.load(i ,j) - (xmm1+xmm5) * factor );
8286  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm6) * factor );
8287  C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm7) * factor );
8288  C.store( i+3UL, j, C.load(i+3UL,j) - (xmm4+xmm8) * factor );
8289  }
8290 
8291  for( ; (i+3UL) <= iend; i+=3UL )
8292  {
8293  const size_t kbegin( ( IsUpper_v<MT4> )
8294  ?( ( IsLower_v<MT5> )
8295  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8296  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8297  :( IsLower_v<MT5> ? j : 0UL ) );
8298  const size_t kend( ( IsLower_v<MT4> )
8299  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
8300  :( K ) );
8301 
8302  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8303  size_t k( kbegin );
8304 
8305  for( ; (k+2UL) <= kend; k+=2UL ) {
8306  const SIMDType b1( B.load(k ,j) );
8307  const SIMDType b2( B.load(k+1UL,j) );
8308  xmm1 += set( A(i ,k ) ) * b1;
8309  xmm2 += set( A(i+1UL,k ) ) * b1;
8310  xmm3 += set( A(i+2UL,k ) ) * b1;
8311  xmm4 += set( A(i ,k+1UL) ) * b2;
8312  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
8313  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
8314  }
8315 
8316  for( ; k<kend; ++k ) {
8317  const SIMDType b1( B.load(k,j) );
8318  xmm1 += set( A(i ,k) ) * b1;
8319  xmm2 += set( A(i+1UL,k) ) * b1;
8320  xmm3 += set( A(i+2UL,k) ) * b1;
8321  }
8322 
8323  C.store( i , j, C.load(i ,j) - (xmm1+xmm4) * factor );
8324  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm5) * factor );
8325  C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm6) * factor );
8326  }
8327 
8328  for( ; (i+2UL) <= iend; i+=2UL )
8329  {
8330  const size_t kbegin( ( IsUpper_v<MT4> )
8331  ?( ( IsLower_v<MT5> )
8332  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8333  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8334  :( IsLower_v<MT5> ? j : 0UL ) );
8335  const size_t kend( ( IsLower_v<MT4> )
8336  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8337  :( K ) );
8338 
8339  SIMDType xmm1, xmm2, xmm3, xmm4;
8340  size_t k( kbegin );
8341 
8342  for( ; (k+2UL) <= kend; k+=2UL ) {
8343  const SIMDType b1( B.load(k ,j) );
8344  const SIMDType b2( B.load(k+1UL,j) );
8345  xmm1 += set( A(i ,k ) ) * b1;
8346  xmm2 += set( A(i+1UL,k ) ) * b1;
8347  xmm3 += set( A(i ,k+1UL) ) * b2;
8348  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
8349  }
8350 
8351  for( ; k<kend; ++k ) {
8352  const SIMDType b1( B.load(k,j) );
8353  xmm1 += set( A(i ,k) ) * b1;
8354  xmm2 += set( A(i+1UL,k) ) * b1;
8355  }
8356 
8357  C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
8358  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm4) * factor );
8359  }
8360 
8361  if( i < iend )
8362  {
8363  const size_t kbegin( ( IsUpper_v<MT4> )
8364  ?( ( IsLower_v<MT5> )
8365  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8366  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8367  :( IsLower_v<MT5> ? j : 0UL ) );
8368 
8369  SIMDType xmm1, xmm2;
8370  size_t k( kbegin );
8371 
8372  for( ; (k+2UL) <= K; k+=2UL ) {
8373  xmm1 += set( A(i,k ) ) * B.load(k ,j);
8374  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
8375  }
8376 
8377  for( ; k<K; ++k ) {
8378  xmm1 += set( A(i,k) ) * B.load(k,j);
8379  }
8380 
8381  C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
8382  }
8383  }
8384 
8385  for( ; remainder && j<N; ++j )
8386  {
8387  const size_t iend( UPP ? j+1UL : M );
8388  size_t i( LOW ? j : 0UL );
8389 
8390  for( ; (i+2UL) <= iend; i+=2UL )
8391  {
8392  const size_t kbegin( ( IsUpper_v<MT4> )
8393  ?( ( IsLower_v<MT5> )
8394  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8395  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8396  :( IsLower_v<MT5> ? j : 0UL ) );
8397  const size_t kend( ( IsLower_v<MT4> )
8398  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8399  :( K ) );
8400 
8401  ElementType value1{};
8402  ElementType value2{};
8403 
8404  for( size_t k=kbegin; k<kend; ++k ) {
8405  value1 += A(i ,k) * B(k,j);
8406  value2 += A(i+1UL,k) * B(k,j);
8407  }
8408 
8409  C(i ,j) -= value1 * scalar;
8410  C(i+1UL,j) -= value2 * scalar;
8411  }
8412 
8413  if( i < iend )
8414  {
8415  const size_t kbegin( ( IsUpper_v<MT4> )
8416  ?( ( IsLower_v<MT5> )
8417  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8418  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8419  :( IsLower_v<MT5> ? j : 0UL ) );
8420 
8421  ElementType value{};
8422 
8423  for( size_t k=kbegin; k<K; ++k ) {
8424  value += A(i,k) * B(k,j);
8425  }
8426 
8427  C(i,j) -= value * scalar;
8428  }
8429  }
8430  }
8431  //**********************************************************************************************
8432 
8434  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
8448  template< typename MT3 // Type of the left-hand side target matrix
8449  , typename MT4 // Type of the left-hand side matrix operand
8450  , typename MT5 // Type of the right-hand side matrix operand
8451  , typename ST2 > // Type of the scalar value
8452  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8453  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8454  {
8459 
8460  const ForwardFunctor fwd;
8461 
8462  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
8463  const OppositeType_t<MT4> tmp( serial( A ) );
8464  subAssign( C, fwd( tmp * B ) * scalar );
8465  }
8466  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
8467  const OppositeType_t<MT5> tmp( serial( B ) );
8468  subAssign( C, fwd( A * tmp ) * scalar );
8469  }
8470  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
8471  const OppositeType_t<MT4> tmp( serial( A ) );
8472  subAssign( C, fwd( tmp * B ) * scalar );
8473  }
8474  else {
8475  const OppositeType_t<MT5> tmp( serial( B ) );
8476  subAssign( C, fwd( A * tmp ) * scalar );
8477  }
8478  }
8479  //**********************************************************************************************
8480 
8481  //**Default subtraction assignment to dense matrices (large matrices)***************************
8495  template< typename MT3 // Type of the left-hand side target matrix
8496  , typename MT4 // Type of the left-hand side matrix operand
8497  , typename MT5 // Type of the right-hand side matrix operand
8498  , typename ST2 > // Type of the scalar value
8499  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8500  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8501  {
8502  selectDefaultSubAssignKernel( C, A, B, scalar );
8503  }
8504  //**********************************************************************************************
8505 
8506  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
8521  template< typename MT3 // Type of the left-hand side target matrix
8522  , typename MT4 // Type of the left-hand side matrix operand
8523  , typename MT5 // Type of the right-hand side matrix operand
8524  , typename ST2 > // Type of the scalar value
8525  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8526  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8527  {
8528  if( LOW )
8529  lmmm( C, A, B, -scalar, ST2(1) );
8530  else if( UPP )
8531  ummm( C, A, B, -scalar, ST2(1) );
8532  else
8533  mmm( C, A, B, -scalar, ST2(1) );
8534  }
8535  //**********************************************************************************************
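Note that the large-matrix kernels realize the subtraction by negating the scaling factor, i.e. C -= scalar*(A*B) is evaluated as C += (-scalar)*(A*B), which allows the same blocked mmm(), lmmm(), and ummm() routines to be reused for both addition and subtraction assignment.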
8536 
8537  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
8551  template< typename MT3 // Type of the left-hand side target matrix
8552  , typename MT4 // Type of the left-hand side matrix operand
8553  , typename MT5 // Type of the right-hand side matrix operand
8554  , typename ST2 > // Type of the scalar value
8555  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8556  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8557  {
8558  selectLargeSubAssignKernel( C, A, B, scalar );
8559  }
8560  //**********************************************************************************************
8561 
 8562  //**BLAS-based subtraction assignment to dense matrices*****************************************
8563 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
8564 
8577  template< typename MT3 // Type of the left-hand side target matrix
8578  , typename MT4 // Type of the left-hand side matrix operand
8579  , typename MT5 // Type of the right-hand side matrix operand
8580  , typename ST2 > // Type of the scalar value
8581  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8582  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8583  {
8584  using ET = ElementType_t<MT3>;
8585 
8586  if( IsTriangular_v<MT4> ) {
8587  ResultType_t<MT3> tmp( serial( B ) );
8588  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
8589  subAssign( C, tmp );
8590  }
8591  else if( IsTriangular_v<MT5> ) {
8592  ResultType_t<MT3> tmp( serial( A ) );
8593  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
8594  subAssign( C, tmp );
8595  }
8596  else {
8597  gemm( C, A, B, ET(-scalar), ET(1) );
8598  }
8599  }
8600 #endif
8601  //**********************************************************************************************
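Analogously to the addition assignment path, the general case above maps to a single gemm() call with alpha = -scalar and beta = 1; the triangular cases first form the scaled product in a temporary via trmm() and subsequently subtract that temporary from C.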
8602 
8603  //**Restructuring subtraction assignment to column-major matrices*******************************
8617  template< typename MT > // Type of the target matrix
8618  friend inline auto subAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8619  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8620  {
8622 
8624 
8625  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8626  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8627 
8628  const ForwardFunctor fwd;
8629 
8630  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8631  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8632 
8633  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8634  subAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8635  else if( IsSymmetric_v<MT1> )
8636  subAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8637  else
8638  subAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8639  }
8640  //**********************************************************************************************
8641 
8642  //**Subtraction assignment to sparse matrices***************************************************
8643  // No special implementation for the subtraction assignment to sparse matrices.
8644  //**********************************************************************************************
8645 
8646  //**Schur product assignment to dense matrices**************************************************
8658  template< typename MT // Type of the target dense matrix
8659  , bool SO > // Storage order of the target dense matrix
8660  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8661  {
8663 
8667 
8668  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8669  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8670 
8671  const ResultType tmp( serial( rhs ) );
8672  schurAssign( ~lhs, tmp );
8673  }
8674  //**********************************************************************************************
8675 
8676  //**Schur product assignment to sparse matrices*************************************************
8677  // No special implementation for the Schur product assignment to sparse matrices.
8678  //**********************************************************************************************
8679 
8680  //**Multiplication assignment to dense matrices*************************************************
8681  // No special implementation for the multiplication assignment to dense matrices.
8682  //**********************************************************************************************
8683 
8684  //**Multiplication assignment to sparse matrices************************************************
8685  // No special implementation for the multiplication assignment to sparse matrices.
8686  //**********************************************************************************************
8687 
8688  //**SMP assignment to dense matrices************************************************************
8703  template< typename MT // Type of the target dense matrix
8704  , bool SO > // Storage order of the target dense matrix
8705  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8706  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8707  {
8709 
8710  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8711  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8712 
8713  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8714  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8715 
8716  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
8717  return;
8718  }
8719  else if( left.columns() == 0UL ) {
8720  reset( ~lhs );
8721  return;
8722  }
8723 
8724  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8725  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8726 
8727  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8728  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8729  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8730  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8731  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8732  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8733 
8734  smpAssign( ~lhs, A * B * rhs.scalar_ );
8735  }
8736  //**********************************************************************************************
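A minimal sketch of when this overload applies (illustrative only; it assumes <blaze/Math.h> is included and shared-memory parallelization is enabled in the configuration). Plain assignment of a scaled dense product to a dense target is forwarded to smpAssign(), which distributes the evaluation according to the configured parallelization backend:

   blaze::DynamicMatrix<double> A( 512UL, 512UL ), B( 512UL, 512UL ), C( 512UL, 512UL );
   // ... initialize A and B ...
   C = 2.0 * ( A * B );  // evaluated in parallel via the smpAssign() overload above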
8737 
8738  //**SMP assignment to sparse matrices***********************************************************
8753  template< typename MT // Type of the target sparse matrix
8754  , bool SO > // Storage order of the target sparse matrix
8755  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8756  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8757  {
8759 
8760  using TmpType = If_t< SO, OppositeType, ResultType >;
8761 
8768 
8769  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8770  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8771 
8772  const ForwardFunctor fwd;
8773 
8774  const TmpType tmp( rhs );
8775  smpAssign( ~lhs, fwd( tmp ) );
8776  }
8777  //**********************************************************************************************
8778 
8779  //**Restructuring SMP assignment to column-major matrices***************************************
8793  template< typename MT > // Type of the target matrix
8794  friend inline auto smpAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8795  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8796  {
8798 
8800 
8801  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8802  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8803 
8804  const ForwardFunctor fwd;
8805 
8806  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8807  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8808 
8809  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8810  smpAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8811  else if( IsSymmetric_v<MT1> )
8812  smpAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8813  else
8814  smpAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8815  }
8816  //**********************************************************************************************
8817 
8818  //**SMP addition assignment to dense matrices***************************************************
8833  template< typename MT // Type of the target dense matrix
8834  , bool SO > // Storage order of the target dense matrix
8835  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8836  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8837  {
8839 
8840  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8841  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8842 
8843  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8844  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8845 
8846  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8847  return;
8848  }
8849 
8850  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8851  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8852 
8853  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8854  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8855  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8856  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8857  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8858  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8859 
8860  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
8861  }
8862  //**********************************************************************************************
8863 
8864  //**Restructuring SMP addition assignment to column-major matrices******************************
8878  template< typename MT > // Type of the target matrix
8879  friend inline auto smpAddAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8880  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8881  {
8883 
8885 
8886  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8887  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8888 
8889  const ForwardFunctor fwd;
8890 
8891  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8892  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8893 
8894  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8895  smpAddAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8896  else if( IsSymmetric_v<MT1> )
8897  smpAddAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8898  else
8899  smpAddAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8900  }
8901  //**********************************************************************************************
8902 
8903  //**SMP addition assignment to sparse matrices**************************************************
8904  // No special implementation for the SMP addition assignment to sparse matrices.
8905  //**********************************************************************************************
8906 
8907  //**SMP subtraction assignment to dense matrices************************************************
8922  template< typename MT // Type of the target dense matrix
8923  , bool SO > // Storage order of the target dense matrix
8924  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8925  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8926  {
8928 
8929  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8930  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8931 
8932  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8933  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8934 
8935  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8936  return;
8937  }
8938 
8939  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8940  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8941 
8942  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8943  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8944  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8945  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8946  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8947  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8948 
8949  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
8950  }
8951  //**********************************************************************************************
8952 
8953  //**Restructuring SMP subtraction assignment to column-major matrices***************************
8967  template< typename MT > // Type of the target matrix
8968  friend inline auto smpSubAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8969  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8970  {
8972 
8974 
8975  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8976  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8977 
8978  const ForwardFunctor fwd;
8979 
8980  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8981  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8982 
8983  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8984  smpSubAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8985  else if( IsSymmetric_v<MT1> )
8986  smpSubAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8987  else
8988  smpSubAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8989  }
8990  //**********************************************************************************************
8991 
8992  //**SMP subtraction assignment to sparse matrices***********************************************
8993  // No special implementation for the SMP subtraction assignment to sparse matrices.
8994  //**********************************************************************************************
8995 
8996  //**SMP Schur product assignment to dense matrices**********************************************
9008  template< typename MT // Type of the target dense matrix
9009  , bool SO > // Storage order of the target dense matrix
9010  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9011  {
9013 
9017 
9018  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9019  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9020 
9021  const ResultType tmp( rhs );
9022  smpSchurAssign( ~lhs, tmp );
9023  }
9024  //**********************************************************************************************
9025 
9026  //**SMP Schur product assignment to sparse matrices*********************************************
9027  // No special implementation for the SMP Schur product assignment to sparse matrices.
9028  //**********************************************************************************************
9029 
9030  //**SMP multiplication assignment to dense matrices*********************************************
9031  // No special implementation for the SMP multiplication assignment to dense matrices.
9032  //**********************************************************************************************
9033 
9034  //**SMP multiplication assignment to sparse matrices********************************************
9035  // No special implementation for the SMP multiplication assignment to sparse matrices.
9036  //**********************************************************************************************
9037 
9038  //**Compile time checks*************************************************************************
9047  //**********************************************************************************************
9048 };
9050 //*************************************************************************************************
9051 
9052 
9053 
9054 
9055 //=================================================================================================
9056 //
9057 // GLOBAL BINARY ARITHMETIC OPERATORS
9058 //
9059 //=================================================================================================
9060 
9061 //*************************************************************************************************
9088 template< typename MT1 // Type of the left-hand side dense matrix
9089  , typename MT2 > // Type of the right-hand side dense matrix
9090 inline decltype(auto)
9091  operator*( const DenseMatrix<MT1,false>& lhs, const DenseMatrix<MT2,false>& rhs )
9092 {
9094 
9095  if( (~lhs).columns() != (~rhs).rows() ) {
9096  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
9097  }
9098 
9099  using ReturnType = const DMatDMatMultExpr<MT1,MT2,false,false,false,false>;
9100  return ReturnType( ~lhs, ~rhs );
9101 }
9102 //*************************************************************************************************
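A short usage example (illustrative only; it assumes <blaze/Math.h> is included). The multiplication operator returns a lazy DMatDMatMultExpr that is evaluated only upon assignment to a target matrix; mismatching inner dimensions are rejected with an invalid-argument exception:

   blaze::DynamicMatrix<double> A( 3UL, 4UL ), B( 4UL, 6UL );
   // ... initialize A and B ...
   blaze::DynamicMatrix<double> C( A * B );  // 3x6 result, evaluated on construction of C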
9103 
9104 
9105 
9106 
9107 //=================================================================================================
9108 //
9109 // GLOBAL FUNCTIONS
9110 //
9111 //=================================================================================================
9112 
9113 //*************************************************************************************************
9136 template< typename MT1 // Type of the left-hand side dense matrix
9137  , typename MT2 // Type of the right-hand side dense matrix
9138  , bool SF // Symmetry flag
9139  , bool HF // Hermitian flag
9140  , bool LF // Lower flag
9141  , bool UF > // Upper flag
9142 inline decltype(auto) declsym( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9143 {
9145 
9146  if( !isSquare( dm ) ) {
9147  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
9148  }
9149 
9150  using ReturnType = const DMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
9151  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9152 }
9154 //*************************************************************************************************
9155 
9156 
9157 //*************************************************************************************************
9180 template< typename MT1 // Type of the left-hand side dense matrix
9181  , typename MT2 // Type of the right-hand side dense matrix
9182  , bool SF // Symmetry flag
9183  , bool HF // Hermitian flag
9184  , bool LF // Lower flag
9185  , bool UF > // Upper flag
9186 inline decltype(auto) declherm( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9187 {
9189 
9190  if( !isSquare( dm ) ) {
9191  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
9192  }
9193 
9194  using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9195  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9196 }
9198 //*************************************************************************************************
9199 
9200 
9201 //*************************************************************************************************
9224 template< typename MT1 // Type of the left-hand side dense matrix
9225  , typename MT2 // Type of the right-hand side dense matrix
9226  , bool SF // Symmetry flag
9227  , bool HF // Hermitian flag
9228  , bool LF // Lower flag
9229  , bool UF > // Upper flag
9230 inline decltype(auto) decllow( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9231 {
9233 
9234  if( !isSquare( dm ) ) {
9235  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9236  }
9237 
9238  using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9239  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9240 }
9242 //*************************************************************************************************
9243 
9244 
9245 //*************************************************************************************************
9268 template< typename MT1 // Type of the left-hand side dense matrix
9269  , typename MT2 // Type of the right-hand side dense matrix
9270  , bool SF // Symmetry flag
9271  , bool HF // Hermitian flag
9272  , bool LF // Lower flag
9273  , bool UF > // Upper flag
9274 inline decltype(auto) declupp( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9275 {
9277 
9278  if( !isSquare( dm ) ) {
9279  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9280  }
9281 
9282  using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9283  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9284 }
9286 //*************************************************************************************************
9287 
9288 
9289 //*************************************************************************************************
9312 template< typename MT1 // Type of the left-hand side dense matrix
9313  , typename MT2 // Type of the right-hand side dense matrix
9314  , bool SF // Symmetry flag
9315  , bool HF // Hermitian flag
9316  , bool LF // Lower flag
9317  , bool UF > // Upper flag
9318 inline decltype(auto) decldiag( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9319 {
9321 
9322  if( !isSquare( dm ) ) {
9323  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
9324  }
9325 
9326  using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
9327  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9328 }
9330 //*************************************************************************************************
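Usage sketch for the decl* functions above (illustration only, not part of the header): each
function re-tags the product expression with the corresponding symmetry/Hermitian/lower/upper
flags so that specialized kernels and matrix adaptors can exploit the declared property. The
operands must yield a square product (otherwise a std::invalid_argument exception is thrown),
and it is the caller's responsibility that the result actually has the declared property. The
matrix names and sizes below are assumptions made for the example.

   #include <blaze/Math.h>

   void declExample()
   {
      blaze::DynamicMatrix<double> A( 50UL, 50UL ), B( 50UL, 50UL );
      // ... initialize A and B such that the declared properties below actually hold ...

      blaze::SymmetricMatrix< blaze::DynamicMatrix<double> > S;
      S = blaze::declsym( A * B );    // product declared symmetric

      blaze::LowerMatrix< blaze::DynamicMatrix<double> > L;
      L = blaze::decllow( A * B );    // product declared lower triangular

      blaze::DiagonalMatrix< blaze::DynamicMatrix<double> > D;
      D = blaze::decldiag( A * B );   // product declared diagonal
   }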
9331 
9332 
9333 
9334 
9335 //=================================================================================================
9336 //
9337 // SIZE SPECIALIZATIONS
9338 //
9339 //=================================================================================================
9340 
9341 //*************************************************************************************************
9343 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9344 struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
9345  : public Size<MT1,0UL>
9346 {};
9347 
9348 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9349 struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
9350  : public Size<MT2,1UL>
9351 {};
9353 //*************************************************************************************************
9354 
9355 
9356 
9357 
9358 //=================================================================================================
9359 //
9360 // ISALIGNED SPECIALIZATIONS
9361 //
9362 //=================================================================================================
9363 
9364 //*************************************************************************************************
9366 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9367 struct IsAligned< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9368  : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
9369 {};
9371 //*************************************************************************************************
9372 
9373 } // namespace blaze
9374 
9375 #endif
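Compile-time sketch of the Size and IsAligned specializations above (illustration only, not part
of the header): the number of rows of the product expression is taken from the left operand, the
number of columns from the right operand, and the expression counts as aligned exactly when both
operands are aligned. The static matrix types are assumptions made for the example.

   #include <blaze/Math.h>

   using MT1  = blaze::StaticMatrix<double,3UL,4UL>;
   using MT2  = blaze::StaticMatrix<double,4UL,5UL>;
   using Expr = blaze::DMatDMatMultExpr<MT1,MT2,false,false,false,false>;  // type built by operator*

   static_assert( blaze::Size<Expr,0UL>::value == 3L, "rows are inherited from Size<MT1,0>" );
   static_assert( blaze::Size<Expr,1UL>::value == 5L, "columns are inherited from Size<MT2,1>" );
   static_assert( blaze::IsAligned<Expr>::value ==
                  ( blaze::IsAligned<MT1>::value && blaze::IsAligned<MT2>::value ),
                  "the expression is aligned iff both operands are aligned" );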