Blaze  3.6
TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/dense/MMM.h>
53 #include <blaze/math/Exception.h>
66 #include <blaze/math/shims/Reset.h>
68 #include <blaze/math/SIMD.h>
97 #include <blaze/math/views/Check.h>
98 #include <blaze/system/BLAS.h>
99 #include <blaze/system/Blocking.h>
100 #include <blaze/system/Debugging.h>
102 #include <blaze/system/Thresholds.h>
105 #include <blaze/util/Assert.h>
106 #include <blaze/util/Complex.h>
109 #include <blaze/util/DisableIf.h>
110 #include <blaze/util/EnableIf.h>
113 #include <blaze/util/mpl/If.h>
114 #include <blaze/util/Types.h>
122 
123 
124 namespace blaze {
125 
126 //=================================================================================================
127 //
128 // CLASS TDMATDMATMULTEXPR
129 //
130 //=================================================================================================
131 
132 //*************************************************************************************************
139 template< typename MT1 // Type of the left-hand side dense matrix
140  , typename MT2 // Type of the right-hand side dense matrix
141  , bool SF // Symmetry flag
142  , bool HF // Hermitian flag
143  , bool LF // Lower flag
144  , bool UF > // Upper flag
145 class TDMatDMatMultExpr
146  : public MatMatMultExpr< DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
147  , private Computation
148 {
149  private:
150  //**Type definitions****************************************************************************
157  //**********************************************************************************************
158 
159  //**********************************************************************************************
161  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
162  //**********************************************************************************************
163 
164  //**********************************************************************************************
166  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
167  //**********************************************************************************************
168 
169  //**********************************************************************************************
170  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
171  static constexpr bool HERM = ( HF && !( LF || UF ) );
172  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
173  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
174  //**********************************************************************************************
175 
176  //**********************************************************************************************
178 
// Helper variable template. As written here its value does not depend on T1, T2 or T3:
// it is true whenever at least one of the two operands has to be evaluated.
182  template< typename T1, typename T2, typename T3 >
183  static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
185  //**********************************************************************************************
186 
187  //**********************************************************************************************
189 
// Helper variable template: true if all three matrix types (T1 = target, T2/T3 = operands,
// judging by the required mutable/const data access) qualify for a BLAS kernel.
// Requirements visible below: BLAS mode and BLAS matrix/matrix multiplication are enabled,
// none of the structure flags (SYM/HERM/LOW/UPP) is set, all types are contiguous with
// direct data access, neither operand is diagonal, all types are SIMD enabled and all
// three element types are the same BLAS compatible type.
192  template< typename T1, typename T2, typename T3 >
193  static constexpr bool UseBlasKernel_v =
194  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
195  !SYM && !HERM && !LOW && !UPP &&
196  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
197  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
198  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
199  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
200  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
201  IsBLASCompatible_v< ElementType_t<T1> > &&
202  IsBLASCompatible_v< ElementType_t<T2> > &&
203  IsBLASCompatible_v< ElementType_t<T3> > &&
204  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
205  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
207  //**********************************************************************************************
208 
209  //**********************************************************************************************
211 
// Helper variable template: true if the vectorized default kernels can be used for the
// three matrix types. Requirements visible below: optimized kernels are enabled, neither
// operand (T2, T3) is diagonal, all types are SIMD enabled, the element types are SIMD
// combinable and SIMD addition/multiplication is available for the operand element types.
// NOTE(review): this listing jumps from line 219 to line 221 inside the IsSIMDCombinable_v
// argument list, so one argument line may be missing here (presumably the element type
// of T2) -- confirm against the original header.
214  template< typename T1, typename T2, typename T3 >
215  static constexpr bool UseVectorizedDefaultKernel_v =
216  ( useOptimizedKernels &&
217  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
218  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
219  IsSIMDCombinable_v< ElementType_t<T1>
221  , ElementType_t<T3> > &&
222  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
223  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
225  //**********************************************************************************************
226 
227  //**********************************************************************************************
229 
232  using ForwardFunctor = If_t< HERM
233  , DeclHerm
234  , If_t< SYM
235  , DeclSym
236  , If_t< LOW
237  , If_t< UPP
238  , DeclDiag
239  , DeclLow >
240  , If_t< UPP
241  , DeclUpp
242  , Noop > > > >;
244  //**********************************************************************************************
245 
246  public:
247  //**Type definitions****************************************************************************
250 
253 
// Result type of the expression, selected by a nested If_t cascade over the structure
// flags HERM, SYM, LOW and UPP; the only alternative visible here is the plain
// MultTrait<RT1,RT2> of the last branch.
// NOTE(review): this listing omits lines 256, 258, 261, 262 and 264, so the expression
// is incomplete as shown; the missing lines presumably hold the other alternatives --
// restore them from the original header before compiling.
255  using ResultType = typename If_t< HERM
257  , If_t< SYM
259  , If_t< LOW
260  , If_t< UPP
263  , If_t< UPP
265  , MultTrait<RT1,RT2> > > > >::Type;
266 
// Return type of the element access functions: a constant element, returned by value.
271  using ReturnType = const ElementType;
// Type used when this expression is itself an operand of another expression.
272  using CompositeType = const ResultType;
273 
// Left operand storage type: held by value if MT1 is an expression, by const reference otherwise.
275  using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
276 
// Right operand storage type: held by value if MT2 is an expression, by const reference otherwise.
278  using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
279 
282 
285  //**********************************************************************************************
286 
287  //**Compilation flags***************************************************************************
289  static constexpr bool simdEnabled =
290  ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
291  MT1::simdEnabled && MT2::simdEnabled &&
292  HasSIMDAdd_v<ET1,ET2> &&
293  HasSIMDMult_v<ET1,ET2> );
294 
296  static constexpr bool smpAssignable =
297  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
298  //**********************************************************************************************
299 
300  //**SIMD properties*****************************************************************************
302  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
303  //**********************************************************************************************
304 
305  //**Constructor*********************************************************************************
// Constructor: stores both operands of the multiplication expression.
//   lhs  The left-hand side operand.
//   rhs  The right-hand side operand.
// The inner dimensions (columns of lhs, rows of rhs) are only checked via an internal
// assertion; the constructor itself never throws.
311  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
312  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
313  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
314  {
315  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
316  }
317  //**********************************************************************************************
318 
319  //**Access operator*****************************************************************************
326  inline ReturnType operator()( size_t i, size_t j ) const {
327  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
328  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
329 
330  if( IsDiagonal_v<MT1> ) {
331  return lhs_(i,i) * rhs_(i,j);
332  }
333  else if( IsDiagonal_v<MT2> ) {
334  return lhs_(i,j) * rhs_(j,j);
335  }
336  else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
337  const size_t begin( ( IsUpper_v<MT1> )
338  ?( ( IsLower_v<MT2> )
339  ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
340  , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
341  :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
342  :( ( IsLower_v<MT2> )
343  ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
344  :( 0UL ) ) );
345  const size_t end( ( IsLower_v<MT1> )
346  ?( ( IsUpper_v<MT2> )
347  ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
348  , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
349  :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
350  :( ( IsUpper_v<MT2> )
351  ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
352  :( lhs_.columns() ) ) );
353 
354  if( begin >= end ) return ElementType();
355 
356  const size_t n( end - begin );
357 
358  return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
359  subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
360  }
361  else {
362  return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
363  }
364  }
365  //**********************************************************************************************
366 
367  //**At function*********************************************************************************
375  inline ReturnType at( size_t i, size_t j ) const {
376  if( i >= lhs_.rows() ) {
377  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
378  }
379  if( j >= rhs_.columns() ) {
380  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
381  }
382  return (*this)(i,j);
383  }
384  //**********************************************************************************************
385 
386  //**Rows function*******************************************************************************
391  inline size_t rows() const noexcept {
392  return lhs_.rows();
393  }
394  //**********************************************************************************************
395 
396  //**Columns function****************************************************************************
401  inline size_t columns() const noexcept {
402  return rhs_.columns();
403  }
404  //**********************************************************************************************
405 
406  //**Left operand access*************************************************************************
// Returns the stored left-hand side operand. Whether this is a copy or a reference
// depends on the LeftOperand alias.
411  inline LeftOperand leftOperand() const noexcept {
412  return lhs_;
413  }
414  //**********************************************************************************************
415 
416  //**Right operand access************************************************************************
// Returns the stored right-hand side operand. Whether this is a copy or a reference
// depends on the RightOperand alias.
421  inline RightOperand rightOperand() const noexcept {
422  return rhs_;
423  }
424  //**********************************************************************************************
425 
426  //**********************************************************************************************
432  template< typename T >
433  inline bool canAlias( const T* alias ) const noexcept {
434  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
435  }
436  //**********************************************************************************************
437 
438  //**********************************************************************************************
444  template< typename T >
445  inline bool isAliased( const T* alias ) const noexcept {
446  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
447  }
448  //**********************************************************************************************
449 
450  //**********************************************************************************************
455  inline bool isAligned() const noexcept {
456  return lhs_.isAligned() && rhs_.isAligned();
457  }
458  //**********************************************************************************************
459 
460  //**********************************************************************************************
465  inline bool canSMPAssign() const noexcept {
466  return ( !BLAZE_BLAS_MODE ||
467  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
469  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
470  ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
471  !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
472  }
473  //**********************************************************************************************
474 
475  private:
476  //**Member variables****************************************************************************
479  //**********************************************************************************************
480 
481  //**Assignment to dense matrices****************************************************************
494  template< typename MT // Type of the target dense matrix
495  , bool SO > // Storage order of the target dense matrix
496  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
497  {
499 
500  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
501  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
502 
503  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
504  return;
505  }
506  else if( rhs.lhs_.columns() == 0UL ) {
507  reset( ~lhs );
508  return;
509  }
510 
511  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
512  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
513 
514  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
515  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
516  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
517  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
518  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
519  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
520 
521  TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
522  }
524  //**********************************************************************************************
525 
526  //**Assignment to dense matrices (kernel selection)*********************************************
537  template< typename MT3 // Type of the left-hand side target matrix
538  , typename MT4 // Type of the left-hand side matrix operand
539  , typename MT5 > // Type of the right-hand side matrix operand
540  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
541  {
542  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
543  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
544  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
545  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
546  selectSmallAssignKernel( C, A, B );
547  else
548  selectBlasAssignKernel( C, A, B );
549  }
551  //**********************************************************************************************
552 
553  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (scalar) assignment kernel for a row-major target where neither operand is
// diagonal (see the EnableIf_t condition). Computes C = A * B row by row.
//   C  The target dense matrix.
//   A  The left-hand side operand.
//   B  The right-hand side operand.
// For every row i the first contributing inner index (kbegin) initializes the row by
// assignment, every further inner index accumulates into it. Triangular operands and the
// structure flags SYM/HERM/LOW/UPP restrict the inner and column ranges that are visited.
567  template< typename MT3 // Type of the left-hand side target matrix
568  , typename MT4 // Type of the left-hand side matrix operand
569  , typename MT5 > // Type of the right-hand side matrix operand
570  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
571  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
572  {
573  const size_t M( A.rows() );
574  const size_t N( B.columns() );
575  const size_t K( A.columns() );
576 
// Any structure flag implies a square result
577  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
578 
579  for( size_t i=0UL; i<M; ++i )
580  {
// Inner index range [kbegin,kend) in which row i of a triangular A can be non-zero
581  const size_t kbegin( ( IsUpper_v<MT4> )
582  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
583  :( 0UL ) );
584  const size_t kend( ( IsLower_v<MT4> )
585  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
586  :( K ) );
587  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
588 
// Empty inner range for a strictly triangular A: the entire row of C is reset
589  if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
590  for( size_t j=0UL; j<N; ++j ) {
591  reset( C(i,j) );
592  }
593  continue;
594  }
595 
// First inner index: initializes row i by assignment within [jbegin,jend)
596  {
597  const size_t jbegin( ( IsUpper_v<MT5> )
598  ?( ( IsStrictlyUpper_v<MT5> )
599  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
600  :( UPP ? max(i,kbegin) : kbegin ) )
601  :( UPP ? i : 0UL ) );
602  const size_t jend( ( IsLower_v<MT5> )
603  ?( ( IsStrictlyLower_v<MT5> )
604  ?( LOW ? min(i+1UL,kbegin) : kbegin )
605  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
606  :( LOW ? i+1UL : N ) );
607 
// Leading elements of the row are reset (or only the first one for a strictly upper B)
608  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
609  for( size_t j=0UL; j<jbegin; ++j ) {
610  reset( C(i,j) );
611  }
612  }
613  else if( IsStrictlyUpper_v<MT5> ) {
614  reset( C(i,0UL) );
615  }
616  for( size_t j=jbegin; j<jend; ++j ) {
617  C(i,j) = A(i,kbegin) * B(kbegin,j);
618  }
// Trailing elements of the row are reset (or only the last one for a strictly lower B)
619  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
620  for( size_t j=jend; j<N; ++j ) {
621  reset( C(i,j) );
622  }
623  }
624  else if( IsStrictlyLower_v<MT5> ) {
625  reset( C(i,N-1UL) );
626  }
627  }
628 
// Remaining inner indices: accumulate into row i
629  for( size_t k=kbegin+1UL; k<kend; ++k )
630  {
631  const size_t jbegin( ( IsUpper_v<MT5> )
632  ?( ( IsStrictlyUpper_v<MT5> )
633  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
634  :( SYM || HERM || UPP ? max( i, k ) : k ) )
635  :( SYM || HERM || UPP ? i : 0UL ) );
636  const size_t jend( ( IsLower_v<MT5> )
637  ?( ( IsStrictlyLower_v<MT5> )
638  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
639  :( LOW ? min(i+1UL,k) : k ) )
640  :( LOW ? i+1UL : N ) );
641 
// With a structure flag the restricted column range may be empty for this k
642  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
643  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
644 
645  for( size_t j=jbegin; j<jend; ++j ) {
646  C(i,j) += A(i,k) * B(k,j);
647  }
// For a lower B the element at column jend is written by assignment instead of "+=".
// NOTE(review): when LOW is set and A is not lower, jend is clamped to i+1 for k > i+1,
// so this statement appears to write C(i,i+1), an element above the diagonal that the
// first step reset -- confirm that this is intended.
648  if( IsLower_v<MT5> ) {
649  C(i,jend) = A(i,k) * B(k,jend);
650  }
651  }
652  }
653 
// Symmetric/Hermitian result: fill the part below the diagonal from the part above it
654  if( SYM || HERM ) {
655  for( size_t i=1UL; i<M; ++i ) {
656  for( size_t j=0UL; j<i; ++j ) {
657  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
658  }
659  }
660  }
661  }
663  //**********************************************************************************************
664 
665  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (scalar) assignment kernel for a column-major target where neither operand is
// diagonal (see the EnableIf_t condition). Computes C = A * B column by column.
//   C  The target dense matrix.
//   A  The left-hand side operand.
//   B  The right-hand side operand.
// For every column j the first contributing inner index (kbegin) initializes the column
// by assignment, every further inner index accumulates into it. Triangular operands and
// the structure flags SYM/HERM/LOW/UPP restrict the inner and row ranges that are visited.
679  template< typename MT3 // Type of the left-hand side target matrix
680  , typename MT4 // Type of the left-hand side matrix operand
681  , typename MT5 > // Type of the right-hand side matrix operand
682  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
683  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
684  {
685  const size_t M( A.rows() );
686  const size_t N( B.columns() );
687  const size_t K( A.columns() );
688 
// Any structure flag implies a square result
689  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
690 
691  for( size_t j=0UL; j<N; ++j )
692  {
// Inner index range [kbegin,kend) in which column j of a triangular B can be non-zero
693  const size_t kbegin( ( IsLower_v<MT5> )
694  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
695  :( 0UL ) );
696  const size_t kend( ( IsUpper_v<MT5> )
697  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
698  :( K ) );
699  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
700 
// Empty inner range for a strictly triangular B: the entire column of C is reset
701  if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
702  for( size_t i=0UL; i<M; ++i ) {
703  reset( C(i,j) );
704  }
705  continue;
706  }
707 
// First inner index: initializes column j by assignment within [ibegin,iend)
708  {
709  const size_t ibegin( ( IsLower_v<MT4> )
710  ?( ( IsStrictlyLower_v<MT4> )
711  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
712  :( LOW ? max(j,kbegin) : kbegin ) )
713  :( LOW ? j : 0UL ) );
714  const size_t iend( ( IsUpper_v<MT4> )
715  ?( ( IsStrictlyUpper_v<MT4> )
716  ?( UPP ? min(j+1UL,kbegin) : kbegin )
717  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
718  :( UPP ? j+1UL : M ) );
719 
// Leading elements of the column are reset (or only the first one for a strictly lower A)
720  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
721  for( size_t i=0UL; i<ibegin; ++i ) {
722  reset( C(i,j) );
723  }
724  }
725  else if( IsStrictlyLower_v<MT4> ) {
726  reset( C(0UL,j) );
727  }
728  for( size_t i=ibegin; i<iend; ++i ) {
729  C(i,j) = A(i,kbegin) * B(kbegin,j);
730  }
// Trailing elements of the column are reset (or only the last one for a strictly upper A)
731  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
732  for( size_t i=iend; i<M; ++i ) {
733  reset( C(i,j) );
734  }
735  }
736  else if( IsStrictlyUpper_v<MT4> ) {
737  reset( C(M-1UL,j) );
738  }
739  }
740 
// Remaining inner indices: accumulate into column j
741  for( size_t k=kbegin+1UL; k<kend; ++k )
742  {
743  const size_t ibegin( ( IsLower_v<MT4> )
744  ?( ( IsStrictlyLower_v<MT4> )
745  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
746  :( SYM || HERM || LOW ? max( j, k ) : k ) )
747  :( SYM || HERM || LOW ? j : 0UL ) );
748  const size_t iend( ( IsUpper_v<MT4> )
749  ?( ( IsStrictlyUpper_v<MT4> )
750  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
751  :( UPP ? min(j+1UL,k) : k ) )
752  :( UPP ? j+1UL : M ) );
753 
// With a structure flag the restricted row range may be empty for this k
754  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
755  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
756 
757  for( size_t i=ibegin; i<iend; ++i ) {
758  C(i,j) += A(i,k) * B(k,j);
759  }
// For an upper A the element at row iend is written by assignment instead of "+=".
// NOTE(review): when UPP is set and B is not upper, iend is clamped to j+1 for k > j+1,
// so this statement appears to write C(j+1,j), an element below the diagonal that the
// first step reset -- confirm that this is intended.
760  if( IsUpper_v<MT4> ) {
761  C(iend,j) = A(iend,k) * B(k,j);
762  }
763  }
764  }
765 
// Symmetric/Hermitian result: fill the part above the diagonal from the part below it
766  if( SYM || HERM ) {
767  for( size_t j=1UL; j<N; ++j ) {
768  for( size_t i=0UL; i<j; ++i ) {
769  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
770  }
771  }
772  }
773  }
775  //**********************************************************************************************
776 
777  //**Default assignment to row-major dense matrices (general/diagonal)***************************
791  template< typename MT3 // Type of the left-hand side target matrix
792  , typename MT4 // Type of the left-hand side matrix operand
793  , typename MT5 > // Type of the right-hand side matrix operand
794  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
795  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
796  {
797  constexpr size_t block( BLOCK_SIZE );
798 
799  const size_t M( A.rows() );
800  const size_t N( B.columns() );
801 
802  for( size_t ii=0UL; ii<M; ii+=block ) {
803  const size_t iend( min( M, ii+block ) );
804  for( size_t jj=0UL; jj<N; jj+=block ) {
805  const size_t jend( min( N, jj+block ) );
806  for( size_t i=ii; i<iend; ++i )
807  {
808  const size_t jbegin( ( IsUpper_v<MT4> )
809  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
810  :( jj ) );
811  const size_t jpos( ( IsLower_v<MT4> )
812  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
813  :( jend ) );
814 
815  if( IsUpper_v<MT4> ) {
816  for( size_t j=jj; j<jbegin; ++j ) {
817  reset( C(i,j) );
818  }
819  }
820  for( size_t j=jbegin; j<jpos; ++j ) {
821  C(i,j) = A(i,j) * B(j,j);
822  }
823  if( IsLower_v<MT4> ) {
824  for( size_t j=jpos; j<jend; ++j ) {
825  reset( C(i,j) );
826  }
827  }
828  }
829  }
830  }
831  }
833  //**********************************************************************************************
834 
835  //**Default assignment to column-major dense matrices (general/diagonal)************************
849  template< typename MT3 // Type of the left-hand side target matrix
850  , typename MT4 // Type of the left-hand side matrix operand
851  , typename MT5 > // Type of the right-hand side matrix operand
852  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
853  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
854  {
855  const size_t M( A.rows() );
856  const size_t N( B.columns() );
857 
858  for( size_t j=0UL; j<N; ++j )
859  {
860  const size_t ibegin( ( IsLower_v<MT4> )
861  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
862  :( 0UL ) );
863  const size_t iend( ( IsUpper_v<MT4> )
864  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
865  :( M ) );
866  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
867 
868  if( IsLower_v<MT4> ) {
869  for( size_t i=0UL; i<ibegin; ++i ) {
870  reset( C(i,j) );
871  }
872  }
873  for( size_t i=ibegin; i<iend; ++i ) {
874  C(i,j) = A(i,j) * B(j,j);
875  }
876  if( IsUpper_v<MT4> ) {
877  for( size_t i=iend; i<M; ++i ) {
878  reset( C(i,j) );
879  }
880  }
881  }
882  }
884  //**********************************************************************************************
885 
886  //**Default assignment to row-major dense matrices (diagonal/general)***************************
900  template< typename MT3 // Type of the left-hand side target matrix
901  , typename MT4 // Type of the left-hand side matrix operand
902  , typename MT5 > // Type of the right-hand side matrix operand
903  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
904  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
905  {
906  const size_t M( A.rows() );
907  const size_t N( B.columns() );
908 
909  for( size_t i=0UL; i<M; ++i )
910  {
911  const size_t jbegin( ( IsUpper_v<MT5> )
912  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
913  :( 0UL ) );
914  const size_t jend( ( IsLower_v<MT5> )
915  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
916  :( N ) );
917  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
918 
919  if( IsUpper_v<MT5> ) {
920  for( size_t j=0UL; j<jbegin; ++j ) {
921  reset( C(i,j) );
922  }
923  }
924  for( size_t j=jbegin; j<jend; ++j ) {
925  C(i,j) = A(i,i) * B(i,j);
926  }
927  if( IsLower_v<MT5> ) {
928  for( size_t j=jend; j<N; ++j ) {
929  reset( C(i,j) );
930  }
931  }
932  }
933  }
935  //**********************************************************************************************
936 
937  //**Default assignment to column-major dense matrices (diagonal/general)************************
951  template< typename MT3 // Type of the left-hand side target matrix
952  , typename MT4 // Type of the left-hand side matrix operand
953  , typename MT5 > // Type of the right-hand side matrix operand
954  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
955  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
956  {
957  constexpr size_t block( BLOCK_SIZE );
958 
959  const size_t M( A.rows() );
960  const size_t N( B.columns() );
961 
962  for( size_t jj=0UL; jj<N; jj+=block ) {
963  const size_t jend( min( N, jj+block ) );
964  for( size_t ii=0UL; ii<M; ii+=block ) {
965  const size_t iend( min( M, ii+block ) );
966  for( size_t j=jj; j<jend; ++j )
967  {
968  const size_t ibegin( ( IsLower_v<MT5> )
969  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
970  :( ii ) );
971  const size_t ipos( ( IsUpper_v<MT5> )
972  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
973  :( iend ) );
974 
975  if( IsLower_v<MT5> ) {
976  for( size_t i=ii; i<ibegin; ++i ) {
977  reset( C(i,j) );
978  }
979  }
980  for( size_t i=ibegin; i<ipos; ++i ) {
981  C(i,j) = A(i,i) * B(i,j);
982  }
983  if( IsUpper_v<MT5> ) {
984  for( size_t i=ipos; i<iend; ++i ) {
985  reset( C(i,j) );
986  }
987  }
988  }
989  }
990  }
991  }
993  //**********************************************************************************************
994 
995  //**Default assignment to dense matrices (diagonal/diagonal)************************************
1009  template< typename MT3 // Type of the left-hand side target matrix
1010  , typename MT4 // Type of the left-hand side matrix operand
1011  , typename MT5 > // Type of the right-hand side matrix operand
1012  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
1013  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1014  {
1015  reset( C );
1016 
1017  for( size_t i=0UL; i<A.rows(); ++i ) {
1018  C(i,i) = A(i,i) * B(i,i);
1019  }
1020  }
1022  //**********************************************************************************************
1023 
1024  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel for type combinations that do not qualify for the
// vectorized default kernel (see the DisableIf_t condition): forwards to the default
// assignment kernel.
//   C  The target dense matrix.
//   A  The left-hand side operand.
//   B  The right-hand side operand.
1038  template< typename MT3 // Type of the left-hand side target matrix
1039  , typename MT4 // Type of the left-hand side matrix operand
1040  , typename MT5 > // Type of the right-hand side matrix operand
1041  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1042  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1043  {
1044  selectDefaultAssignKernel( C, A, B );
1045  }
1047  //**********************************************************************************************
1048 
1049  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
1064  template< typename MT3 // Type of the left-hand side target matrix
1065  , typename MT4 // Type of the left-hand side matrix operand
1066  , typename MT5 > // Type of the right-hand side matrix operand
1067  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1068  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1069  {
1070  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
1071 
1072  const size_t M( A.rows() );
1073  const size_t N( B.columns() );
1074  const size_t K( A.columns() );
1075 
1076  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1077 
1078  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1079  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1080 
1081  size_t j( 0UL );
1082 
1083  if( IsIntegral_v<ElementType> )
1084  {
1085  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1086  for( size_t i=0UL; i<M; ++i )
1087  {
1088  const size_t kbegin( ( IsUpper_v<MT4> )
1089  ?( ( IsLower_v<MT5> )
1090  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1091  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1092  :( IsLower_v<MT5> ? j : 0UL ) );
1093  const size_t kend( ( IsLower_v<MT4> )
1094  ?( ( IsUpper_v<MT5> )
1095  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
1096  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
1097  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
1098 
1099  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1100 
1101  for( size_t k=kbegin; k<kend; ++k ) {
1102  const SIMDType a1( set( A(i,k) ) );
1103  xmm1 += a1 * B.load(k,j );
1104  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1105  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1106  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1107  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1108  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
1109  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
1110  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
1111  }
1112 
1113  C.store( i, j , xmm1 );
1114  C.store( i, j+SIMDSIZE , xmm2 );
1115  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1116  C.store( i, j+SIMDSIZE*3UL, xmm4 );
1117  C.store( i, j+SIMDSIZE*4UL, xmm5 );
1118  C.store( i, j+SIMDSIZE*5UL, xmm6 );
1119  C.store( i, j+SIMDSIZE*6UL, xmm7 );
1120  C.store( i, j+SIMDSIZE*7UL, xmm8 );
1121  }
1122  }
1123  }
1124 
1125  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
1126  {
1127  size_t i( 0UL );
1128 
1129  for( ; (i+2UL) <= M; i+=2UL )
1130  {
1131  const size_t kbegin( ( IsUpper_v<MT4> )
1132  ?( ( IsLower_v<MT5> )
1133  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1134  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1135  :( IsLower_v<MT5> ? j : 0UL ) );
1136  const size_t kend( ( IsLower_v<MT4> )
1137  ?( ( IsUpper_v<MT5> )
1138  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
1139  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1140  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
1141 
1142  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1143 
1144  for( size_t k=kbegin; k<kend; ++k ) {
1145  const SIMDType a1( set( A(i ,k) ) );
1146  const SIMDType a2( set( A(i+1UL,k) ) );
1147  const SIMDType b1( B.load(k,j ) );
1148  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1149  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1150  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1151  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
1152  xmm1 += a1 * b1;
1153  xmm2 += a1 * b2;
1154  xmm3 += a1 * b3;
1155  xmm4 += a1 * b4;
1156  xmm5 += a1 * b5;
1157  xmm6 += a2 * b1;
1158  xmm7 += a2 * b2;
1159  xmm8 += a2 * b3;
1160  xmm9 += a2 * b4;
1161  xmm10 += a2 * b5;
1162  }
1163 
1164  C.store( i , j , xmm1 );
1165  C.store( i , j+SIMDSIZE , xmm2 );
1166  C.store( i , j+SIMDSIZE*2UL, xmm3 );
1167  C.store( i , j+SIMDSIZE*3UL, xmm4 );
1168  C.store( i , j+SIMDSIZE*4UL, xmm5 );
1169  C.store( i+1UL, j , xmm6 );
1170  C.store( i+1UL, j+SIMDSIZE , xmm7 );
1171  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
1172  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
1173  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
1174  }
1175 
1176  if( i < M )
1177  {
1178  const size_t kbegin( ( IsUpper_v<MT4> )
1179  ?( ( IsLower_v<MT5> )
1180  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1181  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1182  :( IsLower_v<MT5> ? j : 0UL ) );
1183  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
1184 
1185  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1186 
1187  for( size_t k=kbegin; k<kend; ++k ) {
1188  const SIMDType a1( set( A(i,k) ) );
1189  xmm1 += a1 * B.load(k,j );
1190  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1191  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1192  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1193  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1194  }
1195 
1196  C.store( i, j , xmm1 );
1197  C.store( i, j+SIMDSIZE , xmm2 );
1198  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1199  C.store( i, j+SIMDSIZE*3UL, xmm4 );
1200  C.store( i, j+SIMDSIZE*4UL, xmm5 );
1201  }
1202  }
1203 
1204  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1205  {
1206  const size_t iend( UPP ? min(j+SIMDSIZE*4UL,M) : M );
1207  size_t i( 0UL );
1208 
1209  if( SYM || HERM ) {
1210  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1211  for( ; i<j; ++i ) {
1212  for( size_t jj=j; jj<jjend; ++jj ) {
1213  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1214  }
1215  }
1216  }
1217  else if( LOW ) {
1218  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1219  for( ; i<j; ++i ) {
1220  for( size_t jj=j; jj<jjend; ++jj ) {
1221  reset( C(i,jj) );
1222  }
1223  }
1224  }
1225 
1226  for( ; (i+2UL) <= iend; i+=2UL )
1227  {
1228  const size_t kbegin( ( IsUpper_v<MT4> )
1229  ?( ( IsLower_v<MT5> )
1230  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1231  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1232  :( IsLower_v<MT5> ? j : 0UL ) );
1233  const size_t kend( ( IsLower_v<MT4> )
1234  ?( ( IsUpper_v<MT5> )
1235  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1236  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1237  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
1238 
1239  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1240 
1241  for( size_t k=kbegin; k<kend; ++k ) {
1242  const SIMDType a1( set( A(i ,k) ) );
1243  const SIMDType a2( set( A(i+1UL,k) ) );
1244  const SIMDType b1( B.load(k,j ) );
1245  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1246  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1247  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1248  xmm1 += a1 * b1;
1249  xmm2 += a1 * b2;
1250  xmm3 += a1 * b3;
1251  xmm4 += a1 * b4;
1252  xmm5 += a2 * b1;
1253  xmm6 += a2 * b2;
1254  xmm7 += a2 * b3;
1255  xmm8 += a2 * b4;
1256  }
1257 
1258  C.store( i , j , xmm1 );
1259  C.store( i , j+SIMDSIZE , xmm2 );
1260  C.store( i , j+SIMDSIZE*2UL, xmm3 );
1261  C.store( i , j+SIMDSIZE*3UL, xmm4 );
1262  C.store( i+1UL, j , xmm5 );
1263  C.store( i+1UL, j+SIMDSIZE , xmm6 );
1264  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1265  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1266  }
1267 
1268  if( i < iend )
1269  {
1270  const size_t kbegin( ( IsUpper_v<MT4> )
1271  ?( ( IsLower_v<MT5> )
1272  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1273  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1274  :( IsLower_v<MT5> ? j : 0UL ) );
1275  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1276 
1277  SIMDType xmm1, xmm2, xmm3, xmm4;
1278 
1279  for( size_t k=kbegin; k<kend; ++k ) {
1280  const SIMDType a1( set( A(i,k) ) );
1281  xmm1 += a1 * B.load(k,j );
1282  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1283  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1284  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1285  }
1286 
1287  C.store( i, j , xmm1 );
1288  C.store( i, j+SIMDSIZE , xmm2 );
1289  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1290  C.store( i, j+SIMDSIZE*3UL, xmm4 );
1291 
1292  if( UPP ) ++i;
1293  }
1294 
1295  if( UPP ) {
1296  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
1297  for( ; i<M; ++i ) {
1298  for( size_t jj=j; jj<jjend; ++jj ) {
1299  reset( C(i,jj) );
1300  }
1301  }
1302  }
1303  }
1304 
1305  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1306  {
1307  const size_t iend( UPP ? min(j+SIMDSIZE*3UL,M) : M );
1308  size_t i( 0UL );
1309 
1310  if( SYM || HERM ) {
1311  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1312  for( ; i<j; ++i ) {
1313  for( size_t jj=j; jj<jjend; ++jj ) {
1314  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1315  }
1316  }
1317  }
1318  else if( LOW ) {
1319  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1320  for( ; i<j; ++i ) {
1321  for( size_t jj=j; jj<jjend; ++jj ) {
1322  reset( C(i,jj) );
1323  }
1324  }
1325  }
1326 
1327  for( ; (i+2UL) <= iend; i+=2UL )
1328  {
1329  const size_t kbegin( ( IsUpper_v<MT4> )
1330  ?( ( IsLower_v<MT5> )
1331  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1332  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1333  :( IsLower_v<MT5> ? j : 0UL ) );
1334  const size_t kend( ( IsLower_v<MT4> )
1335  ?( ( IsUpper_v<MT5> )
1336  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
1337  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1338  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
1339 
1340  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1341 
1342  for( size_t k=kbegin; k<kend; ++k ) {
1343  const SIMDType a1( set( A(i ,k) ) );
1344  const SIMDType a2( set( A(i+1UL,k) ) );
1345  const SIMDType b1( B.load(k,j ) );
1346  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1347  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1348  xmm1 += a1 * b1;
1349  xmm2 += a1 * b2;
1350  xmm3 += a1 * b3;
1351  xmm4 += a2 * b1;
1352  xmm5 += a2 * b2;
1353  xmm6 += a2 * b3;
1354  }
1355 
1356  C.store( i , j , xmm1 );
1357  C.store( i , j+SIMDSIZE , xmm2 );
1358  C.store( i , j+SIMDSIZE*2UL, xmm3 );
1359  C.store( i+1UL, j , xmm4 );
1360  C.store( i+1UL, j+SIMDSIZE , xmm5 );
1361  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1362  }
1363 
1364  if( i < iend )
1365  {
1366  const size_t kbegin( ( IsUpper_v<MT4> )
1367  ?( ( IsLower_v<MT5> )
1368  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1369  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1370  :( IsLower_v<MT5> ? j : 0UL ) );
1371  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
1372 
1373  SIMDType xmm1, xmm2, xmm3;
1374 
1375  for( size_t k=kbegin; k<kend; ++k ) {
1376  const SIMDType a1( set( A(i,k) ) );
1377  xmm1 += a1 * B.load(k,j );
1378  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1379  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1380  }
1381 
1382  C.store( i, j , xmm1 );
1383  C.store( i, j+SIMDSIZE , xmm2 );
1384  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1385 
1386  if( UPP ) ++i;
1387  }
1388 
1389  if( UPP ) {
1390  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
1391  for( ; i<M; ++i ) {
1392  for( size_t jj=j; jj<jjend; ++jj ) {
1393  reset( C(i,jj) );
1394  }
1395  }
1396  }
1397  }
1398 
1399  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1400  {
1401  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
1402  size_t i( 0UL );
1403 
1404  if( SYM || HERM ) {
1405  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1406  for( ; i<j; ++i ) {
1407  for( size_t jj=j; jj<jjend; ++jj ) {
1408  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1409  }
1410  }
1411  }
1412  else if( LOW ) {
1413  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1414  for( ; i<j; ++i ) {
1415  for( size_t jj=j; jj<jjend; ++jj ) {
1416  reset( C(i,jj) );
1417  }
1418  }
1419  }
1420 
1421  for( ; (i+4UL) <= iend; i+=4UL )
1422  {
1423  const size_t kbegin( ( IsUpper_v<MT4> )
1424  ?( ( IsLower_v<MT5> )
1425  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1426  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1427  :( IsLower_v<MT5> ? j : 0UL ) );
1428  const size_t kend( ( IsLower_v<MT4> )
1429  ?( ( IsUpper_v<MT5> )
1430  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
1431  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1432  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1433 
1434  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1435 
1436  for( size_t k=kbegin; k<kend; ++k ) {
1437  const SIMDType a1( set( A(i ,k) ) );
1438  const SIMDType a2( set( A(i+1UL,k) ) );
1439  const SIMDType a3( set( A(i+2UL,k) ) );
1440  const SIMDType a4( set( A(i+3UL,k) ) );
1441  const SIMDType b1( B.load(k,j ) );
1442  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1443  xmm1 += a1 * b1;
1444  xmm2 += a1 * b2;
1445  xmm3 += a2 * b1;
1446  xmm4 += a2 * b2;
1447  xmm5 += a3 * b1;
1448  xmm6 += a3 * b2;
1449  xmm7 += a4 * b1;
1450  xmm8 += a4 * b2;
1451  }
1452 
1453  C.store( i , j , xmm1 );
1454  C.store( i , j+SIMDSIZE, xmm2 );
1455  C.store( i+1UL, j , xmm3 );
1456  C.store( i+1UL, j+SIMDSIZE, xmm4 );
1457  C.store( i+2UL, j , xmm5 );
1458  C.store( i+2UL, j+SIMDSIZE, xmm6 );
1459  C.store( i+3UL, j , xmm7 );
1460  C.store( i+3UL, j+SIMDSIZE, xmm8 );
1461  }
1462 
1463  for( ; (i+3UL) <= iend; i+=3UL )
1464  {
1465  const size_t kbegin( ( IsUpper_v<MT4> )
1466  ?( ( IsLower_v<MT5> )
1467  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1468  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1469  :( IsLower_v<MT5> ? j : 0UL ) );
1470  const size_t kend( ( IsLower_v<MT4> )
1471  ?( ( IsUpper_v<MT5> )
1472  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
1473  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1474  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1475 
1476  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1477 
1478  for( size_t k=kbegin; k<kend; ++k ) {
1479  const SIMDType a1( set( A(i ,k) ) );
1480  const SIMDType a2( set( A(i+1UL,k) ) );
1481  const SIMDType a3( set( A(i+2UL,k) ) );
1482  const SIMDType b1( B.load(k,j ) );
1483  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1484  xmm1 += a1 * b1;
1485  xmm2 += a1 * b2;
1486  xmm3 += a2 * b1;
1487  xmm4 += a2 * b2;
1488  xmm5 += a3 * b1;
1489  xmm6 += a3 * b2;
1490  }
1491 
1492  C.store( i , j , xmm1 );
1493  C.store( i , j+SIMDSIZE, xmm2 );
1494  C.store( i+1UL, j , xmm3 );
1495  C.store( i+1UL, j+SIMDSIZE, xmm4 );
1496  C.store( i+2UL, j , xmm5 );
1497  C.store( i+2UL, j+SIMDSIZE, xmm6 );
1498  }
1499 
1500  for( ; (i+2UL) <= iend; i+=2UL )
1501  {
1502  const size_t kbegin( ( IsUpper_v<MT4> )
1503  ?( ( IsLower_v<MT5> )
1504  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1505  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1506  :( IsLower_v<MT5> ? j : 0UL ) );
1507  const size_t kend( ( IsLower_v<MT4> )
1508  ?( ( IsUpper_v<MT5> )
1509  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1510  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1511  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1512 
1513  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1514  size_t k( kbegin );
1515 
1516  for( ; (k+2UL) <= kend; k+=2UL ) {
1517  const SIMDType a1( set( A(i ,k ) ) );
1518  const SIMDType a2( set( A(i+1UL,k ) ) );
1519  const SIMDType a3( set( A(i ,k+1UL) ) );
1520  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
1521  const SIMDType b1( B.load(k ,j ) );
1522  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
1523  const SIMDType b3( B.load(k+1UL,j ) );
1524  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
1525  xmm1 += a1 * b1;
1526  xmm2 += a1 * b2;
1527  xmm3 += a2 * b1;
1528  xmm4 += a2 * b2;
1529  xmm5 += a3 * b3;
1530  xmm6 += a3 * b4;
1531  xmm7 += a4 * b3;
1532  xmm8 += a4 * b4;
1533  }
1534 
1535  for( ; k<kend; ++k ) {
1536  const SIMDType a1( set( A(i ,k) ) );
1537  const SIMDType a2( set( A(i+1UL,k) ) );
1538  const SIMDType b1( B.load(k,j ) );
1539  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1540  xmm1 += a1 * b1;
1541  xmm2 += a1 * b2;
1542  xmm3 += a2 * b1;
1543  xmm4 += a2 * b2;
1544  }
1545 
1546  C.store( i , j , xmm1+xmm5 );
1547  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
1548  C.store( i+1UL, j , xmm3+xmm7 );
1549  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
1550  }
1551 
1552  if( i < iend )
1553  {
1554  const size_t kbegin( ( IsUpper_v<MT4> )
1555  ?( ( IsLower_v<MT5> )
1556  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1557  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1558  :( IsLower_v<MT5> ? j : 0UL ) );
1559  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1560 
1561  SIMDType xmm1, xmm2, xmm3, xmm4;
1562  size_t k( kbegin );
1563 
1564  for( ; (k+2UL) <= kend; k+=2UL ) {
1565  const SIMDType a1( set( A(i,k ) ) );
1566  const SIMDType a2( set( A(i,k+1UL) ) );
1567  xmm1 += a1 * B.load(k ,j );
1568  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
1569  xmm3 += a2 * B.load(k+1UL,j );
1570  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
1571  }
1572 
1573  for( ; k<kend; ++k ) {
1574  const SIMDType a1( set( A(i,k) ) );
1575  xmm1 += a1 * B.load(k,j );
1576  xmm2 += a1 * B.load(k,j+SIMDSIZE);
1577  }
1578 
1579  C.store( i, j , xmm1+xmm3 );
1580  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
1581 
1582  if( UPP ) ++i;
1583  }
1584 
1585  if( UPP ) {
1586  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
1587  for( ; i<M; ++i ) {
1588  for( size_t jj=j; jj<jjend; ++jj ) {
1589  reset( C(i,jj) );
1590  }
1591  }
1592  }
1593  }
1594 
1595  for( ; j<jpos; j+=SIMDSIZE )
1596  {
1597  const size_t iend( UPP ? min(j+SIMDSIZE,M) : M );
1598  size_t i( 0UL );
1599 
1600  if( SYM || HERM ) {
1601  const size_t jjend( min(j+SIMDSIZE,N) );
1602  for( ; i<j; ++i ) {
1603  for( size_t jj=j; jj<jjend; ++jj ) {
1604  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
1605  }
1606  }
1607  }
1608  else if( LOW ) {
1609  const size_t jjend( min(j+SIMDSIZE,N) );
1610  for( ; i<j; ++i ) {
1611  for( size_t jj=j; jj<jjend; ++jj ) {
1612  reset( C(i,jj) );
1613  }
1614  }
1615  }
1616 
1617  for( ; (i+4UL) <= iend; i+=4UL )
1618  {
1619  const size_t kbegin( ( IsUpper_v<MT4> )
1620  ?( ( IsLower_v<MT5> )
1621  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1622  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1623  :( IsLower_v<MT5> ? j : 0UL ) );
1624  const size_t kend( ( IsLower_v<MT4> )
1625  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1626  :( K ) );
1627 
1628  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1629  size_t k( kbegin );
1630 
1631  for( ; (k+2UL) <= kend; k+=2UL ) {
1632  const SIMDType b1( B.load(k ,j) );
1633  const SIMDType b2( B.load(k+1UL,j) );
1634  xmm1 += set( A(i ,k ) ) * b1;
1635  xmm2 += set( A(i+1UL,k ) ) * b1;
1636  xmm3 += set( A(i+2UL,k ) ) * b1;
1637  xmm4 += set( A(i+3UL,k ) ) * b1;
1638  xmm5 += set( A(i ,k+1UL) ) * b2;
1639  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
1640  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
1641  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
1642  }
1643 
1644  for( ; k<kend; ++k ) {
1645  const SIMDType b1( B.load(k,j) );
1646  xmm1 += set( A(i ,k) ) * b1;
1647  xmm2 += set( A(i+1UL,k) ) * b1;
1648  xmm3 += set( A(i+2UL,k) ) * b1;
1649  xmm4 += set( A(i+3UL,k) ) * b1;
1650  }
1651 
1652  C.store( i , j, xmm1+xmm5 );
1653  C.store( i+1UL, j, xmm2+xmm6 );
1654  C.store( i+2UL, j, xmm3+xmm7 );
1655  C.store( i+3UL, j, xmm4+xmm8 );
1656  }
1657 
1658  for( ; (i+3UL) <= iend; i+=3UL )
1659  {
1660  const size_t kbegin( ( IsUpper_v<MT4> )
1661  ?( ( IsLower_v<MT5> )
1662  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1663  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1664  :( IsLower_v<MT5> ? j : 0UL ) );
1665  const size_t kend( ( IsLower_v<MT4> )
1666  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1667  :( K ) );
1668 
1669  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1670  size_t k( kbegin );
1671 
1672  for( ; (k+2UL) <= kend; k+=2UL ) {
1673  const SIMDType b1( B.load(k ,j) );
1674  const SIMDType b2( B.load(k+1UL,j) );
1675  xmm1 += set( A(i ,k ) ) * b1;
1676  xmm2 += set( A(i+1UL,k ) ) * b1;
1677  xmm3 += set( A(i+2UL,k ) ) * b1;
1678  xmm4 += set( A(i ,k+1UL) ) * b2;
1679  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
1680  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
1681  }
1682 
1683  for( ; k<kend; ++k ) {
1684  const SIMDType b1( B.load(k,j) );
1685  xmm1 += set( A(i ,k) ) * b1;
1686  xmm2 += set( A(i+1UL,k) ) * b1;
1687  xmm3 += set( A(i+2UL,k) ) * b1;
1688  }
1689 
1690  C.store( i , j, xmm1+xmm4 );
1691  C.store( i+1UL, j, xmm2+xmm5 );
1692  C.store( i+2UL, j, xmm3+xmm6 );
1693  }
1694 
1695  for( ; (i+2UL) <= iend; i+=2UL )
1696  {
1697  const size_t kbegin( ( IsUpper_v<MT4> )
1698  ?( ( IsLower_v<MT5> )
1699  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1700  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1701  :( IsLower_v<MT5> ? j : 0UL ) );
1702  const size_t kend( ( IsLower_v<MT4> )
1703  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1704  :( K ) );
1705 
1706  SIMDType xmm1, xmm2, xmm3, xmm4;
1707  size_t k( kbegin );
1708 
1709  for( ; (k+2UL) <= kend; k+=2UL ) {
1710  const SIMDType b1( B.load(k ,j) );
1711  const SIMDType b2( B.load(k+1UL,j) );
1712  xmm1 += set( A(i ,k ) ) * b1;
1713  xmm2 += set( A(i+1UL,k ) ) * b1;
1714  xmm3 += set( A(i ,k+1UL) ) * b2;
1715  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
1716  }
1717 
1718  for( ; k<kend; ++k ) {
1719  const SIMDType b1( B.load(k,j) );
1720  xmm1 += set( A(i ,k) ) * b1;
1721  xmm2 += set( A(i+1UL,k) ) * b1;
1722  }
1723 
1724  C.store( i , j, xmm1+xmm3 );
1725  C.store( i+1UL, j, xmm2+xmm4 );
1726  }
1727 
1728  if( i < iend )
1729  {
1730  const size_t kbegin( ( IsUpper_v<MT4> )
1731  ?( ( IsLower_v<MT5> )
1732  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1733  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1734  :( IsLower_v<MT5> ? j : 0UL ) );
1735 
1736  SIMDType xmm1, xmm2;
1737  size_t k( kbegin );
1738 
1739  for( ; (k+2UL) <= K; k+=2UL ) {
1740  xmm1 += set( A(i,k ) ) * B.load(k ,j);
1741  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
1742  }
1743 
1744  for( ; k<K; ++k ) {
1745  xmm1 += set( A(i,k) ) * B.load(k,j);
1746  }
1747 
1748  C.store( i, j, xmm1+xmm2 );
1749 
1750  if( UPP ) ++i;
1751  }
1752 
1753  if( UPP ) {
1754  const size_t jjend( min(j+SIMDSIZE,N) );
1755  for( ; i<M; ++i ) {
1756  for( size_t jj=j; jj<jjend; ++jj ) {
1757  reset( C(i,jj) );
1758  }
1759  }
1760  }
1761  }
1762 
1763  for( ; remainder && j<N; ++j )
1764  {
1765  size_t i( 0UL );
1766 
1767  if( SYM || HERM ) {
1768  for( ; i<j; ++i ) {
1769  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1770  }
1771  }
1772  else if( LOW ) {
1773  for( ; i<j; ++i ) {
1774  reset( C(i,j) );
1775  }
1776  }
1777 
1778  for( ; (i+2UL) <= M; i+=2UL )
1779  {
1780  const size_t kbegin( ( IsUpper_v<MT4> )
1781  ?( ( IsLower_v<MT5> )
1782  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1783  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1784  :( IsLower_v<MT5> ? j : 0UL ) );
1785  const size_t kend( ( IsLower_v<MT4> )
1786  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1787  :( K ) );
1788 
1789  ElementType value1{};
1790  ElementType value2{};
1791 
1792  for( size_t k=kbegin; k<kend; ++k ) {
1793  value1 += A(i ,k) * B(k,j);
1794  value2 += A(i+1UL,k) * B(k,j);
1795  }
1796 
1797  C(i ,j) = value1;
1798  C(i+1UL,j) = value2;
1799  }
1800 
1801  if( i < M )
1802  {
1803  const size_t kbegin( ( IsUpper_v<MT4> )
1804  ?( ( IsLower_v<MT5> )
1805  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1806  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1807  :( IsLower_v<MT5> ? j : 0UL ) );
1808 
1809  ElementType value{};
1810 
1811  for( size_t k=kbegin; k<K; ++k ) {
1812  value += A(i,k) * B(k,j);
1813  }
1814 
1815  C(i,j) = value;
1816  }
1817  }
1818  }
1820  //**********************************************************************************************
1821 
1822  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
1837  template< typename MT3 // Type of the left-hand side target matrix
1838  , typename MT4 // Type of the left-hand side matrix operand
1839  , typename MT5 > // Type of the right-hand side matrix operand
1840  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1841  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1842  {
1843  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
1844 
1845  const size_t M( A.rows() );
1846  const size_t N( B.columns() );
1847  const size_t K( A.columns() );
1848 
1849  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1850 
1851  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
1852  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1853 
1854  size_t i( 0UL );
1855 
1856  if( IsIntegral_v<ElementType> )
1857  {
1858  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1859  for( size_t j=0UL; j<N; ++j )
1860  {
1861  const size_t kbegin( ( IsLower_v<MT5> )
1862  ?( ( IsUpper_v<MT4> )
1863  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1864  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1865  :( IsUpper_v<MT4> ? i : 0UL ) );
1866  const size_t kend( ( IsUpper_v<MT5> )
1867  ?( ( IsLower_v<MT4> )
1868  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
1869  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
1870  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
1871 
1872  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1873 
1874  for( size_t k=kbegin; k<kend; ++k ) {
1875  const SIMDType b1( set( B(k,j) ) );
1876  xmm1 += A.load(i ,k) * b1;
1877  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1878  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1879  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1880  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1881  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
1882  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
1883  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
1884  }
1885 
1886  C.store( i , j, xmm1 );
1887  C.store( i+SIMDSIZE , j, xmm2 );
1888  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1889  C.store( i+SIMDSIZE*3UL, j, xmm4 );
1890  C.store( i+SIMDSIZE*4UL, j, xmm5 );
1891  C.store( i+SIMDSIZE*5UL, j, xmm6 );
1892  C.store( i+SIMDSIZE*6UL, j, xmm7 );
1893  C.store( i+SIMDSIZE*7UL, j, xmm8 );
1894  }
1895  }
1896  }
1897 
1898  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
1899  {
1900  size_t j( 0UL );
1901 
1902  for( ; (j+2UL) <= N; j+=2UL )
1903  {
1904  const size_t kbegin( ( IsLower_v<MT5> )
1905  ?( ( IsUpper_v<MT4> )
1906  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1907  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1908  :( IsUpper_v<MT4> ? i : 0UL ) );
1909  const size_t kend( ( IsUpper_v<MT5> )
1910  ?( ( IsLower_v<MT4> )
1911  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1912  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1913  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
1914 
1915  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1916 
1917  for( size_t k=kbegin; k<kend; ++k ) {
1918  const SIMDType a1( A.load(i ,k) );
1919  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1920  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1921  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1922  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1923  const SIMDType b1( set( B(k,j ) ) );
1924  const SIMDType b2( set( B(k,j+1UL) ) );
1925  xmm1 += a1 * b1;
1926  xmm2 += a2 * b1;
1927  xmm3 += a3 * b1;
1928  xmm4 += a4 * b1;
1929  xmm5 += a5 * b1;
1930  xmm6 += a1 * b2;
1931  xmm7 += a2 * b2;
1932  xmm8 += a3 * b2;
1933  xmm9 += a4 * b2;
1934  xmm10 += a5 * b2;
1935  }
1936 
1937  C.store( i , j , xmm1 );
1938  C.store( i+SIMDSIZE , j , xmm2 );
1939  C.store( i+SIMDSIZE*2UL, j , xmm3 );
1940  C.store( i+SIMDSIZE*3UL, j , xmm4 );
1941  C.store( i+SIMDSIZE*4UL, j , xmm5 );
1942  C.store( i , j+1UL, xmm6 );
1943  C.store( i+SIMDSIZE , j+1UL, xmm7 );
1944  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1945  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1946  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1947  }
1948 
1949  if( j < N )
1950  {
1951  const size_t kbegin( ( IsLower_v<MT5> )
1952  ?( ( IsUpper_v<MT4> )
1953  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1954  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1955  :( IsUpper_v<MT4> ? i : 0UL ) );
1956  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
1957 
1958  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1959 
1960  for( size_t k=kbegin; k<kend; ++k ) {
1961  const SIMDType b1( set( B(k,j) ) );
1962  xmm1 += A.load(i ,k) * b1;
1963  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1964  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1965  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1966  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1967  }
1968 
1969  C.store( i , j, xmm1 );
1970  C.store( i+SIMDSIZE , j, xmm2 );
1971  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1972  C.store( i+SIMDSIZE*3UL, j, xmm4 );
1973  C.store( i+SIMDSIZE*4UL, j, xmm5 );
1974  }
1975  }
1976 
1977  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1978  {
1979  const size_t jend( LOW ? min(i+SIMDSIZE*4UL,N) : N );
1980  size_t j( 0UL );
1981 
1982  if( SYM || HERM ) {
1983  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
1984  for( ; j<i; ++j ) {
1985  for( size_t ii=i; ii<iiend; ++ii ) {
1986  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
1987  }
1988  }
1989  }
1990  else if( UPP ) {
1991  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
1992  for( ; j<i; ++j ) {
1993  for( size_t ii=i; ii<iiend; ++ii ) {
1994  reset( C(ii,j) );
1995  }
1996  }
1997  }
1998 
1999  for( ; (j+2UL) <= jend; j+=2UL )
2000  {
2001  const size_t kbegin( ( IsLower_v<MT5> )
2002  ?( ( IsUpper_v<MT4> )
2003  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2004  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2005  :( IsUpper_v<MT4> ? i : 0UL ) );
2006  const size_t kend( ( IsUpper_v<MT5> )
2007  ?( ( IsLower_v<MT4> )
2008  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2009  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2010  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
2011 
2012  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2013 
2014  for( size_t k=kbegin; k<kend; ++k ) {
2015  const SIMDType a1( A.load(i ,k) );
2016  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2017  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2018  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2019  const SIMDType b1( set( B(k,j ) ) );
2020  const SIMDType b2( set( B(k,j+1UL) ) );
2021  xmm1 += a1 * b1;
2022  xmm2 += a2 * b1;
2023  xmm3 += a3 * b1;
2024  xmm4 += a4 * b1;
2025  xmm5 += a1 * b2;
2026  xmm6 += a2 * b2;
2027  xmm7 += a3 * b2;
2028  xmm8 += a4 * b2;
2029  }
2030 
2031  C.store( i , j , xmm1 );
2032  C.store( i+SIMDSIZE , j , xmm2 );
2033  C.store( i+SIMDSIZE*2UL, j , xmm3 );
2034  C.store( i+SIMDSIZE*3UL, j , xmm4 );
2035  C.store( i , j+1UL, xmm5 );
2036  C.store( i+SIMDSIZE , j+1UL, xmm6 );
2037  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2038  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
2039  }
2040 
2041  if( j < jend )
2042  {
2043  const size_t kbegin( ( IsLower_v<MT5> )
2044  ?( ( IsUpper_v<MT4> )
2045  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2046  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2047  :( IsUpper_v<MT4> ? i : 0UL ) );
2048  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
2049 
2050  SIMDType xmm1, xmm2, xmm3, xmm4;
2051 
2052  for( size_t k=kbegin; k<kend; ++k ) {
2053  const SIMDType b1( set( B(k,j) ) );
2054  xmm1 += A.load(i ,k) * b1;
2055  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2056  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2057  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2058  }
2059 
2060  C.store( i , j, xmm1 );
2061  C.store( i+SIMDSIZE , j, xmm2 );
2062  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2063  C.store( i+SIMDSIZE*3UL, j, xmm4 );
2064 
2065  if( LOW ) ++j;
2066  }
2067 
2068  if( LOW ) {
2069  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
2070  for( ; j<N; ++j ) {
2071  for( size_t ii=i; ii<iiend; ++ii ) {
2072  reset( C(ii,j) );
2073  }
2074  }
2075  }
2076  }
2077 
2078  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2079  {
2080  const size_t jend( LOW ? min(i+SIMDSIZE*3UL,N) : N );
2081  size_t j( 0UL );
2082 
2083  if( SYM || HERM ) {
2084  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
2085  for( ; j<i; ++j ) {
2086  for( size_t ii=i; ii<iiend; ++ii ) {
2087  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
2088  }
2089  }
2090  }
2091  else if( UPP ) {
2092  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
2093  for( ; j<i; ++j ) {
2094  for( size_t ii=i; ii<iiend; ++ii ) {
2095  reset( C(ii,j) );
2096  }
2097  }
2098  }
2099 
2100  for( ; (j+2UL) <= jend; j+=2UL )
2101  {
2102  const size_t kbegin( ( IsLower_v<MT5> )
2103  ?( ( IsUpper_v<MT4> )
2104  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2105  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2106  :( IsUpper_v<MT4> ? i : 0UL ) );
2107  const size_t kend( ( IsUpper_v<MT5> )
2108  ?( ( IsLower_v<MT4> )
2109  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2110  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2111  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
2112 
2113  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2114 
2115  for( size_t k=kbegin; k<kend; ++k ) {
2116  const SIMDType a1( A.load(i ,k) );
2117  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2118  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2119  const SIMDType b1( set( B(k,j ) ) );
2120  const SIMDType b2( set( B(k,j+1UL) ) );
2121  xmm1 += a1 * b1;
2122  xmm2 += a2 * b1;
2123  xmm3 += a3 * b1;
2124  xmm4 += a1 * b2;
2125  xmm5 += a2 * b2;
2126  xmm6 += a3 * b2;
2127  }
2128 
2129  C.store( i , j , xmm1 );
2130  C.store( i+SIMDSIZE , j , xmm2 );
2131  C.store( i+SIMDSIZE*2UL, j , xmm3 );
2132  C.store( i , j+1UL, xmm4 );
2133  C.store( i+SIMDSIZE , j+1UL, xmm5 );
2134  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2135  }
2136 
2137  if( j < jend )
2138  {
2139  const size_t kbegin( ( IsLower_v<MT5> )
2140  ?( ( IsUpper_v<MT4> )
2141  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2142  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2143  :( IsUpper_v<MT4> ? i : 0UL ) );
2144  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
2145 
2146  SIMDType xmm1, xmm2, xmm3;
2147 
2148  for( size_t k=kbegin; k<kend; ++k ) {
2149  const SIMDType b1( set( B(k,j) ) );
2150  xmm1 += A.load(i ,k) * b1;
2151  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2152  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2153  }
2154 
2155  C.store( i , j, xmm1 );
2156  C.store( i+SIMDSIZE , j, xmm2 );
2157  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2158 
2159  if( LOW ) ++j;
2160  }
2161 
2162  if( LOW ) {
2163  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
2164  for( ; j<N; ++j ) {
2165  for( size_t ii=i; ii<iiend; ++ii ) {
2166  reset( C(ii,j) );
2167  }
2168  }
2169  }
2170  }
2171 
2172  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2173  {
2174  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
2175  size_t j( 0UL );
2176 
2177  if( SYM || HERM ) {
2178  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
2179  for( ; j<i; ++j ) {
2180  for( size_t ii=i; ii<iiend; ++ii ) {
2181  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
2182  }
2183  }
2184  }
2185  else if( UPP ) {
2186  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
2187  for( ; j<i; ++j ) {
2188  for( size_t ii=i; ii<iiend; ++ii ) {
2189  reset( C(ii,j) );
2190  }
2191  }
2192  }
2193 
2194  for( ; (j+4UL) <= jend; j+=4UL )
2195  {
2196  const size_t kbegin( ( IsLower_v<MT5> )
2197  ?( ( IsUpper_v<MT4> )
2198  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2199  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2200  :( IsUpper_v<MT4> ? i : 0UL ) );
2201  const size_t kend( ( IsUpper_v<MT5> )
2202  ?( ( IsLower_v<MT4> )
2203  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2204  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2205  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2206 
2207  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2208 
2209  for( size_t k=kbegin; k<kend; ++k ) {
2210  const SIMDType a1( A.load(i ,k) );
2211  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2212  const SIMDType b1( set( B(k,j ) ) );
2213  const SIMDType b2( set( B(k,j+1UL) ) );
2214  const SIMDType b3( set( B(k,j+2UL) ) );
2215  const SIMDType b4( set( B(k,j+3UL) ) );
2216  xmm1 += a1 * b1;
2217  xmm2 += a2 * b1;
2218  xmm3 += a1 * b2;
2219  xmm4 += a2 * b2;
2220  xmm5 += a1 * b3;
2221  xmm6 += a2 * b3;
2222  xmm7 += a1 * b4;
2223  xmm8 += a2 * b4;
2224  }
2225 
2226  C.store( i , j , xmm1 );
2227  C.store( i+SIMDSIZE, j , xmm2 );
2228  C.store( i , j+1UL, xmm3 );
2229  C.store( i+SIMDSIZE, j+1UL, xmm4 );
2230  C.store( i , j+2UL, xmm5 );
2231  C.store( i+SIMDSIZE, j+2UL, xmm6 );
2232  C.store( i , j+3UL, xmm7 );
2233  C.store( i+SIMDSIZE, j+3UL, xmm8 );
2234  }
2235 
2236  for( ; (j+3UL) <= jend; j+=3UL )
2237  {
2238  const size_t kbegin( ( IsLower_v<MT5> )
2239  ?( ( IsUpper_v<MT4> )
2240  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2241  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2242  :( IsUpper_v<MT4> ? i : 0UL ) );
2243  const size_t kend( ( IsUpper_v<MT5> )
2244  ?( ( IsLower_v<MT4> )
2245  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2246  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2247  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2248 
2249  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2250 
2251  for( size_t k=kbegin; k<kend; ++k ) {
2252  const SIMDType a1( A.load(i ,k) );
2253  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2254  const SIMDType b1( set( B(k,j ) ) );
2255  const SIMDType b2( set( B(k,j+1UL) ) );
2256  const SIMDType b3( set( B(k,j+2UL) ) );
2257  xmm1 += a1 * b1;
2258  xmm2 += a2 * b1;
2259  xmm3 += a1 * b2;
2260  xmm4 += a2 * b2;
2261  xmm5 += a1 * b3;
2262  xmm6 += a2 * b3;
2263  }
2264 
2265  C.store( i , j , xmm1 );
2266  C.store( i+SIMDSIZE, j , xmm2 );
2267  C.store( i , j+1UL, xmm3 );
2268  C.store( i+SIMDSIZE, j+1UL, xmm4 );
2269  C.store( i , j+2UL, xmm5 );
2270  C.store( i+SIMDSIZE, j+2UL, xmm6 );
2271  }
2272 
2273  for( ; (j+2UL) <= jend; j+=2UL )
2274  {
2275  const size_t kbegin( ( IsLower_v<MT5> )
2276  ?( ( IsUpper_v<MT4> )
2277  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2278  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2279  :( IsUpper_v<MT4> ? i : 0UL ) );
2280  const size_t kend( ( IsUpper_v<MT5> )
2281  ?( ( IsLower_v<MT4> )
2282  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2283  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2284  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2285 
2286  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2287  size_t k( kbegin );
2288 
2289  for( ; (k+2UL) <= kend; k+=2UL ) {
2290  const SIMDType a1( A.load(i ,k ) );
2291  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2292  const SIMDType a3( A.load(i ,k+1UL) );
2293  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2294  const SIMDType b1( set( B(k ,j ) ) );
2295  const SIMDType b2( set( B(k ,j+1UL) ) );
2296  const SIMDType b3( set( B(k+1UL,j ) ) );
2297  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
2298  xmm1 += a1 * b1;
2299  xmm2 += a2 * b1;
2300  xmm3 += a1 * b2;
2301  xmm4 += a2 * b2;
2302  xmm5 += a3 * b3;
2303  xmm6 += a4 * b3;
2304  xmm7 += a3 * b4;
2305  xmm8 += a4 * b4;
2306  }
2307 
2308  for( ; k<kend; ++k ) {
2309  const SIMDType a1( A.load(i ,k) );
2310  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2311  const SIMDType b1( set( B(k,j ) ) );
2312  const SIMDType b2( set( B(k,j+1UL) ) );
2313  xmm1 += a1 * b1;
2314  xmm2 += a2 * b1;
2315  xmm3 += a1 * b2;
2316  xmm4 += a2 * b2;
2317  }
2318 
2319  C.store( i , j , xmm1+xmm5 );
2320  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
2321  C.store( i , j+1UL, xmm3+xmm7 );
2322  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2323  }
2324 
2325  if( j < jend )
2326  {
2327  const size_t kbegin( ( IsLower_v<MT5> )
2328  ?( ( IsUpper_v<MT4> )
2329  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2330  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2331  :( IsUpper_v<MT4> ? i : 0UL ) );
2332  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2333 
2334  SIMDType xmm1, xmm2, xmm3, xmm4;
2335  size_t k( kbegin );
2336 
2337  for( ; (k+2UL) <= kend; k+=2UL ) {
2338  const SIMDType b1( set( B(k ,j) ) );
2339  const SIMDType b2( set( B(k+1UL,j) ) );
2340  xmm1 += A.load(i ,k ) * b1;
2341  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2342  xmm3 += A.load(i ,k+1UL) * b2;
2343  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2344  }
2345 
2346  for( ; k<kend; ++k ) {
2347  const SIMDType b1( set( B(k,j) ) );
2348  xmm1 += A.load(i ,k) * b1;
2349  xmm2 += A.load(i+SIMDSIZE,k) * b1;
2350  }
2351 
2352  C.store( i , j, xmm1+xmm3 );
2353  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
2354 
2355  if( LOW ) ++j;
2356  }
2357 
2358  if( LOW ) {
2359  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
2360  for( ; j<N; ++j ) {
2361  for( size_t ii=i; ii<iiend; ++ii ) {
2362  reset( C(ii,j) );
2363  }
2364  }
2365  }
2366  }
2367 
2368  for( ; i<ipos; i+=SIMDSIZE )
2369  {
2370  const size_t jend( LOW ? min(i+SIMDSIZE,N) : N );
2371  size_t j( 0UL );
2372 
2373  if( SYM || HERM ) {
2374  const size_t iiend( min(i+SIMDSIZE,M) );
2375  for( ; j<i; ++j ) {
2376  for( size_t ii=i; ii<iiend; ++ii ) {
2377  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
2378  }
2379  }
2380  }
2381  else if( UPP ) {
2382  const size_t iiend( min(i+SIMDSIZE,M) );
2383  for( ; j<i; ++j ) {
2384  for( size_t ii=i; ii<iiend; ++ii ) {
2385  reset( C(ii,j) );
2386  }
2387  }
2388  }
2389 
2390  for( ; (j+4UL) <= jend; j+=4UL )
2391  {
2392  const size_t kbegin( ( IsLower_v<MT5> )
2393  ?( ( IsUpper_v<MT4> )
2394  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2395  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2396  :( IsUpper_v<MT4> ? i : 0UL ) );
2397  const size_t kend( ( IsUpper_v<MT5> )
2398  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2399  :( K ) );
2400 
2401  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2402  size_t k( kbegin );
2403 
2404  for( ; (k+2UL) <= kend; k+=2UL ) {
2405  const SIMDType a1( A.load(i,k ) );
2406  const SIMDType a2( A.load(i,k+1UL) );
2407  xmm1 += a1 * set( B(k ,j ) );
2408  xmm2 += a1 * set( B(k ,j+1UL) );
2409  xmm3 += a1 * set( B(k ,j+2UL) );
2410  xmm4 += a1 * set( B(k ,j+3UL) );
2411  xmm5 += a2 * set( B(k+1UL,j ) );
2412  xmm6 += a2 * set( B(k+1UL,j+1UL) );
2413  xmm7 += a2 * set( B(k+1UL,j+2UL) );
2414  xmm8 += a2 * set( B(k+1UL,j+3UL) );
2415  }
2416 
2417  for( ; k<kend; ++k ) {
2418  const SIMDType a1( A.load(i,k) );
2419  xmm1 += a1 * set( B(k,j ) );
2420  xmm2 += a1 * set( B(k,j+1UL) );
2421  xmm3 += a1 * set( B(k,j+2UL) );
2422  xmm4 += a1 * set( B(k,j+3UL) );
2423  }
2424 
2425  C.store( i, j , xmm1+xmm5 );
2426  C.store( i, j+1UL, xmm2+xmm6 );
2427  C.store( i, j+2UL, xmm3+xmm7 );
2428  C.store( i, j+3UL, xmm4+xmm8 );
2429  }
2430 
2431  for( ; (j+3UL) <= jend; j+=3UL )
2432  {
2433  const size_t kbegin( ( IsLower_v<MT5> )
2434  ?( ( IsUpper_v<MT4> )
2435  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2436  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2437  :( IsUpper_v<MT4> ? i : 0UL ) );
2438  const size_t kend( ( IsUpper_v<MT5> )
2439  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2440  :( K ) );
2441 
2442  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2443  size_t k( kbegin );
2444 
2445  for( ; (k+2UL) <= kend; k+=2UL ) {
2446  const SIMDType a1( A.load(i,k ) );
2447  const SIMDType a2( A.load(i,k+1UL) );
2448  xmm1 += a1 * set( B(k ,j ) );
2449  xmm2 += a1 * set( B(k ,j+1UL) );
2450  xmm3 += a1 * set( B(k ,j+2UL) );
2451  xmm4 += a2 * set( B(k+1UL,j ) );
2452  xmm5 += a2 * set( B(k+1UL,j+1UL) );
2453  xmm6 += a2 * set( B(k+1UL,j+2UL) );
2454  }
2455 
2456  for( ; k<kend; ++k ) {
2457  const SIMDType a1( A.load(i,k) );
2458  xmm1 += a1 * set( B(k,j ) );
2459  xmm2 += a1 * set( B(k,j+1UL) );
2460  xmm3 += a1 * set( B(k,j+2UL) );
2461  }
2462 
2463  C.store( i, j , xmm1+xmm4 );
2464  C.store( i, j+1UL, xmm2+xmm5 );
2465  C.store( i, j+2UL, xmm3+xmm6 );
2466  }
2467 
2468  for( ; (j+2UL) <= jend; j+=2UL )
2469  {
2470  const size_t kbegin( ( IsLower_v<MT5> )
2471  ?( ( IsUpper_v<MT4> )
2472  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2473  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2474  :( IsUpper_v<MT4> ? i : 0UL ) );
2475  const size_t kend( ( IsUpper_v<MT5> )
2476  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2477  :( K ) );
2478 
2479  SIMDType xmm1, xmm2, xmm3, xmm4;
2480  size_t k( kbegin );
2481 
2482  for( ; (k+2UL) <= kend; k+=2UL ) {
2483  const SIMDType a1( A.load(i,k ) );
2484  const SIMDType a2( A.load(i,k+1UL) );
2485  xmm1 += a1 * set( B(k ,j ) );
2486  xmm2 += a1 * set( B(k ,j+1UL) );
2487  xmm3 += a2 * set( B(k+1UL,j ) );
2488  xmm4 += a2 * set( B(k+1UL,j+1UL) );
2489  }
2490 
2491  for( ; k<kend; ++k ) {
2492  const SIMDType a1( A.load(i,k) );
2493  xmm1 += a1 * set( B(k,j ) );
2494  xmm2 += a1 * set( B(k,j+1UL) );
2495  }
2496 
2497  C.store( i, j , xmm1+xmm3 );
2498  C.store( i, j+1UL, xmm2+xmm4 );
2499  }
2500 
2501  if( j < jend )
2502  {
2503  const size_t kbegin( ( IsLower_v<MT5> )
2504  ?( ( IsUpper_v<MT4> )
2505  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2506  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2507  :( IsUpper_v<MT4> ? i : 0UL ) );
2508 
2509  SIMDType xmm1, xmm2;
2510  size_t k( kbegin );
2511 
2512  for( ; (k+2UL) <= K; k+=2UL ) {
2513  xmm1 += A.load(i,k ) * set( B(k ,j) );
2514  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
2515  }
2516 
2517  for( ; k<K; ++k ) {
2518  xmm1 += A.load(i,k) * set( B(k,j) );
2519  }
2520 
2521  C.store( i, j, xmm1+xmm2 );
2522 
2523  if( LOW ) ++j;
2524  }
2525 
2526  if( LOW ) {
2527  const size_t iiend( min(i+SIMDSIZE,M) );
2528  for( ; j<N; ++j ) {
2529  for( size_t ii=i; ii<iiend; ++ii ) {
2530  reset( C(ii,j) );
2531  }
2532  }
2533  }
2534  }
2535 
2536  for( ; remainder && i<M; ++i )
2537  {
2538  size_t j( 0UL );
2539 
2540  if( SYM || HERM ) {
2541  for( ; j<i; ++j ) {
2542  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
2543  }
2544  }
2545  else if( UPP ) {
2546  for( ; j<i; ++j ) {
2547  reset( C(i,j) );
2548  }
2549  }
2550 
2551  for( ; (j+2UL) <= N; j+=2UL )
2552  {
2553  const size_t kbegin( ( IsLower_v<MT5> )
2554  ?( ( IsUpper_v<MT4> )
2555  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2556  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2557  :( IsUpper_v<MT4> ? i : 0UL ) );
2558  const size_t kend( ( IsUpper_v<MT5> )
2559  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2560  :( K ) );
2561 
2562  ElementType value1{};
2563  ElementType value2{};
2564 
2565  for( size_t k=kbegin; k<kend; ++k ) {
2566  value1 += A(i,k) * B(k,j );
2567  value2 += A(i,k) * B(k,j+1UL);
2568  }
2569 
2570  C(i,j ) = value1;
2571  C(i,j+1UL) = value2;
2572  }
2573 
2574  if( j < N )
2575  {
2576  const size_t kbegin( ( IsLower_v<MT5> )
2577  ?( ( IsUpper_v<MT4> )
2578  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2579  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2580  :( IsUpper_v<MT4> ? i : 0UL ) );
2581 
2582  ElementType value{};
2583 
2584  for( size_t k=kbegin; k<K; ++k ) {
2585  value += A(i,k) * B(k,j);
2586  }
2587 
2588  C(i,j) = value;
2589  }
2590  }
2591  }
2593  //**********************************************************************************************
2594 
2595  //**Default assignment to dense matrices (large matrices)***************************************
2609  template< typename MT3 // Type of the left-hand side target matrix
2610  , typename MT4 // Type of the left-hand side matrix operand
2611  , typename MT5 > // Type of the right-hand side matrix operand
2612  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2613  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2614  {
2615  selectDefaultAssignKernel( C, A, B );
2616  }
2618  //**********************************************************************************************
2619 
2620  //**Vectorized default assignment to dense matrices (large matrices)****************************
2635  template< typename MT3 // Type of the left-hand side target matrix
2636  , typename MT4 // Type of the left-hand side matrix operand
2637  , typename MT5 > // Type of the right-hand side matrix operand
2638  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2639  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2640  {
2641  if( SYM )
2642  smmm( C, A, B, ElementType(1) );
2643  else if( HERM )
2644  hmmm( C, A, B, ElementType(1) );
2645  else if( LOW )
2646  lmmm( C, A, B, ElementType(1), ElementType(0) );
2647  else if( UPP )
2648  ummm( C, A, B, ElementType(1), ElementType(0) );
2649  else
2650  mmm( C, A, B, ElementType(1), ElementType(0) );
2651  }
2653  //**********************************************************************************************
2654 
2655  //**BLAS-based assignment to dense matrices (default)*******************************************
2669  template< typename MT3 // Type of the left-hand side target matrix
2670  , typename MT4 // Type of the left-hand side matrix operand
2671  , typename MT5 > // Type of the right-hand side matrix operand
2672  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2673  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2674  {
2675  selectLargeAssignKernel( C, A, B );
2676  }
2678  //**********************************************************************************************
2679 
2680  //**BLAS-based assignment to dense matrices*****************************************************
2681 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2682 
2695  template< typename MT3 // Type of the left-hand side target matrix
2696  , typename MT4 // Type of the left-hand side matrix operand
2697  , typename MT5 > // Type of the right-hand side matrix operand
2698  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2699  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2700  {
2701  using ET = ElementType_t<MT3>;
2702 
2703  if( IsTriangular_v<MT4> ) {
2704  assign( C, B );
2705  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2706  }
2707  else if( IsTriangular_v<MT5> ) {
2708  assign( C, A );
2709  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2710  }
2711  else {
2712  gemm( C, A, B, ET(1), ET(0) );
2713  }
2714  }
2716 #endif
2717  //**********************************************************************************************
2718 
2719  //**Assignment to sparse matrices***************************************************************
2732  template< typename MT // Type of the target sparse matrix
2733  , bool SO > // Storage order of the target sparse matrix
2734  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2735  {
2737 
2738  using TmpType = If_t< SO, ResultType, OppositeType >;
2739 
2746 
2747  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2748  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2749 
2750  const ForwardFunctor fwd;
2751 
2752  const TmpType tmp( serial( rhs ) );
2753  assign( ~lhs, fwd( tmp ) );
2754  }
2756  //**********************************************************************************************
2757 
2758  //**Addition assignment to dense matrices*******************************************************
2771  template< typename MT // Type of the target dense matrix
2772  , bool SO > // Storage order of the target dense matrix
2773  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2774  {
2776 
2777  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2778  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2779 
2780  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2781  return;
2782  }
2783 
2784  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2785  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2786 
2787  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2788  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2789  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2790  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2791  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2792  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2793 
2794  TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
2795  }
2797  //**********************************************************************************************
2798 
2799  //**Addition assignment to dense matrices (kernel selection)************************************
2810  template< typename MT3 // Type of the left-hand side target matrix
2811  , typename MT4 // Type of the left-hand side matrix operand
2812  , typename MT5 > // Type of the right-hand side matrix operand
2813  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2814  {
2815  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
2816  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
2817  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
2818  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2819  selectSmallAddAssignKernel( C, A, B );
2820  else
2821  selectBlasAddAssignKernel( C, A, B );
2822  }
2824  //**********************************************************************************************
2825 
2826  //**Default addition assignment to row-major dense matrices (general/general)*******************
2840  template< typename MT3 // Type of the left-hand side target matrix
2841  , typename MT4 // Type of the left-hand side matrix operand
2842  , typename MT5 > // Type of the right-hand side matrix operand
2843  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2844  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2845  {
2846  const size_t M( A.rows() );
2847  const size_t N( B.columns() );
2848  const size_t K( A.columns() );
2849 
2850  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2851 
2852  for( size_t i=0UL; i<M; ++i )
2853  {
2854  const size_t kbegin( ( IsUpper_v<MT4> )
2855  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2856  :( 0UL ) );
2857  const size_t kend( ( IsLower_v<MT4> )
2858  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2859  :( K ) );
2860  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2861 
2862  for( size_t k=kbegin; k<kend; ++k )
2863  {
2864  const size_t jbegin( ( IsUpper_v<MT5> )
2865  ?( ( IsStrictlyUpper_v<MT5> )
2866  ?( UPP ? max(i,k+1UL) : k+1UL )
2867  :( UPP ? max(i,k) : k ) )
2868  :( UPP ? i : 0UL ) );
2869  const size_t jend( ( IsLower_v<MT5> )
2870  ?( ( IsStrictlyLower_v<MT5> )
2871  ?( LOW ? min(i+1UL,k) : k )
2872  :( LOW ? min(i,k)+1UL : k+1UL ) )
2873  :( LOW ? i+1UL : N ) );
2874 
2875  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
2876  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2877 
2878  const size_t jnum( jend - jbegin );
2879  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2880 
2881  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2882  C(i,j ) += A(i,k) * B(k,j );
2883  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2884  }
2885  if( jpos < jend ) {
2886  C(i,jpos) += A(i,k) * B(k,jpos);
2887  }
2888  }
2889  }
2890  }
2892  //**********************************************************************************************
2893 
2894  //**Default addition assignment to column-major dense matrices (general/general)****************
2908  template< typename MT3 // Type of the left-hand side target matrix
2909  , typename MT4 // Type of the left-hand side matrix operand
2910  , typename MT5 > // Type of the right-hand side matrix operand
2911  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2912  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2913  {
2914  const size_t M( A.rows() );
2915  const size_t N( B.columns() );
2916  const size_t K( A.columns() );
2917 
2918  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2919 
2920  for( size_t j=0UL; j<N; ++j )
2921  {
2922  const size_t kbegin( ( IsLower_v<MT5> )
2923  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2924  :( 0UL ) );
2925  const size_t kend( ( IsUpper_v<MT5> )
2926  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2927  :( K ) );
2928  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2929 
2930  for( size_t k=kbegin; k<kend; ++k )
2931  {
2932  const size_t ibegin( ( IsLower_v<MT4> )
2933  ?( ( IsStrictlyLower_v<MT4> )
2934  ?( LOW ? max(j,k+1UL) : k+1UL )
2935  :( LOW ? max(j,k) : k ) )
2936  :( LOW ? j : 0UL ) );
2937  const size_t iend( ( IsUpper_v<MT4> )
2938  ?( ( IsStrictlyUpper_v<MT4> )
2939  ?( UPP ? min(j+1UL,k) : k )
2940  :( UPP ? min(j,k)+1UL : k+1UL ) )
2941  :( UPP ? j+1UL : M ) );
2942 
2943  if( ( LOW || UPP ) && ibegin >= iend ) continue;
2944  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2945 
2946  const size_t inum( iend - ibegin );
2947  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2948 
2949  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2950  C(i ,j) += A(i ,k) * B(k,j);
2951  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2952  }
2953  if( ipos < iend ) {
2954  C(ipos,j) += A(ipos,k) * B(k,j);
2955  }
2956  }
2957  }
2958  }
2960  //**********************************************************************************************
2961 
2962  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
2976  template< typename MT3 // Type of the left-hand side target matrix
2977  , typename MT4 // Type of the left-hand side matrix operand
2978  , typename MT5 > // Type of the right-hand side matrix operand
2979  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2980  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2981  {
2982  constexpr size_t block( BLOCK_SIZE );
2983 
2984  const size_t M( A.rows() );
2985  const size_t N( B.columns() );
2986 
2987  for( size_t ii=0UL; ii<M; ii+=block ) {
2988  const size_t iend( min( M, ii+block ) );
2989  for( size_t jj=0UL; jj<N; jj+=block ) {
2990  const size_t jend( min( N, jj+block ) );
2991  for( size_t i=ii; i<iend; ++i )
2992  {
2993  const size_t jbegin( ( IsUpper_v<MT4> )
2994  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
2995  :( jj ) );
2996  const size_t jpos( ( IsLower_v<MT4> )
2997  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
2998  :( jend ) );
2999 
3000  for( size_t j=jbegin; j<jpos; ++j ) {
3001  C(i,j) += A(i,j) * B(j,j);
3002  }
3003  }
3004  }
3005  }
3006  }
3008  //**********************************************************************************************
3009 
3010  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
3024  template< typename MT3 // Type of the left-hand side target matrix
3025  , typename MT4 // Type of the left-hand side matrix operand
3026  , typename MT5 > // Type of the right-hand side matrix operand
3027  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3028  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3029  {
3030  const size_t M( A.rows() );
3031  const size_t N( B.columns() );
3032 
3033  for( size_t j=0UL; j<N; ++j )
3034  {
3035  const size_t ibegin( ( IsLower_v<MT4> )
3036  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
3037  :( 0UL ) );
3038  const size_t iend( ( IsUpper_v<MT4> )
3039  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
3040  :( M ) );
3041  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3042 
3043  const size_t inum( iend - ibegin );
3044  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3045 
3046  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3047  C(i ,j) += A(i ,j) * B(j,j);
3048  C(i+1UL,j) += A(i+1UL,j) * B(j,j);
3049  }
3050  if( ipos < iend ) {
3051  C(ipos,j) += A(ipos,j) * B(j,j);
3052  }
3053  }
3054  }
3056  //**********************************************************************************************
3057 
3058  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
3072  template< typename MT3 // Type of the left-hand side target matrix
3073  , typename MT4 // Type of the left-hand side matrix operand
3074  , typename MT5 > // Type of the right-hand side matrix operand
3075  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3076  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3077  {
3078  const size_t M( A.rows() );
3079  const size_t N( B.columns() );
3080 
3081  for( size_t i=0UL; i<M; ++i )
3082  {
3083  const size_t jbegin( ( IsUpper_v<MT5> )
3084  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
3085  :( 0UL ) );
3086  const size_t jend( ( IsLower_v<MT5> )
3087  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
3088  :( N ) );
3089  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3090 
3091  const size_t jnum( jend - jbegin );
3092  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3093 
3094  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3095  C(i,j ) += A(i,i) * B(i,j );
3096  C(i,j+1UL) += A(i,i) * B(i,j+1UL);
3097  }
3098  if( jpos < jend ) {
3099  C(i,jpos) += A(i,i) * B(i,jpos);
3100  }
3101  }
3102  }
3104  //**********************************************************************************************
3105 
3106  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
3120  template< typename MT3 // Type of the left-hand side target matrix
3121  , typename MT4 // Type of the left-hand side matrix operand
3122  , typename MT5 > // Type of the right-hand side matrix operand
3123  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3124  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3125  {
3126  constexpr size_t block( BLOCK_SIZE );
3127 
3128  const size_t M( A.rows() );
3129  const size_t N( B.columns() );
3130 
3131  for( size_t jj=0UL; jj<N; jj+=block ) {
3132  const size_t jend( min( N, jj+block ) );
3133  for( size_t ii=0UL; ii<M; ii+=block ) {
3134  const size_t iend( min( M, ii+block ) );
3135  for( size_t j=jj; j<jend; ++j )
3136  {
3137  const size_t ibegin( ( IsLower_v<MT5> )
3138  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
3139  :( ii ) );
3140  const size_t ipos( ( IsUpper_v<MT5> )
3141  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
3142  :( iend ) );
3143 
3144  for( size_t i=ibegin; i<ipos; ++i ) {
3145  C(i,j) += A(i,i) * B(i,j);
3146  }
3147  }
3148  }
3149  }
3150  }
3152  //**********************************************************************************************
3153 
3154  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
3168  template< typename MT3 // Type of the left-hand side target matrix
3169  , typename MT4 // Type of the left-hand side matrix operand
3170  , typename MT5 > // Type of the right-hand side matrix operand
3171  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3172  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3173  {
3174  for( size_t i=0UL; i<A.rows(); ++i ) {
3175  C(i,i) += A(i,i) * B(i,i);
3176  }
3177  }
3179  //**********************************************************************************************
3180 
3181  //**Default addition assignment to dense matrices (small matrices)******************************
3195  template< typename MT3 // Type of the left-hand side target matrix
3196  , typename MT4 // Type of the left-hand side matrix operand
3197  , typename MT5 > // Type of the right-hand side matrix operand
3198  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3199  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3200  {
3201  selectDefaultAddAssignKernel( C, A, B );
3202  }
3204  //**********************************************************************************************
3205 
3206  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
3221  template< typename MT3 // Type of the left-hand side target matrix
3222  , typename MT4 // Type of the left-hand side matrix operand
3223  , typename MT5 > // Type of the right-hand side matrix operand
3224  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3225  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3226  {
3227  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3228 
3229  const size_t M( A.rows() );
3230  const size_t N( B.columns() );
3231  const size_t K( A.columns() );
3232 
3233  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3234 
3235  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3236  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3237 
3238  size_t j( 0UL );
3239 
3240  if( IsIntegral_v<ElementType> )
3241  {
3242  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3243  for( size_t i=0UL; i<M; ++i )
3244  {
3245  const size_t kbegin( ( IsUpper_v<MT4> )
3246  ?( ( IsLower_v<MT5> )
3247  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3248  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3249  :( IsLower_v<MT5> ? j : 0UL ) );
3250  const size_t kend( ( IsLower_v<MT4> )
3251  ?( ( IsUpper_v<MT5> )
3252  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
3253  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3254  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
3255 
3256  SIMDType xmm1( C.load(i,j ) );
3257  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3258  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3259  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3260  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3261  SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
3262  SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
3263  SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
3264 
3265  for( size_t k=kbegin; k<kend; ++k ) {
3266  const SIMDType a1( set( A(i,k) ) );
3267  xmm1 += a1 * B.load(k,j );
3268  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3269  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3270  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3271  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3272  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
3273  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
3274  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
3275  }
3276 
3277  C.store( i, j , xmm1 );
3278  C.store( i, j+SIMDSIZE , xmm2 );
3279  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3280  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3281  C.store( i, j+SIMDSIZE*4UL, xmm5 );
3282  C.store( i, j+SIMDSIZE*5UL, xmm6 );
3283  C.store( i, j+SIMDSIZE*6UL, xmm7 );
3284  C.store( i, j+SIMDSIZE*7UL, xmm8 );
3285  }
3286  }
3287  }
3288 
3289  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3290  {
3291  size_t i( 0UL );
3292 
3293  for( ; (i+2UL) <= M; i+=2UL )
3294  {
3295  const size_t kbegin( ( IsUpper_v<MT4> )
3296  ?( ( IsLower_v<MT5> )
3297  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3298  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3299  :( IsLower_v<MT5> ? j : 0UL ) );
3300  const size_t kend( ( IsLower_v<MT4> )
3301  ?( ( IsUpper_v<MT5> )
3302  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
3303  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3304  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
3305 
3306  SIMDType xmm1 ( C.load(i ,j ) );
3307  SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
3308  SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
3309  SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
3310  SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
3311  SIMDType xmm6 ( C.load(i+1UL,j ) );
3312  SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
3313  SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
3314  SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
3315  SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
3316 
3317  for( size_t k=kbegin; k<kend; ++k ) {
3318  const SIMDType a1( set( A(i ,k) ) );
3319  const SIMDType a2( set( A(i+1UL,k) ) );
3320  const SIMDType b1( B.load(k,j ) );
3321  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3322  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3323  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3324  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3325  xmm1 += a1 * b1;
3326  xmm2 += a1 * b2;
3327  xmm3 += a1 * b3;
3328  xmm4 += a1 * b4;
3329  xmm5 += a1 * b5;
3330  xmm6 += a2 * b1;
3331  xmm7 += a2 * b2;
3332  xmm8 += a2 * b3;
3333  xmm9 += a2 * b4;
3334  xmm10 += a2 * b5;
3335  }
3336 
3337  C.store( i , j , xmm1 );
3338  C.store( i , j+SIMDSIZE , xmm2 );
3339  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3340  C.store( i , j+SIMDSIZE*3UL, xmm4 );
3341  C.store( i , j+SIMDSIZE*4UL, xmm5 );
3342  C.store( i+1UL, j , xmm6 );
3343  C.store( i+1UL, j+SIMDSIZE , xmm7 );
3344  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3345  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3346  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
3347  }
3348 
3349  if( i < M )
3350  {
3351  const size_t kbegin( ( IsUpper_v<MT4> )
3352  ?( ( IsLower_v<MT5> )
3353  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3354  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3355  :( IsLower_v<MT5> ? j : 0UL ) );
3356  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
3357 
3358  SIMDType xmm1( C.load(i,j ) );
3359  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3360  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3361  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3362  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3363 
3364  for( size_t k=kbegin; k<kend; ++k ) {
3365  const SIMDType a1( set( A(i,k) ) );
3366  xmm1 += a1 * B.load(k,j );
3367  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3368  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3369  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3370  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3371  }
3372 
3373  C.store( i, j , xmm1 );
3374  C.store( i, j+SIMDSIZE , xmm2 );
3375  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3376  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3377  C.store( i, j+SIMDSIZE*4UL, xmm5 );
3378  }
3379  }
3380 
3381  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3382  {
3383  size_t i( 0UL );
3384 
3385  for( ; (i+2UL) <= M; i+=2UL )
3386  {
3387  const size_t kbegin( ( IsUpper_v<MT4> )
3388  ?( ( IsLower_v<MT5> )
3389  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3390  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3391  :( IsLower_v<MT5> ? j : 0UL ) );
3392  const size_t kend( ( IsLower_v<MT4> )
3393  ?( ( IsUpper_v<MT5> )
3394  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3395  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3396  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
3397 
3398  SIMDType xmm1( C.load(i ,j ) );
3399  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3400  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3401  SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
3402  SIMDType xmm5( C.load(i+1UL,j ) );
3403  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
3404  SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
3405  SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
3406 
3407  for( size_t k=kbegin; k<kend; ++k ) {
3408  const SIMDType a1( set( A(i ,k) ) );
3409  const SIMDType a2( set( A(i+1UL,k) ) );
3410  const SIMDType b1( B.load(k,j ) );
3411  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3412  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3413  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3414  xmm1 += a1 * b1;
3415  xmm2 += a1 * b2;
3416  xmm3 += a1 * b3;
3417  xmm4 += a1 * b4;
3418  xmm5 += a2 * b1;
3419  xmm6 += a2 * b2;
3420  xmm7 += a2 * b3;
3421  xmm8 += a2 * b4;
3422  }
3423 
3424  C.store( i , j , xmm1 );
3425  C.store( i , j+SIMDSIZE , xmm2 );
3426  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3427  C.store( i , j+SIMDSIZE*3UL, xmm4 );
3428  C.store( i+1UL, j , xmm5 );
3429  C.store( i+1UL, j+SIMDSIZE , xmm6 );
3430  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3431  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3432  }
3433 
3434  if( i < M )
3435  {
3436  const size_t kbegin( ( IsUpper_v<MT4> )
3437  ?( ( IsLower_v<MT5> )
3438  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3439  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3440  :( IsLower_v<MT5> ? j : 0UL ) );
3441  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
3442 
3443  SIMDType xmm1( C.load(i,j ) );
3444  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3445  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3446  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3447 
3448  for( size_t k=kbegin; k<kend; ++k ) {
3449  const SIMDType a1( set( A(i,k) ) );
3450  xmm1 += a1 * B.load(k,j );
3451  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3452  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3453  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3454  }
3455 
3456  C.store( i, j , xmm1 );
3457  C.store( i, j+SIMDSIZE , xmm2 );
3458  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3459  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3460  }
3461  }
3462 
3463  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3464  {
3465  size_t i( 0UL );
3466 
3467  for( ; (i+2UL) <= M; i+=2UL )
3468  {
3469  const size_t kbegin( ( IsUpper_v<MT4> )
3470  ?( ( IsLower_v<MT5> )
3471  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3472  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3473  :( IsLower_v<MT5> ? j : 0UL ) );
3474  const size_t kend( ( IsLower_v<MT4> )
3475  ?( ( IsUpper_v<MT5> )
3476  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
3477  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3478  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
3479 
3480  SIMDType xmm1( C.load(i ,j ) );
3481  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3482  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3483  SIMDType xmm4( C.load(i+1UL,j ) );
3484  SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
3485  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
3486 
3487  for( size_t k=kbegin; k<kend; ++k ) {
3488  const SIMDType a1( set( A(i ,k) ) );
3489  const SIMDType a2( set( A(i+1UL,k) ) );
3490  const SIMDType b1( B.load(k,j ) );
3491  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3492  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3493  xmm1 += a1 * b1;
3494  xmm2 += a1 * b2;
3495  xmm3 += a1 * b3;
3496  xmm4 += a2 * b1;
3497  xmm5 += a2 * b2;
3498  xmm6 += a2 * b3;
3499  }
3500 
3501  C.store( i , j , xmm1 );
3502  C.store( i , j+SIMDSIZE , xmm2 );
3503  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3504  C.store( i+1UL, j , xmm4 );
3505  C.store( i+1UL, j+SIMDSIZE , xmm5 );
3506  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
3507  }
3508 
3509  if( i < M )
3510  {
3511  const size_t kbegin( ( IsUpper_v<MT4> )
3512  ?( ( IsLower_v<MT5> )
3513  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3514  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3515  :( IsLower_v<MT5> ? j : 0UL ) );
3516  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
3517 
3518  SIMDType xmm1( C.load(i,j ) );
3519  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3520  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3521 
3522  for( size_t k=kbegin; k<kend; ++k ) {
3523  const SIMDType a1( set( A(i,k) ) );
3524  xmm1 += a1 * B.load(k,j );
3525  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3526  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3527  }
3528 
3529  C.store( i, j , xmm1 );
3530  C.store( i, j+SIMDSIZE , xmm2 );
3531  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3532  }
3533  }
3534 
3535  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3536  {
3537  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
3538  size_t i( LOW ? j : 0UL );
3539 
3540  for( ; (i+4UL) <= iend; i+=4UL )
3541  {
3542  const size_t kbegin( ( IsUpper_v<MT4> )
3543  ?( ( IsLower_v<MT5> )
3544  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3545  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3546  :( IsLower_v<MT5> ? j : 0UL ) );
3547  const size_t kend( ( IsLower_v<MT4> )
3548  ?( ( IsUpper_v<MT5> )
3549  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
3550  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
3551  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3552 
3553  SIMDType xmm1( C.load(i ,j ) );
3554  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3555  SIMDType xmm3( C.load(i+1UL,j ) );
3556  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3557  SIMDType xmm5( C.load(i+2UL,j ) );
3558  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
3559  SIMDType xmm7( C.load(i+3UL,j ) );
3560  SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
3561 
3562  for( size_t k=kbegin; k<kend; ++k ) {
3563  const SIMDType a1( set( A(i ,k) ) );
3564  const SIMDType a2( set( A(i+1UL,k) ) );
3565  const SIMDType a3( set( A(i+2UL,k) ) );
3566  const SIMDType a4( set( A(i+3UL,k) ) );
3567  const SIMDType b1( B.load(k,j ) );
3568  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3569  xmm1 += a1 * b1;
3570  xmm2 += a1 * b2;
3571  xmm3 += a2 * b1;
3572  xmm4 += a2 * b2;
3573  xmm5 += a3 * b1;
3574  xmm6 += a3 * b2;
3575  xmm7 += a4 * b1;
3576  xmm8 += a4 * b2;
3577  }
3578 
3579  C.store( i , j , xmm1 );
3580  C.store( i , j+SIMDSIZE, xmm2 );
3581  C.store( i+1UL, j , xmm3 );
3582  C.store( i+1UL, j+SIMDSIZE, xmm4 );
3583  C.store( i+2UL, j , xmm5 );
3584  C.store( i+2UL, j+SIMDSIZE, xmm6 );
3585  C.store( i+3UL, j , xmm7 );
3586  C.store( i+3UL, j+SIMDSIZE, xmm8 );
3587  }
3588 
3589  for( ; (i+3UL) <= iend; i+=3UL )
3590  {
3591  const size_t kbegin( ( IsUpper_v<MT4> )
3592  ?( ( IsLower_v<MT5> )
3593  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3594  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3595  :( IsLower_v<MT5> ? j : 0UL ) );
3596  const size_t kend( ( IsLower_v<MT4> )
3597  ?( ( IsUpper_v<MT5> )
3598  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
3599  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
3600  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3601 
3602  SIMDType xmm1( C.load(i ,j ) );
3603  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3604  SIMDType xmm3( C.load(i+1UL,j ) );
3605  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3606  SIMDType xmm5( C.load(i+2UL,j ) );
3607  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
3608 
3609  for( size_t k=kbegin; k<kend; ++k ) {
3610  const SIMDType a1( set( A(i ,k) ) );
3611  const SIMDType a2( set( A(i+1UL,k) ) );
3612  const SIMDType a3( set( A(i+2UL,k) ) );
3613  const SIMDType b1( B.load(k,j ) );
3614  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3615  xmm1 += a1 * b1;
3616  xmm2 += a1 * b2;
3617  xmm3 += a2 * b1;
3618  xmm4 += a2 * b2;
3619  xmm5 += a3 * b1;
3620  xmm6 += a3 * b2;
3621  }
3622 
3623  C.store( i , j , xmm1 );
3624  C.store( i , j+SIMDSIZE, xmm2 );
3625  C.store( i+1UL, j , xmm3 );
3626  C.store( i+1UL, j+SIMDSIZE, xmm4 );
3627  C.store( i+2UL, j , xmm5 );
3628  C.store( i+2UL, j+SIMDSIZE, xmm6 );
3629  }
3630 
3631  for( ; (i+2UL) <= iend; i+=2UL )
3632  {
3633  const size_t kbegin( ( IsUpper_v<MT4> )
3634  ?( ( IsLower_v<MT5> )
3635  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3636  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3637  :( IsLower_v<MT5> ? j : 0UL ) );
3638  const size_t kend( ( IsLower_v<MT4> )
3639  ?( ( IsUpper_v<MT5> )
3640  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
3641  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3642  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3643 
3644  SIMDType xmm1( C.load(i ,j ) );
3645  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3646  SIMDType xmm3( C.load(i+1UL,j ) );
3647  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3648  SIMDType xmm5, xmm6, xmm7, xmm8;
3649  size_t k( kbegin );
3650 
3651  for( ; (k+2UL) <= kend; k+=2UL ) {
3652  const SIMDType a1( set( A(i ,k ) ) );
3653  const SIMDType a2( set( A(i+1UL,k ) ) );
3654  const SIMDType a3( set( A(i ,k+1UL) ) );
3655  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
3656  const SIMDType b1( B.load(k ,j ) );
3657  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
3658  const SIMDType b3( B.load(k+1UL,j ) );
3659  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
3660  xmm1 += a1 * b1;
3661  xmm2 += a1 * b2;
3662  xmm3 += a2 * b1;
3663  xmm4 += a2 * b2;
3664  xmm5 += a3 * b3;
3665  xmm6 += a3 * b4;
3666  xmm7 += a4 * b3;
3667  xmm8 += a4 * b4;
3668  }
3669 
3670  for( ; k<kend; ++k ) {
3671  const SIMDType a1( set( A(i ,k) ) );
3672  const SIMDType a2( set( A(i+1UL,k) ) );
3673  const SIMDType b1( B.load(k,j ) );
3674  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3675  xmm1 += a1 * b1;
3676  xmm2 += a1 * b2;
3677  xmm3 += a2 * b1;
3678  xmm4 += a2 * b2;
3679  }
3680 
3681  C.store( i , j , xmm1+xmm5 );
3682  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
3683  C.store( i+1UL, j , xmm3+xmm7 );
3684  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
3685  }
3686 
3687  if( i < iend )
3688  {
3689  const size_t kbegin( ( IsUpper_v<MT4> )
3690  ?( ( IsLower_v<MT5> )
3691  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3692  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3693  :( IsLower_v<MT5> ? j : 0UL ) );
3694  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
3695 
3696  SIMDType xmm1( C.load(i,j ) );
3697  SIMDType xmm2( C.load(i,j+SIMDSIZE) );
3698  SIMDType xmm3, xmm4;
3699  size_t k( kbegin );
3700 
3701  for( ; (k+2UL) <= kend; k+=2UL ) {
3702  const SIMDType a1( set( A(i,k ) ) );
3703  const SIMDType a2( set( A(i,k+1UL) ) );
3704  xmm1 += a1 * B.load(k ,j );
3705  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
3706  xmm3 += a2 * B.load(k+1UL,j );
3707  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
3708  }
3709 
3710  for( ; k<kend; ++k ) {
3711  const SIMDType a1( set( A(i,k) ) );
3712  xmm1 += a1 * B.load(k,j );
3713  xmm2 += a1 * B.load(k,j+SIMDSIZE);
3714  }
3715 
3716  C.store( i, j , xmm1+xmm3 );
3717  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
3718  }
3719  }
3720 
3721  for( ; j<jpos; j+=SIMDSIZE )
3722  {
3723  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
3724  size_t i( LOW ? j : 0UL );
3725 
3726  for( ; (i+4UL) <= iend; i+=4UL )
3727  {
3728  const size_t kbegin( ( IsUpper_v<MT4> )
3729  ?( ( IsLower_v<MT5> )
3730  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3731  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3732  :( IsLower_v<MT5> ? j : 0UL ) );
3733  const size_t kend( ( IsLower_v<MT4> )
3734  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
3735  :( K ) );
3736 
3737  SIMDType xmm1( C.load(i ,j) );
3738  SIMDType xmm2( C.load(i+1UL,j) );
3739  SIMDType xmm3( C.load(i+2UL,j) );
3740  SIMDType xmm4( C.load(i+3UL,j) );
3741  SIMDType xmm5, xmm6, xmm7, xmm8;
3742  size_t k( kbegin );
3743 
3744  for( ; (k+2UL) <= kend; k+=2UL ) {
3745  const SIMDType b1( B.load(k ,j) );
3746  const SIMDType b2( B.load(k+1UL,j) );
3747  xmm1 += set( A(i ,k ) ) * b1;
3748  xmm2 += set( A(i+1UL,k ) ) * b1;
3749  xmm3 += set( A(i+2UL,k ) ) * b1;
3750  xmm4 += set( A(i+3UL,k ) ) * b1;
3751  xmm5 += set( A(i ,k+1UL) ) * b2;
3752  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
3753  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
3754  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
3755  }
3756 
3757  for( ; k<kend; ++k ) {
3758  const SIMDType b1( B.load(k,j) );
3759  xmm1 += set( A(i ,k) ) * b1;
3760  xmm2 += set( A(i+1UL,k) ) * b1;
3761  xmm3 += set( A(i+2UL,k) ) * b1;
3762  xmm4 += set( A(i+3UL,k) ) * b1;
3763  }
3764 
3765  C.store( i , j, xmm1+xmm5 );
3766  C.store( i+1UL, j, xmm2+xmm6 );
3767  C.store( i+2UL, j, xmm3+xmm7 );
3768  C.store( i+3UL, j, xmm4+xmm8 );
3769  }
3770 
3771  for( ; (i+3UL) <= iend; i+=3UL )
3772  {
3773  const size_t kbegin( ( IsUpper_v<MT4> )
3774  ?( ( IsLower_v<MT5> )
3775  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3776  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3777  :( IsLower_v<MT5> ? j : 0UL ) );
3778  const size_t kend( ( IsLower_v<MT4> )
3779  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
3780  :( K ) );
3781 
3782  SIMDType xmm1( C.load(i ,j) );
3783  SIMDType xmm2( C.load(i+1UL,j) );
3784  SIMDType xmm3( C.load(i+2UL,j) );
3785  SIMDType xmm4, xmm5, xmm6;
3786  size_t k( kbegin );
3787 
3788  for( ; (k+2UL) <= kend; k+=2UL ) {
3789  const SIMDType b1( B.load(k ,j) );
3790  const SIMDType b2( B.load(k+1UL,j) );
3791  xmm1 += set( A(i ,k ) ) * b1;
3792  xmm2 += set( A(i+1UL,k ) ) * b1;
3793  xmm3 += set( A(i+2UL,k ) ) * b1;
3794  xmm4 += set( A(i ,k+1UL) ) * b2;
3795  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
3796  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
3797  }
3798 
3799  for( ; k<kend; ++k ) {
3800  const SIMDType b1( B.load(k,j) );
3801  xmm1 += set( A(i ,k) ) * b1;
3802  xmm2 += set( A(i+1UL,k) ) * b1;
3803  xmm3 += set( A(i+2UL,k) ) * b1;
3804  }
3805 
3806  C.store( i , j, xmm1+xmm4 );
3807  C.store( i+1UL, j, xmm2+xmm5 );
3808  C.store( i+2UL, j, xmm3+xmm6 );
3809  }
3810 
3811  for( ; (i+2UL) <= iend; i+=2UL )
3812  {
3813  const size_t kbegin( ( IsUpper_v<MT4> )
3814  ?( ( IsLower_v<MT5> )
3815  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3816  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3817  :( IsLower_v<MT5> ? j : 0UL ) );
3818  const size_t kend( ( IsLower_v<MT4> )
3819  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3820  :( K ) );
3821 
3822  SIMDType xmm1( C.load(i ,j) );
3823  SIMDType xmm2( C.load(i+1UL,j) );
3824  SIMDType xmm3, xmm4;
3825  size_t k( kbegin );
3826 
3827  for( ; (k+2UL) <= kend; k+=2UL ) {
3828  const SIMDType b1( B.load(k ,j) );
3829  const SIMDType b2( B.load(k+1UL,j) );
3830  xmm1 += set( A(i ,k ) ) * b1;
3831  xmm2 += set( A(i+1UL,k ) ) * b1;
3832  xmm3 += set( A(i ,k+1UL) ) * b2;
3833  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
3834  }
3835 
3836  for( ; k<kend; ++k ) {
3837  const SIMDType b1( B.load(k,j) );
3838  xmm1 += set( A(i ,k) ) * b1;
3839  xmm2 += set( A(i+1UL,k) ) * b1;
3840  }
3841 
3842  C.store( i , j, xmm1+xmm3 );
3843  C.store( i+1UL, j, xmm2+xmm4 );
3844  }
3845 
3846  if( i < iend )
3847  {
3848  const size_t kbegin( ( IsUpper_v<MT4> )
3849  ?( ( IsLower_v<MT5> )
3850  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3851  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3852  :( IsLower_v<MT5> ? j : 0UL ) );
3853 
3854  SIMDType xmm1( C.load(i,j) );
3855  SIMDType xmm2;
3856  size_t k( kbegin );
3857 
3858  for( ; (k+2UL) <= K; k+=2UL ) {
3859  xmm1 += set( A(i,k ) ) * B.load(k ,j);
3860  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
3861  }
3862 
3863  for( ; k<K; ++k ) {
3864  xmm1 += set( A(i,k) ) * B.load(k,j);
3865  }
3866 
3867  C.store( i, j, xmm1+xmm2 );
3868  }
3869  }
3870 
3871  for( ; remainder && j<N; ++j )
3872  {
3873  const size_t iend( UPP ? j+1UL : M );
3874  size_t i( LOW ? j : 0UL );
3875 
3876  for( ; (i+2UL) <= iend; i+=2UL )
3877  {
3878  const size_t kbegin( ( IsUpper_v<MT4> )
3879  ?( ( IsLower_v<MT5> )
3880  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3881  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3882  :( IsLower_v<MT5> ? j : 0UL ) );
3883  const size_t kend( ( IsLower_v<MT4> )
3884  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3885  :( K ) );
3886 
3887  ElementType value1( C(i ,j) );
3888  ElementType value2( C(i+1UL,j) );;
3889 
3890  for( size_t k=kbegin; k<kend; ++k ) {
3891  value1 += A(i ,k) * B(k,j);
3892  value2 += A(i+1UL,k) * B(k,j);
3893  }
3894 
3895  C(i ,j) = value1;
3896  C(i+1UL,j) = value2;
3897  }
3898 
3899  if( i < iend )
3900  {
3901  const size_t kbegin( ( IsUpper_v<MT4> )
3902  ?( ( IsLower_v<MT5> )
3903  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3904  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3905  :( IsLower_v<MT5> ? j : 0UL ) );
3906 
3907  ElementType value( C(i,j) );
3908 
3909  for( size_t k=kbegin; k<K; ++k ) {
3910  value += A(i,k) * B(k,j);
3911  }
3912 
3913  C(i,j) = value;
3914  }
3915  }
3916  }
3918  //**********************************************************************************************
3919 
3920  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
3935  template< typename MT3 // Type of the left-hand side target matrix
3936  , typename MT4 // Type of the left-hand side matrix operand
3937  , typename MT5 > // Type of the right-hand side matrix operand
3938  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3939  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3940  {
3941  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
3942 
3943  const size_t M( A.rows() );
3944  const size_t N( B.columns() );
3945  const size_t K( A.columns() );
3946 
3947  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3948 
3949  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3950  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3951 
3952  size_t i( 0UL );
3953 
3954  if( IsIntegral_v<ElementType> )
3955  {
3956  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3957  for( size_t j=0UL; j<N; ++j )
3958  {
3959  const size_t kbegin( ( IsLower_v<MT5> )
3960  ?( ( IsUpper_v<MT4> )
3961  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3962  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3963  :( IsUpper_v<MT4> ? i : 0UL ) );
3964  const size_t kend( ( IsUpper_v<MT5> )
3965  ?( ( IsLower_v<MT4> )
3966  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3967  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
3968  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
3969 
3970  SIMDType xmm1( C.load(i ,j) );
3971  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3972  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3973  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3974  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
3975  SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
3976  SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
3977  SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
3978 
3979  for( size_t k=kbegin; k<kend; ++k ) {
3980  const SIMDType b1( set( B(k,j) ) );
3981  xmm1 += A.load(i ,k) * b1;
3982  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3983  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3984  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3985  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3986  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
3987  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
3988  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
3989  }
3990 
3991  C.store( i , j, xmm1 );
3992  C.store( i+SIMDSIZE , j, xmm2 );
3993  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3994  C.store( i+SIMDSIZE*3UL, j, xmm4 );
3995  C.store( i+SIMDSIZE*4UL, j, xmm5 );
3996  C.store( i+SIMDSIZE*5UL, j, xmm6 );
3997  C.store( i+SIMDSIZE*6UL, j, xmm7 );
3998  C.store( i+SIMDSIZE*7UL, j, xmm8 );
3999  }
4000  }
4001  }
4002 
4003  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
4004  {
4005  size_t j( 0UL );
4006 
4007  for( ; (j+2UL) <= N; j+=2UL )
4008  {
4009  const size_t kbegin( ( IsLower_v<MT5> )
4010  ?( ( IsUpper_v<MT4> )
4011  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4012  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4013  :( IsUpper_v<MT4> ? i : 0UL ) );
4014  const size_t kend( ( IsUpper_v<MT5> )
4015  ?( ( IsLower_v<MT4> )
4016  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4017  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4018  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
4019 
4020  SIMDType xmm1 ( C.load(i ,j ) );
4021  SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
4022  SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
4023  SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
4024  SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
4025  SIMDType xmm6 ( C.load(i ,j+1UL) );
4026  SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
4027  SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
4028  SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
4029  SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
4030 
4031  for( size_t k=kbegin; k<kend; ++k ) {
4032  const SIMDType a1( A.load(i ,k) );
4033  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4034  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4035  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4036  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
4037  const SIMDType b1( set( B(k,j ) ) );
4038  const SIMDType b2( set( B(k,j+1UL) ) );
4039  xmm1 += a1 * b1;
4040  xmm2 += a2 * b1;
4041  xmm3 += a3 * b1;
4042  xmm4 += a4 * b1;
4043  xmm5 += a5 * b1;
4044  xmm6 += a1 * b2;
4045  xmm7 += a2 * b2;
4046  xmm8 += a3 * b2;
4047  xmm9 += a4 * b2;
4048  xmm10 += a5 * b2;
4049  }
4050 
4051  C.store( i , j , xmm1 );
4052  C.store( i+SIMDSIZE , j , xmm2 );
4053  C.store( i+SIMDSIZE*2UL, j , xmm3 );
4054  C.store( i+SIMDSIZE*3UL, j , xmm4 );
4055  C.store( i+SIMDSIZE*4UL, j , xmm5 );
4056  C.store( i , j+1UL, xmm6 );
4057  C.store( i+SIMDSIZE , j+1UL, xmm7 );
4058  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
4059  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
4060  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
4061  }
4062 
4063  if( j < N )
4064  {
4065  const size_t kbegin( ( IsLower_v<MT5> )
4066  ?( ( IsUpper_v<MT4> )
4067  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4068  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4069  :( IsUpper_v<MT4> ? i : 0UL ) );
4070  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
4071 
4072  SIMDType xmm1( C.load(i ,j) );
4073  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
4074  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
4075  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
4076  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
4077 
4078  for( size_t k=kbegin; k<kend; ++k ) {
4079  const SIMDType b1( set( B(k,j) ) );
4080  xmm1 += A.load(i ,k) * b1;
4081  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4082  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4083  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
4084  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
4085  }
4086 
4087  C.store( i , j, xmm1 );
4088  C.store( i+SIMDSIZE , j, xmm2 );
4089  C.store( i+SIMDSIZE*2UL, j, xmm3 );
4090  C.store( i+SIMDSIZE*3UL, j, xmm4 );
4091  C.store( i+SIMDSIZE*4UL, j, xmm5 );
4092  }
4093  }
4094 
4095  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4096  {
4097  size_t j( 0UL );
4098 
4099  for( ; (j+2UL) <= N; j+=2UL )
4100  {
4101  const size_t kbegin( ( IsLower_v<MT5> )
4102  ?( ( IsUpper_v<MT4> )
4103  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4104  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4105  :( IsUpper_v<MT4> ? i : 0UL ) );
4106  const size_t kend( ( IsUpper_v<MT5> )
4107  ?( ( IsLower_v<MT4> )
4108  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4109  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4110  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
4111 
4112  SIMDType xmm1( C.load(i ,j ) );
4113  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
4114  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
4115  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
4116  SIMDType xmm5( C.load(i ,j+1UL) );
4117  SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
4118  SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
4119  SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
4120 
4121  for( size_t k=kbegin; k<kend; ++k ) {
4122  const SIMDType a1( A.load(i ,k) );
4123  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4124  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4125  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4126  const SIMDType b1( set( B(k,j ) ) );
4127  const SIMDType b2( set( B(k,j+1UL) ) );
4128  xmm1 += a1 * b1;
4129  xmm2 += a2 * b1;
4130  xmm3 += a3 * b1;
4131  xmm4 += a4 * b1;
4132  xmm5 += a1 * b2;
4133  xmm6 += a2 * b2;
4134  xmm7 += a3 * b2;
4135  xmm8 += a4 * b2;
4136  }
4137 
4138  C.store( i , j , xmm1 );
4139  C.store( i+SIMDSIZE , j , xmm2 );
4140  C.store( i+SIMDSIZE*2UL, j , xmm3 );
4141  C.store( i+SIMDSIZE*3UL, j , xmm4 );
4142  C.store( i , j+1UL, xmm5 );
4143  C.store( i+SIMDSIZE , j+1UL, xmm6 );
4144  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
4145  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
4146  }
4147 
4148  if( j < N )
4149  {
4150  const size_t kbegin( ( IsLower_v<MT5> )
4151  ?( ( IsUpper_v<MT4> )
4152  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4153  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4154  :( IsUpper_v<MT4> ? i : 0UL ) );
4155  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
4156 
4157  SIMDType xmm1( C.load(i ,j) );
4158  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
4159  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
4160  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
4161 
4162  for( size_t k=kbegin; k<kend; ++k ) {
4163  const SIMDType b1( set( B(k,j) ) );
4164  xmm1 += A.load(i ,k) * b1;
4165  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4166  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4167  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
4168  }
4169 
4170  C.store( i , j, xmm1 );
4171  C.store( i+SIMDSIZE , j, xmm2 );
4172  C.store( i+SIMDSIZE*2UL, j, xmm3 );
4173  C.store( i+SIMDSIZE*3UL, j, xmm4 );
4174  }
4175  }
4176 
4177  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4178  {
4179  size_t j( 0UL );
4180 
4181  for( ; (j+2UL) <= N; j+=2UL )
4182  {
4183  const size_t kbegin( ( IsLower_v<MT5> )
4184  ?( ( IsUpper_v<MT4> )
4185  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4186  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4187  :( IsUpper_v<MT4> ? i : 0UL ) );
4188  const size_t kend( ( IsUpper_v<MT5> )
4189  ?( ( IsLower_v<MT4> )
4190  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4191  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4192  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
4193 
4194  SIMDType xmm1( C.load(i ,j ) );
4195  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
4196  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
4197  SIMDType xmm4( C.load(i ,j+1UL) );
4198  SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
4199  SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
4200 
4201  for( size_t k=kbegin; k<kend; ++k ) {
4202  const SIMDType a1( A.load(i ,k) );
4203  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4204  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4205  const SIMDType b1( set( B(k,j ) ) );
4206  const SIMDType b2( set( B(k,j+1UL) ) );
4207  xmm1 += a1 * b1;
4208  xmm2 += a2 * b1;
4209  xmm3 += a3 * b1;
4210  xmm4 += a1 * b2;
4211  xmm5 += a2 * b2;
4212  xmm6 += a3 * b2;
4213  }
4214 
4215  C.store( i , j , xmm1 );
4216  C.store( i+SIMDSIZE , j , xmm2 );
4217  C.store( i+SIMDSIZE*2UL, j , xmm3 );
4218  C.store( i , j+1UL, xmm4 );
4219  C.store( i+SIMDSIZE , j+1UL, xmm5 );
4220  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
4221  }
4222 
4223  if( j < N )
4224  {
4225  const size_t kbegin( ( IsLower_v<MT5> )
4226  ?( ( IsUpper_v<MT4> )
4227  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4228  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4229  :( IsUpper_v<MT4> ? i : 0UL ) );
4230  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
4231 
4232  SIMDType xmm1( C.load(i ,j) );
4233  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
4234  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
4235 
4236  for( size_t k=kbegin; k<kend; ++k ) {
4237  const SIMDType b1( set( B(k,j) ) );
4238  xmm1 += A.load(i ,k) * b1;
4239  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4240  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4241  }
4242 
4243  C.store( i , j, xmm1 );
4244  C.store( i+SIMDSIZE , j, xmm2 );
4245  C.store( i+SIMDSIZE*2UL, j, xmm3 );
4246  }
4247  }
4248 
4249  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4250  {
4251  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
4252  size_t j( UPP ? i : 0UL );
4253 
4254  for( ; (j+4UL) <= jend; j+=4UL )
4255  {
4256  const size_t kbegin( ( IsLower_v<MT5> )
4257  ?( ( IsUpper_v<MT4> )
4258  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4259  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4260  :( IsUpper_v<MT4> ? i : 0UL ) );
4261  const size_t kend( ( IsUpper_v<MT5> )
4262  ?( ( IsLower_v<MT4> )
4263  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
4264  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
4265  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
4266 
4267  SIMDType xmm1( C.load(i ,j ) );
4268  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4269  SIMDType xmm3( C.load(i ,j+1UL) );
4270  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4271  SIMDType xmm5( C.load(i ,j+2UL) );
4272  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
4273  SIMDType xmm7( C.load(i ,j+3UL) );
4274  SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
4275 
4276  for( size_t k=kbegin; k<kend; ++k ) {
4277  const SIMDType a1( A.load(i ,k) );
4278  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4279  const SIMDType b1( set( B(k,j ) ) );
4280  const SIMDType b2( set( B(k,j+1UL) ) );
4281  const SIMDType b3( set( B(k,j+2UL) ) );
4282  const SIMDType b4( set( B(k,j+3UL) ) );
4283  xmm1 += a1 * b1;
4284  xmm2 += a2 * b1;
4285  xmm3 += a1 * b2;
4286  xmm4 += a2 * b2;
4287  xmm5 += a1 * b3;
4288  xmm6 += a2 * b3;
4289  xmm7 += a1 * b4;
4290  xmm8 += a2 * b4;
4291  }
4292 
4293  C.store( i , j , xmm1 );
4294  C.store( i+SIMDSIZE, j , xmm2 );
4295  C.store( i , j+1UL, xmm3 );
4296  C.store( i+SIMDSIZE, j+1UL, xmm4 );
4297  C.store( i , j+2UL, xmm5 );
4298  C.store( i+SIMDSIZE, j+2UL, xmm6 );
4299  C.store( i , j+3UL, xmm7 );
4300  C.store( i+SIMDSIZE, j+3UL, xmm8 );
4301  }
4302 
4303  for( ; (j+3UL) <= jend; j+=3UL )
4304  {
4305  const size_t kbegin( ( IsLower_v<MT5> )
4306  ?( ( IsUpper_v<MT4> )
4307  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4308  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4309  :( IsUpper_v<MT4> ? i : 0UL ) );
4310  const size_t kend( ( IsUpper_v<MT5> )
4311  ?( ( IsLower_v<MT4> )
4312  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
4313  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
4314  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
4315 
4316  SIMDType xmm1( C.load(i ,j ) );
4317  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4318  SIMDType xmm3( C.load(i ,j+1UL) );
4319  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4320  SIMDType xmm5( C.load(i ,j+2UL) );
4321  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
4322 
4323  for( size_t k=kbegin; k<kend; ++k ) {
4324  const SIMDType a1( A.load(i ,k) );
4325  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4326  const SIMDType b1( set( B(k,j ) ) );
4327  const SIMDType b2( set( B(k,j+1UL) ) );
4328  const SIMDType b3( set( B(k,j+2UL) ) );
4329  xmm1 += a1 * b1;
4330  xmm2 += a2 * b1;
4331  xmm3 += a1 * b2;
4332  xmm4 += a2 * b2;
4333  xmm5 += a1 * b3;
4334  xmm6 += a2 * b3;
4335  }
4336 
4337  C.store( i , j , xmm1 );
4338  C.store( i+SIMDSIZE, j , xmm2 );
4339  C.store( i , j+1UL, xmm3 );
4340  C.store( i+SIMDSIZE, j+1UL, xmm4 );
4341  C.store( i , j+2UL, xmm5 );
4342  C.store( i+SIMDSIZE, j+2UL, xmm6 );
4343  }
4344 
4345  for( ; (j+2UL) <= jend; j+=2UL )
4346  {
4347  const size_t kbegin( ( IsLower_v<MT5> )
4348  ?( ( IsUpper_v<MT4> )
4349  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4350  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4351  :( IsUpper_v<MT4> ? i : 0UL ) );
4352  const size_t kend( ( IsUpper_v<MT5> )
4353  ?( ( IsLower_v<MT4> )
4354  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4355  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4356  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
4357 
4358  SIMDType xmm1( C.load(i ,j ) );
4359  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4360  SIMDType xmm3( C.load(i ,j+1UL) );
4361  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4362  SIMDType xmm5, xmm6, xmm7, xmm8;
4363  size_t k( kbegin );
4364 
4365  for( ; (k+2UL) < kend; k+=2UL ) {
4366  const SIMDType a1( A.load(i ,k ) );
4367  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
4368  const SIMDType a3( A.load(i ,k+1UL) );
4369  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
4370  const SIMDType b1( set( B(k ,j ) ) );
4371  const SIMDType b2( set( B(k ,j+1UL) ) );
4372  const SIMDType b3( set( B(k+1UL,j ) ) );
4373  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
4374  xmm1 += a1 * b1;
4375  xmm2 += a2 * b1;
4376  xmm3 += a1 * b2;
4377  xmm4 += a2 * b2;
4378  xmm5 += a3 * b3;
4379  xmm6 += a4 * b3;
4380  xmm7 += a3 * b4;
4381  xmm8 += a4 * b4;
4382  }
4383 
4384  for( ; k<kend; ++k ) {
4385  const SIMDType a1( A.load(i ,k) );
4386  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4387  const SIMDType b1( set( B(k,j ) ) );
4388  const SIMDType b2( set( B(k,j+1UL) ) );
4389  xmm1 += a1 * b1;
4390  xmm2 += a2 * b1;
4391  xmm3 += a1 * b2;
4392  xmm4 += a2 * b2;
4393  }
4394 
4395  C.store( i , j , xmm1+xmm5 );
4396  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
4397  C.store( i , j+1UL, xmm3+xmm7 );
4398  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
4399  }
4400 
4401  if( j < jend )
4402  {
4403  const size_t kbegin( ( IsLower_v<MT5> )
4404  ?( ( IsUpper_v<MT4> )
4405  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4406  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4407  :( IsUpper_v<MT4> ? i : 0UL ) );
4408  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
4409 
4410  SIMDType xmm1( C.load(i ,j) );
4411  SIMDType xmm2( C.load(i+SIMDSIZE,j) );
4412  SIMDType xmm3, xmm4;
4413  size_t k( kbegin );
4414 
4415  for( ; (k+2UL) <= kend; k+=2UL ) {
4416  const SIMDType b1( set( B(k ,j) ) );
4417  const SIMDType b2( set( B(k+1UL,j) ) );
4418  xmm1 += A.load(i ,k ) * b1;
4419  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
4420  xmm3 += A.load(i ,k+1UL) * b2;
4421  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
4422  }
4423 
4424  for( ; k<kend; ++k ) {
4425  const SIMDType b1( set( B(k,j) ) );
4426  xmm1 += A.load(i ,k) * b1;
4427  xmm2 += A.load(i+SIMDSIZE,k) * b1;
4428  }
4429 
4430  C.store( i , j, xmm1+xmm3 );
4431  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
4432  }
4433  }
4434 
4435  for( ; i<ipos; i+=SIMDSIZE )
4436  {
4437  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
4438  size_t j( UPP ? i : 0UL );
4439 
4440  for( ; (j+4UL) <= jend; j+=4UL )
4441  {
4442  const size_t kbegin( ( IsLower_v<MT5> )
4443  ?( ( IsUpper_v<MT4> )
4444  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4445  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4446  :( IsUpper_v<MT4> ? i : 0UL ) );
4447  const size_t kend( ( IsUpper_v<MT5> )
4448  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
4449  :( K ) );
4450 
4451  SIMDType xmm1( C.load(i,j ) );
4452  SIMDType xmm2( C.load(i,j+1UL) );
4453  SIMDType xmm3( C.load(i,j+2UL) );
4454  SIMDType xmm4( C.load(i,j+3UL) );
4455  SIMDType xmm5, xmm6, xmm7, xmm8;
4456  size_t k( kbegin );
4457 
4458  for( ; (k+2UL) <= kend; k+=2UL ) {
4459  const SIMDType a1( A.load(i,k ) );
4460  const SIMDType a2( A.load(i,k+1UL) );
4461  xmm1 += a1 * set( B(k ,j ) );
4462  xmm2 += a1 * set( B(k ,j+1UL) );
4463  xmm3 += a1 * set( B(k ,j+2UL) );
4464  xmm4 += a1 * set( B(k ,j+3UL) );
4465  xmm5 += a2 * set( B(k+1UL,j ) );
4466  xmm6 += a2 * set( B(k+1UL,j+1UL) );
4467  xmm7 += a2 * set( B(k+1UL,j+2UL) );
4468  xmm8 += a2 * set( B(k+1UL,j+3UL) );
4469  }
4470 
4471  for( ; k<kend; ++k ) {
4472  const SIMDType a1( A.load(i,k) );
4473  xmm1 += a1 * set( B(k,j ) );
4474  xmm2 += a1 * set( B(k,j+1UL) );
4475  xmm3 += a1 * set( B(k,j+2UL) );
4476  xmm4 += a1 * set( B(k,j+3UL) );
4477  }
4478 
4479  C.store( i, j , xmm1+xmm5 );
4480  C.store( i, j+1UL, xmm2+xmm6 );
4481  C.store( i, j+2UL, xmm3+xmm7 );
4482  C.store( i, j+3UL, xmm4+xmm8 );
4483  }
4484 
4485  for( ; (j+3UL) <= jend; j+=3UL )
4486  {
4487  const size_t kbegin( ( IsLower_v<MT5> )
4488  ?( ( IsUpper_v<MT4> )
4489  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4490  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4491  :( IsUpper_v<MT4> ? i : 0UL ) );
4492  const size_t kend( ( IsUpper_v<MT5> )
4493  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
4494  :( K ) );
4495 
4496  SIMDType xmm1( C.load(i,j ) );
4497  SIMDType xmm2( C.load(i,j+1UL) );
4498  SIMDType xmm3( C.load(i,j+2UL) );
4499  SIMDType xmm4, xmm5, xmm6;
4500  size_t k( kbegin );
4501 
4502  for( ; (k+2UL) <= kend; k+=2UL ) {
4503  const SIMDType a1( A.load(i,k ) );
4504  const SIMDType a2( A.load(i,k+1UL) );
4505  xmm1 += a1 * set( B(k ,j ) );
4506  xmm2 += a1 * set( B(k ,j+1UL) );
4507  xmm3 += a1 * set( B(k ,j+2UL) );
4508  xmm4 += a2 * set( B(k+1UL,j ) );
4509  xmm5 += a2 * set( B(k+1UL,j+1UL) );
4510  xmm6 += a2 * set( B(k+1UL,j+2UL) );
4511  }
4512 
4513  for( ; k<kend; ++k ) {
4514  const SIMDType a1( A.load(i,k) );
4515  xmm1 += a1 * set( B(k,j ) );
4516  xmm2 += a1 * set( B(k,j+1UL) );
4517  xmm3 += a1 * set( B(k,j+2UL) );
4518  }
4519 
4520  C.store( i, j , xmm1+xmm4 );
4521  C.store( i, j+1UL, xmm2+xmm5 );
4522  C.store( i, j+2UL, xmm3+xmm6 );
4523  }
4524 
4525  for( ; (j+2UL) <= jend; j+=2UL )
4526  {
4527  const size_t kbegin( ( IsLower_v<MT5> )
4528  ?( ( IsUpper_v<MT4> )
4529  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4530  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4531  :( IsUpper_v<MT4> ? i : 0UL ) );
4532  const size_t kend( ( IsUpper_v<MT5> )
4533  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4534  :( K ) );
4535 
4536  SIMDType xmm1( C.load(i,j ) );
4537  SIMDType xmm2( C.load(i,j+1UL) );
4538  SIMDType xmm3, xmm4;
4539  size_t k( kbegin );
4540 
4541  for( ; (k+2UL) <= kend; k+=2UL ) {
4542  const SIMDType a1( A.load(i,k ) );
4543  const SIMDType a2( A.load(i,k+1UL) );
4544  xmm1 += a1 * set( B(k ,j ) );
4545  xmm2 += a1 * set( B(k ,j+1UL) );
4546  xmm3 += a2 * set( B(k+1UL,j ) );
4547  xmm4 += a2 * set( B(k+1UL,j+1UL) );
4548  }
4549 
4550  for( ; k<kend; ++k ) {
4551  const SIMDType a1( A.load(i,k) );
4552  xmm1 += a1 * set( B(k,j ) );
4553  xmm2 += a1 * set( B(k,j+1UL) );
4554  }
4555 
4556  C.store( i, j , xmm1+xmm3 );
4557  C.store( i, j+1UL, xmm2+xmm4 );
4558  }
4559 
4560  if( j < jend )
4561  {
4562  const size_t kbegin( ( IsLower_v<MT5> )
4563  ?( ( IsUpper_v<MT4> )
4564  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4565  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4566  :( IsUpper_v<MT4> ? i : 0UL ) );
4567 
4568  SIMDType xmm1( C.load(i,j) );
4569  SIMDType xmm2;
4570  size_t k( kbegin );
4571 
4572  for( ; (k+2UL) <= K; k+=2UL ) {
4573  xmm1 += A.load(i,k ) * set( B(k ,j) );
4574  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
4575  }
4576 
4577  for( ; k<K; ++k ) {
4578  xmm1 += A.load(i,k) * set( B(k,j) );
4579  }
4580 
4581  C.store( i, j, xmm1+xmm2 );
4582  }
4583  }
4584 
4585  for( ; remainder && i<M; ++i )
4586  {
4587  const size_t jend( LOW ? i+1UL : N );
4588  size_t j( UPP ? i : 0UL );
4589 
4590  for( ; (j+2UL) <= jend; j+=2UL )
4591  {
4592  const size_t kbegin( ( IsLower_v<MT5> )
4593  ?( ( IsUpper_v<MT4> )
4594  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4595  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4596  :( IsUpper_v<MT4> ? i : 0UL ) );
4597  const size_t kend( ( IsUpper_v<MT5> )
4598  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4599  :( K ) );
4600 
4601  ElementType value1( C(i,j ) );
4602  ElementType value2( C(i,j+1UL) );
4603 
4604  for( size_t k=kbegin; k<kend; ++k ) {
4605  value1 += A(i,k) * B(k,j );
4606  value2 += A(i,k) * B(k,j+1UL);
4607  }
4608 
4609  C(i,j ) = value1;
4610  C(i,j+1UL) = value2;
4611  }
4612 
4613  if( j < jend )
4614  {
4615  const size_t kbegin( ( IsLower_v<MT5> )
4616  ?( ( IsUpper_v<MT4> )
4617  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4618  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4619  :( IsUpper_v<MT4> ? i : 0UL ) );
4620 
4621  ElementType value( C(i,j) );
4622 
4623  for( size_t k=kbegin; k<K; ++k ) {
4624  value += A(i,k) * B(k,j);
4625  }
4626 
4627  C(i,j) = value;
4628  }
4629  }
4630  }
4632  //**********************************************************************************************
4633 
4634  //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix addition assignment kernel, non-vectorized overload.
//
// This overload takes part in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t). It performs no work of
// its own and simply forwards the three arguments to selectDefaultAddAssignKernel().
//
// C: target matrix of the addition assignment (modified).
// A: left-hand side matrix operand (read-only).
// B: right-hand side matrix operand (read-only).
4648  template< typename MT3 // Type of the left-hand side target matrix
4649  , typename MT4 // Type of the left-hand side matrix operand
4650  , typename MT5 > // Type of the right-hand side matrix operand
4651  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4652  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4653  {
4654  selectDefaultAddAssignKernel( C, A, B );
4655  }
4657  //**********************************************************************************************
4658 
4659  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Large-matrix addition assignment kernel, vectorized overload.
//
// This overload takes part in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true (EnableIf_t). It dispatches to one of
// the three matrix-matrix multiplication routines lmmm(), ummm() or mmm(), depending on the
// class-level flags LOW and UPP (both defined outside this block). LOW takes precedence over
// UPP.
//
// NOTE(review): the two trailing ElementType(1) arguments are presumably the scaling factors
// for the product A*B and for the previous content of C (i.e. C = 1*A*B + 1*C) -- confirm
// against the declarations in <blaze/math/dense/MMM.h>.
//
// C: target matrix of the addition assignment (modified).
// A: left-hand side matrix operand (read-only).
// B: right-hand side matrix operand (read-only).
4674  template< typename MT3 // Type of the left-hand side target matrix
4675  , typename MT4 // Type of the left-hand side matrix operand
4676  , typename MT5 > // Type of the right-hand side matrix operand
4677  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4678  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4679  {
4680  if( LOW )
4681  lmmm( C, A, B, ElementType(1), ElementType(1) );
4682  else if( UPP )
4683  ummm( C, A, B, ElementType(1), ElementType(1) );
4684  else
4685  mmm( C, A, B, ElementType(1), ElementType(1) );
4686  }
4688  //**********************************************************************************************
4689 
4690  //**BLAS-based addition assignment to dense matrices (default)**********************************
// BLAS addition assignment kernel, fallback overload.
//
// This overload takes part in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5> is
// false (DisableIf_t). No BLAS routine is called; the arguments are forwarded unchanged to
// selectLargeAddAssignKernel().
//
// C: target matrix of the addition assignment (modified).
// A: left-hand side matrix operand (read-only).
// B: right-hand side matrix operand (read-only).
4704  template< typename MT3 // Type of the left-hand side target matrix
4705  , typename MT4 // Type of the left-hand side matrix operand
4706  , typename MT5 > // Type of the right-hand side matrix operand
4707  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4708  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4709  {
4710  selectLargeAddAssignKernel( C, A, B );
4711  }
4713  //**********************************************************************************************
4714 
4715  //**BLAS-based addition assignment to dense matrices********************************************
4716 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4717 
// BLAS addition assignment kernel, BLAS-backed overload.
//
// This overload takes part in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5> is
// true (EnableIf_t). Three cases are distinguished:
//  - A is triangular: B is copied into a temporary of type ResultType_t<MT3>, trmm() is
//    applied to the temporary with A on the left side, and the temporary is added to C.
//  - otherwise, B is triangular: A is copied into a temporary, trmm() is applied with B on
//    the right side, and the temporary is added to C.
//  - otherwise: a single gemm() call operates directly on C.
// The case of A being triangular is tested first, so it wins if both operands are triangular.
//
// NOTE(review): trmm() presumably overwrites 'tmp' with the triangular product scaled by
// ET(1), and gemm( C, A, B, ET(1), ET(1) ) presumably computes C = 1*A*B + 1*C -- confirm
// against <blaze/math/blas/trmm.h> and <blaze/math/blas/gemm.h>.
//
// C: target matrix of the addition assignment (modified).
// A: left-hand side matrix operand (read-only).
// B: right-hand side matrix operand (read-only).
4730  template< typename MT3 // Type of the left-hand side target matrix
4731  , typename MT4 // Type of the left-hand side matrix operand
4732  , typename MT5 > // Type of the right-hand side matrix operand
4733  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4734  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4735  {
4736  using ET = ElementType_t<MT3>;
4737 
      // Triangular left operand: multiply a copy of B from the left, then accumulate into C.
4738  if( IsTriangular_v<MT4> ) {
4739  ResultType_t<MT3> tmp( serial( B ) );
4740  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4741  addAssign( C, tmp );
4742  }
      // Triangular right operand: multiply a copy of A from the right, then accumulate into C.
4743  else if( IsTriangular_v<MT5> ) {
4744  ResultType_t<MT3> tmp( serial( A ) );
4745  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4746  addAssign( C, tmp );
4747  }
      // No triangular operand: general matrix-matrix multiplication directly on C.
4748  else {
4749  gemm( C, A, B, ET(1), ET(1) );
4750  }
4751  }
4753 #endif
4754  //**********************************************************************************************
4755 
4756  //**Addition assignment to sparse matrices******************************************************
4757  // No special implementation for the addition assignment to sparse matrices.
4758  //**********************************************************************************************
4759 
4760  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of the multiplication expression 'rhs' to the dense matrix 'lhs'.
//
// Steps performed:
//  1. Assert (internal assertion) that the dimensions of 'lhs' and 'rhs' agree.
//  2. Return immediately if the target has no rows or no columns, or if the left-hand side
//     operand of the expression has no columns (empty inner dimension).
//  3. Evaluate both operands of the expression via serial() into objects of type LT and RT
//     (both types are declared outside this block).
//  4. Assert the dimensions of the evaluated operands and hand over to
//     selectSubAssignKernel().
//
// lhs: target dense matrix (modified).
// rhs: multiplication expression to be subtracted (read-only).
4773  template< typename MT // Type of the target dense matrix
4774  , bool SO > // Storage order of the target dense matrix
4775  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
4776  {
4778 
4779  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4780  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4781 
      // Nothing to subtract if the result is empty or the inner dimension is zero.
4782  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4783  return;
4784  }
4785 
4786  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
4787  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
4788 
4789  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4790  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4791  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4792  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4793  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4794  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4795 
4796  TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
4797  }
4799  //**********************************************************************************************
4800 
4801  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Kernel selection for the subtraction assignment to dense matrices.
//
// selectSmallSubAssignKernel() is chosen if at least one of the following holds:
//  - both operands are diagonal matrices,
//  - BLAZE_DEBUG_MODE is off, C is row-major and B has at most SIMDSIZE*10 columns,
//  - BLAZE_DEBUG_MODE is off, C is column-major and A has at most SIMDSIZE*10 rows,
//  - the number of elements of C (rows * columns) is below TDMATDMATMULT_THRESHOLD.
// In all other cases selectBlasSubAssignKernel() is chosen.
//
// C: target matrix of the subtraction assignment (modified by the selected kernel).
// A: left-hand side matrix operand (read-only).
// B: right-hand side matrix operand (read-only).
4812  template< typename MT3 // Type of the left-hand side target matrix
4813  , typename MT4 // Type of the left-hand side matrix operand
4814  , typename MT5 > // Type of the right-hand side matrix operand
4815  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4816  {
4817  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
4818  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
4819  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
4820  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
4821  selectSmallSubAssignKernel( C, A, B );
4822  else
4823  selectBlasSubAssignKernel( C, A, B );
4824  }
4826  //**********************************************************************************************
4827 
4828  //**Default subtraction assignment to row-major dense matrices (general/general)****************
// Default (scalar) subtraction assignment kernel for a row-major target where neither operand
// is diagonal. Performs C(i,j) -= A(i,k) * B(k,j) in i-k-j loop order.
//
// The k and j ranges are narrowed at compile time so that elements known to be zero are
// skipped:
//  - the k range of row i follows the (strictly) upper/lower structure of A,
//  - the j range for a given k follows the (strictly) upper/lower structure of B,
//  - if the class-level flag UPP is set, only columns j >= i of row i are updated; if LOW is
//    set, only columns j <= i are updated (LOW and UPP are defined outside this block).
//
// C: target matrix of the subtraction assignment (modified).
// A: left-hand side matrix operand (read-only).
// B: right-hand side matrix operand (read-only).
4842  template< typename MT3 // Type of the left-hand side target matrix
4843  , typename MT4 // Type of the left-hand side matrix operand
4844  , typename MT5 > // Type of the right-hand side matrix operand
4845  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4846  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4847  {
4848  const size_t M( A.rows() );
4849  const size_t N( B.columns() );
4850  const size_t K( A.columns() );
4851 
      // A result restricted by LOW or UPP is required to be square.
4852  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4853 
4854  for( size_t i=0UL; i<M; ++i )
4855  {
      // Range of k for which A(i,k) is not structurally zero.
4856  const size_t kbegin( ( IsUpper_v<MT4> )
4857  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4858  :( 0UL ) );
4859  const size_t kend( ( IsLower_v<MT4> )
4860  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
4861  :( K ) );
4862  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4863 
4864  for( size_t k=kbegin; k<kend; ++k )
4865  {
      // Range of j for which B(k,j) is not structurally zero, intersected with the part
      // of row i that has to be updated according to LOW/UPP.
4866  const size_t jbegin( ( IsUpper_v<MT5> )
4867  ?( ( IsStrictlyUpper_v<MT5> )
4868  ?( UPP ? max(i,k+1UL) : k+1UL )
4869  :( UPP ? max(i,k) : k ) )
4870  :( UPP ? i : 0UL ) );
4871  const size_t jend( ( IsLower_v<MT5> )
4872  ?( ( IsStrictlyLower_v<MT5> )
4873  ?( LOW ? min(i+1UL,k) : k )
4874  :( LOW ? min(i,k)+1UL : k+1UL ) )
4875  :( LOW ? i+1UL : N ) );
4876 
      // With LOW/UPP the intersected range may be empty (or inverted); skip it before the
      // unsigned subtraction below.
4877  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
4878  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4879 
      // jnum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even
      // number; jpos marks the end of the part handled by the two-way unrolled loop.
4880  const size_t jnum( jend - jbegin );
4881  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4882 
4883  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4884  C(i,j ) -= A(i,k) * B(k,j );
4885  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
4886  }
      // At most one remaining element if the count was odd.
4887  if( jpos < jend ) {
4888  C(i,jpos) -= A(i,k) * B(k,jpos);
4889  }
4890  }
4891  }
4892  }
4894  //**********************************************************************************************
4895 
4896  //**Default subtraction assignment to column-major dense matrices (general/general)*************
// Default (scalar) subtraction assignment kernel for a column-major target where neither
// operand is diagonal. Performs C(i,j) -= A(i,k) * B(k,j) in j-k-i loop order.
//
// The k and i ranges are narrowed at compile time so that elements known to be zero are
// skipped:
//  - the k range of column j follows the (strictly) lower/upper structure of B,
//  - the i range for a given k follows the (strictly) lower/upper structure of A,
//  - if the class-level flag LOW is set, only rows i >= j of column j are updated; if UPP is
//    set, only rows i <= j are updated (LOW and UPP are defined outside this block).
//
// C: target matrix of the subtraction assignment (modified).
// A: left-hand side matrix operand (read-only).
// B: right-hand side matrix operand (read-only).
4910  template< typename MT3 // Type of the left-hand side target matrix
4911  , typename MT4 // Type of the left-hand side matrix operand
4912  , typename MT5 > // Type of the right-hand side matrix operand
4913  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4914  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4915  {
4916  const size_t M( A.rows() );
4917  const size_t N( B.columns() );
4918  const size_t K( A.columns() );
4919 
      // A result restricted by LOW or UPP is required to be square.
4920  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4921 
4922  for( size_t j=0UL; j<N; ++j )
4923  {
      // Range of k for which B(k,j) is not structurally zero.
4924  const size_t kbegin( ( IsLower_v<MT5> )
4925  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4926  :( 0UL ) );
4927  const size_t kend( ( IsUpper_v<MT5> )
4928  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
4929  :( K ) );
4930  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4931 
4932  for( size_t k=kbegin; k<kend; ++k )
4933  {
      // Range of i for which A(i,k) is not structurally zero, intersected with the part
      // of column j that has to be updated according to LOW/UPP.
4934  const size_t ibegin( ( IsLower_v<MT4> )
4935  ?( ( IsStrictlyLower_v<MT4> )
4936  ?( LOW ? max(j,k+1UL) : k+1UL )
4937  :( LOW ? max(j,k) : k ) )
4938  :( LOW ? j : 0UL ) );
4939  const size_t iend( ( IsUpper_v<MT4> )
4940  ?( ( IsStrictlyUpper_v<MT4> )
4941  ?( UPP ? min(j+1UL,k) : k )
4942  :( UPP ? min(j,k)+1UL : k+1UL ) )
4943  :( UPP ? j+1UL : M ) );
4944 
      // With LOW/UPP the intersected range may be empty (or inverted); skip it before the
      // unsigned subtraction below.
4945  if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
4946  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4947 
      // inum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even
      // number; ipos marks the end of the part handled by the two-way unrolled loop.
4948  const size_t inum( iend - ibegin );
4949  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4950 
4951  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4952  C(i ,j) -= A(i ,k) * B(k,j);
4953  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
4954  }
      // At most one remaining element if the count was odd.
4955  if( ipos < iend ) {
4956  C(ipos,j) -= A(ipos,k) * B(k,j);
4957  }
4958  }
4959  }
4960  }
4962  //**********************************************************************************************
4963 
4964  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
// Default (scalar) subtraction assignment kernel for a row-major target where the right-hand
// side operand B is diagonal and the left-hand side operand A is not.
//
// Only the diagonal element B(j,j) is read for column j, so the update is
// C(i,j) -= A(i,j) * B(j,j). The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE
// elements. Within a tile, the j range of row i is narrowed to the (strictly) upper/lower
// structure of A.
//
// NOTE(review): the tiling is presumably a cache optimization for the differing storage
// orders of C and A -- confirm.
//
// C: target matrix of the subtraction assignment (modified).
// A: left-hand side matrix operand (read-only).
// B: right-hand side diagonal matrix operand (read-only).
4978  template< typename MT3 // Type of the left-hand side target matrix
4979  , typename MT4 // Type of the left-hand side matrix operand
4980  , typename MT5 > // Type of the right-hand side matrix operand
4981  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4982  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4983  {
4984  constexpr size_t block( BLOCK_SIZE );
4985 
4986  const size_t M( A.rows() );
4987  const size_t N( B.columns() );
4988 
      // Outer two loops iterate over the tiles, clamping the tile ends to M and N.
4989  for( size_t ii=0UL; ii<M; ii+=block ) {
4990  const size_t iend( min( M, ii+block ) );
4991  for( size_t jj=0UL; jj<N; jj+=block ) {
4992  const size_t jend( min( N, jj+block ) );
4993  for( size_t i=ii; i<iend; ++i )
4994  {
      // Columns of the current tile for which A(i,j) is not structurally zero.
4995  const size_t jbegin( ( IsUpper_v<MT4> )
4996  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
4997  :( jj ) );
4998  const size_t jpos( ( IsLower_v<MT4> )
4999  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
5000  :( jend ) );
5001 
5002  for( size_t j=jbegin; j<jpos; ++j ) {
5003  C(i,j) -= A(i,j) * B(j,j);
5004  }
5005  }
5006  }
5007  }
5008  }
5010  //**********************************************************************************************
5011 
5012  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
// Default (scalar) subtraction assignment kernel for a column-major target where the
// right-hand side operand B is diagonal and the left-hand side operand A is not.
//
// Only the diagonal element B(j,j) is read for column j, so the update is
// C(i,j) -= A(i,j) * B(j,j). The i range of column j is narrowed to the (strictly)
// lower/upper structure of A, and the inner loop is unrolled two-way with a single-element
// remainder.
//
// C: target matrix of the subtraction assignment (modified).
// A: left-hand side matrix operand (read-only).
// B: right-hand side diagonal matrix operand (read-only).
5026  template< typename MT3 // Type of the left-hand side target matrix
5027  , typename MT4 // Type of the left-hand side matrix operand
5028  , typename MT5 > // Type of the right-hand side matrix operand
5029  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5030  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5031  {
5032  const size_t M( A.rows() );
5033  const size_t N( B.columns() );
5034 
5035  for( size_t j=0UL; j<N; ++j )
5036  {
      // Rows of column j for which A(i,j) is not structurally zero.
5037  const size_t ibegin( ( IsLower_v<MT4> )
5038  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
5039  :( 0UL ) );
5040  const size_t iend( ( IsUpper_v<MT4> )
5041  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
5042  :( M ) );
5043  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5044 
      // inum & size_t(-2) rounds the count down to an even number for the unrolled loop.
5045  const size_t inum( iend - ibegin );
5046  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
5047 
5048  for( size_t i=ibegin; i<ipos; i+=2UL ) {
5049  C(i ,j) -= A(i ,j) * B(j,j);
5050  C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
5051  }
      // At most one remaining element if the count was odd.
5052  if( ipos < iend ) {
5053  C(ipos,j) -= A(ipos,j) * B(j,j);
5054  }
5055  }
5056  }
5058  //**********************************************************************************************
5059 
5060  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
// Default (scalar) subtraction assignment kernel for a row-major target where the left-hand
// side operand A is diagonal and the right-hand side operand B is not.
//
// Only the diagonal element A(i,i) is read for row i, so the update is
// C(i,j) -= A(i,i) * B(i,j). The j range of row i is narrowed to the (strictly) upper/lower
// structure of B, and the inner loop is unrolled two-way with a single-element remainder.
//
// C: target matrix of the subtraction assignment (modified).
// A: left-hand side diagonal matrix operand (read-only).
// B: right-hand side matrix operand (read-only).
5074  template< typename MT3 // Type of the left-hand side target matrix
5075  , typename MT4 // Type of the left-hand side matrix operand
5076  , typename MT5 > // Type of the right-hand side matrix operand
5077  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5078  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5079  {
5080  const size_t M( A.rows() );
5081  const size_t N( B.columns() );
5082 
5083  for( size_t i=0UL; i<M; ++i )
5084  {
      // Columns of row i for which B(i,j) is not structurally zero.
5085  const size_t jbegin( ( IsUpper_v<MT5> )
5086  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
5087  :( 0UL ) );
5088  const size_t jend( ( IsLower_v<MT5> )
5089  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
5090  :( N ) );
5091  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5092 
      // jnum & size_t(-2) rounds the count down to an even number for the unrolled loop.
5093  const size_t jnum( jend - jbegin );
5094  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
5095 
5096  for( size_t j=jbegin; j<jpos; j+=2UL ) {
5097  C(i,j ) -= A(i,i) * B(i,j );
5098  C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
5099  }
      // At most one remaining element if the count was odd.
5100  if( jpos < jend ) {
5101  C(i,jpos) -= A(i,i) * B(i,jpos);
5102  }
5103  }
5104  }
5106  //**********************************************************************************************
5107 
5108  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
// Default (scalar) subtraction assignment kernel for a column-major target where the left-hand
// side operand A is diagonal and the right-hand side operand B is not.
//
// Only the diagonal element A(i,i) is read for row i, so the update is
// C(i,j) -= A(i,i) * B(i,j). The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE
// elements. Within a tile, the i range of column j is narrowed to the (strictly) lower/upper
// structure of B.
//
// NOTE(review): the tiling is presumably a cache optimization for the differing storage
// orders of C and B -- confirm.
//
// C: target matrix of the subtraction assignment (modified).
// A: left-hand side diagonal matrix operand (read-only).
// B: right-hand side matrix operand (read-only).
5122  template< typename MT3 // Type of the left-hand side target matrix
5123  , typename MT4 // Type of the left-hand side matrix operand
5124  , typename MT5 > // Type of the right-hand side matrix operand
5125  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5126  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5127  {
5128  constexpr size_t block( BLOCK_SIZE );
5129 
5130  const size_t M( A.rows() );
5131  const size_t N( B.columns() );
5132 
      // Outer two loops iterate over the tiles, clamping the tile ends to N and M.
5133  for( size_t jj=0UL; jj<N; jj+=block ) {
5134  const size_t jend( min( N, jj+block ) );
5135  for( size_t ii=0UL; ii<M; ii+=block ) {
5136  const size_t iend( min( M, ii+block ) );
5137  for( size_t j=jj; j<jend; ++j )
5138  {
      // Rows of the current tile for which B(i,j) is not structurally zero.
5139  const size_t ibegin( ( IsLower_v<MT5> )
5140  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
5141  :( ii ) );
5142  const size_t ipos( ( IsUpper_v<MT5> )
5143  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
5144  :( iend ) );
5145 
5146  for( size_t i=ibegin; i<ipos; ++i ) {
5147  C(i,j) -= A(i,i) * B(i,j);
5148  }
5149  }
5150  }
5151  }
5152  }
5154  //**********************************************************************************************
5155 
5156  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default (scalar) subtraction assignment kernel for the case that both operands are diagonal.
//
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) for every row index i of A.
// The overload is not constrained on the storage order of C.
//
// C: target matrix of the subtraction assignment (modified on its diagonal only).
// A: left-hand side diagonal matrix operand (read-only).
// B: right-hand side diagonal matrix operand (read-only).
5170  template< typename MT3 // Type of the left-hand side target matrix
5171  , typename MT4 // Type of the left-hand side matrix operand
5172  , typename MT5 > // Type of the right-hand side matrix operand
5173  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5174  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5175  {
5176  for( size_t i=0UL; i<A.rows(); ++i ) {
5177  C(i,i) -= A(i,i) * B(i,i);
5178  }
5179  }
5181  //**********************************************************************************************
5182 
5183  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix subtraction assignment kernel, non-vectorized overload.
//
// This overload takes part in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t). It performs no work of
// its own and simply forwards the three arguments to selectDefaultSubAssignKernel().
//
// C: target matrix of the subtraction assignment (modified).
// A: left-hand side matrix operand (read-only).
// B: right-hand side matrix operand (read-only).
5197  template< typename MT3 // Type of the left-hand side target matrix
5198  , typename MT4 // Type of the left-hand side matrix operand
5199  , typename MT5 > // Type of the right-hand side matrix operand
5200  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5201  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5202  {
5203  selectDefaultSubAssignKernel( C, A, B );
5204  }
5206  //**********************************************************************************************
5207 
5208  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
5223  template< typename MT3 // Type of the left-hand side target matrix
5224  , typename MT4 // Type of the left-hand side matrix operand
5225  , typename MT5 > // Type of the right-hand side matrix operand
5226  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5227  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5228  {
5229  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5230 
5231  const size_t M( A.rows() );
5232  const size_t N( B.columns() );
5233  const size_t K( A.columns() );
5234 
5235  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5236 
5237  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
5238  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
5239 
5240  size_t j( 0UL );
5241 
5242  if( IsIntegral_v<ElementType> )
5243  {
5244  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5245  for( size_t i=0UL; i<M; ++i )
5246  {
5247  const size_t kbegin( ( IsUpper_v<MT4> )
5248  ?( ( IsLower_v<MT5> )
5249  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5250  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5251  :( IsLower_v<MT5> ? j : 0UL ) );
5252  const size_t kend( ( IsLower_v<MT4> )
5253  ?( ( IsUpper_v<MT5> )
5254  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5255  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5256  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
5257 
5258  SIMDType xmm1( C.load(i,j ) );
5259  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5260  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5261  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
5262  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
5263  SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
5264  SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
5265  SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
5266 
5267  for( size_t k=kbegin; k<kend; ++k ) {
5268  const SIMDType a1( set( A(i,k) ) );
5269  xmm1 -= a1 * B.load(k,j );
5270  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5271  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5272  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5273  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5274  xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
5275  xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
5276  xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
5277  }
5278 
5279  C.store( i, j , xmm1 );
5280  C.store( i, j+SIMDSIZE , xmm2 );
5281  C.store( i, j+SIMDSIZE*2UL, xmm3 );
5282  C.store( i, j+SIMDSIZE*3UL, xmm4 );
5283  C.store( i, j+SIMDSIZE*4UL, xmm5 );
5284  C.store( i, j+SIMDSIZE*5UL, xmm6 );
5285  C.store( i, j+SIMDSIZE*6UL, xmm7 );
5286  C.store( i, j+SIMDSIZE*7UL, xmm8 );
5287  }
5288  }
5289  }
5290 
5291  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5292  {
5293  size_t i( 0UL );
5294 
5295  for( ; (i+2UL) <= M; i+=2UL )
5296  {
5297  const size_t kbegin( ( IsUpper_v<MT4> )
5298  ?( ( IsLower_v<MT5> )
5299  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5300  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5301  :( IsLower_v<MT5> ? j : 0UL ) );
5302  const size_t kend( ( IsLower_v<MT4> )
5303  ?( ( IsUpper_v<MT5> )
5304  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
5305  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5306  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
5307 
5308  SIMDType xmm1 ( C.load(i ,j ) );
5309  SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
5310  SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
5311  SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
5312  SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
5313  SIMDType xmm6 ( C.load(i+1UL,j ) );
5314  SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
5315  SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
5316  SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
5317  SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
5318 
5319  for( size_t k=kbegin; k<kend; ++k ) {
5320  const SIMDType a1( set( A(i ,k) ) );
5321  const SIMDType a2( set( A(i+1UL,k) ) );
5322  const SIMDType b1( B.load(k,j ) );
5323  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5324  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5325  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5326  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5327  xmm1 -= a1 * b1;
5328  xmm2 -= a1 * b2;
5329  xmm3 -= a1 * b3;
5330  xmm4 -= a1 * b4;
5331  xmm5 -= a1 * b5;
5332  xmm6 -= a2 * b1;
5333  xmm7 -= a2 * b2;
5334  xmm8 -= a2 * b3;
5335  xmm9 -= a2 * b4;
5336  xmm10 -= a2 * b5;
5337  }
5338 
5339  C.store( i , j , xmm1 );
5340  C.store( i , j+SIMDSIZE , xmm2 );
5341  C.store( i , j+SIMDSIZE*2UL, xmm3 );
5342  C.store( i , j+SIMDSIZE*3UL, xmm4 );
5343  C.store( i , j+SIMDSIZE*4UL, xmm5 );
5344  C.store( i+1UL, j , xmm6 );
5345  C.store( i+1UL, j+SIMDSIZE , xmm7 );
5346  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
5347  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
5348  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
5349  }
5350 
5351  if( i < M )
5352  {
5353  const size_t kbegin( ( IsUpper_v<MT4> )
5354  ?( ( IsLower_v<MT5> )
5355  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5356  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5357  :( IsLower_v<MT5> ? j : 0UL ) );
5358  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
5359 
5360  SIMDType xmm1( C.load(i,j ) );
5361  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5362  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5363  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
5364  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
5365 
5366  for( size_t k=kbegin; k<kend; ++k ) {
5367  const SIMDType a1( set( A(i,k) ) );
5368  xmm1 -= a1 * B.load(k,j );
5369  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5370  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5371  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5372  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5373  }
5374 
5375  C.store( i, j , xmm1 );
5376  C.store( i, j+SIMDSIZE , xmm2 );
5377  C.store( i, j+SIMDSIZE*2UL, xmm3 );
5378  C.store( i, j+SIMDSIZE*3UL, xmm4 );
5379  C.store( i, j+SIMDSIZE*4UL, xmm5 );
5380  }
5381  }
5382 
5383  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5384  {
5385  size_t i( 0UL );
5386 
5387  for( ; (i+2UL) <= M; i+=2UL )
5388  {
5389  const size_t kbegin( ( IsUpper_v<MT4> )
5390  ?( ( IsLower_v<MT5> )
5391  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5392  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5393  :( IsLower_v<MT5> ? j : 0UL ) );
5394  const size_t kend( ( IsLower_v<MT4> )
5395  ?( ( IsUpper_v<MT5> )
5396  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5397  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5398  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
5399 
5400  SIMDType xmm1( C.load(i ,j ) );
5401  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
5402  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
5403  SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
5404  SIMDType xmm5( C.load(i+1UL,j ) );
5405  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
5406  SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
5407  SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
5408 
5409  for( size_t k=kbegin; k<kend; ++k ) {
5410  const SIMDType a1( set( A(i ,k) ) );
5411  const SIMDType a2( set( A(i+1UL,k) ) );
5412  const SIMDType b1( B.load(k,j ) );
5413  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5414  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5415  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5416  xmm1 -= a1 * b1;
5417  xmm2 -= a1 * b2;
5418  xmm3 -= a1 * b3;
5419  xmm4 -= a1 * b4;
5420  xmm5 -= a2 * b1;
5421  xmm6 -= a2 * b2;
5422  xmm7 -= a2 * b3;
5423  xmm8 -= a2 * b4;
5424  }
5425 
5426  C.store( i , j , xmm1 );
5427  C.store( i , j+SIMDSIZE , xmm2 );
5428  C.store( i , j+SIMDSIZE*2UL, xmm3 );
5429  C.store( i , j+SIMDSIZE*3UL, xmm4 );
5430  C.store( i+1UL, j , xmm5 );
5431  C.store( i+1UL, j+SIMDSIZE , xmm6 );
5432  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
5433  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
5434  }
5435 
5436  if( i < M )
5437  {
5438  const size_t kbegin( ( IsUpper_v<MT4> )
5439  ?( ( IsLower_v<MT5> )
5440  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5441  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5442  :( IsLower_v<MT5> ? j : 0UL ) );
5443  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
5444 
5445  SIMDType xmm1( C.load(i,j ) );
5446  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5447  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5448  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
5449 
5450  for( size_t k=kbegin; k<kend; ++k ) {
5451  const SIMDType a1( set( A(i,k) ) );
5452  xmm1 -= a1 * B.load(k,j );
5453  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5454  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5455  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5456  }
5457 
5458  C.store( i, j , xmm1 );
5459  C.store( i, j+SIMDSIZE , xmm2 );
5460  C.store( i, j+SIMDSIZE*2UL, xmm3 );
5461  C.store( i, j+SIMDSIZE*3UL, xmm4 );
5462  }
5463  }
5464 
5465  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5466  {
5467  size_t i( 0UL );
5468 
5469  for( ; (i+2UL) <= M; i+=2UL )
5470  {
5471  const size_t kbegin( ( IsUpper_v<MT4> )
5472  ?( ( IsLower_v<MT5> )
5473  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5474  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5475  :( IsLower_v<MT5> ? j : 0UL ) );
5476  const size_t kend( ( IsLower_v<MT4> )
5477  ?( ( IsUpper_v<MT5> )
5478  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
5479  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5480  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
5481 
5482  SIMDType xmm1( C.load(i ,j ) );
5483  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
5484  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
5485  SIMDType xmm4( C.load(i+1UL,j ) );
5486  SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
5487  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
5488 
5489  for( size_t k=kbegin; k<kend; ++k ) {
5490  const SIMDType a1( set( A(i ,k) ) );
5491  const SIMDType a2( set( A(i+1UL,k) ) );
5492  const SIMDType b1( B.load(k,j ) );
5493  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5494  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5495  xmm1 -= a1 * b1;
5496  xmm2 -= a1 * b2;
5497  xmm3 -= a1 * b3;
5498  xmm4 -= a2 * b1;
5499  xmm5 -= a2 * b2;
5500  xmm6 -= a2 * b3;
5501  }
5502 
5503  C.store( i , j , xmm1 );
5504  C.store( i , j+SIMDSIZE , xmm2 );
5505  C.store( i , j+SIMDSIZE*2UL, xmm3 );
5506  C.store( i+1UL, j , xmm4 );
5507  C.store( i+1UL, j+SIMDSIZE , xmm5 );
5508  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
5509  }
5510 
5511  if( i < M )
5512  {
5513  const size_t kbegin( ( IsUpper_v<MT4> )
5514  ?( ( IsLower_v<MT5> )
5515  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5516  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5517  :( IsLower_v<MT5> ? j : 0UL ) );
5518  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
5519 
5520  SIMDType xmm1( C.load(i,j ) );
5521  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5522  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5523 
5524  for( size_t k=kbegin; k<kend; ++k ) {
5525  const SIMDType a1( set( A(i,k) ) );
5526  xmm1 -= a1 * B.load(k,j );
5527  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5528  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5529  }
5530 
5531  C.store( i, j , xmm1 );
5532  C.store( i, j+SIMDSIZE , xmm2 );
5533  C.store( i, j+SIMDSIZE*2UL, xmm3 );
5534  }
5535  }
5536 
5537  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5538  {
5539  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
5540  size_t i( LOW ? j : 0UL );
5541 
5542  for( ; (i+4UL) <= iend; i+=4UL )
5543  {
5544  const size_t kbegin( ( IsUpper_v<MT4> )
5545  ?( ( IsLower_v<MT5> )
5546  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5547  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5548  :( IsLower_v<MT5> ? j : 0UL ) );
5549  const size_t kend( ( IsLower_v<MT4> )
5550  ?( ( IsUpper_v<MT5> )
5551  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
5552  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
5553  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5554 
5555  SIMDType xmm1( C.load(i ,j ) );
5556  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
5557  SIMDType xmm3( C.load(i+1UL,j ) );
5558  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
5559  SIMDType xmm5( C.load(i+2UL,j ) );
5560  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
5561  SIMDType xmm7( C.load(i+3UL,j ) );
5562  SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
5563 
5564  for( size_t k=kbegin; k<kend; ++k ) {
5565  const SIMDType a1( set( A(i ,k) ) );
5566  const SIMDType a2( set( A(i+1UL,k) ) );
5567  const SIMDType a3( set( A(i+2UL,k) ) );
5568  const SIMDType a4( set( A(i+3UL,k) ) );
5569  const SIMDType b1( B.load(k,j ) );
5570  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5571  xmm1 -= a1 * b1;
5572  xmm2 -= a1 * b2;
5573  xmm3 -= a2 * b1;
5574  xmm4 -= a2 * b2;
5575  xmm5 -= a3 * b1;
5576  xmm6 -= a3 * b2;
5577  xmm7 -= a4 * b1;
5578  xmm8 -= a4 * b2;
5579  }
5580 
5581  C.store( i , j , xmm1 );
5582  C.store( i , j+SIMDSIZE, xmm2 );
5583  C.store( i+1UL, j , xmm3 );
5584  C.store( i+1UL, j+SIMDSIZE, xmm4 );
5585  C.store( i+2UL, j , xmm5 );
5586  C.store( i+2UL, j+SIMDSIZE, xmm6 );
5587  C.store( i+3UL, j , xmm7 );
5588  C.store( i+3UL, j+SIMDSIZE, xmm8 );
5589  }
5590 
5591  for( ; (i+3UL) <= iend; i+=3UL )
5592  {
5593  const size_t kbegin( ( IsUpper_v<MT4> )
5594  ?( ( IsLower_v<MT5> )
5595  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5596  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5597  :( IsLower_v<MT5> ? j : 0UL ) );
5598  const size_t kend( ( IsLower_v<MT4> )
5599  ?( ( IsUpper_v<MT5> )
5600  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
5601  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
5602  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5603 
5604  SIMDType xmm1( C.load(i ,j ) );
5605  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
5606  SIMDType xmm3( C.load(i+1UL,j ) );
5607  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
5608  SIMDType xmm5( C.load(i+2UL,j ) );
5609  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
5610 
5611  for( size_t k=kbegin; k<kend; ++k ) {
5612  const SIMDType a1( set( A(i ,k) ) );
5613  const SIMDType a2( set( A(i+1UL,k) ) );
5614  const SIMDType a3( set( A(i+2UL,k) ) );
5615  const SIMDType b1( B.load(k,j ) );
5616  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5617  xmm1 -= a1 * b1;
5618  xmm2 -= a1 * b2;
5619  xmm3 -= a2 * b1;
5620  xmm4 -= a2 * b2;
5621  xmm5 -= a3 * b1;
5622  xmm6 -= a3 * b2;
5623  }
5624 
5625  C.store( i , j , xmm1 );
5626  C.store( i , j+SIMDSIZE, xmm2 );
5627  C.store( i+1UL, j , xmm3 );
5628  C.store( i+1UL, j+SIMDSIZE, xmm4 );
5629  C.store( i+2UL, j , xmm5 );
5630  C.store( i+2UL, j+SIMDSIZE, xmm6 );
5631  }
5632 
5633  for( ; (i+2UL) <= iend; i+=2UL )
5634  {
5635  const size_t kbegin( ( IsUpper_v<MT4> )
5636  ?( ( IsLower_v<MT5> )
5637  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5638  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5639  :( IsLower_v<MT5> ? j : 0UL ) );
5640  const size_t kend( ( IsLower_v<MT4> )
5641  ?( ( IsUpper_v<MT5> )
5642  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
5643  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5644  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5645 
5646  SIMDType xmm1( C.load(i ,j ) );
5647  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
5648  SIMDType xmm3( C.load(i+1UL,j ) );
5649  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
5650  SIMDType xmm5, xmm6, xmm7, xmm8;
5651  size_t k( kbegin );
5652 
5653  for( ; (k+2UL) <= kend; k+=2UL ) {
5654  const SIMDType a1( set( A(i ,k ) ) );
5655  const SIMDType a2( set( A(i+1UL,k ) ) );
5656  const SIMDType a3( set( A(i ,k+1UL) ) );
5657  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
5658  const SIMDType b1( B.load(k ,j ) );
5659  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
5660  const SIMDType b3( B.load(k+1UL,j ) );
5661  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
5662  xmm1 -= a1 * b1;
5663  xmm2 -= a1 * b2;
5664  xmm3 -= a2 * b1;
5665  xmm4 -= a2 * b2;
5666  xmm5 -= a3 * b3;
5667  xmm6 -= a3 * b4;
5668  xmm7 -= a4 * b3;
5669  xmm8 -= a4 * b4;
5670  }
5671 
5672  for( ; k<kend; ++k ) {
5673  const SIMDType a1( set( A(i ,k) ) );
5674  const SIMDType a2( set( A(i+1UL,k) ) );
5675  const SIMDType b1( B.load(k,j ) );
5676  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5677  xmm1 -= a1 * b1;
5678  xmm2 -= a1 * b2;
5679  xmm3 -= a2 * b1;
5680  xmm4 -= a2 * b2;
5681  }
5682 
5683  C.store( i , j , xmm1+xmm5 );
5684  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
5685  C.store( i+1UL, j , xmm3+xmm7 );
5686  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
5687  }
5688 
5689  if( i < iend )
5690  {
5691  const size_t kbegin( ( IsUpper_v<MT4> )
5692  ?( ( IsLower_v<MT5> )
5693  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5694  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5695  :( IsLower_v<MT5> ? j : 0UL ) );
5696  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
5697 
5698  SIMDType xmm1( C.load(i,j ) );
5699  SIMDType xmm2( C.load(i,j+SIMDSIZE) );
5700  SIMDType xmm3, xmm4;
5701  size_t k( kbegin );
5702 
5703  for( ; (k+2UL) <= kend; k+=2UL ) {
5704  const SIMDType a1( set( A(i,k ) ) );
5705  const SIMDType a2( set( A(i,k+1UL) ) );
5706  xmm1 -= a1 * B.load(k ,j );
5707  xmm2 -= a1 * B.load(k ,j+SIMDSIZE);
5708  xmm3 -= a2 * B.load(k+1UL,j );
5709  xmm4 -= a2 * B.load(k+1UL,j+SIMDSIZE);
5710  }
5711 
5712  for( ; k<kend; ++k ) {
5713  const SIMDType a1( set( A(i,k) ) );
5714  xmm1 -= a1 * B.load(k,j );
5715  xmm2 -= a1 * B.load(k,j+SIMDSIZE);
5716  }
5717 
5718  C.store( i, j , xmm1+xmm3 );
5719  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
5720  }
5721  }
5722 
5723  for( ; j<jpos; j+=SIMDSIZE )
5724  {
5725  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
5726  size_t i( LOW ? j : 0UL );
5727 
5728  for( ; (i+4UL) <= iend; i+=4UL )
5729  {
5730  const size_t kbegin( ( IsUpper_v<MT4> )
5731  ?( ( IsLower_v<MT5> )
5732  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5733  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5734  :( IsLower_v<MT5> ? j : 0UL ) );
5735  const size_t kend( ( IsLower_v<MT4> )
5736  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
5737  :( K ) );
5738 
5739  SIMDType xmm1( C.load(i ,j) );
5740  SIMDType xmm2( C.load(i+1UL,j) );
5741  SIMDType xmm3( C.load(i+2UL,j) );
5742  SIMDType xmm4( C.load(i+3UL,j) );
5743  SIMDType xmm5, xmm6, xmm7, xmm8;
5744  size_t k( kbegin );
5745 
5746  for( ; (k+2UL) <= kend; k+=2UL ) {
5747  const SIMDType b1( B.load(k ,j) );
5748  const SIMDType b2( B.load(k+1UL,j) );
5749  xmm1 -= set( A(i ,k ) ) * b1;
5750  xmm2 -= set( A(i+1UL,k ) ) * b1;
5751  xmm3 -= set( A(i+2UL,k ) ) * b1;
5752  xmm4 -= set( A(i+3UL,k ) ) * b1;
5753  xmm5 -= set( A(i ,k+1UL) ) * b2;
5754  xmm6 -= set( A(i+1UL,k+1UL) ) * b2;
5755  xmm7 -= set( A(i+2UL,k+1UL) ) * b2;
5756  xmm8 -= set( A(i+3UL,k+1UL) ) * b2;
5757  }
5758 
5759  for( ; k<kend; ++k ) {
5760  const SIMDType b1( B.load(k,j) );
5761  xmm1 -= set( A(i ,k) ) * b1;
5762  xmm2 -= set( A(i+1UL,k) ) * b1;
5763  xmm3 -= set( A(i+2UL,k) ) * b1;
5764  xmm4 -= set( A(i+3UL,k) ) * b1;
5765  }
5766 
5767  C.store( i , j, xmm1+xmm5 );
5768  C.store( i+1UL, j, xmm2+xmm6 );
5769  C.store( i+2UL, j, xmm3+xmm7 );
5770  C.store( i+3UL, j, xmm4+xmm8 );
5771  }
5772 
5773  for( ; (i+3UL) <= iend; i+=3UL )
5774  {
5775  const size_t kbegin( ( IsUpper_v<MT4> )
5776  ?( ( IsLower_v<MT5> )
5777  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5778  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5779  :( IsLower_v<MT5> ? j : 0UL ) );
5780  const size_t kend( ( IsLower_v<MT4> )
5781  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
5782  :( K ) );
5783 
5784  SIMDType xmm1( C.load(i ,j) );
5785  SIMDType xmm2( C.load(i+1UL,j) );
5786  SIMDType xmm3( C.load(i+2UL,j) );
5787  SIMDType xmm4, xmm5, xmm6;
5788  size_t k( kbegin );
5789 
5790  for( ; (k+2UL) <= kend; k+=2UL ) {
5791  const SIMDType b1( B.load(k ,j) );
5792  const SIMDType b2( B.load(k+1UL,j) );
5793  xmm1 -= set( A(i ,k ) ) * b1;
5794  xmm2 -= set( A(i+1UL,k ) ) * b1;
5795  xmm3 -= set( A(i+2UL,k ) ) * b1;
5796  xmm4 -= set( A(i ,k+1UL) ) * b2;
5797  xmm5 -= set( A(i+1UL,k+1UL) ) * b2;
5798  xmm6 -= set( A(i+2UL,k+1UL) ) * b2;
5799  }
5800 
5801  for( ; k<kend; ++k ) {
5802  const SIMDType b1( B.load(k,j) );
5803  xmm1 -= set( A(i ,k) ) * b1;
5804  xmm2 -= set( A(i+1UL,k) ) * b1;
5805  xmm3 -= set( A(i+2UL,k) ) * b1;
5806  }
5807 
5808  C.store( i , j, xmm1+xmm4 );
5809  C.store( i+1UL, j, xmm2+xmm5 );
5810  C.store( i+2UL, j, xmm3+xmm6 );
5811  }
5812 
5813  for( ; (i+2UL) <= iend; i+=2UL )
5814  {
5815  const size_t kbegin( ( IsUpper_v<MT4> )
5816  ?( ( IsLower_v<MT5> )
5817  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5818  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5819  :( IsLower_v<MT5> ? j : 0UL ) );
5820  const size_t kend( ( IsLower_v<MT4> )
5821  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5822  :( K ) );
5823 
5824  SIMDType xmm1( C.load(i ,j) );
5825  SIMDType xmm2( C.load(i+1UL,j) );
5826  SIMDType xmm3, xmm4;
5827  size_t k( kbegin );
5828 
5829  for( ; (k+2UL) <= kend; k+=2UL ) {
5830  const SIMDType b1( B.load(k ,j) );
5831  const SIMDType b2( B.load(k+1UL,j) );
5832  xmm1 -= set( A(i ,k ) ) * b1;
5833  xmm2 -= set( A(i+1UL,k ) ) * b1;
5834  xmm3 -= set( A(i ,k+1UL) ) * b2;
5835  xmm4 -= set( A(i+1UL,k+1UL) ) * b2;
5836  }
5837 
5838  for( ; k<kend; ++k ) {
5839  const SIMDType b1( B.load(k,j) );
5840  xmm1 -= set( A(i ,k) ) * b1;
5841  xmm2 -= set( A(i+1UL,k) ) * b1;
5842  }
5843 
5844  C.store( i , j, xmm1+xmm3 );
5845  C.store( i+1UL, j, xmm2+xmm4 );
5846  }
5847 
5848  if( i < iend )
5849  {
5850  const size_t kbegin( ( IsUpper_v<MT4> )
5851  ?( ( IsLower_v<MT5> )
5852  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5853  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5854  :( IsLower_v<MT5> ? j : 0UL ) );
5855 
5856  SIMDType xmm1( C.load(i,j) );
5857  SIMDType xmm2;
5858  size_t k( kbegin );
5859 
5860  for( ; (k+2UL) <= K; k+=2UL ) {
5861  xmm1 -= set( A(i,k ) ) * B.load(k ,j);
5862  xmm2 -= set( A(i,k+1UL) ) * B.load(k+1UL,j);
5863  }
5864 
5865  for( ; k<K; ++k ) {
5866  xmm1 -= set( A(i,k) ) * B.load(k,j);
5867  }
5868 
5869  C.store( i, j, xmm1+xmm2 );
5870  }
5871  }
5872 
5873  for( ; remainder && j<N; ++j )
5874  {
5875  const size_t iend( UPP ? j+1UL : M );
5876  size_t i( LOW ? j : 0UL );
5877 
5878  for( ; (i+2UL) <= iend; i+=2UL )
5879  {
5880  const size_t kbegin( ( IsUpper_v<MT4> )
5881  ?( ( IsLower_v<MT5> )
5882  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5883  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5884  :( IsLower_v<MT5> ? j : 0UL ) );
5885  const size_t kend( ( IsLower_v<MT4> )
5886  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5887  :( K ) );
5888 
5889  ElementType value1( C(i ,j) );
5890  ElementType value2( C(i+1UL,j) );
5891 
5892  for( size_t k=kbegin; k<kend; ++k ) {
5893  value1 -= A(i ,k) * B(k,j);
5894  value2 -= A(i+1UL,k) * B(k,j);
5895  }
5896 
5897  C(i ,j) = value1;
5898  C(i+1UL,j) = value2;
5899  }
5900 
5901  if( i < iend )
5902  {
5903  const size_t kbegin( ( IsUpper_v<MT4> )
5904  ?( ( IsLower_v<MT5> )
5905  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5906  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5907  :( IsLower_v<MT5> ? j : 0UL ) );
5908 
5909  ElementType value( C(i,j) );
5910 
5911  for( size_t k=kbegin; k<K; ++k ) {
5912  value -= A(i,k) * B(k,j);
5913  }
5914 
5915  C(i,j) = value;
5916  }
5917  }
5918  }
5920  //**********************************************************************************************
5921 
5922  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
5937  template< typename MT3 // Type of the left-hand side target matrix
5938  , typename MT4 // Type of the left-hand side matrix operand
5939  , typename MT5 > // Type of the right-hand side matrix operand
5940  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5941  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5942  {
5943  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
5944 
5945  const size_t M( A.rows() );
5946  const size_t N( B.columns() );
5947  const size_t K( A.columns() );
5948 
5949  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5950 
5951  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
5952  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5953 
5954  size_t i( 0UL );
5955 
5956  if( IsIntegral_v<ElementType> )
5957  {
5958  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5959  for( size_t j=0UL; j<N; ++j )
5960  {
5961  const size_t kbegin( ( IsLower_v<MT5> )
5962  ?( ( IsUpper_v<MT4> )
5963  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5964  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5965  :( IsUpper_v<MT4> ? i : 0UL ) );
5966  const size_t kend( ( IsUpper_v<MT5> )
5967  ?( ( IsLower_v<MT4> )
5968  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
5969  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
5970  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
5971 
5972  SIMDType xmm1( C.load(i ,j) );
5973  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
5974  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
5975  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
5976  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
5977  SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
5978  SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
5979  SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
5980 
5981  for( size_t k=kbegin; k<kend; ++k ) {
5982  const SIMDType b1( set( B(k,j) ) );
5983  xmm1 -= A.load(i ,k) * b1;
5984  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5985  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5986  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5987  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5988  xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
5989  xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
5990  xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
5991  }
5992 
5993  C.store( i , j, xmm1 );
5994  C.store( i+SIMDSIZE , j, xmm2 );
5995  C.store( i+SIMDSIZE*2UL, j, xmm3 );
5996  C.store( i+SIMDSIZE*3UL, j, xmm4 );
5997  C.store( i+SIMDSIZE*4UL, j, xmm5 );
5998  C.store( i+SIMDSIZE*5UL, j, xmm6 );
5999  C.store( i+SIMDSIZE*6UL, j, xmm7 );
6000  C.store( i+SIMDSIZE*7UL, j, xmm8 );
6001  }
6002  }
6003  }
6004 
6005  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
6006  {
6007  size_t j( 0UL );
6008 
6009  for( ; (j+2UL) <= N; j+=2UL )
6010  {
6011  const size_t kbegin( ( IsLower_v<MT5> )
6012  ?( ( IsUpper_v<MT4> )
6013  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6014  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6015  :( IsUpper_v<MT4> ? i : 0UL ) );
6016  const size_t kend( ( IsUpper_v<MT5> )
6017  ?( ( IsLower_v<MT4> )
6018  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6019  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6020  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
6021 
6022  SIMDType xmm1 ( C.load(i ,j ) );
6023  SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
6024  SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
6025  SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
6026  SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
6027  SIMDType xmm6 ( C.load(i ,j+1UL) );
6028  SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
6029  SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
6030  SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
6031  SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
6032 
6033  for( size_t k=kbegin; k<kend; ++k ) {
6034  const SIMDType a1( A.load(i ,k) );
6035  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6036  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6037  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6038  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6039  const SIMDType b1( set( B(k,j ) ) );
6040  const SIMDType b2( set( B(k,j+1UL) ) );
6041  xmm1 -= a1 * b1;
6042  xmm2 -= a2 * b1;
6043  xmm3 -= a3 * b1;
6044  xmm4 -= a4 * b1;
6045  xmm5 -= a5 * b1;
6046  xmm6 -= a1 * b2;
6047  xmm7 -= a2 * b2;
6048  xmm8 -= a3 * b2;
6049  xmm9 -= a4 * b2;
6050  xmm10 -= a5 * b2;
6051  }
6052 
6053  C.store( i , j , xmm1 );
6054  C.store( i+SIMDSIZE , j , xmm2 );
6055  C.store( i+SIMDSIZE*2UL, j , xmm3 );
6056  C.store( i+SIMDSIZE*3UL, j , xmm4 );
6057  C.store( i+SIMDSIZE*4UL, j , xmm5 );
6058  C.store( i , j+1UL, xmm6 );
6059  C.store( i+SIMDSIZE , j+1UL, xmm7 );
6060  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
6061  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
6062  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
6063  }
6064 
6065  if( j < N )
6066  {
6067  const size_t kbegin( ( IsLower_v<MT5> )
6068  ?( ( IsUpper_v<MT4> )
6069  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6070  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6071  :( IsUpper_v<MT4> ? i : 0UL ) );
6072  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
6073 
6074  SIMDType xmm1( C.load(i ,j) );
6075  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
6076  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
6077  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
6078  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
6079 
6080  for( size_t k=kbegin; k<kend; ++k ) {
6081  const SIMDType b1( set( B(k,j) ) );
6082  xmm1 -= A.load(i ,k) * b1;
6083  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6084  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6085  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
6086  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
6087  }
6088 
6089  C.store( i , j, xmm1 );
6090  C.store( i+SIMDSIZE , j, xmm2 );
6091  C.store( i+SIMDSIZE*2UL, j, xmm3 );
6092  C.store( i+SIMDSIZE*3UL, j, xmm4 );
6093  C.store( i+SIMDSIZE*4UL, j, xmm5 );
6094  }
6095  }
6096 
6097  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6098  {
6099  size_t j( 0UL );
6100 
6101  for( ; (j+2UL) <= N; j+=2UL )
6102  {
6103  const size_t kbegin( ( IsLower_v<MT5> )
6104  ?( ( IsUpper_v<MT4> )
6105  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6106  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6107  :( IsUpper_v<MT4> ? i : 0UL ) );
6108  const size_t kend( ( IsUpper_v<MT5> )
6109  ?( ( IsLower_v<MT4> )
6110  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6111  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6112  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
6113 
6114  SIMDType xmm1( C.load(i ,j ) );
6115  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
6116  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
6117  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
6118  SIMDType xmm5( C.load(i ,j+1UL) );
6119  SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
6120  SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
6121  SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
6122 
6123  for( size_t k=kbegin; k<kend; ++k ) {
6124  const SIMDType a1( A.load(i ,k) );
6125  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6126  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6127  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6128  const SIMDType b1( set( B(k,j ) ) );
6129  const SIMDType b2( set( B(k,j+1UL) ) );
6130  xmm1 -= a1 * b1;
6131  xmm2 -= a2 * b1;
6132  xmm3 -= a3 * b1;
6133  xmm4 -= a4 * b1;
6134  xmm5 -= a1 * b2;
6135  xmm6 -= a2 * b2;
6136  xmm7 -= a3 * b2;
6137  xmm8 -= a4 * b2;
6138  }
6139 
6140  C.store( i , j , xmm1 );
6141  C.store( i+SIMDSIZE , j , xmm2 );
6142  C.store( i+SIMDSIZE*2UL, j , xmm3 );
6143  C.store( i+SIMDSIZE*3UL, j , xmm4 );
6144  C.store( i , j+1UL, xmm5 );
6145  C.store( i+SIMDSIZE , j+1UL, xmm6 );
6146  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
6147  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
6148  }
6149 
6150  if( j < N )
6151  {
6152  const size_t kbegin( ( IsLower_v<MT5> )
6153  ?( ( IsUpper_v<MT4> )
6154  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6155  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6156  :( IsUpper_v<MT4> ? i : 0UL ) );
6157  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
6158 
6159  SIMDType xmm1( C.load(i ,j) );
6160  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
6161  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
6162  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
6163 
6164  for( size_t k=kbegin; k<kend; ++k ) {
6165  const SIMDType b1( set( B(k,j) ) );
6166  xmm1 -= A.load(i ,k) * b1;
6167  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6168  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6169  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
6170  }
6171 
6172  C.store( i , j, xmm1 );
6173  C.store( i+SIMDSIZE , j, xmm2 );
6174  C.store( i+SIMDSIZE*2UL, j, xmm3 );
6175  C.store( i+SIMDSIZE*3UL, j, xmm4 );
6176  }
6177  }
6178 
6179  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
6180  {
6181  size_t j( 0UL );
6182 
6183  for( ; (j+2UL) <= N; j+=2UL )
6184  {
6185  const size_t kbegin( ( IsLower_v<MT5> )
6186  ?( ( IsUpper_v<MT4> )
6187  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6188  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6189  :( IsUpper_v<MT4> ? i : 0UL ) );
6190  const size_t kend( ( IsUpper_v<MT5> )
6191  ?( ( IsLower_v<MT4> )
6192  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6193  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6194  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
6195 
6196  SIMDType xmm1( C.load(i ,j ) );
6197  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
6198  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
6199  SIMDType xmm4( C.load(i ,j+1UL) );
6200  SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
6201  SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
6202 
6203  for( size_t k=kbegin; k<kend; ++k ) {
6204  const SIMDType a1( A.load(i ,k) );
6205  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6206  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6207  const SIMDType b1( set( B(k,j ) ) );
6208  const SIMDType b2( set( B(k,j+1UL) ) );
6209  xmm1 -= a1 * b1;
6210  xmm2 -= a2 * b1;
6211  xmm3 -= a3 * b1;
6212  xmm4 -= a1 * b2;
6213  xmm5 -= a2 * b2;
6214  xmm6 -= a3 * b2;
6215  }
6216 
6217  C.store( i , j , xmm1 );
6218  C.store( i+SIMDSIZE , j , xmm2 );
6219  C.store( i+SIMDSIZE*2UL, j , xmm3 );
6220  C.store( i , j+1UL, xmm4 );
6221  C.store( i+SIMDSIZE , j+1UL, xmm5 );
6222  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
6223  }
6224 
6225  if( j < N )
6226  {
6227  const size_t kbegin( ( IsLower_v<MT5> )
6228  ?( ( IsUpper_v<MT4> )
6229  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6230  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6231  :( IsUpper_v<MT4> ? i : 0UL ) );
6232  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
6233 
6234  SIMDType xmm1( C.load(i ,j) );
6235  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
6236  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
6237 
6238  for( size_t k=kbegin; k<kend; ++k ) {
6239  const SIMDType b1( set( B(k,j) ) );
6240  xmm1 -= A.load(i ,k) * b1;
6241  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6242  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6243  }
6244 
6245  C.store( i , j, xmm1 );
6246  C.store( i+SIMDSIZE , j, xmm2 );
6247  C.store( i+SIMDSIZE*2UL, j, xmm3 );
6248  }
6249  }
6250 
6251  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6252  {
6253  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
6254  size_t j( UPP ? i : 0UL );
6255 
6256  for( ; (j+4UL) <= jend; j+=4UL )
6257  {
6258  const size_t kbegin( ( IsLower_v<MT5> )
6259  ?( ( IsUpper_v<MT4> )
6260  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6261  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6262  :( IsUpper_v<MT4> ? i : 0UL ) );
6263  const size_t kend( ( IsUpper_v<MT5> )
6264  ?( ( IsLower_v<MT4> )
6265  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
6266  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
6267  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6268 
6269  SIMDType xmm1( C.load(i ,j ) );
6270  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
6271  SIMDType xmm3( C.load(i ,j+1UL) );
6272  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
6273  SIMDType xmm5( C.load(i ,j+2UL) );
6274  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
6275  SIMDType xmm7( C.load(i ,j+3UL) );
6276  SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
6277 
6278  for( size_t k=kbegin; k<kend; ++k ) {
6279  const SIMDType a1( A.load(i ,k) );
6280  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6281  const SIMDType b1( set( B(k,j ) ) );
6282  const SIMDType b2( set( B(k,j+1UL) ) );
6283  const SIMDType b3( set( B(k,j+2UL) ) );
6284  const SIMDType b4( set( B(k,j+3UL) ) );
6285  xmm1 -= a1 * b1;
6286  xmm2 -= a2 * b1;
6287  xmm3 -= a1 * b2;
6288  xmm4 -= a2 * b2;
6289  xmm5 -= a1 * b3;
6290  xmm6 -= a2 * b3;
6291  xmm7 -= a1 * b4;
6292  xmm8 -= a2 * b4;
6293  }
6294 
6295  C.store( i , j , xmm1 );
6296  C.store( i+SIMDSIZE, j , xmm2 );
6297  C.store( i , j+1UL, xmm3 );
6298  C.store( i+SIMDSIZE, j+1UL, xmm4 );
6299  C.store( i , j+2UL, xmm5 );
6300  C.store( i+SIMDSIZE, j+2UL, xmm6 );
6301  C.store( i , j+3UL, xmm7 );
6302  C.store( i+SIMDSIZE, j+3UL, xmm8 );
6303  }
6304 
6305  for( ; (j+3UL) <= jend; j+=3UL )
6306  {
6307  const size_t kbegin( ( IsLower_v<MT5> )
6308  ?( ( IsUpper_v<MT4> )
6309  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6310  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6311  :( IsUpper_v<MT4> ? i : 0UL ) );
6312  const size_t kend( ( IsUpper_v<MT5> )
6313  ?( ( IsLower_v<MT4> )
6314  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
6315  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
6316  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6317 
6318  SIMDType xmm1( C.load(i ,j ) );
6319  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
6320  SIMDType xmm3( C.load(i ,j+1UL) );
6321  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
6322  SIMDType xmm5( C.load(i ,j+2UL) );
6323  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
6324 
6325  for( size_t k=kbegin; k<kend; ++k ) {
6326  const SIMDType a1( A.load(i ,k) );
6327  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6328  const SIMDType b1( set( B(k,j ) ) );
6329  const SIMDType b2( set( B(k,j+1UL) ) );
6330  const SIMDType b3( set( B(k,j+2UL) ) );
6331  xmm1 -= a1 * b1;
6332  xmm2 -= a2 * b1;
6333  xmm3 -= a1 * b2;
6334  xmm4 -= a2 * b2;
6335  xmm5 -= a1 * b3;
6336  xmm6 -= a2 * b3;
6337  }
6338 
6339  C.store( i , j , xmm1 );
6340  C.store( i+SIMDSIZE, j , xmm2 );
6341  C.store( i , j+1UL, xmm3 );
6342  C.store( i+SIMDSIZE, j+1UL, xmm4 );
6343  C.store( i , j+2UL, xmm5 );
6344  C.store( i+SIMDSIZE, j+2UL, xmm6 );
6345  }
6346 
6347  for( ; (j+2UL) <= jend; j+=2UL )
6348  {
6349  const size_t kbegin( ( IsLower_v<MT5> )
6350  ?( ( IsUpper_v<MT4> )
6351  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6352  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6353  :( IsUpper_v<MT4> ? i : 0UL ) );
6354  const size_t kend( ( IsUpper_v<MT5> )
6355  ?( ( IsLower_v<MT4> )
6356  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6357  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6358  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6359 
6360  SIMDType xmm1( C.load(i ,j ) );
6361  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
6362  SIMDType xmm3( C.load(i ,j+1UL) );
6363  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
6364  SIMDType xmm5, xmm6, xmm7, xmm8;
6365  size_t k( kbegin );
6366 
6367  for( ; (k+2UL) <= kend; k+=2UL ) {
6368  const SIMDType a1( A.load(i ,k ) );
6369  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6370  const SIMDType a3( A.load(i ,k+1UL) );
6371  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6372  const SIMDType b1( set( B(k ,j ) ) );
6373  const SIMDType b2( set( B(k ,j+1UL) ) );
6374  const SIMDType b3( set( B(k+1UL,j ) ) );
6375  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
6376  xmm1 -= a1 * b1;
6377  xmm2 -= a2 * b1;
6378  xmm3 -= a1 * b2;
6379  xmm4 -= a2 * b2;
6380  xmm5 -= a3 * b3;
6381  xmm6 -= a4 * b3;
6382  xmm7 -= a3 * b4;
6383  xmm8 -= a4 * b4;
6384  }
6385 
6386  for( ; k<kend; ++k ) {
6387  const SIMDType a1( A.load(i ,k) );
6388  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6389  const SIMDType b1( set( B(k,j ) ) );
6390  const SIMDType b2( set( B(k,j+1UL) ) );
6391  xmm1 -= a1 * b1;
6392  xmm2 -= a2 * b1;
6393  xmm3 -= a1 * b2;
6394  xmm4 -= a2 * b2;
6395  }
6396 
6397  C.store( i , j , xmm1+xmm5 );
6398  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
6399  C.store( i , j+1UL, xmm3+xmm7 );
6400  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
6401  }
6402 
6403  if( j < jend )
6404  {
6405  const size_t kbegin( ( IsLower_v<MT5> )
6406  ?( ( IsUpper_v<MT4> )
6407  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6408  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6409  :( IsUpper_v<MT4> ? i : 0UL ) );
6410  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
6411 
6412  SIMDType xmm1( C.load(i ,j) );
6413  SIMDType xmm2( C.load(i+SIMDSIZE,j) );
6414  SIMDType xmm3, xmm4;
6415  size_t k( kbegin );
6416 
6417  for( ; (k+2UL) <= kend; k+=2UL ) {
6418  const SIMDType b1( set( B(k ,j) ) );
6419  const SIMDType b2( set( B(k+1UL,j) ) );
6420  xmm1 -= A.load(i ,k ) * b1;
6421  xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
6422  xmm3 -= A.load(i ,k+1UL) * b2;
6423  xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
6424  }
6425 
6426  for( ; k<kend; ++k ) {
6427  const SIMDType b1( set( B(k,j) ) );
6428  xmm1 -= A.load(i ,k) * b1;
6429  xmm2 -= A.load(i+SIMDSIZE,k) * b1;
6430  }
6431 
6432  C.store( i , j, xmm1+xmm3 );
6433  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
6434  }
6435  }
6436 
6437  for( ; i<ipos; i+=SIMDSIZE )
6438  {
6439  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
6440  size_t j( UPP ? i : 0UL );
6441 
6442  for( ; (j+4UL) <= jend; j+=4UL )
6443  {
6444  const size_t kbegin( ( IsLower_v<MT5> )
6445  ?( ( IsUpper_v<MT4> )
6446  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6447  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6448  :( IsUpper_v<MT4> ? i : 0UL ) );
6449  const size_t kend( ( IsUpper_v<MT5> )
6450  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
6451  :( K ) );
6452 
6453  SIMDType xmm1( C.load(i,j ) );
6454  SIMDType xmm2( C.load(i,j+1UL) );
6455  SIMDType xmm3( C.load(i,j+2UL) );
6456  SIMDType xmm4( C.load(i,j+3UL) );
6457  SIMDType xmm5, xmm6, xmm7, xmm8;
6458  size_t k( kbegin );
6459 
6460  for( ; (k+2UL) <= kend; k+=2UL ) {
6461  const SIMDType a1( A.load(i,k ) );
6462  const SIMDType a2( A.load(i,k+1UL) );
6463  xmm1 -= a1 * set( B(k ,j ) );
6464  xmm2 -= a1 * set( B(k ,j+1UL) );
6465  xmm3 -= a1 * set( B(k ,j+2UL) );
6466  xmm4 -= a1 * set( B(k ,j+3UL) );
6467  xmm5 -= a2 * set( B(k+1UL,j ) );
6468  xmm6 -= a2 * set( B(k+1UL,j+1UL) );
6469  xmm7 -= a2 * set( B(k+1UL,j+2UL) );
6470  xmm8 -= a2 * set( B(k+1UL,j+3UL) );
6471  }
6472 
6473  for( ; k<kend; ++k ) {
6474  const SIMDType a1( A.load(i,k) );
6475  xmm1 -= a1 * set( B(k,j ) );
6476  xmm2 -= a1 * set( B(k,j+1UL) );
6477  xmm3 -= a1 * set( B(k,j+2UL) );
6478  xmm4 -= a1 * set( B(k,j+3UL) );
6479  }
6480 
6481  C.store( i, j , xmm1+xmm5 );
6482  C.store( i, j+1UL, xmm2+xmm6 );
6483  C.store( i, j+2UL, xmm3+xmm7 );
6484  C.store( i, j+3UL, xmm4+xmm8 );
6485  }
6486 
6487  for( ; (j+3UL) <= jend; j+=3UL )
6488  {
6489  const size_t kbegin( ( IsLower_v<MT5> )
6490  ?( ( IsUpper_v<MT4> )
6491  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6492  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6493  :( IsUpper_v<MT4> ? i : 0UL ) );
6494  const size_t kend( ( IsUpper_v<MT5> )
6495  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
6496  :( K ) );
6497 
6498  SIMDType xmm1( C.load(i,j ) );
6499  SIMDType xmm2( C.load(i,j+1UL) );
6500  SIMDType xmm3( C.load(i,j+2UL) );
6501  SIMDType xmm4, xmm5, xmm6;
6502  size_t k( kbegin );
6503 
6504  for( ; (k+2UL) <= kend; k+=2UL ) {
6505  const SIMDType a1( A.load(i,k ) );
6506  const SIMDType a2( A.load(i,k+1UL) );
6507  xmm1 -= a1 * set( B(k ,j ) );
6508  xmm2 -= a1 * set( B(k ,j+1UL) );
6509  xmm3 -= a1 * set( B(k ,j+2UL) );
6510  xmm4 -= a2 * set( B(k+1UL,j ) );
6511  xmm5 -= a2 * set( B(k+1UL,j+1UL) );
6512  xmm6 -= a2 * set( B(k+1UL,j+2UL) );
6513  }
6514 
6515  for( ; k<kend; ++k ) {
6516  const SIMDType a1( A.load(i,k) );
6517  xmm1 -= a1 * set( B(k,j ) );
6518  xmm2 -= a1 * set( B(k,j+1UL) );
6519  xmm3 -= a1 * set( B(k,j+2UL) );
6520  }
6521 
6522  C.store( i, j , xmm1+xmm4 );
6523  C.store( i, j+1UL, xmm2+xmm5 );
6524  C.store( i, j+2UL, xmm3+xmm6 );
6525  }
6526 
6527  for( ; (j+2UL) <= jend; j+=2UL )
6528  {
6529  const size_t kbegin( ( IsLower_v<MT5> )
6530  ?( ( IsUpper_v<MT4> )
6531  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6532  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6533  :( IsUpper_v<MT4> ? i : 0UL ) );
6534  const size_t kend( ( IsUpper_v<MT5> )
6535  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6536  :( K ) );
6537 
6538  SIMDType xmm1( C.load(i,j ) );
6539  SIMDType xmm2( C.load(i,j+1UL) );
6540  SIMDType xmm3, xmm4;
6541  size_t k( kbegin );
6542 
6543  for( ; (k+2UL) <= kend; k+=2UL ) {
6544  const SIMDType a1( A.load(i,k ) );
6545  const SIMDType a2( A.load(i,k+1UL) );
6546  xmm1 -= a1 * set( B(k ,j ) );
6547  xmm2 -= a1 * set( B(k ,j+1UL) );
6548  xmm3 -= a2 * set( B(k+1UL,j ) );
6549  xmm4 -= a2 * set( B(k+1UL,j+1UL) );
6550  }
6551 
6552  for( ; k<kend; ++k ) {
6553  const SIMDType a1( A.load(i,k) );
6554  xmm1 -= a1 * set( B(k,j ) );
6555  xmm2 -= a1 * set( B(k,j+1UL) );
6556  }
6557 
6558  C.store( i, j , xmm1+xmm3 );
6559  C.store( i, j+1UL, xmm2+xmm4 );
6560  }
6561 
6562  if( j < jend )
6563  {
6564  const size_t kbegin( ( IsLower_v<MT5> )
6565  ?( ( IsUpper_v<MT4> )
6566  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6567  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6568  :( IsUpper_v<MT4> ? i : 0UL ) );
6569 
6570  SIMDType xmm1( C.load(i,j) );
6571  SIMDType xmm2;
6572  size_t k( kbegin );
6573 
6574  for( ; (k+2UL) <= K; k+=2UL ) {
6575  xmm1 -= A.load(i,k ) * set( B(k ,j) );
6576  xmm2 -= A.load(i,k+1UL) * set( B(k+1UL,j) );
6577  }
6578 
6579  for( ; k<K; ++k ) {
6580  xmm1 -= A.load(i,k) * set( B(k,j) );
6581  }
6582 
6583  C.store( i, j, xmm1+xmm2 );
6584  }
6585  }
6586 
6587  for( ; remainder && i<M; ++i )
6588  {
6589  const size_t jend( LOW ? i+1UL : N );
6590  size_t j( UPP ? i : 0UL );
6591 
6592  for( ; (j+2UL) <= jend; j+=2UL )
6593  {
6594  const size_t kbegin( ( IsLower_v<MT5> )
6595  ?( ( IsUpper_v<MT4> )
6596  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6597  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6598  :( IsUpper_v<MT4> ? i : 0UL ) );
6599  const size_t kend( ( IsUpper_v<MT5> )
6600  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6601  :( K ) );
6602 
6603  ElementType value1( C(i,j ) );
6604  ElementType value2( C(i,j+1UL) );
6605 
6606  for( size_t k=kbegin; k<kend; ++k ) {
6607  value1 -= A(i,k) * B(k,j );
6608  value2 -= A(i,k) * B(k,j+1UL);
6609  }
6610 
6611  C(i,j ) = value1;
6612  C(i,j+1UL) = value2;
6613  }
6614 
6615  if( j < jend )
6616  {
6617  const size_t kbegin( ( IsLower_v<MT5> )
6618  ?( ( IsUpper_v<MT4> )
6619  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6620  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6621  :( IsUpper_v<MT4> ? i : 0UL ) );
6622 
6623  ElementType value( C(i,j) );
6624 
6625  for( size_t k=kbegin; k<K; ++k ) {
6626  value -= A(i,k) * B(k,j);
6627  }
6628 
6629  C(i,j) = value;
6630  }
6631  }
6632  }
6634  //**********************************************************************************************
6635 
6636  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Non-vectorized overload of the large-matrix subtraction assignment kernel.
// DisableIf_t removes this overload from overload resolution whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true; otherwise it is the selected overload
// and simply forwards all three arguments to selectDefaultSubAssignKernel().
//   C : target matrix that is updated
//   A : left-hand side matrix operand
//   B : right-hand side matrix operand
6650  template< typename MT3 // Type of the left-hand side target matrix
6651  , typename MT4 // Type of the left-hand side matrix operand
6652  , typename MT5 > // Type of the right-hand side matrix operand
6653  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6654  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6655  {
6656  selectDefaultSubAssignKernel( C, A, B );
6657  }
6659  //**********************************************************************************************
6660 
6661  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Vectorized overload of the large-matrix subtraction assignment kernel.
// EnableIf_t keeps this overload available only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true. The work is handed to one of the
// lmmm() / ummm() / mmm() routines, selected by the LOW and UPP flags (declared outside
// this block).
//   C : target matrix that is updated
//   A : left-hand side matrix operand
//   B : right-hand side matrix operand
// NOTE(review): the trailing ElementType(-1) / ElementType(1) arguments are presumably the
// scaling factors applied to the product A*B and to the existing content of C, respectively
// (i.e. C = -1*(A*B) + 1*C) -- confirm against <blaze/math/dense/MMM.h>.
6676  template< typename MT3 // Type of the left-hand side target matrix
6677  , typename MT4 // Type of the left-hand side matrix operand
6678  , typename MT5 > // Type of the right-hand side matrix operand
6679  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6680  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6681  {
// LOW is tested first, so lmmm() is also the routine used when both LOW and UPP are set.
6682  if( LOW )
6683  lmmm( C, A, B, ElementType(-1), ElementType(1) );
6684  else if( UPP )
6685  ummm( C, A, B, ElementType(-1), ElementType(1) );
6686  else
6687  mmm( C, A, B, ElementType(-1), ElementType(1) );
6688  }
6690  //**********************************************************************************************
6691 
6692  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback overload of the BLAS-based subtraction assignment kernel.
// DisableIf_t keeps this overload in the overload set only when
// UseBlasKernel_v<MT3,MT4,MT5> is false; in that case the call is delegated unchanged to
// selectLargeSubAssignKernel().
//   C : target matrix that is updated
//   A : left-hand side matrix operand
//   B : right-hand side matrix operand
6706  template< typename MT3 // Type of the left-hand side target matrix
6707  , typename MT4 // Type of the left-hand side matrix operand
6708  , typename MT5 > // Type of the right-hand side matrix operand
6709  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6710  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6711  {
6712  selectLargeSubAssignKernel( C, A, B );
6713  }
6715  //**********************************************************************************************
6716 
6717  //**BLAS-based subtraction assignment to dense matrices******************************************
6718 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6719 
// BLAS-based subtraction assignment kernel; EnableIf_t makes it available only when
// UseBlasKernel_v<MT3,MT4,MT5> is true. Three cases are distinguished at compile time:
//   - A is triangular : B is copied into a temporary, trmm() is called with A on the left
//                       side, and the temporary is subtracted from C.
//   - B is triangular : A is copied into a temporary, trmm() is called with B on the right
//                       side, and the temporary is subtracted from C.
//   - otherwise       : a single gemm() call on C, A and B with the factors ET(-1) and ET(1).
// The triangularity of A is checked first, so it takes precedence when both are triangular.
//   C : target matrix that is updated
//   A : left-hand side matrix operand
//   B : right-hand side matrix operand
// NOTE(review): trmm() presumably overwrites 'tmp' in place with the triangular product scaled
// by ET(1), and gemm() presumably computes C = -1*(A*B) + 1*C -- confirm against
// <blaze/math/blas/trmm.h> and <blaze/math/blas/gemm.h>.
6732  template< typename MT3 // Type of the left-hand side target matrix
6733  , typename MT4 // Type of the left-hand side matrix operand
6734  , typename MT5 > // Type of the right-hand side matrix operand
6735  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6736  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6737  {
6738  using ET = ElementType_t<MT3>;
6739 
6740  if( IsTriangular_v<MT4> ) {
// The temporary has the result type of the target and starts out as a copy of B.
6741  ResultType_t<MT3> tmp( serial( B ) );
6742  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
6743  subAssign( C, tmp );
6744  }
6745  else if( IsTriangular_v<MT5> ) {
// Mirror image of the branch above: the temporary starts as a copy of A, B is the triangular factor.
6746  ResultType_t<MT3> tmp( serial( A ) );
6747  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
6748  subAssign( C, tmp );
6749  }
6750  else {
// Neither operand is triangular: no temporary is created, C is passed to gemm() directly.
6751  gemm( C, A, B, ET(-1), ET(1) );
6752  }
6753  }
6755 #endif
6756  //**********************************************************************************************
6757 
6758  //**Subtraction assignment to sparse matrices***************************************************
6759  // No special implementation for the subtraction assignment to sparse matrices.
6760  //**********************************************************************************************
6761 
6762  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of the multiplication expression to a dense matrix.
// The expression is not combined with the target element by element on the fly; instead it is
// first evaluated into a temporary of type ResultType, and that temporary is then
// Schur-assigned to the target.
//   lhs : target dense matrix
//   rhs : multiplication expression to be Schur-assigned
6775  template< typename MT // Type of the target dense matrix
6776  , bool SO > // Storage order of the target dense matrix
6777  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6778  {
6780 
6784 
// The target must already have the dimensions of the expression.
6785  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6786  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6787 
// NOTE(review): serial() presumably forces a non-parallel evaluation of the expression -- confirm.
6788  const ResultType tmp( serial( rhs ) );
6789  schurAssign( ~lhs, tmp );
6790  }
6792  //**********************************************************************************************
6793 
6794  //**Schur product assignment to sparse matrices*************************************************
6795  // No special implementation for the Schur product assignment to sparse matrices.
6796  //**********************************************************************************************
6797 
6798  //**Multiplication assignment to dense matrices*************************************************
6799  // No special implementation for the multiplication assignment to dense matrices.
6800  //**********************************************************************************************
6801 
6802  //**Multiplication assignment to sparse matrices************************************************
6803  // No special implementation for the multiplication assignment to sparse matrices.
6804  //**********************************************************************************************
6805 
6806  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the multiplication expression to a dense matrix.
// EnableIf_t restricts this overload to the case IsEvaluationRequired_v<MT,MT1,MT2>.
// Both operands are materialized into the local objects A and B, and the assignment is then
// re-issued as smpAssign() on the product A * B of those local objects.
//   lhs : target dense matrix
//   rhs : multiplication expression to be assigned
6822  template< typename MT // Type of the target dense matrix
6823  , bool SO > // Storage order of the target dense matrix
6824  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6825  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6826  {
6828 
6829  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6830  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6831 
// Empty target: nothing to do. Empty inner dimension (left operand has no columns): the
// target is reset instead of being computed (reset() presumably clears it to its default
// state -- confirm against <blaze/math/shims/Reset.h>).
6832  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
6833  return;
6834  }
6835  else if( rhs.lhs_.columns() == 0UL ) {
6836  reset( ~lhs );
6837  return;
6838  }
6839 
6840  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6841  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6842 
// Sanity checks: the local operands keep the operand dimensions and match the target.
6843  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6844  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6845  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6846  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6847  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6848  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6849 
6850  smpAssign( ~lhs, A * B );
6851  }
6853  //**********************************************************************************************
6854 
6855  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the multiplication expression to a sparse matrix.
// EnableIf_t restricts this overload to the case IsEvaluationRequired_v<MT,MT1,MT2>.
// The expression is evaluated into a dense temporary, the temporary is passed through the
// ForwardFunctor, and the result is SMP-assigned to the target.
//   lhs : target sparse matrix
//   rhs : multiplication expression to be assigned
6871  template< typename MT // Type of the target sparse matrix
6872  , bool SO > // Storage order of the target sparse matrix
6873  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6874  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6875  {
6877 
// The temporary type depends on the storage order flag SO of the target: ResultType is used
// when SO is true, OppositeType otherwise.
6878  using TmpType = If_t< SO, ResultType, OppositeType >;
6879 
6886 
6887  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6888  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6889 
// NOTE(review): ForwardFunctor is declared outside this block; it presumably re-attaches the
// structural declaration (symmetric/Hermitian/lower/upper) to the temporary -- confirm.
6890  const ForwardFunctor fwd;
6891 
6892  const TmpType tmp( rhs );
6893  smpAssign( ~lhs, fwd( tmp ) );
6894  }
6896  //**********************************************************************************************
6897 
6898  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the multiplication expression to a dense matrix.
// EnableIf_t restricts this overload to the case IsEvaluationRequired_v<MT,MT1,MT2>.
// Both operands are materialized into the local objects A and B, and the operation is then
// re-issued as smpAddAssign() on the product A * B of those local objects.
//   lhs : target dense matrix
//   rhs : multiplication expression to be added
6914  template< typename MT // Type of the target dense matrix
6915  , bool SO > // Storage order of the target dense matrix
6916  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6917  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6918  {
6920 
6921  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6922  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6923 
// Early exit, leaving the target untouched, if the target is empty or the inner dimension
// (number of columns of the left operand) is zero.
6924  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6925  return;
6926  }
6927 
6928  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6929  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6930 
// Sanity checks: the local operands keep the operand dimensions and match the target.
6931  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6932  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6933  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6934  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6935  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6936  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6937 
6938  smpAddAssign( ~lhs, A * B );
6939  }
6941  //**********************************************************************************************
6942 
6943  //**SMP addition assignment to sparse matrices**************************************************
6944  // No special implementation for the SMP addition assignment to sparse matrices.
6945  //**********************************************************************************************
6946 
6947  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the multiplication expression to a dense matrix.
// EnableIf_t restricts this overload to the case IsEvaluationRequired_v<MT,MT1,MT2>.
// Both operands are materialized into the local objects A and B, and the operation is then
// re-issued as smpSubAssign() on the product A * B of those local objects.
//   lhs : target dense matrix
//   rhs : multiplication expression to be subtracted
6963  template< typename MT // Type of the target dense matrix
6964  , bool SO > // Storage order of the target dense matrix
6965  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6966  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6967  {
6969 
6970  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6971  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6972 
// Early exit, leaving the target untouched, if the target is empty or the inner dimension
// (number of columns of the left operand) is zero.
6973  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6974  return;
6975  }
6976 
6977  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6978  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6979 
// Sanity checks: the local operands keep the operand dimensions and match the target.
6980  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6981  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6982  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6983  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6984  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6985  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6986 
6987  smpSubAssign( ~lhs, A * B );
6988  }
6990  //**********************************************************************************************
6991 
6992  //**SMP subtraction assignment to sparse matrices***********************************************
6993  // No special implementation for the SMP subtraction assignment to sparse matrices.
6994  //**********************************************************************************************
6995 
6996  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the multiplication expression to a dense matrix.
// The expression is first evaluated into a temporary of type ResultType, and that temporary
// is then passed to smpSchurAssign() together with the target. Unlike schurAssign(), the
// temporary is constructed from 'rhs' directly, without a serial() wrapper.
//   lhs : target dense matrix
//   rhs : multiplication expression to be Schur-assigned
7009  template< typename MT // Type of the target dense matrix
7010  , bool SO > // Storage order of the target dense matrix
7011  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
7012  {
7014 
7018 
// The target must already have the dimensions of the expression.
7019  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7020  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7021 
7022  const ResultType tmp( rhs );
7023  smpSchurAssign( ~lhs, tmp );
7024  }
7026  //**********************************************************************************************
7027 
7028  //**SMP Schur product assignment to sparse matrices*********************************************
7029  // No special implementation for the SMP Schur product assignment to sparse matrices.
7030  //**********************************************************************************************
7031 
7032  //**SMP multiplication assignment to dense matrices*********************************************
7033  // No special implementation for the SMP multiplication assignment to dense matrices.
7034  //**********************************************************************************************
7035 
7036  //**SMP multiplication assignment to sparse matrices********************************************
7037  // No special implementation for the SMP multiplication assignment to sparse matrices.
7038  //**********************************************************************************************
7039 
7040  //**Compile time checks*************************************************************************
7048  //**********************************************************************************************
7049 };
7050 //*************************************************************************************************
7051 
7052 
7053 
7054 
7055 //=================================================================================================
7056 //
7057 // DMATSCALARMULTEXPR SPECIALIZATION
7058 //
7059 //=================================================================================================
7060 
7061 //*************************************************************************************************
7069 template< typename MT1 // Type of the left-hand side dense matrix
7070  , typename MT2 // Type of the right-hand side dense matrix
7071  , bool SF // Symmetry flag
7072  , bool HF // Hermitian flag
7073  , bool LF // Lower flag
7074  , bool UF // Upper flag
7075  , typename ST > // Type of the right-hand side scalar value
7076 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
7077  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
7078  , private Computation
7079 {
7080  private:
7081  //**Type definitions****************************************************************************
7083  using MMM = TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
7084 
7085  using RES = ResultType_t<MMM>;
7086  using RT1 = ResultType_t<MT1>;
7087  using RT2 = ResultType_t<MT2>;
7088  using ET1 = ElementType_t<RT1>;
7089  using ET2 = ElementType_t<RT2>;
7090  using CT1 = CompositeType_t<MT1>;
7091  using CT2 = CompositeType_t<MT2>;
7092  //**********************************************************************************************
7093 
7094  //**********************************************************************************************
7096  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
7097  //**********************************************************************************************
7098 
7099  //**********************************************************************************************
7101  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
7102  //**********************************************************************************************
7103 
7104  //**********************************************************************************************
7105  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
7106  static constexpr bool HERM = ( HF && !( LF || UF ) );
7107  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
7108  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
7109  //**********************************************************************************************
7110 
7111  //**********************************************************************************************
7113 
7116  template< typename T1, typename T2, typename T3 >
7117  static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
7118  //**********************************************************************************************
7119 
7120  //**********************************************************************************************
7122 
7124  template< typename T1, typename T2, typename T3, typename T4 >
7125  static constexpr bool UseBlasKernel_v =
7126  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
7127  !SYM && !HERM && !LOW && !UPP &&
7128  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
7129  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
7130  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
7131  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
7132  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
7133  IsBLASCompatible_v< ElementType_t<T1> > &&
7134  IsBLASCompatible_v< ElementType_t<T2> > &&
7135  IsBLASCompatible_v< ElementType_t<T3> > &&
7136  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
7137  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
7138  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
7139  //**********************************************************************************************
7140 
7141  //**********************************************************************************************
7143 
7145  template< typename T1, typename T2, typename T3, typename T4 >
7146  static constexpr bool UseVectorizedDefaultKernel_v =
7147  ( useOptimizedKernels &&
7148  !( IsDiagonal_v<T2> && IsDiagonal_v<T3> ) &&
7149  !( IsDiagonal_v<T2> && IsColumnMajorMatrix_v<T1> ) &&
7150  !( IsDiagonal_v<T3> && IsRowMajorMatrix_v<T1> ) &&
7151  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
7152  IsSIMDCombinable_v< ElementType_t<T1>
7153  , ElementType_t<T2>
7154  , ElementType_t<T3>
7155  , T4 > &&
7156  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
7157  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
7158  //**********************************************************************************************
7159 
7160  //**********************************************************************************************
7162 
7164  using ForwardFunctor = If_t< HERM
7165  , DeclHerm
7166  , If_t< SYM
7167  , DeclSym
7168  , If_t< LOW
7169  , If_t< UPP
7170  , DeclDiag
7171  , DeclLow >
7172  , If_t< UPP
7173  , DeclUpp
7174  , Noop > > > >;
7175  //**********************************************************************************************
7176 
7177  public:
7178  //**Type definitions****************************************************************************
7180  using This = DMatScalarMultExpr<MMM,ST,true>;
7181 
7183  using BaseType = DenseMatrix<This,true>;
7184 
7186  using ResultType = typename If_t< HERM
7187  , DeclHermTrait< MultTrait_t<RES,ST> >
7188  , If_t< SYM
7189  , DeclSymTrait< MultTrait_t<RES,ST> >
7190  , If_t< LOW
7191  , If_t< UPP
7192  , DeclDiagTrait< MultTrait_t<RES,ST> >
7193  , DeclLowTrait< MultTrait_t<RES,ST> > >
7194  , If_t< UPP
7195  , DeclUppTrait< MultTrait_t<RES,ST> >
7196  , MultTrait<RES,ST> > > > >::Type;
7197 
7198  using OppositeType = OppositeType_t<ResultType>;
7199  using TransposeType = TransposeType_t<ResultType>;
7200  using ElementType = ElementType_t<ResultType>;
7201  using SIMDType = SIMDTrait_t<ElementType>;
7202  using ReturnType = const ElementType;
7203  using CompositeType = const ResultType;
7204 
7206  using LeftOperand = const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
7207 
7209  using RightOperand = ST;
7210 
7212  using LT = If_t< evaluateLeft, const RT1, CT1 >;
7213 
7215  using RT = If_t< evaluateRight, const RT2, CT2 >;
7216  //**********************************************************************************************
7217 
7218  //**Compilation flags***************************************************************************
7220  static constexpr bool simdEnabled =
7221  ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
7222  MT1::simdEnabled && MT2::simdEnabled &&
7223  IsSIMDCombinable_v<ET1,ET2,ST> &&
7224  HasSIMDAdd_v<ET1,ET2> &&
7225  HasSIMDMult_v<ET1,ET2> );
7226 
7228  static constexpr bool smpAssignable =
7229  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
7230  //**********************************************************************************************
7231 
7232  //**SIMD properties*****************************************************************************
// Value of SIMDTrait<ElementType>::size (presumably the number of elements per SIMD vector - TODO confirm).
// selectAssignKernel compares operand extents against multiples of it and the vectorized
// kernels advance their column index in multiples of it.
7234  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
7235  //**********************************************************************************************
7236 
7237  //**Constructor*********************************************************************************
7243  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
7244  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
7245  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
7246  {}
7247  //**********************************************************************************************
7248 
7249  //**Access operator*****************************************************************************
7256  inline ReturnType operator()( size_t i, size_t j ) const {
7257  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
7258  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
7259  return matrix_(i,j) * scalar_;
7260  }
7261  //**********************************************************************************************
7262 
7263  //**At function*********************************************************************************
7271  inline ReturnType at( size_t i, size_t j ) const {
7272  if( i >= matrix_.rows() ) {
7273  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
7274  }
7275  if( j >= matrix_.columns() ) {
7276  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
7277  }
7278  return (*this)(i,j);
7279  }
7280  //**********************************************************************************************
7281 
7282  //**Rows function*******************************************************************************
7287  inline size_t rows() const {
7288  return matrix_.rows();
7289  }
7290  //**********************************************************************************************
7291 
7292  //**Columns function****************************************************************************
7297  inline size_t columns() const {
7298  return matrix_.columns();
7299  }
7300  //**********************************************************************************************
7301 
7302  //**Left operand access*************************************************************************
7307  inline LeftOperand leftOperand() const {
7308  return matrix_;
7309  }
7310  //**********************************************************************************************
7311 
7312  //**Right operand access************************************************************************
7317  inline RightOperand rightOperand() const {
7318  return scalar_;
7319  }
7320  //**********************************************************************************************
7321 
7322  //**********************************************************************************************
7328  template< typename T >
7329  inline bool canAlias( const T* alias ) const {
7330  return matrix_.canAlias( alias );
7331  }
7332  //**********************************************************************************************
7333 
7334  //**********************************************************************************************
7340  template< typename T >
7341  inline bool isAliased( const T* alias ) const {
7342  return matrix_.isAliased( alias );
7343  }
7344  //**********************************************************************************************
7345 
7346  //**********************************************************************************************
7351  inline bool isAligned() const {
7352  return matrix_.isAligned();
7353  }
7354  //**********************************************************************************************
7355 
7356  //**********************************************************************************************
7361  inline bool canSMPAssign() const noexcept {
7362  return ( !BLAZE_BLAS_MODE ||
7363  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
7365  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
7366  ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD );
7367  }
7368  //**********************************************************************************************
7369 
7370  private:
7371  //**Member variables****************************************************************************
7374  //**********************************************************************************************
7375 
7376  //**Assignment to dense matrices****************************************************************
7388  template< typename MT // Type of the target dense matrix
7389  , bool SO > // Storage order of the target dense matrix
7390  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7391  {
7393 
7394  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7395  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7396 
7397  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7398  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7399 
7400  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
7401  return;
7402  }
7403  else if( left.columns() == 0UL ) {
7404  reset( ~lhs );
7405  return;
7406  }
7407 
7408  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7409  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7410 
7411  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7412  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7413  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7414  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7415  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7416  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7417 
7418  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
7419  }
7420  //**********************************************************************************************
7421 
7422  //**Assignment to dense matrices (kernel selection)*********************************************
7433  template< typename MT3 // Type of the left-hand side target matrix
7434  , typename MT4 // Type of the left-hand side matrix operand
7435  , typename MT5 // Type of the right-hand side matrix operand
7436  , typename ST2 > // Type of the scalar value
7437  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7438  {
7439  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
7440  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
7441  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
7442  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7443  selectSmallAssignKernel( C, A, B, scalar );
7444  else
7445  selectBlasAssignKernel( C, A, B, scalar );
7446  }
7447  //**********************************************************************************************
7448 
7449  //**Default assignment to row-major dense matrices (general/general)****************************
7463  template< typename MT3 // Type of the left-hand side target matrix
7464  , typename MT4 // Type of the left-hand side matrix operand
7465  , typename MT5 // Type of the right-hand side matrix operand
7466  , typename ST2 > // Type of the scalar value
7467  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7468  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7469  {
7470  const size_t M( A.rows() );
7471  const size_t N( B.columns() );
7472  const size_t K( A.columns() );
7473 
7474  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7475 
7476  for( size_t i=0UL; i<M; ++i )
7477  {
7478  const size_t kbegin( ( IsUpper_v<MT4> )
7479  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7480  :( 0UL ) );
7481  const size_t kend( ( IsLower_v<MT4> )
7482  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
7483  :( K ) );
7484  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
7485 
7486  if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
7487  for( size_t j=0UL; j<N; ++j ) {
7488  reset( C(i,j) );
7489  }
7490  continue;
7491  }
7492 
7493  {
7494  const size_t jbegin( ( IsUpper_v<MT5> )
7495  ?( ( IsStrictlyUpper_v<MT5> )
7496  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
7497  :( UPP ? max(i,kbegin) : kbegin ) )
7498  :( UPP ? i : 0UL ) );
7499  const size_t jend( ( IsLower_v<MT5> )
7500  ?( ( IsStrictlyLower_v<MT5> )
7501  ?( LOW ? min(i+1UL,kbegin) : kbegin )
7502  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
7503  :( LOW ? i+1UL : N ) );
7504 
7505  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7506  for( size_t j=0UL; j<jbegin; ++j ) {
7507  reset( C(i,j) );
7508  }
7509  }
7510  else if( IsStrictlyUpper_v<MT5> ) {
7511  reset( C(i,0UL) );
7512  }
7513  for( size_t j=jbegin; j<jend; ++j ) {
7514  C(i,j) = A(i,kbegin) * B(kbegin,j);
7515  }
7516  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7517  for( size_t j=jend; j<N; ++j ) {
7518  reset( C(i,j) );
7519  }
7520  }
7521  else if( IsStrictlyLower_v<MT5> ) {
7522  reset( C(i,N-1UL) );
7523  }
7524  }
7525 
7526  for( size_t k=kbegin+1UL; k<kend; ++k )
7527  {
7528  const size_t jbegin( ( IsUpper_v<MT5> )
7529  ?( ( IsStrictlyUpper_v<MT5> )
7530  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
7531  :( SYM || HERM || UPP ? max( i, k ) : k ) )
7532  :( SYM || HERM || UPP ? i : 0UL ) );
7533  const size_t jend( ( IsLower_v<MT5> )
7534  ?( ( IsStrictlyLower_v<MT5> )
7535  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
7536  :( LOW ? min(i+1UL,k) : k ) )
7537  :( LOW ? i+1UL : N ) );
7538 
7539  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
7540  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7541 
7542  for( size_t j=jbegin; j<jend; ++j ) {
7543  C(i,j) += A(i,k) * B(k,j);
7544  }
7545  if( IsLower_v<MT5> ) {
7546  C(i,jend) = A(i,k) * B(k,jend);
7547  }
7548  }
7549 
7550  {
7551  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
7552  ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
7553  :( SYM || HERM || UPP ? i : 0UL ) );
7554  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
7555  ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
7556  :( LOW ? i+1UL : N ) );
7557 
7558  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
7559  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7560 
7561  for( size_t j=jbegin; j<jend; ++j ) {
7562  C(i,j) *= scalar;
7563  }
7564  }
7565  }
7566 
7567  if( SYM || HERM ) {
7568  for( size_t i=1UL; i<M; ++i ) {
7569  for( size_t j=0UL; j<i; ++j ) {
7570  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
7571  }
7572  }
7573  }
7574  }
7575  //**********************************************************************************************
7576 
7577  //**Default assignment to column-major dense matrices (general/general)*************************
7591  template< typename MT3 // Type of the left-hand side target matrix
7592  , typename MT4 // Type of the left-hand side matrix operand
7593  , typename MT5 // Type of the right-hand side matrix operand
7594  , typename ST2 > // Type of the scalar value
7595  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7596  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7597  {
7598  const size_t M( A.rows() );
7599  const size_t N( B.columns() );
7600  const size_t K( A.columns() );
7601 
7602  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7603 
7604  for( size_t j=0UL; j<N; ++j )
7605  {
7606  const size_t kbegin( ( IsLower_v<MT5> )
7607  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7608  :( 0UL ) );
7609  const size_t kend( ( IsUpper_v<MT5> )
7610  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7611  :( K ) );
7612  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
7613 
7614  if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
7615  for( size_t i=0UL; i<M; ++i ) {
7616  reset( C(i,j) );
7617  }
7618  continue;
7619  }
7620 
7621  {
7622  const size_t ibegin( ( IsLower_v<MT4> )
7623  ?( ( IsStrictlyLower_v<MT4> )
7624  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
7625  :( LOW ? max(j,kbegin) : kbegin ) )
7626  :( LOW ? j : 0UL ) );
7627  const size_t iend( ( IsUpper_v<MT4> )
7628  ?( ( IsStrictlyUpper_v<MT4> )
7629  ?( UPP ? min(j+1UL,kbegin) : kbegin )
7630  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
7631  :( UPP ? j+1UL : M ) );
7632 
7633  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7634  for( size_t i=0UL; i<ibegin; ++i ) {
7635  reset( C(i,j) );
7636  }
7637  }
7638  else if( IsStrictlyLower_v<MT4> ) {
7639  reset( C(0UL,j) );
7640  }
7641  for( size_t i=ibegin; i<iend; ++i ) {
7642  C(i,j) = A(i,kbegin) * B(kbegin,j);
7643  }
7644  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7645  for( size_t i=iend; i<M; ++i ) {
7646  reset( C(i,j) );
7647  }
7648  }
7649  else if( IsStrictlyUpper_v<MT4> ) {
7650  reset( C(M-1UL,j) );
7651  }
7652  }
7653 
7654  for( size_t k=kbegin+1UL; k<kend; ++k )
7655  {
7656  const size_t ibegin( ( IsLower_v<MT4> )
7657  ?( ( IsStrictlyLower_v<MT4> )
7658  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
7659  :( SYM || HERM || LOW ? max( j, k ) : k ) )
7660  :( SYM || HERM || LOW ? j : 0UL ) );
7661  const size_t iend( ( IsUpper_v<MT4> )
7662  ?( ( IsStrictlyUpper_v<MT4> )
7663  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
7664  :( UPP ? min(j+1UL,k) : k ) )
7665  :( UPP ? j+1UL : M ) );
7666 
7667  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
7668  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7669 
7670  for( size_t i=ibegin; i<iend; ++i ) {
7671  C(i,j) += A(i,k) * B(k,j);
7672  }
7673  if( IsUpper_v<MT4> ) {
7674  C(iend,j) = A(iend,k) * B(k,j);
7675  }
7676  }
7677 
7678  {
7679  const size_t ibegin( ( ( IsLower_v<MT4> && IsLower_v<MT5> ) )
7680  ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
7681  :( SYM || HERM || LOW ? j : 0UL ) );
7682  const size_t iend( ( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) )
7683  ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
7684  :( UPP ? j+1UL : M ) );
7685 
7686  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
7687  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7688 
7689  for( size_t i=ibegin; i<iend; ++i ) {
7690  C(i,j) *= scalar;
7691  }
7692  }
7693  }
7694 
7695  if( SYM || HERM ) {
7696  for( size_t j=1UL; j<N; ++j ) {
7697  for( size_t i=0UL; i<j; ++i ) {
7698  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
7699  }
7700  }
7701  }
7702  }
7703  //**********************************************************************************************
7704 
7705  //**Default assignment to row-major dense matrices (general/diagonal)***************************
7719  template< typename MT3 // Type of the left-hand side target matrix
7720  , typename MT4 // Type of the left-hand side matrix operand
7721  , typename MT5 // Type of the right-hand side matrix operand
7722  , typename ST2 > // Type of the scalar value
7723  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7724  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7725  {
7726  constexpr size_t block( BLOCK_SIZE );
7727 
7728  const size_t M( A.rows() );
7729  const size_t N( B.columns() );
7730 
7731  for( size_t ii=0UL; ii<M; ii+=block ) {
7732  const size_t iend( min( M, ii+block ) );
7733  for( size_t jj=0UL; jj<N; jj+=block ) {
7734  const size_t jend( min( N, jj+block ) );
7735  for( size_t i=ii; i<iend; ++i )
7736  {
7737  const size_t jbegin( ( IsUpper_v<MT4> )
7738  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
7739  :( jj ) );
7740  const size_t jpos( ( IsLower_v<MT4> )
7741  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
7742  :( jend ) );
7743 
7744  if( IsUpper_v<MT4> ) {
7745  for( size_t j=jj; j<jbegin; ++j ) {
7746  reset( C(i,j) );
7747  }
7748  }
7749  for( size_t j=jbegin; j<jpos; ++j ) {
7750  C(i,j) = A(i,j) * B(j,j) * scalar;
7751  }
7752  if( IsLower_v<MT4> ) {
7753  for( size_t j=jpos; j<jend; ++j ) {
7754  reset( C(i,j) );
7755  }
7756  }
7757  }
7758  }
7759  }
7760  }
7761  //**********************************************************************************************
7762 
7763  //**Default assignment to column-major dense matrices (general/diagonal)************************
7777  template< typename MT3 // Type of the left-hand side target matrix
7778  , typename MT4 // Type of the left-hand side matrix operand
7779  , typename MT5 // Type of the right-hand side matrix operand
7780  , typename ST2 > // Type of the scalar value
7781  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7782  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7783  {
7784  const size_t M( A.rows() );
7785  const size_t N( B.columns() );
7786 
7787  for( size_t j=0UL; j<N; ++j )
7788  {
7789  const size_t ibegin( ( IsLower_v<MT4> )
7790  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
7791  :( 0UL ) );
7792  const size_t iend( ( IsUpper_v<MT4> )
7793  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
7794  :( M ) );
7795  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7796 
7797  if( IsLower_v<MT4> ) {
7798  for( size_t i=0UL; i<ibegin; ++i ) {
7799  reset( C(i,j) );
7800  }
7801  }
7802  for( size_t i=ibegin; i<iend; ++i ) {
7803  C(i,j) = A(i,j) * B(j,j) * scalar;
7804  }
7805  if( IsUpper_v<MT4> ) {
7806  for( size_t i=iend; i<M; ++i ) {
7807  reset( C(i,j) );
7808  }
7809  }
7810  }
7811  }
7812  //**********************************************************************************************
7813 
7814  //**Default assignment to row-major dense matrices (diagonal/general)***************************
7828  template< typename MT3 // Type of the left-hand side target matrix
7829  , typename MT4 // Type of the left-hand side matrix operand
7830  , typename MT5 // Type of the right-hand side matrix operand
7831  , typename ST2 > // Type of the scalar value
7832  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7833  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7834  {
7835  const size_t M( A.rows() );
7836  const size_t N( B.columns() );
7837 
7838  for( size_t i=0UL; i<M; ++i )
7839  {
7840  const size_t jbegin( ( IsUpper_v<MT5> )
7841  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
7842  :( 0UL ) );
7843  const size_t jend( ( IsLower_v<MT5> )
7844  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
7845  :( N ) );
7846  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7847 
7848  if( IsUpper_v<MT5> ) {
7849  for( size_t j=0UL; j<jbegin; ++j ) {
7850  reset( C(i,j) );
7851  }
7852  }
7853  for( size_t j=jbegin; j<jend; ++j ) {
7854  C(i,j) = A(i,i) * B(i,j) * scalar;
7855  }
7856  if( IsLower_v<MT5> ) {
7857  for( size_t j=jend; j<N; ++j ) {
7858  reset( C(i,j) );
7859  }
7860  }
7861  }
7862  }
7863  //**********************************************************************************************
7864 
7865  //**Default assignment to column-major dense matrices (diagonal/general)************************
7879  template< typename MT3 // Type of the left-hand side target matrix
7880  , typename MT4 // Type of the left-hand side matrix operand
7881  , typename MT5 // Type of the right-hand side matrix operand
7882  , typename ST2 > // Type of the scalar value
7883  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7884  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7885  {
7886  constexpr size_t block( BLOCK_SIZE );
7887 
7888  const size_t M( A.rows() );
7889  const size_t N( B.columns() );
7890 
7891  for( size_t jj=0UL; jj<N; jj+=block ) {
7892  const size_t jend( min( N, jj+block ) );
7893  for( size_t ii=0UL; ii<M; ii+=block ) {
7894  const size_t iend( min( M, ii+block ) );
7895  for( size_t j=jj; j<jend; ++j )
7896  {
7897  const size_t ibegin( ( IsLower_v<MT5> )
7898  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
7899  :( ii ) );
7900  const size_t ipos( ( IsUpper_v<MT5> )
7901  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
7902  :( iend ) );
7903 
7904  if( IsLower_v<MT5> ) {
7905  for( size_t i=ii; i<ibegin; ++i ) {
7906  reset( C(i,j) );
7907  }
7908  }
7909  for( size_t i=ibegin; i<ipos; ++i ) {
7910  C(i,j) = A(i,i) * B(i,j) * scalar;
7911  }
7912  if( IsUpper_v<MT5> ) {
7913  for( size_t i=ipos; i<iend; ++i ) {
7914  reset( C(i,j) );
7915  }
7916  }
7917  }
7918  }
7919  }
7920  }
7921  //**********************************************************************************************
7922 
7923  //**Default assignment to dense matrices (diagonal/diagonal)************************************
7937  template< typename MT3 // Type of the left-hand side target matrix
7938  , typename MT4 // Type of the left-hand side matrix operand
7939  , typename MT5 // Type of the right-hand side matrix operand
7940  , typename ST2 > // Type of the scalar value
7941  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7942  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7943  {
7944  reset( C );
7945 
7946  for( size_t i=0UL; i<A.rows(); ++i ) {
7947  C(i,i) = A(i,i) * B(i,i) * scalar;
7948  }
7949  }
7950  //**********************************************************************************************
7951 
7952  //**Default assignment to dense matrices (small matrices)***************************************
7966  template< typename MT3 // Type of the left-hand side target matrix
7967  , typename MT4 // Type of the left-hand side matrix operand
7968  , typename MT5 // Type of the right-hand side matrix operand
7969  , typename ST2 > // Type of the scalar value
7970  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7971  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7972  {
7973  selectDefaultAssignKernel( C, A, B, scalar );
7974  }
7975  //**********************************************************************************************
7976 
7977  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
7992  template< typename MT3 // Type of the left-hand side target matrix
7993  , typename MT4 // Type of the left-hand side matrix operand
7994  , typename MT5 // Type of the right-hand side matrix operand
7995  , typename ST2 > // Type of the scalar value
7996  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7997  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7998  {
7999  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
8000 
8001  const size_t M( A.rows() );
8002  const size_t N( B.columns() );
8003  const size_t K( A.columns() );
8004 
8005  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
8006 
8007  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
8008  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
8009 
8010  const SIMDType factor( set( scalar ) );
8011 
8012  size_t j( 0UL );
8013 
8014  if( IsIntegral_v<ElementType> )
8015  {
8016  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
8017  for( size_t i=0UL; i<M; ++i )
8018  {
8019  const size_t kbegin( ( IsUpper_v<MT4> )
8020  ?( ( IsLower_v<MT5> )
8021  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8022  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8023  :( IsLower_v<MT5> ? j : 0UL ) );
8024  const size_t kend( ( IsLower_v<MT4> )
8025  ?( ( IsUpper_v<MT5> )
8026  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
8027  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
8028  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
8029 
8030  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8031 
8032  for( size_t k=kbegin; k<kend; ++k ) {
8033  const SIMDType a1( set( A(i,k) ) );
8034  xmm1 += a1 * B.load(k,j );
8035  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8036  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8037  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8038  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
8039  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
8040  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
8041  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
8042  }
8043 
8044  C.store( i, j , xmm1 * factor );
8045  C.store( i, j+SIMDSIZE , xmm2 * factor );
8046  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8047  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
8048  C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
8049  C.store( i, j+SIMDSIZE*5UL, xmm6 * factor );
8050  C.store( i, j+SIMDSIZE*6UL, xmm7 * factor );
8051  C.store( i, j+SIMDSIZE*7UL, xmm8 * factor );
8052  }
8053  }
8054  }
8055 
8056  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
8057  {
8058  size_t i( 0UL );
8059 
8060  for( ; (i+2UL) <= M; i+=2UL )
8061  {
8062  const size_t kbegin( ( IsUpper_v<MT4> )
8063  ?( ( IsLower_v<MT5> )
8064  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8065  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8066  :( IsLower_v<MT5> ? j : 0UL ) );
8067  const size_t kend( ( IsLower_v<MT4> )
8068  ?( ( IsUpper_v<MT5> )
8069  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
8070  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8071  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
8072 
8073  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8074 
8075  for( size_t k=kbegin; k<kend; ++k ) {
8076  const SIMDType a1( set( A(i ,k) ) );
8077  const SIMDType a2( set( A(i+1UL,k) ) );
8078  const SIMDType b1( B.load(k,j ) );
8079  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8080  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8081  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
8082  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
8083  xmm1 += a1 * b1;
8084  xmm2 += a1 * b2;
8085  xmm3 += a1 * b3;
8086  xmm4 += a1 * b4;
8087  xmm5 += a1 * b5;
8088  xmm6 += a2 * b1;
8089  xmm7 += a2 * b2;
8090  xmm8 += a2 * b3;
8091  xmm9 += a2 * b4;
8092  xmm10 += a2 * b5;
8093  }
8094 
8095  C.store( i , j , xmm1 * factor );
8096  C.store( i , j+SIMDSIZE , xmm2 * factor );
8097  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8098  C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
8099  C.store( i , j+SIMDSIZE*4UL, xmm5 * factor );
8100  C.store( i+1UL, j , xmm6 * factor );
8101  C.store( i+1UL, j+SIMDSIZE , xmm7 * factor );
8102  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
8103  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
8104  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
8105  }
8106 
8107  if( i < M )
8108  {
8109  const size_t kbegin( ( IsUpper_v<MT4> )
8110  ?( ( IsLower_v<MT5> )
8111  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8112  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8113  :( IsLower_v<MT5> ? j : 0UL ) );
8114  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
8115 
8116  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8117 
8118  for( size_t k=kbegin; k<kend; ++k ) {
8119  const SIMDType a1( set( A(i,k) ) );
8120  xmm1 += a1 * B.load(k,j );
8121  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8122  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8123  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8124  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
8125  }
8126 
8127  C.store( i, j , xmm1 * factor );
8128  C.store( i, j+SIMDSIZE , xmm2 * factor );
8129  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8130  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
8131  C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
8132  }
8133  }
8134 
8135  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
8136  {
8137  const size_t iend( UPP ? min(j+SIMDSIZE*4UL,M) : M );
8138  size_t i( 0UL );
8139 
8140  if( SYM || HERM ) {
8141  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
8142  for( ; i<j; ++i ) {
8143  for( size_t jj=j; jj<jjend; ++jj ) {
8144  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
8145  }
8146  }
8147  }
8148  else if( LOW ) {
8149  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
8150  for( ; i<j; ++i ) {
8151  for( size_t jj=j; jj<jjend; ++jj ) {
8152  reset( C(i,jj) );
8153  }
8154  }
8155  }
8156 
8157  for( ; (i+2UL) <= iend; i+=2UL )
8158  {
8159  const size_t kbegin( ( IsUpper_v<MT4> )
8160  ?( ( IsLower_v<MT5> )
8161  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8162  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8163  :( IsLower_v<MT5> ? j : 0UL ) );
8164  const size_t kend( ( IsLower_v<MT4> )
8165  ?( ( IsUpper_v<MT5> )
8166  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
8167  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8168  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
8169 
8170  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8171 
8172  for( size_t k=kbegin; k<kend; ++k ) {
8173  const SIMDType a1( set( A(i ,k) ) );
8174  const SIMDType a2( set( A(i+1UL,k) ) );
8175  const SIMDType b1( B.load(k,j ) );
8176  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8177  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8178  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
8179  xmm1 += a1 * b1;
8180  xmm2 += a1 * b2;
8181  xmm3 += a1 * b3;
8182  xmm4 += a1 * b4;
8183  xmm5 += a2 * b1;
8184  xmm6 += a2 * b2;
8185  xmm7 += a2 * b3;
8186  xmm8 += a2 * b4;
8187  }
8188 
8189  C.store( i , j , xmm1 * factor );
8190  C.store( i , j+SIMDSIZE , xmm2 * factor );
8191  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8192  C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
8193  C.store( i+1UL, j , xmm5 * factor );
8194  C.store( i+1UL, j+SIMDSIZE , xmm6 * factor );
8195  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
8196  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
8197  }
8198 
8199  if( i < iend )
8200  {
8201  const size_t kbegin( ( IsUpper_v<MT4> )
8202  ?( ( IsLower_v<MT5> )
8203  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8204  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8205  :( IsLower_v<MT5> ? j : 0UL ) );
8206  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
8207 
8208  SIMDType xmm1, xmm2, xmm3, xmm4;
8209 
8210  for( size_t k=kbegin; k<kend; ++k ) {
8211  const SIMDType a1( set( A(i,k) ) );
8212  xmm1 += a1 * B.load(k,j );
8213  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8214  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8215  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8216  }
8217 
8218  C.store( i, j , xmm1 * factor );
8219  C.store( i, j+SIMDSIZE , xmm2 * factor );
8220  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8221  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
8222 
8223  if( UPP ) ++i;
8224  }
8225 
8226  if( UPP ) {
8227  const size_t jjend( min(j+SIMDSIZE*4UL,N) );
8228  for( ; i<M; ++i ) {
8229  for( size_t jj=j; jj<jjend; ++jj ) {
8230  reset( C(i,jj) );
8231  }
8232  }
8233  }
8234  }
8235 
8236  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8237  {
8238  const size_t iend( UPP ? min(j+SIMDSIZE*3UL,M) : M );
8239  size_t i( 0UL );
8240 
8241  if( SYM || HERM ) {
8242  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
8243  for( ; i<j; ++i ) {
8244  for( size_t jj=j; jj<jjend; ++jj ) {
8245  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
8246  }
8247  }
8248  }
8249  else if( LOW ) {
8250  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
8251  for( ; i<j; ++i ) {
8252  for( size_t jj=j; jj<jjend; ++jj ) {
8253  reset( C(i,jj) );
8254  }
8255  }
8256  }
8257 
8258  for( ; (i+2UL) <= iend; i+=2UL )
8259  {
8260  const size_t kbegin( ( IsUpper_v<MT4> )
8261  ?( ( IsLower_v<MT5> )
8262  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8263  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8264  :( IsLower_v<MT5> ? j : 0UL ) );
8265  const size_t kend( ( IsLower_v<MT4> )
8266  ?( ( IsUpper_v<MT5> )
8267  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
8268  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8269  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
8270 
8271  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8272 
8273  for( size_t k=kbegin; k<kend; ++k ) {
8274  const SIMDType a1( set( A(i ,k) ) );
8275  const SIMDType a2( set( A(i+1UL,k) ) );
8276  const SIMDType b1( B.load(k,j ) );
8277  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8278  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8279  xmm1 += a1 * b1;
8280  xmm2 += a1 * b2;
8281  xmm3 += a1 * b3;
8282  xmm4 += a2 * b1;
8283  xmm5 += a2 * b2;
8284  xmm6 += a2 * b3;
8285  }
8286 
8287  C.store( i , j , xmm1 * factor );
8288  C.store( i , j+SIMDSIZE , xmm2 * factor );
8289  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8290  C.store( i+1UL, j , xmm4 * factor );
8291  C.store( i+1UL, j+SIMDSIZE , xmm5 * factor );
8292  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
8293  }
8294 
8295  if( i < iend )
8296  {
8297  const size_t kbegin( ( IsUpper_v<MT4> )
8298  ?( ( IsLower_v<MT5> )
8299  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8300  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8301  :( IsLower_v<MT5> ? j : 0UL ) );
8302  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
8303 
8304  SIMDType xmm1, xmm2, xmm3;
8305 
8306  for( size_t k=kbegin; k<kend; ++k ) {
8307  const SIMDType a1( set( A(i,k) ) );
8308  xmm1 += a1 * B.load(k,j );
8309  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8310  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8311  }
8312 
8313  C.store( i, j , xmm1 * factor );
8314  C.store( i, j+SIMDSIZE , xmm2 * factor );
8315  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8316 
8317  if( UPP ) ++i;
8318  }
8319 
8320  if( UPP ) {
8321  const size_t jjend( min(j+SIMDSIZE*3UL,N) );
8322  for( ; i<M; ++i ) {
8323  for( size_t jj=j; jj<jjend; ++jj ) {
8324  reset( C(i,jj) );
8325  }
8326  }
8327  }
8328  }
8329 
8330  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8331  {
8332  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
8333  size_t i( 0UL );
8334 
8335  if( SYM || HERM ) {
8336  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
8337  for( ; i<j; ++i ) {
8338  for( size_t jj=j; jj<jjend; ++jj ) {
8339  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
8340  }
8341  }
8342  }
8343  else if( LOW ) {
8344  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
8345  for( ; i<j; ++i ) {
8346  for( size_t jj=j; jj<jjend; ++jj ) {
8347  reset( C(i,jj) );
8348  }
8349  }
8350  }
8351 
8352  for( ; (i+4UL) <= iend; i+=4UL )
8353  {
8354  const size_t kbegin( ( IsUpper_v<MT4> )
8355  ?( ( IsLower_v<MT5> )
8356  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8357  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8358  :( IsLower_v<MT5> ? j : 0UL ) );
8359  const size_t kend( ( IsLower_v<MT4> )
8360  ?( ( IsUpper_v<MT5> )
8361  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
8362  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
8363  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8364 
8365  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8366 
8367  for( size_t k=kbegin; k<kend; ++k ) {
8368  const SIMDType a1( set( A(i ,k) ) );
8369  const SIMDType a2( set( A(i+1UL,k) ) );
8370  const SIMDType a3( set( A(i+2UL,k) ) );
8371  const SIMDType a4( set( A(i+3UL,k) ) );
8372  const SIMDType b1( B.load(k,j ) );
8373  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8374  xmm1 += a1 * b1;
8375  xmm2 += a1 * b2;
8376  xmm3 += a2 * b1;
8377  xmm4 += a2 * b2;
8378  xmm5 += a3 * b1;
8379  xmm6 += a3 * b2;
8380  xmm7 += a4 * b1;
8381  xmm8 += a4 * b2;
8382  }
8383 
8384  C.store( i , j , xmm1 * factor );
8385  C.store( i , j+SIMDSIZE, xmm2 * factor );
8386  C.store( i+1UL, j , xmm3 * factor );
8387  C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8388  C.store( i+2UL, j , xmm5 * factor );
8389  C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8390  C.store( i+3UL, j , xmm7 * factor );
8391  C.store( i+3UL, j+SIMDSIZE, xmm8 * factor );
8392  }
8393 
8394  for( ; (i+3UL) <= iend; i+=3UL )
8395  {
8396  const size_t kbegin( ( IsUpper_v<MT4> )
8397  ?( ( IsLower_v<MT5> )
8398  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8399  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8400  :( IsLower_v<MT5> ? j : 0UL ) );
8401  const size_t kend( ( IsLower_v<MT4> )
8402  ?( ( IsUpper_v<MT5> )
8403  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
8404  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
8405  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8406 
8407  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8408 
8409  for( size_t k=kbegin; k<kend; ++k ) {
8410  const SIMDType a1( set( A(i ,k) ) );
8411  const SIMDType a2( set( A(i+1UL,k) ) );
8412  const SIMDType a3( set( A(i+2UL,k) ) );
8413  const SIMDType b1( B.load(k,j ) );
8414  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8415  xmm1 += a1 * b1;
8416  xmm2 += a1 * b2;
8417  xmm3 += a2 * b1;
8418  xmm4 += a2 * b2;
8419  xmm5 += a3 * b1;
8420  xmm6 += a3 * b2;
8421  }
8422 
8423  C.store( i , j , xmm1 * factor );
8424  C.store( i , j+SIMDSIZE, xmm2 * factor );
8425  C.store( i+1UL, j , xmm3 * factor );
8426  C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8427  C.store( i+2UL, j , xmm5 * factor );
8428  C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8429  }
8430 
8431  for( ; (i+2UL) <= iend; i+=2UL )
8432  {
8433  const size_t kbegin( ( IsUpper_v<MT4> )
8434  ?( ( IsLower_v<MT5> )
8435  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8436  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8437  :( IsLower_v<MT5> ? j : 0UL ) );
8438  const size_t kend( ( IsLower_v<MT4> )
8439  ?( ( IsUpper_v<MT5> )
8440  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
8441  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8442  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8443 
8444  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8445  size_t k( kbegin );
8446 
8447  for( ; (k+2UL) <= kend; k+=2UL ) {
8448  const SIMDType a1( set( A(i ,k ) ) );
8449  const SIMDType a2( set( A(i+1UL,k ) ) );
8450  const SIMDType a3( set( A(i ,k+1UL) ) );
8451  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
8452  const SIMDType b1( B.load(k ,j ) );
8453  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
8454  const SIMDType b3( B.load(k+1UL,j ) );
8455  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
8456  xmm1 += a1 * b1;
8457  xmm2 += a1 * b2;
8458  xmm3 += a2 * b1;
8459  xmm4 += a2 * b2;
8460  xmm5 += a3 * b3;
8461  xmm6 += a3 * b4;
8462  xmm7 += a4 * b3;
8463  xmm8 += a4 * b4;
8464  }
8465 
8466  for( ; k<kend; ++k ) {
8467  const SIMDType a1( set( A(i ,k) ) );
8468  const SIMDType a2( set( A(i+1UL,k) ) );
8469  const SIMDType b1( B.load(k,j ) );
8470  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8471  xmm1 += a1 * b1;
8472  xmm2 += a1 * b2;
8473  xmm3 += a2 * b1;
8474  xmm4 += a2 * b2;
8475  }
8476 
8477  C.store( i , j , (xmm1+xmm5) * factor );
8478  C.store( i , j+SIMDSIZE, (xmm2+xmm6) * factor );
8479  C.store( i+1UL, j , (xmm3+xmm7) * factor );
8480  C.store( i+1UL, j+SIMDSIZE, (xmm4+xmm8) * factor );
8481  }
8482 
8483  if( i < iend )
8484  {
8485  const size_t kbegin( ( IsUpper_v<MT4> )
8486  ?( ( IsLower_v<MT5> )
8487  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8488  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8489  :( IsLower_v<MT5> ? j : 0UL ) );
8490  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8491 
8492  SIMDType xmm1, xmm2, xmm3, xmm4;
8493  size_t k( kbegin );
8494 
8495  for( ; (k+2UL) <= kend; k+=2UL ) {
8496  const SIMDType a1( set( A(i,k ) ) );
8497  const SIMDType a2( set( A(i,k+1UL) ) );
8498  xmm1 += a1 * B.load(k ,j );
8499  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
8500  xmm3 += a2 * B.load(k+1UL,j );
8501  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
8502  }
8503 
8504  for( ; k<kend; ++k ) {
8505  const SIMDType a1( set( A(i,k) ) );
8506  xmm1 += a1 * B.load(k,j );
8507  xmm2 += a1 * B.load(k,j+SIMDSIZE);
8508  }
8509 
8510  C.store( i, j , (xmm1+xmm3) * factor );
8511  C.store( i, j+SIMDSIZE, (xmm2+xmm4) * factor );
8512 
8513  if( UPP ) ++i;
8514  }
8515 
8516  if( UPP ) {
8517  const size_t jjend( min(j+SIMDSIZE*2UL,N) );
8518  for( ; i<M; ++i ) {
8519  for( size_t jj=j; jj<jjend; ++jj ) {
8520  reset( C(i,jj) );
8521  }
8522  }
8523  }
8524  }
8525 
8526  for( ; j<jpos; j+=SIMDSIZE )
8527  {
8528  const size_t iend( UPP ? min(j+SIMDSIZE,M) : M );
8529  size_t i( 0UL );
8530 
8531  if( SYM || HERM ) {
8532  const size_t jjend( min(j+SIMDSIZE,N) );
8533  for( ; i<j; ++i ) {
8534  for( size_t jj=j; jj<jjend; ++jj ) {
8535  C(i,jj) = HERM ? conj( C(jj,i) ) : C(jj,i);
8536  }
8537  }
8538  }
8539  else if( LOW ) {
8540  const size_t jjend( min(j+SIMDSIZE,N) );
8541  for( ; i<j; ++i ) {
8542  for( size_t jj=j; jj<jjend; ++jj ) {
8543  reset( C(i,jj) );
8544  }
8545  }
8546  }
8547 
8548  for( ; (i+4UL) <= iend; i+=4UL )
8549  {
8550  const size_t kbegin( ( IsUpper_v<MT4> )
8551  ?( ( IsLower_v<MT5> )
8552  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8553  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8554  :( IsLower_v<MT5> ? j : 0UL ) );
8555  const size_t kend( ( IsLower_v<MT4> )
8556  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8557  :( K ) );
8558 
8559  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8560  size_t k( kbegin );
8561 
8562  for( ; (k+2UL) <= kend; k+=2UL ) {
8563  const SIMDType b1( B.load(k ,j) );
8564  const SIMDType b2( B.load(k+1UL,j) );
8565  xmm1 += set( A(i ,k ) ) * b1;
8566  xmm2 += set( A(i+1UL,k ) ) * b1;
8567  xmm3 += set( A(i+2UL,k ) ) * b1;
8568  xmm4 += set( A(i+3UL,k ) ) * b1;
8569  xmm5 += set( A(i ,k+1UL) ) * b2;
8570  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
8571  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
8572  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
8573  }
8574 
8575  for( ; k<kend; ++k ) {
8576  const SIMDType b1( B.load(k,j) );
8577  xmm1 += set( A(i ,k) ) * b1;
8578  xmm2 += set( A(i+1UL,k) ) * b1;
8579  xmm3 += set( A(i+2UL,k) ) * b1;
8580  xmm4 += set( A(i+3UL,k) ) * b1;
8581  }
8582 
8583  C.store( i , j, (xmm1+xmm5) * factor );
8584  C.store( i+1UL, j, (xmm2+xmm6) * factor );
8585  C.store( i+2UL, j, (xmm3+xmm7) * factor );
8586  C.store( i+3UL, j, (xmm4+xmm8) * factor );
8587  }
8588 
8589  for( ; (i+3UL) <= iend; i+=3UL )
8590  {
8591  const size_t kbegin( ( IsUpper_v<MT4> )
8592  ?( ( IsLower_v<MT5> )
8593  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8594  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8595  :( IsLower_v<MT5> ? j : 0UL ) );
8596  const size_t kend( ( IsLower_v<MT4> )
8597  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
8598  :( K ) );
8599 
8600  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8601  size_t k( kbegin );
8602 
8603  for( ; (k+2UL) <= kend; k+=2UL ) {
8604  const SIMDType b1( B.load(k ,j) );
8605  const SIMDType b2( B.load(k+1UL,j) );
8606  xmm1 += set( A(i ,k ) ) * b1;
8607  xmm2 += set( A(i+1UL,k ) ) * b1;
8608  xmm3 += set( A(i+2UL,k ) ) * b1;
8609  xmm4 += set( A(i ,k+1UL) ) * b2;
8610  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
8611  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
8612  }
8613 
8614  for( ; k<kend; ++k ) {
8615  const SIMDType b1( B.load(k,j) );
8616  xmm1 += set( A(i ,k) ) * b1;
8617  xmm2 += set( A(i+1UL,k) ) * b1;
8618  xmm3 += set( A(i+2UL,k) ) * b1;
8619  }
8620 
8621  C.store( i , j, (xmm1+xmm4) * factor );
8622  C.store( i+1UL, j, (xmm2+xmm5) * factor );
8623  C.store( i+2UL, j, (xmm3+xmm6) * factor );
8624  }
8625 
8626  for( ; (i+2UL) <= iend; i+=2UL )
8627  {
8628  const size_t kbegin( ( IsUpper_v<MT4> )
8629  ?( ( IsLower_v<MT5> )
8630  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8631  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8632  :( IsLower_v<MT5> ? j : 0UL ) );
8633  const size_t kend( ( IsLower_v<MT4> )
8634  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8635  :( K ) );
8636 
8637  SIMDType xmm1, xmm2, xmm3, xmm4;
8638  size_t k( kbegin );
8639 
8640  for( ; (k+2UL) <= kend; k+=2UL ) {
8641  const SIMDType b1( B.load(k ,j) );
8642  const SIMDType b2( B.load(k+1UL,j) );
8643  xmm1 += set( A(i ,k ) ) * b1;
8644  xmm2 += set( A(i+1UL,k ) ) * b1;
8645  xmm3 += set( A(i ,k+1UL) ) * b2;
8646  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
8647  }
8648 
8649  for( ; k<kend; ++k ) {
8650  const SIMDType b1( B.load(k,j) );
8651  xmm1 += set( A(i ,k) ) * b1;
8652  xmm2 += set( A(i+1UL,k) ) * b1;
8653  }
8654 
8655  C.store( i , j, (xmm1+xmm3) * factor );
8656  C.store( i+1UL, j, (xmm2+xmm4) * factor );
8657  }
8658 
8659  if( i < iend )
8660  {
8661  const size_t kbegin( ( IsUpper_v<MT4> )
8662  ?( ( IsLower_v<MT5> )
8663  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8664  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8665  :( IsLower_v<MT5> ? j : 0UL ) );
8666 
8667  SIMDType xmm1, xmm2;
8668  size_t k( kbegin );
8669 
8670  for( ; (k+2UL) <= K; k+=2UL ) {
8671  xmm1 += set( A(i,k ) ) * B.load(k ,j);
8672  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
8673  }
8674 
8675  for( ; k<K; ++k ) {
8676  xmm1 += set( A(i,k) ) * B.load(k,j);
8677  }
8678 
8679  C.store( i, j, (xmm1+xmm2) * factor );
8680 
8681  if( UPP ) ++i;
8682  }
8683 
8684  if( UPP ) {
8685  const size_t jjend( min(j+SIMDSIZE,N) );
8686  for( ; i<M; ++i ) {
8687  for( size_t jj=j; jj<jjend; ++jj ) {
8688  reset( C(i,jj) );
8689  }
8690  }
8691  }
8692  }
8693 
8694  for( ; remainder && j<N; ++j )
8695  {
8696  size_t i( 0UL );
8697 
8698  if( SYM || HERM ) {
8699  for( ; i<j; ++i ) {
8700  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
8701  }
8702  }
8703  else if( LOW ) {
8704  for( ; i<j; ++i ) {
8705  reset( C(i,j) );
8706  }
8707  }
8708 
8709  for( ; (i+2UL) <= M; i+=2UL )
8710  {
8711  const size_t kbegin( ( IsUpper_v<MT4> )
8712  ?( ( IsLower_v<MT5> )
8713  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8714  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8715  :( IsLower_v<MT5> ? j : 0UL ) );
8716  const size_t kend( ( IsLower_v<MT4> )
8717  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8718  :( K ) );
8719 
8720  ElementType value1{};
8721  ElementType value2{};
8722 
8723  for( size_t k=kbegin; k<kend; ++k ) {
8724  value1 += A(i ,k) * B(k,j);
8725  value2 += A(i+1UL,k) * B(k,j);
8726  }
8727 
8728  C(i ,j) = value1 * scalar;
8729  C(i+1UL,j) = value2 * scalar;
8730  }
8731 
8732  if( i < M )
8733  {
8734  const size_t kbegin( ( IsUpper_v<MT4> )
8735  ?( ( IsLower_v<MT5> )
8736  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8737  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8738  :( IsLower_v<MT5> ? j : 0UL ) );
8739 
8740  ElementType value{};
8741 
8742  for( size_t k=kbegin; k<K; ++k ) {
8743  value += A(i,k) * B(k,j);
8744  }
8745 
8746  C(i,j) = value * scalar;
8747  }
8748  }
8749  }
8750  //**********************************************************************************************
8751 
8752  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
8767  template< typename MT3 // Type of the left-hand side target matrix
8768  , typename MT4 // Type of the left-hand side matrix operand
8769  , typename MT5 // Type of the right-hand side matrix operand
8770  , typename ST2 > // Type of the scalar value
8771  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8772  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8773  {
8774  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
8775 
8776  const size_t M( A.rows() );
8777  const size_t N( B.columns() );
8778  const size_t K( A.columns() );
8779 
8780  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
8781 
8782  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
8783  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
8784 
8785  const SIMDType factor( set( scalar ) );
8786 
8787  size_t i( 0UL );
8788 
8789  if( IsIntegral_v<ElementType> )
8790  {
8791  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8792  for( size_t j=0UL; j<N; ++j )
8793  {
8794  const size_t kbegin( ( IsLower_v<MT5> )
8795  ?( ( IsUpper_v<MT4> )
8796  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8797  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8798  :( IsUpper_v<MT4> ? i : 0UL ) );
8799  const size_t kend( ( IsUpper_v<MT5> )
8800  ?( ( IsLower_v<MT4> )
8801  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
8802  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
8803  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
8804 
8805  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8806 
8807  for( size_t k=kbegin; k<kend; ++k ) {
8808  const SIMDType b1( set( B(k,j) ) );
8809  xmm1 += A.load(i ,k) * b1;
8810  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8811  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8812  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8813  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8814  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
8815  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
8816  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
8817  }
8818 
8819  C.store( i , j, xmm1 * factor );
8820  C.store( i+SIMDSIZE , j, xmm2 * factor );
8821  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8822  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8823  C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
8824  C.store( i+SIMDSIZE*5UL, j, xmm6 * factor );
8825  C.store( i+SIMDSIZE*6UL, j, xmm7 * factor );
8826  C.store( i+SIMDSIZE*7UL, j, xmm8 * factor );
8827  }
8828  }
8829  }
8830 
8831  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
8832  {
8833  size_t j( 0UL );
8834 
8835  for( ; (j+2UL) <= N; j+=2UL )
8836  {
8837  const size_t kbegin( ( IsLower_v<MT5> )
8838  ?( ( IsUpper_v<MT4> )
8839  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8840  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8841  :( IsUpper_v<MT4> ? i : 0UL ) );
8842  const size_t kend( ( IsUpper_v<MT5> )
8843  ?( ( IsLower_v<MT4> )
8844  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8845  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8846  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
8847 
8848  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8849 
8850  for( size_t k=kbegin; k<kend; ++k ) {
8851  const SIMDType a1( A.load(i ,k) );
8852  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8853  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8854  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8855  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
8856  const SIMDType b1( set( B(k,j ) ) );
8857  const SIMDType b2( set( B(k,j+1UL) ) );
8858  xmm1 += a1 * b1;
8859  xmm2 += a2 * b1;
8860  xmm3 += a3 * b1;
8861  xmm4 += a4 * b1;
8862  xmm5 += a5 * b1;
8863  xmm6 += a1 * b2;
8864  xmm7 += a2 * b2;
8865  xmm8 += a3 * b2;
8866  xmm9 += a4 * b2;
8867  xmm10 += a5 * b2;
8868  }
8869 
8870  C.store( i , j , xmm1 * factor );
8871  C.store( i+SIMDSIZE , j , xmm2 * factor );
8872  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8873  C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8874  C.store( i+SIMDSIZE*4UL, j , xmm5 * factor );
8875  C.store( i , j+1UL, xmm6 * factor );
8876  C.store( i+SIMDSIZE , j+1UL, xmm7 * factor );
8877  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
8878  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
8879  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
8880  }
8881 
8882  if( j < N )
8883  {
8884  const size_t kbegin( ( IsLower_v<MT5> )
8885  ?( ( IsUpper_v<MT4> )
8886  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8887  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8888  :( IsUpper_v<MT4> ? i : 0UL ) );
8889  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
8890 
8891  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8892 
8893  for( size_t k=kbegin; k<kend; ++k ) {
8894  const SIMDType b1( set( B(k,j) ) );
8895  xmm1 += A.load(i ,k) * b1;
8896  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8897  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8898  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8899  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8900  }
8901 
8902  C.store( i , j, xmm1 * factor );
8903  C.store( i+SIMDSIZE , j, xmm2 * factor );
8904  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8905  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8906  C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
8907  }
8908  }
8909 
8910  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
8911  {
8912  const size_t jend( LOW ? min(i+SIMDSIZE*4UL,N) : N );
8913  size_t j( 0UL );
8914 
8915  if( SYM || HERM ) {
8916  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
8917  for( ; j<i; ++j ) {
8918  for( size_t ii=i; ii<iiend; ++ii ) {
8919  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
8920  }
8921  }
8922  }
8923  else if( UPP ) {
8924  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
8925  for( ; j<i; ++j ) {
8926  for( size_t ii=i; ii<iiend; ++ii ) {
8927  reset( C(ii,j) );
8928  }
8929  }
8930  }
8931 
8932  for( ; (j+2UL) <= jend; j+=2UL )
8933  {
8934  const size_t kbegin( ( IsLower_v<MT5> )
8935  ?( ( IsUpper_v<MT4> )
8936  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8937  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8938  :( IsUpper_v<MT4> ? i : 0UL ) );
8939  const size_t kend( ( IsUpper_v<MT5> )
8940  ?( ( IsLower_v<MT4> )
8941  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8942  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8943  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
8944 
8945  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8946 
8947  for( size_t k=kbegin; k<kend; ++k ) {
8948  const SIMDType a1( A.load(i ,k) );
8949  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8950  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8951  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8952  const SIMDType b1( set( B(k,j ) ) );
8953  const SIMDType b2( set( B(k,j+1UL) ) );
8954  xmm1 += a1 * b1;
8955  xmm2 += a2 * b1;
8956  xmm3 += a3 * b1;
8957  xmm4 += a4 * b1;
8958  xmm5 += a1 * b2;
8959  xmm6 += a2 * b2;
8960  xmm7 += a3 * b2;
8961  xmm8 += a4 * b2;
8962  }
8963 
8964  C.store( i , j , xmm1 * factor );
8965  C.store( i+SIMDSIZE , j , xmm2 * factor );
8966  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8967  C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8968  C.store( i , j+1UL, xmm5 * factor );
8969  C.store( i+SIMDSIZE , j+1UL, xmm6 * factor );
8970  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
8971  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
8972  }
8973 
8974  if( j < jend )
8975  {
8976  const size_t kbegin( ( IsLower_v<MT5> )
8977  ?( ( IsUpper_v<MT4> )
8978  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8979  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8980  :( IsUpper_v<MT4> ? i : 0UL ) );
8981  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
8982 
8983  SIMDType xmm1, xmm2, xmm3, xmm4;
8984 
8985  for( size_t k=kbegin; k<kend; ++k ) {
8986  const SIMDType b1( set( B(k,j) ) );
8987  xmm1 += A.load(i ,k) * b1;
8988  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8989  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8990  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8991  }
8992 
8993  C.store( i , j, xmm1 * factor );
8994  C.store( i+SIMDSIZE , j, xmm2 * factor );
8995  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8996  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8997 
8998  if( LOW ) ++j;
8999  }
9000 
9001  if( LOW ) {
9002  const size_t iiend( min(i+SIMDSIZE*4UL,M) );
9003  for( ; j<N; ++j ) {
9004  for( size_t ii=i; ii<iiend; ++ii ) {
9005  reset( C(ii,j) );
9006  }
9007  }
9008  }
9009  }
9010 
9011  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
9012  {
9013  const size_t jend( LOW ? min(i+SIMDSIZE*3UL,N) : N );
9014  size_t j( 0UL );
9015 
9016  if( SYM || HERM ) {
9017  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
9018  for( ; j<i; ++j ) {
9019  for( size_t ii=i; ii<iiend; ++ii ) {
9020  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
9021  }
9022  }
9023  }
9024  else if( UPP ) {
9025  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
9026  for( ; j<i; ++j ) {
9027  for( size_t ii=i; ii<iiend; ++ii ) {
9028  reset( C(ii,j) );
9029  }
9030  }
9031  }
9032 
9033  for( ; (j+2UL) <= jend; j+=2UL )
9034  {
9035  const size_t kbegin( ( IsLower_v<MT5> )
9036  ?( ( IsUpper_v<MT4> )
9037  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9038  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9039  :( IsUpper_v<MT4> ? i : 0UL ) );
9040  const size_t kend( ( IsUpper_v<MT5> )
9041  ?( ( IsLower_v<MT4> )
9042  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9043  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9044  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
9045 
9046  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9047 
9048  for( size_t k=kbegin; k<kend; ++k ) {
9049  const SIMDType a1( A.load(i ,k) );
9050  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
9051  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
9052  const SIMDType b1( set( B(k,j ) ) );
9053  const SIMDType b2( set( B(k,j+1UL) ) );
9054  xmm1 += a1 * b1;
9055  xmm2 += a2 * b1;
9056  xmm3 += a3 * b1;
9057  xmm4 += a1 * b2;
9058  xmm5 += a2 * b2;
9059  xmm6 += a3 * b2;
9060  }
9061 
9062  C.store( i , j , xmm1 * factor );
9063  C.store( i+SIMDSIZE , j , xmm2 * factor );
9064  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
9065  C.store( i , j+1UL, xmm4 * factor );
9066  C.store( i+SIMDSIZE , j+1UL, xmm5 * factor );
9067  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
9068  }
9069 
9070  if( j < jend )
9071  {
9072  const size_t kbegin( ( IsLower_v<MT5> )
9073  ?( ( IsUpper_v<MT4> )
9074  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9075  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9076  :( IsUpper_v<MT4> ? i : 0UL ) );
9077  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
9078 
9079  SIMDType xmm1, xmm2, xmm3;
9080 
9081  for( size_t k=kbegin; k<kend; ++k ) {
9082  const SIMDType b1( set( B(k,j) ) );
9083  xmm1 += A.load(i ,k) * b1;
9084  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9085  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9086  }
9087 
9088  C.store( i , j, xmm1 * factor );
9089  C.store( i+SIMDSIZE , j, xmm2 * factor );
9090  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
9091 
9092  if( LOW ) ++j;
9093  }
9094 
9095  if( LOW ) {
9096  const size_t iiend( min(i+SIMDSIZE*3UL,M) );
9097  for( ; j<N; ++j ) {
9098  for( size_t ii=i; ii<iiend; ++ii ) {
9099  reset( C(ii,j) );
9100  }
9101  }
9102  }
9103  }
9104 
9105  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
9106  {
9107  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
9108  size_t j( 0UL );
9109 
9110  if( SYM || HERM ) {
9111  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
9112  for( ; j<i; ++j ) {
9113  for( size_t ii=i; ii<iiend; ++ii ) {
9114  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
9115  }
9116  }
9117  }
9118  else if( UPP ) {
9119  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
9120  for( ; j<i; ++j ) {
9121  for( size_t ii=i; ii<iiend; ++ii ) {
9122  reset( C(ii,j) );
9123  }
9124  }
9125  }
9126 
9127  for( ; (j+4UL) <= jend; j+=4UL )
9128  {
9129  const size_t kbegin( ( IsLower_v<MT5> )
9130  ?( ( IsUpper_v<MT4> )
9131  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9132  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9133  :( IsUpper_v<MT4> ? i : 0UL ) );
9134  const size_t kend( ( IsUpper_v<MT5> )
9135  ?( ( IsLower_v<MT4> )
9136  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
9137  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
9138  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
9139 
9140  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9141 
9142  for( size_t k=kbegin; k<kend; ++k ) {
9143  const SIMDType a1( A.load(i ,k) );
9144  const SIMDType a2( A.load(i+SIMDSIZE,k) );
9145  const SIMDType b1( set( B(k,j ) ) );
9146  const SIMDType b2( set( B(k,j+1UL) ) );
9147  const SIMDType b3( set( B(k,j+2UL) ) );
9148  const SIMDType b4( set( B(k,j+3UL) ) );
9149  xmm1 += a1 * b1;
9150  xmm2 += a2 * b1;
9151  xmm3 += a1 * b2;
9152  xmm4 += a2 * b2;
9153  xmm5 += a1 * b3;
9154  xmm6 += a2 * b3;
9155  xmm7 += a1 * b4;
9156  xmm8 += a2 * b4;
9157  }
9158 
9159  C.store( i , j , xmm1 * factor );
9160  C.store( i+SIMDSIZE, j , xmm2 * factor );
9161  C.store( i , j+1UL, xmm3 * factor );
9162  C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
9163  C.store( i , j+2UL, xmm5 * factor );
9164  C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
9165  C.store( i , j+3UL, xmm7 * factor );
9166  C.store( i+SIMDSIZE, j+3UL, xmm8 * factor );
9167  }
9168 
9169  for( ; (j+3UL) <= jend; j+=3UL )
9170  {
9171  const size_t kbegin( ( IsLower_v<MT5> )
9172  ?( ( IsUpper_v<MT4> )
9173  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9174  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9175  :( IsUpper_v<MT4> ? i : 0UL ) );
9176  const size_t kend( ( IsUpper_v<MT5> )
9177  ?( ( IsLower_v<MT4> )
9178  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
9179  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
9180  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
9181 
9182  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9183 
9184  for( size_t k=kbegin; k<kend; ++k ) {
9185  const SIMDType a1( A.load(i ,k) );
9186  const SIMDType a2( A.load(i+SIMDSIZE,k) );
9187  const SIMDType b1( set( B(k,j ) ) );
9188  const SIMDType b2( set( B(k,j+1UL) ) );
9189  const SIMDType b3( set( B(k,j+2UL) ) );
9190  xmm1 += a1 * b1;
9191  xmm2 += a2 * b1;
9192  xmm3 += a1 * b2;
9193  xmm4 += a2 * b2;
9194  xmm5 += a1 * b3;
9195  xmm6 += a2 * b3;
9196  }
9197 
9198  C.store( i , j , xmm1 * factor );
9199  C.store( i+SIMDSIZE, j , xmm2 * factor );
9200  C.store( i , j+1UL, xmm3 * factor );
9201  C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
9202  C.store( i , j+2UL, xmm5 * factor );
9203  C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
9204  }
9205 
9206  for( ; (j+2UL) <= jend; j+=2UL )
9207  {
9208  const size_t kbegin( ( IsLower_v<MT5> )
9209  ?( ( IsUpper_v<MT4> )
9210  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9211  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9212  :( IsUpper_v<MT4> ? i : 0UL ) );
9213  const size_t kend( ( IsUpper_v<MT5> )
9214  ?( ( IsLower_v<MT4> )
9215  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9216  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9217  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
9218 
9219  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9220  size_t k( kbegin );
9221 
9222  for( ; (k+2UL) <= kend; k+=2UL ) {
9223  const SIMDType a1( A.load(i ,k ) );
9224  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
9225  const SIMDType a3( A.load(i ,k+1UL) );
9226  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
9227  const SIMDType b1( set( B(k ,j ) ) );
9228  const SIMDType b2( set( B(k ,j+1UL) ) );
9229  const SIMDType b3( set( B(k+1UL,j ) ) );
9230  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
9231  xmm1 += a1 * b1;
9232  xmm2 += a2 * b1;
9233  xmm3 += a1 * b2;
9234  xmm4 += a2 * b2;
9235  xmm5 += a3 * b3;
9236  xmm6 += a4 * b3;
9237  xmm7 += a3 * b4;
9238  xmm8 += a4 * b4;
9239  }
9240 
9241  for( ; k<kend; ++k ) {
9242  const SIMDType a1( A.load(i ,k) );
9243  const SIMDType a2( A.load(i+SIMDSIZE,k) );
9244  const SIMDType b1( set( B(k,j ) ) );
9245  const SIMDType b2( set( B(k,j+1UL) ) );
9246  xmm1 += a1 * b1;
9247  xmm2 += a2 * b1;
9248  xmm3 += a1 * b2;
9249  xmm4 += a2 * b2;
9250  }
9251 
9252  C.store( i , j , (xmm1+xmm5) * factor );
9253  C.store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
9254  C.store( i , j+1UL, (xmm3+xmm7) * factor );
9255  C.store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
9256  }
9257 
9258  if( j < jend )
9259  {
9260  const size_t kbegin( ( IsLower_v<MT5> )
9261  ?( ( IsUpper_v<MT4> )
9262  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9263  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9264  :( IsUpper_v<MT4> ? i : 0UL ) );
9265  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
9266 
9267  SIMDType xmm1, xmm2, xmm3, xmm4;
9268  size_t k( kbegin );
9269 
9270  for( ; (k+2UL) <= kend; k+=2UL ) {
9271  const SIMDType b1( set( B(k ,j) ) );
9272  const SIMDType b2( set( B(k+1UL,j) ) );
9273  xmm1 += A.load(i ,k ) * b1;
9274  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
9275  xmm3 += A.load(i ,k+1UL) * b2;
9276  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
9277  }
9278 
9279  for( ; k<kend; ++k ) {
9280  const SIMDType b1( set( B(k,j) ) );
9281  xmm1 += A.load(i ,k) * b1;
9282  xmm2 += A.load(i+SIMDSIZE,k) * b1;
9283  }
9284 
9285  C.store( i , j, (xmm1+xmm3) * factor );
9286  C.store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
9287 
9288  if( LOW ) ++j;
9289  }
9290 
9291  if( LOW ) {
9292  const size_t iiend( min(i+SIMDSIZE*2UL,M) );
9293  for( ; j<N; ++j ) {
9294  for( size_t ii=i; ii<iiend; ++ii ) {
9295  reset( C(ii,j) );
9296  }
9297  }
9298  }
9299  }
9300 
9301  for( ; i<ipos; i+=SIMDSIZE )
9302  {
9303  const size_t jend( LOW ? min(i+SIMDSIZE,N) : N );
9304  size_t j( 0UL );
9305 
9306  if( SYM || HERM ) {
9307  const size_t iiend( min(i+SIMDSIZE,M) );
9308  for( ; j<i; ++j ) {
9309  for( size_t ii=i; ii<iiend; ++ii ) {
9310  C(ii,j) = HERM ? conj( C(j,ii) ) : C(j,ii);
9311  }
9312  }
9313  }
9314  else if( UPP ) {
9315  const size_t iiend( min(i+SIMDSIZE,M) );
9316  for( ; j<i; ++j ) {
9317  for( size_t ii=i; ii<iiend; ++ii ) {
9318  reset( C(ii,j) );
9319  }
9320  }
9321  }
9322 
9323  for( ; (j+4UL) <= jend; j+=4UL )
9324  {
9325  const size_t kbegin( ( IsLower_v<MT5> )
9326  ?( ( IsUpper_v<MT4> )
9327  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9328  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9329  :( IsUpper_v<MT4> ? i : 0UL ) );
9330  const size_t kend( ( IsUpper_v<MT5> )
9331  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
9332  :( K ) );
9333 
9334  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9335  size_t k( kbegin );
9336 
9337  for( ; (k+2UL) <= kend; k+=2UL ) {
9338  const SIMDType a1( A.load(i,k ) );
9339  const SIMDType a2( A.load(i,k+1UL) );
9340  xmm1 += a1 * set( B(k ,j ) );
9341  xmm2 += a1 * set( B(k ,j+1UL) );
9342  xmm3 += a1 * set( B(k ,j+2UL) );
9343  xmm4 += a1 * set( B(k ,j+3UL) );
9344  xmm5 += a2 * set( B(k+1UL,j ) );
9345  xmm6 += a2 * set( B(k+1UL,j+1UL) );
9346  xmm7 += a2 * set( B(k+1UL,j+2UL) );
9347  xmm8 += a2 * set( B(k+1UL,j+3UL) );
9348  }
9349 
9350  for( ; k<kend; ++k ) {
9351  const SIMDType a1( A.load(i,k) );
9352  xmm1 += a1 * set( B(k,j ) );
9353  xmm2 += a1 * set( B(k,j+1UL) );
9354  xmm3 += a1 * set( B(k,j+2UL) );
9355  xmm4 += a1 * set( B(k,j+3UL) );
9356  }
9357 
9358  C.store( i, j , (xmm1+xmm5) * factor );
9359  C.store( i, j+1UL, (xmm2+xmm6) * factor );
9360  C.store( i, j+2UL, (xmm3+xmm7) * factor );
9361  C.store( i, j+3UL, (xmm4+xmm8) * factor );
9362  }
9363 
9364  for( ; (j+3UL) <= jend; j+=3UL )
9365  {
9366  const size_t kbegin( ( IsLower_v<MT5> )
9367  ?( ( IsUpper_v<MT4> )
9368  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9369  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9370  :( IsUpper_v<MT4> ? i : 0UL ) );
9371  const size_t kend( ( IsUpper_v<MT5> )
9372  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
9373  :( K ) );
9374 
9375  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9376  size_t k( kbegin );
9377 
9378  for( ; (k+2UL) <= kend; k+=2UL ) {
9379  const SIMDType a1( A.load(i,k ) );
9380  const SIMDType a2( A.load(i,k+1UL) );
9381  xmm1 += a1 * set( B(k ,j ) );
9382  xmm2 += a1 * set( B(k ,j+1UL) );
9383  xmm3 += a1 * set( B(k ,j+2UL) );
9384  xmm4 += a2 * set( B(k+1UL,j ) );
9385  xmm5 += a2 * set( B(k+1UL,j+1UL) );
9386  xmm6 += a2 * set( B(k+1UL,j+2UL) );
9387  }
9388 
9389  for( ; k<kend; ++k ) {
9390  const SIMDType a1( A.load(i,k) );
9391  xmm1 += a1 * set( B(k,j ) );
9392  xmm2 += a1 * set( B(k,j+1UL) );
9393  xmm3 += a1 * set( B(k,j+2UL) );
9394  }
9395 
9396  C.store( i, j , (xmm1+xmm4) * factor );
9397  C.store( i, j+1UL, (xmm2+xmm5) * factor );
9398  C.store( i, j+2UL, (xmm3+xmm6) * factor );
9399  }
9400 
9401  for( ; (j+2UL) <= jend; j+=2UL )
9402  {
9403  const size_t kbegin( ( IsLower_v<MT5> )
9404  ?( ( IsUpper_v<MT4> )
9405  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9406  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9407  :( IsUpper_v<MT4> ? i : 0UL ) );
9408  const size_t kend( ( IsUpper_v<MT5> )
9409  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
9410  :( K ) );
9411 
9412  SIMDType xmm1, xmm2, xmm3, xmm4;
9413  size_t k( kbegin );
9414 
9415  for( ; k<kend; ++k ) {
9416  const SIMDType a1( A.load(i,k) );
9417  xmm1 += a1 * set( B(k,j ) );
9418  xmm2 += a1 * set( B(k,j+1UL) );
9419  }
9420 
9421  for( ; (k+2UL) <= kend; k+=2UL ) {
9422  const SIMDType a1( A.load(i,k ) );
9423  const SIMDType a2( A.load(i,k+1UL) );
9424  xmm1 += a1 * set( B(k ,j ) );
9425  xmm2 += a1 * set( B(k ,j+1UL) );
9426  xmm3 += a2 * set( B(k+1UL,j ) );
9427  xmm4 += a2 * set( B(k+1UL,j+1UL) );
9428  }
9429 
9430  C.store( i, j , (xmm1+xmm3) * factor );
9431  C.store( i, j+1UL, (xmm2+xmm4) * factor );
9432  }
9433 
9434  if( j < jend )
9435  {
9436  const size_t kbegin( ( IsLower_v<MT5> )
9437  ?( ( IsUpper_v<MT4> )
9438  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9439  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9440  :( IsUpper_v<MT4> ? i : 0UL ) );
9441 
9442  SIMDType xmm1, xmm2;
9443  size_t k( kbegin );
9444 
9445  for( ; (k+2UL) <= K; k+=2UL ) {
9446  xmm1 += A.load(i,k ) * set( B(k ,j) );
9447  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
9448  }
9449 
9450  for( ; k<K; ++k ) {
9451  xmm1 += A.load(i,k) * set( B(k,j) );
9452  }
9453 
9454  C.store( i, j, (xmm1+xmm2) * factor );
9455 
9456  if( LOW ) ++j;
9457  }
9458 
9459  if( LOW ) {
9460  const size_t iiend( min(i+SIMDSIZE,M) );
9461  for( ; j<N; ++j ) {
9462  for( size_t ii=i; ii<iiend; ++ii ) {
9463  reset( C(ii,j) );
9464  }
9465  }
9466  }
9467  }
9468 
9469  for( ; remainder && i<M; ++i )
9470  {
9471  size_t j( 0UL );
9472 
9473  if( SYM || HERM ) {
9474  for( ; j<i; ++j ) {
9475  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
9476  }
9477  }
9478  else if( UPP ) {
9479  for( ; j<i; ++j ) {
9480  reset( C(i,j) );
9481  }
9482  }
9483 
9484  for( ; (j+2UL) <= N; j+=2UL )
9485  {
9486  const size_t kbegin( ( IsLower_v<MT5> )
9487  ?( ( IsUpper_v<MT4> )
9488  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9489  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9490  :( IsUpper_v<MT4> ? i : 0UL ) );
9491  const size_t kend( ( IsUpper_v<MT5> )
9492  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
9493  :( K ) );
9494 
9495  ElementType value1{};
9496  ElementType value2{};
9497 
9498  for( size_t k=kbegin; k<kend; ++k ) {
9499  value1 += A(i,k) * B(k,j );
9500  value2 += A(i,k) * B(k,j+1UL);
9501  }
9502 
9503  C(i,j ) = value1 * scalar;
9504  C(i,j+1UL) = value2 * scalar;
9505  }
9506 
9507  if( j < N )
9508  {
9509  const size_t kbegin( ( IsLower_v<MT5> )
9510  ?( ( IsUpper_v<MT4> )
9511  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9512  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9513  :( IsUpper_v<MT4> ? i : 0UL ) );
9514 
9515  ElementType value{};
9516 
9517  for( size_t k=kbegin; k<K; ++k ) {
9518  value += A(i,k) * B(k,j);
9519  }
9520 
9521  C(i,j) = value * scalar;
9522  }
9523  }
9524  }
9525  //**********************************************************************************************
9526 
9527  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel, non-vectorized fallback.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload is constrained with DisableIf_t on UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>;
// the EnableIf_t counterpart with the same condition handles the vectorized case. It performs
// no work of its own and forwards all four arguments to selectDefaultAssignKernel().
9541  template< typename MT3 // Type of the left-hand side target matrix
9542  , typename MT4 // Type of the left-hand side matrix operand
9543  , typename MT5 // Type of the right-hand side matrix operand
9544  , typename ST2 > // Type of the scalar value
9545  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9546  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9547  {
// Delegate unchanged to the default assignment kernel.
9548  selectDefaultAssignKernel( C, A, B, scalar );
9549  }
9550  //**********************************************************************************************
9551 
9552  //**Vectorized default assignment to dense matrices (large matrices)****************************
9567  template< typename MT3 // Type of the left-hand side target matrix
9568  , typename MT4 // Type of the left-hand side matrix operand
9569  , typename MT5 // Type of the right-hand side matrix operand
9570  , typename ST2 > // Type of the scalar value
9571  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9572  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9573  {
9574  if( SYM )
9575  smmm( C, A, B, scalar );
9576  else if( HERM )
9577  hmmm( C, A, B, scalar );
9578  else if( LOW )
9579  lmmm( C, A, B, scalar, ST2(0) );
9580  else if( UPP )
9581  ummm( C, A, B, scalar, ST2(0) );
9582  else
9583  mmm( C, A, B, scalar, ST2(0) );
9584  }
9585  //**********************************************************************************************
9586 
9587  //**BLAS-based assignment to dense matrices (default)*******************************************
// Default (non-BLAS) variant of the BLAS assignment kernel.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload is constrained with DisableIf_t on UseBlasKernel_v<MT3,MT4,MT5,ST2>, i.e. it is
// the one chosen when the BLAS kernel is not used. It forwards all arguments to
// selectLargeAssignKernel().
9601  template< typename MT3 // Type of the left-hand side target matrix
9602  , typename MT4 // Type of the left-hand side matrix operand
9603  , typename MT5 // Type of the right-hand side matrix operand
9604  , typename ST2 > // Type of the scalar value
9605  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9606  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9607  {
// Fall back to the large-matrix kernel selection.
9608  selectLargeAssignKernel( C, A, B, scalar );
9609  }
9610  //**********************************************************************************************
9611 
9612  //**BLAS-based assignment to dense matrices*****************************************************
9613 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
9614 
9627  template< typename MT3 // Type of the left-hand side target matrix
9628  , typename MT4 // Type of the left-hand side matrix operand
9629  , typename MT5 // Type of the right-hand side matrix operand
9630  , typename ST2 > // Type of the scalar value
9631  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9632  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9633  {
9634  using ET = ElementType_t<MT3>;
9635 
9636  if( IsTriangular_v<MT4> ) {
9637  assign( C, B );
9638  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
9639  }
9640  else if( IsTriangular_v<MT5> ) {
9641  assign( C, A );
9642  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
9643  }
9644  else {
9645  gemm( C, A, B, ET(scalar), ET(0) );
9646  }
9647  }
9648 #endif
9649  //**********************************************************************************************
9650 
9651  //**Assignment to sparse matrices***************************************************************
9663  template< typename MT // Type of the target sparse matrix
9664  , bool SO > // Storage order of the target sparse matrix
9665  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9666  {
9668 
9669  using TmpType = If_t< SO, ResultType, OppositeType >;
9670 
9677 
9678  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9679  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9680 
9681  const ForwardFunctor fwd;
9682 
9683  const TmpType tmp( serial( rhs ) );
9684  assign( ~lhs, fwd( tmp ) );
9685  }
9686  //**********************************************************************************************
9687 
9688  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the scaled multiplication expression to a dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// The function extracts both operands of the wrapped multiplication (rhs.matrix_), returns
// early if the target or the inner dimension is empty, evaluates both operands serially and
// finally dispatches to selectAddAssignKernel() with the scalar of the expression.
9700  template< typename MT // Type of the target dense matrix
9701  , bool SO > // Storage order of the target dense matrix
9702  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9703  {
9705 
9706  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9707  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9708 
// Operands of the matrix/matrix multiplication wrapped by the scaling expression.
9709  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9710  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9711 
// Nothing to add if the target has no elements or the inner dimension is zero; this check
// happens before the operands are evaluated below.
9712  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
9713  return;
9714  }
9715 
9716  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
9717  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
9718 
// The evaluated operands must match the unevaluated ones and the target in size.
9719  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9720  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9721  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9722  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9723  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
9724  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
9725 
// Hand over to the kernel selection together with the scaling factor.
9726  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
9727  }
9728  //**********************************************************************************************
9729 
9730  //**Addition assignment to dense matrices (kernel selection)************************************
9741  template< typename MT3 // Type of the left-hand side target matrix
9742  , typename MT4 // Type of the left-hand side matrix operand
9743  , typename MT5 // Type of the right-hand side matrix operand
9744  , typename ST2 > // Type of the scalar value
9745  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9746  {
9747  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
9748  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
9749  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
9750  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9751  selectSmallAddAssignKernel( C, A, B, scalar );
9752  else
9753  selectBlasAddAssignKernel( C, A, B, scalar );
9754  }
9755  //**********************************************************************************************
9756 
9757  //**Default addition assignment to dense matrices (general/general)*****************************
// Default addition assignment kernel for two non-diagonal operands.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// Enabled only if neither MT4 nor MT5 is diagonal. The kernel does not update C element by
// element; it evaluates the complete scaled product into a temporary of type ResultType and
// adds that temporary to C.
9771  template< typename MT3 // Type of the left-hand side target matrix
9772  , typename MT4 // Type of the left-hand side matrix operand
9773  , typename MT5 // Type of the right-hand side matrix operand
9774  , typename ST2 > // Type of the scalar value
9775  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9776  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9777  {
// Serial evaluation of the scaled product A * B * scalar into a temporary ...
9778  const ResultType tmp( serial( A * B * scalar ) );
// ... followed by the addition of the temporary to the target.
9779  addAssign( C, tmp );
9780  }
9781  //**********************************************************************************************
9782 
9783  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
9797  template< typename MT3 // Type of the left-hand side target matrix
9798  , typename MT4 // Type of the left-hand side matrix operand
9799  , typename MT5 // Type of the right-hand side matrix operand
9800  , typename ST2 > // Type of the scalar value
9801  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9802  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9803  {
9804  constexpr size_t block( BLOCK_SIZE );
9805 
9806  const size_t M( A.rows() );
9807  const size_t N( B.columns() );
9808 
9809  for( size_t ii=0UL; ii<M; ii+=block ) {
9810  const size_t iend( min( M, ii+block ) );
9811  for( size_t jj=0UL; jj<N; jj+=block ) {
9812  const size_t jend( min( N, jj+block ) );
9813  for( size_t i=ii; i<iend; ++i )
9814  {
9815  const size_t jbegin( ( IsUpper_v<MT4> )
9816  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
9817  :( jj ) );
9818  const size_t jpos( ( IsLower_v<MT4> )
9819  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
9820  :( jend ) );
9821 
9822  for( size_t j=jbegin; j<jpos; ++j ) {
9823  C(i,j) += A(i,j) * B(j,j) * scalar;
9824  }
9825  }
9826  }
9827  }
9828  }
9829  //**********************************************************************************************
9830 
9831  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
9845  template< typename MT3 // Type of the left-hand side target matrix
9846  , typename MT4 // Type of the left-hand side matrix operand
9847  , typename MT5 // Type of the right-hand side matrix operand
9848  , typename ST2 > // Type of the scalar value
9849  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9850  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9851  {
9852  const size_t M( A.rows() );
9853  const size_t N( B.columns() );
9854 
9855  for( size_t j=0UL; j<N; ++j )
9856  {
9857  const size_t ibegin( ( IsLower_v<MT4> )
9858  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
9859  :( 0UL ) );
9860  const size_t iend( ( IsUpper_v<MT4> )
9861  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
9862  :( M ) );
9863  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
9864 
9865  const size_t inum( iend - ibegin );
9866  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
9867 
9868  for( size_t i=ibegin; i<ipos; i+=2UL ) {
9869  C(i ,j) += A(i ,j) * B(j,j) * scalar;
9870  C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
9871  }
9872  if( ipos < iend ) {
9873  C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
9874  }
9875  }
9876  }
9877  //**********************************************************************************************
9878 
9879  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
9893  template< typename MT3 // Type of the left-hand side target matrix
9894  , typename MT4 // Type of the left-hand side matrix operand
9895  , typename MT5 // Type of the right-hand side matrix operand
9896  , typename ST2 > // Type of the scalar value
9897  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9898  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9899  {
9900  const size_t M( A.rows() );
9901  const size_t N( B.columns() );
9902 
9903  for( size_t i=0UL; i<M; ++i )
9904  {
9905  const size_t jbegin( ( IsUpper_v<MT5> )
9906  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
9907  :( 0UL ) );
9908  const size_t jend( ( IsLower_v<MT5> )
9909  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
9910  :( N ) );
9911  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
9912 
9913  const size_t jnum( jend - jbegin );
9914  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
9915 
9916  for( size_t j=jbegin; j<jpos; j+=2UL ) {
9917  C(i,j ) += A(i,i) * B(i,j ) * scalar;
9918  C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
9919  }
9920  if( jpos < jend ) {
9921  C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
9922  }
9923  }
9924  }
9925  //**********************************************************************************************
9926 
9927  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
9941  template< typename MT3 // Type of the left-hand side target matrix
9942  , typename MT4 // Type of the left-hand side matrix operand
9943  , typename MT5 // Type of the right-hand side matrix operand
9944  , typename ST2 > // Type of the scalar value
9945  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9946  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9947  {
9948  constexpr size_t block( BLOCK_SIZE );
9949 
9950  const size_t M( A.rows() );
9951  const size_t N( B.columns() );
9952 
9953  for( size_t jj=0UL; jj<N; jj+=block ) {
9954  const size_t jend( min( N, jj+block ) );
9955  for( size_t ii=0UL; ii<M; ii+=block ) {
9956  const size_t iend( min( M, ii+block ) );
9957  for( size_t j=jj; j<jend; ++j )
9958  {
9959  const size_t ibegin( ( IsLower_v<MT5> )
9960  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
9961  :( ii ) );
9962  const size_t ipos( ( IsUpper_v<MT5> )
9963  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
9964  :( iend ) );
9965 
9966  for( size_t i=ibegin; i<ipos; ++i ) {
9967  C(i,j) += A(i,i) * B(i,j) * scalar;
9968  }
9969  }
9970  }
9971  }
9972  }
9973  //**********************************************************************************************
9974 
9975  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
9989  template< typename MT3 // Type of the left-hand side target matrix
9990  , typename MT4 // Type of the left-hand side matrix operand
9991  , typename MT5 // Type of the right-hand side matrix operand
9992  , typename ST2 > // Type of the scalar value
9993  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9994  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9995  {
9996  for( size_t i=0UL; i<A.rows(); ++i ) {
9997  C(i,i) += A(i,i) * B(i,i) * scalar;
9998  }
9999  }
10000  //**********************************************************************************************
10001 
10002  //**Default addition assignment to dense matrices (small matrices)******************************
// Small-matrix addition assignment kernel, non-vectorized fallback.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload is constrained with DisableIf_t on UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>,
// i.e. it is the one chosen when the vectorized default kernel is not used. It forwards all
// arguments to selectDefaultAddAssignKernel().
10016  template< typename MT3 // Type of the left-hand side target matrix
10017  , typename MT4 // Type of the left-hand side matrix operand
10018  , typename MT5 // Type of the right-hand side matrix operand
10019  , typename ST2 > // Type of the scalar value
10020  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10021  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10022  {
// Delegate unchanged to the default addition assignment kernel.
10023  selectDefaultAddAssignKernel( C, A, B, scalar );
10024  }
10025  //**********************************************************************************************
10026 
10027  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
10042  template< typename MT3 // Type of the left-hand side target matrix
10043  , typename MT4 // Type of the left-hand side matrix operand
10044  , typename MT5 // Type of the right-hand side matrix operand
10045  , typename ST2 > // Type of the scalar value
10046  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10047  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10048  {
10049  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
10050 
10051  const size_t M( A.rows() );
10052  const size_t N( B.columns() );
10053  const size_t K( A.columns() );
10054 
10055  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
10056 
10057  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
10058  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
10059 
10060  const SIMDType factor( set( scalar ) );
10061 
10062  size_t j( 0UL );
10063 
10064  if( IsIntegral_v<ElementType> )
10065  {
10066  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
10067  for( size_t i=0UL; i<M; ++i )
10068  {
10069  const size_t kbegin( ( IsUpper_v<MT4> )
10070  ?( ( IsLower_v<MT5> )
10071  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10072  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10073  :( IsLower_v<MT5> ? j : 0UL ) );
10074  const size_t kend( ( IsLower_v<MT4> )
10075  ?( ( IsUpper_v<MT5> )
10076  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
10077  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
10078  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
10079 
10080  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10081 
10082  for( size_t k=kbegin; k<kend; ++k ) {
10083  const SIMDType a1( set( A(i,k) ) );
10084  xmm1 += a1 * B.load(k,j );
10085  xmm2 += a1 * B.load(k,j+SIMDSIZE );
10086  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
10087  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
10088  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
10089  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
10090  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
10091  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
10092  }
10093 
10094  C.store( i, j , C.load(i,j ) + xmm1 * factor );
10095  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
10096  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
10097  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
10098  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
10099  C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
10100  C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
10101  C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
10102  }
10103  }
10104  }
10105 
10106  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
10107  {
10108  size_t i( 0UL );
10109 
10110  for( ; (i+2UL) <= M; i+=2UL )
10111  {
10112  const size_t kbegin( ( IsUpper_v<MT4> )
10113  ?( ( IsLower_v<MT5> )
10114  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10115  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10116  :( IsLower_v<MT5> ? j : 0UL ) );
10117  const size_t kend( ( IsLower_v<MT4> )
10118  ?( ( IsUpper_v<MT5> )
10119  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
10120  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10121  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
10122 
10123  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
10124 
10125  for( size_t k=kbegin; k<kend; ++k ) {
10126  const SIMDType a1( set( A(i ,k) ) );
10127  const SIMDType a2( set( A(i+1UL,k) ) );
10128  const SIMDType b1( B.load(k,j ) );
10129  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
10130  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
10131  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
10132  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
10133  xmm1 += a1 * b1;
10134  xmm2 += a1 * b2;
10135  xmm3 += a1 * b3;
10136  xmm4 += a1 * b4;
10137  xmm5 += a1 * b5;
10138  xmm6 += a2 * b1;
10139  xmm7 += a2 * b2;
10140  xmm8 += a2 * b3;
10141  xmm9 += a2 * b4;
10142  xmm10 += a2 * b5;
10143  }
10144 
10145  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10146  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
10147  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
10148  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
10149  C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
10150  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
10151  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
10152  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
10153  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
10154  C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
10155  }
10156 
10157  if( i < M )
10158  {
10159  const size_t kbegin( ( IsUpper_v<MT4> )
10160  ?( ( IsLower_v<MT5> )
10161  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10162  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10163  :( IsLower_v<MT5> ? j : 0UL ) );
10164  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
10165 
10166  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
10167 
10168  for( size_t k=kbegin; k<kend; ++k ) {
10169  const SIMDType a1( set( A(i,k) ) );
10170  xmm1 += a1 * B.load(k,j );
10171  xmm2 += a1 * B.load(k,j+SIMDSIZE );
10172  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
10173  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
10174  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
10175  }
10176 
10177  C.store( i, j , C.load(i,j ) + xmm1 * factor );
10178  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
10179  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
10180  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
10181  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
10182  }
10183  }
10184 
10185  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
10186  {
10187  size_t i( 0UL );
10188 
10189  for( ; (i+2UL) <= M; i+=2UL )
10190  {
10191  const size_t kbegin( ( IsUpper_v<MT4> )
10192  ?( ( IsLower_v<MT5> )
10193  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10194  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10195  :( IsLower_v<MT5> ? j : 0UL ) );
10196  const size_t kend( ( IsLower_v<MT4> )
10197  ?( ( IsUpper_v<MT5> )
10198  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
10199  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10200  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
10201 
10202  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10203 
10204  for( size_t k=kbegin; k<kend; ++k ) {
10205  const SIMDType a1( set( A(i ,k) ) );
10206  const SIMDType a2( set( A(i+1UL,k) ) );
10207  const SIMDType b1( B.load(k,j ) );
10208  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
10209  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
10210  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
10211  xmm1 += a1 * b1;
10212  xmm2 += a1 * b2;
10213  xmm3 += a1 * b3;
10214  xmm4 += a1 * b4;
10215  xmm5 += a2 * b1;
10216  xmm6 += a2 * b2;
10217  xmm7 += a2 * b3;
10218  xmm8 += a2 * b4;
10219  }
10220 
10221  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10222  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
10223  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
10224  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
10225  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
10226  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
10227  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
10228  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
10229  }
10230 
10231  if( i < M )
10232  {
10233  const size_t kbegin( ( IsUpper_v<MT4> )
10234  ?( ( IsLower_v<MT5> )
10235  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10236  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10237  :( IsLower_v<MT5> ? j : 0UL ) );
10238  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
10239 
10240  SIMDType xmm1, xmm2, xmm3, xmm4;
10241 
10242  for( size_t k=kbegin; k<kend; ++k ) {
10243  const SIMDType a1( set( A(i,k) ) );
10244  xmm1 += a1 * B.load(k,j );
10245  xmm2 += a1 * B.load(k,j+SIMDSIZE );
10246  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
10247  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
10248  }
10249 
10250  C.store( i, j , C.load(i,j ) + xmm1 * factor );
10251  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
10252  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
10253  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
10254  }
10255  }
10256 
10257  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
10258  {
10259  size_t i( 0UL );
10260 
10261  for( ; (i+2UL) <= M; i+=2UL )
10262  {
10263  const size_t kbegin( ( IsUpper_v<MT4> )
10264  ?( ( IsLower_v<MT5> )
10265  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10266  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10267  :( IsLower_v<MT5> ? j : 0UL ) );
10268  const size_t kend( ( IsLower_v<MT4> )
10269  ?( ( IsUpper_v<MT5> )
10270  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
10271  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10272  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
10273 
10274  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10275 
10276  for( size_t k=kbegin; k<kend; ++k ) {
10277  const SIMDType a1( set( A(i ,k) ) );
10278  const SIMDType a2( set( A(i+1UL,k) ) );
10279  const SIMDType b1( B.load(k,j ) );
10280  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
10281  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
10282  xmm1 += a1 * b1;
10283  xmm2 += a1 * b2;
10284  xmm3 += a1 * b3;
10285  xmm4 += a2 * b1;
10286  xmm5 += a2 * b2;
10287  xmm6 += a2 * b3;
10288  }
10289 
10290  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10291  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
10292  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
10293  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
10294  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
10295  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
10296  }
10297 
10298  if( i < M )
10299  {
10300  const size_t kbegin( ( IsUpper_v<MT4> )
10301  ?( ( IsLower_v<MT5> )
10302  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10303  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10304  :( IsLower_v<MT5> ? j : 0UL ) );
10305  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
10306 
10307  SIMDType xmm1, xmm2, xmm3;
10308 
10309  for( size_t k=kbegin; k<kend; ++k ) {
10310  const SIMDType a1( set( A(i,k) ) );
10311  xmm1 += a1 * B.load(k,j );
10312  xmm2 += a1 * B.load(k,j+SIMDSIZE );
10313  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
10314  }
10315 
10316  C.store( i, j , C.load(i,j ) + xmm1 * factor );
10317  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
10318  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
10319  }
10320  }
10321 
10322  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
10323  {
10324  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
10325  size_t i( LOW ? j : 0UL );
10326 
10327  for( ; (i+4UL) <= iend; i+=4UL )
10328  {
10329  const size_t kbegin( ( IsUpper_v<MT4> )
10330  ?( ( IsLower_v<MT5> )
10331  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10332  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10333  :( IsLower_v<MT5> ? j : 0UL ) );
10334  const size_t kend( ( IsLower_v<MT4> )
10335  ?( ( IsUpper_v<MT5> )
10336  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
10337  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
10338  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
10339 
10340  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10341 
10342  for( size_t k=kbegin; k<kend; ++k ) {
10343  const SIMDType a1( set( A(i ,k) ) );
10344  const SIMDType a2( set( A(i+1UL,k) ) );
10345  const SIMDType a3( set( A(i+2UL,k) ) );
10346  const SIMDType a4( set( A(i+3UL,k) ) );
10347  const SIMDType b1( B.load(k,j ) );
10348  const SIMDType b2( B.load(k,j+SIMDSIZE) );
10349  xmm1 += a1 * b1;
10350  xmm2 += a1 * b2;
10351  xmm3 += a2 * b1;
10352  xmm4 += a2 * b2;
10353  xmm5 += a3 * b1;
10354  xmm6 += a3 * b2;
10355  xmm7 += a4 * b1;
10356  xmm8 += a4 * b2;
10357  }
10358 
10359  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10360  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
10361  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
10362  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
10363  C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
10364  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
10365  C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
10366  C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
10367  }
10368 
10369  for( ; (i+3UL) <= iend; i+=3UL )
10370  {
10371  const size_t kbegin( ( IsUpper_v<MT4> )
10372  ?( ( IsLower_v<MT5> )
10373  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10374  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10375  :( IsLower_v<MT5> ? j : 0UL ) );
10376  const size_t kend( ( IsLower_v<MT4> )
10377  ?( ( IsUpper_v<MT5> )
10378  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
10379  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
10380  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
10381 
10382  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10383 
10384  for( size_t k=kbegin; k<kend; ++k ) {
10385  const SIMDType a1( set( A(i ,k) ) );
10386  const SIMDType a2( set( A(i+1UL,k) ) );
10387  const SIMDType a3( set( A(i+2UL,k) ) );
10388  const SIMDType b1( B.load(k,j ) );
10389  const SIMDType b2( B.load(k,j+SIMDSIZE) );
10390  xmm1 += a1 * b1;
10391  xmm2 += a1 * b2;
10392  xmm3 += a2 * b1;
10393  xmm4 += a2 * b2;
10394  xmm5 += a3 * b1;
10395  xmm6 += a3 * b2;
10396  }
10397 
10398  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10399  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
10400  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
10401  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
10402  C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
10403  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
10404  }
10405 
10406  for( ; (i+2UL) <= iend; i+=2UL )
10407  {
10408  const size_t kbegin( ( IsUpper_v<MT4> )
10409  ?( ( IsLower_v<MT5> )
10410  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10411  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10412  :( IsLower_v<MT5> ? j : 0UL ) );
10413  const size_t kend( ( IsLower_v<MT4> )
10414  ?( ( IsUpper_v<MT5> )
10415  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
10416  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10417  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
10418 
10419  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10420  size_t k( kbegin );
10421 
10422  for( ; (k+2UL) <= kend; k+=2UL ) {
10423  const SIMDType a1( set( A(i ,k ) ) );
10424  const SIMDType a2( set( A(i+1UL,k ) ) );
10425  const SIMDType a3( set( A(i ,k+1UL) ) );
10426  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
10427  const SIMDType b1( B.load(k ,j ) );
10428  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
10429  const SIMDType b3( B.load(k+1UL,j ) );
10430  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
10431  xmm1 += a1 * b1;
10432  xmm2 += a1 * b2;
10433  xmm3 += a2 * b1;
10434  xmm4 += a2 * b2;
10435  xmm5 += a3 * b3;
10436  xmm6 += a3 * b4;
10437  xmm7 += a4 * b3;
10438  xmm8 += a4 * b4;
10439  }
10440 
10441  for( ; k<kend; ++k ) {
10442  const SIMDType a1( set( A(i ,k) ) );
10443  const SIMDType a2( set( A(i+1UL,k) ) );
10444  const SIMDType b1( B.load(k,j ) );
10445  const SIMDType b2( B.load(k,j+SIMDSIZE) );
10446  xmm1 += a1 * b1;
10447  xmm2 += a1 * b2;
10448  xmm3 += a2 * b1;
10449  xmm4 += a2 * b2;
10450  }
10451 
10452  C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
10453  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + (xmm2+xmm6) * factor );
10454  C.store( i+1UL, j , C.load(i+1UL,j ) + (xmm3+xmm7) * factor );
10455  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + (xmm4+xmm8) * factor );
10456  }
10457 
10458  if( i < iend )
10459  {
10460  const size_t kbegin( ( IsUpper_v<MT4> )
10461  ?( ( IsLower_v<MT5> )
10462  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10463  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10464  :( IsLower_v<MT5> ? j : 0UL ) );
10465  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
10466 
10467  SIMDType xmm1, xmm2, xmm3, xmm4;
10468  size_t k( kbegin );
10469 
10470  for( ; (k+2UL) <= kend; k+=2UL ) {
10471  const SIMDType a1( set( A(i,k ) ) );
10472  const SIMDType a2( set( A(i,k+1UL) ) );
10473  xmm1 += a1 * B.load(k ,j );
10474  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
10475  xmm3 += a2 * B.load(k+1UL,j );
10476  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
10477  }
10478 
10479  for( ; k<kend; ++k ) {
10480  const SIMDType a1( set( A(i,k) ) );
10481  xmm1 += a1 * B.load(k,j );
10482  xmm2 += a1 * B.load(k,j+SIMDSIZE);
10483  }
10484 
10485  C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
10486  C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) + (xmm2+xmm4) * factor );
10487  }
10488  }
10489 
10490  for( ; j<jpos; j+=SIMDSIZE )
10491  {
10492  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
10493  size_t i( LOW ? j : 0UL );
10494 
10495  for( ; (i+4UL) <= iend; i+=4UL )
10496  {
10497  const size_t kbegin( ( IsUpper_v<MT4> )
10498  ?( ( IsLower_v<MT5> )
10499  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10500  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10501  :( IsLower_v<MT5> ? j : 0UL ) );
10502  const size_t kend( ( IsLower_v<MT4> )
10503  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
10504  :( K ) );
10505 
10506  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10507  size_t k( kbegin );
10508 
10509  for( ; (k+2UL) <= kend; k+=2UL ) {
10510  const SIMDType b1( B.load(k ,j) );
10511  const SIMDType b2( B.load(k+1UL,j) );
10512  xmm1 += set( A(i ,k ) ) * b1;
10513  xmm2 += set( A(i+1UL,k ) ) * b1;
10514  xmm3 += set( A(i+2UL,k ) ) * b1;
10515  xmm4 += set( A(i+3UL,k ) ) * b1;
10516  xmm5 += set( A(i ,k+1UL) ) * b2;
10517  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
10518  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
10519  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
10520  }
10521 
10522  for( ; k<kend; ++k ) {
10523  const SIMDType b1( B.load(k,j) );
10524  xmm1 += set( A(i ,k) ) * b1;
10525  xmm2 += set( A(i+1UL,k) ) * b1;
10526  xmm3 += set( A(i+2UL,k) ) * b1;
10527  xmm4 += set( A(i+3UL,k) ) * b1;
10528  }
10529 
10530  C.store( i , j, C.load(i ,j) + (xmm1+xmm5) * factor );
10531  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm6) * factor );
10532  C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm7) * factor );
10533  C.store( i+3UL, j, C.load(i+3UL,j) + (xmm4+xmm8) * factor );
10534  }
10535 
10536  for( ; (i+3UL) <= iend; i+=3UL )
10537  {
10538  const size_t kbegin( ( IsUpper_v<MT4> )
10539  ?( ( IsLower_v<MT5> )
10540  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10541  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10542  :( IsLower_v<MT5> ? j : 0UL ) );
10543  const size_t kend( ( IsLower_v<MT4> )
10544  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
10545  :( K ) );
10546 
10547  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10548  size_t k( kbegin );
10549 
10550  for( ; (k+2UL) <= kend; k+=2UL ) {
10551  const SIMDType b1( B.load(k ,j) );
10552  const SIMDType b2( B.load(k+1UL,j) );
10553  xmm1 += set( A(i ,k ) ) * b1;
10554  xmm2 += set( A(i+1UL,k ) ) * b1;
10555  xmm3 += set( A(i+2UL,k ) ) * b1;
10556  xmm4 += set( A(i ,k+1UL) ) * b2;
10557  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
10558  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
10559  }
10560 
10561  for( ; k<kend; ++k ) {
10562  const SIMDType b1( B.load(k,j) );
10563  xmm1 += set( A(i ,k) ) * b1;
10564  xmm2 += set( A(i+1UL,k) ) * b1;
10565  xmm3 += set( A(i+2UL,k) ) * b1;
10566  }
10567 
10568  C.store( i , j, C.load(i ,j) + (xmm1+xmm4) * factor );
10569  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm5) * factor );
10570  C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm6) * factor );
10571  }
10572 
10573  for( ; (i+2UL) <= iend; i+=2UL )
10574  {
10575  const size_t kbegin( ( IsUpper_v<MT4> )
10576  ?( ( IsLower_v<MT5> )
10577  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10578  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10579  :( IsLower_v<MT5> ? j : 0UL ) );
10580  const size_t kend( ( IsLower_v<MT4> )
10581  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
10582  :( K ) );
10583 
10584  SIMDType xmm1, xmm2, xmm3, xmm4;
10585  size_t k( kbegin );
10586 
10587  for( ; (k+2UL) <= kend; k+=2UL ) {
10588  const SIMDType b1( B.load(k ,j) );
10589  const SIMDType b2( B.load(k+1UL,j) );
10590  xmm1 += set( A(i ,k ) ) * b1;
10591  xmm2 += set( A(i+1UL,k ) ) * b1;
10592  xmm3 += set( A(i ,k+1UL) ) * b2;
10593  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
10594  }
10595 
10596  for( ; k<kend; ++k ) {
10597  const SIMDType b1( B.load(k,j) );
10598  xmm1 += set( A(i ,k) ) * b1;
10599  xmm2 += set( A(i+1UL,k) ) * b1;
10600  }
10601 
10602  C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
10603  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm4) * factor );
10604  }
10605 
10606  if( i < iend )
10607  {
10608  const size_t kbegin( ( IsUpper_v<MT4> )
10609  ?( ( IsLower_v<MT5> )
10610  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10611  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10612  :( IsLower_v<MT5> ? j : 0UL ) );
10613 
10614  SIMDType xmm1, xmm2;
10615  size_t k( kbegin );
10616 
10617  for( ; (k+2UL) <= K; k+=2UL ) {
10618  xmm1 += set( A(i,k ) ) * B.load(k ,j);
10619  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
10620  }
10621 
10622  for( ; k<K; ++k ) {
10623  xmm1 += set( A(i,k) ) * B.load(k,j);
10624  }
10625 
10626  C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
10627  }
10628  }
10629 
10630  for( ; remainder && j<N; ++j )
10631  {
10632  const size_t iend( UPP ? j+1UL : M );
10633  size_t i( LOW ? j : 0UL );
10634 
10635  for( ; (i+2UL) <= iend; i+=2UL )
10636  {
10637  const size_t kbegin( ( IsUpper_v<MT4> )
10638  ?( ( IsLower_v<MT5> )
10639  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10640  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10641  :( IsLower_v<MT5> ? j : 0UL ) );
10642  const size_t kend( ( IsLower_v<MT4> )
10643  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
10644  :( K ) );
10645 
10646  ElementType value1{};
10647  ElementType value2{};
10648 
10649  for( size_t k=kbegin; k<kend; ++k ) {
10650  value1 += A(i ,k) * B(k,j);
10651  value2 += A(i+1UL,k) * B(k,j);
10652  }
10653 
10654  C(i ,j) += value1 * scalar;
10655  C(i+1UL,j) += value2 * scalar;
10656  }
10657 
10658  if( i < iend )
10659  {
10660  const size_t kbegin( ( IsUpper_v<MT4> )
10661  ?( ( IsLower_v<MT5> )
10662  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10663  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10664  :( IsLower_v<MT5> ? j : 0UL ) );
10665 
10666  ElementType value{};
10667 
10668  for( size_t k=kbegin; k<K; ++k ) {
10669  value += A(i,k) * B(k,j);
10670  }
10671 
10672  C(i,j) += value * scalar;
10673  }
10674  }
10675  }
10676  //**********************************************************************************************
10677 
10678  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
10693  template< typename MT3 // Type of the left-hand side target matrix
10694  , typename MT4 // Type of the left-hand side matrix operand
10695  , typename MT5 // Type of the right-hand side matrix operand
10696  , typename ST2 > // Type of the scalar value
10697  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10698  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10699  {
10700  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
10701 
10702  const size_t M( A.rows() );
10703  const size_t N( B.columns() );
10704  const size_t K( A.columns() );
10705 
10706  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
10707 
10708  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
10709  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
10710 
10711  const SIMDType factor( set( scalar ) );
10712 
10713  size_t i( 0UL );
10714 
10715  if( IsIntegral_v<ElementType> )
10716  {
10717  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
10718  for( size_t j=0UL; j<N; ++j )
10719  {
10720  const size_t kbegin( ( IsLower_v<MT5> )
10721  ?( ( IsUpper_v<MT4> )
10722  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10723  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10724  :( IsUpper_v<MT4> ? i : 0UL ) );
10725  const size_t kend( ( IsUpper_v<MT5> )
10726  ?( ( IsLower_v<MT4> )
10727  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
10728  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
10729  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
10730 
10731  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10732 
10733  for( size_t k=kbegin; k<kend; ++k ) {
10734  const SIMDType b1( set( B(k,j) ) );
10735  xmm1 += A.load(i ,k) * b1;
10736  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10737  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10738  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10739  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10740  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
10741  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
10742  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
10743  }
10744 
10745  C.store( i , j, C.load(i ,j) + xmm1 * factor );
10746  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
10747  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10748  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10749  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
10750  C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
10751  C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
10752  C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
10753  }
10754  }
10755  }
10756 
10757  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
10758  {
10759  size_t j( 0UL );
10760 
10761  for( ; (j+2UL) <= N; j+=2UL )
10762  {
10763  const size_t kbegin( ( IsLower_v<MT5> )
10764  ?( ( IsUpper_v<MT4> )
10765  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10766  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10767  :( IsUpper_v<MT4> ? i : 0UL ) );
10768  const size_t kend( ( IsUpper_v<MT5> )
10769  ?( ( IsLower_v<MT4> )
10770  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10771  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10772  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
10773 
10774  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
10775 
10776  for( size_t k=kbegin; k<kend; ++k ) {
10777  const SIMDType a1( A.load(i ,k) );
10778  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10779  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10780  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10781  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
10782  const SIMDType b1( set( B(k,j ) ) );
10783  const SIMDType b2( set( B(k,j+1UL) ) );
10784  xmm1 += a1 * b1;
10785  xmm2 += a2 * b1;
10786  xmm3 += a3 * b1;
10787  xmm4 += a4 * b1;
10788  xmm5 += a5 * b1;
10789  xmm6 += a1 * b2;
10790  xmm7 += a2 * b2;
10791  xmm8 += a3 * b2;
10792  xmm9 += a4 * b2;
10793  xmm10 += a5 * b2;
10794  }
10795 
10796  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10797  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
10798  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10799  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10800  C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
10801  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
10802  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
10803  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
10804  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
10805  C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
10806  }
10807 
10808  if( j < N )
10809  {
10810  const size_t kbegin( ( IsLower_v<MT5> )
10811  ?( ( IsUpper_v<MT4> )
10812  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10813  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10814  :( IsUpper_v<MT4> ? i : 0UL ) );
10815  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
10816 
10817  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
10818 
10819  for( size_t k=kbegin; k<kend; ++k ) {
10820  const SIMDType b1( set( B(k,j) ) );
10821  xmm1 += A.load(i ,k) * b1;
10822  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10823  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10824  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10825  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10826  }
10827 
10828  C.store( i , j, C.load(i ,j) + xmm1 * factor );
10829  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
10830  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10831  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10832  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
10833  }
10834  }
10835 
10836  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
10837  {
10838  size_t j( 0UL );
10839 
10840  for( ; (j+2UL) <= N; j+=2UL )
10841  {
10842  const size_t kbegin( ( IsLower_v<MT5> )
10843  ?( ( IsUpper_v<MT4> )
10844  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10845  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10846  :( IsUpper_v<MT4> ? i : 0UL ) );
10847  const size_t kend( ( IsUpper_v<MT5> )
10848  ?( ( IsLower_v<MT4> )
10849  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10850  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10851  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
10852 
10853  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10854 
10855  for( size_t k=kbegin; k<kend; ++k ) {
10856  const SIMDType a1( A.load(i ,k) );
10857  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10858  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10859  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10860  const SIMDType b1( set( B(k,j ) ) );
10861  const SIMDType b2( set( B(k,j+1UL) ) );
10862  xmm1 += a1 * b1;
10863  xmm2 += a2 * b1;
10864  xmm3 += a3 * b1;
10865  xmm4 += a4 * b1;
10866  xmm5 += a1 * b2;
10867  xmm6 += a2 * b2;
10868  xmm7 += a3 * b2;
10869  xmm8 += a4 * b2;
10870  }
10871 
10872  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10873  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
10874  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10875  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10876  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
10877  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
10878  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
10879  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
10880  }
10881 
10882  if( j < N )
10883  {
10884  const size_t kbegin( ( IsLower_v<MT5> )
10885  ?( ( IsUpper_v<MT4> )
10886  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10887  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10888  :( IsUpper_v<MT4> ? i : 0UL ) );
10889  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
10890 
10891  SIMDType xmm1, xmm2, xmm3, xmm4;
10892 
10893  for( size_t k=kbegin; k<kend; ++k ) {
10894  const SIMDType b1( set( B(k,j) ) );
10895  xmm1 += A.load(i ,k) * b1;
10896  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10897  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10898  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10899  }
10900 
10901  C.store( i , j, C.load(i ,j) + xmm1 * factor );
10902  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
10903  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10904  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10905  }
10906  }
10907 
10908  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
10909  {
10910  size_t j( 0UL );
10911 
10912  for( ; (j+2UL) <= N; j+=2UL )
10913  {
10914  const size_t kbegin( ( IsLower_v<MT5> )
10915  ?( ( IsUpper_v<MT4> )
10916  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10917  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10918  :( IsUpper_v<MT4> ? i : 0UL ) );
10919  const size_t kend( ( IsUpper_v<MT5> )
10920  ?( ( IsLower_v<MT4> )
10921  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10922  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10923  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
10924 
10925  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10926 
10927  for( size_t k=kbegin; k<kend; ++k ) {
10928  const SIMDType a1( A.load(i ,k) );
10929  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10930  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10931  const SIMDType b1( set( B(k,j ) ) );
10932  const SIMDType b2( set( B(k,j+1UL) ) );
10933  xmm1 += a1 * b1;
10934  xmm2 += a2 * b1;
10935  xmm3 += a3 * b1;
10936  xmm4 += a1 * b2;
10937  xmm5 += a2 * b2;
10938  xmm6 += a3 * b2;
10939  }
10940 
10941  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10942  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
10943  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10944  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
10945  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
10946  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
10947  }
10948 
10949  if( j < N )
10950  {
10951  const size_t kbegin( ( IsLower_v<MT5> )
10952  ?( ( IsUpper_v<MT4> )
10953  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10954  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10955  :( IsUpper_v<MT4> ? i : 0UL ) );
10956  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
10957 
10958  SIMDType xmm1, xmm2, xmm3;
10959 
10960  for( size_t k=kbegin; k<kend; ++k ) {
10961  const SIMDType b1( set( B(k,j) ) );
10962  xmm1 += A.load(i ,k) * b1;
10963  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10964  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10965  }
10966 
10967  C.store( i , j, C.load(i ,j) + xmm1 * factor );
10968  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
10969  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10970  }
10971  }
10972 
10973  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
10974  {
10975  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
10976  size_t j( UPP ? i : 0UL );
10977 
10978  for( ; (j+4UL) <= jend; j+=4UL )
10979  {
10980  const size_t kbegin( ( IsLower_v<MT5> )
10981  ?( ( IsUpper_v<MT4> )
10982  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10983  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10984  :( IsUpper_v<MT4> ? i : 0UL ) );
10985  const size_t kend( ( IsUpper_v<MT5> )
10986  ?( ( IsLower_v<MT4> )
10987  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
10988  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
10989  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
10990 
10991  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10992 
10993  for( size_t k=kbegin; k<kend; ++k ) {
10994  const SIMDType a1( A.load(i ,k) );
10995  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10996  const SIMDType b1( set( B(k,j ) ) );
10997  const SIMDType b2( set( B(k,j+1UL) ) );
10998  const SIMDType b3( set( B(k,j+2UL) ) );
10999  const SIMDType b4( set( B(k,j+3UL) ) );
11000  xmm1 += a1 * b1;
11001  xmm2 += a2 * b1;
11002  xmm3 += a1 * b2;
11003  xmm4 += a2 * b2;
11004  xmm5 += a1 * b3;
11005  xmm6 += a2 * b3;
11006  xmm7 += a1 * b4;
11007  xmm8 += a2 * b4;
11008  }
11009 
11010  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11011  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
11012  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
11013  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
11014  C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
11015  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
11016  C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
11017  C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
11018  }
11019 
11020  for( ; (j+3UL) <= jend; j+=3UL )
11021  {
11022  const size_t kbegin( ( IsLower_v<MT5> )
11023  ?( ( IsUpper_v<MT4> )
11024  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11025  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11026  :( IsUpper_v<MT4> ? i : 0UL ) );
11027  const size_t kend( ( IsUpper_v<MT5> )
11028  ?( ( IsLower_v<MT4> )
11029  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
11030  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
11031  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
11032 
11033  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11034 
11035  for( size_t k=kbegin; k<kend; ++k ) {
11036  const SIMDType a1( A.load(i ,k) );
11037  const SIMDType a2( A.load(i+SIMDSIZE,k) );
11038  const SIMDType b1( set( B(k,j ) ) );
11039  const SIMDType b2( set( B(k,j+1UL) ) );
11040  const SIMDType b3( set( B(k,j+2UL) ) );
11041  xmm1 += a1 * b1;
11042  xmm2 += a2 * b1;
11043  xmm3 += a1 * b2;
11044  xmm4 += a2 * b2;
11045  xmm5 += a1 * b3;
11046  xmm6 += a2 * b3;
11047  }
11048 
11049  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11050  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
11051  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
11052  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
11053  C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
11054  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
11055  }
11056 
11057  for( ; (j+2UL) <= jend; j+=2UL )
11058  {
11059  const size_t kbegin( ( IsLower_v<MT5> )
11060  ?( ( IsUpper_v<MT4> )
11061  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11062  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11063  :( IsUpper_v<MT4> ? i : 0UL ) );
11064  const size_t kend( ( IsUpper_v<MT5> )
11065  ?( ( IsLower_v<MT4> )
11066  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
11067  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
11068  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
11069 
11070  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11071  size_t k( kbegin );
11072 
11073  for( ; (k+2UL) <= kend; k+=2UL ) {
11074  const SIMDType a1( A.load(i ,k ) );
11075  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
11076  const SIMDType a3( A.load(i ,k+1UL) );
11077  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
11078  const SIMDType b1( set( B(k ,j ) ) );
11079  const SIMDType b2( set( B(k ,j+1UL) ) );
11080  const SIMDType b3( set( B(k+1UL,j ) ) );
11081  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
11082  xmm1 += a1 * b1;
11083  xmm2 += a2 * b1;
11084  xmm3 += a1 * b2;
11085  xmm4 += a2 * b2;
11086  xmm5 += a3 * b3;
11087  xmm6 += a4 * b3;
11088  xmm7 += a3 * b4;
11089  xmm8 += a4 * b4;
11090  }
11091 
11092  for( ; k<kend; ++k ) {
11093  const SIMDType a1( A.load(i ,k) );
11094  const SIMDType a2( A.load(i+SIMDSIZE,k) );
11095  const SIMDType b1( set( B(k,j ) ) );
11096  const SIMDType b2( set( B(k,j+1UL) ) );
11097  xmm1 += a1 * b1;
11098  xmm2 += a2 * b1;
11099  xmm3 += a1 * b2;
11100  xmm4 += a2 * b2;
11101  }
11102 
11103  C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
11104  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
11105  C.store( i , j+1UL, C.load(i ,j+1UL) + (xmm3+xmm7) * factor );
11106  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
11107  }
11108 
11109  if( j < jend )
11110  {
11111  const size_t kbegin( ( IsLower_v<MT5> )
11112  ?( ( IsUpper_v<MT4> )
11113  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11114  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11115  :( IsUpper_v<MT4> ? i : 0UL ) );
11116  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
11117 
11118  SIMDType xmm1, xmm2, xmm3, xmm4;
11119  size_t k( kbegin );
11120 
11121  for( ; (k+2UL) <= kend; k+=2UL ) {
11122  const SIMDType b1( set( B(k ,j) ) );
11123  const SIMDType b2( set( B(k+1UL,j) ) );
11124  xmm1 += A.load(i ,k ) * b1;
11125  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
11126  xmm3 += A.load(i ,k+1UL) * b2;
11127  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
11128  }
11129 
11130  for( ; k<kend; ++k ) {
11131  const SIMDType b1( set( B(k,j) ) );
11132  xmm1 += A.load(i ,k) * b1;
11133  xmm2 += A.load(i+SIMDSIZE,k) * b1;
11134  }
11135 
11136  C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
11137  C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
11138  }
11139  }
11140 
11141  for( ; i<ipos; i+=SIMDSIZE )
11142  {
11143  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
11144  size_t j( UPP ? i : 0UL );
11145 
11146  for( ; (j+4UL) <= jend; j+=4UL )
11147  {
11148  const size_t kbegin( ( IsLower_v<MT5> )
11149  ?( ( IsUpper_v<MT4> )
11150  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11151  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11152  :( IsUpper_v<MT4> ? i : 0UL ) );
11153  const size_t kend( ( IsUpper_v<MT5> )
11154  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
11155  :( K ) );
11156 
11157  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11158  size_t k( kbegin );
11159 
11160  for( ; (k+2UL) <= kend; k+=2UL ) {
11161  const SIMDType a1( A.load(i,k ) );
11162  const SIMDType a2( A.load(i,k+1UL) );
11163  xmm1 += a1 * set( B(k ,j ) );
11164  xmm2 += a1 * set( B(k ,j+1UL) );
11165  xmm3 += a1 * set( B(k ,j+2UL) );
11166  xmm4 += a1 * set( B(k ,j+3UL) );
11167  xmm5 += a2 * set( B(k+1UL,j ) );
11168  xmm6 += a2 * set( B(k+1UL,j+1UL) );
11169  xmm7 += a2 * set( B(k+1UL,j+2UL) );
11170  xmm8 += a2 * set( B(k+1UL,j+3UL) );
11171  }
11172 
11173  for( ; k<kend; ++k ) {
11174  const SIMDType a1( A.load(i,k) );
11175  xmm1 += a1 * set( B(k,j ) );
11176  xmm2 += a1 * set( B(k,j+1UL) );
11177  xmm3 += a1 * set( B(k,j+2UL) );
11178  xmm4 += a1 * set( B(k,j+3UL) );
11179  }
11180 
11181  C.store( i, j , C.load(i,j ) + (xmm1+xmm5) * factor );
11182  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm6) * factor );
11183  C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm7) * factor );
11184  C.store( i, j+3UL, C.load(i,j+3UL) + (xmm4+xmm8) * factor );
11185  }
11186 
11187  for( ; (j+3UL) <= jend; j+=3UL )
11188  {
11189  const size_t kbegin( ( IsLower_v<MT5> )
11190  ?( ( IsUpper_v<MT4> )
11191  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11192  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11193  :( IsUpper_v<MT4> ? i : 0UL ) );
11194  const size_t kend( ( IsUpper_v<MT5> )
11195  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
11196  :( K ) );
11197 
11198  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11199  size_t k( kbegin );
11200 
11201  for( ; (k+2UL) <= kend; k+=2UL ) {
11202  const SIMDType a1( A.load(i,k ) );
11203  const SIMDType a2( A.load(i,k+1UL) );
11204  xmm1 += a1 * set( B(k ,j ) );
11205  xmm2 += a1 * set( B(k ,j+1UL) );
11206  xmm3 += a1 * set( B(k ,j+2UL) );
11207  xmm4 += a2 * set( B(k+1UL,j ) );
11208  xmm5 += a2 * set( B(k+1UL,j+1UL) );
11209  xmm6 += a2 * set( B(k+1UL,j+2UL) );
11210  }
11211 
11212  for( ; k<kend; ++k ) {
11213  const SIMDType a1( A.load(i,k) );
11214  xmm1 += a1 * set( B(k,j ) );
11215  xmm2 += a1 * set( B(k,j+1UL) );
11216  xmm3 += a1 * set( B(k,j+2UL) );
11217  }
11218 
11219  C.store( i, j , C.load(i,j ) + (xmm1+xmm4) * factor );
11220  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm5) * factor );
11221  C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm6) * factor );
11222  }
11223 
11224  for( ; (j+2UL) <= jend; j+=2UL )
11225  {
11226  const size_t kbegin( ( IsLower_v<MT5> )
11227  ?( ( IsUpper_v<MT4> )
11228  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11229  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11230  :( IsUpper_v<MT4> ? i : 0UL ) );
11231  const size_t kend( ( IsUpper_v<MT5> )
11232  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
11233  :( K ) );
11234 
11235  SIMDType xmm1, xmm2, xmm3, xmm4;
11236  size_t k( kbegin );
11237 
11238  for( ; (k+2UL) <= kend; k+=2UL ) {
11239  const SIMDType a1( A.load(i,k ) );
11240  const SIMDType a2( A.load(i,k+1UL) );
11241  xmm1 += a1 * set( B(k ,j ) );
11242  xmm2 += a1 * set( B(k ,j+1UL) );
11243  xmm3 += a2 * set( B(k+1UL,j ) );
11244  xmm4 += a2 * set( B(k+1UL,j+1UL) );
11245  }
11246 
11247  for( ; k<kend; ++k ) {
11248  const SIMDType a1( A.load(i,k) );
11249  xmm1 += a1 * set( B(k,j ) );
11250  xmm2 += a1 * set( B(k,j+1UL) );
11251  }
11252 
11253  C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
11254  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm4) * factor );
11255  }
11256 
11257  if( j < jend )
11258  {
11259  const size_t kbegin( ( IsLower_v<MT5> )
11260  ?( ( IsUpper_v<MT4> )
11261  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11262  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11263  :( IsUpper_v<MT4> ? i : 0UL ) );
11264 
11265  SIMDType xmm1, xmm2;
11266  size_t k( kbegin );
11267 
11268  for( ; (k+2UL) <= K; k+=2UL ) {
11269  xmm1 += A.load(i,k ) * set( B(k ,j) );
11270  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
11271  }
11272 
11273  for( ; k<K; ++k ) {
11274  xmm1 += A.load(i,k) * set( B(k,j) );
11275  }
11276 
11277  C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
11278  }
11279  }
11280 
11281  for( ; remainder && i<M; ++i )
11282  {
11283  const size_t jend( LOW ? i+1UL : N );
11284  size_t j( UPP ? i : 0UL );
11285 
11286  for( ; (j+2UL) <= jend; j+=2UL )
11287  {
11288  const size_t kbegin( ( IsLower_v<MT5> )
11289  ?( ( IsUpper_v<MT4> )
11290  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11291  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11292  :( IsUpper_v<MT4> ? i : 0UL ) );
11293  const size_t kend( ( IsUpper_v<MT5> )
11294  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
11295  :( K ) );
11296 
11297  ElementType value1{};
11298  ElementType value2{};
11299 
11300  for( size_t k=kbegin; k<kend; ++k ) {
11301  value1 += A(i,k) * B(k,j );
11302  value2 += A(i,k) * B(k,j+1UL);
11303  }
11304 
11305  C(i,j ) += value1 * scalar;
11306  C(i,j+1UL) += value2 * scalar;
11307  }
11308 
11309  if( j < jend )
11310  {
11311  const size_t kbegin( ( IsLower_v<MT5> )
11312  ?( ( IsUpper_v<MT4> )
11313  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11314  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11315  :( IsUpper_v<MT4> ? i : 0UL ) );
11316 
11317  ElementType value{};
11318 
11319  for( size_t k=kbegin; k<K; ++k ) {
11320  value += A(i,k) * B(k,j);
11321  }
11322 
11323  C(i,j) += value * scalar;
11324  }
11325  }
11326  }
11327  //**********************************************************************************************
11328 
11329  //**Default addition assignment to dense matrices (large matrices)******************************
11343  template< typename MT3 // Type of the left-hand side target matrix
11344  , typename MT4 // Type of the left-hand side matrix operand
11345  , typename MT5 // Type of the right-hand side matrix operand
11346  , typename ST2 > // Type of the scalar value
11347  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11348  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11349  {
11350  selectDefaultAddAssignKernel( C, A, B, scalar );
11351  }
11352  //**********************************************************************************************
11353 
11354  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
11369  template< typename MT3 // Type of the left-hand side target matrix
11370  , typename MT4 // Type of the left-hand side matrix operand
11371  , typename MT5 // Type of the right-hand side matrix operand
11372  , typename ST2 > // Type of the scalar value
11373  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11374  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11375  {
11376  if( LOW )
11377  lmmm( C, A, B, scalar, ST2(1) );
11378  else if( UPP )
11379  ummm( C, A, B, scalar, ST2(1) );
11380  else
11381  mmm( C, A, B, scalar, ST2(1) );
11382  }
11383  //**********************************************************************************************
11384 
11385  //**BLAS-based addition assignment to dense matrices (default)**********************************
11399  template< typename MT3 // Type of the left-hand side target matrix
11400  , typename MT4 // Type of the left-hand side matrix operand
11401  , typename MT5 // Type of the right-hand side matrix operand
11402  , typename ST2 > // Type of the scalar value
11403  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11404  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
11405  {
11406  selectLargeAddAssignKernel( C, A, B, scalar );
11407  }
11408  //**********************************************************************************************
11409 
11410  //**BLAS-based addition assignment to dense matrices********************************************
11411 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
11412 
11425  template< typename MT3 // Type of the left-hand side target matrix
11426  , typename MT4 // Type of the left-hand side matrix operand
11427  , typename MT5 // Type of the right-hand side matrix operand
11428  , typename ST2 > // Type of the scalar value
11429  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11430  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
11431  {
11432  using ET = ElementType_t<MT3>;
11433 
11434  if( IsTriangular_v<MT4> ) {
11435  ResultType_t<MT3> tmp( serial( B ) );
11436  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
11437  addAssign( C, tmp );
11438  }
11439  else if( IsTriangular_v<MT5> ) {
11440  ResultType_t<MT3> tmp( serial( A ) );
11441  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
11442  addAssign( C, tmp );
11443  }
11444  else {
11445  gemm( C, A, B, ET(scalar), ET(1) );
11446  }
11447  }
11448 #endif
11449  //**********************************************************************************************
11450 
11451  //**Addition assignment to sparse matrices******************************************************
11452  // No special implementation for the addition assignment to sparse matrices.
11453  //**********************************************************************************************
11454 
11455  //**Subtraction assignment to dense matrices****************************************************
11467  template< typename MT // Type of the target dense matrix
11468  , bool SO > // Storage order of the target dense matrix
11469  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11470  {
11472 
11473  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11474  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11475 
11476  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
11477  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
11478 
11479  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
11480  return;
11481  }
11482 
11483  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
11484  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
11485 
11486  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11487  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11488  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11489  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11490  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11491  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11492 
11493  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
11494  }
11495  //**********************************************************************************************
11496 
11497  //**Subtraction assignment to dense matrices (kernel selection)*********************************
11508  template< typename MT3 // Type of the left-hand side target matrix
11509  , typename MT4 // Type of the left-hand side matrix operand
11510  , typename MT5 // Type of the right-hand side matrix operand
11511  , typename ST2 > // Type of the scalar value
11512  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11513  {
11514  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
11515  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
11516  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
11517  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
11518  selectSmallSubAssignKernel( C, A, B, scalar );
11519  else
11520  selectBlasSubAssignKernel( C, A, B, scalar );
11521  }
11522  //**********************************************************************************************
11523 
11524  //**Default subtraction assignment to dense matrices********************************************
11538  template< typename MT3 // Type of the left-hand side target matrix
11539  , typename MT4 // Type of the left-hand side matrix operand
11540  , typename MT5 // Type of the right-hand side matrix operand
11541  , typename ST2 > // Type of the scalar value
11542  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11543  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11544  {
11545  const ResultType tmp( serial( A * B * scalar ) );
11546  subAssign( C, tmp );
11547  }
11548  //**********************************************************************************************
11549 
11550  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
11564  template< typename MT3 // Type of the left-hand side target matrix
11565  , typename MT4 // Type of the left-hand side matrix operand
11566  , typename MT5 // Type of the right-hand side matrix operand
11567  , typename ST2 > // Type of the scalar value
11568  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11569  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11570  {
11571  constexpr size_t block( BLOCK_SIZE );
11572 
11573  const size_t M( A.rows() );
11574  const size_t N( B.columns() );
11575 
11576  for( size_t ii=0UL; ii<M; ii+=block ) {
11577  const size_t iend( min( M, ii+block ) );
11578  for( size_t jj=0UL; jj<N; jj+=block ) {
11579  const size_t jend( min( N, jj+block ) );
11580  for( size_t i=ii; i<iend; ++i )
11581  {
11582  const size_t jbegin( ( IsUpper_v<MT4> )
11583  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
11584  :( jj ) );
11585  const size_t jpos( ( IsLower_v<MT4> )
11586  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
11587  :( jend ) );
11588 
11589  for( size_t j=jbegin; j<jpos; ++j ) {
11590  C(i,j) -= A(i,j) * B(j,j) * scalar;
11591  }
11592  }
11593  }
11594  }
11595  }
11596  //**********************************************************************************************
11597 
11598  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
11612  template< typename MT3 // Type of the left-hand side target matrix
11613  , typename MT4 // Type of the left-hand side matrix operand
11614  , typename MT5 // Type of the right-hand side matrix operand
11615  , typename ST2 > // Type of the scalar value
11616  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11617  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11618  {
11619  const size_t M( A.rows() );
11620  const size_t N( B.columns() );
11621 
11622  for( size_t j=0UL; j<N; ++j )
11623  {
11624  const size_t ibegin( ( IsLower_v<MT4> )
11625  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
11626  :( 0UL ) );
11627  const size_t iend( ( IsUpper_v<MT4> )
11628  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
11629  :( M ) );
11630  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
11631 
11632  const size_t inum( iend - ibegin );
11633  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
11634 
11635  for( size_t i=ibegin; i<ipos; i+=2UL ) {
11636  C(i ,j) -= A(i ,j) * B(j,j) * scalar;
11637  C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
11638  }
11639  if( ipos < iend ) {
11640  C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
11641  }
11642  }
11643  }
11644  //**********************************************************************************************
11645 
11646  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
11660  template< typename MT3 // Type of the left-hand side target matrix
11661  , typename MT4 // Type of the left-hand side matrix operand
11662  , typename MT5 // Type of the right-hand side matrix operand
11663  , typename ST2 > // Type of the scalar value
11664  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11665  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11666  {
11667  const size_t M( A.rows() );
11668  const size_t N( B.columns() );
11669 
11670  for( size_t i=0UL; i<M; ++i )
11671  {
11672  const size_t jbegin( ( IsUpper_v<MT5> )
11673  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
11674  :( 0UL ) );
11675  const size_t jend( ( IsLower_v<MT5> )
11676  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
11677  :( N ) );
11678  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
11679 
11680  const size_t jnum( jend - jbegin );
11681  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
11682 
11683  for( size_t j=jbegin; j<jpos; j+=2UL ) {
11684  C(i,j ) -= A(i,i) * B(i,j ) * scalar;
11685  C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
11686  }
11687  if( jpos < jend ) {
11688  C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
11689  }
11690  }
11691  }
11692  //**********************************************************************************************
11693 
11694  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
11708  template< typename MT3 // Type of the left-hand side target matrix
11709  , typename MT4 // Type of the left-hand side matrix operand
11710  , typename MT5 // Type of the right-hand side matrix operand
11711  , typename ST2 > // Type of the scalar value
11712  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11713  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11714  {
11715  constexpr size_t block( BLOCK_SIZE );
11716 
11717  const size_t M( A.rows() );
11718  const size_t N( B.columns() );
11719 
11720  for( size_t jj=0UL; jj<N; jj+=block ) {
11721  const size_t jend( min( N, jj+block ) );
11722  for( size_t ii=0UL; ii<M; ii+=block ) {
11723  const size_t iend( min( M, ii+block ) );
11724  for( size_t j=jj; j<jend; ++j )
11725  {
11726  const size_t ibegin( ( IsLower_v<MT5> )
11727  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
11728  :( ii ) );
11729  const size_t ipos( ( IsUpper_v<MT5> )
11730  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
11731  :( iend ) );
11732 
11733  for( size_t i=ibegin; i<ipos; ++i ) {
11734  C(i,j) -= A(i,i) * B(i,j) * scalar;
11735  }
11736  }
11737  }
11738  }
11739  }
11740  //**********************************************************************************************
11741 
11742  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
11756  template< typename MT3 // Type of the left-hand side target matrix
11757  , typename MT4 // Type of the left-hand side matrix operand
11758  , typename MT5 // Type of the right-hand side matrix operand
11759  , typename ST2 > // Type of the scalar value
11760  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11761  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11762  {
11763  for( size_t i=0UL; i<A.rows(); ++i ) {
11764  C(i,i) -= A(i,i) * B(i,i) * scalar;
11765  }
11766  }
11767  //**********************************************************************************************
11768 
11769  //**Default subtraction assignment to dense matrices (small matrices)***************************
11783  template< typename MT3 // Type of the left-hand side target matrix
11784  , typename MT4 // Type of the left-hand side matrix operand
11785  , typename MT5 // Type of the right-hand side matrix operand
11786  , typename ST2 > // Type of the scalar value
11787  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11788  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11789  {
11790  selectDefaultSubAssignKernel( C, A, B, scalar );
11791  }
11792  //**********************************************************************************************
11793 
11794  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
11809  template< typename MT3 // Type of the left-hand side target matrix
11810  , typename MT4 // Type of the left-hand side matrix operand
11811  , typename MT5 // Type of the right-hand side matrix operand
11812  , typename ST2 > // Type of the scalar value
11813  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11814  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11815  {
11816  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
11817 
11818  const size_t M( A.rows() );
11819  const size_t N( B.columns() );
11820  const size_t K( A.columns() );
11821 
11822  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
11823 
11824  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
11825  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
11826 
11827  const SIMDType factor( set( scalar ) );
11828 
11829  size_t j( 0UL );
11830 
11831  if( IsIntegral_v<ElementType> )
11832  {
11833  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
11834  for( size_t i=0UL; i<M; ++i )
11835  {
11836  const size_t kbegin( ( IsUpper_v<MT4> )
11837  ?( ( IsLower_v<MT5> )
11838  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11839  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11840  :( IsLower_v<MT5> ? j : 0UL ) );
11841  const size_t kend( ( IsLower_v<MT4> )
11842  ?( ( IsUpper_v<MT5> )
11843  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
11844  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
11845  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
11846 
11847  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11848 
11849  for( size_t k=kbegin; k<kend; ++k ) {
11850  const SIMDType a1( set( A(i,k) ) );
11851  xmm1 += a1 * B.load(k,j );
11852  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11853  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11854  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11855  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11856  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
11857  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
11858  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
11859  }
11860 
11861  C.store( i, j , C.load(i,j ) - xmm1 * factor );
11862  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
11863  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11864  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11865  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11866  C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
11867  C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
11868  C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
11869  }
11870  }
11871  }
11872 
11873  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
11874  {
11875  size_t i( 0UL );
11876 
11877  for( ; (i+2UL) <= M; i+=2UL )
11878  {
11879  const size_t kbegin( ( IsUpper_v<MT4> )
11880  ?( ( IsLower_v<MT5> )
11881  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11882  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11883  :( IsLower_v<MT5> ? j : 0UL ) );
11884  const size_t kend( ( IsLower_v<MT4> )
11885  ?( ( IsUpper_v<MT5> )
11886  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
11887  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11888  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
11889 
11890  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
11891 
11892  for( size_t k=kbegin; k<kend; ++k ) {
11893  const SIMDType a1( set( A(i ,k) ) );
11894  const SIMDType a2( set( A(i+1UL,k) ) );
11895  const SIMDType b1( B.load(k,j ) );
11896  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11897  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11898  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11899  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
11900  xmm1 += a1 * b1;
11901  xmm2 += a1 * b2;
11902  xmm3 += a1 * b3;
11903  xmm4 += a1 * b4;
11904  xmm5 += a1 * b5;
11905  xmm6 += a2 * b1;
11906  xmm7 += a2 * b2;
11907  xmm8 += a2 * b3;
11908  xmm9 += a2 * b4;
11909  xmm10 += a2 * b5;
11910  }
11911 
11912  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11913  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
11914  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11915  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11916  C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
11917  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
11918  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
11919  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
11920  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
11921  C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
11922  }
11923 
11924  if( i < M )
11925  {
11926  const size_t kbegin( ( IsUpper_v<MT4> )
11927  ?( ( IsLower_v<MT5> )
11928  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11929  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11930  :( IsLower_v<MT5> ? j : 0UL ) );
11931  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
11932 
11933  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
11934 
11935  for( size_t k=kbegin; k<kend; ++k ) {
11936  const SIMDType a1( set( A(i,k) ) );
11937  xmm1 += a1 * B.load(k,j );
11938  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11939  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11940  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11941  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11942  }
11943 
11944  C.store( i, j , C.load(i,j ) - xmm1 * factor );
11945  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
11946  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11947  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11948  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11949  }
11950  }
11951 
11952  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
11953  {
11954  size_t i( 0UL );
11955 
11956  for( ; (i+2UL) <= M; i+=2UL )
11957  {
11958  const size_t kbegin( ( IsUpper_v<MT4> )
11959  ?( ( IsLower_v<MT5> )
11960  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11961  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11962  :( IsLower_v<MT5> ? j : 0UL ) );
11963  const size_t kend( ( IsLower_v<MT4> )
11964  ?( ( IsUpper_v<MT5> )
11965  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
11966  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11967  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
11968 
11969  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11970 
11971  for( size_t k=kbegin; k<kend; ++k ) {
11972  const SIMDType a1( set( A(i ,k) ) );
11973  const SIMDType a2( set( A(i+1UL,k) ) );
11974  const SIMDType b1( B.load(k,j ) );
11975  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11976  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11977  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11978  xmm1 += a1 * b1;
11979  xmm2 += a1 * b2;
11980  xmm3 += a1 * b3;
11981  xmm4 += a1 * b4;
11982  xmm5 += a2 * b1;
11983  xmm6 += a2 * b2;
11984  xmm7 += a2 * b3;
11985  xmm8 += a2 * b4;
11986  }
11987 
11988  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11989  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
11990  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11991  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11992  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
11993  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
11994  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
11995  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
11996  }
11997 
11998  if( i < M )
11999  {
12000  const size_t kbegin( ( IsUpper_v<MT4> )
12001  ?( ( IsLower_v<MT5> )
12002  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12003  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12004  :( IsLower_v<MT5> ? j : 0UL ) );
12005  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
12006 
12007  SIMDType xmm1, xmm2, xmm3, xmm4;
12008 
12009  for( size_t k=kbegin; k<kend; ++k ) {
12010  const SIMDType a1( set( A(i,k) ) );
12011  xmm1 += a1 * B.load(k,j );
12012  xmm2 += a1 * B.load(k,j+SIMDSIZE );
12013  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
12014  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
12015  }
12016 
12017  C.store( i, j , C.load(i,j ) - xmm1 * factor );
12018  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
12019  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
12020  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
12021  }
12022  }
12023 
12024  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
12025  {
12026  size_t i( 0UL );
12027 
12028  for( ; (i+2UL) <= M; i+=2UL )
12029  {
12030  const size_t kbegin( ( IsUpper_v<MT4> )
12031  ?( ( IsLower_v<MT5> )
12032  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12033  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12034  :( IsLower_v<MT5> ? j : 0UL ) );
12035  const size_t kend( ( IsLower_v<MT4> )
12036  ?( ( IsUpper_v<MT5> )
12037  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
12038  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
12039  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
12040 
12041  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12042 
12043  for( size_t k=kbegin; k<kend; ++k ) {
12044  const SIMDType a1( set( A(i ,k) ) );
12045  const SIMDType a2( set( A(i+1UL,k) ) );
12046  const SIMDType b1( B.load(k,j ) );
12047  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
12048  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
12049  xmm1 += a1 * b1;
12050  xmm2 += a1 * b2;
12051  xmm3 += a1 * b3;
12052  xmm4 += a2 * b1;
12053  xmm5 += a2 * b2;
12054  xmm6 += a2 * b3;
12055  }
12056 
12057  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12058  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
12059  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
12060  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
12061  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
12062  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
12063  }
12064 
12065  if( i < M )
12066  {
12067  const size_t kbegin( ( IsUpper_v<MT4> )
12068  ?( ( IsLower_v<MT5> )
12069  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12070  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12071  :( IsLower_v<MT5> ? j : 0UL ) );
12072  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
12073 
12074  SIMDType xmm1, xmm2, xmm3;
12075 
12076  for( size_t k=kbegin; k<kend; ++k ) {
12077  const SIMDType a1( set( A(i,k) ) );
12078  xmm1 += a1 * B.load(k,j );
12079  xmm2 += a1 * B.load(k,j+SIMDSIZE );
12080  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
12081  }
12082 
12083  C.store( i, j , C.load(i,j ) - xmm1 * factor );
12084  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
12085  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
12086  }
12087  }
12088 
12089  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
12090  {
12091  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
12092  size_t i( LOW ? j : 0UL );
12093 
12094  for( ; (i+4UL) <= iend; i+=4UL )
12095  {
12096  const size_t kbegin( ( IsUpper_v<MT4> )
12097  ?( ( IsLower_v<MT5> )
12098  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12099  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12100  :( IsLower_v<MT5> ? j : 0UL ) );
12101  const size_t kend( ( IsLower_v<MT4> )
12102  ?( ( IsUpper_v<MT5> )
12103  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
12104  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
12105  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
12106 
12107  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12108 
12109  for( size_t k=kbegin; k<kend; ++k ) {
12110  const SIMDType a1( set( A(i ,k) ) );
12111  const SIMDType a2( set( A(i+1UL,k) ) );
12112  const SIMDType a3( set( A(i+2UL,k) ) );
12113  const SIMDType a4( set( A(i+3UL,k) ) );
12114  const SIMDType b1( B.load(k,j ) );
12115  const SIMDType b2( B.load(k,j+SIMDSIZE) );
12116  xmm1 += a1 * b1;
12117  xmm2 += a1 * b2;
12118  xmm3 += a2 * b1;
12119  xmm4 += a2 * b2;
12120  xmm5 += a3 * b1;
12121  xmm6 += a3 * b2;
12122  xmm7 += a4 * b1;
12123  xmm8 += a4 * b2;
12124  }
12125 
12126  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12127  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
12128  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
12129  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
12130  C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
12131  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
12132  C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
12133  C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
12134  }
12135 
12136  for( ; (i+3UL) <= iend; i+=3UL )
12137  {
12138  const size_t kbegin( ( IsUpper_v<MT4> )
12139  ?( ( IsLower_v<MT5> )
12140  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12141  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12142  :( IsLower_v<MT5> ? j : 0UL ) );
12143  const size_t kend( ( IsLower_v<MT4> )
12144  ?( ( IsUpper_v<MT5> )
12145  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
12146  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
12147  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
12148 
12149  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12150 
12151  for( size_t k=kbegin; k<kend; ++k ) {
12152  const SIMDType a1( set( A(i ,k) ) );
12153  const SIMDType a2( set( A(i+1UL,k) ) );
12154  const SIMDType a3( set( A(i+2UL,k) ) );
12155  const SIMDType b1( B.load(k,j ) );
12156  const SIMDType b2( B.load(k,j+SIMDSIZE) );
12157  xmm1 += a1 * b1;
12158  xmm2 += a1 * b2;
12159  xmm3 += a2 * b1;
12160  xmm4 += a2 * b2;
12161  xmm5 += a3 * b1;
12162  xmm6 += a3 * b2;
12163  }
12164 
12165  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12166  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
12167  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
12168  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
12169  C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
12170  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
12171  }
12172 
12173  for( ; (i+2UL) <= iend; i+=2UL )
12174  {
12175  const size_t kbegin( ( IsUpper_v<MT4> )
12176  ?( ( IsLower_v<MT5> )
12177  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12178  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12179  :( IsLower_v<MT5> ? j : 0UL ) );
12180  const size_t kend( ( IsLower_v<MT4> )
12181  ?( ( IsUpper_v<MT5> )
12182  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
12183  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
12184  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
12185 
12186  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12187  size_t k( kbegin );
12188 
12189  for( ; (k+2UL) <= kend; k+=2UL ) {
12190  const SIMDType a1( set( A(i ,k ) ) );
12191  const SIMDType a2( set( A(i+1UL,k ) ) );
12192  const SIMDType a3( set( A(i ,k+1UL) ) );
12193  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
12194  const SIMDType b1( B.load(k ,j ) );
12195  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
12196  const SIMDType b3( B.load(k+1UL,j ) );
12197  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
12198  xmm1 += a1 * b1;
12199  xmm2 += a1 * b2;
12200  xmm3 += a2 * b1;
12201  xmm4 += a2 * b2;
12202  xmm5 += a3 * b3;
12203  xmm6 += a3 * b4;
12204  xmm7 += a4 * b3;
12205  xmm8 += a4 * b4;
12206  }
12207 
12208  for( ; k<kend; ++k ) {
12209  const SIMDType a1( set( A(i ,k) ) );
12210  const SIMDType a2( set( A(i+1UL,k) ) );
12211  const SIMDType b1( B.load(k,j ) );
12212  const SIMDType b2( B.load(k,j+SIMDSIZE) );
12213  xmm1 += a1 * b1;
12214  xmm2 += a1 * b2;
12215  xmm3 += a2 * b1;
12216  xmm4 += a2 * b2;
12217  }
12218 
12219  C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
12220  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - (xmm2+xmm6) * factor );
12221  C.store( i+1UL, j , C.load(i+1UL,j ) - (xmm3+xmm7) * factor );
12222  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - (xmm4+xmm8) * factor );
12223  }
12224 
12225  if( i < iend )
12226  {
12227  const size_t kbegin( ( IsUpper_v<MT4> )
12228  ?( ( IsLower_v<MT5> )
12229  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12230  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12231  :( IsLower_v<MT5> ? j : 0UL ) );
12232  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
12233 
12234  SIMDType xmm1, xmm2, xmm3, xmm4;
12235  size_t k( kbegin );
12236 
12237  for( ; (k+2UL) <= kend; k+=2UL ) {
12238  const SIMDType a1( set( A(i,k ) ) );
12239  const SIMDType a2( set( A(i,k+1UL) ) );
12240  xmm1 += a1 * B.load(k ,j );
12241  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
12242  xmm3 += a2 * B.load(k+1UL,j );
12243  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
12244  }
12245 
12246  for( ; k<kend; ++k ) {
12247  const SIMDType a1( set( A(i,k) ) );
12248  xmm1 += a1 * B.load(k,j );
12249  xmm2 += a1 * B.load(k,j+SIMDSIZE);
12250  }
12251 
12252  C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
12253  C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) - (xmm2+xmm4) * factor );
12254  }
12255  }
12256 
12257  for( ; j<jpos; j+=SIMDSIZE )
12258  {
12259  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
12260  size_t i( LOW ? j : 0UL );
12261 
12262  for( ; (i+4UL) <= iend; i+=4UL )
12263  {
12264  const size_t kbegin( ( IsUpper_v<MT4> )
12265  ?( ( IsLower_v<MT5> )
12266  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12267  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12268  :( IsLower_v<MT5> ? j : 0UL ) );
12269  const size_t kend( ( IsLower_v<MT4> )
12270  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
12271  :( K ) );
12272 
12273  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12274  size_t k( kbegin );
12275 
12276  for( ; (k+2UL) <= kend; k+=2UL ) {
12277  const SIMDType b1( B.load(k ,j) );
12278  const SIMDType b2( B.load(k+1UL,j) );
12279  xmm1 += set( A(i ,k ) ) * b1;
12280  xmm2 += set( A(i+1UL,k ) ) * b1;
12281  xmm3 += set( A(i+2UL,k ) ) * b1;
12282  xmm4 += set( A(i+3UL,k ) ) * b1;
12283  xmm5 += set( A(i ,k+1UL) ) * b2;
12284  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
12285  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
12286  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
12287  }
12288 
12289  for( ; k<kend; ++k ) {
12290  const SIMDType b1( B.load(k,j) );
12291  xmm1 += set( A(i ,k) ) * b1;
12292  xmm2 += set( A(i+1UL,k) ) * b1;
12293  xmm3 += set( A(i+2UL,k) ) * b1;
12294  xmm4 += set( A(i+3UL,k) ) * b1;
12295  }
12296 
12297  C.store( i , j, C.load(i ,j) - (xmm1+xmm5) * factor );
12298  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm6) * factor );
12299  C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm7) * factor );
12300  C.store( i+3UL, j, C.load(i+3UL,j) - (xmm4+xmm8) * factor );
12301  }
12302 
12303  for( ; (i+3UL) <= iend; i+=3UL )
12304  {
12305  const size_t kbegin( ( IsUpper_v<MT4> )
12306  ?( ( IsLower_v<MT5> )
12307  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12308  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12309  :( IsLower_v<MT5> ? j : 0UL ) );
12310  const size_t kend( ( IsLower_v<MT4> )
12311  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
12312  :( K ) );
12313 
12314  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12315  size_t k( kbegin );
12316 
12317  for( ; (k+2UL) <= kend; k+=2UL ) {
12318  const SIMDType b1( B.load(k ,j) );
12319  const SIMDType b2( B.load(k+1UL,j) );
12320  xmm1 += set( A(i ,k ) ) * b1;
12321  xmm2 += set( A(i+1UL,k ) ) * b1;
12322  xmm3 += set( A(i+2UL,k ) ) * b1;
12323  xmm4 += set( A(i ,k+1UL) ) * b2;
12324  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
12325  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
12326  }
12327 
12328  for( ; k<kend; ++k ) {
12329  const SIMDType b1( B.load(k,j) );
12330  xmm1 += set( A(i ,k) ) * b1;
12331  xmm2 += set( A(i+1UL,k) ) * b1;
12332  xmm3 += set( A(i+2UL,k) ) * b1;
12333  }
12334 
12335  C.store( i , j, C.load(i ,j) - (xmm1+xmm4) * factor );
12336  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm5) * factor );
12337  C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm6) * factor );
12338  }
12339 
12340  for( ; (i+2UL) <= iend; i+=2UL )
12341  {
12342  const size_t kbegin( ( IsUpper_v<MT4> )
12343  ?( ( IsLower_v<MT5> )
12344  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12345  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12346  :( IsLower_v<MT5> ? j : 0UL ) );
12347  const size_t kend( ( IsLower_v<MT4> )
12348  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
12349  :( K ) );
12350 
12351  SIMDType xmm1, xmm2, xmm3, xmm4;
12352  size_t k( kbegin );
12353 
12354  for( ; (k+2UL) <= kend; k+=2UL ) {
12355  const SIMDType b1( B.load(k ,j) );
12356  const SIMDType b2( B.load(k+1UL,j) );
12357  xmm1 += set( A(i ,k ) ) * b1;
12358  xmm2 += set( A(i+1UL,k ) ) * b1;
12359  xmm3 += set( A(i ,k+1UL) ) * b2;
12360  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
12361  }
12362 
12363  for( ; k<kend; ++k ) {
12364  const SIMDType b1( B.load(k,j) );
12365  xmm1 += set( A(i ,k) ) * b1;
12366  xmm2 += set( A(i+1UL,k) ) * b1;
12367  }
12368 
12369  C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
12370  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm4) * factor );
12371  }
12372 
12373  if( i < iend )
12374  {
12375  const size_t kbegin( ( IsUpper_v<MT4> )
12376  ?( ( IsLower_v<MT5> )
12377  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12378  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12379  :( IsLower_v<MT5> ? j : 0UL ) );
12380 
12381  SIMDType xmm1, xmm2;
12382  size_t k( kbegin );
12383 
12384  for( ; (k+2UL) <= K; k+=2UL ) {
12385  xmm1 += set( A(i,k ) ) * B.load(k ,j);
12386  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
12387  }
12388 
12389  for( ; k<K; ++k ) {
12390  xmm1 += set( A(i,k) ) * B.load(k,j);
12391  }
12392 
12393  C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
12394  }
12395  }
12396 
12397  for( ; remainder && j<N; ++j )
12398  {
12399  const size_t iend( UPP ? j+1UL : M );
12400  size_t i( LOW ? j : 0UL );
12401 
12402  for( ; (i+2UL) <= iend; i+=2UL )
12403  {
12404  const size_t kbegin( ( IsUpper_v<MT4> )
12405  ?( ( IsLower_v<MT5> )
12406  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12407  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12408  :( IsLower_v<MT5> ? j : 0UL ) );
12409  const size_t kend( ( IsLower_v<MT4> )
12410  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
12411  :( K ) );
12412 
12413  ElementType value1{};
12414  ElementType value2{};
12415 
12416  for( size_t k=kbegin; k<kend; ++k ) {
12417  value1 += A(i ,k) * B(k,j);
12418  value2 += A(i+1UL,k) * B(k,j);
12419  }
12420 
12421  C(i ,j) -= value1 * scalar;
12422  C(i+1UL,j) -= value2 * scalar;
12423  }
12424 
12425  if( i < iend )
12426  {
12427  const size_t kbegin( ( IsUpper_v<MT4> )
12428  ?( ( IsLower_v<MT5> )
12429  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12430  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12431  :( IsLower_v<MT5> ? j : 0UL ) );
12432 
12433  ElementType value{};
12434 
12435  for( size_t k=kbegin; k<K; ++k ) {
12436  value += A(i,k) * B(k,j);
12437  }
12438 
12439  C(i,j) -= value * scalar;
12440  }
12441  }
12442  }
12443  //**********************************************************************************************
12444 
12445  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
12460  template< typename MT3 // Type of the left-hand side target matrix
12461  , typename MT4 // Type of the left-hand side matrix operand
12462  , typename MT5 // Type of the right-hand side matrix operand
12463  , typename ST2 > // Type of the scalar value
12464  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12465  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12466  {
12467  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
12468 
12469  const size_t M( A.rows() );
12470  const size_t N( B.columns() );
12471  const size_t K( A.columns() );
12472 
12473  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
12474 
12475  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
12476  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
12477 
12478  const SIMDType factor( set( scalar ) );
12479 
12480  size_t i( 0UL );
12481 
12482  if( IsIntegral_v<ElementType> )
12483  {
12484  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
12485  for( size_t j=0UL; j<N; ++j )
12486  {
12487  const size_t kbegin( ( IsLower_v<MT5> )
12488  ?( ( IsUpper_v<MT4> )
12489  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12490  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12491  :( IsUpper_v<MT4> ? i : 0UL ) );
12492  const size_t kend( ( IsUpper_v<MT5> )
12493  ?( ( IsLower_v<MT4> )
12494  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
12495  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
12496  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
12497 
12498  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12499 
12500  for( size_t k=kbegin; k<kend; ++k ) {
12501  const SIMDType b1( set( B(k,j) ) );
12502  xmm1 += A.load(i ,k) * b1;
12503  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12504  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12505  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12506  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12507  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
12508  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
12509  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
12510  }
12511 
12512  C.store( i , j, C.load(i ,j) - xmm1 * factor );
12513  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
12514  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12515  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12516  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12517  C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
12518  C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
12519  C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
12520  }
12521  }
12522  }
12523 
12524  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
12525  {
12526  size_t j( 0UL );
12527 
12528  for( ; (j+2UL) <= N; j+=2UL )
12529  {
12530  const size_t kbegin( ( IsLower_v<MT5> )
12531  ?( ( IsUpper_v<MT4> )
12532  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12533  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12534  :( IsUpper_v<MT4> ? i : 0UL ) );
12535  const size_t kend( ( IsUpper_v<MT5> )
12536  ?( ( IsLower_v<MT4> )
12537  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12538  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12539  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
12540 
12541  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
12542 
12543  for( size_t k=kbegin; k<kend; ++k ) {
12544  const SIMDType a1( A.load(i ,k) );
12545  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12546  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12547  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12548  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
12549  const SIMDType b1( set( B(k,j ) ) );
12550  const SIMDType b2( set( B(k,j+1UL) ) );
12551  xmm1 += a1 * b1;
12552  xmm2 += a2 * b1;
12553  xmm3 += a3 * b1;
12554  xmm4 += a4 * b1;
12555  xmm5 += a5 * b1;
12556  xmm6 += a1 * b2;
12557  xmm7 += a2 * b2;
12558  xmm8 += a3 * b2;
12559  xmm9 += a4 * b2;
12560  xmm10 += a5 * b2;
12561  }
12562 
12563  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12564  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
12565  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12566  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12567  C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
12568  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
12569  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
12570  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
12571  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
12572  C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
12573  }
12574 
12575  if( j < N )
12576  {
12577  const size_t kbegin( ( IsLower_v<MT5> )
12578  ?( ( IsUpper_v<MT4> )
12579  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12580  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12581  :( IsUpper_v<MT4> ? i : 0UL ) );
12582  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
12583 
12584  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
12585 
12586  for( size_t k=kbegin; k<kend; ++k ) {
12587  const SIMDType b1( set( B(k,j) ) );
12588  xmm1 += A.load(i ,k) * b1;
12589  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12590  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12591  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12592  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12593  }
12594 
12595  C.store( i , j, C.load(i ,j) - xmm1 * factor );
12596  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
12597  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12598  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12599  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12600  }
12601  }
12602 
12603  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
12604  {
12605  size_t j( 0UL );
12606 
12607  for( ; (j+2UL) <= N; j+=2UL )
12608  {
12609  const size_t kbegin( ( IsLower_v<MT5> )
12610  ?( ( IsUpper_v<MT4> )
12611  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12612  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12613  :( IsUpper_v<MT4> ? i : 0UL ) );
12614  const size_t kend( ( IsUpper_v<MT5> )
12615  ?( ( IsLower_v<MT4> )
12616  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12617  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12618  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
12619 
12620  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12621 
12622  for( size_t k=kbegin; k<kend; ++k ) {
12623  const SIMDType a1( A.load(i ,k) );
12624  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12625  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12626  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12627  const SIMDType b1( set( B(k,j ) ) );
12628  const SIMDType b2( set( B(k,j+1UL) ) );
12629  xmm1 += a1 * b1;
12630  xmm2 += a2 * b1;
12631  xmm3 += a3 * b1;
12632  xmm4 += a4 * b1;
12633  xmm5 += a1 * b2;
12634  xmm6 += a2 * b2;
12635  xmm7 += a3 * b2;
12636  xmm8 += a4 * b2;
12637  }
12638 
12639  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12640  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
12641  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12642  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12643  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
12644  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
12645  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
12646  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
12647  }
12648 
12649  if( j < N )
12650  {
12651  const size_t kbegin( ( IsLower_v<MT5> )
12652  ?( ( IsUpper_v<MT4> )
12653  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12654  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12655  :( IsUpper_v<MT4> ? i : 0UL ) );
12656  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
12657 
12658  SIMDType xmm1, xmm2, xmm3, xmm4;
12659 
12660  for( size_t k=kbegin; k<kend; ++k ) {
12661  const SIMDType b1( set( B(k,j) ) );
12662  xmm1 += A.load(i ,k) * b1;
12663  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12664  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12665  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12666  }
12667 
12668  C.store( i , j, C.load(i ,j) - xmm1 * factor );
12669  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
12670  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12671  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12672  }
12673  }
12674 
12675  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
12676  {
12677  size_t j( 0UL );
12678 
12679  for( ; (j+2UL) <= N; j+=2UL )
12680  {
12681  const size_t kbegin( ( IsLower_v<MT5> )
12682  ?( ( IsUpper_v<MT4> )
12683  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12684  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12685  :( IsUpper_v<MT4> ? i : 0UL ) );
12686  const size_t kend( ( IsUpper_v<MT5> )
12687  ?( ( IsLower_v<MT4> )
12688  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12689  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12690  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
12691 
12692  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12693 
12694  for( size_t k=kbegin; k<kend; ++k ) {
12695  const SIMDType a1( A.load(i ,k) );
12696  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12697  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12698  const SIMDType b1( set( B(k,j ) ) );
12699  const SIMDType b2( set( B(k,j+1UL) ) );
12700  xmm1 += a1 * b1;
12701  xmm2 += a2 * b1;
12702  xmm3 += a3 * b1;
12703  xmm4 += a1 * b2;
12704  xmm5 += a2 * b2;
12705  xmm6 += a3 * b2;
12706  }
12707 
12708  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12709  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
12710  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12711  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
12712  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
12713  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
12714  }
12715 
12716  if( j < N )
12717  {
12718  const size_t kbegin( ( IsLower_v<MT5> )
12719  ?( ( IsUpper_v<MT4> )
12720  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12721  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12722  :( IsUpper_v<MT4> ? i : 0UL ) );
12723  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
12724 
12725  SIMDType xmm1, xmm2, xmm3;
12726 
12727  for( size_t k=kbegin; k<kend; ++k ) {
12728  const SIMDType b1( set( B(k,j) ) );
12729  xmm1 += A.load(i ,k) * b1;
12730  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12731  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12732  }
12733 
12734  C.store( i , j, C.load(i ,j) - xmm1 * factor );
12735  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
12736  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12737  }
12738  }
12739 
12740  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
12741  {
12742  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
12743  size_t j( UPP ? i : 0UL );
12744 
12745  for( ; (j+4UL) <= jend; j+=4UL )
12746  {
12747  const size_t kbegin( ( IsLower_v<MT5> )
12748  ?( ( IsUpper_v<MT4> )
12749  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12750  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12751  :( IsUpper_v<MT4> ? i : 0UL ) );
12752  const size_t kend( ( IsUpper_v<MT5> )
12753  ?( ( IsLower_v<MT4> )
12754  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
12755  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
12756  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
12757 
12758  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12759 
12760  for( size_t k=kbegin; k<kend; ++k ) {
12761  const SIMDType a1( A.load(i ,k) );
12762  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12763  const SIMDType b1( set( B(k,j ) ) );
12764  const SIMDType b2( set( B(k,j+1UL) ) );
12765  const SIMDType b3( set( B(k,j+2UL) ) );
12766  const SIMDType b4( set( B(k,j+3UL) ) );
12767  xmm1 += a1 * b1;
12768  xmm2 += a2 * b1;
12769  xmm3 += a1 * b2;
12770  xmm4 += a2 * b2;
12771  xmm5 += a1 * b3;
12772  xmm6 += a2 * b3;
12773  xmm7 += a1 * b4;
12774  xmm8 += a2 * b4;
12775  }
12776 
12777  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12778  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
12779  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
12780  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12781  C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
12782  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12783  C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
12784  C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
12785  }
12786 
12787  for( ; (j+3UL) <= jend; j+=3UL )
12788  {
12789  const size_t kbegin( ( IsLower_v<MT5> )
12790  ?( ( IsUpper_v<MT4> )
12791  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12792  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12793  :( IsUpper_v<MT4> ? i : 0UL ) );
12794  const size_t kend( ( IsUpper_v<MT5> )
12795  ?( ( IsLower_v<MT4> )
12796  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
12797  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
12798  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
12799 
12800  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12801 
12802  for( size_t k=kbegin; k<kend; ++k ) {
12803  const SIMDType a1( A.load(i ,k) );
12804  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12805  const SIMDType b1( set( B(k,j ) ) );
12806  const SIMDType b2( set( B(k,j+1UL) ) );
12807  const SIMDType b3( set( B(k,j+2UL) ) );
12808  xmm1 += a1 * b1;
12809  xmm2 += a2 * b1;
12810  xmm3 += a1 * b2;
12811  xmm4 += a2 * b2;
12812  xmm5 += a1 * b3;
12813  xmm6 += a2 * b3;
12814  }
12815 
12816  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12817  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
12818  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
12819  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12820  C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
12821  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12822  }
12823 
12824  for( ; (j+2UL) <= jend; j+=2UL )
12825  {
12826  const size_t kbegin( ( IsLower_v<MT5> )
12827  ?( ( IsUpper_v<MT4> )
12828  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12829  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12830  :( IsUpper_v<MT4> ? i : 0UL ) );
12831  const size_t kend( ( IsUpper_v<MT5> )
12832  ?( ( IsLower_v<MT4> )
12833  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12834  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12835  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
12836 
12837  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12838  size_t k( kbegin );
12839 
12840  for( ; (k+2UL) <= kend; k+=2UL ) {
12841  const SIMDType a1( A.load(i ,k ) );
12842  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
12843  const SIMDType a3( A.load(i ,k+1UL) );
12844  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
12845  const SIMDType b1( set( B(k ,j ) ) );
12846  const SIMDType b2( set( B(k ,j+1UL) ) );
12847  const SIMDType b3( set( B(k+1UL,j ) ) );
12848  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
12849  xmm1 += a1 * b1;
12850  xmm2 += a2 * b1;
12851  xmm3 += a1 * b2;
12852  xmm4 += a2 * b2;
12853  xmm5 += a3 * b3;
12854  xmm6 += a4 * b3;
12855  xmm7 += a3 * b4;
12856  xmm8 += a4 * b4;
12857  }
12858 
12859  for( ; k<kend; ++k ) {
12860  const SIMDType a1( A.load(i ,k) );
12861  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12862  const SIMDType b1( set( B(k,j ) ) );
12863  const SIMDType b2( set( B(k,j+1UL) ) );
12864  xmm1 += a1 * b1;
12865  xmm2 += a2 * b1;
12866  xmm3 += a1 * b2;
12867  xmm4 += a2 * b2;
12868  }
12869 
12870  C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
12871  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
12872  C.store( i , j+1UL, C.load(i ,j+1UL) - (xmm3+xmm7) * factor );
12873  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
12874  }
12875 
12876  if( j < jend )
12877  {
12878  const size_t kbegin( ( IsLower_v<MT5> )
12879  ?( ( IsUpper_v<MT4> )
12880  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12881  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12882  :( IsUpper_v<MT4> ? i : 0UL ) );
12883  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
12884 
12885  SIMDType xmm1, xmm2, xmm3, xmm4;
12886  size_t k( kbegin );
12887 
12888  for( ; (k+2UL) <= kend; k+=2UL ) {
12889  const SIMDType b1( set( B(k ,j) ) );
12890  const SIMDType b2( set( B(k+1UL,j) ) );
12891  xmm1 += A.load(i ,k ) * b1;
12892  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
12893  xmm3 += A.load(i ,k+1UL) * b2;
12894  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
12895  }
12896 
12897  for( ; k<kend; ++k ) {
12898  const SIMDType b1( set( B(k,j) ) );
12899  xmm1 += A.load(i ,k) * b1;
12900  xmm2 += A.load(i+SIMDSIZE,k) * b1;
12901  }
12902 
12903  C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
12904  C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
12905  }
12906  }
12907 
12908  for( ; i<ipos; i+=SIMDSIZE )
12909  {
12910  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
12911  size_t j( UPP ? i : 0UL );
12912 
12913  for( ; (j+4UL) <= jend; j+=4UL )
12914  {
12915  const size_t kbegin( ( IsLower_v<MT5> )
12916  ?( ( IsUpper_v<MT4> )
12917  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12918  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12919  :( IsUpper_v<MT4> ? i : 0UL ) );
12920  const size_t kend( ( IsUpper_v<MT5> )
12921  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
12922  :( K ) );
12923 
12924  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12925  size_t k( kbegin );
12926 
12927  for( ; (k+2UL) <= kend; k+=2UL ) {
12928  const SIMDType a1( A.load(i,k ) );
12929  const SIMDType a2( A.load(i,k+1UL) );
12930  xmm1 += a1 * set( B(k ,j ) );
12931  xmm2 += a1 * set( B(k ,j+1UL) );
12932  xmm3 += a1 * set( B(k ,j+2UL) );
12933  xmm4 += a1 * set( B(k ,j+3UL) );
12934  xmm5 += a2 * set( B(k+1UL,j ) );
12935  xmm6 += a2 * set( B(k+1UL,j+1UL) );
12936  xmm7 += a2 * set( B(k+1UL,j+2UL) );
12937  xmm8 += a2 * set( B(k+1UL,j+3UL) );
12938  }
12939 
12940  for( ; k<kend; ++k ) {
12941  const SIMDType a1( A.load(i,k) );
12942  xmm1 += a1 * set( B(k,j ) );
12943  xmm2 += a1 * set( B(k,j+1UL) );
12944  xmm3 += a1 * set( B(k,j+2UL) );
12945  xmm4 += a1 * set( B(k,j+3UL) );
12946  }
12947 
12948  C.store( i, j , C.load(i,j ) - (xmm1+xmm5) * factor );
12949  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm6) * factor );
12950  C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm7) * factor );
12951  C.store( i, j+3UL, C.load(i,j+3UL) - (xmm4+xmm8) * factor );
12952  }
12953 
12954  for( ; (j+3UL) <= jend; j+=3UL )
12955  {
12956  const size_t kbegin( ( IsLower_v<MT5> )
12957  ?( ( IsUpper_v<MT4> )
12958  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12959  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12960  :( IsUpper_v<MT4> ? i : 0UL ) );
12961  const size_t kend( ( IsUpper_v<MT5> )
12962  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
12963  :( K ) );
12964 
12965  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12966  size_t k( kbegin );
12967 
12968  for( ; (k+2UL) <= kend; k+=2UL ) {
12969  const SIMDType a1( A.load(i,k ) );
12970  const SIMDType a2( A.load(i,k+1UL) );
12971  xmm1 += a1 * set( B(k ,j ) );
12972  xmm2 += a1 * set( B(k ,j+1UL) );
12973  xmm3 += a1 * set( B(k ,j+2UL) );
12974  xmm4 += a2 * set( B(k+1UL,j ) );
12975  xmm5 += a2 * set( B(k+1UL,j+1UL) );
12976  xmm6 += a2 * set( B(k+1UL,j+2UL) );
12977  }
12978 
12979  for( ; k<kend; ++k ) {
12980  const SIMDType a1( A.load(i,k) );
12981  xmm1 += a1 * set( B(k,j ) );
12982  xmm2 += a1 * set( B(k,j+1UL) );
12983  xmm3 += a1 * set( B(k,j+2UL) );
12984  }
12985 
12986  C.store( i, j , C.load(i,j ) - (xmm1+xmm4) * factor );
12987  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm5) * factor );
12988  C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm6) * factor );
12989  }
12990 
12991  for( ; (j+2UL) <= jend; j+=2UL )
12992  {
12993  const size_t kbegin( ( IsLower_v<MT5> )
12994  ?( ( IsUpper_v<MT4> )
12995  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12996  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12997  :( IsUpper_v<MT4> ? i : 0UL ) );
12998  const size_t kend( ( IsUpper_v<MT5> )
12999  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
13000  :( K ) );
13001 
13002  SIMDType xmm1, xmm2, xmm3, xmm4;
13003  size_t k( kbegin );
13004 
13005  for( ; (k+2UL) <= kend; k+=2UL ) {
13006  const SIMDType a1( A.load(i,k ) );
13007  const SIMDType a2( A.load(i,k+1UL) );
13008  xmm1 += a1 * set( B(k ,j ) );
13009  xmm2 += a1 * set( B(k ,j+1UL) );
13010  xmm3 += a2 * set( B(k+1UL,j ) );
13011  xmm4 += a2 * set( B(k+1UL,j+1UL) );
13012  }
13013 
13014  for( ; k<kend; ++k ) {
13015  const SIMDType a1( A.load(i,k) );
13016  xmm1 += a1 * set( B(k,j ) );
13017  xmm2 += a1 * set( B(k,j+1UL) );
13018  }
13019 
13020  C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
13021  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm4) * factor );
13022  }
13023 
13024  if( j < jend )
13025  {
13026  const size_t kbegin( ( IsLower_v<MT5> )
13027  ?( ( IsUpper_v<MT4> )
13028  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13029  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13030  :( IsUpper_v<MT4> ? i : 0UL ) );
13031 
13032  SIMDType xmm1, xmm2;
13033  size_t k( kbegin );
13034 
13035  for( ; (k+2UL) <= K; k+=2UL ) {
13036  xmm1 += A.load(i,k ) * set( B(k ,j) );
13037  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
13038  }
13039 
13040  for( ; k<K; ++k ) {
13041  xmm1 += A.load(i,k) * set( B(k,j) );
13042  }
13043 
13044  C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
13045  }
13046  }
13047 
13048  for( ; remainder && i<M; ++i )
13049  {
13050  const size_t jend( LOW ? i+1UL : N );
13051  size_t j( UPP ? i : 0UL );
13052 
13053  for( ; (j+2UL) <= jend; j+=2UL )
13054  {
13055  const size_t kbegin( ( IsLower_v<MT5> )
13056  ?( ( IsUpper_v<MT4> )
13057  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13058  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13059  :( IsUpper_v<MT4> ? i : 0UL ) );
13060  const size_t kend( ( IsUpper_v<MT5> )
13061  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
13062  :( K ) );
13063 
13064  ElementType value1{};
13065  ElementType value2{};
13066 
13067  for( size_t k=kbegin; k<kend; ++k ) {
13068  value1 += A(i,k) * B(k,j );
13069  value2 += A(i,k) * B(k,j+1UL);
13070  }
13071 
13072  C(i,j ) -= value1 * scalar;
13073  C(i,j+1UL) -= value2 * scalar;
13074  }
13075 
13076  if( j < jend )
13077  {
13078  const size_t kbegin( ( IsLower_v<MT5> )
13079  ?( ( IsUpper_v<MT4> )
13080  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13081  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13082  :( IsUpper_v<MT4> ? i : 0UL ) );
13083 
13084  ElementType value{};
13085 
13086  for( size_t k=kbegin; k<K; ++k ) {
13087  value += A(i,k) * B(k,j);
13088  }
13089 
13090  C(i,j) -= value * scalar;
13091  }
13092  }
13093  }
13094  //**********************************************************************************************
13095 
13096  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix subtraction assignment kernel: non-vectorized fallback overload.
// Participates in overload resolution only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>
// is false (DisableIf_t in the trailing return type). It performs no computation itself and
// relays all four arguments unchanged to selectDefaultSubAssignKernel(), which is defined
// outside this excerpt.
13110  template< typename MT3 // Type of the left-hand side target matrix
13111  , typename MT4 // Type of the left-hand side matrix operand
13112  , typename MT5 // Type of the right-hand side matrix operand
13113  , typename ST2 > // Type of the scalar value
13114  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
13115  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
13116  {
13117  selectDefaultSubAssignKernel( C, A, B, scalar );
13118  }
13119  //**********************************************************************************************
13120 
13121  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix subtraction assignment kernel: vectorized overload.
// Enabled only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true. Chooses one of the
// lmmm() / ummm() / mmm() kernels based on the LOW and UPP flags (declared outside this
// excerpt) and calls it with the negated scalar and ST2(1) as the two trailing factors.
// NOTE(review): presumably the trailing factors act as alpha/beta in C = alpha*A*B + beta*C,
// which would make each call realize C -= scalar*A*B -- confirm against blaze/math/dense/MMM.h.
13136  template< typename MT3 // Type of the left-hand side target matrix
13137  , typename MT4 // Type of the left-hand side matrix operand
13138  , typename MT5 // Type of the right-hand side matrix operand
13139  , typename ST2 > // Type of the scalar value
13140  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
13141  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
13142  {
// LOW is tested first, so the lmmm() branch is taken whenever LOW is set, regardless of UPP.
13143  if( LOW )
13144  lmmm( C, A, B, -scalar, ST2(1) );
13145  else if( UPP )
13146  ummm( C, A, B, -scalar, ST2(1) );
13147  else
13148  mmm( C, A, B, -scalar, ST2(1) );
13149  }
13150  //**********************************************************************************************
13151 
13152  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Subtraction assignment kernel selection: fallback overload for the BLAS stage.
// Participates in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false
// (DisableIf_t in the trailing return type). It relays all four arguments unchanged to
// selectLargeSubAssignKernel().
13166  template< typename MT3 // Type of the left-hand side target matrix
13167  , typename MT4 // Type of the left-hand side matrix operand
13168  , typename MT5 // Type of the right-hand side matrix operand
13169  , typename ST2 > // Type of the scalar value
13170  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
13171  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
13172  {
13173  selectLargeSubAssignKernel( C, A, B, scalar );
13174  }
13175  //**********************************************************************************************
13176 
13177  //**BLAS-based subtraction assignment to dense matrices*****************************************
13178 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
13179 
// Subtraction assignment kernel: BLAS-backed overload.
// Enabled only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true. Three paths, checked in order:
//   1. A is triangular: B is evaluated serially into a temporary, trmm() is applied to that
//      temporary with A on the left side, and the temporary is then subtracted from C.
//   2. B is triangular: the same with A copied into the temporary and B applied on the right.
//   3. Otherwise: a single gemm() call on C with the factors ET(-scalar) and ET(1).
// The scalar is converted to the target's element type (ET) before it is handed to the kernels.
// NOTE(review): presumably trmm() overwrites the temporary with scalar times the triangular
// product, and gemm() computes C = alpha*A*B + beta*C as in BLAS -- confirm against
// blaze/math/blas/trmm.h and blaze/math/blas/gemm.h.
13192  template< typename MT3 // Type of the left-hand side target matrix
13193  , typename MT4 // Type of the left-hand side matrix operand
13194  , typename MT5 // Type of the right-hand side matrix operand
13195  , typename ST2 > // Type of the scalar value
13196  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
13197  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
13198  {
13199  using ET = ElementType_t<MT3>;
13200 
// Path 1: triangular left operand. The scalar is passed un-negated; the sign comes from subAssign().
13201  if( IsTriangular_v<MT4> ) {
13202  ResultType_t<MT3> tmp( serial( B ) );
13203  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
13204  subAssign( C, tmp );
13205  }
// Path 2: triangular right operand, only reached when A is not triangular.
13206  else if( IsTriangular_v<MT5> ) {
13207  ResultType_t<MT3> tmp( serial( A ) );
13208  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
13209  subAssign( C, tmp );
13210  }
// Path 3: general case, no temporary; the negated scalar carries the subtraction.
13211  else {
13212  gemm( C, A, B, ET(-scalar), ET(1) );
13213  }
13214  }
13215 #endif
13216  //**********************************************************************************************
13217 
13218  //**Subtraction assignment to sparse matrices***************************************************
13219  // No special implementation for the subtraction assignment to sparse matrices.
13220  //**********************************************************************************************
13221 
13222  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of the scaled multiplication expression to a dense matrix.
// The expression rhs is first evaluated serially into a temporary of type ResultType; the
// temporary is then passed to schurAssign() together with the target matrix.
// lhs: target dense matrix; rhs: the DMatScalarMultExpr to be combined with it.
// Internal assertions check that the row and column counts of lhs and rhs match.
// NOTE(review): the listing's line numbers skip values inside this body (13238, 13240-13242),
// so statements may be missing from this excerpt -- check against the original header.
13234  template< typename MT // Type of the target dense matrix
13235  , bool SO > // Storage order of the target dense matrix
13236  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13237  {
13239 
13243 
13244  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13245  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13246 
13247  const ResultType tmp( serial( rhs ) );
13248  schurAssign( ~lhs, tmp );
13249  }
13250  //**********************************************************************************************
13251 
13252  //**Schur product assignment to sparse matrices*************************************************
13253  // No special implementation for the Schur product assignment to sparse matrices.
13254  //**********************************************************************************************
13255 
13256  //**Multiplication assignment to dense matrices*************************************************
13257  // No special implementation for the multiplication assignment to dense matrices.
13258  //**********************************************************************************************
13259 
13260  //**Multiplication assignment to sparse matrices************************************************
13261  // No special implementation for the multiplication assignment to sparse matrices.
13262  //**********************************************************************************************
13263 
13264  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the scaled multiplication expression to a dense matrix.
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// Steps:
//   - fetch the left and right operands of the wrapped multiplication (rhs.matrix_);
//   - return immediately if the target has no rows or no columns;
//   - if the left operand has no columns, reset the target and return;
//   - evaluate both operands into the local objects A (type LT) and B (type RT);
//   - relay to smpAssign() with the rebuilt expression A * B * rhs.scalar_.
// LT, RT and MMM are type aliases declared outside this excerpt.
13279  template< typename MT // Type of the target dense matrix
13280  , bool SO > // Storage order of the target dense matrix
13281  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13282  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13283  {
13285 
13286  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13287  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13288 
13289  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13290  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13291 
// Empty target: nothing to write. Empty inner dimension: the target is reset instead.
13292  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
13293  return;
13294  }
13295  else if( left.columns() == 0UL ) {
13296  reset( ~lhs );
13297  return;
13298  }
13299 
13300  LT A( left ); // Evaluation of the left-hand side dense matrix operand
13301  RT B( right ); // Evaluation of the right-hand side dense matrix operand
13302 
13303  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13304  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13305  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13306  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13307  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
13308  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
13309 
13310  smpAssign( ~lhs, A * B * rhs.scalar_ );
13311  }
13312  //**********************************************************************************************
13313 
13314  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the scaled multiplication expression to a sparse matrix.
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// The expression is evaluated into a temporary whose type depends on the target's storage
// order flag: ResultType when SO is true, OppositeType otherwise. The temporary is passed
// through a ForwardFunctor instance (declared outside this excerpt) and then handed to
// smpAssign() together with the target.
// NOTE(review): the listing's line numbers skip values inside this body (13334, 13337-13343),
// so statements may be missing from this excerpt -- check against the original header.
13329  template< typename MT // Type of the target sparse matrix
13330  , bool SO > // Storage order of the target sparse matrix
13331  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13332  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13333  {
13335 
13336  using TmpType = If_t< SO, ResultType, OppositeType >;
13337 
13344 
13345  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13346  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13347 
13348  const ForwardFunctor fwd;
13349 
13350  const TmpType tmp( rhs );
13351  smpAssign( ~lhs, fwd( tmp ) );
13352  }
13353  //**********************************************************************************************
13354 
13355  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the scaled multiplication expression to a dense matrix.
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// Returns without touching the target if the target has no rows or no columns, or if the left
// operand has no columns. Otherwise both operands of the wrapped multiplication are evaluated
// into the local objects A (type LT) and B (type RT), and the rebuilt expression
// A * B * rhs.scalar_ is relayed to smpAddAssign().
// LT, RT and MMM are type aliases declared outside this excerpt.
13370  template< typename MT // Type of the target dense matrix
13371  , bool SO > // Storage order of the target dense matrix
13372  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13373  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13374  {
13376 
13377  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13378  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13379 
13380  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13381  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13382 
// Unlike smpAssign(), an empty inner dimension needs no reset here: the target is left as is.
13383  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
13384  return;
13385  }
13386 
13387  LT A( left ); // Evaluation of the left-hand side dense matrix operand
13388  RT B( right ); // Evaluation of the right-hand side dense matrix operand
13389 
13390  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13391  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13392  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13393  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13394  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
13395  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
13396 
13397  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
13398  }
13399  //**********************************************************************************************
13400 
13401  //**SMP addition assignment to sparse matrices**************************************************
13402  // No special implementation for the SMP addition assignment to sparse matrices.
13403  //**********************************************************************************************
13404 
13405  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the scaled multiplication expression to a dense matrix.
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// Returns without touching the target if the target has no rows or no columns, or if the left
// operand has no columns. Otherwise both operands of the wrapped multiplication are evaluated
// into the local objects A (type LT) and B (type RT), and the rebuilt expression
// A * B * rhs.scalar_ is relayed to smpSubAssign().
// LT, RT and MMM are type aliases declared outside this excerpt.
13420  template< typename MT // Type of the target dense matrix
13421  , bool SO > // Storage order of the target dense matrix
13422  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13423  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13424  {
13426 
13427  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13428  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13429 
13430  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13431  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13432 
// An empty target or an empty inner dimension leaves the target unchanged.
13433  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
13434  return;
13435  }
13436 
13437  LT A( left ); // Evaluation of the left-hand side dense matrix operand
13438  RT B( right ); // Evaluation of the right-hand side dense matrix operand
13439 
13440  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13441  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13442  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13443  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13444  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
13445  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
13446 
13447  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
13448  }
13449  //**********************************************************************************************
13450 
13451  //**SMP subtraction assignment to sparse matrices***********************************************
13452  // No special implementation for the SMP subtraction assignment to sparse matrices.
13453  //**********************************************************************************************
13454 
13455  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the scaled multiplication expression to a dense matrix.
// The expression rhs is evaluated into a temporary of type ResultType (constructed directly
// from rhs, without the serial() wrapper that schurAssign() uses); the temporary is then passed
// to smpSchurAssign() together with the target matrix.
// Internal assertions check that the row and column counts of lhs and rhs match.
// NOTE(review): the listing's line numbers skip values inside this body (13471, 13473-13475),
// so statements may be missing from this excerpt -- check against the original header.
13467  template< typename MT // Type of the target dense matrix
13468  , bool SO > // Storage order of the target dense matrix
13469  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13470  {
13472 
13476 
13477  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13478  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13479 
13480  const ResultType tmp( rhs );
13481  smpSchurAssign( ~lhs, tmp );
13482  }
13483  //**********************************************************************************************
13484 
13485  //**SMP Schur product assignment to sparse matrices*********************************************
13486  // No special implementation for the SMP Schur product assignment to sparse matrices.
13487  //**********************************************************************************************
13488 
13489  //**SMP multiplication assignment to dense matrices*********************************************
13490  // No special implementation for the SMP multiplication assignment to dense matrices.
13491  //**********************************************************************************************
13492 
13493  //**SMP multiplication assignment to sparse matrices********************************************
13494  // No special implementation for the SMP multiplication assignment to sparse matrices.
13495  //**********************************************************************************************
13496 
13497  //**Compile time checks*************************************************************************
13506  //**********************************************************************************************
13507 };
13509 //*************************************************************************************************
13510 
13511 
13512 
13513 
13514 //=================================================================================================
13515 //
13516 // GLOBAL BINARY ARITHMETIC OPERATORS
13517 //
13518 //=================================================================================================
13519 
13520 //*************************************************************************************************
// Multiplication operator for a dense matrix with storage-order flag 'true' (lhs) and a dense
// matrix with storage-order flag 'false' (rhs).
// lhs: left-hand side operand; rhs: right-hand side operand.
// Returns a const TDMatDMatMultExpr over both operands with all four flags (SF, HF, LF, UF)
// set to false; no multiplication is carried out inside this function.
// Raises an invalid-argument error through BLAZE_THROW_INVALID_ARGUMENT if the number of
// columns of lhs differs from the number of rows of rhs.
13550 template< typename MT1 // Type of the left-hand side dense matrix
13551  , typename MT2 > // Type of the right-hand side dense matrix
13552 inline decltype(auto)
13553  operator*( const DenseMatrix<MT1,true>& lhs, const DenseMatrix<MT2,false>& rhs )
13554 {
13556 
13557  if( (~lhs).columns() != (~rhs).rows() ) {
13558  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
13559  }
13560 
13561  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,false,false,false,false>;
13562  return ReturnType( ~lhs, ~rhs );
13563 }
13564 //*************************************************************************************************
13565 
13566 
13567 
13568 
13569 //=================================================================================================
13570 //
13571 // GLOBAL FUNCTIONS
13572 //
13573 //=================================================================================================
13574 
13575 //*************************************************************************************************
// Declares the given multiplication expression as symmetric.
// dm: the multiplication expression to re-declare.
// Returns a new const TDMatDMatMultExpr over the same two operands with the symmetry flag
// forced to true; the HF, LF and UF flags are carried over unchanged.
// Raises an invalid-argument error through BLAZE_THROW_INVALID_ARGUMENT if isSquare( dm ) is false.
13600 template< typename MT1 // Type of the left-hand side dense matrix
13601  , typename MT2 // Type of the right-hand side dense matrix
13602  , bool SF // Symmetry flag
13603  , bool HF // Hermitian flag
13604  , bool LF // Lower flag
13605  , bool UF > // Upper flag
13606 inline decltype(auto) declsym( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13607 {
13609 
13610  if( !isSquare( dm ) ) {
13611  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
13612  }
13613 
13614  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
13615  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13616 }
13618 //*************************************************************************************************
13619 
13620 
13621 //*************************************************************************************************
// Declares the given multiplication expression as Hermitian.
// dm: the multiplication expression to re-declare.
// Returns a new const TDMatDMatMultExpr over the same two operands with the Hermitian flag
// forced to true; the SF, LF and UF flags are carried over unchanged.
// Raises an invalid-argument error through BLAZE_THROW_INVALID_ARGUMENT if isSquare( dm ) is false.
13646 template< typename MT1 // Type of the left-hand side dense matrix
13647  , typename MT2 // Type of the right-hand side dense matrix
13648  , bool SF // Symmetry flag
13649  , bool HF // Hermitian flag
13650  , bool LF // Lower flag
13651  , bool UF > // Upper flag
13652 inline decltype(auto) declherm( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13653 {
13655 
13656  if( !isSquare( dm ) ) {
13657  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
13658  }
13659 
13660  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
13661  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13662 }
13664 //*************************************************************************************************
13665 
13666 
13667 //*************************************************************************************************
// Declares the given multiplication expression as lower.
// dm: the multiplication expression to re-declare.
// Returns a new const TDMatDMatMultExpr over the same two operands with the lower flag forced
// to true; the SF, HF and UF flags are carried over unchanged.
// Raises an invalid-argument error through BLAZE_THROW_INVALID_ARGUMENT if isSquare( dm ) is false.
13692 template< typename MT1 // Type of the left-hand side dense matrix
13693  , typename MT2 // Type of the right-hand side dense matrix
13694  , bool SF // Symmetry flag
13695  , bool HF // Hermitian flag
13696  , bool LF // Lower flag
13697  , bool UF > // Upper flag
13698 inline decltype(auto) decllow( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13699 {
13701 
13702  if( !isSquare( dm ) ) {
13703  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
13704  }
13705 
13706  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
13707  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13708 }
13710 //*************************************************************************************************
13711 
13712 
13713 //*************************************************************************************************
// Declares the given multiplication expression as upper.
// dm: the multiplication expression to re-declare.
// Returns a new const TDMatDMatMultExpr over the same two operands with the upper flag forced
// to true; the SF, HF and LF flags are carried over unchanged.
// Raises an invalid-argument error through BLAZE_THROW_INVALID_ARGUMENT if isSquare( dm ) is false.
13738 template< typename MT1 // Type of the left-hand side dense matrix
13739  , typename MT2 // Type of the right-hand side dense matrix
13740  , bool SF // Symmetry flag
13741  , bool HF // Hermitian flag
13742  , bool LF // Lower flag
13743  , bool UF > // Upper flag
13744 inline decltype(auto) declupp( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13745 {
13747 
13748  if( !isSquare( dm ) ) {
13749  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
13750  }
13751 
13752  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
13753  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13754 }
13756 //*************************************************************************************************
13757 
13758 
13759 //*************************************************************************************************
// Declares the given multiplication expression as diagonal.
// dm: the multiplication expression to re-declare.
// Returns a new const TDMatDMatMultExpr over the same two operands with both the lower and the
// upper flag forced to true; the SF and HF flags are carried over unchanged.
// Raises an invalid-argument error through BLAZE_THROW_INVALID_ARGUMENT if isSquare( dm ) is false.
13784 template< typename MT1 // Type of the left-hand side dense matrix
13785  , typename MT2 // Type of the right-hand side dense matrix
13786  , bool SF // Symmetry flag
13787  , bool HF // Hermitian flag
13788  , bool LF // Lower flag
13789  , bool UF > // Upper flag
13790 inline decltype(auto) decldiag( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13791 {
13793 
13794  if( !isSquare( dm ) ) {
13795  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
13796  }
13797 
13798  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
13799  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13800 }
13802 //*************************************************************************************************
13803 
13804 
13805 
13806 
13807 //=================================================================================================
13808 //
13809 // SIZE SPECIALIZATIONS
13810 //
13811 //=================================================================================================
13812 
13813 //*************************************************************************************************
// Size specializations for TDMatDMatMultExpr.
// Dimension 0UL is inherited from Size<MT1,0UL>, i.e. from the left-hand side operand type.
// NOTE(review): dimensions 0UL and 1UL presumably denote rows and columns -- confirm against
// the Size type trait.
13815 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13816 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
13817  : public Size<MT1,0UL>
13818 {};
13819 
// Dimension 1UL is inherited from Size<MT2,1UL>, i.e. from the right-hand side operand type.
13820 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13821 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
13822  : public Size<MT2,1UL>
13823 {};
13825 //*************************************************************************************************
13826 
13827 
13828 
13829 
13830 //=================================================================================================
13831 //
13832 // ISALIGNED SPECIALIZATIONS
13833 //
13834 //=================================================================================================
13835 
13836 //*************************************************************************************************
// IsAligned specialization for TDMatDMatMultExpr: the expression is reported as aligned only
// when both operand types MT1 and MT2 are aligned (logical AND of IsAligned_v for each).
13838 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13839 struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13840  : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
13841 {};
13843 //*************************************************************************************************
13844 
13845 } // namespace blaze
13846 
13847 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:151
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:427
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Data type constraint.
Header file for the generic min algorithm.
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:166
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:272
Header file for basic type definitions.
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:161
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:477
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:265
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias template for the If class template. The If_t alias template provides a convenient shor...
Definition: If.h:109
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
constexpr bool IsSIMDCombinable_v
Auxiliary variable template for the IsSIMDCombinable type trait. The IsSIMDCombinable_v variable templ...
Definition: IsSIMDCombinable.h:137
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:533
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type,...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:478
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:270
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:606
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:595
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:269
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:523
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:154
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1001
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:597
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
constexpr bool HasSIMDAdd_v
Auxiliary variable template for the HasSIMDAdd type trait. The HasSIMDAdd_v variable template provides...
Definition: HasSIMDAdd.h:187
Constraints on the storage order of matrix types.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:411
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:421
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:432
Header file for the IsBLASCompatible type trait.
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes....
Definition: DenseMatrix.h:81
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
static constexpr bool UPP
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:173
static constexpr bool LOW
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:172
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:171
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:401
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:267
Header file for the IsComplexDouble type trait.
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:167
Constraint on the data type.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:152
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:565
Header file for the DisableIf class template.
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatDMatMultExpr.h:289
Header file for the If class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:433
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:59
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1162
Header file for the decllow trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:271
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:391
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:168
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1001
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Header file for the IsStrictlyTriangular type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:553
Generic wrapper for the null function.
Definition: Noop.h:60
Header file for the IsTriangular type trait.
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:162
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:158
Header file for the exception macros of the math module.
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1198
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:465
Constraint on the data type.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:445
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:375
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:284
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:469
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:161
Header file for the MatScalarMultExpr base class.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:174
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatDMatMultExpr.h:296
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:422
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:311
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Header file for all forward declarations for expression class templates.
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:59
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
static constexpr bool SYM
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:170
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:105
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant alias template represents ...
Definition: IntegralConstant.h:110
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm).
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:577
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:59
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:455
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode....
Definition: BLAS.h:64
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:326
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:454
constexpr bool HasSIMDMult_v
Auxiliary variable template for the HasSIMDMult type trait. The HasSIMDMult_v variable template provid...
Definition: HasSIMDMult.h:188
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:441
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:268
Header file for BLAS general matrix/matrix multiplication functions (gemm).
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:59
Header file for the IsComplex type trait.
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:278
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatDMatMultExpr.h:302
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:587
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:107
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1324
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:59
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:543
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:171
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression,...
Definition: Assert.h:101
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:281
Header file for the DeclSym functor.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:275
Header file for the IsExpression type trait class.
Header file for the function trace functionality.