TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/dense/MMM.h>
53 #include <blaze/math/Exception.h>
66 #include <blaze/math/shims/Reset.h>
68 #include <blaze/math/SIMD.h>
97 #include <blaze/math/views/Check.h>
98 #include <blaze/system/BLAS.h>
99 #include <blaze/system/Blocking.h>
100 #include <blaze/system/Debugging.h>
102 #include <blaze/system/Thresholds.h>
105 #include <blaze/util/Assert.h>
106 #include <blaze/util/Complex.h>
109 #include <blaze/util/DisableIf.h>
110 #include <blaze/util/EnableIf.h>
113 #include <blaze/util/mpl/If.h>
114 #include <blaze/util/TrueType.h>
115 #include <blaze/util/Types.h>
123 
124 
125 namespace blaze {
126 
127 //=================================================================================================
128 //
129 // CLASS TDMATDMATMULTEXPR
130 //
131 //=================================================================================================
132 
133 //*************************************************************************************************
140 template< typename MT1 // Type of the left-hand side dense matrix
141  , typename MT2 // Type of the right-hand side dense matrix
142  , bool SF // Symmetry flag
143  , bool HF // Hermitian flag
144  , bool LF // Lower flag
145  , bool UF > // Upper flag
146 class TDMatDMatMultExpr
147  : public MatMatMultExpr< DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
148  , private Computation
149 {
150  private:
151  //**Type definitions****************************************************************************
158  //**********************************************************************************************
159 
160  //**********************************************************************************************
162  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
163  //**********************************************************************************************
164 
165  //**********************************************************************************************
167  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
168  //**********************************************************************************************
169 
170  //**********************************************************************************************
171  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
172  static constexpr bool HERM = ( HF && !( LF || UF ) );
173  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
174  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
175  //**********************************************************************************************
176 
177  //**********************************************************************************************
179 
183  template< typename T1, typename T2, typename T3 >
184  static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
186  //**********************************************************************************************
187 
188  //**********************************************************************************************
190 
// Helper variable template deciding whether a BLAS kernel may be used for the target type T1
// and the operand types T2 and T3. It is true only if all of the following hold:
//  - BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled,
//  - none of the structure flags SYM, HERM, LOW, UPP is set,
//  - T1 is contiguous with mutable data access, T2 and T3 are contiguous with const data access,
//  - neither T2 nor T3 is diagonal,
//  - all three types are SIMD enabled,
//  - all three element types are BLAS compatible and identical to the element type of T1.
193  template< typename T1, typename T2, typename T3 >
194  static constexpr bool UseBlasKernel_v =
195  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
196  !SYM && !HERM && !LOW && !UPP &&
197  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
198  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
199  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
200  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
201  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
202  IsBLASCompatible_v< ElementType_t<T1> > &&
203  IsBLASCompatible_v< ElementType_t<T2> > &&
204  IsBLASCompatible_v< ElementType_t<T3> > &&
205  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
206  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
208  //**********************************************************************************************
209 
210  //**********************************************************************************************
212 
// Helper variable template deciding whether the vectorized default kernel may be used for the
// target type T1 and the operand types T2 and T3. Requires useOptimizedKernels, non-diagonal
// operands, SIMD enabled types, SIMD combinable element types, and SIMD addition and
// multiplication for the operand element types.
// NOTE(review): the embedded listing numbers jump from 220 to 222 inside the
// IsSIMDCombinable_v argument list, so one argument line may be missing from this listing —
// verify against the upstream header.
215  template< typename T1, typename T2, typename T3 >
216  static constexpr bool UseVectorizedDefaultKernel_v =
217  ( useOptimizedKernels &&
218  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
219  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
220  IsSIMDCombinable_v< ElementType_t<T1>
222  , ElementType_t<T3> > &&
223  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
224  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
226  //**********************************************************************************************
227 
228  //**********************************************************************************************
230 
// Functor type selected from the structure flags, checked in this order:
//   HERM            -> DeclHerm
//   SYM             -> DeclSym
//   LOW and UPP     -> DeclDiag
//   LOW only        -> DeclLow
//   UPP only        -> DeclUpp
//   none of these   -> Noop
233  using ForwardFunctor = If_t< HERM
234  , DeclHerm
235  , If_t< SYM
236  , DeclSym
237  , If_t< LOW
238  , If_t< UPP
239  , DeclDiag
240  , DeclLow >
241  , If_t< UPP
242  , DeclUpp
243  , Noop > > > >;
245  //**********************************************************************************************
246 
247  public:
248  //**Type definitions****************************************************************************
251 
254 
256  using ResultType = typename If_t< HERM
258  , If_t< SYM
260  , If_t< LOW
261  , If_t< UPP
264  , If_t< UPP
266  , MultTrait<RT1,RT2> > > > >::Type;
267 
// Return type of the element access functions: a constant ElementType.
272  using ReturnType = const ElementType;
// Composite type of this expression: a constant ResultType.
273  using CompositeType = const ResultType;
274 
// Left-hand side operand type: MT1 by value if it is an expression, otherwise a const reference.
276  using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
277 
// Right-hand side operand type: MT2 by value if it is an expression, otherwise a const reference.
279  using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
280 
283 
286  //**********************************************************************************************
287 
288  //**Compilation flags***************************************************************************
// Compilation switch for SIMD evaluation of the expression. It is true unless both operands
// are diagonal, provided that both operand types are SIMD enabled and SIMD addition and
// multiplication are available for the element types ET1 and ET2.
290  static constexpr bool simdEnabled =
291  ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
292  MT1::simdEnabled && MT2::simdEnabled &&
293  HasSIMDAdd_v<ET1,ET2> &&
294  HasSIMDMult_v<ET1,ET2> );
295 
297  static constexpr bool smpAssignable =
299  //**********************************************************************************************
300 
301  //**SIMD properties*****************************************************************************
// SIMD size for ElementType, taken from SIMDTrait<ElementType>::size. The kernels below use it
// as the column/row stride of their SIMD loads and stores.
303  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
304  //**********************************************************************************************
305 
306  //**Constructor*********************************************************************************
// Constructor: stores the two operands of the multiplication expression.
//   lhs  The left-hand side operand.
//   rhs  The right-hand side operand.
// The inner dimensions must match (lhs.columns() == rhs.rows()); this is only verified by an
// internal assertion, no exception is thrown here.
312  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
313  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
314  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
315  {
316  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
317  }
318  //**********************************************************************************************
319 
320  //**Access operator*****************************************************************************
// Unchecked 2D access to element (i,j) of the product.
//   i  Row index; only verified by an internal assertion against lhs_.rows().
//   j  Column index; only verified by an internal assertion against rhs_.columns().
// Returns the product element computed on the fly from the two operands.
327  inline ReturnType operator()( size_t i, size_t j ) const {
328  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
329  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
330 
// Diagonal left operand: only the diagonal element of row i is used.
331  if( IsDiagonal_v<MT1> ) {
332  return lhs_(i,i) * rhs_(i,j);
333  }
// Diagonal right operand: only the diagonal element of column j is used.
334  else if( IsDiagonal_v<MT2> ) {
335  return lhs_(i,j) * rhs_(j,j);
336  }
// Triangular operand(s): restrict the inner index range [begin,end) according to the
// (strictly) upper/lower structure of the two operand types.
337  else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
338  const size_t begin( ( IsUpper_v<MT1> )
339  ?( ( IsLower_v<MT2> )
340  ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
341  , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
342  :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
343  :( ( IsLower_v<MT2> )
344  ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
345  :( 0UL ) ) );
346  const size_t end( ( IsLower_v<MT1> )
347  ?( ( IsUpper_v<MT2> )
348  ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
349  , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
350  :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
351  :( ( IsUpper_v<MT2> )
352  ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
353  :( lhs_.columns() ) ) );
354 
// Empty inner range: the result is a default-constructed element.
355  if( begin >= end ) return ElementType();
356 
357  const size_t n( end - begin );
358 
// Product of the restricted part of row i of lhs_ and column j of rhs_.
359  return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
360  subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
361  }
// General case: product of the full row i of lhs_ and the full column j of rhs_.
362  else {
363  return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
364  }
365  }
366  //**********************************************************************************************
367 
368  //**At function*********************************************************************************
376  inline ReturnType at( size_t i, size_t j ) const {
377  if( i >= lhs_.rows() ) {
378  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
379  }
380  if( j >= rhs_.columns() ) {
381  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
382  }
383  return (*this)(i,j);
384  }
385  //**********************************************************************************************
386 
387  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, which is the number of rows of the left-hand
// side operand.
392  inline size_t rows() const noexcept {
393  return lhs_.rows();
394  }
395  //**********************************************************************************************
396 
397  //**Columns function****************************************************************************
// Returns the number of columns of the expression, which is the number of columns of the
// right-hand side operand.
402  inline size_t columns() const noexcept {
403  return rhs_.columns();
404  }
405  //**********************************************************************************************
406 
407  //**Left operand access*************************************************************************
// Returns the stored left-hand side operand of the multiplication expression.
412  inline LeftOperand leftOperand() const noexcept {
413  return lhs_;
414  }
415  //**********************************************************************************************
416 
417  //**Right operand access************************************************************************
// Returns the stored right-hand side operand of the multiplication expression.
422  inline RightOperand rightOperand() const noexcept {
423  return rhs_;
424  }
425  //**********************************************************************************************
426 
427  //**********************************************************************************************
433  template< typename T >
434  inline bool canAlias( const T* alias ) const noexcept {
435  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
436  }
437  //**********************************************************************************************
438 
439  //**********************************************************************************************
445  template< typename T >
446  inline bool isAliased( const T* alias ) const noexcept {
447  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
448  }
449  //**********************************************************************************************
450 
451  //**********************************************************************************************
456  inline bool isAligned() const noexcept {
457  return lhs_.isAligned() && rhs_.isAligned();
458  }
459  //**********************************************************************************************
460 
461  //**********************************************************************************************
// Reports whether the expression can be used in an SMP assignment. As written here, this
// requires all of:
//  - BLAS mode or BLAS matrix/matrix multiplication is disabled, or the result has fewer than
//    TDMATDMATMULT_THRESHOLD elements,
//  - the result has at least SMP_TDMATDMATMULT_THRESHOLD elements,
//  - neither operand type is diagonal.
// NOTE(review): the embedded listing numbers jump from 468 to 470, so one operand of the first
// disjunction may be missing from this listing — verify against the upstream header.
466  inline bool canSMPAssign() const noexcept {
467  return ( !BLAZE_BLAS_MODE ||
468  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
470  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
471  ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
472  !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
473  }
474  //**********************************************************************************************
475 
476  private:
477  //**Member variables****************************************************************************
480  //**********************************************************************************************
481 
482  //**Assignment to dense matrices****************************************************************
495  template< typename MT // Type of the target dense matrix
496  , bool SO > // Storage order of the target dense matrix
497  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
498  {
500 
501  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
502  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
503 
504  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
505  return;
506  }
507  else if( rhs.lhs_.columns() == 0UL ) {
508  reset( ~lhs );
509  return;
510  }
511 
512  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
513  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
514 
515  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
516  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
517  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
518  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
519  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
520  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
521 
522  TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
523  }
525  //**********************************************************************************************
526 
527  //**Assignment to dense matrices (kernel selection)*********************************************
538  template< typename MT3 // Type of the left-hand side target matrix
539  , typename MT4 // Type of the left-hand side matrix operand
540  , typename MT5 > // Type of the right-hand side matrix operand
541  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
542  {
543  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
544  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
545  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
546  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
547  selectSmallAssignKernel( C, A, B );
548  else
549  selectBlasAssignKernel( C, A, B );
550  }
552  //**********************************************************************************************
553 
554  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (scalar) assignment kernel C=A*B for a row-major target and two non-diagonal operands.
//   C  The target left-hand side dense matrix.
//   A  The left-hand side multiplication operand.
//   B  The right-hand side multiplication operand.
// The kernel walks over the rows of C. For every row it first initializes the row with the
// contribution of the first relevant inner index (kbegin) and then accumulates the remaining
// inner indices. The structure flags SYM/HERM/LOW/UPP and the triangular structure of the
// operand types restrict the index ranges.
568  template< typename MT3 // Type of the left-hand side target matrix
569  , typename MT4 // Type of the left-hand side matrix operand
570  , typename MT5 > // Type of the right-hand side matrix operand
571  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
572  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
573  {
574  const size_t M( A.rows() );
575  const size_t N( B.columns() );
576  const size_t K( A.columns() );
577 
// Any structure flag implies a square result.
578  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
579 
580  for( size_t i=0UL; i<M; ++i )
581  {
// Inner index range [kbegin,kend) of row i of A as restricted by the triangular type of A.
582  const size_t kbegin( ( IsUpper_v<MT4> )
583  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
584  :( 0UL ) );
585  const size_t kend( ( IsLower_v<MT4> )
586  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
587  :( K ) );
588  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
589 
// Empty inner range for a strictly triangular A: the complete row i of C is reset.
590  if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
591  for( size_t j=0UL; j<N; ++j ) {
592  reset( C(i,j) );
593  }
594  continue;
595  }
596 
// First inner index (k == kbegin): elements in [jbegin,jend) are assigned, elements outside
// that range are reset where the structure requires it.
597  {
598  const size_t jbegin( ( IsUpper_v<MT5> )
599  ?( ( IsStrictlyUpper_v<MT5> )
600  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
601  :( UPP ? max(i,kbegin) : kbegin ) )
602  :( UPP ? i : 0UL ) );
603  const size_t jend( ( IsLower_v<MT5> )
604  ?( ( IsStrictlyLower_v<MT5> )
605  ?( LOW ? min(i+1UL,kbegin) : kbegin )
606  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
607  :( LOW ? i+1UL : N ) );
608 
609  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
610  for( size_t j=0UL; j<jbegin; ++j ) {
611  reset( C(i,j) );
612  }
613  }
614  else if( IsStrictlyUpper_v<MT5> ) {
615  reset( C(i,0UL) );
616  }
617  for( size_t j=jbegin; j<jend; ++j ) {
618  C(i,j) = A(i,kbegin) * B(kbegin,j);
619  }
620  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
621  for( size_t j=jend; j<N; ++j ) {
622  reset( C(i,j) );
623  }
624  }
625  else if( IsStrictlyLower_v<MT5> ) {
626  reset( C(i,N-1UL) );
627  }
628  }
629 
// Remaining inner indices: accumulate into the elements in [jbegin,jend).
630  for( size_t k=kbegin+1UL; k<kend; ++k )
631  {
632  const size_t jbegin( ( IsUpper_v<MT5> )
633  ?( ( IsStrictlyUpper_v<MT5> )
634  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
635  :( SYM || HERM || UPP ? max( i, k ) : k ) )
636  :( SYM || HERM || UPP ? i : 0UL ) );
637  const size_t jend( ( IsLower_v<MT5> )
638  ?( ( IsStrictlyLower_v<MT5> )
639  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
640  :( LOW ? min(i+1UL,k) : k ) )
641  :( LOW ? i+1UL : N ) );
642 
// With a structure flag the range may be empty; such inner indices are skipped.
643  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
644  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
645 
646  for( size_t j=jbegin; j<jend; ++j ) {
647  C(i,j) += A(i,k) * B(k,j);
648  }
// For a lower B the element at column jend is assigned instead of accumulated.
// NOTE(review): with LOW set and a lower MT5, jend can evaluate to i+1 (for k > i), so this
// statement writes C(i,i+1) above the diagonal — confirm that this is intended.
649  if( IsLower_v<MT5> ) {
650  C(i,jend) = A(i,k) * B(k,jend);
651  }
652  }
653  }
654 
// Symmetric/Hermitian result: the elements below the diagonal are copied from the elements
// above the diagonal (conjugated in the Hermitian case).
655  if( SYM || HERM ) {
656  for( size_t i=1UL; i<M; ++i ) {
657  for( size_t j=0UL; j<i; ++j ) {
658  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
659  }
660  }
661  }
662  }
664  //**********************************************************************************************
665 
666  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (scalar) assignment kernel C=A*B for a column-major target and two non-diagonal
// operands.
//   C  The target left-hand side dense matrix.
//   A  The left-hand side multiplication operand.
//   B  The right-hand side multiplication operand.
// The kernel walks over the columns of C. For every column it first initializes the column
// with the contribution of the first relevant inner index (kbegin) and then accumulates the
// remaining inner indices. The structure flags SYM/HERM/LOW/UPP and the triangular structure
// of the operand types restrict the index ranges.
680  template< typename MT3 // Type of the left-hand side target matrix
681  , typename MT4 // Type of the left-hand side matrix operand
682  , typename MT5 > // Type of the right-hand side matrix operand
683  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
684  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
685  {
686  const size_t M( A.rows() );
687  const size_t N( B.columns() );
688  const size_t K( A.columns() );
689 
// Any structure flag implies a square result.
690  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
691 
692  for( size_t j=0UL; j<N; ++j )
693  {
// Inner index range [kbegin,kend) of column j of B as restricted by the triangular type of B.
694  const size_t kbegin( ( IsLower_v<MT5> )
695  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
696  :( 0UL ) );
697  const size_t kend( ( IsUpper_v<MT5> )
698  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
699  :( K ) );
700  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
701 
// Empty inner range for a strictly triangular B: the complete column j of C is reset.
702  if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
703  for( size_t i=0UL; i<M; ++i ) {
704  reset( C(i,j) );
705  }
706  continue;
707  }
708 
// First inner index (k == kbegin): elements in [ibegin,iend) are assigned, elements outside
// that range are reset where the structure requires it.
709  {
710  const size_t ibegin( ( IsLower_v<MT4> )
711  ?( ( IsStrictlyLower_v<MT4> )
712  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
713  :( LOW ? max(j,kbegin) : kbegin ) )
714  :( LOW ? j : 0UL ) );
715  const size_t iend( ( IsUpper_v<MT4> )
716  ?( ( IsStrictlyUpper_v<MT4> )
717  ?( UPP ? min(j+1UL,kbegin) : kbegin )
718  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
719  :( UPP ? j+1UL : M ) );
720 
721  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
722  for( size_t i=0UL; i<ibegin; ++i ) {
723  reset( C(i,j) );
724  }
725  }
726  else if( IsStrictlyLower_v<MT4> ) {
727  reset( C(0UL,j) );
728  }
729  for( size_t i=ibegin; i<iend; ++i ) {
730  C(i,j) = A(i,kbegin) * B(kbegin,j);
731  }
732  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
733  for( size_t i=iend; i<M; ++i ) {
734  reset( C(i,j) );
735  }
736  }
737  else if( IsStrictlyUpper_v<MT4> ) {
738  reset( C(M-1UL,j) );
739  }
740  }
741 
// Remaining inner indices: accumulate into the elements in [ibegin,iend).
742  for( size_t k=kbegin+1UL; k<kend; ++k )
743  {
744  const size_t ibegin( ( IsLower_v<MT4> )
745  ?( ( IsStrictlyLower_v<MT4> )
746  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
747  :( SYM || HERM || LOW ? max( j, k ) : k ) )
748  :( SYM || HERM || LOW ? j : 0UL ) );
749  const size_t iend( ( IsUpper_v<MT4> )
750  ?( ( IsStrictlyUpper_v<MT4> )
751  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
752  :( UPP ? min(j+1UL,k) : k ) )
753  :( UPP ? j+1UL : M ) );
754 
// With a structure flag the range may be empty; such inner indices are skipped.
755  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
756  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
757 
758  for( size_t i=ibegin; i<iend; ++i ) {
759  C(i,j) += A(i,k) * B(k,j);
760  }
// For an upper A the element at row iend is assigned instead of accumulated.
// NOTE(review): with UPP set and an upper MT4, iend can evaluate to j+1 (for k > j), so this
// statement writes C(j+1,j) below the diagonal — confirm that this is intended.
761  if( IsUpper_v<MT4> ) {
762  C(iend,j) = A(iend,k) * B(k,j);
763  }
764  }
765  }
766 
// Symmetric/Hermitian result: the elements above the diagonal are copied from the elements
// below the diagonal (conjugated in the Hermitian case).
767  if( SYM || HERM ) {
768  for( size_t j=1UL; j<N; ++j ) {
769  for( size_t i=0UL; i<j; ++i ) {
770  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
771  }
772  }
773  }
774  }
776  //**********************************************************************************************
777 
778  //**Default assignment to row-major dense matrices (general/diagonal)***************************
792  template< typename MT3 // Type of the left-hand side target matrix
793  , typename MT4 // Type of the left-hand side matrix operand
794  , typename MT5 > // Type of the right-hand side matrix operand
795  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
796  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
797  {
798  constexpr size_t block( BLOCK_SIZE );
799 
800  const size_t M( A.rows() );
801  const size_t N( B.columns() );
802 
803  for( size_t ii=0UL; ii<M; ii+=block ) {
804  const size_t iend( min( M, ii+block ) );
805  for( size_t jj=0UL; jj<N; jj+=block ) {
806  const size_t jend( min( N, jj+block ) );
807  for( size_t i=ii; i<iend; ++i )
808  {
809  const size_t jbegin( ( IsUpper_v<MT4> )
810  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
811  :( jj ) );
812  const size_t jpos( ( IsLower_v<MT4> )
813  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
814  :( jend ) );
815 
816  if( IsUpper_v<MT4> ) {
817  for( size_t j=jj; j<jbegin; ++j ) {
818  reset( C(i,j) );
819  }
820  }
821  for( size_t j=jbegin; j<jpos; ++j ) {
822  C(i,j) = A(i,j) * B(j,j);
823  }
824  if( IsLower_v<MT4> ) {
825  for( size_t j=jpos; j<jend; ++j ) {
826  reset( C(i,j) );
827  }
828  }
829  }
830  }
831  }
832  }
834  //**********************************************************************************************
835 
836  //**Default assignment to column-major dense matrices (general/diagonal)************************
850  template< typename MT3 // Type of the left-hand side target matrix
851  , typename MT4 // Type of the left-hand side matrix operand
852  , typename MT5 > // Type of the right-hand side matrix operand
853  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
854  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
855  {
856  const size_t M( A.rows() );
857  const size_t N( B.columns() );
858 
859  for( size_t j=0UL; j<N; ++j )
860  {
861  const size_t ibegin( ( IsLower_v<MT4> )
862  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
863  :( 0UL ) );
864  const size_t iend( ( IsUpper_v<MT4> )
865  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
866  :( M ) );
867  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
868 
869  if( IsLower_v<MT4> ) {
870  for( size_t i=0UL; i<ibegin; ++i ) {
871  reset( C(i,j) );
872  }
873  }
874  for( size_t i=ibegin; i<iend; ++i ) {
875  C(i,j) = A(i,j) * B(j,j);
876  }
877  if( IsUpper_v<MT4> ) {
878  for( size_t i=iend; i<M; ++i ) {
879  reset( C(i,j) );
880  }
881  }
882  }
883  }
885  //**********************************************************************************************
886 
887  //**Default assignment to row-major dense matrices (diagonal/general)***************************
901  template< typename MT3 // Type of the left-hand side target matrix
902  , typename MT4 // Type of the left-hand side matrix operand
903  , typename MT5 > // Type of the right-hand side matrix operand
904  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
905  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
906  {
907  const size_t M( A.rows() );
908  const size_t N( B.columns() );
909 
910  for( size_t i=0UL; i<M; ++i )
911  {
912  const size_t jbegin( ( IsUpper_v<MT5> )
913  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
914  :( 0UL ) );
915  const size_t jend( ( IsLower_v<MT5> )
916  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
917  :( N ) );
918  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
919 
920  if( IsUpper_v<MT5> ) {
921  for( size_t j=0UL; j<jbegin; ++j ) {
922  reset( C(i,j) );
923  }
924  }
925  for( size_t j=jbegin; j<jend; ++j ) {
926  C(i,j) = A(i,i) * B(i,j);
927  }
928  if( IsLower_v<MT5> ) {
929  for( size_t j=jend; j<N; ++j ) {
930  reset( C(i,j) );
931  }
932  }
933  }
934  }
936  //**********************************************************************************************
937 
938  //**Default assignment to column-major dense matrices (diagonal/general)************************
952  template< typename MT3 // Type of the left-hand side target matrix
953  , typename MT4 // Type of the left-hand side matrix operand
954  , typename MT5 > // Type of the right-hand side matrix operand
955  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
956  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
957  {
958  constexpr size_t block( BLOCK_SIZE );
959 
960  const size_t M( A.rows() );
961  const size_t N( B.columns() );
962 
963  for( size_t jj=0UL; jj<N; jj+=block ) {
964  const size_t jend( min( N, jj+block ) );
965  for( size_t ii=0UL; ii<M; ii+=block ) {
966  const size_t iend( min( M, ii+block ) );
967  for( size_t j=jj; j<jend; ++j )
968  {
969  const size_t ibegin( ( IsLower_v<MT5> )
970  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
971  :( ii ) );
972  const size_t ipos( ( IsUpper_v<MT5> )
973  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
974  :( iend ) );
975 
976  if( IsLower_v<MT5> ) {
977  for( size_t i=ii; i<ibegin; ++i ) {
978  reset( C(i,j) );
979  }
980  }
981  for( size_t i=ibegin; i<ipos; ++i ) {
982  C(i,j) = A(i,i) * B(i,j);
983  }
984  if( IsUpper_v<MT5> ) {
985  for( size_t i=ipos; i<iend; ++i ) {
986  reset( C(i,j) );
987  }
988  }
989  }
990  }
991  }
992  }
994  //**********************************************************************************************
995 
996  //**Default assignment to dense matrices (diagonal/diagonal)************************************
1010  template< typename MT3 // Type of the left-hand side target matrix
1011  , typename MT4 // Type of the left-hand side matrix operand
1012  , typename MT5 > // Type of the right-hand side matrix operand
1013  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
1014  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1015  {
1016  reset( C );
1017 
1018  for( size_t i=0UL; i<A.rows(); ++i ) {
1019  C(i,i) = A(i,i) * B(i,i);
1020  }
1021  }
1023  //**********************************************************************************************
1024 
1025  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel (C=A*B) for type combinations for which the vectorized
// default kernel is not enabled (UseVectorizedDefaultKernel_v is false). It simply forwards
// to the default assignment kernel.
//   C  The target left-hand side dense matrix.
//   A  The left-hand side multiplication operand.
//   B  The right-hand side multiplication operand.
1039  template< typename MT3 // Type of the left-hand side target matrix
1040  , typename MT4 // Type of the left-hand side matrix operand
1041  , typename MT5 > // Type of the right-hand side matrix operand
1042  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1043  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1044  {
1045  selectDefaultAssignKernel( C, A, B );
1046  }
1048  //**********************************************************************************************
1049 
1050  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
1065  template< typename MT3 // Type of the left-hand side target matrix
1066  , typename MT4 // Type of the left-hand side matrix operand
1067  , typename MT5 > // Type of the right-hand side matrix operand
1068  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1069  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1070  {
1071  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
1072 
1073  const size_t M( A.rows() );
1074  const size_t N( B.columns() );
1075  const size_t K( A.columns() );
1076 
1077  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1078 
1079  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1080  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1081 
1082  if( LOW && UPP && N > SIMDSIZE*3UL ) {
1083  reset( C );
1084  }
1085 
1086  {
1087  size_t j( 0UL );
1088 
1089  if( IsIntegral_v<ElementType> )
1090  {
1091  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1092  for( size_t i=0UL; i<M; ++i )
1093  {
1094  const size_t kbegin( ( IsUpper_v<MT4> )
1095  ?( ( IsLower_v<MT5> )
1096  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1097  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1098  :( IsLower_v<MT5> ? j : 0UL ) );
1099  const size_t kend( ( IsLower_v<MT4> )
1100  ?( ( IsUpper_v<MT5> )
1101  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
1102  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
1103  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
1104 
1105  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1106 
1107  for( size_t k=kbegin; k<kend; ++k ) {
1108  const SIMDType a1( set( A(i,k) ) );
1109  xmm1 += a1 * B.load(k,j );
1110  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1111  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1112  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1113  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1114  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
1115  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
1116  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
1117  }
1118 
1119  C.store( i, j , xmm1 );
1120  C.store( i, j+SIMDSIZE , xmm2 );
1121  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1122  C.store( i, j+SIMDSIZE*3UL, xmm4 );
1123  C.store( i, j+SIMDSIZE*4UL, xmm5 );
1124  C.store( i, j+SIMDSIZE*5UL, xmm6 );
1125  C.store( i, j+SIMDSIZE*6UL, xmm7 );
1126  C.store( i, j+SIMDSIZE*7UL, xmm8 );
1127  }
1128  }
1129  }
1130 
1131  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
1132  {
1133  size_t i( 0UL );
1134 
1135  for( ; (i+2UL) <= M; i+=2UL )
1136  {
1137  const size_t kbegin( ( IsUpper_v<MT4> )
1138  ?( ( IsLower_v<MT5> )
1139  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1140  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1141  :( IsLower_v<MT5> ? j : 0UL ) );
1142  const size_t kend( ( IsLower_v<MT4> )
1143  ?( ( IsUpper_v<MT5> )
1144  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
1145  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1146  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
1147 
1148  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1149 
1150  for( size_t k=kbegin; k<kend; ++k ) {
1151  const SIMDType a1( set( A(i ,k) ) );
1152  const SIMDType a2( set( A(i+1UL,k) ) );
1153  const SIMDType b1( B.load(k,j ) );
1154  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1155  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1156  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1157  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
1158  xmm1 += a1 * b1;
1159  xmm2 += a1 * b2;
1160  xmm3 += a1 * b3;
1161  xmm4 += a1 * b4;
1162  xmm5 += a1 * b5;
1163  xmm6 += a2 * b1;
1164  xmm7 += a2 * b2;
1165  xmm8 += a2 * b3;
1166  xmm9 += a2 * b4;
1167  xmm10 += a2 * b5;
1168  }
1169 
1170  C.store( i , j , xmm1 );
1171  C.store( i , j+SIMDSIZE , xmm2 );
1172  C.store( i , j+SIMDSIZE*2UL, xmm3 );
1173  C.store( i , j+SIMDSIZE*3UL, xmm4 );
1174  C.store( i , j+SIMDSIZE*4UL, xmm5 );
1175  C.store( i+1UL, j , xmm6 );
1176  C.store( i+1UL, j+SIMDSIZE , xmm7 );
1177  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
1178  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
1179  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
1180  }
1181 
1182  if( i < M )
1183  {
1184  const size_t kbegin( ( IsUpper_v<MT4> )
1185  ?( ( IsLower_v<MT5> )
1186  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1187  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1188  :( IsLower_v<MT5> ? j : 0UL ) );
1189  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
1190 
1191  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1192 
1193  for( size_t k=kbegin; k<kend; ++k ) {
1194  const SIMDType a1( set( A(i,k) ) );
1195  xmm1 += a1 * B.load(k,j );
1196  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1197  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1198  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1199  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1200  }
1201 
1202  C.store( i, j , xmm1 );
1203  C.store( i, j+SIMDSIZE , xmm2 );
1204  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1205  C.store( i, j+SIMDSIZE*3UL, xmm4 );
1206  C.store( i, j+SIMDSIZE*4UL, xmm5 );
1207  }
1208  }
1209 
1210  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1211  {
1212  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
1213  size_t i( LOW ? j : 0UL );
1214 
1215  for( ; (i+2UL) <= iend; i+=2UL )
1216  {
1217  const size_t kbegin( ( IsUpper_v<MT4> )
1218  ?( ( IsLower_v<MT5> )
1219  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1220  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1221  :( IsLower_v<MT5> ? j : 0UL ) );
1222  const size_t kend( ( IsLower_v<MT4> )
1223  ?( ( IsUpper_v<MT5> )
1224  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1225  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1226  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
1227 
1228  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1229 
1230  for( size_t k=kbegin; k<kend; ++k ) {
1231  const SIMDType a1( set( A(i ,k) ) );
1232  const SIMDType a2( set( A(i+1UL,k) ) );
1233  const SIMDType b1( B.load(k,j ) );
1234  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1235  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1236  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1237  xmm1 += a1 * b1;
1238  xmm2 += a1 * b2;
1239  xmm3 += a1 * b3;
1240  xmm4 += a1 * b4;
1241  xmm5 += a2 * b1;
1242  xmm6 += a2 * b2;
1243  xmm7 += a2 * b3;
1244  xmm8 += a2 * b4;
1245  }
1246 
1247  C.store( i , j , xmm1 );
1248  C.store( i , j+SIMDSIZE , xmm2 );
1249  C.store( i , j+SIMDSIZE*2UL, xmm3 );
1250  C.store( i , j+SIMDSIZE*3UL, xmm4 );
1251  C.store( i+1UL, j , xmm5 );
1252  C.store( i+1UL, j+SIMDSIZE , xmm6 );
1253  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1254  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1255  }
1256 
1257  if( i < iend )
1258  {
1259  const size_t kbegin( ( IsUpper_v<MT4> )
1260  ?( ( IsLower_v<MT5> )
1261  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1262  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1263  :( IsLower_v<MT5> ? j : 0UL ) );
1264  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1265 
1266  SIMDType xmm1, xmm2, xmm3, xmm4;
1267 
1268  for( size_t k=kbegin; k<kend; ++k ) {
1269  const SIMDType a1( set( A(i,k) ) );
1270  xmm1 += a1 * B.load(k,j );
1271  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1272  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1273  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1274  }
1275 
1276  C.store( i, j , xmm1 );
1277  C.store( i, j+SIMDSIZE , xmm2 );
1278  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1279  C.store( i, j+SIMDSIZE*3UL, xmm4 );
1280  }
1281  }
1282 
1283  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1284  {
1285  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
1286  size_t i( LOW ? j : 0UL );
1287 
1288  for( ; (i+2UL) <= iend; i+=2UL )
1289  {
1290  const size_t kbegin( ( IsUpper_v<MT4> )
1291  ?( ( IsLower_v<MT5> )
1292  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1293  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1294  :( IsLower_v<MT5> ? j : 0UL ) );
1295  const size_t kend( ( IsLower_v<MT4> )
1296  ?( ( IsUpper_v<MT5> )
1297  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
1298  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1299  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
1300 
1301  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1302 
1303  for( size_t k=kbegin; k<kend; ++k ) {
1304  const SIMDType a1( set( A(i ,k) ) );
1305  const SIMDType a2( set( A(i+1UL,k) ) );
1306  const SIMDType b1( B.load(k,j ) );
1307  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1308  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1309  xmm1 += a1 * b1;
1310  xmm2 += a1 * b2;
1311  xmm3 += a1 * b3;
1312  xmm4 += a2 * b1;
1313  xmm5 += a2 * b2;
1314  xmm6 += a2 * b3;
1315  }
1316 
1317  C.store( i , j , xmm1 );
1318  C.store( i , j+SIMDSIZE , xmm2 );
1319  C.store( i , j+SIMDSIZE*2UL, xmm3 );
1320  C.store( i+1UL, j , xmm4 );
1321  C.store( i+1UL, j+SIMDSIZE , xmm5 );
1322  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1323  }
1324 
1325  if( i < iend )
1326  {
1327  const size_t kbegin( ( IsUpper_v<MT4> )
1328  ?( ( IsLower_v<MT5> )
1329  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1330  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1331  :( IsLower_v<MT5> ? j : 0UL ) );
1332  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
1333 
1334  SIMDType xmm1, xmm2, xmm3;
1335 
1336  for( size_t k=kbegin; k<kend; ++k ) {
1337  const SIMDType a1( set( A(i,k) ) );
1338  xmm1 += a1 * B.load(k,j );
1339  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1340  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1341  }
1342 
1343  C.store( i, j , xmm1 );
1344  C.store( i, j+SIMDSIZE , xmm2 );
1345  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1346  }
1347  }
1348 
1349  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1350  {
1351  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
1352  size_t i( LOW ? j : 0UL );
1353 
1354  for( ; (i+4UL) <= iend; i+=4UL )
1355  {
1356  const size_t kbegin( ( IsUpper_v<MT4> )
1357  ?( ( IsLower_v<MT5> )
1358  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1359  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1360  :( IsLower_v<MT5> ? j : 0UL ) );
1361  const size_t kend( ( IsLower_v<MT4> )
1362  ?( ( IsUpper_v<MT5> )
1363  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
1364  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1365  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1366 
1367  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1368 
1369  for( size_t k=kbegin; k<kend; ++k ) {
1370  const SIMDType a1( set( A(i ,k) ) );
1371  const SIMDType a2( set( A(i+1UL,k) ) );
1372  const SIMDType a3( set( A(i+2UL,k) ) );
1373  const SIMDType a4( set( A(i+3UL,k) ) );
1374  const SIMDType b1( B.load(k,j ) );
1375  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1376  xmm1 += a1 * b1;
1377  xmm2 += a1 * b2;
1378  xmm3 += a2 * b1;
1379  xmm4 += a2 * b2;
1380  xmm5 += a3 * b1;
1381  xmm6 += a3 * b2;
1382  xmm7 += a4 * b1;
1383  xmm8 += a4 * b2;
1384  }
1385 
1386  C.store( i , j , xmm1 );
1387  C.store( i , j+SIMDSIZE, xmm2 );
1388  C.store( i+1UL, j , xmm3 );
1389  C.store( i+1UL, j+SIMDSIZE, xmm4 );
1390  C.store( i+2UL, j , xmm5 );
1391  C.store( i+2UL, j+SIMDSIZE, xmm6 );
1392  C.store( i+3UL, j , xmm7 );
1393  C.store( i+3UL, j+SIMDSIZE, xmm8 );
1394  }
1395 
1396  for( ; (i+3UL) <= iend; i+=3UL )
1397  {
1398  const size_t kbegin( ( IsUpper_v<MT4> )
1399  ?( ( IsLower_v<MT5> )
1400  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1401  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1402  :( IsLower_v<MT5> ? j : 0UL ) );
1403  const size_t kend( ( IsLower_v<MT4> )
1404  ?( ( IsUpper_v<MT5> )
1405  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
1406  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1407  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1408 
1409  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1410 
1411  for( size_t k=kbegin; k<kend; ++k ) {
1412  const SIMDType a1( set( A(i ,k) ) );
1413  const SIMDType a2( set( A(i+1UL,k) ) );
1414  const SIMDType a3( set( A(i+2UL,k) ) );
1415  const SIMDType b1( B.load(k,j ) );
1416  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1417  xmm1 += a1 * b1;
1418  xmm2 += a1 * b2;
1419  xmm3 += a2 * b1;
1420  xmm4 += a2 * b2;
1421  xmm5 += a3 * b1;
1422  xmm6 += a3 * b2;
1423  }
1424 
1425  C.store( i , j , xmm1 );
1426  C.store( i , j+SIMDSIZE, xmm2 );
1427  C.store( i+1UL, j , xmm3 );
1428  C.store( i+1UL, j+SIMDSIZE, xmm4 );
1429  C.store( i+2UL, j , xmm5 );
1430  C.store( i+2UL, j+SIMDSIZE, xmm6 );
1431  }
1432 
1433  for( ; (i+2UL) <= iend; i+=2UL )
1434  {
1435  const size_t kbegin( ( IsUpper_v<MT4> )
1436  ?( ( IsLower_v<MT5> )
1437  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1438  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1439  :( IsLower_v<MT5> ? j : 0UL ) );
1440  const size_t kend( ( IsLower_v<MT4> )
1441  ?( ( IsUpper_v<MT5> )
1442  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1443  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1444  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1445 
1446  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1447  size_t k( kbegin );
1448 
1449  for( ; (k+2UL) <= kend; k+=2UL ) {
1450  const SIMDType a1( set( A(i ,k ) ) );
1451  const SIMDType a2( set( A(i+1UL,k ) ) );
1452  const SIMDType a3( set( A(i ,k+1UL) ) );
1453  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
1454  const SIMDType b1( B.load(k ,j ) );
1455  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
1456  const SIMDType b3( B.load(k+1UL,j ) );
1457  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
1458  xmm1 += a1 * b1;
1459  xmm2 += a1 * b2;
1460  xmm3 += a2 * b1;
1461  xmm4 += a2 * b2;
1462  xmm5 += a3 * b3;
1463  xmm6 += a3 * b4;
1464  xmm7 += a4 * b3;
1465  xmm8 += a4 * b4;
1466  }
1467 
1468  for( ; k<kend; ++k ) {
1469  const SIMDType a1( set( A(i ,k) ) );
1470  const SIMDType a2( set( A(i+1UL,k) ) );
1471  const SIMDType b1( B.load(k,j ) );
1472  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1473  xmm1 += a1 * b1;
1474  xmm2 += a1 * b2;
1475  xmm3 += a2 * b1;
1476  xmm4 += a2 * b2;
1477  }
1478 
1479  C.store( i , j , xmm1+xmm5 );
1480  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
1481  C.store( i+1UL, j , xmm3+xmm7 );
1482  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
1483  }
1484 
1485  if( i < iend )
1486  {
1487  const size_t kbegin( ( IsUpper_v<MT4> )
1488  ?( ( IsLower_v<MT5> )
1489  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1490  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1491  :( IsLower_v<MT5> ? j : 0UL ) );
1492  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1493 
1494  SIMDType xmm1, xmm2, xmm3, xmm4;
1495  size_t k( kbegin );
1496 
1497  for( ; (k+2UL) <= kend; k+=2UL ) {
1498  const SIMDType a1( set( A(i,k ) ) );
1499  const SIMDType a2( set( A(i,k+1UL) ) );
1500  xmm1 += a1 * B.load(k ,j );
1501  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
1502  xmm3 += a2 * B.load(k+1UL,j );
1503  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
1504  }
1505 
1506  for( ; k<kend; ++k ) {
1507  const SIMDType a1( set( A(i,k) ) );
1508  xmm1 += a1 * B.load(k,j );
1509  xmm2 += a1 * B.load(k,j+SIMDSIZE);
1510  }
1511 
1512  C.store( i, j , xmm1+xmm3 );
1513  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
1514  }
1515  }
1516 
1517  for( ; j<jpos; j+=SIMDSIZE )
1518  {
1519  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
1520  size_t i( LOW ? j : 0UL );
1521 
1522  for( ; (i+4UL) <= iend; i+=4UL )
1523  {
1524  const size_t kbegin( ( IsUpper_v<MT4> )
1525  ?( ( IsLower_v<MT5> )
1526  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1527  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1528  :( IsLower_v<MT5> ? j : 0UL ) );
1529  const size_t kend( ( IsLower_v<MT4> )
1530  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1531  :( K ) );
1532 
1533  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1534  size_t k( kbegin );
1535 
1536  for( ; (k+2UL) <= kend; k+=2UL ) {
1537  const SIMDType b1( B.load(k ,j) );
1538  const SIMDType b2( B.load(k+1UL,j) );
1539  xmm1 += set( A(i ,k ) ) * b1;
1540  xmm2 += set( A(i+1UL,k ) ) * b1;
1541  xmm3 += set( A(i+2UL,k ) ) * b1;
1542  xmm4 += set( A(i+3UL,k ) ) * b1;
1543  xmm5 += set( A(i ,k+1UL) ) * b2;
1544  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
1545  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
1546  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
1547  }
1548 
1549  for( ; k<kend; ++k ) {
1550  const SIMDType b1( B.load(k,j) );
1551  xmm1 += set( A(i ,k) ) * b1;
1552  xmm2 += set( A(i+1UL,k) ) * b1;
1553  xmm3 += set( A(i+2UL,k) ) * b1;
1554  xmm4 += set( A(i+3UL,k) ) * b1;
1555  }
1556 
1557  C.store( i , j, xmm1+xmm5 );
1558  C.store( i+1UL, j, xmm2+xmm6 );
1559  C.store( i+2UL, j, xmm3+xmm7 );
1560  C.store( i+3UL, j, xmm4+xmm8 );
1561  }
1562 
1563  for( ; (i+3UL) <= iend; i+=3UL )
1564  {
1565  const size_t kbegin( ( IsUpper_v<MT4> )
1566  ?( ( IsLower_v<MT5> )
1567  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1568  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1569  :( IsLower_v<MT5> ? j : 0UL ) );
1570  const size_t kend( ( IsLower_v<MT4> )
1571  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1572  :( K ) );
1573 
1574  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1575  size_t k( kbegin );
1576 
1577  for( ; (k+2UL) <= kend; k+=2UL ) {
1578  const SIMDType b1( B.load(k ,j) );
1579  const SIMDType b2( B.load(k+1UL,j) );
1580  xmm1 += set( A(i ,k ) ) * b1;
1581  xmm2 += set( A(i+1UL,k ) ) * b1;
1582  xmm3 += set( A(i+2UL,k ) ) * b1;
1583  xmm4 += set( A(i ,k+1UL) ) * b2;
1584  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
1585  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
1586  }
1587 
1588  for( ; k<kend; ++k ) {
1589  const SIMDType b1( B.load(k,j) );
1590  xmm1 += set( A(i ,k) ) * b1;
1591  xmm2 += set( A(i+1UL,k) ) * b1;
1592  xmm3 += set( A(i+2UL,k) ) * b1;
1593  }
1594 
1595  C.store( i , j, xmm1+xmm4 );
1596  C.store( i+1UL, j, xmm2+xmm5 );
1597  C.store( i+2UL, j, xmm3+xmm6 );
1598  }
1599 
1600  for( ; (i+2UL) <= iend; i+=2UL )
1601  {
1602  const size_t kbegin( ( IsUpper_v<MT4> )
1603  ?( ( IsLower_v<MT5> )
1604  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1605  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1606  :( IsLower_v<MT5> ? j : 0UL ) );
1607  const size_t kend( ( IsLower_v<MT4> )
1608  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1609  :( K ) );
1610 
1611  SIMDType xmm1, xmm2, xmm3, xmm4;
1612  size_t k( kbegin );
1613 
1614  for( ; (k+2UL) <= kend; k+=2UL ) {
1615  const SIMDType b1( B.load(k ,j) );
1616  const SIMDType b2( B.load(k+1UL,j) );
1617  xmm1 += set( A(i ,k ) ) * b1;
1618  xmm2 += set( A(i+1UL,k ) ) * b1;
1619  xmm3 += set( A(i ,k+1UL) ) * b2;
1620  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
1621  }
1622 
1623  for( ; k<kend; ++k ) {
1624  const SIMDType b1( B.load(k,j) );
1625  xmm1 += set( A(i ,k) ) * b1;
1626  xmm2 += set( A(i+1UL,k) ) * b1;
1627  }
1628 
1629  C.store( i , j, xmm1+xmm3 );
1630  C.store( i+1UL, j, xmm2+xmm4 );
1631  }
1632 
1633  if( i < iend )
1634  {
1635  const size_t kbegin( ( IsUpper_v<MT4> )
1636  ?( ( IsLower_v<MT5> )
1637  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1638  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1639  :( IsLower_v<MT5> ? j : 0UL ) );
1640 
1641  SIMDType xmm1, xmm2;
1642  size_t k( kbegin );
1643 
1644  for( ; (k+2UL) <= K; k+=2UL ) {
1645  xmm1 += set( A(i,k ) ) * B.load(k ,j);
1646  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
1647  }
1648 
1649  for( ; k<K; ++k ) {
1650  xmm1 += set( A(i,k) ) * B.load(k,j);
1651  }
1652 
1653  C.store( i, j, xmm1+xmm2 );
1654  }
1655  }
1656 
1657  for( ; remainder && j<N; ++j )
1658  {
1659  size_t i( LOW && UPP ? j : 0UL );
1660 
1661  for( ; (i+2UL) <= M; i+=2UL )
1662  {
1663  const size_t kbegin( ( IsUpper_v<MT4> )
1664  ?( ( IsLower_v<MT5> )
1665  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1666  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1667  :( IsLower_v<MT5> ? j : 0UL ) );
1668  const size_t kend( ( IsLower_v<MT4> )
1669  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1670  :( K ) );
1671 
1672  ElementType value1{};
1673  ElementType value2{};
1674 
1675  for( size_t k=kbegin; k<kend; ++k ) {
1676  value1 += A(i ,k) * B(k,j);
1677  value2 += A(i+1UL,k) * B(k,j);
1678  }
1679 
1680  C(i ,j) = value1;
1681  C(i+1UL,j) = value2;
1682  }
1683 
1684  if( i < M )
1685  {
1686  const size_t kbegin( ( IsUpper_v<MT4> )
1687  ?( ( IsLower_v<MT5> )
1688  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1689  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1690  :( IsLower_v<MT5> ? j : 0UL ) );
1691 
1692  ElementType value{};
1693 
1694  for( size_t k=kbegin; k<K; ++k ) {
1695  value += A(i,k) * B(k,j);
1696  }
1697 
1698  C(i,j) = value;
1699  }
1700  }
1701  }
1702 
1703  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1704  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1705  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1706  for( size_t j=0UL; j<jend; ++j ) {
1707  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1708  }
1709  }
1710  }
1711  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1712  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1713  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1714  for( size_t i=0UL; i<iend; ++i ) {
1715  reset( C(i,j) );
1716  }
1717  }
1718  }
1719  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1720  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1721  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1722  for( size_t j=0UL; j<jend; ++j ) {
1723  reset( C(i,j) );
1724  }
1725  }
1726  }
1727  }
1729  //**********************************************************************************************
1730 
1731  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
1746  template< typename MT3 // Type of the left-hand side target matrix
1747  , typename MT4 // Type of the left-hand side matrix operand
1748  , typename MT5 > // Type of the right-hand side matrix operand
1749  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1750  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1751  {
1752  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
1753 
1754  const size_t M( A.rows() );
1755  const size_t N( B.columns() );
1756  const size_t K( A.columns() );
1757 
1758  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1759 
1760  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
1761  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1762 
1763  if( LOW && UPP && M > SIMDSIZE*3UL ) {
1764  reset( C );
1765  }
1766 
1767  {
1768  size_t i( 0UL );
1769 
1770  if( IsIntegral_v<ElementType> )
1771  {
1772  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1773  for( size_t j=0UL; j<N; ++j )
1774  {
1775  const size_t kbegin( ( IsLower_v<MT5> )
1776  ?( ( IsUpper_v<MT4> )
1777  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1778  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1779  :( IsUpper_v<MT4> ? i : 0UL ) );
1780  const size_t kend( ( IsUpper_v<MT5> )
1781  ?( ( IsLower_v<MT4> )
1782  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
1783  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
1784  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
1785 
1786  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1787 
1788  for( size_t k=kbegin; k<kend; ++k ) {
1789  const SIMDType b1( set( B(k,j) ) );
1790  xmm1 += A.load(i ,k) * b1;
1791  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1792  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1793  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1794  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1795  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
1796  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
1797  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
1798  }
1799 
1800  C.store( i , j, xmm1 );
1801  C.store( i+SIMDSIZE , j, xmm2 );
1802  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1803  C.store( i+SIMDSIZE*3UL, j, xmm4 );
1804  C.store( i+SIMDSIZE*4UL, j, xmm5 );
1805  C.store( i+SIMDSIZE*5UL, j, xmm6 );
1806  C.store( i+SIMDSIZE*6UL, j, xmm7 );
1807  C.store( i+SIMDSIZE*7UL, j, xmm8 );
1808  }
1809  }
1810  }
1811 
1812  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
1813  {
1814  size_t j( 0UL );
1815 
1816  for( ; (j+2UL) <= N; j+=2UL )
1817  {
1818  const size_t kbegin( ( IsLower_v<MT5> )
1819  ?( ( IsUpper_v<MT4> )
1820  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1821  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1822  :( IsUpper_v<MT4> ? i : 0UL ) );
1823  const size_t kend( ( IsUpper_v<MT5> )
1824  ?( ( IsLower_v<MT4> )
1825  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1826  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1827  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
1828 
1829  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1830 
1831  for( size_t k=kbegin; k<kend; ++k ) {
1832  const SIMDType a1( A.load(i ,k) );
1833  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1834  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1835  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1836  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1837  const SIMDType b1( set( B(k,j ) ) );
1838  const SIMDType b2( set( B(k,j+1UL) ) );
1839  xmm1 += a1 * b1;
1840  xmm2 += a2 * b1;
1841  xmm3 += a3 * b1;
1842  xmm4 += a4 * b1;
1843  xmm5 += a5 * b1;
1844  xmm6 += a1 * b2;
1845  xmm7 += a2 * b2;
1846  xmm8 += a3 * b2;
1847  xmm9 += a4 * b2;
1848  xmm10 += a5 * b2;
1849  }
1850 
1851  C.store( i , j , xmm1 );
1852  C.store( i+SIMDSIZE , j , xmm2 );
1853  C.store( i+SIMDSIZE*2UL, j , xmm3 );
1854  C.store( i+SIMDSIZE*3UL, j , xmm4 );
1855  C.store( i+SIMDSIZE*4UL, j , xmm5 );
1856  C.store( i , j+1UL, xmm6 );
1857  C.store( i+SIMDSIZE , j+1UL, xmm7 );
1858  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1859  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1860  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1861  }
1862 
1863  if( j < N )
1864  {
1865  const size_t kbegin( ( IsLower_v<MT5> )
1866  ?( ( IsUpper_v<MT4> )
1867  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1868  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1869  :( IsUpper_v<MT4> ? i : 0UL ) );
1870  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
1871 
1872  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1873 
1874  for( size_t k=kbegin; k<kend; ++k ) {
1875  const SIMDType b1( set( B(k,j) ) );
1876  xmm1 += A.load(i ,k) * b1;
1877  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1878  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1879  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1880  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1881  }
1882 
1883  C.store( i , j, xmm1 );
1884  C.store( i+SIMDSIZE , j, xmm2 );
1885  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1886  C.store( i+SIMDSIZE*3UL, j, xmm4 );
1887  C.store( i+SIMDSIZE*4UL, j, xmm5 );
1888  }
1889  }
1890 
1891  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1892  {
1893  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
1894  size_t j( UPP ? i : 0UL );
1895 
1896  for( ; (j+2UL) <= jend; j+=2UL )
1897  {
1898  const size_t kbegin( ( IsLower_v<MT5> )
1899  ?( ( IsUpper_v<MT4> )
1900  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1901  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1902  :( IsUpper_v<MT4> ? i : 0UL ) );
1903  const size_t kend( ( IsUpper_v<MT5> )
1904  ?( ( IsLower_v<MT4> )
1905  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1906  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1907  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
1908 
1909  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1910 
1911  for( size_t k=kbegin; k<kend; ++k ) {
1912  const SIMDType a1( A.load(i ,k) );
1913  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1914  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1915  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1916  const SIMDType b1( set( B(k,j ) ) );
1917  const SIMDType b2( set( B(k,j+1UL) ) );
1918  xmm1 += a1 * b1;
1919  xmm2 += a2 * b1;
1920  xmm3 += a3 * b1;
1921  xmm4 += a4 * b1;
1922  xmm5 += a1 * b2;
1923  xmm6 += a2 * b2;
1924  xmm7 += a3 * b2;
1925  xmm8 += a4 * b2;
1926  }
1927 
1928  C.store( i , j , xmm1 );
1929  C.store( i+SIMDSIZE , j , xmm2 );
1930  C.store( i+SIMDSIZE*2UL, j , xmm3 );
1931  C.store( i+SIMDSIZE*3UL, j , xmm4 );
1932  C.store( i , j+1UL, xmm5 );
1933  C.store( i+SIMDSIZE , j+1UL, xmm6 );
1934  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1935  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1936  }
1937 
1938  if( j < jend )
1939  {
1940  const size_t kbegin( ( IsLower_v<MT5> )
1941  ?( ( IsUpper_v<MT4> )
1942  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1943  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1944  :( IsUpper_v<MT4> ? i : 0UL ) );
1945  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
1946 
1947  SIMDType xmm1, xmm2, xmm3, xmm4;
1948 
1949  for( size_t k=kbegin; k<kend; ++k ) {
1950  const SIMDType b1( set( B(k,j) ) );
1951  xmm1 += A.load(i ,k) * b1;
1952  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1953  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1954  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1955  }
1956 
1957  C.store( i , j, xmm1 );
1958  C.store( i+SIMDSIZE , j, xmm2 );
1959  C.store( i+SIMDSIZE*2UL, j, xmm3 );
1960  C.store( i+SIMDSIZE*3UL, j, xmm4 );
1961  }
1962  }
1963 
1964  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1965  {
1966  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
1967  size_t j( UPP ? i : 0UL );
1968 
1969  for( ; (j+2UL) <= jend; j+=2UL )
1970  {
1971  const size_t kbegin( ( IsLower_v<MT5> )
1972  ?( ( IsUpper_v<MT4> )
1973  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1974  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1975  :( IsUpper_v<MT4> ? i : 0UL ) );
1976  const size_t kend( ( IsUpper_v<MT5> )
1977  ?( ( IsLower_v<MT4> )
1978  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1979  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1980  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
1981 
1982  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1983 
1984  for( size_t k=kbegin; k<kend; ++k ) {
1985  const SIMDType a1( A.load(i ,k) );
1986  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1987  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1988  const SIMDType b1( set( B(k,j ) ) );
1989  const SIMDType b2( set( B(k,j+1UL) ) );
1990  xmm1 += a1 * b1;
1991  xmm2 += a2 * b1;
1992  xmm3 += a3 * b1;
1993  xmm4 += a1 * b2;
1994  xmm5 += a2 * b2;
1995  xmm6 += a3 * b2;
1996  }
1997 
1998  C.store( i , j , xmm1 );
1999  C.store( i+SIMDSIZE , j , xmm2 );
2000  C.store( i+SIMDSIZE*2UL, j , xmm3 );
2001  C.store( i , j+1UL, xmm4 );
2002  C.store( i+SIMDSIZE , j+1UL, xmm5 );
2003  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2004  }
2005 
2006  if( j < jend )
2007  {
2008  const size_t kbegin( ( IsLower_v<MT5> )
2009  ?( ( IsUpper_v<MT4> )
2010  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2011  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2012  :( IsUpper_v<MT4> ? i : 0UL ) );
2013  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
2014 
2015  SIMDType xmm1, xmm2, xmm3;
2016 
2017  for( size_t k=kbegin; k<kend; ++k ) {
2018  const SIMDType b1( set( B(k,j) ) );
2019  xmm1 += A.load(i ,k) * b1;
2020  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2021  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2022  }
2023 
2024  C.store( i , j, xmm1 );
2025  C.store( i+SIMDSIZE , j, xmm2 );
2026  C.store( i+SIMDSIZE*2UL, j, xmm3 );
2027  }
2028  }
2029 
2030  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2031  {
2032  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
2033  size_t j( UPP ? i : 0UL );
2034 
2035  for( ; (j+4UL) <= jend; j+=4UL )
2036  {
2037  const size_t kbegin( ( IsLower_v<MT5> )
2038  ?( ( IsUpper_v<MT4> )
2039  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2040  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2041  :( IsUpper_v<MT4> ? i : 0UL ) );
2042  const size_t kend( ( IsUpper_v<MT5> )
2043  ?( ( IsLower_v<MT4> )
2044  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2045  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2046  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2047 
2048  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2049 
2050  for( size_t k=kbegin; k<kend; ++k ) {
2051  const SIMDType a1( A.load(i ,k) );
2052  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2053  const SIMDType b1( set( B(k,j ) ) );
2054  const SIMDType b2( set( B(k,j+1UL) ) );
2055  const SIMDType b3( set( B(k,j+2UL) ) );
2056  const SIMDType b4( set( B(k,j+3UL) ) );
2057  xmm1 += a1 * b1;
2058  xmm2 += a2 * b1;
2059  xmm3 += a1 * b2;
2060  xmm4 += a2 * b2;
2061  xmm5 += a1 * b3;
2062  xmm6 += a2 * b3;
2063  xmm7 += a1 * b4;
2064  xmm8 += a2 * b4;
2065  }
2066 
2067  C.store( i , j , xmm1 );
2068  C.store( i+SIMDSIZE, j , xmm2 );
2069  C.store( i , j+1UL, xmm3 );
2070  C.store( i+SIMDSIZE, j+1UL, xmm4 );
2071  C.store( i , j+2UL, xmm5 );
2072  C.store( i+SIMDSIZE, j+2UL, xmm6 );
2073  C.store( i , j+3UL, xmm7 );
2074  C.store( i+SIMDSIZE, j+3UL, xmm8 );
2075  }
2076 
2077  for( ; (j+3UL) <= jend; j+=3UL )
2078  {
2079  const size_t kbegin( ( IsLower_v<MT5> )
2080  ?( ( IsUpper_v<MT4> )
2081  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2082  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2083  :( IsUpper_v<MT4> ? i : 0UL ) );
2084  const size_t kend( ( IsUpper_v<MT5> )
2085  ?( ( IsLower_v<MT4> )
2086  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2087  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2088  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2089 
2090  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2091 
2092  for( size_t k=kbegin; k<kend; ++k ) {
2093  const SIMDType a1( A.load(i ,k) );
2094  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2095  const SIMDType b1( set( B(k,j ) ) );
2096  const SIMDType b2( set( B(k,j+1UL) ) );
2097  const SIMDType b3( set( B(k,j+2UL) ) );
2098  xmm1 += a1 * b1;
2099  xmm2 += a2 * b1;
2100  xmm3 += a1 * b2;
2101  xmm4 += a2 * b2;
2102  xmm5 += a1 * b3;
2103  xmm6 += a2 * b3;
2104  }
2105 
2106  C.store( i , j , xmm1 );
2107  C.store( i+SIMDSIZE, j , xmm2 );
2108  C.store( i , j+1UL, xmm3 );
2109  C.store( i+SIMDSIZE, j+1UL, xmm4 );
2110  C.store( i , j+2UL, xmm5 );
2111  C.store( i+SIMDSIZE, j+2UL, xmm6 );
2112  }
2113 
2114  for( ; (j+2UL) <= jend; j+=2UL )
2115  {
2116  const size_t kbegin( ( IsLower_v<MT5> )
2117  ?( ( IsUpper_v<MT4> )
2118  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2119  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2120  :( IsUpper_v<MT4> ? i : 0UL ) );
2121  const size_t kend( ( IsUpper_v<MT5> )
2122  ?( ( IsLower_v<MT4> )
2123  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2124  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2125  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
2126 
2127  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2128  size_t k( kbegin );
2129 
2130  for( ; (k+2UL) <= kend; k+=2UL ) {
2131  const SIMDType a1( A.load(i ,k ) );
2132  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2133  const SIMDType a3( A.load(i ,k+1UL) );
2134  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2135  const SIMDType b1( set( B(k ,j ) ) );
2136  const SIMDType b2( set( B(k ,j+1UL) ) );
2137  const SIMDType b3( set( B(k+1UL,j ) ) );
2138  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
2139  xmm1 += a1 * b1;
2140  xmm2 += a2 * b1;
2141  xmm3 += a1 * b2;
2142  xmm4 += a2 * b2;
2143  xmm5 += a3 * b3;
2144  xmm6 += a4 * b3;
2145  xmm7 += a3 * b4;
2146  xmm8 += a4 * b4;
2147  }
2148 
2149  for( ; k<kend; ++k ) {
2150  const SIMDType a1( A.load(i ,k) );
2151  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2152  const SIMDType b1( set( B(k,j ) ) );
2153  const SIMDType b2( set( B(k,j+1UL) ) );
2154  xmm1 += a1 * b1;
2155  xmm2 += a2 * b1;
2156  xmm3 += a1 * b2;
2157  xmm4 += a2 * b2;
2158  }
2159 
2160  C.store( i , j , xmm1+xmm5 );
2161  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
2162  C.store( i , j+1UL, xmm3+xmm7 );
2163  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2164  }
2165 
2166  if( j < jend )
2167  {
2168  const size_t kbegin( ( IsLower_v<MT5> )
2169  ?( ( IsUpper_v<MT4> )
2170  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2171  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2172  :( IsUpper_v<MT4> ? i : 0UL ) );
2173  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2174 
2175  SIMDType xmm1, xmm2, xmm3, xmm4;
2176  size_t k( kbegin );
2177 
2178  for( ; (k+2UL) <= kend; k+=2UL ) {
2179  const SIMDType b1( set( B(k ,j) ) );
2180  const SIMDType b2( set( B(k+1UL,j) ) );
2181  xmm1 += A.load(i ,k ) * b1;
2182  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2183  xmm3 += A.load(i ,k+1UL) * b2;
2184  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2185  }
2186 
2187  for( ; k<kend; ++k ) {
2188  const SIMDType b1( set( B(k,j) ) );
2189  xmm1 += A.load(i ,k) * b1;
2190  xmm2 += A.load(i+SIMDSIZE,k) * b1;
2191  }
2192 
2193  C.store( i , j, xmm1+xmm3 );
2194  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
2195  }
2196  }
2197 
2198  for( ; i<ipos; i+=SIMDSIZE )
2199  {
2200  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
2201  size_t j( UPP ? i : 0UL );
2202 
2203  for( ; (j+4UL) <= jend; j+=4UL )
2204  {
2205  const size_t kbegin( ( IsLower_v<MT5> )
2206  ?( ( IsUpper_v<MT4> )
2207  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2208  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2209  :( IsUpper_v<MT4> ? i : 0UL ) );
2210  const size_t kend( ( IsUpper_v<MT5> )
2211  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2212  :( K ) );
2213 
2214  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2215  size_t k( kbegin );
2216 
2217  for( ; (k+2UL) <= kend; k+=2UL ) {
2218  const SIMDType a1( A.load(i,k ) );
2219  const SIMDType a2( A.load(i,k+1UL) );
2220  xmm1 += a1 * set( B(k ,j ) );
2221  xmm2 += a1 * set( B(k ,j+1UL) );
2222  xmm3 += a1 * set( B(k ,j+2UL) );
2223  xmm4 += a1 * set( B(k ,j+3UL) );
2224  xmm5 += a2 * set( B(k+1UL,j ) );
2225  xmm6 += a2 * set( B(k+1UL,j+1UL) );
2226  xmm7 += a2 * set( B(k+1UL,j+2UL) );
2227  xmm8 += a2 * set( B(k+1UL,j+3UL) );
2228  }
2229 
2230  for( ; k<kend; ++k ) {
2231  const SIMDType a1( A.load(i,k) );
2232  xmm1 += a1 * set( B(k,j ) );
2233  xmm2 += a1 * set( B(k,j+1UL) );
2234  xmm3 += a1 * set( B(k,j+2UL) );
2235  xmm4 += a1 * set( B(k,j+3UL) );
2236  }
2237 
2238  C.store( i, j , xmm1+xmm5 );
2239  C.store( i, j+1UL, xmm2+xmm6 );
2240  C.store( i, j+2UL, xmm3+xmm7 );
2241  C.store( i, j+3UL, xmm4+xmm8 );
2242  }
2243 
2244  for( ; (j+3UL) <= jend; j+=3UL )
2245  {
2246  const size_t kbegin( ( IsLower_v<MT5> )
2247  ?( ( IsUpper_v<MT4> )
2248  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2249  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2250  :( IsUpper_v<MT4> ? i : 0UL ) );
2251  const size_t kend( ( IsUpper_v<MT5> )
2252  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2253  :( K ) );
2254 
2255  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2256  size_t k( kbegin );
2257 
2258  for( ; (k+2UL) <= kend; k+=2UL ) {
2259  const SIMDType a1( A.load(i,k ) );
2260  const SIMDType a2( A.load(i,k+1UL) );
2261  xmm1 += a1 * set( B(k ,j ) );
2262  xmm2 += a1 * set( B(k ,j+1UL) );
2263  xmm3 += a1 * set( B(k ,j+2UL) );
2264  xmm4 += a2 * set( B(k+1UL,j ) );
2265  xmm5 += a2 * set( B(k+1UL,j+1UL) );
2266  xmm6 += a2 * set( B(k+1UL,j+2UL) );
2267  }
2268 
2269  for( ; k<kend; ++k ) {
2270  const SIMDType a1( A.load(i,k) );
2271  xmm1 += a1 * set( B(k,j ) );
2272  xmm2 += a1 * set( B(k,j+1UL) );
2273  xmm3 += a1 * set( B(k,j+2UL) );
2274  }
2275 
2276  C.store( i, j , xmm1+xmm4 );
2277  C.store( i, j+1UL, xmm2+xmm5 );
2278  C.store( i, j+2UL, xmm3+xmm6 );
2279  }
2280 
2281  for( ; (j+2UL) <= jend; j+=2UL )
2282  {
2283  const size_t kbegin( ( IsLower_v<MT5> )
2284  ?( ( IsUpper_v<MT4> )
2285  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2286  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2287  :( IsUpper_v<MT4> ? i : 0UL ) );
2288  const size_t kend( ( IsUpper_v<MT5> )
2289  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2290  :( K ) );
2291 
2292  SIMDType xmm1, xmm2, xmm3, xmm4;
2293  size_t k( kbegin );
2294 
2295  for( ; (k+2UL) <= kend; k+=2UL ) {
2296  const SIMDType a1( A.load(i,k ) );
2297  const SIMDType a2( A.load(i,k+1UL) );
2298  xmm1 += a1 * set( B(k ,j ) );
2299  xmm2 += a1 * set( B(k ,j+1UL) );
2300  xmm3 += a2 * set( B(k+1UL,j ) );
2301  xmm4 += a2 * set( B(k+1UL,j+1UL) );
2302  }
2303 
2304  for( ; k<kend; ++k ) {
2305  const SIMDType a1( A.load(i,k) );
2306  xmm1 += a1 * set( B(k,j ) );
2307  xmm2 += a1 * set( B(k,j+1UL) );
2308  }
2309 
2310  C.store( i, j , xmm1+xmm3 );
2311  C.store( i, j+1UL, xmm2+xmm4 );
2312  }
2313 
2314  if( j < jend )
2315  {
2316  const size_t kbegin( ( IsLower_v<MT5> )
2317  ?( ( IsUpper_v<MT4> )
2318  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2319  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2320  :( IsUpper_v<MT4> ? i : 0UL ) );
2321 
2322  SIMDType xmm1, xmm2;
2323  size_t k( kbegin );
2324 
2325  for( ; (k+2UL) <= K; k+=2UL ) {
2326  xmm1 += A.load(i,k ) * set( B(k ,j) );
2327  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
2328  }
2329 
2330  for( ; k<K; ++k ) {
2331  xmm1 += A.load(i,k) * set( B(k,j) );
2332  }
2333 
2334  C.store( i, j, xmm1+xmm2 );
2335  }
2336  }
2337 
2338  for( ; remainder && i<M; ++i )
2339  {
2340  size_t j( LOW && UPP ? i : 0UL );
2341 
2342  for( ; (j+2UL) <= N; j+=2UL )
2343  {
2344  const size_t kbegin( ( IsLower_v<MT5> )
2345  ?( ( IsUpper_v<MT4> )
2346  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2347  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2348  :( IsUpper_v<MT4> ? i : 0UL ) );
2349  const size_t kend( ( IsUpper_v<MT5> )
2350  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2351  :( K ) );
2352 
2353  ElementType value1{};
2354  ElementType value2{};
2355 
2356  for( size_t k=kbegin; k<kend; ++k ) {
2357  value1 += A(i,k) * B(k,j );
2358  value2 += A(i,k) * B(k,j+1UL);
2359  }
2360 
2361  C(i,j ) = value1;
2362  C(i,j+1UL) = value2;
2363  }
2364 
2365  if( j < N )
2366  {
2367  const size_t kbegin( ( IsLower_v<MT5> )
2368  ?( ( IsUpper_v<MT4> )
2369  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2370  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2371  :( IsUpper_v<MT4> ? i : 0UL ) );
2372 
2373  ElementType value{};
2374 
2375  for( size_t k=kbegin; k<K; ++k ) {
2376  value += A(i,k) * B(k,j);
2377  }
2378 
2379  C(i,j) = value;
2380  }
2381  }
2382  }
2383 
2384  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
2385  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
2386  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
2387  for( size_t i=0UL; i<iend; ++i ) {
2388  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
2389  }
2390  }
2391  }
2392  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
2393  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
2394  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
2395  for( size_t i=0UL; i<iend; ++i ) {
2396  reset( C(i,j) );
2397  }
2398  }
2399  }
2400  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
2401  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
2402  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
2403  for( size_t j=0UL; j<jend; ++j ) {
2404  reset( C(i,j) );
2405  }
2406  }
2407  }
2408  }
2410  //**********************************************************************************************
2411 
2412  //**Default assignment to dense matrices (large matrices)***************************************
2426  template< typename MT3 // Type of the left-hand side target matrix
2427  , typename MT4 // Type of the left-hand side matrix operand
2428  , typename MT5 > // Type of the right-hand side matrix operand
2429  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2430  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2431  {
2432  selectDefaultAssignKernel( C, A, B );
2433  }
2435  //**********************************************************************************************
2436 
2437  //**Vectorized default assignment to dense matrices (large matrices)****************************
2452  template< typename MT3 // Type of the left-hand side target matrix
2453  , typename MT4 // Type of the left-hand side matrix operand
2454  , typename MT5 > // Type of the right-hand side matrix operand
2455  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2456  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2457  {
2458  if( SYM )
2459  smmm( C, A, B, ElementType(1) );
2460  else if( HERM )
2461  hmmm( C, A, B, ElementType(1) );
2462  else if( LOW )
2463  lmmm( C, A, B, ElementType(1), ElementType(0) );
2464  else if( UPP )
2465  ummm( C, A, B, ElementType(1), ElementType(0) );
2466  else
2467  mmm( C, A, B, ElementType(1), ElementType(0) );
2468  }
2470  //**********************************************************************************************
2471 
2472  //**BLAS-based assignment to dense matrices (default)*******************************************
2486  template< typename MT3 // Type of the left-hand side target matrix
2487  , typename MT4 // Type of the left-hand side matrix operand
2488  , typename MT5 > // Type of the right-hand side matrix operand
2489  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2490  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2491  {
2492  selectLargeAssignKernel( C, A, B );
2493  }
2495  //**********************************************************************************************
2496 
2497  //**BLAS-based assignment to dense matrices*****************************************************
2498 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2499 
2512  template< typename MT3 // Type of the left-hand side target matrix
2513  , typename MT4 // Type of the left-hand side matrix operand
2514  , typename MT5 > // Type of the right-hand side matrix operand
2515  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2516  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2517  {
2518  using ET = ElementType_t<MT3>;
2519 
2520  if( IsTriangular_v<MT4> ) {
2521  assign( C, B );
2522  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2523  }
2524  else if( IsTriangular_v<MT5> ) {
2525  assign( C, A );
2526  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2527  }
2528  else {
2529  gemm( C, A, B, ET(1), ET(0) );
2530  }
2531  }
2533 #endif
2534  //**********************************************************************************************
2535 
2536  //**Assignment to sparse matrices***************************************************************
2549  template< typename MT // Type of the target sparse matrix
2550  , bool SO > // Storage order of the target sparse matrix
2551  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2552  {
2554 
2555  using TmpType = If_t< SO, ResultType, OppositeType >;
2556 
2563 
2564  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2565  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2566 
2567  const ForwardFunctor fwd;
2568 
2569  const TmpType tmp( serial( rhs ) );
2570  assign( ~lhs, fwd( tmp ) );
2571  }
2573  //**********************************************************************************************
2574 
2575  //**Addition assignment to dense matrices*******************************************************
2588  template< typename MT // Type of the target dense matrix
2589  , bool SO > // Storage order of the target dense matrix
2590  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2591  {
2593 
2594  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2595  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2596 
2597  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2598  return;
2599  }
2600 
2601  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2602  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2603 
2604  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2605  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2606  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2607  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2608  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2609  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2610 
2611  TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
2612  }
2614  //**********************************************************************************************
2615 
2616  //**Addition assignment to dense matrices (kernel selection)************************************
2627  template< typename MT3 // Type of the left-hand side target matrix
2628  , typename MT4 // Type of the left-hand side matrix operand
2629  , typename MT5 > // Type of the right-hand side matrix operand
2630  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2631  {
2632  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
2633  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
2634  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
2635  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2636  selectSmallAddAssignKernel( C, A, B );
2637  else
2638  selectBlasAddAssignKernel( C, A, B );
2639  }
2641  //**********************************************************************************************
2642 
2643  //**Default addition assignment to row-major dense matrices (general/general)*******************
2657  template< typename MT3 // Type of the left-hand side target matrix
2658  , typename MT4 // Type of the left-hand side matrix operand
2659  , typename MT5 > // Type of the right-hand side matrix operand
2660  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2661  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2662  {
2663  const size_t M( A.rows() );
2664  const size_t N( B.columns() );
2665  const size_t K( A.columns() );
2666 
2667  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2668 
2669  for( size_t i=0UL; i<M; ++i )
2670  {
2671  const size_t kbegin( ( IsUpper_v<MT4> )
2672  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2673  :( 0UL ) );
2674  const size_t kend( ( IsLower_v<MT4> )
2675  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2676  :( K ) );
2677  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2678 
2679  for( size_t k=kbegin; k<kend; ++k )
2680  {
2681  const size_t jbegin( ( IsUpper_v<MT5> )
2682  ?( ( IsStrictlyUpper_v<MT5> )
2683  ?( UPP ? max(i,k+1UL) : k+1UL )
2684  :( UPP ? max(i,k) : k ) )
2685  :( UPP ? i : 0UL ) );
2686  const size_t jend( ( IsLower_v<MT5> )
2687  ?( ( IsStrictlyLower_v<MT5> )
2688  ?( LOW ? min(i+1UL,k) : k )
2689  :( LOW ? min(i,k)+1UL : k+1UL ) )
2690  :( LOW ? i+1UL : N ) );
2691 
2692  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
2693  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2694 
2695  const size_t jnum( jend - jbegin );
2696  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2697 
2698  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2699  C(i,j ) += A(i,k) * B(k,j );
2700  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2701  }
2702  if( jpos < jend ) {
2703  C(i,jpos) += A(i,k) * B(k,jpos);
2704  }
2705  }
2706  }
2707  }
2709  //**********************************************************************************************
2710 
2711  //**Default addition assignment to column-major dense matrices (general/general)****************
2725  template< typename MT3 // Type of the left-hand side target matrix
2726  , typename MT4 // Type of the left-hand side matrix operand
2727  , typename MT5 > // Type of the right-hand side matrix operand
2728  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2729  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2730  {
2731  const size_t M( A.rows() );
2732  const size_t N( B.columns() );
2733  const size_t K( A.columns() );
2734 
2735  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2736 
2737  for( size_t j=0UL; j<N; ++j )
2738  {
2739  const size_t kbegin( ( IsLower_v<MT5> )
2740  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2741  :( 0UL ) );
2742  const size_t kend( ( IsUpper_v<MT5> )
2743  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2744  :( K ) );
2745  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2746 
2747  for( size_t k=kbegin; k<kend; ++k )
2748  {
2749  const size_t ibegin( ( IsLower_v<MT4> )
2750  ?( ( IsStrictlyLower_v<MT4> )
2751  ?( LOW ? max(j,k+1UL) : k+1UL )
2752  :( LOW ? max(j,k) : k ) )
2753  :( LOW ? j : 0UL ) );
2754  const size_t iend( ( IsUpper_v<MT4> )
2755  ?( ( IsStrictlyUpper_v<MT4> )
2756  ?( UPP ? min(j+1UL,k) : k )
2757  :( UPP ? min(j,k)+1UL : k+1UL ) )
2758  :( UPP ? j+1UL : M ) );
2759 
2760  if( ( LOW || UPP ) && ibegin >= iend ) continue;
2761  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2762 
2763  const size_t inum( iend - ibegin );
2764  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2765 
2766  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2767  C(i ,j) += A(i ,k) * B(k,j);
2768  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2769  }
2770  if( ipos < iend ) {
2771  C(ipos,j) += A(ipos,k) * B(k,j);
2772  }
2773  }
2774  }
2775  }
2777  //**********************************************************************************************
2778 
2779  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
2793  template< typename MT3 // Type of the left-hand side target matrix
2794  , typename MT4 // Type of the left-hand side matrix operand
2795  , typename MT5 > // Type of the right-hand side matrix operand
2796  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2797  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2798  {
2799  constexpr size_t block( BLOCK_SIZE );
2800 
2801  const size_t M( A.rows() );
2802  const size_t N( B.columns() );
2803 
2804  for( size_t ii=0UL; ii<M; ii+=block ) {
2805  const size_t iend( min( M, ii+block ) );
2806  for( size_t jj=0UL; jj<N; jj+=block ) {
2807  const size_t jend( min( N, jj+block ) );
2808  for( size_t i=ii; i<iend; ++i )
2809  {
2810  const size_t jbegin( ( IsUpper_v<MT4> )
2811  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
2812  :( jj ) );
2813  const size_t jpos( ( IsLower_v<MT4> )
2814  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
2815  :( jend ) );
2816 
2817  for( size_t j=jbegin; j<jpos; ++j ) {
2818  C(i,j) += A(i,j) * B(j,j);
2819  }
2820  }
2821  }
2822  }
2823  }
2825  //**********************************************************************************************
2826 
2827  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
2841  template< typename MT3 // Type of the left-hand side target matrix
2842  , typename MT4 // Type of the left-hand side matrix operand
2843  , typename MT5 > // Type of the right-hand side matrix operand
2844  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2845  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2846  {
2847  const size_t M( A.rows() );
2848  const size_t N( B.columns() );
2849 
2850  for( size_t j=0UL; j<N; ++j )
2851  {
2852  const size_t ibegin( ( IsLower_v<MT4> )
2853  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
2854  :( 0UL ) );
2855  const size_t iend( ( IsUpper_v<MT4> )
2856  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
2857  :( M ) );
2858  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2859 
2860  const size_t inum( iend - ibegin );
2861  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2862 
2863  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2864  C(i ,j) += A(i ,j) * B(j,j);
2865  C(i+1UL,j) += A(i+1UL,j) * B(j,j);
2866  }
2867  if( ipos < iend ) {
2868  C(ipos,j) += A(ipos,j) * B(j,j);
2869  }
2870  }
2871  }
2873  //**********************************************************************************************
2874 
2875  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
2889  template< typename MT3 // Type of the left-hand side target matrix
2890  , typename MT4 // Type of the left-hand side matrix operand
2891  , typename MT5 > // Type of the right-hand side matrix operand
2892  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2893  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2894  {
2895  const size_t M( A.rows() );
2896  const size_t N( B.columns() );
2897 
2898  for( size_t i=0UL; i<M; ++i )
2899  {
2900  const size_t jbegin( ( IsUpper_v<MT5> )
2901  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
2902  :( 0UL ) );
2903  const size_t jend( ( IsLower_v<MT5> )
2904  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
2905  :( N ) );
2906  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2907 
2908  const size_t jnum( jend - jbegin );
2909  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2910 
2911  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2912  C(i,j ) += A(i,i) * B(i,j );
2913  C(i,j+1UL) += A(i,i) * B(i,j+1UL);
2914  }
2915  if( jpos < jend ) {
2916  C(i,jpos) += A(i,i) * B(i,jpos);
2917  }
2918  }
2919  }
2921  //**********************************************************************************************
2922 
2923  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
2937  template< typename MT3 // Type of the left-hand side target matrix
2938  , typename MT4 // Type of the left-hand side matrix operand
2939  , typename MT5 > // Type of the right-hand side matrix operand
2940  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2941  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2942  {
2943  constexpr size_t block( BLOCK_SIZE );
2944 
2945  const size_t M( A.rows() );
2946  const size_t N( B.columns() );
2947 
2948  for( size_t jj=0UL; jj<N; jj+=block ) {
2949  const size_t jend( min( N, jj+block ) );
2950  for( size_t ii=0UL; ii<M; ii+=block ) {
2951  const size_t iend( min( M, ii+block ) );
2952  for( size_t j=jj; j<jend; ++j )
2953  {
2954  const size_t ibegin( ( IsLower_v<MT5> )
2955  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
2956  :( ii ) );
2957  const size_t ipos( ( IsUpper_v<MT5> )
2958  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
2959  :( iend ) );
2960 
2961  for( size_t i=ibegin; i<ipos; ++i ) {
2962  C(i,j) += A(i,i) * B(i,j);
2963  }
2964  }
2965  }
2966  }
2967  }
2969  //**********************************************************************************************
2970 
2971  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2985  template< typename MT3 // Type of the left-hand side target matrix
2986  , typename MT4 // Type of the left-hand side matrix operand
2987  , typename MT5 > // Type of the right-hand side matrix operand
2988  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2989  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2990  {
2991  for( size_t i=0UL; i<A.rows(); ++i ) {
2992  C(i,i) += A(i,i) * B(i,i);
2993  }
2994  }
2996  //**********************************************************************************************
2997 
2998  //**Default addition assignment to dense matrices (small matrices)******************************
3012  template< typename MT3 // Type of the left-hand side target matrix
3013  , typename MT4 // Type of the left-hand side matrix operand
3014  , typename MT5 > // Type of the right-hand side matrix operand
3015  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3016  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3017  {
3018  selectDefaultAddAssignKernel( C, A, B );
3019  }
3021  //**********************************************************************************************
3022 
3023  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
3038  template< typename MT3 // Type of the left-hand side target matrix
3039  , typename MT4 // Type of the left-hand side matrix operand
3040  , typename MT5 > // Type of the right-hand side matrix operand
3041  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3042  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3043  {
3044  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3045 
3046  const size_t M( A.rows() );
3047  const size_t N( B.columns() );
3048  const size_t K( A.columns() );
3049 
3050  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3051 
3052  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3053  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3054 
3055  size_t j( 0UL );
3056 
3057  if( IsIntegral_v<ElementType> )
3058  {
3059  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3060  for( size_t i=0UL; i<M; ++i )
3061  {
3062  const size_t kbegin( ( IsUpper_v<MT4> )
3063  ?( ( IsLower_v<MT5> )
3064  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3065  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3066  :( IsLower_v<MT5> ? j : 0UL ) );
3067  const size_t kend( ( IsLower_v<MT4> )
3068  ?( ( IsUpper_v<MT5> )
3069  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
3070  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3071  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
3072 
3073  SIMDType xmm1( C.load(i,j ) );
3074  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3075  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3076  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3077  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3078  SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
3079  SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
3080  SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
3081 
3082  for( size_t k=kbegin; k<kend; ++k ) {
3083  const SIMDType a1( set( A(i,k) ) );
3084  xmm1 += a1 * B.load(k,j );
3085  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3086  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3087  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3088  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3089  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
3090  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
3091  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
3092  }
3093 
3094  C.store( i, j , xmm1 );
3095  C.store( i, j+SIMDSIZE , xmm2 );
3096  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3097  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3098  C.store( i, j+SIMDSIZE*4UL, xmm5 );
3099  C.store( i, j+SIMDSIZE*5UL, xmm6 );
3100  C.store( i, j+SIMDSIZE*6UL, xmm7 );
3101  C.store( i, j+SIMDSIZE*7UL, xmm8 );
3102  }
3103  }
3104  }
3105 
3106  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3107  {
3108  size_t i( 0UL );
3109 
3110  for( ; (i+2UL) <= M; i+=2UL )
3111  {
3112  const size_t kbegin( ( IsUpper_v<MT4> )
3113  ?( ( IsLower_v<MT5> )
3114  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3115  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3116  :( IsLower_v<MT5> ? j : 0UL ) );
3117  const size_t kend( ( IsLower_v<MT4> )
3118  ?( ( IsUpper_v<MT5> )
3119  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
3120  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3121  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
3122 
3123  SIMDType xmm1 ( C.load(i ,j ) );
3124  SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
3125  SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
3126  SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
3127  SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
3128  SIMDType xmm6 ( C.load(i+1UL,j ) );
3129  SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
3130  SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
3131  SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
3132  SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
3133 
3134  for( size_t k=kbegin; k<kend; ++k ) {
3135  const SIMDType a1( set( A(i ,k) ) );
3136  const SIMDType a2( set( A(i+1UL,k) ) );
3137  const SIMDType b1( B.load(k,j ) );
3138  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3139  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3140  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3141  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3142  xmm1 += a1 * b1;
3143  xmm2 += a1 * b2;
3144  xmm3 += a1 * b3;
3145  xmm4 += a1 * b4;
3146  xmm5 += a1 * b5;
3147  xmm6 += a2 * b1;
3148  xmm7 += a2 * b2;
3149  xmm8 += a2 * b3;
3150  xmm9 += a2 * b4;
3151  xmm10 += a2 * b5;
3152  }
3153 
3154  C.store( i , j , xmm1 );
3155  C.store( i , j+SIMDSIZE , xmm2 );
3156  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3157  C.store( i , j+SIMDSIZE*3UL, xmm4 );
3158  C.store( i , j+SIMDSIZE*4UL, xmm5 );
3159  C.store( i+1UL, j , xmm6 );
3160  C.store( i+1UL, j+SIMDSIZE , xmm7 );
3161  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3162  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3163  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
3164  }
3165 
3166  if( i < M )
3167  {
3168  const size_t kbegin( ( IsUpper_v<MT4> )
3169  ?( ( IsLower_v<MT5> )
3170  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3171  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3172  :( IsLower_v<MT5> ? j : 0UL ) );
3173  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
3174 
3175  SIMDType xmm1( C.load(i,j ) );
3176  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3177  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3178  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3179  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3180 
3181  for( size_t k=kbegin; k<kend; ++k ) {
3182  const SIMDType a1( set( A(i,k) ) );
3183  xmm1 += a1 * B.load(k,j );
3184  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3185  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3186  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3187  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3188  }
3189 
3190  C.store( i, j , xmm1 );
3191  C.store( i, j+SIMDSIZE , xmm2 );
3192  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3193  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3194  C.store( i, j+SIMDSIZE*4UL, xmm5 );
3195  }
3196  }
3197 
3198  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3199  {
3200  size_t i( 0UL );
3201 
3202  for( ; (i+2UL) <= M; i+=2UL )
3203  {
3204  const size_t kbegin( ( IsUpper_v<MT4> )
3205  ?( ( IsLower_v<MT5> )
3206  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3207  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3208  :( IsLower_v<MT5> ? j : 0UL ) );
3209  const size_t kend( ( IsLower_v<MT4> )
3210  ?( ( IsUpper_v<MT5> )
3211  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3212  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3213  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
3214 
3215  SIMDType xmm1( C.load(i ,j ) );
3216  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3217  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3218  SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
3219  SIMDType xmm5( C.load(i+1UL,j ) );
3220  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
3221  SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
3222  SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
3223 
3224  for( size_t k=kbegin; k<kend; ++k ) {
3225  const SIMDType a1( set( A(i ,k) ) );
3226  const SIMDType a2( set( A(i+1UL,k) ) );
3227  const SIMDType b1( B.load(k,j ) );
3228  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3229  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3230  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3231  xmm1 += a1 * b1;
3232  xmm2 += a1 * b2;
3233  xmm3 += a1 * b3;
3234  xmm4 += a1 * b4;
3235  xmm5 += a2 * b1;
3236  xmm6 += a2 * b2;
3237  xmm7 += a2 * b3;
3238  xmm8 += a2 * b4;
3239  }
3240 
3241  C.store( i , j , xmm1 );
3242  C.store( i , j+SIMDSIZE , xmm2 );
3243  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3244  C.store( i , j+SIMDSIZE*3UL, xmm4 );
3245  C.store( i+1UL, j , xmm5 );
3246  C.store( i+1UL, j+SIMDSIZE , xmm6 );
3247  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3248  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3249  }
3250 
3251  if( i < M )
3252  {
3253  const size_t kbegin( ( IsUpper_v<MT4> )
3254  ?( ( IsLower_v<MT5> )
3255  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3256  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3257  :( IsLower_v<MT5> ? j : 0UL ) );
3258  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
3259 
3260  SIMDType xmm1( C.load(i,j ) );
3261  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3262  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3263  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3264 
3265  for( size_t k=kbegin; k<kend; ++k ) {
3266  const SIMDType a1( set( A(i,k) ) );
3267  xmm1 += a1 * B.load(k,j );
3268  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3269  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3270  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3271  }
3272 
3273  C.store( i, j , xmm1 );
3274  C.store( i, j+SIMDSIZE , xmm2 );
3275  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3276  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3277  }
3278  }
3279 
3280  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3281  {
3282  size_t i( 0UL );
3283 
3284  for( ; (i+2UL) <= M; i+=2UL )
3285  {
3286  const size_t kbegin( ( IsUpper_v<MT4> )
3287  ?( ( IsLower_v<MT5> )
3288  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3289  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3290  :( IsLower_v<MT5> ? j : 0UL ) );
3291  const size_t kend( ( IsLower_v<MT4> )
3292  ?( ( IsUpper_v<MT5> )
3293  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
3294  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3295  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
3296 
3297  SIMDType xmm1( C.load(i ,j ) );
3298  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3299  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3300  SIMDType xmm4( C.load(i+1UL,j ) );
3301  SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
3302  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
3303 
3304  for( size_t k=kbegin; k<kend; ++k ) {
3305  const SIMDType a1( set( A(i ,k) ) );
3306  const SIMDType a2( set( A(i+1UL,k) ) );
3307  const SIMDType b1( B.load(k,j ) );
3308  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3309  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3310  xmm1 += a1 * b1;
3311  xmm2 += a1 * b2;
3312  xmm3 += a1 * b3;
3313  xmm4 += a2 * b1;
3314  xmm5 += a2 * b2;
3315  xmm6 += a2 * b3;
3316  }
3317 
3318  C.store( i , j , xmm1 );
3319  C.store( i , j+SIMDSIZE , xmm2 );
3320  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3321  C.store( i+1UL, j , xmm4 );
3322  C.store( i+1UL, j+SIMDSIZE , xmm5 );
3323  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
3324  }
3325 
3326  if( i < M )
3327  {
3328  const size_t kbegin( ( IsUpper_v<MT4> )
3329  ?( ( IsLower_v<MT5> )
3330  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3331  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3332  :( IsLower_v<MT5> ? j : 0UL ) );
3333  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
3334 
3335  SIMDType xmm1( C.load(i,j ) );
3336  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3337  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3338 
3339  for( size_t k=kbegin; k<kend; ++k ) {
3340  const SIMDType a1( set( A(i,k) ) );
3341  xmm1 += a1 * B.load(k,j );
3342  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3343  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3344  }
3345 
3346  C.store( i, j , xmm1 );
3347  C.store( i, j+SIMDSIZE , xmm2 );
3348  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3349  }
3350  }
3351 
3352  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3353  {
3354  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
3355  size_t i( LOW ? j : 0UL );
3356 
3357  for( ; (i+4UL) <= iend; i+=4UL )
3358  {
3359  const size_t kbegin( ( IsUpper_v<MT4> )
3360  ?( ( IsLower_v<MT5> )
3361  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3362  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3363  :( IsLower_v<MT5> ? j : 0UL ) );
3364  const size_t kend( ( IsLower_v<MT4> )
3365  ?( ( IsUpper_v<MT5> )
3366  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
3367  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
3368  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3369 
3370  SIMDType xmm1( C.load(i ,j ) );
3371  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3372  SIMDType xmm3( C.load(i+1UL,j ) );
3373  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3374  SIMDType xmm5( C.load(i+2UL,j ) );
3375  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
3376  SIMDType xmm7( C.load(i+3UL,j ) );
3377  SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
3378 
3379  for( size_t k=kbegin; k<kend; ++k ) {
3380  const SIMDType a1( set( A(i ,k) ) );
3381  const SIMDType a2( set( A(i+1UL,k) ) );
3382  const SIMDType a3( set( A(i+2UL,k) ) );
3383  const SIMDType a4( set( A(i+3UL,k) ) );
3384  const SIMDType b1( B.load(k,j ) );
3385  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3386  xmm1 += a1 * b1;
3387  xmm2 += a1 * b2;
3388  xmm3 += a2 * b1;
3389  xmm4 += a2 * b2;
3390  xmm5 += a3 * b1;
3391  xmm6 += a3 * b2;
3392  xmm7 += a4 * b1;
3393  xmm8 += a4 * b2;
3394  }
3395 
3396  C.store( i , j , xmm1 );
3397  C.store( i , j+SIMDSIZE, xmm2 );
3398  C.store( i+1UL, j , xmm3 );
3399  C.store( i+1UL, j+SIMDSIZE, xmm4 );
3400  C.store( i+2UL, j , xmm5 );
3401  C.store( i+2UL, j+SIMDSIZE, xmm6 );
3402  C.store( i+3UL, j , xmm7 );
3403  C.store( i+3UL, j+SIMDSIZE, xmm8 );
3404  }
3405 
3406  for( ; (i+3UL) <= iend; i+=3UL )
3407  {
3408  const size_t kbegin( ( IsUpper_v<MT4> )
3409  ?( ( IsLower_v<MT5> )
3410  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3411  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3412  :( IsLower_v<MT5> ? j : 0UL ) );
3413  const size_t kend( ( IsLower_v<MT4> )
3414  ?( ( IsUpper_v<MT5> )
3415  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
3416  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
3417  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3418 
3419  SIMDType xmm1( C.load(i ,j ) );
3420  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3421  SIMDType xmm3( C.load(i+1UL,j ) );
3422  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3423  SIMDType xmm5( C.load(i+2UL,j ) );
3424  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
3425 
3426  for( size_t k=kbegin; k<kend; ++k ) {
3427  const SIMDType a1( set( A(i ,k) ) );
3428  const SIMDType a2( set( A(i+1UL,k) ) );
3429  const SIMDType a3( set( A(i+2UL,k) ) );
3430  const SIMDType b1( B.load(k,j ) );
3431  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3432  xmm1 += a1 * b1;
3433  xmm2 += a1 * b2;
3434  xmm3 += a2 * b1;
3435  xmm4 += a2 * b2;
3436  xmm5 += a3 * b1;
3437  xmm6 += a3 * b2;
3438  }
3439 
3440  C.store( i , j , xmm1 );
3441  C.store( i , j+SIMDSIZE, xmm2 );
3442  C.store( i+1UL, j , xmm3 );
3443  C.store( i+1UL, j+SIMDSIZE, xmm4 );
3444  C.store( i+2UL, j , xmm5 );
3445  C.store( i+2UL, j+SIMDSIZE, xmm6 );
3446  }
3447 
3448  for( ; (i+2UL) <= iend; i+=2UL )
3449  {
3450  const size_t kbegin( ( IsUpper_v<MT4> )
3451  ?( ( IsLower_v<MT5> )
3452  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3453  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3454  :( IsLower_v<MT5> ? j : 0UL ) );
3455  const size_t kend( ( IsLower_v<MT4> )
3456  ?( ( IsUpper_v<MT5> )
3457  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
3458  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3459  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3460 
3461  SIMDType xmm1( C.load(i ,j ) );
3462  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3463  SIMDType xmm3( C.load(i+1UL,j ) );
3464  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3465  SIMDType xmm5, xmm6, xmm7, xmm8;
3466  size_t k( kbegin );
3467 
3468  for( ; (k+2UL) <= kend; k+=2UL ) {
3469  const SIMDType a1( set( A(i ,k ) ) );
3470  const SIMDType a2( set( A(i+1UL,k ) ) );
3471  const SIMDType a3( set( A(i ,k+1UL) ) );
3472  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
3473  const SIMDType b1( B.load(k ,j ) );
3474  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
3475  const SIMDType b3( B.load(k+1UL,j ) );
3476  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
3477  xmm1 += a1 * b1;
3478  xmm2 += a1 * b2;
3479  xmm3 += a2 * b1;
3480  xmm4 += a2 * b2;
3481  xmm5 += a3 * b3;
3482  xmm6 += a3 * b4;
3483  xmm7 += a4 * b3;
3484  xmm8 += a4 * b4;
3485  }
3486 
3487  for( ; k<kend; ++k ) {
3488  const SIMDType a1( set( A(i ,k) ) );
3489  const SIMDType a2( set( A(i+1UL,k) ) );
3490  const SIMDType b1( B.load(k,j ) );
3491  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3492  xmm1 += a1 * b1;
3493  xmm2 += a1 * b2;
3494  xmm3 += a2 * b1;
3495  xmm4 += a2 * b2;
3496  }
3497 
3498  C.store( i , j , xmm1+xmm5 );
3499  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
3500  C.store( i+1UL, j , xmm3+xmm7 );
3501  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
3502  }
3503 
3504  if( i < iend )
3505  {
3506  const size_t kbegin( ( IsUpper_v<MT4> )
3507  ?( ( IsLower_v<MT5> )
3508  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3509  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3510  :( IsLower_v<MT5> ? j : 0UL ) );
3511  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
3512 
3513  SIMDType xmm1( C.load(i,j ) );
3514  SIMDType xmm2( C.load(i,j+SIMDSIZE) );
3515  SIMDType xmm3, xmm4;
3516  size_t k( kbegin );
3517 
3518  for( ; (k+2UL) <= kend; k+=2UL ) {
3519  const SIMDType a1( set( A(i,k ) ) );
3520  const SIMDType a2( set( A(i,k+1UL) ) );
3521  xmm1 += a1 * B.load(k ,j );
3522  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
3523  xmm3 += a2 * B.load(k+1UL,j );
3524  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
3525  }
3526 
3527  for( ; k<kend; ++k ) {
3528  const SIMDType a1( set( A(i,k) ) );
3529  xmm1 += a1 * B.load(k,j );
3530  xmm2 += a1 * B.load(k,j+SIMDSIZE);
3531  }
3532 
3533  C.store( i, j , xmm1+xmm3 );
3534  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
3535  }
3536  }
3537 
3538  for( ; j<jpos; j+=SIMDSIZE )
3539  {
3540  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
3541  size_t i( LOW ? j : 0UL );
3542 
3543  for( ; (i+4UL) <= iend; i+=4UL )
3544  {
3545  const size_t kbegin( ( IsUpper_v<MT4> )
3546  ?( ( IsLower_v<MT5> )
3547  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3548  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3549  :( IsLower_v<MT5> ? j : 0UL ) );
3550  const size_t kend( ( IsLower_v<MT4> )
3551  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
3552  :( K ) );
3553 
3554  SIMDType xmm1( C.load(i ,j) );
3555  SIMDType xmm2( C.load(i+1UL,j) );
3556  SIMDType xmm3( C.load(i+2UL,j) );
3557  SIMDType xmm4( C.load(i+3UL,j) );
3558  SIMDType xmm5, xmm6, xmm7, xmm8;
3559  size_t k( kbegin );
3560 
3561  for( ; (k+2UL) <= kend; k+=2UL ) {
3562  const SIMDType b1( B.load(k ,j) );
3563  const SIMDType b2( B.load(k+1UL,j) );
3564  xmm1 += set( A(i ,k ) ) * b1;
3565  xmm2 += set( A(i+1UL,k ) ) * b1;
3566  xmm3 += set( A(i+2UL,k ) ) * b1;
3567  xmm4 += set( A(i+3UL,k ) ) * b1;
3568  xmm5 += set( A(i ,k+1UL) ) * b2;
3569  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
3570  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
3571  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
3572  }
3573 
3574  for( ; k<kend; ++k ) {
3575  const SIMDType b1( B.load(k,j) );
3576  xmm1 += set( A(i ,k) ) * b1;
3577  xmm2 += set( A(i+1UL,k) ) * b1;
3578  xmm3 += set( A(i+2UL,k) ) * b1;
3579  xmm4 += set( A(i+3UL,k) ) * b1;
3580  }
3581 
3582  C.store( i , j, xmm1+xmm5 );
3583  C.store( i+1UL, j, xmm2+xmm6 );
3584  C.store( i+2UL, j, xmm3+xmm7 );
3585  C.store( i+3UL, j, xmm4+xmm8 );
3586  }
3587 
3588  for( ; (i+3UL) <= iend; i+=3UL )
3589  {
3590  const size_t kbegin( ( IsUpper_v<MT4> )
3591  ?( ( IsLower_v<MT5> )
3592  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3593  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3594  :( IsLower_v<MT5> ? j : 0UL ) );
3595  const size_t kend( ( IsLower_v<MT4> )
3596  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
3597  :( K ) );
3598 
3599  SIMDType xmm1( C.load(i ,j) );
3600  SIMDType xmm2( C.load(i+1UL,j) );
3601  SIMDType xmm3( C.load(i+2UL,j) );
3602  SIMDType xmm4, xmm5, xmm6;
3603  size_t k( kbegin );
3604 
3605  for( ; (k+2UL) <= kend; k+=2UL ) {
3606  const SIMDType b1( B.load(k ,j) );
3607  const SIMDType b2( B.load(k+1UL,j) );
3608  xmm1 += set( A(i ,k ) ) * b1;
3609  xmm2 += set( A(i+1UL,k ) ) * b1;
3610  xmm3 += set( A(i+2UL,k ) ) * b1;
3611  xmm4 += set( A(i ,k+1UL) ) * b2;
3612  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
3613  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
3614  }
3615 
3616  for( ; k<kend; ++k ) {
3617  const SIMDType b1( B.load(k,j) );
3618  xmm1 += set( A(i ,k) ) * b1;
3619  xmm2 += set( A(i+1UL,k) ) * b1;
3620  xmm3 += set( A(i+2UL,k) ) * b1;
3621  }
3622 
3623  C.store( i , j, xmm1+xmm4 );
3624  C.store( i+1UL, j, xmm2+xmm5 );
3625  C.store( i+2UL, j, xmm3+xmm6 );
3626  }
3627 
3628  for( ; (i+2UL) <= iend; i+=2UL )
3629  {
3630  const size_t kbegin( ( IsUpper_v<MT4> )
3631  ?( ( IsLower_v<MT5> )
3632  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3633  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3634  :( IsLower_v<MT5> ? j : 0UL ) );
3635  const size_t kend( ( IsLower_v<MT4> )
3636  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3637  :( K ) );
3638 
3639  SIMDType xmm1( C.load(i ,j) );
3640  SIMDType xmm2( C.load(i+1UL,j) );
3641  SIMDType xmm3, xmm4;
3642  size_t k( kbegin );
3643 
3644  for( ; (k+2UL) <= kend; k+=2UL ) {
3645  const SIMDType b1( B.load(k ,j) );
3646  const SIMDType b2( B.load(k+1UL,j) );
3647  xmm1 += set( A(i ,k ) ) * b1;
3648  xmm2 += set( A(i+1UL,k ) ) * b1;
3649  xmm3 += set( A(i ,k+1UL) ) * b2;
3650  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
3651  }
3652 
3653  for( ; k<kend; ++k ) {
3654  const SIMDType b1( B.load(k,j) );
3655  xmm1 += set( A(i ,k) ) * b1;
3656  xmm2 += set( A(i+1UL,k) ) * b1;
3657  }
3658 
3659  C.store( i , j, xmm1+xmm3 );
3660  C.store( i+1UL, j, xmm2+xmm4 );
3661  }
3662 
3663  if( i < iend )
3664  {
3665  const size_t kbegin( ( IsUpper_v<MT4> )
3666  ?( ( IsLower_v<MT5> )
3667  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3668  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3669  :( IsLower_v<MT5> ? j : 0UL ) );
3670 
3671  SIMDType xmm1( C.load(i,j) );
3672  SIMDType xmm2;
3673  size_t k( kbegin );
3674 
3675  for( ; (k+2UL) <= K; k+=2UL ) {
3676  xmm1 += set( A(i,k ) ) * B.load(k ,j);
3677  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
3678  }
3679 
3680  for( ; k<K; ++k ) {
3681  xmm1 += set( A(i,k) ) * B.load(k,j);
3682  }
3683 
3684  C.store( i, j, xmm1+xmm2 );
3685  }
3686  }
3687 
3688  for( ; remainder && j<N; ++j )
3689  {
3690  const size_t iend( UPP ? j+1UL : M );
3691  size_t i( LOW ? j : 0UL );
3692 
3693  for( ; (i+2UL) <= iend; i+=2UL )
3694  {
3695  const size_t kbegin( ( IsUpper_v<MT4> )
3696  ?( ( IsLower_v<MT5> )
3697  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3698  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3699  :( IsLower_v<MT5> ? j : 0UL ) );
3700  const size_t kend( ( IsLower_v<MT4> )
3701  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3702  :( K ) );
3703 
3704  ElementType value1( C(i ,j) );
3705  ElementType value2( C(i+1UL,j) );;
3706 
3707  for( size_t k=kbegin; k<kend; ++k ) {
3708  value1 += A(i ,k) * B(k,j);
3709  value2 += A(i+1UL,k) * B(k,j);
3710  }
3711 
3712  C(i ,j) = value1;
3713  C(i+1UL,j) = value2;
3714  }
3715 
3716  if( i < iend )
3717  {
3718  const size_t kbegin( ( IsUpper_v<MT4> )
3719  ?( ( IsLower_v<MT5> )
3720  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3721  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3722  :( IsLower_v<MT5> ? j : 0UL ) );
3723 
3724  ElementType value( C(i,j) );
3725 
3726  for( size_t k=kbegin; k<K; ++k ) {
3727  value += A(i,k) * B(k,j);
3728  }
3729 
3730  C(i,j) = value;
3731  }
3732  }
3733  }
3735  //**********************************************************************************************
3736 
3737  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
3752  template< typename MT3 // Type of the left-hand side target matrix
3753  , typename MT4 // Type of the left-hand side matrix operand
3754  , typename MT5 > // Type of the right-hand side matrix operand
3755  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3756  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3757  {
3758  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
3759 
3760  const size_t M( A.rows() );
3761  const size_t N( B.columns() );
3762  const size_t K( A.columns() );
3763 
3764  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3765 
3766  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3767  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3768 
3769  size_t i( 0UL );
3770 
3771  if( IsIntegral_v<ElementType> )
3772  {
3773  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3774  for( size_t j=0UL; j<N; ++j )
3775  {
3776  const size_t kbegin( ( IsLower_v<MT5> )
3777  ?( ( IsUpper_v<MT4> )
3778  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3779  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3780  :( IsUpper_v<MT4> ? i : 0UL ) );
3781  const size_t kend( ( IsUpper_v<MT5> )
3782  ?( ( IsLower_v<MT4> )
3783  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3784  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
3785  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
3786 
3787  SIMDType xmm1( C.load(i ,j) );
3788  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3789  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3790  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3791  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
3792  SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
3793  SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
3794  SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
3795 
3796  for( size_t k=kbegin; k<kend; ++k ) {
3797  const SIMDType b1( set( B(k,j) ) );
3798  xmm1 += A.load(i ,k) * b1;
3799  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3800  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3801  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3802  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3803  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
3804  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
3805  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
3806  }
3807 
3808  C.store( i , j, xmm1 );
3809  C.store( i+SIMDSIZE , j, xmm2 );
3810  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3811  C.store( i+SIMDSIZE*3UL, j, xmm4 );
3812  C.store( i+SIMDSIZE*4UL, j, xmm5 );
3813  C.store( i+SIMDSIZE*5UL, j, xmm6 );
3814  C.store( i+SIMDSIZE*6UL, j, xmm7 );
3815  C.store( i+SIMDSIZE*7UL, j, xmm8 );
3816  }
3817  }
3818  }
3819 
3820  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3821  {
3822  size_t j( 0UL );
3823 
3824  for( ; (j+2UL) <= N; j+=2UL )
3825  {
3826  const size_t kbegin( ( IsLower_v<MT5> )
3827  ?( ( IsUpper_v<MT4> )
3828  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3829  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3830  :( IsUpper_v<MT4> ? i : 0UL ) );
3831  const size_t kend( ( IsUpper_v<MT5> )
3832  ?( ( IsLower_v<MT4> )
3833  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3834  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3835  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
3836 
3837  SIMDType xmm1 ( C.load(i ,j ) );
3838  SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
3839  SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
3840  SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
3841  SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
3842  SIMDType xmm6 ( C.load(i ,j+1UL) );
3843  SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
3844  SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
3845  SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
3846  SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
3847 
3848  for( size_t k=kbegin; k<kend; ++k ) {
3849  const SIMDType a1( A.load(i ,k) );
3850  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3851  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3852  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3853  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3854  const SIMDType b1( set( B(k,j ) ) );
3855  const SIMDType b2( set( B(k,j+1UL) ) );
3856  xmm1 += a1 * b1;
3857  xmm2 += a2 * b1;
3858  xmm3 += a3 * b1;
3859  xmm4 += a4 * b1;
3860  xmm5 += a5 * b1;
3861  xmm6 += a1 * b2;
3862  xmm7 += a2 * b2;
3863  xmm8 += a3 * b2;
3864  xmm9 += a4 * b2;
3865  xmm10 += a5 * b2;
3866  }
3867 
3868  C.store( i , j , xmm1 );
3869  C.store( i+SIMDSIZE , j , xmm2 );
3870  C.store( i+SIMDSIZE*2UL, j , xmm3 );
3871  C.store( i+SIMDSIZE*3UL, j , xmm4 );
3872  C.store( i+SIMDSIZE*4UL, j , xmm5 );
3873  C.store( i , j+1UL, xmm6 );
3874  C.store( i+SIMDSIZE , j+1UL, xmm7 );
3875  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3876  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3877  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
3878  }
3879 
3880  if( j < N )
3881  {
3882  const size_t kbegin( ( IsLower_v<MT5> )
3883  ?( ( IsUpper_v<MT4> )
3884  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3885  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3886  :( IsUpper_v<MT4> ? i : 0UL ) );
3887  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
3888 
3889  SIMDType xmm1( C.load(i ,j) );
3890  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3891  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3892  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3893  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
3894 
3895  for( size_t k=kbegin; k<kend; ++k ) {
3896  const SIMDType b1( set( B(k,j) ) );
3897  xmm1 += A.load(i ,k) * b1;
3898  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3899  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3900  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3901  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3902  }
3903 
3904  C.store( i , j, xmm1 );
3905  C.store( i+SIMDSIZE , j, xmm2 );
3906  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3907  C.store( i+SIMDSIZE*3UL, j, xmm4 );
3908  C.store( i+SIMDSIZE*4UL, j, xmm5 );
3909  }
3910  }
3911 
3912  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3913  {
3914  size_t j( 0UL );
3915 
3916  for( ; (j+2UL) <= N; j+=2UL )
3917  {
3918  const size_t kbegin( ( IsLower_v<MT5> )
3919  ?( ( IsUpper_v<MT4> )
3920  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3921  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3922  :( IsUpper_v<MT4> ? i : 0UL ) );
3923  const size_t kend( ( IsUpper_v<MT5> )
3924  ?( ( IsLower_v<MT4> )
3925  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3926  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3927  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
3928 
3929  SIMDType xmm1( C.load(i ,j ) );
3930  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
3931  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
3932  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
3933  SIMDType xmm5( C.load(i ,j+1UL) );
3934  SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
3935  SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
3936  SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
3937 
3938  for( size_t k=kbegin; k<kend; ++k ) {
3939  const SIMDType a1( A.load(i ,k) );
3940  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3941  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3942  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3943  const SIMDType b1( set( B(k,j ) ) );
3944  const SIMDType b2( set( B(k,j+1UL) ) );
3945  xmm1 += a1 * b1;
3946  xmm2 += a2 * b1;
3947  xmm3 += a3 * b1;
3948  xmm4 += a4 * b1;
3949  xmm5 += a1 * b2;
3950  xmm6 += a2 * b2;
3951  xmm7 += a3 * b2;
3952  xmm8 += a4 * b2;
3953  }
3954 
3955  C.store( i , j , xmm1 );
3956  C.store( i+SIMDSIZE , j , xmm2 );
3957  C.store( i+SIMDSIZE*2UL, j , xmm3 );
3958  C.store( i+SIMDSIZE*3UL, j , xmm4 );
3959  C.store( i , j+1UL, xmm5 );
3960  C.store( i+SIMDSIZE , j+1UL, xmm6 );
3961  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3962  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3963  }
3964 
3965  if( j < N )
3966  {
3967  const size_t kbegin( ( IsLower_v<MT5> )
3968  ?( ( IsUpper_v<MT4> )
3969  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3970  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3971  :( IsUpper_v<MT4> ? i : 0UL ) );
3972  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3973 
3974  SIMDType xmm1( C.load(i ,j) );
3975  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
3976  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
3977  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
3978 
3979  for( size_t k=kbegin; k<kend; ++k ) {
3980  const SIMDType b1( set( B(k,j) ) );
3981  xmm1 += A.load(i ,k) * b1;
3982  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3983  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3984  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3985  }
3986 
3987  C.store( i , j, xmm1 );
3988  C.store( i+SIMDSIZE , j, xmm2 );
3989  C.store( i+SIMDSIZE*2UL, j, xmm3 );
3990  C.store( i+SIMDSIZE*3UL, j, xmm4 );
3991  }
3992  }
3993 
3994  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3995  {
3996  size_t j( 0UL );
3997 
3998  for( ; (j+2UL) <= N; j+=2UL )
3999  {
4000  const size_t kbegin( ( IsLower_v<MT5> )
4001  ?( ( IsUpper_v<MT4> )
4002  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4003  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4004  :( IsUpper_v<MT4> ? i : 0UL ) );
4005  const size_t kend( ( IsUpper_v<MT5> )
4006  ?( ( IsLower_v<MT4> )
4007  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4008  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4009  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
4010 
4011  SIMDType xmm1( C.load(i ,j ) );
4012  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
4013  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
4014  SIMDType xmm4( C.load(i ,j+1UL) );
4015  SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
4016  SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
4017 
4018  for( size_t k=kbegin; k<kend; ++k ) {
4019  const SIMDType a1( A.load(i ,k) );
4020  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4021  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4022  const SIMDType b1( set( B(k,j ) ) );
4023  const SIMDType b2( set( B(k,j+1UL) ) );
4024  xmm1 += a1 * b1;
4025  xmm2 += a2 * b1;
4026  xmm3 += a3 * b1;
4027  xmm4 += a1 * b2;
4028  xmm5 += a2 * b2;
4029  xmm6 += a3 * b2;
4030  }
4031 
4032  C.store( i , j , xmm1 );
4033  C.store( i+SIMDSIZE , j , xmm2 );
4034  C.store( i+SIMDSIZE*2UL, j , xmm3 );
4035  C.store( i , j+1UL, xmm4 );
4036  C.store( i+SIMDSIZE , j+1UL, xmm5 );
4037  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
4038  }
4039 
4040  if( j < N )
4041  {
4042  const size_t kbegin( ( IsLower_v<MT5> )
4043  ?( ( IsUpper_v<MT4> )
4044  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4045  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4046  :( IsUpper_v<MT4> ? i : 0UL ) );
4047  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
4048 
4049  SIMDType xmm1( C.load(i ,j) );
4050  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
4051  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
4052 
4053  for( size_t k=kbegin; k<kend; ++k ) {
4054  const SIMDType b1( set( B(k,j) ) );
4055  xmm1 += A.load(i ,k) * b1;
4056  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4057  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4058  }
4059 
4060  C.store( i , j, xmm1 );
4061  C.store( i+SIMDSIZE , j, xmm2 );
4062  C.store( i+SIMDSIZE*2UL, j, xmm3 );
4063  }
4064  }
4065 
4066  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4067  {
4068  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
4069  size_t j( UPP ? i : 0UL );
4070 
4071  for( ; (j+4UL) <= jend; j+=4UL )
4072  {
4073  const size_t kbegin( ( IsLower_v<MT5> )
4074  ?( ( IsUpper_v<MT4> )
4075  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4076  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4077  :( IsUpper_v<MT4> ? i : 0UL ) );
4078  const size_t kend( ( IsUpper_v<MT5> )
4079  ?( ( IsLower_v<MT4> )
4080  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
4081  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
4082  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
4083 
4084  SIMDType xmm1( C.load(i ,j ) );
4085  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4086  SIMDType xmm3( C.load(i ,j+1UL) );
4087  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4088  SIMDType xmm5( C.load(i ,j+2UL) );
4089  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
4090  SIMDType xmm7( C.load(i ,j+3UL) );
4091  SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
4092 
4093  for( size_t k=kbegin; k<kend; ++k ) {
4094  const SIMDType a1( A.load(i ,k) );
4095  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4096  const SIMDType b1( set( B(k,j ) ) );
4097  const SIMDType b2( set( B(k,j+1UL) ) );
4098  const SIMDType b3( set( B(k,j+2UL) ) );
4099  const SIMDType b4( set( B(k,j+3UL) ) );
4100  xmm1 += a1 * b1;
4101  xmm2 += a2 * b1;
4102  xmm3 += a1 * b2;
4103  xmm4 += a2 * b2;
4104  xmm5 += a1 * b3;
4105  xmm6 += a2 * b3;
4106  xmm7 += a1 * b4;
4107  xmm8 += a2 * b4;
4108  }
4109 
4110  C.store( i , j , xmm1 );
4111  C.store( i+SIMDSIZE, j , xmm2 );
4112  C.store( i , j+1UL, xmm3 );
4113  C.store( i+SIMDSIZE, j+1UL, xmm4 );
4114  C.store( i , j+2UL, xmm5 );
4115  C.store( i+SIMDSIZE, j+2UL, xmm6 );
4116  C.store( i , j+3UL, xmm7 );
4117  C.store( i+SIMDSIZE, j+3UL, xmm8 );
4118  }
4119 
4120  for( ; (j+3UL) <= jend; j+=3UL )
4121  {
4122  const size_t kbegin( ( IsLower_v<MT5> )
4123  ?( ( IsUpper_v<MT4> )
4124  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4125  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4126  :( IsUpper_v<MT4> ? i : 0UL ) );
4127  const size_t kend( ( IsUpper_v<MT5> )
4128  ?( ( IsLower_v<MT4> )
4129  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
4130  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
4131  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
4132 
4133  SIMDType xmm1( C.load(i ,j ) );
4134  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4135  SIMDType xmm3( C.load(i ,j+1UL) );
4136  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4137  SIMDType xmm5( C.load(i ,j+2UL) );
4138  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
4139 
4140  for( size_t k=kbegin; k<kend; ++k ) {
4141  const SIMDType a1( A.load(i ,k) );
4142  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4143  const SIMDType b1( set( B(k,j ) ) );
4144  const SIMDType b2( set( B(k,j+1UL) ) );
4145  const SIMDType b3( set( B(k,j+2UL) ) );
4146  xmm1 += a1 * b1;
4147  xmm2 += a2 * b1;
4148  xmm3 += a1 * b2;
4149  xmm4 += a2 * b2;
4150  xmm5 += a1 * b3;
4151  xmm6 += a2 * b3;
4152  }
4153 
4154  C.store( i , j , xmm1 );
4155  C.store( i+SIMDSIZE, j , xmm2 );
4156  C.store( i , j+1UL, xmm3 );
4157  C.store( i+SIMDSIZE, j+1UL, xmm4 );
4158  C.store( i , j+2UL, xmm5 );
4159  C.store( i+SIMDSIZE, j+2UL, xmm6 );
4160  }
4161 
4162  for( ; (j+2UL) <= jend; j+=2UL )
4163  {
4164  const size_t kbegin( ( IsLower_v<MT5> )
4165  ?( ( IsUpper_v<MT4> )
4166  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4167  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4168  :( IsUpper_v<MT4> ? i : 0UL ) );
4169  const size_t kend( ( IsUpper_v<MT5> )
4170  ?( ( IsLower_v<MT4> )
4171  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4172  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4173  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
4174 
4175  SIMDType xmm1( C.load(i ,j ) );
4176  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
4177  SIMDType xmm3( C.load(i ,j+1UL) );
4178  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
4179  SIMDType xmm5, xmm6, xmm7, xmm8;
4180  size_t k( kbegin );
4181 
4182  for( ; (k+2UL) < kend; k+=2UL ) {
4183  const SIMDType a1( A.load(i ,k ) );
4184  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
4185  const SIMDType a3( A.load(i ,k+1UL) );
4186  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
4187  const SIMDType b1( set( B(k ,j ) ) );
4188  const SIMDType b2( set( B(k ,j+1UL) ) );
4189  const SIMDType b3( set( B(k+1UL,j ) ) );
4190  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
4191  xmm1 += a1 * b1;
4192  xmm2 += a2 * b1;
4193  xmm3 += a1 * b2;
4194  xmm4 += a2 * b2;
4195  xmm5 += a3 * b3;
4196  xmm6 += a4 * b3;
4197  xmm7 += a3 * b4;
4198  xmm8 += a4 * b4;
4199  }
4200 
4201  for( ; k<kend; ++k ) {
4202  const SIMDType a1( A.load(i ,k) );
4203  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4204  const SIMDType b1( set( B(k,j ) ) );
4205  const SIMDType b2( set( B(k,j+1UL) ) );
4206  xmm1 += a1 * b1;
4207  xmm2 += a2 * b1;
4208  xmm3 += a1 * b2;
4209  xmm4 += a2 * b2;
4210  }
4211 
4212  C.store( i , j , xmm1+xmm5 );
4213  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
4214  C.store( i , j+1UL, xmm3+xmm7 );
4215  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
4216  }
4217 
4218  if( j < jend )
4219  {
4220  const size_t kbegin( ( IsLower_v<MT5> )
4221  ?( ( IsUpper_v<MT4> )
4222  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4223  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4224  :( IsUpper_v<MT4> ? i : 0UL ) );
4225  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
4226 
4227  SIMDType xmm1( C.load(i ,j) );
4228  SIMDType xmm2( C.load(i+SIMDSIZE,j) );
4229  SIMDType xmm3, xmm4;
4230  size_t k( kbegin );
4231 
4232  for( ; (k+2UL) <= kend; k+=2UL ) {
4233  const SIMDType b1( set( B(k ,j) ) );
4234  const SIMDType b2( set( B(k+1UL,j) ) );
4235  xmm1 += A.load(i ,k ) * b1;
4236  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
4237  xmm3 += A.load(i ,k+1UL) * b2;
4238  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
4239  }
4240 
4241  for( ; k<kend; ++k ) {
4242  const SIMDType b1( set( B(k,j) ) );
4243  xmm1 += A.load(i ,k) * b1;
4244  xmm2 += A.load(i+SIMDSIZE,k) * b1;
4245  }
4246 
4247  C.store( i , j, xmm1+xmm3 );
4248  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
4249  }
4250  }
4251 
4252  for( ; i<ipos; i+=SIMDSIZE )
4253  {
4254  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
4255  size_t j( UPP ? i : 0UL );
4256 
4257  for( ; (j+4UL) <= jend; j+=4UL )
4258  {
4259  const size_t kbegin( ( IsLower_v<MT5> )
4260  ?( ( IsUpper_v<MT4> )
4261  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4262  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4263  :( IsUpper_v<MT4> ? i : 0UL ) );
4264  const size_t kend( ( IsUpper_v<MT5> )
4265  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
4266  :( K ) );
4267 
4268  SIMDType xmm1( C.load(i,j ) );
4269  SIMDType xmm2( C.load(i,j+1UL) );
4270  SIMDType xmm3( C.load(i,j+2UL) );
4271  SIMDType xmm4( C.load(i,j+3UL) );
4272  SIMDType xmm5, xmm6, xmm7, xmm8;
4273  size_t k( kbegin );
4274 
4275  for( ; (k+2UL) <= kend; k+=2UL ) {
4276  const SIMDType a1( A.load(i,k ) );
4277  const SIMDType a2( A.load(i,k+1UL) );
4278  xmm1 += a1 * set( B(k ,j ) );
4279  xmm2 += a1 * set( B(k ,j+1UL) );
4280  xmm3 += a1 * set( B(k ,j+2UL) );
4281  xmm4 += a1 * set( B(k ,j+3UL) );
4282  xmm5 += a2 * set( B(k+1UL,j ) );
4283  xmm6 += a2 * set( B(k+1UL,j+1UL) );
4284  xmm7 += a2 * set( B(k+1UL,j+2UL) );
4285  xmm8 += a2 * set( B(k+1UL,j+3UL) );
4286  }
4287 
4288  for( ; k<kend; ++k ) {
4289  const SIMDType a1( A.load(i,k) );
4290  xmm1 += a1 * set( B(k,j ) );
4291  xmm2 += a1 * set( B(k,j+1UL) );
4292  xmm3 += a1 * set( B(k,j+2UL) );
4293  xmm4 += a1 * set( B(k,j+3UL) );
4294  }
4295 
4296  C.store( i, j , xmm1+xmm5 );
4297  C.store( i, j+1UL, xmm2+xmm6 );
4298  C.store( i, j+2UL, xmm3+xmm7 );
4299  C.store( i, j+3UL, xmm4+xmm8 );
4300  }
4301 
4302  for( ; (j+3UL) <= jend; j+=3UL )
4303  {
4304  const size_t kbegin( ( IsLower_v<MT5> )
4305  ?( ( IsUpper_v<MT4> )
4306  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4307  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4308  :( IsUpper_v<MT4> ? i : 0UL ) );
4309  const size_t kend( ( IsUpper_v<MT5> )
4310  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
4311  :( K ) );
4312 
4313  SIMDType xmm1( C.load(i,j ) );
4314  SIMDType xmm2( C.load(i,j+1UL) );
4315  SIMDType xmm3( C.load(i,j+2UL) );
4316  SIMDType xmm4, xmm5, xmm6;
4317  size_t k( kbegin );
4318 
4319  for( ; (k+2UL) <= kend; k+=2UL ) {
4320  const SIMDType a1( A.load(i,k ) );
4321  const SIMDType a2( A.load(i,k+1UL) );
4322  xmm1 += a1 * set( B(k ,j ) );
4323  xmm2 += a1 * set( B(k ,j+1UL) );
4324  xmm3 += a1 * set( B(k ,j+2UL) );
4325  xmm4 += a2 * set( B(k+1UL,j ) );
4326  xmm5 += a2 * set( B(k+1UL,j+1UL) );
4327  xmm6 += a2 * set( B(k+1UL,j+2UL) );
4328  }
4329 
4330  for( ; k<kend; ++k ) {
4331  const SIMDType a1( A.load(i,k) );
4332  xmm1 += a1 * set( B(k,j ) );
4333  xmm2 += a1 * set( B(k,j+1UL) );
4334  xmm3 += a1 * set( B(k,j+2UL) );
4335  }
4336 
4337  C.store( i, j , xmm1+xmm4 );
4338  C.store( i, j+1UL, xmm2+xmm5 );
4339  C.store( i, j+2UL, xmm3+xmm6 );
4340  }
4341 
4342  for( ; (j+2UL) <= jend; j+=2UL )
4343  {
4344  const size_t kbegin( ( IsLower_v<MT5> )
4345  ?( ( IsUpper_v<MT4> )
4346  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4347  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4348  :( IsUpper_v<MT4> ? i : 0UL ) );
4349  const size_t kend( ( IsUpper_v<MT5> )
4350  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4351  :( K ) );
4352 
4353  SIMDType xmm1( C.load(i,j ) );
4354  SIMDType xmm2( C.load(i,j+1UL) );
4355  SIMDType xmm3, xmm4;
4356  size_t k( kbegin );
4357 
4358  for( ; (k+2UL) <= kend; k+=2UL ) {
4359  const SIMDType a1( A.load(i,k ) );
4360  const SIMDType a2( A.load(i,k+1UL) );
4361  xmm1 += a1 * set( B(k ,j ) );
4362  xmm2 += a1 * set( B(k ,j+1UL) );
4363  xmm3 += a2 * set( B(k+1UL,j ) );
4364  xmm4 += a2 * set( B(k+1UL,j+1UL) );
4365  }
4366 
4367  for( ; k<kend; ++k ) {
4368  const SIMDType a1( A.load(i,k) );
4369  xmm1 += a1 * set( B(k,j ) );
4370  xmm2 += a1 * set( B(k,j+1UL) );
4371  }
4372 
4373  C.store( i, j , xmm1+xmm3 );
4374  C.store( i, j+1UL, xmm2+xmm4 );
4375  }
4376 
4377  if( j < jend )
4378  {
4379  const size_t kbegin( ( IsLower_v<MT5> )
4380  ?( ( IsUpper_v<MT4> )
4381  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4382  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4383  :( IsUpper_v<MT4> ? i : 0UL ) );
4384 
4385  SIMDType xmm1( C.load(i,j) );
4386  SIMDType xmm2;
4387  size_t k( kbegin );
4388 
4389  for( ; (k+2UL) <= K; k+=2UL ) {
4390  xmm1 += A.load(i,k ) * set( B(k ,j) );
4391  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
4392  }
4393 
4394  for( ; k<K; ++k ) {
4395  xmm1 += A.load(i,k) * set( B(k,j) );
4396  }
4397 
4398  C.store( i, j, xmm1+xmm2 );
4399  }
4400  }
4401 
4402  for( ; remainder && i<M; ++i )
4403  {
4404  const size_t jend( LOW ? i+1UL : N );
4405  size_t j( UPP ? i : 0UL );
4406 
4407  for( ; (j+2UL) <= jend; j+=2UL )
4408  {
4409  const size_t kbegin( ( IsLower_v<MT5> )
4410  ?( ( IsUpper_v<MT4> )
4411  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4412  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4413  :( IsUpper_v<MT4> ? i : 0UL ) );
4414  const size_t kend( ( IsUpper_v<MT5> )
4415  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4416  :( K ) );
4417 
4418  ElementType value1( C(i,j ) );
4419  ElementType value2( C(i,j+1UL) );
4420 
4421  for( size_t k=kbegin; k<kend; ++k ) {
4422  value1 += A(i,k) * B(k,j );
4423  value2 += A(i,k) * B(k,j+1UL);
4424  }
4425 
4426  C(i,j ) = value1;
4427  C(i,j+1UL) = value2;
4428  }
4429 
4430  if( j < jend )
4431  {
4432  const size_t kbegin( ( IsLower_v<MT5> )
4433  ?( ( IsUpper_v<MT4> )
4434  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4435  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4436  :( IsUpper_v<MT4> ? i : 0UL ) );
4437 
4438  ElementType value( C(i,j) );
4439 
4440  for( size_t k=kbegin; k<K; ++k ) {
4441  value += A(i,k) * B(k,j);
4442  }
4443 
4444  C(i,j) = value;
4445  }
4446  }
4447  }
4449  //**********************************************************************************************
4450 
4451  //**Default addition assignment to dense matrices (large matrices)******************************
4465  template< typename MT3 // Type of the left-hand side target matrix
4466  , typename MT4 // Type of the left-hand side matrix operand
4467  , typename MT5 > // Type of the right-hand side matrix operand
4468  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4469  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4470  {
4471  selectDefaultAddAssignKernel( C, A, B );
4472  }
4474  //**********************************************************************************************
4475 
4476  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
4491  template< typename MT3 // Type of the left-hand side target matrix
4492  , typename MT4 // Type of the left-hand side matrix operand
4493  , typename MT5 > // Type of the right-hand side matrix operand
4494  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4495  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4496  {
4497  if( LOW )
4498  lmmm( C, A, B, ElementType(1), ElementType(1) );
4499  else if( UPP )
4500  ummm( C, A, B, ElementType(1), ElementType(1) );
4501  else
4502  mmm( C, A, B, ElementType(1), ElementType(1) );
4503  }
4505  //**********************************************************************************************
4506 
4507  //**BLAS-based addition assignment to dense matrices (default)**********************************
4521  template< typename MT3 // Type of the left-hand side target matrix
4522  , typename MT4 // Type of the left-hand side matrix operand
4523  , typename MT5 > // Type of the right-hand side matrix operand
4524  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4525  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4526  {
4527  selectLargeAddAssignKernel( C, A, B );
4528  }
4530  //**********************************************************************************************
4531 
4532  //**BLAS-based addition assignment to dense matrices********************************************
4533 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4534 
4547  template< typename MT3 // Type of the left-hand side target matrix
4548  , typename MT4 // Type of the left-hand side matrix operand
4549  , typename MT5 > // Type of the right-hand side matrix operand
4550  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4551  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4552  {
4553  using ET = ElementType_t<MT3>;
4554 
4555  if( IsTriangular_v<MT4> ) {
4556  ResultType_t<MT3> tmp( serial( B ) );
4557  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4558  addAssign( C, tmp );
4559  }
4560  else if( IsTriangular_v<MT5> ) {
4561  ResultType_t<MT3> tmp( serial( A ) );
4562  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4563  addAssign( C, tmp );
4564  }
4565  else {
4566  gemm( C, A, B, ET(1), ET(1) );
4567  }
4568  }
4570 #endif
4571  //**********************************************************************************************
4572 
4573  //**Addition assignment to sparse matrices******************************************************
4574  // No special implementation for the addition assignment to sparse matrices.
4575  //**********************************************************************************************
4576 
4577  //**Subtraction assignment to dense matrices****************************************************
4590  template< typename MT // Type of the target dense matrix
4591  , bool SO > // Storage order of the target dense matrix
4592  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
4593  {
4595 
4596  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4597  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4598 
4599  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4600  return;
4601  }
4602 
4603  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
4604  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
4605 
4606  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4607  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4608  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4609  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4610  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4611  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4612 
4613  TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
4614  }
4616  //**********************************************************************************************
4617 
4618  //**Subtraction assignment to dense matrices (kernel selection)*********************************
4629  template< typename MT3 // Type of the left-hand side target matrix
4630  , typename MT4 // Type of the left-hand side matrix operand
4631  , typename MT5 > // Type of the right-hand side matrix operand
4632  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4633  {
4634  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
4635  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
4636  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
4637  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
4638  selectSmallSubAssignKernel( C, A, B );
4639  else
4640  selectBlasSubAssignKernel( C, A, B );
4641  }
4643  //**********************************************************************************************
4644 
4645  //**Default subtraction assignment to row-major dense matrices (general/general)****************
4659  template< typename MT3 // Type of the left-hand side target matrix
4660  , typename MT4 // Type of the left-hand side matrix operand
4661  , typename MT5 > // Type of the right-hand side matrix operand
4662  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4663  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4664  {
4665  const size_t M( A.rows() );
4666  const size_t N( B.columns() );
4667  const size_t K( A.columns() );
4668 
4669  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4670 
4671  for( size_t i=0UL; i<M; ++i )
4672  {
4673  const size_t kbegin( ( IsUpper_v<MT4> )
4674  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4675  :( 0UL ) );
4676  const size_t kend( ( IsLower_v<MT4> )
4677  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
4678  :( K ) );
4679  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4680 
4681  for( size_t k=kbegin; k<kend; ++k )
4682  {
4683  const size_t jbegin( ( IsUpper_v<MT5> )
4684  ?( ( IsStrictlyUpper_v<MT5> )
4685  ?( UPP ? max(i,k+1UL) : k+1UL )
4686  :( UPP ? max(i,k) : k ) )
4687  :( UPP ? i : 0UL ) );
4688  const size_t jend( ( IsLower_v<MT5> )
4689  ?( ( IsStrictlyLower_v<MT5> )
4690  ?( LOW ? min(i+1UL,k) : k )
4691  :( LOW ? min(i,k)+1UL : k+1UL ) )
4692  :( LOW ? i+1UL : N ) );
4693 
4694  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
4695  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4696 
4697  const size_t jnum( jend - jbegin );
4698  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4699 
4700  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4701  C(i,j ) -= A(i,k) * B(k,j );
4702  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
4703  }
4704  if( jpos < jend ) {
4705  C(i,jpos) -= A(i,k) * B(k,jpos);
4706  }
4707  }
4708  }
4709  }
4711  //**********************************************************************************************
4712 
4713  //**Default subtraction assignment to column-major dense matrices (general/general)*************
4727  template< typename MT3 // Type of the left-hand side target matrix
4728  , typename MT4 // Type of the left-hand side matrix operand
4729  , typename MT5 > // Type of the right-hand side matrix operand
4730  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4731  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4732  {
4733  const size_t M( A.rows() );
4734  const size_t N( B.columns() );
4735  const size_t K( A.columns() );
4736 
4737  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4738 
4739  for( size_t j=0UL; j<N; ++j )
4740  {
4741  const size_t kbegin( ( IsLower_v<MT5> )
4742  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4743  :( 0UL ) );
4744  const size_t kend( ( IsUpper_v<MT5> )
4745  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
4746  :( K ) );
4747  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4748 
4749  for( size_t k=kbegin; k<kend; ++k )
4750  {
4751  const size_t ibegin( ( IsLower_v<MT4> )
4752  ?( ( IsStrictlyLower_v<MT4> )
4753  ?( LOW ? max(j,k+1UL) : k+1UL )
4754  :( LOW ? max(j,k) : k ) )
4755  :( LOW ? j : 0UL ) );
4756  const size_t iend( ( IsUpper_v<MT4> )
4757  ?( ( IsStrictlyUpper_v<MT4> )
4758  ?( UPP ? min(j+1UL,k) : k )
4759  :( UPP ? min(j,k)+1UL : k+1UL ) )
4760  :( UPP ? j+1UL : M ) );
4761 
4762  if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
4763  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4764 
4765  const size_t inum( iend - ibegin );
4766  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4767 
4768  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4769  C(i ,j) -= A(i ,k) * B(k,j);
4770  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
4771  }
4772  if( ipos < iend ) {
4773  C(ipos,j) -= A(ipos,k) * B(k,j);
4774  }
4775  }
4776  }
4777  }
4779  //**********************************************************************************************
4780 
4781  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
4795  template< typename MT3 // Type of the left-hand side target matrix
4796  , typename MT4 // Type of the left-hand side matrix operand
4797  , typename MT5 > // Type of the right-hand side matrix operand
4798  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4799  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4800  {
4801  constexpr size_t block( BLOCK_SIZE );
4802 
4803  const size_t M( A.rows() );
4804  const size_t N( B.columns() );
4805 
4806  for( size_t ii=0UL; ii<M; ii+=block ) {
4807  const size_t iend( min( M, ii+block ) );
4808  for( size_t jj=0UL; jj<N; jj+=block ) {
4809  const size_t jend( min( N, jj+block ) );
4810  for( size_t i=ii; i<iend; ++i )
4811  {
4812  const size_t jbegin( ( IsUpper_v<MT4> )
4813  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
4814  :( jj ) );
4815  const size_t jpos( ( IsLower_v<MT4> )
4816  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
4817  :( jend ) );
4818 
4819  for( size_t j=jbegin; j<jpos; ++j ) {
4820  C(i,j) -= A(i,j) * B(j,j);
4821  }
4822  }
4823  }
4824  }
4825  }
4827  //**********************************************************************************************
4828 
4829  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
4843  template< typename MT3 // Type of the left-hand side target matrix
4844  , typename MT4 // Type of the left-hand side matrix operand
4845  , typename MT5 > // Type of the right-hand side matrix operand
4846  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4847  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4848  {
4849  const size_t M( A.rows() );
4850  const size_t N( B.columns() );
4851 
4852  for( size_t j=0UL; j<N; ++j )
4853  {
4854  const size_t ibegin( ( IsLower_v<MT4> )
4855  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
4856  :( 0UL ) );
4857  const size_t iend( ( IsUpper_v<MT4> )
4858  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
4859  :( M ) );
4860  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4861 
4862  const size_t inum( iend - ibegin );
4863  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4864 
4865  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4866  C(i ,j) -= A(i ,j) * B(j,j);
4867  C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
4868  }
4869  if( ipos < iend ) {
4870  C(ipos,j) -= A(ipos,j) * B(j,j);
4871  }
4872  }
4873  }
4875  //**********************************************************************************************
4876 
4877  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
4891  template< typename MT3 // Type of the left-hand side target matrix
4892  , typename MT4 // Type of the left-hand side matrix operand
4893  , typename MT5 > // Type of the right-hand side matrix operand
4894  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4895  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4896  {
4897  const size_t M( A.rows() );
4898  const size_t N( B.columns() );
4899 
4900  for( size_t i=0UL; i<M; ++i )
4901  {
4902  const size_t jbegin( ( IsUpper_v<MT5> )
4903  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
4904  :( 0UL ) );
4905  const size_t jend( ( IsLower_v<MT5> )
4906  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
4907  :( N ) );
4908  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4909 
4910  const size_t jnum( jend - jbegin );
4911  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4912 
4913  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4914  C(i,j ) -= A(i,i) * B(i,j );
4915  C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
4916  }
4917  if( jpos < jend ) {
4918  C(i,jpos) -= A(i,i) * B(i,jpos);
4919  }
4920  }
4921  }
4923  //**********************************************************************************************
4924 
4925  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
4939  template< typename MT3 // Type of the left-hand side target matrix
4940  , typename MT4 // Type of the left-hand side matrix operand
4941  , typename MT5 > // Type of the right-hand side matrix operand
4942  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4943  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4944  {
4945  constexpr size_t block( BLOCK_SIZE );
4946 
4947  const size_t M( A.rows() );
4948  const size_t N( B.columns() );
4949 
4950  for( size_t jj=0UL; jj<N; jj+=block ) {
4951  const size_t jend( min( N, jj+block ) );
4952  for( size_t ii=0UL; ii<M; ii+=block ) {
4953  const size_t iend( min( M, ii+block ) );
4954  for( size_t j=jj; j<jend; ++j )
4955  {
4956  const size_t ibegin( ( IsLower_v<MT5> )
4957  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
4958  :( ii ) );
4959  const size_t ipos( ( IsUpper_v<MT5> )
4960  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
4961  :( iend ) );
4962 
4963  for( size_t i=ibegin; i<ipos; ++i ) {
4964  C(i,j) -= A(i,i) * B(i,j);
4965  }
4966  }
4967  }
4968  }
4969  }
4971  //**********************************************************************************************
4972 
4973  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
4987  template< typename MT3 // Type of the left-hand side target matrix
4988  , typename MT4 // Type of the left-hand side matrix operand
4989  , typename MT5 > // Type of the right-hand side matrix operand
4990  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4991  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4992  {
4993  for( size_t i=0UL; i<A.rows(); ++i ) {
4994  C(i,i) -= A(i,i) * B(i,i);
4995  }
4996  }
4998  //**********************************************************************************************
4999 
5000  //**Default subtraction assignment to dense matrices (small matrices)***************************
5014  template< typename MT3 // Type of the left-hand side target matrix
5015  , typename MT4 // Type of the left-hand side matrix operand
5016  , typename MT5 > // Type of the right-hand side matrix operand
5017  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5018  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5019  {
5020  selectDefaultSubAssignKernel( C, A, B );
5021  }
5023  //**********************************************************************************************
5024 
5025  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
5040  template< typename MT3 // Type of the left-hand side target matrix
5041  , typename MT4 // Type of the left-hand side matrix operand
5042  , typename MT5 > // Type of the right-hand side matrix operand
5043  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5044  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5045  {
5046  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5047 
5048  const size_t M( A.rows() );
5049  const size_t N( B.columns() );
5050  const size_t K( A.columns() );
5051 
5052  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5053 
5054  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
5055  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
5056 
5057  size_t j( 0UL );
5058 
5059  if( IsIntegral_v<ElementType> )
5060  {
5061  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5062  for( size_t i=0UL; i<M; ++i )
5063  {
5064  const size_t kbegin( ( IsUpper_v<MT4> )
5065  ?( ( IsLower_v<MT5> )
5066  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5067  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5068  :( IsLower_v<MT5> ? j : 0UL ) );
5069  const size_t kend( ( IsLower_v<MT4> )
5070  ?( ( IsUpper_v<MT5> )
5071  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5072  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5073  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
5074 
5075  SIMDType xmm1( C.load(i,j ) );
5076  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5077  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5078  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
5079  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
5080  SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
5081  SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
5082  SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
5083 
5084  for( size_t k=kbegin; k<kend; ++k ) {
5085  const SIMDType a1( set( A(i,k) ) );
5086  xmm1 -= a1 * B.load(k,j );
5087  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5088  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5089  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5090  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5091  xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
5092  xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
5093  xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
5094  }
5095 
5096  C.store( i, j , xmm1 );
5097  C.store( i, j+SIMDSIZE , xmm2 );
5098  C.store( i, j+SIMDSIZE*2UL, xmm3 );
5099  C.store( i, j+SIMDSIZE*3UL, xmm4 );
5100  C.store( i, j+SIMDSIZE*4UL, xmm5 );
5101  C.store( i, j+SIMDSIZE*5UL, xmm6 );
5102  C.store( i, j+SIMDSIZE*6UL, xmm7 );
5103  C.store( i, j+SIMDSIZE*7UL, xmm8 );
5104  }
5105  }
5106  }
5107 
5108  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5109  {
5110  size_t i( 0UL );
5111 
5112  for( ; (i+2UL) <= M; i+=2UL )
5113  {
5114  const size_t kbegin( ( IsUpper_v<MT4> )
5115  ?( ( IsLower_v<MT5> )
5116  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5117  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5118  :( IsLower_v<MT5> ? j : 0UL ) );
5119  const size_t kend( ( IsLower_v<MT4> )
5120  ?( ( IsUpper_v<MT5> )
5121  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
5122  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5123  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
5124 
5125  SIMDType xmm1 ( C.load(i ,j ) );
5126  SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
5127  SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
5128  SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
5129  SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
5130  SIMDType xmm6 ( C.load(i+1UL,j ) );
5131  SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
5132  SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
5133  SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
5134  SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
5135 
5136  for( size_t k=kbegin; k<kend; ++k ) {
5137  const SIMDType a1( set( A(i ,k) ) );
5138  const SIMDType a2( set( A(i+1UL,k) ) );
5139  const SIMDType b1( B.load(k,j ) );
5140  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5141  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5142  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5143  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5144  xmm1 -= a1 * b1;
5145  xmm2 -= a1 * b2;
5146  xmm3 -= a1 * b3;
5147  xmm4 -= a1 * b4;
5148  xmm5 -= a1 * b5;
5149  xmm6 -= a2 * b1;
5150  xmm7 -= a2 * b2;
5151  xmm8 -= a2 * b3;
5152  xmm9 -= a2 * b4;
5153  xmm10 -= a2 * b5;
5154  }
5155 
5156  C.store( i , j , xmm1 );
5157  C.store( i , j+SIMDSIZE , xmm2 );
5158  C.store( i , j+SIMDSIZE*2UL, xmm3 );
5159  C.store( i , j+SIMDSIZE*3UL, xmm4 );
5160  C.store( i , j+SIMDSIZE*4UL, xmm5 );
5161  C.store( i+1UL, j , xmm6 );
5162  C.store( i+1UL, j+SIMDSIZE , xmm7 );
5163  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
5164  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
5165  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
5166  }
5167 
5168  if( i < M )
5169  {
5170  const size_t kbegin( ( IsUpper_v<MT4> )
5171  ?( ( IsLower_v<MT5> )
5172  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5173  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5174  :( IsLower_v<MT5> ? j : 0UL ) );
5175  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
5176 
5177  SIMDType xmm1( C.load(i,j ) );
5178  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5179  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5180  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
5181  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
5182 
5183  for( size_t k=kbegin; k<kend; ++k ) {
5184  const SIMDType a1( set( A(i,k) ) );
5185  xmm1 -= a1 * B.load(k,j );
5186  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5187  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5188  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5189  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5190  }
5191 
5192  C.store( i, j , xmm1 );
5193  C.store( i, j+SIMDSIZE , xmm2 );
5194  C.store( i, j+SIMDSIZE*2UL, xmm3 );
5195  C.store( i, j+SIMDSIZE*3UL, xmm4 );
5196  C.store( i, j+SIMDSIZE*4UL, xmm5 );
5197  }
5198  }
5199 
5200  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5201  {
5202  size_t i( 0UL );
5203 
5204  for( ; (i+2UL) <= M; i+=2UL )
5205  {
5206  const size_t kbegin( ( IsUpper_v<MT4> )
5207  ?( ( IsLower_v<MT5> )
5208  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5209  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5210  :( IsLower_v<MT5> ? j : 0UL ) );
5211  const size_t kend( ( IsLower_v<MT4> )
5212  ?( ( IsUpper_v<MT5> )
5213  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5214  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5215  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
5216 
5217  SIMDType xmm1( C.load(i ,j ) );
5218  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
5219  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
5220  SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
5221  SIMDType xmm5( C.load(i+1UL,j ) );
5222  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
5223  SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
5224  SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
5225 
5226  for( size_t k=kbegin; k<kend; ++k ) {
5227  const SIMDType a1( set( A(i ,k) ) );
5228  const SIMDType a2( set( A(i+1UL,k) ) );
5229  const SIMDType b1( B.load(k,j ) );
5230  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5231  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5232  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5233  xmm1 -= a1 * b1;
5234  xmm2 -= a1 * b2;
5235  xmm3 -= a1 * b3;
5236  xmm4 -= a1 * b4;
5237  xmm5 -= a2 * b1;
5238  xmm6 -= a2 * b2;
5239  xmm7 -= a2 * b3;
5240  xmm8 -= a2 * b4;
5241  }
5242 
5243  C.store( i , j , xmm1 );
5244  C.store( i , j+SIMDSIZE , xmm2 );
5245  C.store( i , j+SIMDSIZE*2UL, xmm3 );
5246  C.store( i , j+SIMDSIZE*3UL, xmm4 );
5247  C.store( i+1UL, j , xmm5 );
5248  C.store( i+1UL, j+SIMDSIZE , xmm6 );
5249  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
5250  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
5251  }
5252 
5253  if( i < M )
5254  {
5255  const size_t kbegin( ( IsUpper_v<MT4> )
5256  ?( ( IsLower_v<MT5> )
5257  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5258  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5259  :( IsLower_v<MT5> ? j : 0UL ) );
5260  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
5261 
5262  SIMDType xmm1( C.load(i,j ) );
5263  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5264  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5265  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
5266 
5267  for( size_t k=kbegin; k<kend; ++k ) {
5268  const SIMDType a1( set( A(i,k) ) );
5269  xmm1 -= a1 * B.load(k,j );
5270  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5271  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5272  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5273  }
5274 
5275  C.store( i, j , xmm1 );
5276  C.store( i, j+SIMDSIZE , xmm2 );
5277  C.store( i, j+SIMDSIZE*2UL, xmm3 );
5278  C.store( i, j+SIMDSIZE*3UL, xmm4 );
5279  }
5280  }
5281 
5282  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5283  {
5284  size_t i( 0UL );
5285 
5286  for( ; (i+2UL) <= M; i+=2UL )
5287  {
5288  const size_t kbegin( ( IsUpper_v<MT4> )
5289  ?( ( IsLower_v<MT5> )
5290  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5291  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5292  :( IsLower_v<MT5> ? j : 0UL ) );
5293  const size_t kend( ( IsLower_v<MT4> )
5294  ?( ( IsUpper_v<MT5> )
5295  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
5296  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5297  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
5298 
5299  SIMDType xmm1( C.load(i ,j ) );
5300  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
5301  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
5302  SIMDType xmm4( C.load(i+1UL,j ) );
5303  SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
5304  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
5305 
5306  for( size_t k=kbegin; k<kend; ++k ) {
5307  const SIMDType a1( set( A(i ,k) ) );
5308  const SIMDType a2( set( A(i+1UL,k) ) );
5309  const SIMDType b1( B.load(k,j ) );
5310  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5311  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5312  xmm1 -= a1 * b1;
5313  xmm2 -= a1 * b2;
5314  xmm3 -= a1 * b3;
5315  xmm4 -= a2 * b1;
5316  xmm5 -= a2 * b2;
5317  xmm6 -= a2 * b3;
5318  }
5319 
5320  C.store( i , j , xmm1 );
5321  C.store( i , j+SIMDSIZE , xmm2 );
5322  C.store( i , j+SIMDSIZE*2UL, xmm3 );
5323  C.store( i+1UL, j , xmm4 );
5324  C.store( i+1UL, j+SIMDSIZE , xmm5 );
5325  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
5326  }
5327 
5328  if( i < M )
5329  {
5330  const size_t kbegin( ( IsUpper_v<MT4> )
5331  ?( ( IsLower_v<MT5> )
5332  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5333  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5334  :( IsLower_v<MT5> ? j : 0UL ) );
5335  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
5336 
5337  SIMDType xmm1( C.load(i,j ) );
5338  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
5339  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
5340 
5341  for( size_t k=kbegin; k<kend; ++k ) {
5342  const SIMDType a1( set( A(i,k) ) );
5343  xmm1 -= a1 * B.load(k,j );
5344  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5345  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5346  }
5347 
5348  C.store( i, j , xmm1 );
5349  C.store( i, j+SIMDSIZE , xmm2 );
5350  C.store( i, j+SIMDSIZE*2UL, xmm3 );
5351  }
5352  }
5353 
5354  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5355  {
5356  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
5357  size_t i( LOW ? j : 0UL );
5358 
5359  for( ; (i+4UL) <= iend; i+=4UL )
5360  {
5361  const size_t kbegin( ( IsUpper_v<MT4> )
5362  ?( ( IsLower_v<MT5> )
5363  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5364  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5365  :( IsLower_v<MT5> ? j : 0UL ) );
5366  const size_t kend( ( IsLower_v<MT4> )
5367  ?( ( IsUpper_v<MT5> )
5368  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
5369  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
5370  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5371 
5372  SIMDType xmm1( C.load(i ,j ) );
5373  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
5374  SIMDType xmm3( C.load(i+1UL,j ) );
5375  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
5376  SIMDType xmm5( C.load(i+2UL,j ) );
5377  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
5378  SIMDType xmm7( C.load(i+3UL,j ) );
5379  SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
5380 
5381  for( size_t k=kbegin; k<kend; ++k ) {
5382  const SIMDType a1( set( A(i ,k) ) );
5383  const SIMDType a2( set( A(i+1UL,k) ) );
5384  const SIMDType a3( set( A(i+2UL,k) ) );
5385  const SIMDType a4( set( A(i+3UL,k) ) );
5386  const SIMDType b1( B.load(k,j ) );
5387  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5388  xmm1 -= a1 * b1;
5389  xmm2 -= a1 * b2;
5390  xmm3 -= a2 * b1;
5391  xmm4 -= a2 * b2;
5392  xmm5 -= a3 * b1;
5393  xmm6 -= a3 * b2;
5394  xmm7 -= a4 * b1;
5395  xmm8 -= a4 * b2;
5396  }
5397 
5398  C.store( i , j , xmm1 );
5399  C.store( i , j+SIMDSIZE, xmm2 );
5400  C.store( i+1UL, j , xmm3 );
5401  C.store( i+1UL, j+SIMDSIZE, xmm4 );
5402  C.store( i+2UL, j , xmm5 );
5403  C.store( i+2UL, j+SIMDSIZE, xmm6 );
5404  C.store( i+3UL, j , xmm7 );
5405  C.store( i+3UL, j+SIMDSIZE, xmm8 );
5406  }
5407 
5408  for( ; (i+3UL) <= iend; i+=3UL )
5409  {
5410  const size_t kbegin( ( IsUpper_v<MT4> )
5411  ?( ( IsLower_v<MT5> )
5412  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5413  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5414  :( IsLower_v<MT5> ? j : 0UL ) );
5415  const size_t kend( ( IsLower_v<MT4> )
5416  ?( ( IsUpper_v<MT5> )
5417  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
5418  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
5419  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5420 
5421  SIMDType xmm1( C.load(i ,j ) );
5422  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
5423  SIMDType xmm3( C.load(i+1UL,j ) );
5424  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
5425  SIMDType xmm5( C.load(i+2UL,j ) );
5426  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
5427 
5428  for( size_t k=kbegin; k<kend; ++k ) {
5429  const SIMDType a1( set( A(i ,k) ) );
5430  const SIMDType a2( set( A(i+1UL,k) ) );
5431  const SIMDType a3( set( A(i+2UL,k) ) );
5432  const SIMDType b1( B.load(k,j ) );
5433  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5434  xmm1 -= a1 * b1;
5435  xmm2 -= a1 * b2;
5436  xmm3 -= a2 * b1;
5437  xmm4 -= a2 * b2;
5438  xmm5 -= a3 * b1;
5439  xmm6 -= a3 * b2;
5440  }
5441 
5442  C.store( i , j , xmm1 );
5443  C.store( i , j+SIMDSIZE, xmm2 );
5444  C.store( i+1UL, j , xmm3 );
5445  C.store( i+1UL, j+SIMDSIZE, xmm4 );
5446  C.store( i+2UL, j , xmm5 );
5447  C.store( i+2UL, j+SIMDSIZE, xmm6 );
5448  }
5449 
5450  for( ; (i+2UL) <= iend; i+=2UL )
5451  {
5452  const size_t kbegin( ( IsUpper_v<MT4> )
5453  ?( ( IsLower_v<MT5> )
5454  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5455  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5456  :( IsLower_v<MT5> ? j : 0UL ) );
5457  const size_t kend( ( IsLower_v<MT4> )
5458  ?( ( IsUpper_v<MT5> )
5459  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
5460  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5461  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5462 
5463  SIMDType xmm1( C.load(i ,j ) );
5464  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
5465  SIMDType xmm3( C.load(i+1UL,j ) );
5466  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
5467  SIMDType xmm5, xmm6, xmm7, xmm8;
5468  size_t k( kbegin );
5469 
5470  for( ; (k+2UL) <= kend; k+=2UL ) {
5471  const SIMDType a1( set( A(i ,k ) ) );
5472  const SIMDType a2( set( A(i+1UL,k ) ) );
5473  const SIMDType a3( set( A(i ,k+1UL) ) );
5474  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
5475  const SIMDType b1( B.load(k ,j ) );
5476  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
5477  const SIMDType b3( B.load(k+1UL,j ) );
5478  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
5479  xmm1 -= a1 * b1;
5480  xmm2 -= a1 * b2;
5481  xmm3 -= a2 * b1;
5482  xmm4 -= a2 * b2;
5483  xmm5 -= a3 * b3;
5484  xmm6 -= a3 * b4;
5485  xmm7 -= a4 * b3;
5486  xmm8 -= a4 * b4;
5487  }
5488 
5489  for( ; k<kend; ++k ) {
5490  const SIMDType a1( set( A(i ,k) ) );
5491  const SIMDType a2( set( A(i+1UL,k) ) );
5492  const SIMDType b1( B.load(k,j ) );
5493  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5494  xmm1 -= a1 * b1;
5495  xmm2 -= a1 * b2;
5496  xmm3 -= a2 * b1;
5497  xmm4 -= a2 * b2;
5498  }
5499 
5500  C.store( i , j , xmm1+xmm5 );
5501  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
5502  C.store( i+1UL, j , xmm3+xmm7 );
5503  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
5504  }
5505 
5506  if( i < iend )
5507  {
5508  const size_t kbegin( ( IsUpper_v<MT4> )
5509  ?( ( IsLower_v<MT5> )
5510  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5511  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5512  :( IsLower_v<MT5> ? j : 0UL ) );
5513  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
5514 
5515  SIMDType xmm1( C.load(i,j ) );
5516  SIMDType xmm2( C.load(i,j+SIMDSIZE) );
5517  SIMDType xmm3, xmm4;
5518  size_t k( kbegin );
5519 
5520  for( ; (k+2UL) <= kend; k+=2UL ) {
5521  const SIMDType a1( set( A(i,k ) ) );
5522  const SIMDType a2( set( A(i,k+1UL) ) );
5523  xmm1 -= a1 * B.load(k ,j );
5524  xmm2 -= a1 * B.load(k ,j+SIMDSIZE);
5525  xmm3 -= a2 * B.load(k+1UL,j );
5526  xmm4 -= a2 * B.load(k+1UL,j+SIMDSIZE);
5527  }
5528 
5529  for( ; k<kend; ++k ) {
5530  const SIMDType a1( set( A(i,k) ) );
5531  xmm1 -= a1 * B.load(k,j );
5532  xmm2 -= a1 * B.load(k,j+SIMDSIZE);
5533  }
5534 
5535  C.store( i, j , xmm1+xmm3 );
5536  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
5537  }
5538  }
5539 
5540  for( ; j<jpos; j+=SIMDSIZE )
5541  {
5542  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
5543  size_t i( LOW ? j : 0UL );
5544 
5545  for( ; (i+4UL) <= iend; i+=4UL )
5546  {
5547  const size_t kbegin( ( IsUpper_v<MT4> )
5548  ?( ( IsLower_v<MT5> )
5549  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5550  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5551  :( IsLower_v<MT5> ? j : 0UL ) );
5552  const size_t kend( ( IsLower_v<MT4> )
5553  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
5554  :( K ) );
5555 
5556  SIMDType xmm1( C.load(i ,j) );
5557  SIMDType xmm2( C.load(i+1UL,j) );
5558  SIMDType xmm3( C.load(i+2UL,j) );
5559  SIMDType xmm4( C.load(i+3UL,j) );
5560  SIMDType xmm5, xmm6, xmm7, xmm8;
5561  size_t k( kbegin );
5562 
5563  for( ; (k+2UL) <= kend; k+=2UL ) {
5564  const SIMDType b1( B.load(k ,j) );
5565  const SIMDType b2( B.load(k+1UL,j) );
5566  xmm1 -= set( A(i ,k ) ) * b1;
5567  xmm2 -= set( A(i+1UL,k ) ) * b1;
5568  xmm3 -= set( A(i+2UL,k ) ) * b1;
5569  xmm4 -= set( A(i+3UL,k ) ) * b1;
5570  xmm5 -= set( A(i ,k+1UL) ) * b2;
5571  xmm6 -= set( A(i+1UL,k+1UL) ) * b2;
5572  xmm7 -= set( A(i+2UL,k+1UL) ) * b2;
5573  xmm8 -= set( A(i+3UL,k+1UL) ) * b2;
5574  }
5575 
5576  for( ; k<kend; ++k ) {
5577  const SIMDType b1( B.load(k,j) );
5578  xmm1 -= set( A(i ,k) ) * b1;
5579  xmm2 -= set( A(i+1UL,k) ) * b1;
5580  xmm3 -= set( A(i+2UL,k) ) * b1;
5581  xmm4 -= set( A(i+3UL,k) ) * b1;
5582  }
5583 
5584  C.store( i , j, xmm1+xmm5 );
5585  C.store( i+1UL, j, xmm2+xmm6 );
5586  C.store( i+2UL, j, xmm3+xmm7 );
5587  C.store( i+3UL, j, xmm4+xmm8 );
5588  }
5589 
5590  for( ; (i+3UL) <= iend; i+=3UL )
5591  {
5592  const size_t kbegin( ( IsUpper_v<MT4> )
5593  ?( ( IsLower_v<MT5> )
5594  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5595  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5596  :( IsLower_v<MT5> ? j : 0UL ) );
5597  const size_t kend( ( IsLower_v<MT4> )
5598  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
5599  :( K ) );
5600 
5601  SIMDType xmm1( C.load(i ,j) );
5602  SIMDType xmm2( C.load(i+1UL,j) );
5603  SIMDType xmm3( C.load(i+2UL,j) );
5604  SIMDType xmm4, xmm5, xmm6;
5605  size_t k( kbegin );
5606 
5607  for( ; (k+2UL) <= kend; k+=2UL ) {
5608  const SIMDType b1( B.load(k ,j) );
5609  const SIMDType b2( B.load(k+1UL,j) );
5610  xmm1 -= set( A(i ,k ) ) * b1;
5611  xmm2 -= set( A(i+1UL,k ) ) * b1;
5612  xmm3 -= set( A(i+2UL,k ) ) * b1;
5613  xmm4 -= set( A(i ,k+1UL) ) * b2;
5614  xmm5 -= set( A(i+1UL,k+1UL) ) * b2;
5615  xmm6 -= set( A(i+2UL,k+1UL) ) * b2;
5616  }
5617 
5618  for( ; k<kend; ++k ) {
5619  const SIMDType b1( B.load(k,j) );
5620  xmm1 -= set( A(i ,k) ) * b1;
5621  xmm2 -= set( A(i+1UL,k) ) * b1;
5622  xmm3 -= set( A(i+2UL,k) ) * b1;
5623  }
5624 
5625  C.store( i , j, xmm1+xmm4 );
5626  C.store( i+1UL, j, xmm2+xmm5 );
5627  C.store( i+2UL, j, xmm3+xmm6 );
5628  }
5629 
5630  for( ; (i+2UL) <= iend; i+=2UL )
5631  {
5632  const size_t kbegin( ( IsUpper_v<MT4> )
5633  ?( ( IsLower_v<MT5> )
5634  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5635  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5636  :( IsLower_v<MT5> ? j : 0UL ) );
5637  const size_t kend( ( IsLower_v<MT4> )
5638  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5639  :( K ) );
5640 
5641  SIMDType xmm1( C.load(i ,j) );
5642  SIMDType xmm2( C.load(i+1UL,j) );
5643  SIMDType xmm3, xmm4;
5644  size_t k( kbegin );
5645 
5646  for( ; (k+2UL) <= kend; k+=2UL ) {
5647  const SIMDType b1( B.load(k ,j) );
5648  const SIMDType b2( B.load(k+1UL,j) );
5649  xmm1 -= set( A(i ,k ) ) * b1;
5650  xmm2 -= set( A(i+1UL,k ) ) * b1;
5651  xmm3 -= set( A(i ,k+1UL) ) * b2;
5652  xmm4 -= set( A(i+1UL,k+1UL) ) * b2;
5653  }
5654 
5655  for( ; k<kend; ++k ) {
5656  const SIMDType b1( B.load(k,j) );
5657  xmm1 -= set( A(i ,k) ) * b1;
5658  xmm2 -= set( A(i+1UL,k) ) * b1;
5659  }
5660 
5661  C.store( i , j, xmm1+xmm3 );
5662  C.store( i+1UL, j, xmm2+xmm4 );
5663  }
5664 
5665  if( i < iend )
5666  {
5667  const size_t kbegin( ( IsUpper_v<MT4> )
5668  ?( ( IsLower_v<MT5> )
5669  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5670  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5671  :( IsLower_v<MT5> ? j : 0UL ) );
5672 
5673  SIMDType xmm1( C.load(i,j) );
5674  SIMDType xmm2;
5675  size_t k( kbegin );
5676 
5677  for( ; (k+2UL) <= K; k+=2UL ) {
5678  xmm1 -= set( A(i,k ) ) * B.load(k ,j);
5679  xmm2 -= set( A(i,k+1UL) ) * B.load(k+1UL,j);
5680  }
5681 
5682  for( ; k<K; ++k ) {
5683  xmm1 -= set( A(i,k) ) * B.load(k,j);
5684  }
5685 
5686  C.store( i, j, xmm1+xmm2 );
5687  }
5688  }
5689 
5690  for( ; remainder && j<N; ++j )
5691  {
5692  const size_t iend( UPP ? j+1UL : M );
5693  size_t i( LOW ? j : 0UL );
5694 
5695  for( ; (i+2UL) <= iend; i+=2UL )
5696  {
5697  const size_t kbegin( ( IsUpper_v<MT4> )
5698  ?( ( IsLower_v<MT5> )
5699  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5700  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5701  :( IsLower_v<MT5> ? j : 0UL ) );
5702  const size_t kend( ( IsLower_v<MT4> )
5703  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5704  :( K ) );
5705 
5706  ElementType value1( C(i ,j) );
5707  ElementType value2( C(i+1UL,j) );
5708 
5709  for( size_t k=kbegin; k<kend; ++k ) {
5710  value1 -= A(i ,k) * B(k,j);
5711  value2 -= A(i+1UL,k) * B(k,j);
5712  }
5713 
5714  C(i ,j) = value1;
5715  C(i+1UL,j) = value2;
5716  }
5717 
5718  if( i < iend )
5719  {
5720  const size_t kbegin( ( IsUpper_v<MT4> )
5721  ?( ( IsLower_v<MT5> )
5722  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5723  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5724  :( IsLower_v<MT5> ? j : 0UL ) );
5725 
5726  ElementType value( C(i,j) );
5727 
5728  for( size_t k=kbegin; k<K; ++k ) {
5729  value -= A(i,k) * B(k,j);
5730  }
5731 
5732  C(i,j) = value;
5733  }
5734  }
5735  }
5737  //**********************************************************************************************
5738 
5739  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
5754  template< typename MT3 // Type of the left-hand side target matrix
5755  , typename MT4 // Type of the left-hand side matrix operand
5756  , typename MT5 > // Type of the right-hand side matrix operand
5757  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5758  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5759  {
5760  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
5761 
5762  const size_t M( A.rows() );
5763  const size_t N( B.columns() );
5764  const size_t K( A.columns() );
5765 
5766  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5767 
5768  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
5769  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5770 
5771  size_t i( 0UL );
5772 
5773  if( IsIntegral_v<ElementType> )
5774  {
5775  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5776  for( size_t j=0UL; j<N; ++j )
5777  {
5778  const size_t kbegin( ( IsLower_v<MT5> )
5779  ?( ( IsUpper_v<MT4> )
5780  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5781  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5782  :( IsUpper_v<MT4> ? i : 0UL ) );
5783  const size_t kend( ( IsUpper_v<MT5> )
5784  ?( ( IsLower_v<MT4> )
5785  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
5786  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
5787  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
5788 
5789  SIMDType xmm1( C.load(i ,j) );
5790  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
5791  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
5792  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
5793  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
5794  SIMDType xmm6( C.load(i+SIMDSIZE*5UL,j) );
5795  SIMDType xmm7( C.load(i+SIMDSIZE*6UL,j) );
5796  SIMDType xmm8( C.load(i+SIMDSIZE*7UL,j) );
5797 
5798  for( size_t k=kbegin; k<kend; ++k ) {
5799  const SIMDType b1( set( B(k,j) ) );
5800  xmm1 -= A.load(i ,k) * b1;
5801  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5802  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5803  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5804  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5805  xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
5806  xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
5807  xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
5808  }
5809 
5810  C.store( i , j, xmm1 );
5811  C.store( i+SIMDSIZE , j, xmm2 );
5812  C.store( i+SIMDSIZE*2UL, j, xmm3 );
5813  C.store( i+SIMDSIZE*3UL, j, xmm4 );
5814  C.store( i+SIMDSIZE*4UL, j, xmm5 );
5815  C.store( i+SIMDSIZE*5UL, j, xmm6 );
5816  C.store( i+SIMDSIZE*6UL, j, xmm7 );
5817  C.store( i+SIMDSIZE*7UL, j, xmm8 );
5818  }
5819  }
5820  }
5821 
5822  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5823  {
5824  size_t j( 0UL );
5825 
5826  for( ; (j+2UL) <= N; j+=2UL )
5827  {
5828  const size_t kbegin( ( IsLower_v<MT5> )
5829  ?( ( IsUpper_v<MT4> )
5830  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5831  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5832  :( IsUpper_v<MT4> ? i : 0UL ) );
5833  const size_t kend( ( IsUpper_v<MT5> )
5834  ?( ( IsLower_v<MT4> )
5835  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5836  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5837  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
5838 
5839  SIMDType xmm1 ( C.load(i ,j ) );
5840  SIMDType xmm2 ( C.load(i+SIMDSIZE ,j ) );
5841  SIMDType xmm3 ( C.load(i+SIMDSIZE*2UL,j ) );
5842  SIMDType xmm4 ( C.load(i+SIMDSIZE*3UL,j ) );
5843  SIMDType xmm5 ( C.load(i+SIMDSIZE*4UL,j ) );
5844  SIMDType xmm6 ( C.load(i ,j+1UL) );
5845  SIMDType xmm7 ( C.load(i+SIMDSIZE ,j+1UL) );
5846  SIMDType xmm8 ( C.load(i+SIMDSIZE*2UL,j+1UL) );
5847  SIMDType xmm9 ( C.load(i+SIMDSIZE*3UL,j+1UL) );
5848  SIMDType xmm10( C.load(i+SIMDSIZE*4UL,j+1UL) );
5849 
5850  for( size_t k=kbegin; k<kend; ++k ) {
5851  const SIMDType a1( A.load(i ,k) );
5852  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5853  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5854  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5855  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5856  const SIMDType b1( set( B(k,j ) ) );
5857  const SIMDType b2( set( B(k,j+1UL) ) );
5858  xmm1 -= a1 * b1;
5859  xmm2 -= a2 * b1;
5860  xmm3 -= a3 * b1;
5861  xmm4 -= a4 * b1;
5862  xmm5 -= a5 * b1;
5863  xmm6 -= a1 * b2;
5864  xmm7 -= a2 * b2;
5865  xmm8 -= a3 * b2;
5866  xmm9 -= a4 * b2;
5867  xmm10 -= a5 * b2;
5868  }
5869 
5870  C.store( i , j , xmm1 );
5871  C.store( i+SIMDSIZE , j , xmm2 );
5872  C.store( i+SIMDSIZE*2UL, j , xmm3 );
5873  C.store( i+SIMDSIZE*3UL, j , xmm4 );
5874  C.store( i+SIMDSIZE*4UL, j , xmm5 );
5875  C.store( i , j+1UL, xmm6 );
5876  C.store( i+SIMDSIZE , j+1UL, xmm7 );
5877  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
5878  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
5879  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
5880  }
5881 
5882  if( j < N )
5883  {
5884  const size_t kbegin( ( IsLower_v<MT5> )
5885  ?( ( IsUpper_v<MT4> )
5886  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5887  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5888  :( IsUpper_v<MT4> ? i : 0UL ) );
5889  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
5890 
5891  SIMDType xmm1( C.load(i ,j) );
5892  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
5893  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
5894  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
5895  SIMDType xmm5( C.load(i+SIMDSIZE*4UL,j) );
5896 
5897  for( size_t k=kbegin; k<kend; ++k ) {
5898  const SIMDType b1( set( B(k,j) ) );
5899  xmm1 -= A.load(i ,k) * b1;
5900  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5901  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5902  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5903  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5904  }
5905 
5906  C.store( i , j, xmm1 );
5907  C.store( i+SIMDSIZE , j, xmm2 );
5908  C.store( i+SIMDSIZE*2UL, j, xmm3 );
5909  C.store( i+SIMDSIZE*3UL, j, xmm4 );
5910  C.store( i+SIMDSIZE*4UL, j, xmm5 );
5911  }
5912  }
5913 
5914  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5915  {
5916  size_t j( 0UL );
5917 
5918  for( ; (j+2UL) <= N; j+=2UL )
5919  {
5920  const size_t kbegin( ( IsLower_v<MT5> )
5921  ?( ( IsUpper_v<MT4> )
5922  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5923  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5924  :( IsUpper_v<MT4> ? i : 0UL ) );
5925  const size_t kend( ( IsUpper_v<MT5> )
5926  ?( ( IsLower_v<MT4> )
5927  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5928  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5929  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
5930 
5931  SIMDType xmm1( C.load(i ,j ) );
5932  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
5933  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
5934  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j ) );
5935  SIMDType xmm5( C.load(i ,j+1UL) );
5936  SIMDType xmm6( C.load(i+SIMDSIZE ,j+1UL) );
5937  SIMDType xmm7( C.load(i+SIMDSIZE*2UL,j+1UL) );
5938  SIMDType xmm8( C.load(i+SIMDSIZE*3UL,j+1UL) );
5939 
5940  for( size_t k=kbegin; k<kend; ++k ) {
5941  const SIMDType a1( A.load(i ,k) );
5942  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5943  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5944  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5945  const SIMDType b1( set( B(k,j ) ) );
5946  const SIMDType b2( set( B(k,j+1UL) ) );
5947  xmm1 -= a1 * b1;
5948  xmm2 -= a2 * b1;
5949  xmm3 -= a3 * b1;
5950  xmm4 -= a4 * b1;
5951  xmm5 -= a1 * b2;
5952  xmm6 -= a2 * b2;
5953  xmm7 -= a3 * b2;
5954  xmm8 -= a4 * b2;
5955  }
5956 
5957  C.store( i , j , xmm1 );
5958  C.store( i+SIMDSIZE , j , xmm2 );
5959  C.store( i+SIMDSIZE*2UL, j , xmm3 );
5960  C.store( i+SIMDSIZE*3UL, j , xmm4 );
5961  C.store( i , j+1UL, xmm5 );
5962  C.store( i+SIMDSIZE , j+1UL, xmm6 );
5963  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
5964  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
5965  }
5966 
5967  if( j < N )
5968  {
5969  const size_t kbegin( ( IsLower_v<MT5> )
5970  ?( ( IsUpper_v<MT4> )
5971  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5972  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5973  :( IsUpper_v<MT4> ? i : 0UL ) );
5974  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
5975 
5976  SIMDType xmm1( C.load(i ,j) );
5977  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
5978  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
5979  SIMDType xmm4( C.load(i+SIMDSIZE*3UL,j) );
5980 
5981  for( size_t k=kbegin; k<kend; ++k ) {
5982  const SIMDType b1( set( B(k,j) ) );
5983  xmm1 -= A.load(i ,k) * b1;
5984  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5985  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5986  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5987  }
5988 
5989  C.store( i , j, xmm1 );
5990  C.store( i+SIMDSIZE , j, xmm2 );
5991  C.store( i+SIMDSIZE*2UL, j, xmm3 );
5992  C.store( i+SIMDSIZE*3UL, j, xmm4 );
5993  }
5994  }
5995 
5996  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5997  {
5998  size_t j( 0UL );
5999 
6000  for( ; (j+2UL) <= N; j+=2UL )
6001  {
6002  const size_t kbegin( ( IsLower_v<MT5> )
6003  ?( ( IsUpper_v<MT4> )
6004  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6005  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6006  :( IsUpper_v<MT4> ? i : 0UL ) );
6007  const size_t kend( ( IsUpper_v<MT5> )
6008  ?( ( IsLower_v<MT4> )
6009  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6010  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6011  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
6012 
6013  SIMDType xmm1( C.load(i ,j ) );
6014  SIMDType xmm2( C.load(i+SIMDSIZE ,j ) );
6015  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j ) );
6016  SIMDType xmm4( C.load(i ,j+1UL) );
6017  SIMDType xmm5( C.load(i+SIMDSIZE ,j+1UL) );
6018  SIMDType xmm6( C.load(i+SIMDSIZE*2UL,j+1UL) );
6019 
6020  for( size_t k=kbegin; k<kend; ++k ) {
6021  const SIMDType a1( A.load(i ,k) );
6022  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6023  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6024  const SIMDType b1( set( B(k,j ) ) );
6025  const SIMDType b2( set( B(k,j+1UL) ) );
6026  xmm1 -= a1 * b1;
6027  xmm2 -= a2 * b1;
6028  xmm3 -= a3 * b1;
6029  xmm4 -= a1 * b2;
6030  xmm5 -= a2 * b2;
6031  xmm6 -= a3 * b2;
6032  }
6033 
6034  C.store( i , j , xmm1 );
6035  C.store( i+SIMDSIZE , j , xmm2 );
6036  C.store( i+SIMDSIZE*2UL, j , xmm3 );
6037  C.store( i , j+1UL, xmm4 );
6038  C.store( i+SIMDSIZE , j+1UL, xmm5 );
6039  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
6040  }
6041 
6042  if( j < N )
6043  {
6044  const size_t kbegin( ( IsLower_v<MT5> )
6045  ?( ( IsUpper_v<MT4> )
6046  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6047  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6048  :( IsUpper_v<MT4> ? i : 0UL ) );
6049  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
6050 
6051  SIMDType xmm1( C.load(i ,j) );
6052  SIMDType xmm2( C.load(i+SIMDSIZE ,j) );
6053  SIMDType xmm3( C.load(i+SIMDSIZE*2UL,j) );
6054 
6055  for( size_t k=kbegin; k<kend; ++k ) {
6056  const SIMDType b1( set( B(k,j) ) );
6057  xmm1 -= A.load(i ,k) * b1;
6058  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6059  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6060  }
6061 
6062  C.store( i , j, xmm1 );
6063  C.store( i+SIMDSIZE , j, xmm2 );
6064  C.store( i+SIMDSIZE*2UL, j, xmm3 );
6065  }
6066  }
6067 
6068  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6069  {
6070  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
6071  size_t j( UPP ? i : 0UL );
6072 
6073  for( ; (j+4UL) <= jend; j+=4UL )
6074  {
6075  const size_t kbegin( ( IsLower_v<MT5> )
6076  ?( ( IsUpper_v<MT4> )
6077  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6078  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6079  :( IsUpper_v<MT4> ? i : 0UL ) );
6080  const size_t kend( ( IsUpper_v<MT5> )
6081  ?( ( IsLower_v<MT4> )
6082  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
6083  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
6084  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6085 
6086  SIMDType xmm1( C.load(i ,j ) );
6087  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
6088  SIMDType xmm3( C.load(i ,j+1UL) );
6089  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
6090  SIMDType xmm5( C.load(i ,j+2UL) );
6091  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
6092  SIMDType xmm7( C.load(i ,j+3UL) );
6093  SIMDType xmm8( C.load(i+SIMDSIZE,j+3UL) );
6094 
6095  for( size_t k=kbegin; k<kend; ++k ) {
6096  const SIMDType a1( A.load(i ,k) );
6097  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6098  const SIMDType b1( set( B(k,j ) ) );
6099  const SIMDType b2( set( B(k,j+1UL) ) );
6100  const SIMDType b3( set( B(k,j+2UL) ) );
6101  const SIMDType b4( set( B(k,j+3UL) ) );
6102  xmm1 -= a1 * b1;
6103  xmm2 -= a2 * b1;
6104  xmm3 -= a1 * b2;
6105  xmm4 -= a2 * b2;
6106  xmm5 -= a1 * b3;
6107  xmm6 -= a2 * b3;
6108  xmm7 -= a1 * b4;
6109  xmm8 -= a2 * b4;
6110  }
6111 
6112  C.store( i , j , xmm1 );
6113  C.store( i+SIMDSIZE, j , xmm2 );
6114  C.store( i , j+1UL, xmm3 );
6115  C.store( i+SIMDSIZE, j+1UL, xmm4 );
6116  C.store( i , j+2UL, xmm5 );
6117  C.store( i+SIMDSIZE, j+2UL, xmm6 );
6118  C.store( i , j+3UL, xmm7 );
6119  C.store( i+SIMDSIZE, j+3UL, xmm8 );
6120  }
6121 
6122  for( ; (j+3UL) <= jend; j+=3UL )
6123  {
6124  const size_t kbegin( ( IsLower_v<MT5> )
6125  ?( ( IsUpper_v<MT4> )
6126  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6127  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6128  :( IsUpper_v<MT4> ? i : 0UL ) );
6129  const size_t kend( ( IsUpper_v<MT5> )
6130  ?( ( IsLower_v<MT4> )
6131  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
6132  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
6133  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6134 
6135  SIMDType xmm1( C.load(i ,j ) );
6136  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
6137  SIMDType xmm3( C.load(i ,j+1UL) );
6138  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
6139  SIMDType xmm5( C.load(i ,j+2UL) );
6140  SIMDType xmm6( C.load(i+SIMDSIZE,j+2UL) );
6141 
6142  for( size_t k=kbegin; k<kend; ++k ) {
6143  const SIMDType a1( A.load(i ,k) );
6144  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6145  const SIMDType b1( set( B(k,j ) ) );
6146  const SIMDType b2( set( B(k,j+1UL) ) );
6147  const SIMDType b3( set( B(k,j+2UL) ) );
6148  xmm1 -= a1 * b1;
6149  xmm2 -= a2 * b1;
6150  xmm3 -= a1 * b2;
6151  xmm4 -= a2 * b2;
6152  xmm5 -= a1 * b3;
6153  xmm6 -= a2 * b3;
6154  }
6155 
6156  C.store( i , j , xmm1 );
6157  C.store( i+SIMDSIZE, j , xmm2 );
6158  C.store( i , j+1UL, xmm3 );
6159  C.store( i+SIMDSIZE, j+1UL, xmm4 );
6160  C.store( i , j+2UL, xmm5 );
6161  C.store( i+SIMDSIZE, j+2UL, xmm6 );
6162  }
6163 
6164  for( ; (j+2UL) <= jend; j+=2UL )
6165  {
6166  const size_t kbegin( ( IsLower_v<MT5> )
6167  ?( ( IsUpper_v<MT4> )
6168  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6169  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6170  :( IsUpper_v<MT4> ? i : 0UL ) );
6171  const size_t kend( ( IsUpper_v<MT5> )
6172  ?( ( IsLower_v<MT4> )
6173  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6174  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6175  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
6176 
6177  SIMDType xmm1( C.load(i ,j ) );
6178  SIMDType xmm2( C.load(i+SIMDSIZE,j ) );
6179  SIMDType xmm3( C.load(i ,j+1UL) );
6180  SIMDType xmm4( C.load(i+SIMDSIZE,j+1UL) );
6181  SIMDType xmm5, xmm6, xmm7, xmm8;
6182  size_t k( kbegin );
6183 
6184  for( ; (k+2UL) <= kend; k+=2UL ) {
6185  const SIMDType a1( A.load(i ,k ) );
6186  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6187  const SIMDType a3( A.load(i ,k+1UL) );
6188  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6189  const SIMDType b1( set( B(k ,j ) ) );
6190  const SIMDType b2( set( B(k ,j+1UL) ) );
6191  const SIMDType b3( set( B(k+1UL,j ) ) );
6192  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
6193  xmm1 -= a1 * b1;
6194  xmm2 -= a2 * b1;
6195  xmm3 -= a1 * b2;
6196  xmm4 -= a2 * b2;
6197  xmm5 -= a3 * b3;
6198  xmm6 -= a4 * b3;
6199  xmm7 -= a3 * b4;
6200  xmm8 -= a4 * b4;
6201  }
6202 
6203  for( ; k<kend; ++k ) {
6204  const SIMDType a1( A.load(i ,k) );
6205  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6206  const SIMDType b1( set( B(k,j ) ) );
6207  const SIMDType b2( set( B(k,j+1UL) ) );
6208  xmm1 -= a1 * b1;
6209  xmm2 -= a2 * b1;
6210  xmm3 -= a1 * b2;
6211  xmm4 -= a2 * b2;
6212  }
6213 
6214  C.store( i , j , xmm1+xmm5 );
6215  C.store( i+SIMDSIZE, j , xmm2+xmm6 );
6216  C.store( i , j+1UL, xmm3+xmm7 );
6217  C.store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
6218  }
6219 
6220  if( j < jend )
6221  {
6222  const size_t kbegin( ( IsLower_v<MT5> )
6223  ?( ( IsUpper_v<MT4> )
6224  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6225  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6226  :( IsUpper_v<MT4> ? i : 0UL ) );
6227  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
6228 
6229  SIMDType xmm1( C.load(i ,j) );
6230  SIMDType xmm2( C.load(i+SIMDSIZE,j) );
6231  SIMDType xmm3, xmm4;
6232  size_t k( kbegin );
6233 
6234  for( ; (k+2UL) <= kend; k+=2UL ) {
6235  const SIMDType b1( set( B(k ,j) ) );
6236  const SIMDType b2( set( B(k+1UL,j) ) );
6237  xmm1 -= A.load(i ,k ) * b1;
6238  xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
6239  xmm3 -= A.load(i ,k+1UL) * b2;
6240  xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
6241  }
6242 
6243  for( ; k<kend; ++k ) {
6244  const SIMDType b1( set( B(k,j) ) );
6245  xmm1 -= A.load(i ,k) * b1;
6246  xmm2 -= A.load(i+SIMDSIZE,k) * b1;
6247  }
6248 
6249  C.store( i , j, xmm1+xmm3 );
6250  C.store( i+SIMDSIZE, j, xmm2+xmm4 );
6251  }
6252  }
6253 
6254  for( ; i<ipos; i+=SIMDSIZE )
6255  {
6256  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
6257  size_t j( UPP ? i : 0UL );
6258 
6259  for( ; (j+4UL) <= jend; j+=4UL )
6260  {
6261  const size_t kbegin( ( IsLower_v<MT5> )
6262  ?( ( IsUpper_v<MT4> )
6263  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6264  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6265  :( IsUpper_v<MT4> ? i : 0UL ) );
6266  const size_t kend( ( IsUpper_v<MT5> )
6267  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
6268  :( K ) );
6269 
6270  SIMDType xmm1( C.load(i,j ) );
6271  SIMDType xmm2( C.load(i,j+1UL) );
6272  SIMDType xmm3( C.load(i,j+2UL) );
6273  SIMDType xmm4( C.load(i,j+3UL) );
6274  SIMDType xmm5, xmm6, xmm7, xmm8;
6275  size_t k( kbegin );
6276 
6277  for( ; (k+2UL) <= kend; k+=2UL ) {
6278  const SIMDType a1( A.load(i,k ) );
6279  const SIMDType a2( A.load(i,k+1UL) );
6280  xmm1 -= a1 * set( B(k ,j ) );
6281  xmm2 -= a1 * set( B(k ,j+1UL) );
6282  xmm3 -= a1 * set( B(k ,j+2UL) );
6283  xmm4 -= a1 * set( B(k ,j+3UL) );
6284  xmm5 -= a2 * set( B(k+1UL,j ) );
6285  xmm6 -= a2 * set( B(k+1UL,j+1UL) );
6286  xmm7 -= a2 * set( B(k+1UL,j+2UL) );
6287  xmm8 -= a2 * set( B(k+1UL,j+3UL) );
6288  }
6289 
6290  for( ; k<kend; ++k ) {
6291  const SIMDType a1( A.load(i,k) );
6292  xmm1 -= a1 * set( B(k,j ) );
6293  xmm2 -= a1 * set( B(k,j+1UL) );
6294  xmm3 -= a1 * set( B(k,j+2UL) );
6295  xmm4 -= a1 * set( B(k,j+3UL) );
6296  }
6297 
6298  C.store( i, j , xmm1+xmm5 );
6299  C.store( i, j+1UL, xmm2+xmm6 );
6300  C.store( i, j+2UL, xmm3+xmm7 );
6301  C.store( i, j+3UL, xmm4+xmm8 );
6302  }
6303 
6304  for( ; (j+3UL) <= jend; j+=3UL )
6305  {
6306  const size_t kbegin( ( IsLower_v<MT5> )
6307  ?( ( IsUpper_v<MT4> )
6308  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6309  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6310  :( IsUpper_v<MT4> ? i : 0UL ) );
6311  const size_t kend( ( IsUpper_v<MT5> )
6312  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
6313  :( K ) );
6314 
6315  SIMDType xmm1( C.load(i,j ) );
6316  SIMDType xmm2( C.load(i,j+1UL) );
6317  SIMDType xmm3( C.load(i,j+2UL) );
6318  SIMDType xmm4, xmm5, xmm6;
6319  size_t k( kbegin );
6320 
6321  for( ; (k+2UL) <= kend; k+=2UL ) {
6322  const SIMDType a1( A.load(i,k ) );
6323  const SIMDType a2( A.load(i,k+1UL) );
6324  xmm1 -= a1 * set( B(k ,j ) );
6325  xmm2 -= a1 * set( B(k ,j+1UL) );
6326  xmm3 -= a1 * set( B(k ,j+2UL) );
6327  xmm4 -= a2 * set( B(k+1UL,j ) );
6328  xmm5 -= a2 * set( B(k+1UL,j+1UL) );
6329  xmm6 -= a2 * set( B(k+1UL,j+2UL) );
6330  }
6331 
6332  for( ; k<kend; ++k ) {
6333  const SIMDType a1( A.load(i,k) );
6334  xmm1 -= a1 * set( B(k,j ) );
6335  xmm2 -= a1 * set( B(k,j+1UL) );
6336  xmm3 -= a1 * set( B(k,j+2UL) );
6337  }
6338 
6339  C.store( i, j , xmm1+xmm4 );
6340  C.store( i, j+1UL, xmm2+xmm5 );
6341  C.store( i, j+2UL, xmm3+xmm6 );
6342  }
6343 
6344  for( ; (j+2UL) <= jend; j+=2UL )
6345  {
6346  const size_t kbegin( ( IsLower_v<MT5> )
6347  ?( ( IsUpper_v<MT4> )
6348  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6349  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6350  :( IsUpper_v<MT4> ? i : 0UL ) );
6351  const size_t kend( ( IsUpper_v<MT5> )
6352  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6353  :( K ) );
6354 
6355  SIMDType xmm1( C.load(i,j ) );
6356  SIMDType xmm2( C.load(i,j+1UL) );
6357  SIMDType xmm3, xmm4;
6358  size_t k( kbegin );
6359 
6360  for( ; (k+2UL) <= kend; k+=2UL ) {
6361  const SIMDType a1( A.load(i,k ) );
6362  const SIMDType a2( A.load(i,k+1UL) );
6363  xmm1 -= a1 * set( B(k ,j ) );
6364  xmm2 -= a1 * set( B(k ,j+1UL) );
6365  xmm3 -= a2 * set( B(k+1UL,j ) );
6366  xmm4 -= a2 * set( B(k+1UL,j+1UL) );
6367  }
6368 
6369  for( ; k<kend; ++k ) {
6370  const SIMDType a1( A.load(i,k) );
6371  xmm1 -= a1 * set( B(k,j ) );
6372  xmm2 -= a1 * set( B(k,j+1UL) );
6373  }
6374 
6375  C.store( i, j , xmm1+xmm3 );
6376  C.store( i, j+1UL, xmm2+xmm4 );
6377  }
6378 
6379  if( j < jend )
6380  {
6381  const size_t kbegin( ( IsLower_v<MT5> )
6382  ?( ( IsUpper_v<MT4> )
6383  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6384  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6385  :( IsUpper_v<MT4> ? i : 0UL ) );
6386 
6387  SIMDType xmm1( C.load(i,j) );
6388  SIMDType xmm2;
6389  size_t k( kbegin );
6390 
6391  for( ; (k+2UL) <= K; k+=2UL ) {
6392  xmm1 -= A.load(i,k ) * set( B(k ,j) );
6393  xmm2 -= A.load(i,k+1UL) * set( B(k+1UL,j) );
6394  }
6395 
6396  for( ; k<K; ++k ) {
6397  xmm1 -= A.load(i,k) * set( B(k,j) );
6398  }
6399 
6400  C.store( i, j, xmm1+xmm2 );
6401  }
6402  }
6403 
6404  for( ; remainder && i<M; ++i )
6405  {
6406  const size_t jend( LOW ? i+1UL : N );
6407  size_t j( UPP ? i : 0UL );
6408 
6409  for( ; (j+2UL) <= jend; j+=2UL )
6410  {
6411  const size_t kbegin( ( IsLower_v<MT5> )
6412  ?( ( IsUpper_v<MT4> )
6413  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6414  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6415  :( IsUpper_v<MT4> ? i : 0UL ) );
6416  const size_t kend( ( IsUpper_v<MT5> )
6417  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6418  :( K ) );
6419 
6420  ElementType value1( C(i,j ) );
6421  ElementType value2( C(i,j+1UL) );
6422 
6423  for( size_t k=kbegin; k<kend; ++k ) {
6424  value1 -= A(i,k) * B(k,j );
6425  value2 -= A(i,k) * B(k,j+1UL);
6426  }
6427 
6428  C(i,j ) = value1;
6429  C(i,j+1UL) = value2;
6430  }
6431 
6432  if( j < jend )
6433  {
6434  const size_t kbegin( ( IsLower_v<MT5> )
6435  ?( ( IsUpper_v<MT4> )
6436  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6437  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6438  :( IsUpper_v<MT4> ? i : 0UL ) );
6439 
6440  ElementType value( C(i,j) );
6441 
6442  for( size_t k=kbegin; k<K; ++k ) {
6443  value -= A(i,k) * B(k,j);
6444  }
6445 
6446  C(i,j) = value;
6447  }
6448  }
6449  }
6451  //**********************************************************************************************
6452 
6453  //**Default subtraction assignment to dense matrices (large matrices)***************************
6467  template< typename MT3 // Type of the left-hand side target matrix
6468  , typename MT4 // Type of the left-hand side matrix operand
6469  , typename MT5 > // Type of the right-hand side matrix operand
6470  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6471  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6472  {
6473  selectDefaultSubAssignKernel( C, A, B );
6474  }
6476  //**********************************************************************************************
6477 
6478  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
6493  template< typename MT3 // Type of the left-hand side target matrix
6494  , typename MT4 // Type of the left-hand side matrix operand
6495  , typename MT5 > // Type of the right-hand side matrix operand
6496  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6497  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6498  {
6499  if( LOW )
6500  lmmm( C, A, B, ElementType(-1), ElementType(1) );
6501  else if( UPP )
6502  ummm( C, A, B, ElementType(-1), ElementType(1) );
6503  else
6504  mmm( C, A, B, ElementType(-1), ElementType(1) );
6505  }
6507  //**********************************************************************************************
6508 
6509  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
6523  template< typename MT3 // Type of the left-hand side target matrix
6524  , typename MT4 // Type of the left-hand side matrix operand
6525  , typename MT5 > // Type of the right-hand side matrix operand
6526  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6527  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6528  {
6529  selectLargeSubAssignKernel( C, A, B );
6530  }
6532  //**********************************************************************************************
6533 
6534  //**BLAS-based subtraction assignment to dense matrices*****************************************
6535 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6536 
6549  template< typename MT3 // Type of the left-hand side target matrix
6550  , typename MT4 // Type of the left-hand side matrix operand
6551  , typename MT5 > // Type of the right-hand side matrix operand
6552  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6553  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6554  {
6555  using ET = ElementType_t<MT3>;
6556 
6557  if( IsTriangular_v<MT4> ) {
6558  ResultType_t<MT3> tmp( serial( B ) );
6559  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
6560  subAssign( C, tmp );
6561  }
6562  else if( IsTriangular_v<MT5> ) {
6563  ResultType_t<MT3> tmp( serial( A ) );
6564  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
6565  subAssign( C, tmp );
6566  }
6567  else {
6568  gemm( C, A, B, ET(-1), ET(1) );
6569  }
6570  }
6572 #endif
6573  //**********************************************************************************************
6574 
6575  //**Subtraction assignment to sparse matrices***************************************************
6576  // No special implementation for the subtraction assignment to sparse matrices.
6577  //**********************************************************************************************
6578 
6579  //**Schur product assignment to dense matrices**************************************************
6592  template< typename MT // Type of the target dense matrix
6593  , bool SO > // Storage order of the target dense matrix
6594  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6595  {
6597 
6601 
6602  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6603  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6604 
6605  const ResultType tmp( serial( rhs ) );
6606  schurAssign( ~lhs, tmp );
6607  }
6609  //**********************************************************************************************
6610 
6611  //**Schur product assignment to sparse matrices*************************************************
6612  // No special implementation for the Schur product assignment to sparse matrices.
6613  //**********************************************************************************************
6614 
6615  //**Multiplication assignment to dense matrices*************************************************
6616  // No special implementation for the multiplication assignment to dense matrices.
6617  //**********************************************************************************************
6618 
6619  //**Multiplication assignment to sparse matrices************************************************
6620  // No special implementation for the multiplication assignment to sparse matrices.
6621  //**********************************************************************************************
6622 
6623  //**SMP assignment to dense matrices************************************************************
6639  template< typename MT // Type of the target dense matrix
6640  , bool SO > // Storage order of the target dense matrix
6641  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6642  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6643  {
6645 
6646  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6647  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6648 
6649  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
6650  return;
6651  }
6652  else if( rhs.lhs_.columns() == 0UL ) {
6653  reset( ~lhs );
6654  return;
6655  }
6656 
6657  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6658  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6659 
6660  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6661  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6662  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6663  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6664  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6665  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6666 
6667  smpAssign( ~lhs, A * B );
6668  }
6670  //**********************************************************************************************
6671 
6672  //**SMP assignment to sparse matrices***********************************************************
6688  template< typename MT // Type of the target sparse matrix
6689  , bool SO > // Storage order of the target sparse matrix
6690  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6691  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6692  {
6694 
6695  using TmpType = If_t< SO, ResultType, OppositeType >;
6696 
6703 
6704  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6705  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6706 
6707  const ForwardFunctor fwd;
6708 
6709  const TmpType tmp( rhs );
6710  smpAssign( ~lhs, fwd( tmp ) );
6711  }
6713  //**********************************************************************************************
6714 
6715  //**SMP addition assignment to dense matrices***************************************************
6731  template< typename MT // Type of the target dense matrix
6732  , bool SO > // Storage order of the target dense matrix
6733  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6734  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6735  {
6737 
6738  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6739  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6740 
6741  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6742  return;
6743  }
6744 
6745  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6746  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6747 
6748  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6749  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6750  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6751  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6752  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6753  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6754 
6755  smpAddAssign( ~lhs, A * B );
6756  }
6758  //**********************************************************************************************
6759 
6760  //**SMP addition assignment to sparse matrices**************************************************
6761  // No special implementation for the SMP addition assignment to sparse matrices.
6762  //**********************************************************************************************
6763 
6764  //**SMP subtraction assignment to dense matrices************************************************
6780  template< typename MT // Type of the target dense matrix
6781  , bool SO > // Storage order of the target dense matrix
6782  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6783  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6784  {
6786 
6787  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6788  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6789 
6790  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6791  return;
6792  }
6793 
6794  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6795  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6796 
6797  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6798  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6799  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6800  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6801  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6802  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6803 
6804  smpSubAssign( ~lhs, A * B );
6805  }
6807  //**********************************************************************************************
6808 
6809  //**SMP subtraction assignment to sparse matrices***********************************************
6810  // No special implementation for the SMP subtraction assignment to sparse matrices.
6811  //**********************************************************************************************
6812 
6813  //**SMP Schur product assignment to dense matrices**********************************************
6826  template< typename MT // Type of the target dense matrix
6827  , bool SO > // Storage order of the target dense matrix
6828  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6829  {
6831 
6835 
6836  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6837  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6838 
6839  const ResultType tmp( rhs );
6840  smpSchurAssign( ~lhs, tmp );
6841  }
6843  //**********************************************************************************************
6844 
6845  //**SMP Schur product assignment to sparse matrices*********************************************
6846  // No special implementation for the SMP Schur product assignment to sparse matrices.
6847  //**********************************************************************************************
6848 
6849  //**SMP multiplication assignment to dense matrices*********************************************
6850  // No special implementation for the SMP multiplication assignment to dense matrices.
6851  //**********************************************************************************************
6852 
6853  //**SMP multiplication assignment to sparse matrices********************************************
6854  // No special implementation for the SMP multiplication assignment to sparse matrices.
6855  //**********************************************************************************************
6856 
6857  //**Compile time checks*************************************************************************
6865  //**********************************************************************************************
6866 };
6867 //*************************************************************************************************
6868 
6869 
6870 
6871 
6872 //=================================================================================================
6873 //
6874 // DMATSCALARMULTEXPR SPECIALIZATION
6875 //
6876 //=================================================================================================
6877 
6878 //*************************************************************************************************
6886 template< typename MT1 // Type of the left-hand side dense matrix
6887  , typename MT2 // Type of the right-hand side dense matrix
6888  , bool SF // Symmetry flag
6889  , bool HF // Hermitian flag
6890  , bool LF // Lower flag
6891  , bool UF // Upper flag
6892  , typename ST > // Type of the right-hand side scalar value
6893 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
6894  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
6895  , private Computation
6896 {
6897  private:
6898  //**Type definitions****************************************************************************
6900  using MMM = TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
6901 
6902  using RES = ResultType_t<MMM>;
6903  using RT1 = ResultType_t<MT1>;
6904  using RT2 = ResultType_t<MT2>;
6905  using ET1 = ElementType_t<RT1>;
6906  using ET2 = ElementType_t<RT2>;
6907  using CT1 = CompositeType_t<MT1>;
6908  using CT2 = CompositeType_t<MT2>;
6909  //**********************************************************************************************
6910 
6911  //**********************************************************************************************
6913  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
6914  //**********************************************************************************************
6915 
6916  //**********************************************************************************************
6918  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
6919  //**********************************************************************************************
6920 
6921  //**********************************************************************************************
6922  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
6923  static constexpr bool HERM = ( HF && !( LF || UF ) );
6924  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
6925  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
6926  //**********************************************************************************************
6927 
6928  //**********************************************************************************************
6930 
6933  template< typename T1, typename T2, typename T3 >
6934  static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
6935  //**********************************************************************************************
6936 
6937  //**********************************************************************************************
6939 
6941  template< typename T1, typename T2, typename T3, typename T4 >
6942  static constexpr bool UseBlasKernel_v =
6943  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
6944  !SYM && !HERM && !LOW && !UPP &&
6945  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
6946  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
6947  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
6948  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
6949  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
6950  IsBLASCompatible_v< ElementType_t<T1> > &&
6951  IsBLASCompatible_v< ElementType_t<T2> > &&
6952  IsBLASCompatible_v< ElementType_t<T3> > &&
6953  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
6954  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
6955  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
6956  //**********************************************************************************************
6957 
6958  //**********************************************************************************************
6960 
6962  template< typename T1, typename T2, typename T3, typename T4 >
6963  static constexpr bool UseVectorizedDefaultKernel_v =
6964  ( useOptimizedKernels &&
6965  !( IsDiagonal_v<T2> && IsDiagonal_v<T3> ) &&
6966  !( IsDiagonal_v<T2> && IsColumnMajorMatrix_v<T1> ) &&
6967  !( IsDiagonal_v<T3> && IsRowMajorMatrix_v<T1> ) &&
6968  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
6969  IsSIMDCombinable_v< ElementType_t<T1>
6970  , ElementType_t<T2>
6971  , ElementType_t<T3>
6972  , T4 > &&
6973  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
6974  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
6975  //**********************************************************************************************
6976 
6977  //**********************************************************************************************
6979 
6981  using ForwardFunctor = If_t< HERM
6982  , DeclHerm
6983  , If_t< SYM
6984  , DeclSym
6985  , If_t< LOW
6986  , If_t< UPP
6987  , DeclDiag
6988  , DeclLow >
6989  , If_t< UPP
6990  , DeclUpp
6991  , Noop > > > >;
6992  //**********************************************************************************************
6993 
6994  public:
6995  //**Type definitions****************************************************************************
6997  using This = DMatScalarMultExpr<MMM,ST,true>;
6998 
7000  using BaseType = DenseMatrix<This,true>;
7001 
7003  using ResultType = typename If_t< HERM
7004  , DeclHermTrait< MultTrait_t<RES,ST> >
7005  , If_t< SYM
7006  , DeclSymTrait< MultTrait_t<RES,ST> >
7007  , If_t< LOW
7008  , If_t< UPP
7009  , DeclDiagTrait< MultTrait_t<RES,ST> >
7010  , DeclLowTrait< MultTrait_t<RES,ST> > >
7011  , If_t< UPP
7012  , DeclUppTrait< MultTrait_t<RES,ST> >
7013  , MultTrait<RES,ST> > > > >::Type;
7014 
7015  using OppositeType = OppositeType_t<ResultType>;
7016  using TransposeType = TransposeType_t<ResultType>;
7017  using ElementType = ElementType_t<ResultType>;
7018  using SIMDType = SIMDTrait_t<ElementType>;
7019  using ReturnType = const ElementType;
7020  using CompositeType = const ResultType;
7021 
7023  using LeftOperand = const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
7024 
7026  using RightOperand = ST;
7027 
7029  using LT = If_t< evaluateLeft, const RT1, CT1 >;
7030 
7032  using RT = If_t< evaluateRight, const RT2, CT2 >;
7033  //**********************************************************************************************
7034 
7035  //**Compilation flags***************************************************************************
7037  static constexpr bool simdEnabled =
7038  ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
7039  MT1::simdEnabled && MT2::simdEnabled &&
7040  IsSIMDCombinable_v<ET1,ET2,ST> &&
7041  HasSIMDAdd_v<ET1,ET2> &&
7042  HasSIMDMult_v<ET1,ET2> );
7043 
7045  static constexpr bool smpAssignable =
7046  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
7047  //**********************************************************************************************
7048 
7049  //**SIMD properties*****************************************************************************
7051  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
7052  //**********************************************************************************************
7053 
7054  //**Constructor*********************************************************************************
7060  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
7061  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
7062  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
7063  {}
7064  //**********************************************************************************************
7065 
7066  //**Access operator*****************************************************************************
7073  inline ReturnType operator()( size_t i, size_t j ) const {
7074  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
7075  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
7076  return matrix_(i,j) * scalar_;
7077  }
7078  //**********************************************************************************************
7079 
7080  //**At function*********************************************************************************
7088  inline ReturnType at( size_t i, size_t j ) const {
7089  if( i >= matrix_.rows() ) {
7090  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
7091  }
7092  if( j >= matrix_.columns() ) {
7093  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
7094  }
7095  return (*this)(i,j);
7096  }
7097  //**********************************************************************************************
7098 
7099  //**Rows function*******************************************************************************
7104  inline size_t rows() const {
7105  return matrix_.rows();
7106  }
7107  //**********************************************************************************************
7108 
7109  //**Columns function****************************************************************************
7114  inline size_t columns() const {
7115  return matrix_.columns();
7116  }
7117  //**********************************************************************************************
7118 
7119  //**Left operand access*************************************************************************
/*!\brief Returns the left-hand side operand of the scaling expression.
//
// \return The stored matrix operand \a matrix_, returned as \a LeftOperand.
*/
7124  inline LeftOperand leftOperand() const {
7125  return matrix_;
7126  }
7127  //**********************************************************************************************
7128 
7129  //**Right operand access************************************************************************
/*!\brief Returns the right-hand side operand of the scaling expression.
//
// \return The stored scalar operand \a scalar_, returned as \a RightOperand.
*/
7134  inline RightOperand rightOperand() const {
7135  return scalar_;
7136  }
7137  //**********************************************************************************************
7138 
7139  //**********************************************************************************************
7145  template< typename T >
7146  inline bool canAlias( const T* alias ) const {
7147  return matrix_.canAlias( alias );
7148  }
7149  //**********************************************************************************************
7150 
7151  //**********************************************************************************************
7157  template< typename T >
7158  inline bool isAliased( const T* alias ) const {
7159  return matrix_.isAliased( alias );
7160  }
7161  //**********************************************************************************************
7162 
7163  //**********************************************************************************************
7168  inline bool isAligned() const {
7169  return matrix_.isAligned();
7170  }
7171  //**********************************************************************************************
7172 
7173  //**********************************************************************************************
7178  inline bool canSMPAssign() const noexcept {
7179  return ( !BLAZE_BLAS_MODE ||
7180  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
7182  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
7183  ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD );
7184  }
7185  //**********************************************************************************************
7186 
7187  private:
7188  //**Member variables****************************************************************************
7191  //**********************************************************************************************
7192 
7193  //**Assignment to dense matrices****************************************************************
7205  template< typename MT // Type of the target dense matrix
7206  , bool SO > // Storage order of the target dense matrix
7207  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7208  {
7210 
7211  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7212  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7213 
7214  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7215  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7216 
7217  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
7218  return;
7219  }
7220  else if( left.columns() == 0UL ) {
7221  reset( ~lhs );
7222  return;
7223  }
7224 
7225  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7226  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7227 
7228  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7229  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7230  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7231  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7232  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7233  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7234 
7235  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
7236  }
7237  //**********************************************************************************************
7238 
7239  //**Assignment to dense matrices (kernel selection)*********************************************
7250  template< typename MT3 // Type of the left-hand side target matrix
7251  , typename MT4 // Type of the left-hand side matrix operand
7252  , typename MT5 // Type of the right-hand side matrix operand
7253  , typename ST2 > // Type of the scalar value
7254  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7255  {
7256  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
7257  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
7258  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
7259  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7260  selectSmallAssignKernel( C, A, B, scalar );
7261  else
7262  selectBlasAssignKernel( C, A, B, scalar );
7263  }
7264  //**********************************************************************************************
7265 
7266  //**Default assignment to row-major dense matrices (general/general)****************************
/*!\brief Default (scalar loop) kernel for C = scalar * ( A * B ) with a row-major target.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload is enabled for row-major targets when neither operand is diagonal. Every row of
// C is produced in three passes: it is initialized from the first relevant inner index, the
// remaining inner indices are accumulated, and finally the row is multiplied by \a scalar.
// Index ranges are narrowed according to the triangular structure of A and B.
// NOTE(review): SYM, HERM, LOW and UPP are not defined in this block; they are presumably the
// symmetric/Hermitian/lower/upper flags of the multiplication expression -- confirm against the
// class definition.
*/
7280  template< typename MT3 // Type of the left-hand side target matrix
7281  , typename MT4 // Type of the left-hand side matrix operand
7282  , typename MT5 // Type of the right-hand side matrix operand
7283  , typename ST2 > // Type of the scalar value
7284  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7285  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7286  {
7287  const size_t M( A.rows() );
7288  const size_t N( B.columns() );
7289  const size_t K( A.columns() );
7290 
// Any of the structure flags requires a square result
7291  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7292 
7293  for( size_t i=0UL; i<M; ++i )
7294  {
// [kbegin,kend) is the range of inner indices of row i that A's triangular structure permits
7295  const size_t kbegin( ( IsUpper_v<MT4> )
7296  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7297  :( 0UL ) );
7298  const size_t kend( ( IsLower_v<MT4> )
7299  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
7300  :( K ) );
7301  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
7302 
// Empty inner range for a strictly triangular A: the complete row i of C is reset
7303  if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
7304  for( size_t j=0UL; j<N; ++j ) {
7305  reset( C(i,j) );
7306  }
7307  continue;
7308  }
7309 
// Pass 1: initialize row i with the contribution of the first inner index (k == kbegin)
7310  {
7311  const size_t jbegin( ( IsUpper_v<MT5> )
7312  ?( ( IsStrictlyUpper_v<MT5> )
7313  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
7314  :( UPP ? max(i,kbegin) : kbegin ) )
7315  :( UPP ? i : 0UL ) );
7316  const size_t jend( ( IsLower_v<MT5> )
7317  ?( ( IsStrictlyLower_v<MT5> )
7318  ?( LOW ? min(i+1UL,kbegin) : kbegin )
7319  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
7320  :( LOW ? i+1UL : N ) );
7321 
// Reset the columns in front of the initialized range (or only column 0 for strictly upper B)
7322  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7323  for( size_t j=0UL; j<jbegin; ++j ) {
7324  reset( C(i,j) );
7325  }
7326  }
7327  else if( IsStrictlyUpper_v<MT5> ) {
7328  reset( C(i,0UL) );
7329  }
7330  for( size_t j=jbegin; j<jend; ++j ) {
7331  C(i,j) = A(i,kbegin) * B(kbegin,j);
7332  }
// Reset the columns behind the initialized range (or only the last column for strictly lower B)
7333  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7334  for( size_t j=jend; j<N; ++j ) {
7335  reset( C(i,j) );
7336  }
7337  }
7338  else if( IsStrictlyLower_v<MT5> ) {
7339  reset( C(i,N-1UL) );
7340  }
7341  }
7342 
// Pass 2: accumulate the contributions of the remaining inner indices
7343  for( size_t k=kbegin+1UL; k<kend; ++k )
7344  {
7345  const size_t jbegin( ( IsUpper_v<MT5> )
7346  ?( ( IsStrictlyUpper_v<MT5> )
7347  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
7348  :( SYM || HERM || UPP ? max( i, k ) : k ) )
7349  :( SYM || HERM || UPP ? i : 0UL ) );
7350  const size_t jend( ( IsLower_v<MT5> )
7351  ?( ( IsStrictlyLower_v<MT5> )
7352  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
7353  :( LOW ? min(i+1UL,k) : k ) )
7354  :( LOW ? i+1UL : N ) );
7355 
// With a structure flag set the column range may be empty; such inner indices are skipped
7356  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
7357  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7358 
7359  for( size_t j=jbegin; j<jend; ++j ) {
7360  C(i,j) += A(i,k) * B(k,j);
7361  }
// For a lower B the element in column jend is assigned instead of accumulated
7362  if( IsLower_v<MT5> ) {
7363  C(i,jend) = A(i,k) * B(k,jend);
7364  }
7365  }
7366 
// Pass 3: scale the computed part of row i by the scalar
7367  {
7368  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
7369  ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
7370  :( SYM || HERM || UPP ? i : 0UL ) );
7371  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
7372  ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
7373  :( LOW ? i+1UL : N ) );
7374 
// This 'continue' targets the enclosing loop over the rows i
7375  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
7376  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7377 
7378  for( size_t j=jbegin; j<jend; ++j ) {
7379  C(i,j) *= scalar;
7380  }
7381  }
7382  }
7383 
// For SYM/HERM the elements below the diagonal are copied (HERM: conjugated) from their
// mirrored counterparts above the diagonal
7384  if( SYM || HERM ) {
7385  for( size_t i=1UL; i<M; ++i ) {
7386  for( size_t j=0UL; j<i; ++j ) {
7387  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
7388  }
7389  }
7390  }
7391  }
7392  //**********************************************************************************************
7393 
7394  //**Default assignment to column-major dense matrices (general/general)*************************
/*!\brief Default (scalar loop) kernel for C = scalar * ( A * B ) with a column-major target.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// This overload is enabled for column-major targets when neither operand is diagonal. Every
// column of C is produced in three passes: it is initialized from the first relevant inner
// index, the remaining inner indices are accumulated, and finally the column is multiplied by
// \a scalar. Index ranges are narrowed according to the triangular structure of A and B.
// NOTE(review): SYM, HERM, LOW and UPP are not defined in this block; they are presumably the
// symmetric/Hermitian/lower/upper flags of the multiplication expression -- confirm against the
// class definition.
*/
7408  template< typename MT3 // Type of the left-hand side target matrix
7409  , typename MT4 // Type of the left-hand side matrix operand
7410  , typename MT5 // Type of the right-hand side matrix operand
7411  , typename ST2 > // Type of the scalar value
7412  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7413  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7414  {
7415  const size_t M( A.rows() );
7416  const size_t N( B.columns() );
7417  const size_t K( A.columns() );
7418 
// Any of the structure flags requires a square result
7419  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7420 
7421  for( size_t j=0UL; j<N; ++j )
7422  {
// [kbegin,kend) is the range of inner indices of column j that B's triangular structure permits
7423  const size_t kbegin( ( IsLower_v<MT5> )
7424  ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7425  :( 0UL ) );
7426  const size_t kend( ( IsUpper_v<MT5> )
7427  ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7428  :( K ) );
7429  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
7430 
// Empty inner range for a strictly triangular B: the complete column j of C is reset
7431  if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
7432  for( size_t i=0UL; i<M; ++i ) {
7433  reset( C(i,j) );
7434  }
7435  continue;
7436  }
7437 
// Pass 1: initialize column j with the contribution of the first inner index (k == kbegin)
7438  {
7439  const size_t ibegin( ( IsLower_v<MT4> )
7440  ?( ( IsStrictlyLower_v<MT4> )
7441  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
7442  :( LOW ? max(j,kbegin) : kbegin ) )
7443  :( LOW ? j : 0UL ) );
7444  const size_t iend( ( IsUpper_v<MT4> )
7445  ?( ( IsStrictlyUpper_v<MT4> )
7446  ?( UPP ? min(j+1UL,kbegin) : kbegin )
7447  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
7448  :( UPP ? j+1UL : M ) );
7449 
// Reset the rows in front of the initialized range (or only row 0 for strictly lower A)
7450  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7451  for( size_t i=0UL; i<ibegin; ++i ) {
7452  reset( C(i,j) );
7453  }
7454  }
7455  else if( IsStrictlyLower_v<MT4> ) {
7456  reset( C(0UL,j) );
7457  }
7458  for( size_t i=ibegin; i<iend; ++i ) {
7459  C(i,j) = A(i,kbegin) * B(kbegin,j);
7460  }
// Reset the rows behind the initialized range (or only the last row for strictly upper A)
7461  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7462  for( size_t i=iend; i<M; ++i ) {
7463  reset( C(i,j) );
7464  }
7465  }
7466  else if( IsStrictlyUpper_v<MT4> ) {
7467  reset( C(M-1UL,j) );
7468  }
7469  }
7470 
// Pass 2: accumulate the contributions of the remaining inner indices
7471  for( size_t k=kbegin+1UL; k<kend; ++k )
7472  {
7473  const size_t ibegin( ( IsLower_v<MT4> )
7474  ?( ( IsStrictlyLower_v<MT4> )
7475  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
7476  :( SYM || HERM || LOW ? max( j, k ) : k ) )
7477  :( SYM || HERM || LOW ? j : 0UL ) );
7478  const size_t iend( ( IsUpper_v<MT4> )
7479  ?( ( IsStrictlyUpper_v<MT4> )
7480  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
7481  :( UPP ? min(j+1UL,k) : k ) )
7482  :( UPP ? j+1UL : M ) );
7483 
// With a structure flag set the row range may be empty; such inner indices are skipped
7484  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
7485  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7486 
7487  for( size_t i=ibegin; i<iend; ++i ) {
7488  C(i,j) += A(i,k) * B(k,j);
7489  }
// For an upper A the element in row iend is assigned instead of accumulated
7490  if( IsUpper_v<MT4> ) {
7491  C(iend,j) = A(iend,k) * B(k,j);
7492  }
7493  }
7494 
// Pass 3: scale the computed part of column j by the scalar
7495  {
7496  const size_t ibegin( ( ( IsLower_v<MT4> && IsLower_v<MT5> ) )
7497  ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
7498  :( SYM || HERM || LOW ? j : 0UL ) );
7499  const size_t iend( ( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) )
7500  ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
7501  :( UPP ? j+1UL : M ) );
7502 
// This 'continue' targets the enclosing loop over the columns j
7503  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
7504  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7505 
7506  for( size_t i=ibegin; i<iend; ++i ) {
7507  C(i,j) *= scalar;
7508  }
7509  }
7510  }
7511 
// For SYM/HERM the elements above the diagonal are copied (HERM: conjugated) from their
// mirrored counterparts below the diagonal
7512  if( SYM || HERM ) {
7513  for( size_t j=1UL; j<N; ++j ) {
7514  for( size_t i=0UL; i<j; ++i ) {
7515  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
7516  }
7517  }
7518  }
7519  }
7520  //**********************************************************************************************
7521 
7522  //**Default assignment to row-major dense matrices (general/diagonal)***************************
7536  template< typename MT3 // Type of the left-hand side target matrix
7537  , typename MT4 // Type of the left-hand side matrix operand
7538  , typename MT5 // Type of the right-hand side matrix operand
7539  , typename ST2 > // Type of the scalar value
7540  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7541  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7542  {
7543  constexpr size_t block( BLOCK_SIZE );
7544 
7545  const size_t M( A.rows() );
7546  const size_t N( B.columns() );
7547 
7548  for( size_t ii=0UL; ii<M; ii+=block ) {
7549  const size_t iend( min( M, ii+block ) );
7550  for( size_t jj=0UL; jj<N; jj+=block ) {
7551  const size_t jend( min( N, jj+block ) );
7552  for( size_t i=ii; i<iend; ++i )
7553  {
7554  const size_t jbegin( ( IsUpper_v<MT4> )
7555  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
7556  :( jj ) );
7557  const size_t jpos( ( IsLower_v<MT4> )
7558  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
7559  :( jend ) );
7560 
7561  if( IsUpper_v<MT4> ) {
7562  for( size_t j=jj; j<jbegin; ++j ) {
7563  reset( C(i,j) );
7564  }
7565  }
7566  for( size_t j=jbegin; j<jpos; ++j ) {
7567  C(i,j) = A(i,j) * B(j,j) * scalar;
7568  }
7569  if( IsLower_v<MT4> ) {
7570  for( size_t j=jpos; j<jend; ++j ) {
7571  reset( C(i,j) );
7572  }
7573  }
7574  }
7575  }
7576  }
7577  }
7578  //**********************************************************************************************
7579 
7580  //**Default assignment to column-major dense matrices (general/diagonal)************************
7594  template< typename MT3 // Type of the left-hand side target matrix
7595  , typename MT4 // Type of the left-hand side matrix operand
7596  , typename MT5 // Type of the right-hand side matrix operand
7597  , typename ST2 > // Type of the scalar value
7598  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7599  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7600  {
7601  const size_t M( A.rows() );
7602  const size_t N( B.columns() );
7603 
7604  for( size_t j=0UL; j<N; ++j )
7605  {
7606  const size_t ibegin( ( IsLower_v<MT4> )
7607  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
7608  :( 0UL ) );
7609  const size_t iend( ( IsUpper_v<MT4> )
7610  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
7611  :( M ) );
7612  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7613 
7614  if( IsLower_v<MT4> ) {
7615  for( size_t i=0UL; i<ibegin; ++i ) {
7616  reset( C(i,j) );
7617  }
7618  }
7619  for( size_t i=ibegin; i<iend; ++i ) {
7620  C(i,j) = A(i,j) * B(j,j) * scalar;
7621  }
7622  if( IsUpper_v<MT4> ) {
7623  for( size_t i=iend; i<M; ++i ) {
7624  reset( C(i,j) );
7625  }
7626  }
7627  }
7628  }
7629  //**********************************************************************************************
7630 
7631  //**Default assignment to row-major dense matrices (diagonal/general)***************************
7645  template< typename MT3 // Type of the left-hand side target matrix
7646  , typename MT4 // Type of the left-hand side matrix operand
7647  , typename MT5 // Type of the right-hand side matrix operand
7648  , typename ST2 > // Type of the scalar value
7649  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7650  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7651  {
7652  const size_t M( A.rows() );
7653  const size_t N( B.columns() );
7654 
7655  for( size_t i=0UL; i<M; ++i )
7656  {
7657  const size_t jbegin( ( IsUpper_v<MT5> )
7658  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
7659  :( 0UL ) );
7660  const size_t jend( ( IsLower_v<MT5> )
7661  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
7662  :( N ) );
7663  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7664 
7665  if( IsUpper_v<MT5> ) {
7666  for( size_t j=0UL; j<jbegin; ++j ) {
7667  reset( C(i,j) );
7668  }
7669  }
7670  for( size_t j=jbegin; j<jend; ++j ) {
7671  C(i,j) = A(i,i) * B(i,j) * scalar;
7672  }
7673  if( IsLower_v<MT5> ) {
7674  for( size_t j=jend; j<N; ++j ) {
7675  reset( C(i,j) );
7676  }
7677  }
7678  }
7679  }
7680  //**********************************************************************************************
7681 
7682  //**Default assignment to column-major dense matrices (diagonal/general)************************
7696  template< typename MT3 // Type of the left-hand side target matrix
7697  , typename MT4 // Type of the left-hand side matrix operand
7698  , typename MT5 // Type of the right-hand side matrix operand
7699  , typename ST2 > // Type of the scalar value
7700  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7701  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7702  {
7703  constexpr size_t block( BLOCK_SIZE );
7704 
7705  const size_t M( A.rows() );
7706  const size_t N( B.columns() );
7707 
7708  for( size_t jj=0UL; jj<N; jj+=block ) {
7709  const size_t jend( min( N, jj+block ) );
7710  for( size_t ii=0UL; ii<M; ii+=block ) {
7711  const size_t iend( min( M, ii+block ) );
7712  for( size_t j=jj; j<jend; ++j )
7713  {
7714  const size_t ibegin( ( IsLower_v<MT5> )
7715  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
7716  :( ii ) );
7717  const size_t ipos( ( IsUpper_v<MT5> )
7718  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
7719  :( iend ) );
7720 
7721  if( IsLower_v<MT5> ) {
7722  for( size_t i=ii; i<ibegin; ++i ) {
7723  reset( C(i,j) );
7724  }
7725  }
7726  for( size_t i=ibegin; i<ipos; ++i ) {
7727  C(i,j) = A(i,i) * B(i,j) * scalar;
7728  }
7729  if( IsUpper_v<MT5> ) {
7730  for( size_t i=ipos; i<iend; ++i ) {
7731  reset( C(i,j) );
7732  }
7733  }
7734  }
7735  }
7736  }
7737  }
7738  //**********************************************************************************************
7739 
7740  //**Default assignment to dense matrices (diagonal/diagonal)************************************
7754  template< typename MT3 // Type of the left-hand side target matrix
7755  , typename MT4 // Type of the left-hand side matrix operand
7756  , typename MT5 // Type of the right-hand side matrix operand
7757  , typename ST2 > // Type of the scalar value
7758  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7759  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7760  {
7761  reset( C );
7762 
7763  for( size_t i=0UL; i<A.rows(); ++i ) {
7764  C(i,i) = A(i,i) * B(i,i) * scalar;
7765  }
7766  }
7767  //**********************************************************************************************
7768 
7769  //**Default assignment to dense matrices (small matrices)***************************************
7783  template< typename MT3 // Type of the left-hand side target matrix
7784  , typename MT4 // Type of the left-hand side matrix operand
7785  , typename MT5 // Type of the right-hand side matrix operand
7786  , typename ST2 > // Type of the scalar value
7787  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7788  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7789  {
7790  selectDefaultAssignKernel( C, A, B, scalar );
7791  }
7792  //**********************************************************************************************
7793 
7794  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
7809  template< typename MT3 // Type of the left-hand side target matrix
7810  , typename MT4 // Type of the left-hand side matrix operand
7811  , typename MT5 // Type of the right-hand side matrix operand
7812  , typename ST2 > // Type of the scalar value
7813  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7814  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7815  {
7816  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
7817 
7818  const size_t M( A.rows() );
7819  const size_t N( B.columns() );
7820  const size_t K( A.columns() );
7821 
7822  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7823 
7824  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
7825  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
7826 
7827  const SIMDType factor( set( scalar ) );
7828 
7829  if( LOW && UPP && N > SIMDSIZE*3UL ) {
7830  reset( C );
7831  }
7832 
7833  {
7834  size_t j( 0UL );
7835 
7836  if( IsIntegral_v<ElementType> )
7837  {
7838  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7839  for( size_t i=0UL; i<M; ++i )
7840  {
7841  const size_t kbegin( ( IsUpper_v<MT4> )
7842  ?( ( IsLower_v<MT5> )
7843  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7844  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7845  :( IsLower_v<MT5> ? j : 0UL ) );
7846  const size_t kend( ( IsLower_v<MT4> )
7847  ?( ( IsUpper_v<MT5> )
7848  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
7849  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7850  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
7851 
7852  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7853 
7854  for( size_t k=kbegin; k<kend; ++k ) {
7855  const SIMDType a1( set( A(i,k) ) );
7856  xmm1 += a1 * B.load(k,j );
7857  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7858  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7859  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7860  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7861  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7862  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7863  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
7864  }
7865 
7866  C.store( i, j , xmm1 * factor );
7867  C.store( i, j+SIMDSIZE , xmm2 * factor );
7868  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
7869  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
7870  C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
7871  C.store( i, j+SIMDSIZE*5UL, xmm6 * factor );
7872  C.store( i, j+SIMDSIZE*6UL, xmm7 * factor );
7873  C.store( i, j+SIMDSIZE*7UL, xmm8 * factor );
7874  }
7875  }
7876  }
7877 
7878  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7879  {
7880  size_t i( 0UL );
7881 
7882  for( ; (i+2UL) <= M; i+=2UL )
7883  {
7884  const size_t kbegin( ( IsUpper_v<MT4> )
7885  ?( ( IsLower_v<MT5> )
7886  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7887  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7888  :( IsLower_v<MT5> ? j : 0UL ) );
7889  const size_t kend( ( IsLower_v<MT4> )
7890  ?( ( IsUpper_v<MT5> )
7891  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
7892  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7893  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
7894 
7895  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7896 
7897  for( size_t k=kbegin; k<kend; ++k ) {
7898  const SIMDType a1( set( A(i ,k) ) );
7899  const SIMDType a2( set( A(i+1UL,k) ) );
7900  const SIMDType b1( B.load(k,j ) );
7901  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7902  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7903  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7904  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7905  xmm1 += a1 * b1;
7906  xmm2 += a1 * b2;
7907  xmm3 += a1 * b3;
7908  xmm4 += a1 * b4;
7909  xmm5 += a1 * b5;
7910  xmm6 += a2 * b1;
7911  xmm7 += a2 * b2;
7912  xmm8 += a2 * b3;
7913  xmm9 += a2 * b4;
7914  xmm10 += a2 * b5;
7915  }
7916 
7917  C.store( i , j , xmm1 * factor );
7918  C.store( i , j+SIMDSIZE , xmm2 * factor );
7919  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
7920  C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
7921  C.store( i , j+SIMDSIZE*4UL, xmm5 * factor );
7922  C.store( i+1UL, j , xmm6 * factor );
7923  C.store( i+1UL, j+SIMDSIZE , xmm7 * factor );
7924  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
7925  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
7926  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
7927  }
7928 
7929  if( i < M )
7930  {
7931  const size_t kbegin( ( IsUpper_v<MT4> )
7932  ?( ( IsLower_v<MT5> )
7933  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7934  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7935  :( IsLower_v<MT5> ? j : 0UL ) );
7936  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
7937 
7938  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7939 
7940  for( size_t k=kbegin; k<kend; ++k ) {
7941  const SIMDType a1( set( A(i,k) ) );
7942  xmm1 += a1 * B.load(k,j );
7943  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7944  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7945  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7946  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7947  }
7948 
7949  C.store( i, j , xmm1 * factor );
7950  C.store( i, j+SIMDSIZE , xmm2 * factor );
7951  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
7952  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
7953  C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
7954  }
7955  }
7956 
7957  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7958  {
7959  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
7960  size_t i( LOW ? j : 0UL );
7961 
7962  for( ; (i+2UL) <= iend; i+=2UL )
7963  {
7964  const size_t kbegin( ( IsUpper_v<MT4> )
7965  ?( ( IsLower_v<MT5> )
7966  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7967  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7968  :( IsLower_v<MT5> ? j : 0UL ) );
7969  const size_t kend( ( IsLower_v<MT4> )
7970  ?( ( IsUpper_v<MT5> )
7971  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
7972  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7973  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
7974 
7975  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7976 
7977  for( size_t k=kbegin; k<kend; ++k ) {
7978  const SIMDType a1( set( A(i ,k) ) );
7979  const SIMDType a2( set( A(i+1UL,k) ) );
7980  const SIMDType b1( B.load(k,j ) );
7981  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7982  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7983  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7984  xmm1 += a1 * b1;
7985  xmm2 += a1 * b2;
7986  xmm3 += a1 * b3;
7987  xmm4 += a1 * b4;
7988  xmm5 += a2 * b1;
7989  xmm6 += a2 * b2;
7990  xmm7 += a2 * b3;
7991  xmm8 += a2 * b4;
7992  }
7993 
7994  C.store( i , j , xmm1 * factor );
7995  C.store( i , j+SIMDSIZE , xmm2 * factor );
7996  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
7997  C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
7998  C.store( i+1UL, j , xmm5 * factor );
7999  C.store( i+1UL, j+SIMDSIZE , xmm6 * factor );
8000  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
8001  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
8002  }
8003 
8004  if( i < iend )
8005  {
8006  const size_t kbegin( ( IsUpper_v<MT4> )
8007  ?( ( IsLower_v<MT5> )
8008  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8009  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8010  :( IsLower_v<MT5> ? j : 0UL ) );
8011  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
8012 
8013  SIMDType xmm1, xmm2, xmm3, xmm4;
8014 
8015  for( size_t k=kbegin; k<kend; ++k ) {
8016  const SIMDType a1( set( A(i,k) ) );
8017  xmm1 += a1 * B.load(k,j );
8018  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8019  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8020  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8021  }
8022 
8023  C.store( i, j , xmm1 * factor );
8024  C.store( i, j+SIMDSIZE , xmm2 * factor );
8025  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8026  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
8027  }
8028  }
8029 
8030  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8031  {
8032  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
8033  size_t i( LOW ? j : 0UL );
8034 
8035  for( ; (i+2UL) <= iend; i+=2UL )
8036  {
8037  const size_t kbegin( ( IsUpper_v<MT4> )
8038  ?( ( IsLower_v<MT5> )
8039  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8040  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8041  :( IsLower_v<MT5> ? j : 0UL ) );
8042  const size_t kend( ( IsLower_v<MT4> )
8043  ?( ( IsUpper_v<MT5> )
8044  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
8045  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8046  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
8047 
8048  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8049 
8050  for( size_t k=kbegin; k<kend; ++k ) {
8051  const SIMDType a1( set( A(i ,k) ) );
8052  const SIMDType a2( set( A(i+1UL,k) ) );
8053  const SIMDType b1( B.load(k,j ) );
8054  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8055  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8056  xmm1 += a1 * b1;
8057  xmm2 += a1 * b2;
8058  xmm3 += a1 * b3;
8059  xmm4 += a2 * b1;
8060  xmm5 += a2 * b2;
8061  xmm6 += a2 * b3;
8062  }
8063 
8064  C.store( i , j , xmm1 * factor );
8065  C.store( i , j+SIMDSIZE , xmm2 * factor );
8066  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8067  C.store( i+1UL, j , xmm4 * factor );
8068  C.store( i+1UL, j+SIMDSIZE , xmm5 * factor );
8069  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
8070  }
8071 
8072  if( i < iend )
8073  {
8074  const size_t kbegin( ( IsUpper_v<MT4> )
8075  ?( ( IsLower_v<MT5> )
8076  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8077  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8078  :( IsLower_v<MT5> ? j : 0UL ) );
8079  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
8080 
8081  SIMDType xmm1, xmm2, xmm3;
8082 
8083  for( size_t k=kbegin; k<kend; ++k ) {
8084  const SIMDType a1( set( A(i,k) ) );
8085  xmm1 += a1 * B.load(k,j );
8086  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8087  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8088  }
8089 
8090  C.store( i, j , xmm1 * factor );
8091  C.store( i, j+SIMDSIZE , xmm2 * factor );
8092  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8093  }
8094  }
8095 
8096  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8097  {
8098  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
8099  size_t i( LOW ? j : 0UL );
8100 
8101  for( ; (i+4UL) <= iend; i+=4UL )
8102  {
8103  const size_t kbegin( ( IsUpper_v<MT4> )
8104  ?( ( IsLower_v<MT5> )
8105  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8106  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8107  :( IsLower_v<MT5> ? j : 0UL ) );
8108  const size_t kend( ( IsLower_v<MT4> )
8109  ?( ( IsUpper_v<MT5> )
8110  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
8111  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
8112  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8113 
8114  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8115 
8116  for( size_t k=kbegin; k<kend; ++k ) {
8117  const SIMDType a1( set( A(i ,k) ) );
8118  const SIMDType a2( set( A(i+1UL,k) ) );
8119  const SIMDType a3( set( A(i+2UL,k) ) );
8120  const SIMDType a4( set( A(i+3UL,k) ) );
8121  const SIMDType b1( B.load(k,j ) );
8122  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8123  xmm1 += a1 * b1;
8124  xmm2 += a1 * b2;
8125  xmm3 += a2 * b1;
8126  xmm4 += a2 * b2;
8127  xmm5 += a3 * b1;
8128  xmm6 += a3 * b2;
8129  xmm7 += a4 * b1;
8130  xmm8 += a4 * b2;
8131  }
8132 
8133  C.store( i , j , xmm1 * factor );
8134  C.store( i , j+SIMDSIZE, xmm2 * factor );
8135  C.store( i+1UL, j , xmm3 * factor );
8136  C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8137  C.store( i+2UL, j , xmm5 * factor );
8138  C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8139  C.store( i+3UL, j , xmm7 * factor );
8140  C.store( i+3UL, j+SIMDSIZE, xmm8 * factor );
8141  }
8142 
8143  for( ; (i+3UL) <= iend; i+=3UL )
8144  {
8145  const size_t kbegin( ( IsUpper_v<MT4> )
8146  ?( ( IsLower_v<MT5> )
8147  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8148  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8149  :( IsLower_v<MT5> ? j : 0UL ) );
8150  const size_t kend( ( IsLower_v<MT4> )
8151  ?( ( IsUpper_v<MT5> )
8152  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
8153  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
8154  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8155 
8156  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8157 
8158  for( size_t k=kbegin; k<kend; ++k ) {
8159  const SIMDType a1( set( A(i ,k) ) );
8160  const SIMDType a2( set( A(i+1UL,k) ) );
8161  const SIMDType a3( set( A(i+2UL,k) ) );
8162  const SIMDType b1( B.load(k,j ) );
8163  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8164  xmm1 += a1 * b1;
8165  xmm2 += a1 * b2;
8166  xmm3 += a2 * b1;
8167  xmm4 += a2 * b2;
8168  xmm5 += a3 * b1;
8169  xmm6 += a3 * b2;
8170  }
8171 
8172  C.store( i , j , xmm1 * factor );
8173  C.store( i , j+SIMDSIZE, xmm2 * factor );
8174  C.store( i+1UL, j , xmm3 * factor );
8175  C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8176  C.store( i+2UL, j , xmm5 * factor );
8177  C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8178  }
8179 
8180  for( ; (i+2UL) <= iend; i+=2UL )
8181  {
8182  const size_t kbegin( ( IsUpper_v<MT4> )
8183  ?( ( IsLower_v<MT5> )
8184  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8185  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8186  :( IsLower_v<MT5> ? j : 0UL ) );
8187  const size_t kend( ( IsLower_v<MT4> )
8188  ?( ( IsUpper_v<MT5> )
8189  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
8190  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8191  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
8192 
8193  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8194  size_t k( kbegin );
8195 
8196  for( ; (k+2UL) <= kend; k+=2UL ) {
8197  const SIMDType a1( set( A(i ,k ) ) );
8198  const SIMDType a2( set( A(i+1UL,k ) ) );
8199  const SIMDType a3( set( A(i ,k+1UL) ) );
8200  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
8201  const SIMDType b1( B.load(k ,j ) );
8202  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
8203  const SIMDType b3( B.load(k+1UL,j ) );
8204  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
8205  xmm1 += a1 * b1;
8206  xmm2 += a1 * b2;
8207  xmm3 += a2 * b1;
8208  xmm4 += a2 * b2;
8209  xmm5 += a3 * b3;
8210  xmm6 += a3 * b4;
8211  xmm7 += a4 * b3;
8212  xmm8 += a4 * b4;
8213  }
8214 
8215  for( ; k<kend; ++k ) {
8216  const SIMDType a1( set( A(i ,k) ) );
8217  const SIMDType a2( set( A(i+1UL,k) ) );
8218  const SIMDType b1( B.load(k,j ) );
8219  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8220  xmm1 += a1 * b1;
8221  xmm2 += a1 * b2;
8222  xmm3 += a2 * b1;
8223  xmm4 += a2 * b2;
8224  }
8225 
8226  C.store( i , j , (xmm1+xmm5) * factor );
8227  C.store( i , j+SIMDSIZE, (xmm2+xmm6) * factor );
8228  C.store( i+1UL, j , (xmm3+xmm7) * factor );
8229  C.store( i+1UL, j+SIMDSIZE, (xmm4+xmm8) * factor );
8230  }
8231 
8232  if( i < iend )
8233  {
8234  const size_t kbegin( ( IsUpper_v<MT4> )
8235  ?( ( IsLower_v<MT5> )
8236  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8237  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8238  :( IsLower_v<MT5> ? j : 0UL ) );
8239  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8240 
8241  SIMDType xmm1, xmm2, xmm3, xmm4;
8242  size_t k( kbegin );
8243 
8244  for( ; (k+2UL) <= kend; k+=2UL ) {
8245  const SIMDType a1( set( A(i,k ) ) );
8246  const SIMDType a2( set( A(i,k+1UL) ) );
8247  xmm1 += a1 * B.load(k ,j );
8248  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
8249  xmm3 += a2 * B.load(k+1UL,j );
8250  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
8251  }
8252 
8253  for( ; k<kend; ++k ) {
8254  const SIMDType a1( set( A(i,k) ) );
8255  xmm1 += a1 * B.load(k,j );
8256  xmm2 += a1 * B.load(k,j+SIMDSIZE);
8257  }
8258 
8259  C.store( i, j , (xmm1+xmm3) * factor );
8260  C.store( i, j+SIMDSIZE, (xmm2+xmm4) * factor );
8261  }
8262  }
8263 
8264  for( ; j<jpos; j+=SIMDSIZE )
8265  {
8266  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
8267  size_t i( LOW ? j : 0UL );
8268 
8269  for( ; (i+4UL) <= iend; i+=4UL )
8270  {
8271  const size_t kbegin( ( IsUpper_v<MT4> )
8272  ?( ( IsLower_v<MT5> )
8273  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8274  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8275  :( IsLower_v<MT5> ? j : 0UL ) );
8276  const size_t kend( ( IsLower_v<MT4> )
8277  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8278  :( K ) );
8279 
8280  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8281  size_t k( kbegin );
8282 
8283  for( ; (k+2UL) <= kend; k+=2UL ) {
8284  const SIMDType b1( B.load(k ,j) );
8285  const SIMDType b2( B.load(k+1UL,j) );
8286  xmm1 += set( A(i ,k ) ) * b1;
8287  xmm2 += set( A(i+1UL,k ) ) * b1;
8288  xmm3 += set( A(i+2UL,k ) ) * b1;
8289  xmm4 += set( A(i+3UL,k ) ) * b1;
8290  xmm5 += set( A(i ,k+1UL) ) * b2;
8291  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
8292  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
8293  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
8294  }
8295 
8296  for( ; k<kend; ++k ) {
8297  const SIMDType b1( B.load(k,j) );
8298  xmm1 += set( A(i ,k) ) * b1;
8299  xmm2 += set( A(i+1UL,k) ) * b1;
8300  xmm3 += set( A(i+2UL,k) ) * b1;
8301  xmm4 += set( A(i+3UL,k) ) * b1;
8302  }
8303 
8304  C.store( i , j, (xmm1+xmm5) * factor );
8305  C.store( i+1UL, j, (xmm2+xmm6) * factor );
8306  C.store( i+2UL, j, (xmm3+xmm7) * factor );
8307  C.store( i+3UL, j, (xmm4+xmm8) * factor );
8308  }
8309 
8310  for( ; (i+3UL) <= iend; i+=3UL )
8311  {
8312  const size_t kbegin( ( IsUpper_v<MT4> )
8313  ?( ( IsLower_v<MT5> )
8314  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8315  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8316  :( IsLower_v<MT5> ? j : 0UL ) );
8317  const size_t kend( ( IsLower_v<MT4> )
8318  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
8319  :( K ) );
8320 
8321  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8322  size_t k( kbegin );
8323 
8324  for( ; (k+2UL) <= kend; k+=2UL ) {
8325  const SIMDType b1( B.load(k ,j) );
8326  const SIMDType b2( B.load(k+1UL,j) );
8327  xmm1 += set( A(i ,k ) ) * b1;
8328  xmm2 += set( A(i+1UL,k ) ) * b1;
8329  xmm3 += set( A(i+2UL,k ) ) * b1;
8330  xmm4 += set( A(i ,k+1UL) ) * b2;
8331  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
8332  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
8333  }
8334 
8335  for( ; k<kend; ++k ) {
8336  const SIMDType b1( B.load(k,j) );
8337  xmm1 += set( A(i ,k) ) * b1;
8338  xmm2 += set( A(i+1UL,k) ) * b1;
8339  xmm3 += set( A(i+2UL,k) ) * b1;
8340  }
8341 
8342  C.store( i , j, (xmm1+xmm4) * factor );
8343  C.store( i+1UL, j, (xmm2+xmm5) * factor );
8344  C.store( i+2UL, j, (xmm3+xmm6) * factor );
8345  }
8346 
8347  for( ; (i+2UL) <= iend; i+=2UL )
8348  {
8349  const size_t kbegin( ( IsUpper_v<MT4> )
8350  ?( ( IsLower_v<MT5> )
8351  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8352  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8353  :( IsLower_v<MT5> ? j : 0UL ) );
8354  const size_t kend( ( IsLower_v<MT4> )
8355  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8356  :( K ) );
8357 
8358  SIMDType xmm1, xmm2, xmm3, xmm4;
8359  size_t k( kbegin );
8360 
8361  for( ; (k+2UL) <= kend; k+=2UL ) {
8362  const SIMDType b1( B.load(k ,j) );
8363  const SIMDType b2( B.load(k+1UL,j) );
8364  xmm1 += set( A(i ,k ) ) * b1;
8365  xmm2 += set( A(i+1UL,k ) ) * b1;
8366  xmm3 += set( A(i ,k+1UL) ) * b2;
8367  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
8368  }
8369 
8370  for( ; k<kend; ++k ) {
8371  const SIMDType b1( B.load(k,j) );
8372  xmm1 += set( A(i ,k) ) * b1;
8373  xmm2 += set( A(i+1UL,k) ) * b1;
8374  }
8375 
8376  C.store( i , j, (xmm1+xmm3) * factor );
8377  C.store( i+1UL, j, (xmm2+xmm4) * factor );
8378  }
8379 
8380  if( i < iend )
8381  {
8382  const size_t kbegin( ( IsUpper_v<MT4> )
8383  ?( ( IsLower_v<MT5> )
8384  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8385  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8386  :( IsLower_v<MT5> ? j : 0UL ) );
8387 
8388  SIMDType xmm1, xmm2;
8389  size_t k( kbegin );
8390 
8391  for( ; (k+2UL) <= K; k+=2UL ) {
8392  xmm1 += set( A(i,k ) ) * B.load(k ,j);
8393  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
8394  }
8395 
8396  for( ; k<K; ++k ) {
8397  xmm1 += set( A(i,k) ) * B.load(k,j);
8398  }
8399 
8400  C.store( i, j, (xmm1+xmm2) * factor );
8401  }
8402  }
8403 
8404  for( ; remainder && j<N; ++j )
8405  {
8406  size_t i( LOW && UPP ? j : 0UL );
8407 
8408  for( ; (i+2UL) <= M; i+=2UL )
8409  {
8410  const size_t kbegin( ( IsUpper_v<MT4> )
8411  ?( ( IsLower_v<MT5> )
8412  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8413  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8414  :( IsLower_v<MT5> ? j : 0UL ) );
8415  const size_t kend( ( IsLower_v<MT4> )
8416  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8417  :( K ) );
8418 
8419  ElementType value1{};
8420  ElementType value2{};
8421 
8422  for( size_t k=kbegin; k<kend; ++k ) {
8423  value1 += A(i ,k) * B(k,j);
8424  value2 += A(i+1UL,k) * B(k,j);
8425  }
8426 
8427  C(i ,j) = value1 * scalar;
8428  C(i+1UL,j) = value2 * scalar;
8429  }
8430 
8431  if( i < M )
8432  {
8433  const size_t kbegin( ( IsUpper_v<MT4> )
8434  ?( ( IsLower_v<MT5> )
8435  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8436  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8437  :( IsLower_v<MT5> ? j : 0UL ) );
8438 
8439  ElementType value{};
8440 
8441  for( size_t k=kbegin; k<K; ++k ) {
8442  value += A(i,k) * B(k,j);
8443  }
8444 
8445  C(i,j) = value * scalar;
8446  }
8447  }
8448  }
8449 
8450  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
8451  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
8452  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
8453  for( size_t j=0UL; j<jend; ++j ) {
8454  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
8455  }
8456  }
8457  }
8458  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
8459  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
8460  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
8461  for( size_t i=0UL; i<iend; ++i ) {
8462  reset( C(i,j) );
8463  }
8464  }
8465  }
8466  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
8467  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
8468  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
8469  for( size_t j=0UL; j<jend; ++j ) {
8470  reset( C(i,j) );
8471  }
8472  }
8473  }
8474  }
8475  //**********************************************************************************************
8476 
8477  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
8492  template< typename MT3 // Type of the left-hand side target matrix
8493  , typename MT4 // Type of the left-hand side matrix operand
8494  , typename MT5 // Type of the right-hand side matrix operand
8495  , typename ST2 > // Type of the scalar value
8496  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8497  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8498  {
8499  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
8500 
8501  const size_t M( A.rows() );
8502  const size_t N( B.columns() );
8503  const size_t K( A.columns() );
8504 
8505  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
8506 
8507  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
8508  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
8509 
8510  const SIMDType factor( set( scalar ) );
8511 
8512  if( LOW && UPP && M > SIMDSIZE*3UL ) {
8513  reset( C );
8514  }
8515 
8516  {
8517  size_t i( 0UL );
8518 
8519  if( IsIntegral_v<ElementType> )
8520  {
8521  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8522  for( size_t j=0UL; j<N; ++j )
8523  {
8524  const size_t kbegin( ( IsLower_v<MT5> )
8525  ?( ( IsUpper_v<MT4> )
8526  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8527  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8528  :( IsUpper_v<MT4> ? i : 0UL ) );
8529  const size_t kend( ( IsUpper_v<MT5> )
8530  ?( ( IsLower_v<MT4> )
8531  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
8532  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
8533  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
8534 
8535  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8536 
8537  for( size_t k=kbegin; k<kend; ++k ) {
8538  const SIMDType b1( set( B(k,j) ) );
8539  xmm1 += A.load(i ,k) * b1;
8540  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8541  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8542  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8543  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8544  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
8545  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
8546  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
8547  }
8548 
8549  C.store( i , j, xmm1 * factor );
8550  C.store( i+SIMDSIZE , j, xmm2 * factor );
8551  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8552  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8553  C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
8554  C.store( i+SIMDSIZE*5UL, j, xmm6 * factor );
8555  C.store( i+SIMDSIZE*6UL, j, xmm7 * factor );
8556  C.store( i+SIMDSIZE*7UL, j, xmm8 * factor );
8557  }
8558  }
8559  }
8560 
8561  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
8562  {
8563  size_t j( 0UL );
8564 
8565  for( ; (j+2UL) <= N; j+=2UL )
8566  {
8567  const size_t kbegin( ( IsLower_v<MT5> )
8568  ?( ( IsUpper_v<MT4> )
8569  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8570  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8571  :( IsUpper_v<MT4> ? i : 0UL ) );
8572  const size_t kend( ( IsUpper_v<MT5> )
8573  ?( ( IsLower_v<MT4> )
8574  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8575  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8576  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
8577 
8578  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8579 
8580  for( size_t k=kbegin; k<kend; ++k ) {
8581  const SIMDType a1( A.load(i ,k) );
8582  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8583  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8584  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8585  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
8586  const SIMDType b1( set( B(k,j ) ) );
8587  const SIMDType b2( set( B(k,j+1UL) ) );
8588  xmm1 += a1 * b1;
8589  xmm2 += a2 * b1;
8590  xmm3 += a3 * b1;
8591  xmm4 += a4 * b1;
8592  xmm5 += a5 * b1;
8593  xmm6 += a1 * b2;
8594  xmm7 += a2 * b2;
8595  xmm8 += a3 * b2;
8596  xmm9 += a4 * b2;
8597  xmm10 += a5 * b2;
8598  }
8599 
8600  C.store( i , j , xmm1 * factor );
8601  C.store( i+SIMDSIZE , j , xmm2 * factor );
8602  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8603  C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8604  C.store( i+SIMDSIZE*4UL, j , xmm5 * factor );
8605  C.store( i , j+1UL, xmm6 * factor );
8606  C.store( i+SIMDSIZE , j+1UL, xmm7 * factor );
8607  C.store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
8608  C.store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
8609  C.store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
8610  }
8611 
8612  if( j < N )
8613  {
8614  const size_t kbegin( ( IsLower_v<MT5> )
8615  ?( ( IsUpper_v<MT4> )
8616  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8617  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8618  :( IsUpper_v<MT4> ? i : 0UL ) );
8619  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
8620 
8621  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8622 
8623  for( size_t k=kbegin; k<kend; ++k ) {
8624  const SIMDType b1( set( B(k,j) ) );
8625  xmm1 += A.load(i ,k) * b1;
8626  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8627  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8628  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8629  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8630  }
8631 
8632  C.store( i , j, xmm1 * factor );
8633  C.store( i+SIMDSIZE , j, xmm2 * factor );
8634  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8635  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8636  C.store( i+SIMDSIZE*4UL, j, xmm5 * factor );
8637  }
8638  }
8639 
8640  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
8641  {
8642  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
8643  size_t j( UPP ? i : 0UL );
8644 
8645  for( ; (j+2UL) <= jend; j+=2UL )
8646  {
8647  const size_t kbegin( ( IsLower_v<MT5> )
8648  ?( ( IsUpper_v<MT4> )
8649  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8650  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8651  :( IsUpper_v<MT4> ? i : 0UL ) );
8652  const size_t kend( ( IsUpper_v<MT5> )
8653  ?( ( IsLower_v<MT4> )
8654  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8655  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8656  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
8657 
8658  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8659 
8660  for( size_t k=kbegin; k<kend; ++k ) {
8661  const SIMDType a1( A.load(i ,k) );
8662  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8663  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8664  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8665  const SIMDType b1( set( B(k,j ) ) );
8666  const SIMDType b2( set( B(k,j+1UL) ) );
8667  xmm1 += a1 * b1;
8668  xmm2 += a2 * b1;
8669  xmm3 += a3 * b1;
8670  xmm4 += a4 * b1;
8671  xmm5 += a1 * b2;
8672  xmm6 += a2 * b2;
8673  xmm7 += a3 * b2;
8674  xmm8 += a4 * b2;
8675  }
8676 
8677  C.store( i , j , xmm1 * factor );
8678  C.store( i+SIMDSIZE , j , xmm2 * factor );
8679  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8680  C.store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8681  C.store( i , j+1UL, xmm5 * factor );
8682  C.store( i+SIMDSIZE , j+1UL, xmm6 * factor );
8683  C.store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
8684  C.store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
8685  }
8686 
8687  if( j < jend )
8688  {
8689  const size_t kbegin( ( IsLower_v<MT5> )
8690  ?( ( IsUpper_v<MT4> )
8691  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8692  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8693  :( IsUpper_v<MT4> ? i : 0UL ) );
8694  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
8695 
8696  SIMDType xmm1, xmm2, xmm3, xmm4;
8697 
8698  for( size_t k=kbegin; k<kend; ++k ) {
8699  const SIMDType b1( set( B(k,j) ) );
8700  xmm1 += A.load(i ,k) * b1;
8701  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8702  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8703  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8704  }
8705 
8706  C.store( i , j, xmm1 * factor );
8707  C.store( i+SIMDSIZE , j, xmm2 * factor );
8708  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8709  C.store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8710  }
8711  }
8712 
8713  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
8714  {
8715  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
8716  size_t j( UPP ? i : 0UL );
8717 
8718  for( ; (j+2UL) <= jend; j+=2UL )
8719  {
8720  const size_t kbegin( ( IsLower_v<MT5> )
8721  ?( ( IsUpper_v<MT4> )
8722  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8723  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8724  :( IsUpper_v<MT4> ? i : 0UL ) );
8725  const size_t kend( ( IsUpper_v<MT5> )
8726  ?( ( IsLower_v<MT4> )
8727  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8728  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8729  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
8730 
8731  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8732 
8733  for( size_t k=kbegin; k<kend; ++k ) {
8734  const SIMDType a1( A.load(i ,k) );
8735  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8736  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8737  const SIMDType b1( set( B(k,j ) ) );
8738  const SIMDType b2( set( B(k,j+1UL) ) );
8739  xmm1 += a1 * b1;
8740  xmm2 += a2 * b1;
8741  xmm3 += a3 * b1;
8742  xmm4 += a1 * b2;
8743  xmm5 += a2 * b2;
8744  xmm6 += a3 * b2;
8745  }
8746 
8747  C.store( i , j , xmm1 * factor );
8748  C.store( i+SIMDSIZE , j , xmm2 * factor );
8749  C.store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8750  C.store( i , j+1UL, xmm4 * factor );
8751  C.store( i+SIMDSIZE , j+1UL, xmm5 * factor );
8752  C.store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
8753  }
8754 
8755  if( j < jend )
8756  {
8757  const size_t kbegin( ( IsLower_v<MT5> )
8758  ?( ( IsUpper_v<MT4> )
8759  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8760  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8761  :( IsUpper_v<MT4> ? i : 0UL ) );
8762  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
8763 
8764  SIMDType xmm1, xmm2, xmm3;
8765 
8766  for( size_t k=kbegin; k<kend; ++k ) {
8767  const SIMDType b1( set( B(k,j) ) );
8768  xmm1 += A.load(i ,k) * b1;
8769  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8770  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8771  }
8772 
8773  C.store( i , j, xmm1 * factor );
8774  C.store( i+SIMDSIZE , j, xmm2 * factor );
8775  C.store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8776  }
8777  }
8778 
8779  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
8780  {
8781  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
8782  size_t j( UPP ? i : 0UL );
8783 
8784  for( ; (j+4UL) <= jend; j+=4UL )
8785  {
8786  const size_t kbegin( ( IsLower_v<MT5> )
8787  ?( ( IsUpper_v<MT4> )
8788  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8789  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8790  :( IsUpper_v<MT4> ? i : 0UL ) );
8791  const size_t kend( ( IsUpper_v<MT5> )
8792  ?( ( IsLower_v<MT4> )
8793  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
8794  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
8795  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8796 
8797  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8798 
8799  for( size_t k=kbegin; k<kend; ++k ) {
8800  const SIMDType a1( A.load(i ,k) );
8801  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8802  const SIMDType b1( set( B(k,j ) ) );
8803  const SIMDType b2( set( B(k,j+1UL) ) );
8804  const SIMDType b3( set( B(k,j+2UL) ) );
8805  const SIMDType b4( set( B(k,j+3UL) ) );
8806  xmm1 += a1 * b1;
8807  xmm2 += a2 * b1;
8808  xmm3 += a1 * b2;
8809  xmm4 += a2 * b2;
8810  xmm5 += a1 * b3;
8811  xmm6 += a2 * b3;
8812  xmm7 += a1 * b4;
8813  xmm8 += a2 * b4;
8814  }
8815 
8816  C.store( i , j , xmm1 * factor );
8817  C.store( i+SIMDSIZE, j , xmm2 * factor );
8818  C.store( i , j+1UL, xmm3 * factor );
8819  C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
8820  C.store( i , j+2UL, xmm5 * factor );
8821  C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
8822  C.store( i , j+3UL, xmm7 * factor );
8823  C.store( i+SIMDSIZE, j+3UL, xmm8 * factor );
8824  }
8825 
8826  for( ; (j+3UL) <= jend; j+=3UL )
8827  {
8828  const size_t kbegin( ( IsLower_v<MT5> )
8829  ?( ( IsUpper_v<MT4> )
8830  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8831  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8832  :( IsUpper_v<MT4> ? i : 0UL ) );
8833  const size_t kend( ( IsUpper_v<MT5> )
8834  ?( ( IsLower_v<MT4> )
8835  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
8836  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
8837  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8838 
8839  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8840 
8841  for( size_t k=kbegin; k<kend; ++k ) {
8842  const SIMDType a1( A.load(i ,k) );
8843  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8844  const SIMDType b1( set( B(k,j ) ) );
8845  const SIMDType b2( set( B(k,j+1UL) ) );
8846  const SIMDType b3( set( B(k,j+2UL) ) );
8847  xmm1 += a1 * b1;
8848  xmm2 += a2 * b1;
8849  xmm3 += a1 * b2;
8850  xmm4 += a2 * b2;
8851  xmm5 += a1 * b3;
8852  xmm6 += a2 * b3;
8853  }
8854 
8855  C.store( i , j , xmm1 * factor );
8856  C.store( i+SIMDSIZE, j , xmm2 * factor );
8857  C.store( i , j+1UL, xmm3 * factor );
8858  C.store( i+SIMDSIZE, j+1UL, xmm4 * factor );
8859  C.store( i , j+2UL, xmm5 * factor );
8860  C.store( i+SIMDSIZE, j+2UL, xmm6 * factor );
8861  }
8862 
8863  for( ; (j+2UL) <= jend; j+=2UL )
8864  {
8865  const size_t kbegin( ( IsLower_v<MT5> )
8866  ?( ( IsUpper_v<MT4> )
8867  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8868  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8869  :( IsUpper_v<MT4> ? i : 0UL ) );
8870  const size_t kend( ( IsUpper_v<MT5> )
8871  ?( ( IsLower_v<MT4> )
8872  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8873  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8874  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
8875 
8876  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8877  size_t k( kbegin );
8878 
8879  for( ; (k+2UL) <= kend; k+=2UL ) {
8880  const SIMDType a1( A.load(i ,k ) );
8881  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8882  const SIMDType a3( A.load(i ,k+1UL) );
8883  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8884  const SIMDType b1( set( B(k ,j ) ) );
8885  const SIMDType b2( set( B(k ,j+1UL) ) );
8886  const SIMDType b3( set( B(k+1UL,j ) ) );
8887  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
8888  xmm1 += a1 * b1;
8889  xmm2 += a2 * b1;
8890  xmm3 += a1 * b2;
8891  xmm4 += a2 * b2;
8892  xmm5 += a3 * b3;
8893  xmm6 += a4 * b3;
8894  xmm7 += a3 * b4;
8895  xmm8 += a4 * b4;
8896  }
8897 
8898  for( ; k<kend; ++k ) {
8899  const SIMDType a1( A.load(i ,k) );
8900  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8901  const SIMDType b1( set( B(k,j ) ) );
8902  const SIMDType b2( set( B(k,j+1UL) ) );
8903  xmm1 += a1 * b1;
8904  xmm2 += a2 * b1;
8905  xmm3 += a1 * b2;
8906  xmm4 += a2 * b2;
8907  }
8908 
8909  C.store( i , j , (xmm1+xmm5) * factor );
8910  C.store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
8911  C.store( i , j+1UL, (xmm3+xmm7) * factor );
8912  C.store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
8913  }
8914 
8915  if( j < jend )
8916  {
8917  const size_t kbegin( ( IsLower_v<MT5> )
8918  ?( ( IsUpper_v<MT4> )
8919  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8920  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8921  :( IsUpper_v<MT4> ? i : 0UL ) );
8922  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
8923 
8924  SIMDType xmm1, xmm2, xmm3, xmm4;
8925  size_t k( kbegin );
8926 
8927  for( ; (k+2UL) <= kend; k+=2UL ) {
8928  const SIMDType b1( set( B(k ,j) ) );
8929  const SIMDType b2( set( B(k+1UL,j) ) );
8930  xmm1 += A.load(i ,k ) * b1;
8931  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8932  xmm3 += A.load(i ,k+1UL) * b2;
8933  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8934  }
8935 
8936  for( ; k<kend; ++k ) {
8937  const SIMDType b1( set( B(k,j) ) );
8938  xmm1 += A.load(i ,k) * b1;
8939  xmm2 += A.load(i+SIMDSIZE,k) * b1;
8940  }
8941 
8942  C.store( i , j, (xmm1+xmm3) * factor );
8943  C.store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
8944  }
8945  }
8946 
8947  for( ; i<ipos; i+=SIMDSIZE )
8948  {
8949  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
8950  size_t j( UPP ? i : 0UL );
8951 
8952  for( ; (j+4UL) <= jend; j+=4UL )
8953  {
8954  const size_t kbegin( ( IsLower_v<MT5> )
8955  ?( ( IsUpper_v<MT4> )
8956  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8957  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8958  :( IsUpper_v<MT4> ? i : 0UL ) );
8959  const size_t kend( ( IsUpper_v<MT5> )
8960  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
8961  :( K ) );
8962 
8963  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8964  size_t k( kbegin );
8965 
8966  for( ; (k+2UL) <= kend; k+=2UL ) {
8967  const SIMDType a1( A.load(i,k ) );
8968  const SIMDType a2( A.load(i,k+1UL) );
8969  xmm1 += a1 * set( B(k ,j ) );
8970  xmm2 += a1 * set( B(k ,j+1UL) );
8971  xmm3 += a1 * set( B(k ,j+2UL) );
8972  xmm4 += a1 * set( B(k ,j+3UL) );
8973  xmm5 += a2 * set( B(k+1UL,j ) );
8974  xmm6 += a2 * set( B(k+1UL,j+1UL) );
8975  xmm7 += a2 * set( B(k+1UL,j+2UL) );
8976  xmm8 += a2 * set( B(k+1UL,j+3UL) );
8977  }
8978 
8979  for( ; k<kend; ++k ) {
8980  const SIMDType a1( A.load(i,k) );
8981  xmm1 += a1 * set( B(k,j ) );
8982  xmm2 += a1 * set( B(k,j+1UL) );
8983  xmm3 += a1 * set( B(k,j+2UL) );
8984  xmm4 += a1 * set( B(k,j+3UL) );
8985  }
8986 
8987  C.store( i, j , (xmm1+xmm5) * factor );
8988  C.store( i, j+1UL, (xmm2+xmm6) * factor );
8989  C.store( i, j+2UL, (xmm3+xmm7) * factor );
8990  C.store( i, j+3UL, (xmm4+xmm8) * factor );
8991  }
8992 
8993  for( ; (j+3UL) <= jend; j+=3UL )
8994  {
8995  const size_t kbegin( ( IsLower_v<MT5> )
8996  ?( ( IsUpper_v<MT4> )
8997  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8998  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8999  :( IsUpper_v<MT4> ? i : 0UL ) );
9000  const size_t kend( ( IsUpper_v<MT5> )
9001  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
9002  :( K ) );
9003 
9004  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9005  size_t k( kbegin );
9006 
9007  for( ; (k+2UL) <= kend; k+=2UL ) {
9008  const SIMDType a1( A.load(i,k ) );
9009  const SIMDType a2( A.load(i,k+1UL) );
9010  xmm1 += a1 * set( B(k ,j ) );
9011  xmm2 += a1 * set( B(k ,j+1UL) );
9012  xmm3 += a1 * set( B(k ,j+2UL) );
9013  xmm4 += a2 * set( B(k+1UL,j ) );
9014  xmm5 += a2 * set( B(k+1UL,j+1UL) );
9015  xmm6 += a2 * set( B(k+1UL,j+2UL) );
9016  }
9017 
9018  for( ; k<kend; ++k ) {
9019  const SIMDType a1( A.load(i,k) );
9020  xmm1 += a1 * set( B(k,j ) );
9021  xmm2 += a1 * set( B(k,j+1UL) );
9022  xmm3 += a1 * set( B(k,j+2UL) );
9023  }
9024 
9025  C.store( i, j , (xmm1+xmm4) * factor );
9026  C.store( i, j+1UL, (xmm2+xmm5) * factor );
9027  C.store( i, j+2UL, (xmm3+xmm6) * factor );
9028  }
9029 
9030  for( ; (j+2UL) <= jend; j+=2UL )
9031  {
9032  const size_t kbegin( ( IsLower_v<MT5> )
9033  ?( ( IsUpper_v<MT4> )
9034  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9035  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9036  :( IsUpper_v<MT4> ? i : 0UL ) );
9037  const size_t kend( ( IsUpper_v<MT5> )
9038  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
9039  :( K ) );
9040 
9041  SIMDType xmm1, xmm2, xmm3, xmm4;
9042  size_t k( kbegin );
9043 
9044  for( ; k<kend; ++k ) {
9045  const SIMDType a1( A.load(i,k) );
9046  xmm1 += a1 * set( B(k,j ) );
9047  xmm2 += a1 * set( B(k,j+1UL) );
9048  }
9049 
9050  for( ; (k+2UL) <= kend; k+=2UL ) {
9051  const SIMDType a1( A.load(i,k ) );
9052  const SIMDType a2( A.load(i,k+1UL) );
9053  xmm1 += a1 * set( B(k ,j ) );
9054  xmm2 += a1 * set( B(k ,j+1UL) );
9055  xmm3 += a2 * set( B(k+1UL,j ) );
9056  xmm4 += a2 * set( B(k+1UL,j+1UL) );
9057  }
9058 
9059  C.store( i, j , (xmm1+xmm3) * factor );
9060  C.store( i, j+1UL, (xmm2+xmm4) * factor );
9061  }
9062 
9063  if( j < jend )
9064  {
9065  const size_t kbegin( ( IsLower_v<MT5> )
9066  ?( ( IsUpper_v<MT4> )
9067  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9068  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9069  :( IsUpper_v<MT4> ? i : 0UL ) );
9070 
9071  SIMDType xmm1, xmm2;
9072  size_t k( kbegin );
9073 
9074  for( ; (k+2UL) <= K; k+=2UL ) {
9075  xmm1 += A.load(i,k ) * set( B(k ,j) );
9076  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
9077  }
9078 
9079  for( ; k<K; ++k ) {
9080  xmm1 += A.load(i,k) * set( B(k,j) );
9081  }
9082 
9083  C.store( i, j, (xmm1+xmm2) * factor );
9084  }
9085  }
9086 
9087  for( ; remainder && i<M; ++i )
9088  {
9089  size_t j( LOW && UPP ? i : 0UL );
9090 
9091  for( ; (j+2UL) <= N; j+=2UL )
9092  {
9093  const size_t kbegin( ( IsLower_v<MT5> )
9094  ?( ( IsUpper_v<MT4> )
9095  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9096  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9097  :( IsUpper_v<MT4> ? i : 0UL ) );
9098  const size_t kend( ( IsUpper_v<MT5> )
9099  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
9100  :( K ) );
9101 
9102  ElementType value1{};
9103  ElementType value2{};
9104 
9105  for( size_t k=kbegin; k<kend; ++k ) {
9106  value1 += A(i,k) * B(k,j );
9107  value2 += A(i,k) * B(k,j+1UL);
9108  }
9109 
9110  C(i,j ) = value1 * scalar;
9111  C(i,j+1UL) = value2 * scalar;
9112  }
9113 
9114  if( j < N )
9115  {
9116  const size_t kbegin( ( IsLower_v<MT5> )
9117  ?( ( IsUpper_v<MT4> )
9118  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9119  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9120  :( IsUpper_v<MT4> ? i : 0UL ) );
9121 
9122  ElementType value{};
9123 
9124  for( size_t k=kbegin; k<K; ++k ) {
9125  value += A(i,k) * B(k,j);
9126  }
9127 
9128  C(i,j) = value * scalar;
9129  }
9130  }
9131  }
9132 
9133  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
9134  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
9135  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
9136  for( size_t i=0UL; i<iend; ++i ) {
9137  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
9138  }
9139  }
9140  }
9141  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
9142  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
9143  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
9144  for( size_t i=0UL; i<iend; ++i ) {
9145  reset( C(i,j) );
9146  }
9147  }
9148  }
9149  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
9150  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
9151  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
9152  for( size_t j=0UL; j<jend; ++j ) {
9153  reset( C(i,j) );
9154  }
9155  }
9156  }
9157  }
9158  //**********************************************************************************************
9159 
9160  //**Default assignment to dense matrices (large matrices)***************************************
9174  template< typename MT3 // Type of the left-hand side target matrix
9175  , typename MT4 // Type of the left-hand side matrix operand
9176  , typename MT5 // Type of the right-hand side matrix operand
9177  , typename ST2 > // Type of the scalar value
9178  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9179  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9180  {
9181  selectDefaultAssignKernel( C, A, B, scalar );
9182  }
9183  //**********************************************************************************************
9184 
9185  //**Vectorized default assignment to dense matrices (large matrices)****************************
9200  template< typename MT3 // Type of the left-hand side target matrix
9201  , typename MT4 // Type of the left-hand side matrix operand
9202  , typename MT5 // Type of the right-hand side matrix operand
9203  , typename ST2 > // Type of the scalar value
9204  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9205  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9206  {
9207  if( SYM )
9208  smmm( C, A, B, scalar );
9209  else if( HERM )
9210  hmmm( C, A, B, scalar );
9211  else if( LOW )
9212  lmmm( C, A, B, scalar, ST2(0) );
9213  else if( UPP )
9214  ummm( C, A, B, scalar, ST2(0) );
9215  else
9216  mmm( C, A, B, scalar, ST2(0) );
9217  }
9218  //**********************************************************************************************
9219 
9220  //**BLAS-based assignment to dense matrices (default)*******************************************
9234  template< typename MT3 // Type of the left-hand side target matrix
9235  , typename MT4 // Type of the left-hand side matrix operand
9236  , typename MT5 // Type of the right-hand side matrix operand
9237  , typename ST2 > // Type of the scalar value
9238  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9239  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9240  {
9241  selectLargeAssignKernel( C, A, B, scalar );
9242  }
9243  //**********************************************************************************************
9244 
9245  //**BLAS-based assignment to dense matrices*****************************************************
9246 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
9247 
9260  template< typename MT3 // Type of the left-hand side target matrix
9261  , typename MT4 // Type of the left-hand side matrix operand
9262  , typename MT5 // Type of the right-hand side matrix operand
9263  , typename ST2 > // Type of the scalar value
9264  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9265  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9266  {
9267  using ET = ElementType_t<MT3>;
9268 
9269  if( IsTriangular_v<MT4> ) {
9270  assign( C, B );
9271  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
9272  }
9273  else if( IsTriangular_v<MT5> ) {
9274  assign( C, A );
9275  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
9276  }
9277  else {
9278  gemm( C, A, B, ET(scalar), ET(0) );
9279  }
9280  }
9281 #endif
9282  //**********************************************************************************************
9283 
9284  //**Assignment to sparse matrices***************************************************************
9296  template< typename MT // Type of the target sparse matrix
9297  , bool SO > // Storage order of the target sparse matrix
9298  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9299  {
9301 
9302  using TmpType = If_t< SO, ResultType, OppositeType >;
9303 
9310 
9311  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9312  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9313 
9314  const ForwardFunctor fwd;
9315 
9316  const TmpType tmp( serial( rhs ) );
9317  assign( ~lhs, fwd( tmp ) );
9318  }
9319  //**********************************************************************************************
9320 
9321  //**Addition assignment to dense matrices*******************************************************
9333  template< typename MT // Type of the target dense matrix
9334  , bool SO > // Storage order of the target dense matrix
9335  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9336  {
9338 
9339  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9340  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9341 
9342  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9343  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9344 
9345  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
9346  return;
9347  }
9348 
9349  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
9350  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
9351 
9352  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9353  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9354  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9355  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9356  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
9357  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
9358 
9359  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
9360  }
9361  //**********************************************************************************************
9362 
9363  //**Addition assignment to dense matrices (kernel selection)************************************
9374  template< typename MT3 // Type of the left-hand side target matrix
9375  , typename MT4 // Type of the left-hand side matrix operand
9376  , typename MT5 // Type of the right-hand side matrix operand
9377  , typename ST2 > // Type of the scalar value
9378  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9379  {
9380  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
9381  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
9382  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
9383  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9384  selectSmallAddAssignKernel( C, A, B, scalar );
9385  else
9386  selectBlasAddAssignKernel( C, A, B, scalar );
9387  }
9388  //**********************************************************************************************
9389 
9390  //**Default addition assignment to dense matrices (general/general)*****************************
9404  template< typename MT3 // Type of the left-hand side target matrix
9405  , typename MT4 // Type of the left-hand side matrix operand
9406  , typename MT5 // Type of the right-hand side matrix operand
9407  , typename ST2 > // Type of the scalar value
9408  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9409  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9410  {
9411  const ResultType tmp( serial( A * B * scalar ) );
9412  addAssign( C, tmp );
9413  }
9414  //**********************************************************************************************
9415 
9416  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
9430  template< typename MT3 // Type of the left-hand side target matrix
9431  , typename MT4 // Type of the left-hand side matrix operand
9432  , typename MT5 // Type of the right-hand side matrix operand
9433  , typename ST2 > // Type of the scalar value
9434  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9435  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9436  {
9437  constexpr size_t block( BLOCK_SIZE );
9438 
9439  const size_t M( A.rows() );
9440  const size_t N( B.columns() );
9441 
9442  for( size_t ii=0UL; ii<M; ii+=block ) {
9443  const size_t iend( min( M, ii+block ) );
9444  for( size_t jj=0UL; jj<N; jj+=block ) {
9445  const size_t jend( min( N, jj+block ) );
9446  for( size_t i=ii; i<iend; ++i )
9447  {
9448  const size_t jbegin( ( IsUpper_v<MT4> )
9449  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
9450  :( jj ) );
9451  const size_t jpos( ( IsLower_v<MT4> )
9452  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
9453  :( jend ) );
9454 
9455  for( size_t j=jbegin; j<jpos; ++j ) {
9456  C(i,j) += A(i,j) * B(j,j) * scalar;
9457  }
9458  }
9459  }
9460  }
9461  }
9462  //**********************************************************************************************
9463 
9464  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
9478  template< typename MT3 // Type of the left-hand side target matrix
9479  , typename MT4 // Type of the left-hand side matrix operand
9480  , typename MT5 // Type of the right-hand side matrix operand
9481  , typename ST2 > // Type of the scalar value
9482  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9483  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9484  {
9485  const size_t M( A.rows() );
9486  const size_t N( B.columns() );
9487 
9488  for( size_t j=0UL; j<N; ++j )
9489  {
9490  const size_t ibegin( ( IsLower_v<MT4> )
9491  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
9492  :( 0UL ) );
9493  const size_t iend( ( IsUpper_v<MT4> )
9494  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
9495  :( M ) );
9496  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
9497 
9498  const size_t inum( iend - ibegin );
9499  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
9500 
9501  for( size_t i=ibegin; i<ipos; i+=2UL ) {
9502  C(i ,j) += A(i ,j) * B(j,j) * scalar;
9503  C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
9504  }
9505  if( ipos < iend ) {
9506  C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
9507  }
9508  }
9509  }
9510  //**********************************************************************************************
9511 
9512  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
9526  template< typename MT3 // Type of the left-hand side target matrix
9527  , typename MT4 // Type of the left-hand side matrix operand
9528  , typename MT5 // Type of the right-hand side matrix operand
9529  , typename ST2 > // Type of the scalar value
9530  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9531  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9532  {
9533  const size_t M( A.rows() );
9534  const size_t N( B.columns() );
9535 
9536  for( size_t i=0UL; i<M; ++i )
9537  {
9538  const size_t jbegin( ( IsUpper_v<MT5> )
9539  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
9540  :( 0UL ) );
9541  const size_t jend( ( IsLower_v<MT5> )
9542  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
9543  :( N ) );
9544  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
9545 
9546  const size_t jnum( jend - jbegin );
9547  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
9548 
9549  for( size_t j=jbegin; j<jpos; j+=2UL ) {
9550  C(i,j ) += A(i,i) * B(i,j ) * scalar;
9551  C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
9552  }
9553  if( jpos < jend ) {
9554  C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
9555  }
9556  }
9557  }
9558  //**********************************************************************************************
9559 
9560  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
9574  template< typename MT3 // Type of the left-hand side target matrix
9575  , typename MT4 // Type of the left-hand side matrix operand
9576  , typename MT5 // Type of the right-hand side matrix operand
9577  , typename ST2 > // Type of the scalar value
9578  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9579  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9580  {
9581  constexpr size_t block( BLOCK_SIZE );
9582 
9583  const size_t M( A.rows() );
9584  const size_t N( B.columns() );
9585 
9586  for( size_t jj=0UL; jj<N; jj+=block ) {
9587  const size_t jend( min( N, jj+block ) );
9588  for( size_t ii=0UL; ii<M; ii+=block ) {
9589  const size_t iend( min( M, ii+block ) );
9590  for( size_t j=jj; j<jend; ++j )
9591  {
9592  const size_t ibegin( ( IsLower_v<MT5> )
9593  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
9594  :( ii ) );
9595  const size_t ipos( ( IsUpper_v<MT5> )
9596  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
9597  :( iend ) );
9598 
9599  for( size_t i=ibegin; i<ipos; ++i ) {
9600  C(i,j) += A(i,i) * B(i,j) * scalar;
9601  }
9602  }
9603  }
9604  }
9605  }
9606  //**********************************************************************************************
9607 
9608  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
9622  template< typename MT3 // Type of the left-hand side target matrix
9623  , typename MT4 // Type of the left-hand side matrix operand
9624  , typename MT5 // Type of the right-hand side matrix operand
9625  , typename ST2 > // Type of the scalar value
9626  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9627  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9628  {
9629  for( size_t i=0UL; i<A.rows(); ++i ) {
9630  C(i,i) += A(i,i) * B(i,i) * scalar;
9631  }
9632  }
9633  //**********************************************************************************************
9634 
9635  //**Default addition assignment to dense matrices (small matrices)******************************
9649  template< typename MT3 // Type of the left-hand side target matrix
9650  , typename MT4 // Type of the left-hand side matrix operand
9651  , typename MT5 // Type of the right-hand side matrix operand
9652  , typename ST2 > // Type of the scalar value
9653  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9654  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9655  {
9656  selectDefaultAddAssignKernel( C, A, B, scalar );
9657  }
9658  //**********************************************************************************************
9659 
9660  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
9675  template< typename MT3 // Type of the left-hand side target matrix
9676  , typename MT4 // Type of the left-hand side matrix operand
9677  , typename MT5 // Type of the right-hand side matrix operand
9678  , typename ST2 > // Type of the scalar value
9679  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9680  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9681  {
9682  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
9683 
9684  const size_t M( A.rows() );
9685  const size_t N( B.columns() );
9686  const size_t K( A.columns() );
9687 
9688  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
9689 
9690  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
9691  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
9692 
9693  const SIMDType factor( set( scalar ) );
9694 
9695  size_t j( 0UL );
9696 
9697  if( IsIntegral_v<ElementType> )
9698  {
9699  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
9700  for( size_t i=0UL; i<M; ++i )
9701  {
9702  const size_t kbegin( ( IsUpper_v<MT4> )
9703  ?( ( IsLower_v<MT5> )
9704  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9705  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9706  :( IsLower_v<MT5> ? j : 0UL ) );
9707  const size_t kend( ( IsLower_v<MT4> )
9708  ?( ( IsUpper_v<MT5> )
9709  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
9710  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
9711  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
9712 
9713  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9714 
9715  for( size_t k=kbegin; k<kend; ++k ) {
9716  const SIMDType a1( set( A(i,k) ) );
9717  xmm1 += a1 * B.load(k,j );
9718  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9719  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9720  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9721  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9722  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
9723  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
9724  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
9725  }
9726 
9727  C.store( i, j , C.load(i,j ) + xmm1 * factor );
9728  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
9729  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9730  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9731  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
9732  C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
9733  C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
9734  C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
9735  }
9736  }
9737  }
9738 
9739  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
9740  {
9741  size_t i( 0UL );
9742 
9743  for( ; (i+2UL) <= M; i+=2UL )
9744  {
9745  const size_t kbegin( ( IsUpper_v<MT4> )
9746  ?( ( IsLower_v<MT5> )
9747  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9748  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9749  :( IsLower_v<MT5> ? j : 0UL ) );
9750  const size_t kend( ( IsLower_v<MT4> )
9751  ?( ( IsUpper_v<MT5> )
9752  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
9753  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
9754  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
9755 
9756  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
9757 
9758  for( size_t k=kbegin; k<kend; ++k ) {
9759  const SIMDType a1( set( A(i ,k) ) );
9760  const SIMDType a2( set( A(i+1UL,k) ) );
9761  const SIMDType b1( B.load(k,j ) );
9762  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9763  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9764  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9765  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
9766  xmm1 += a1 * b1;
9767  xmm2 += a1 * b2;
9768  xmm3 += a1 * b3;
9769  xmm4 += a1 * b4;
9770  xmm5 += a1 * b5;
9771  xmm6 += a2 * b1;
9772  xmm7 += a2 * b2;
9773  xmm8 += a2 * b3;
9774  xmm9 += a2 * b4;
9775  xmm10 += a2 * b5;
9776  }
9777 
9778  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
9779  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
9780  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9781  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
9782  C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
9783  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
9784  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
9785  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
9786  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
9787  C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
9788  }
9789 
9790  if( i < M )
9791  {
9792  const size_t kbegin( ( IsUpper_v<MT4> )
9793  ?( ( IsLower_v<MT5> )
9794  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9795  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9796  :( IsLower_v<MT5> ? j : 0UL ) );
9797  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
9798 
9799  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
9800 
9801  for( size_t k=kbegin; k<kend; ++k ) {
9802  const SIMDType a1( set( A(i,k) ) );
9803  xmm1 += a1 * B.load(k,j );
9804  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9805  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9806  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9807  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9808  }
9809 
9810  C.store( i, j , C.load(i,j ) + xmm1 * factor );
9811  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
9812  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9813  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9814  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
9815  }
9816  }
9817 
9818  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
9819  {
9820  size_t i( 0UL );
9821 
9822  for( ; (i+2UL) <= M; i+=2UL )
9823  {
9824  const size_t kbegin( ( IsUpper_v<MT4> )
9825  ?( ( IsLower_v<MT5> )
9826  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9827  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9828  :( IsLower_v<MT5> ? j : 0UL ) );
9829  const size_t kend( ( IsLower_v<MT4> )
9830  ?( ( IsUpper_v<MT5> )
9831  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
9832  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
9833  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
9834 
9835  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9836 
9837  for( size_t k=kbegin; k<kend; ++k ) {
9838  const SIMDType a1( set( A(i ,k) ) );
9839  const SIMDType a2( set( A(i+1UL,k) ) );
9840  const SIMDType b1( B.load(k,j ) );
9841  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9842  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9843  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9844  xmm1 += a1 * b1;
9845  xmm2 += a1 * b2;
9846  xmm3 += a1 * b3;
9847  xmm4 += a1 * b4;
9848  xmm5 += a2 * b1;
9849  xmm6 += a2 * b2;
9850  xmm7 += a2 * b3;
9851  xmm8 += a2 * b4;
9852  }
9853 
9854  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
9855  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
9856  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9857  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
9858  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
9859  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
9860  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
9861  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
9862  }
9863 
9864  if( i < M )
9865  {
9866  const size_t kbegin( ( IsUpper_v<MT4> )
9867  ?( ( IsLower_v<MT5> )
9868  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9869  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9870  :( IsLower_v<MT5> ? j : 0UL ) );
9871  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
9872 
9873  SIMDType xmm1, xmm2, xmm3, xmm4;
9874 
9875  for( size_t k=kbegin; k<kend; ++k ) {
9876  const SIMDType a1( set( A(i,k) ) );
9877  xmm1 += a1 * B.load(k,j );
9878  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9879  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9880  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9881  }
9882 
9883  C.store( i, j , C.load(i,j ) + xmm1 * factor );
9884  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
9885  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9886  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9887  }
9888  }
9889 
9890  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
9891  {
9892  size_t i( 0UL );
9893 
9894  for( ; (i+2UL) <= M; i+=2UL )
9895  {
9896  const size_t kbegin( ( IsUpper_v<MT4> )
9897  ?( ( IsLower_v<MT5> )
9898  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9899  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9900  :( IsLower_v<MT5> ? j : 0UL ) );
9901  const size_t kend( ( IsLower_v<MT4> )
9902  ?( ( IsUpper_v<MT5> )
9903  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
9904  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
9905  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
9906 
9907  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9908 
9909  for( size_t k=kbegin; k<kend; ++k ) {
9910  const SIMDType a1( set( A(i ,k) ) );
9911  const SIMDType a2( set( A(i+1UL,k) ) );
9912  const SIMDType b1( B.load(k,j ) );
9913  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9914  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9915  xmm1 += a1 * b1;
9916  xmm2 += a1 * b2;
9917  xmm3 += a1 * b3;
9918  xmm4 += a2 * b1;
9919  xmm5 += a2 * b2;
9920  xmm6 += a2 * b3;
9921  }
9922 
9923  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
9924  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
9925  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9926  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
9927  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
9928  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
9929  }
9930 
9931  if( i < M )
9932  {
9933  const size_t kbegin( ( IsUpper_v<MT4> )
9934  ?( ( IsLower_v<MT5> )
9935  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9936  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9937  :( IsLower_v<MT5> ? j : 0UL ) );
9938  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
9939 
9940  SIMDType xmm1, xmm2, xmm3;
9941 
9942  for( size_t k=kbegin; k<kend; ++k ) {
9943  const SIMDType a1( set( A(i,k) ) );
9944  xmm1 += a1 * B.load(k,j );
9945  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9946  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9947  }
9948 
9949  C.store( i, j , C.load(i,j ) + xmm1 * factor );
9950  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
9951  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9952  }
9953  }
9954 
9955  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
9956  {
9957  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
9958  size_t i( LOW ? j : 0UL );
9959 
9960  for( ; (i+4UL) <= iend; i+=4UL )
9961  {
9962  const size_t kbegin( ( IsUpper_v<MT4> )
9963  ?( ( IsLower_v<MT5> )
9964  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9965  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9966  :( IsLower_v<MT5> ? j : 0UL ) );
9967  const size_t kend( ( IsLower_v<MT4> )
9968  ?( ( IsUpper_v<MT5> )
9969  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
9970  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
9971  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
9972 
9973  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9974 
9975  for( size_t k=kbegin; k<kend; ++k ) {
9976  const SIMDType a1( set( A(i ,k) ) );
9977  const SIMDType a2( set( A(i+1UL,k) ) );
9978  const SIMDType a3( set( A(i+2UL,k) ) );
9979  const SIMDType a4( set( A(i+3UL,k) ) );
9980  const SIMDType b1( B.load(k,j ) );
9981  const SIMDType b2( B.load(k,j+SIMDSIZE) );
9982  xmm1 += a1 * b1;
9983  xmm2 += a1 * b2;
9984  xmm3 += a2 * b1;
9985  xmm4 += a2 * b2;
9986  xmm5 += a3 * b1;
9987  xmm6 += a3 * b2;
9988  xmm7 += a4 * b1;
9989  xmm8 += a4 * b2;
9990  }
9991 
9992  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
9993  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
9994  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
9995  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
9996  C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
9997  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
9998  C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
9999  C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
10000  }
10001 
10002  for( ; (i+3UL) <= iend; i+=3UL )
10003  {
10004  const size_t kbegin( ( IsUpper_v<MT4> )
10005  ?( ( IsLower_v<MT5> )
10006  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10007  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10008  :( IsLower_v<MT5> ? j : 0UL ) );
10009  const size_t kend( ( IsLower_v<MT4> )
10010  ?( ( IsUpper_v<MT5> )
10011  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
10012  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
10013  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
10014 
10015  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10016 
10017  for( size_t k=kbegin; k<kend; ++k ) {
10018  const SIMDType a1( set( A(i ,k) ) );
10019  const SIMDType a2( set( A(i+1UL,k) ) );
10020  const SIMDType a3( set( A(i+2UL,k) ) );
10021  const SIMDType b1( B.load(k,j ) );
10022  const SIMDType b2( B.load(k,j+SIMDSIZE) );
10023  xmm1 += a1 * b1;
10024  xmm2 += a1 * b2;
10025  xmm3 += a2 * b1;
10026  xmm4 += a2 * b2;
10027  xmm5 += a3 * b1;
10028  xmm6 += a3 * b2;
10029  }
10030 
10031  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10032  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
10033  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
10034  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
10035  C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
10036  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
10037  }
10038 
10039  for( ; (i+2UL) <= iend; i+=2UL )
10040  {
10041  const size_t kbegin( ( IsUpper_v<MT4> )
10042  ?( ( IsLower_v<MT5> )
10043  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10044  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10045  :( IsLower_v<MT5> ? j : 0UL ) );
10046  const size_t kend( ( IsLower_v<MT4> )
10047  ?( ( IsUpper_v<MT5> )
10048  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
10049  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10050  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
10051 
10052  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10053  size_t k( kbegin );
10054 
10055  for( ; (k+2UL) <= kend; k+=2UL ) {
10056  const SIMDType a1( set( A(i ,k ) ) );
10057  const SIMDType a2( set( A(i+1UL,k ) ) );
10058  const SIMDType a3( set( A(i ,k+1UL) ) );
10059  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
10060  const SIMDType b1( B.load(k ,j ) );
10061  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
10062  const SIMDType b3( B.load(k+1UL,j ) );
10063  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
10064  xmm1 += a1 * b1;
10065  xmm2 += a1 * b2;
10066  xmm3 += a2 * b1;
10067  xmm4 += a2 * b2;
10068  xmm5 += a3 * b3;
10069  xmm6 += a3 * b4;
10070  xmm7 += a4 * b3;
10071  xmm8 += a4 * b4;
10072  }
10073 
10074  for( ; k<kend; ++k ) {
10075  const SIMDType a1( set( A(i ,k) ) );
10076  const SIMDType a2( set( A(i+1UL,k) ) );
10077  const SIMDType b1( B.load(k,j ) );
10078  const SIMDType b2( B.load(k,j+SIMDSIZE) );
10079  xmm1 += a1 * b1;
10080  xmm2 += a1 * b2;
10081  xmm3 += a2 * b1;
10082  xmm4 += a2 * b2;
10083  }
10084 
10085  C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
10086  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + (xmm2+xmm6) * factor );
10087  C.store( i+1UL, j , C.load(i+1UL,j ) + (xmm3+xmm7) * factor );
10088  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + (xmm4+xmm8) * factor );
10089  }
10090 
10091  if( i < iend )
10092  {
10093  const size_t kbegin( ( IsUpper_v<MT4> )
10094  ?( ( IsLower_v<MT5> )
10095  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10096  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10097  :( IsLower_v<MT5> ? j : 0UL ) );
10098  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
10099 
10100  SIMDType xmm1, xmm2, xmm3, xmm4;
10101  size_t k( kbegin );
10102 
10103  for( ; (k+2UL) <= kend; k+=2UL ) {
10104  const SIMDType a1( set( A(i,k ) ) );
10105  const SIMDType a2( set( A(i,k+1UL) ) );
10106  xmm1 += a1 * B.load(k ,j );
10107  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
10108  xmm3 += a2 * B.load(k+1UL,j );
10109  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
10110  }
10111 
10112  for( ; k<kend; ++k ) {
10113  const SIMDType a1( set( A(i,k) ) );
10114  xmm1 += a1 * B.load(k,j );
10115  xmm2 += a1 * B.load(k,j+SIMDSIZE);
10116  }
10117 
10118  C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
10119  C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) + (xmm2+xmm4) * factor );
10120  }
10121  }
10122 
10123  for( ; j<jpos; j+=SIMDSIZE )
10124  {
10125  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
10126  size_t i( LOW ? j : 0UL );
10127 
10128  for( ; (i+4UL) <= iend; i+=4UL )
10129  {
10130  const size_t kbegin( ( IsUpper_v<MT4> )
10131  ?( ( IsLower_v<MT5> )
10132  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10133  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10134  :( IsLower_v<MT5> ? j : 0UL ) );
10135  const size_t kend( ( IsLower_v<MT4> )
10136  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
10137  :( K ) );
10138 
10139  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10140  size_t k( kbegin );
10141 
10142  for( ; (k+2UL) <= kend; k+=2UL ) {
10143  const SIMDType b1( B.load(k ,j) );
10144  const SIMDType b2( B.load(k+1UL,j) );
10145  xmm1 += set( A(i ,k ) ) * b1;
10146  xmm2 += set( A(i+1UL,k ) ) * b1;
10147  xmm3 += set( A(i+2UL,k ) ) * b1;
10148  xmm4 += set( A(i+3UL,k ) ) * b1;
10149  xmm5 += set( A(i ,k+1UL) ) * b2;
10150  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
10151  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
10152  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
10153  }
10154 
10155  for( ; k<kend; ++k ) {
10156  const SIMDType b1( B.load(k,j) );
10157  xmm1 += set( A(i ,k) ) * b1;
10158  xmm2 += set( A(i+1UL,k) ) * b1;
10159  xmm3 += set( A(i+2UL,k) ) * b1;
10160  xmm4 += set( A(i+3UL,k) ) * b1;
10161  }
10162 
10163  C.store( i , j, C.load(i ,j) + (xmm1+xmm5) * factor );
10164  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm6) * factor );
10165  C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm7) * factor );
10166  C.store( i+3UL, j, C.load(i+3UL,j) + (xmm4+xmm8) * factor );
10167  }
10168 
10169  for( ; (i+3UL) <= iend; i+=3UL )
10170  {
10171  const size_t kbegin( ( IsUpper_v<MT4> )
10172  ?( ( IsLower_v<MT5> )
10173  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10174  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10175  :( IsLower_v<MT5> ? j : 0UL ) );
10176  const size_t kend( ( IsLower_v<MT4> )
10177  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
10178  :( K ) );
10179 
10180  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10181  size_t k( kbegin );
10182 
10183  for( ; (k+2UL) <= kend; k+=2UL ) {
10184  const SIMDType b1( B.load(k ,j) );
10185  const SIMDType b2( B.load(k+1UL,j) );
10186  xmm1 += set( A(i ,k ) ) * b1;
10187  xmm2 += set( A(i+1UL,k ) ) * b1;
10188  xmm3 += set( A(i+2UL,k ) ) * b1;
10189  xmm4 += set( A(i ,k+1UL) ) * b2;
10190  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
10191  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
10192  }
10193 
10194  for( ; k<kend; ++k ) {
10195  const SIMDType b1( B.load(k,j) );
10196  xmm1 += set( A(i ,k) ) * b1;
10197  xmm2 += set( A(i+1UL,k) ) * b1;
10198  xmm3 += set( A(i+2UL,k) ) * b1;
10199  }
10200 
10201  C.store( i , j, C.load(i ,j) + (xmm1+xmm4) * factor );
10202  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm5) * factor );
10203  C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm6) * factor );
10204  }
10205 
10206  for( ; (i+2UL) <= iend; i+=2UL )
10207  {
10208  const size_t kbegin( ( IsUpper_v<MT4> )
10209  ?( ( IsLower_v<MT5> )
10210  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10211  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10212  :( IsLower_v<MT5> ? j : 0UL ) );
10213  const size_t kend( ( IsLower_v<MT4> )
10214  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
10215  :( K ) );
10216 
10217  SIMDType xmm1, xmm2, xmm3, xmm4;
10218  size_t k( kbegin );
10219 
10220  for( ; (k+2UL) <= kend; k+=2UL ) {
10221  const SIMDType b1( B.load(k ,j) );
10222  const SIMDType b2( B.load(k+1UL,j) );
10223  xmm1 += set( A(i ,k ) ) * b1;
10224  xmm2 += set( A(i+1UL,k ) ) * b1;
10225  xmm3 += set( A(i ,k+1UL) ) * b2;
10226  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
10227  }
10228 
10229  for( ; k<kend; ++k ) {
10230  const SIMDType b1( B.load(k,j) );
10231  xmm1 += set( A(i ,k) ) * b1;
10232  xmm2 += set( A(i+1UL,k) ) * b1;
10233  }
10234 
10235  C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
10236  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm4) * factor );
10237  }
10238 
10239  if( i < iend )
10240  {
10241  const size_t kbegin( ( IsUpper_v<MT4> )
10242  ?( ( IsLower_v<MT5> )
10243  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10244  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10245  :( IsLower_v<MT5> ? j : 0UL ) );
10246 
10247  SIMDType xmm1, xmm2;
10248  size_t k( kbegin );
10249 
10250  for( ; (k+2UL) <= K; k+=2UL ) {
10251  xmm1 += set( A(i,k ) ) * B.load(k ,j);
10252  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
10253  }
10254 
10255  for( ; k<K; ++k ) {
10256  xmm1 += set( A(i,k) ) * B.load(k,j);
10257  }
10258 
10259  C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
10260  }
10261  }
10262 
10263  for( ; remainder && j<N; ++j )
10264  {
10265  const size_t iend( UPP ? j+1UL : M );
10266  size_t i( LOW ? j : 0UL );
10267 
10268  for( ; (i+2UL) <= iend; i+=2UL )
10269  {
10270  const size_t kbegin( ( IsUpper_v<MT4> )
10271  ?( ( IsLower_v<MT5> )
10272  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10273  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10274  :( IsLower_v<MT5> ? j : 0UL ) );
10275  const size_t kend( ( IsLower_v<MT4> )
10276  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
10277  :( K ) );
10278 
10279  ElementType value1{};
10280  ElementType value2{};
10281 
10282  for( size_t k=kbegin; k<kend; ++k ) {
10283  value1 += A(i ,k) * B(k,j);
10284  value2 += A(i+1UL,k) * B(k,j);
10285  }
10286 
10287  C(i ,j) += value1 * scalar;
10288  C(i+1UL,j) += value2 * scalar;
10289  }
10290 
10291  if( i < iend )
10292  {
10293  const size_t kbegin( ( IsUpper_v<MT4> )
10294  ?( ( IsLower_v<MT5> )
10295  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10296  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10297  :( IsLower_v<MT5> ? j : 0UL ) );
10298 
10299  ElementType value{};
10300 
10301  for( size_t k=kbegin; k<K; ++k ) {
10302  value += A(i,k) * B(k,j);
10303  }
10304 
10305  C(i,j) += value * scalar;
10306  }
10307  }
10308  }
10309  //**********************************************************************************************
10310 
10311  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
10326  template< typename MT3 // Type of the left-hand side target matrix
10327  , typename MT4 // Type of the left-hand side matrix operand
10328  , typename MT5 // Type of the right-hand side matrix operand
10329  , typename ST2 > // Type of the scalar value
10330  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10331  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10332  {
10333  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
10334 
10335  const size_t M( A.rows() );
10336  const size_t N( B.columns() );
10337  const size_t K( A.columns() );
10338 
10339  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
10340 
10341  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
10342  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
10343 
10344  const SIMDType factor( set( scalar ) );
10345 
10346  size_t i( 0UL );
10347 
10348  if( IsIntegral_v<ElementType> )
10349  {
10350  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
10351  for( size_t j=0UL; j<N; ++j )
10352  {
10353  const size_t kbegin( ( IsLower_v<MT5> )
10354  ?( ( IsUpper_v<MT4> )
10355  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10356  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10357  :( IsUpper_v<MT4> ? i : 0UL ) );
10358  const size_t kend( ( IsUpper_v<MT5> )
10359  ?( ( IsLower_v<MT4> )
10360  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
10361  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
10362  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
10363 
10364  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10365 
10366  for( size_t k=kbegin; k<kend; ++k ) {
10367  const SIMDType b1( set( B(k,j) ) );
10368  xmm1 += A.load(i ,k) * b1;
10369  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10370  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10371  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10372  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10373  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
10374  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
10375  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
10376  }
10377 
10378  C.store( i , j, C.load(i ,j) + xmm1 * factor );
10379  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
10380  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10381  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10382  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
10383  C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
10384  C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
10385  C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
10386  }
10387  }
10388  }
10389 
10390  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
10391  {
10392  size_t j( 0UL );
10393 
10394  for( ; (j+2UL) <= N; j+=2UL )
10395  {
10396  const size_t kbegin( ( IsLower_v<MT5> )
10397  ?( ( IsUpper_v<MT4> )
10398  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10399  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10400  :( IsUpper_v<MT4> ? i : 0UL ) );
10401  const size_t kend( ( IsUpper_v<MT5> )
10402  ?( ( IsLower_v<MT4> )
10403  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10404  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10405  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
10406 
10407  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
10408 
10409  for( size_t k=kbegin; k<kend; ++k ) {
10410  const SIMDType a1( A.load(i ,k) );
10411  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10412  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10413  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10414  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
10415  const SIMDType b1( set( B(k,j ) ) );
10416  const SIMDType b2( set( B(k,j+1UL) ) );
10417  xmm1 += a1 * b1;
10418  xmm2 += a2 * b1;
10419  xmm3 += a3 * b1;
10420  xmm4 += a4 * b1;
10421  xmm5 += a5 * b1;
10422  xmm6 += a1 * b2;
10423  xmm7 += a2 * b2;
10424  xmm8 += a3 * b2;
10425  xmm9 += a4 * b2;
10426  xmm10 += a5 * b2;
10427  }
10428 
10429  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10430  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
10431  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10432  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10433  C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
10434  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
10435  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
10436  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
10437  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
10438  C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
10439  }
10440 
10441  if( j < N )
10442  {
10443  const size_t kbegin( ( IsLower_v<MT5> )
10444  ?( ( IsUpper_v<MT4> )
10445  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10446  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10447  :( IsUpper_v<MT4> ? i : 0UL ) );
10448  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
10449 
10450  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
10451 
10452  for( size_t k=kbegin; k<kend; ++k ) {
10453  const SIMDType b1( set( B(k,j) ) );
10454  xmm1 += A.load(i ,k) * b1;
10455  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10456  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10457  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10458  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10459  }
10460 
10461  C.store( i , j, C.load(i ,j) + xmm1 * factor );
10462  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
10463  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10464  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10465  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
10466  }
10467  }
10468 
10469  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
10470  {
10471  size_t j( 0UL );
10472 
10473  for( ; (j+2UL) <= N; j+=2UL )
10474  {
10475  const size_t kbegin( ( IsLower_v<MT5> )
10476  ?( ( IsUpper_v<MT4> )
10477  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10478  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10479  :( IsUpper_v<MT4> ? i : 0UL ) );
10480  const size_t kend( ( IsUpper_v<MT5> )
10481  ?( ( IsLower_v<MT4> )
10482  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10483  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10484  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
10485 
10486  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10487 
10488  for( size_t k=kbegin; k<kend; ++k ) {
10489  const SIMDType a1( A.load(i ,k) );
10490  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10491  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10492  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10493  const SIMDType b1( set( B(k,j ) ) );
10494  const SIMDType b2( set( B(k,j+1UL) ) );
10495  xmm1 += a1 * b1;
10496  xmm2 += a2 * b1;
10497  xmm3 += a3 * b1;
10498  xmm4 += a4 * b1;
10499  xmm5 += a1 * b2;
10500  xmm6 += a2 * b2;
10501  xmm7 += a3 * b2;
10502  xmm8 += a4 * b2;
10503  }
10504 
10505  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10506  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
10507  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10508  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10509  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
10510  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
10511  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
10512  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
10513  }
10514 
10515  if( j < N )
10516  {
10517  const size_t kbegin( ( IsLower_v<MT5> )
10518  ?( ( IsUpper_v<MT4> )
10519  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10520  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10521  :( IsUpper_v<MT4> ? i : 0UL ) );
10522  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
10523 
10524  SIMDType xmm1, xmm2, xmm3, xmm4;
10525 
10526  for( size_t k=kbegin; k<kend; ++k ) {
10527  const SIMDType b1( set( B(k,j) ) );
10528  xmm1 += A.load(i ,k) * b1;
10529  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10530  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10531  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10532  }
10533 
10534  C.store( i , j, C.load(i ,j) + xmm1 * factor );
10535  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
10536  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10537  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10538  }
10539  }
10540 
10541  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
10542  {
10543  size_t j( 0UL );
10544 
10545  for( ; (j+2UL) <= N; j+=2UL )
10546  {
10547  const size_t kbegin( ( IsLower_v<MT5> )
10548  ?( ( IsUpper_v<MT4> )
10549  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10550  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10551  :( IsUpper_v<MT4> ? i : 0UL ) );
10552  const size_t kend( ( IsUpper_v<MT5> )
10553  ?( ( IsLower_v<MT4> )
10554  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10555  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10556  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
10557 
10558  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10559 
10560  for( size_t k=kbegin; k<kend; ++k ) {
10561  const SIMDType a1( A.load(i ,k) );
10562  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10563  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10564  const SIMDType b1( set( B(k,j ) ) );
10565  const SIMDType b2( set( B(k,j+1UL) ) );
10566  xmm1 += a1 * b1;
10567  xmm2 += a2 * b1;
10568  xmm3 += a3 * b1;
10569  xmm4 += a1 * b2;
10570  xmm5 += a2 * b2;
10571  xmm6 += a3 * b2;
10572  }
10573 
10574  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10575  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) + xmm2 * factor );
10576  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10577  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
10578  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
10579  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
10580  }
10581 
10582  if( j < N )
10583  {
10584  const size_t kbegin( ( IsLower_v<MT5> )
10585  ?( ( IsUpper_v<MT4> )
10586  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10587  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10588  :( IsUpper_v<MT4> ? i : 0UL ) );
10589  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
10590 
10591  SIMDType xmm1, xmm2, xmm3;
10592 
10593  for( size_t k=kbegin; k<kend; ++k ) {
10594  const SIMDType b1( set( B(k,j) ) );
10595  xmm1 += A.load(i ,k) * b1;
10596  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10597  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10598  }
10599 
10600  C.store( i , j, C.load(i ,j) + xmm1 * factor );
10601  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) + xmm2 * factor );
10602  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10603  }
10604  }
10605 
10606  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
10607  {
10608  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
10609  size_t j( UPP ? i : 0UL );
10610 
10611  for( ; (j+4UL) <= jend; j+=4UL )
10612  {
10613  const size_t kbegin( ( IsLower_v<MT5> )
10614  ?( ( IsUpper_v<MT4> )
10615  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10616  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10617  :( IsUpper_v<MT4> ? i : 0UL ) );
10618  const size_t kend( ( IsUpper_v<MT5> )
10619  ?( ( IsLower_v<MT4> )
10620  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
10621  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
10622  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
10623 
10624  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10625 
10626  for( size_t k=kbegin; k<kend; ++k ) {
10627  const SIMDType a1( A.load(i ,k) );
10628  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10629  const SIMDType b1( set( B(k,j ) ) );
10630  const SIMDType b2( set( B(k,j+1UL) ) );
10631  const SIMDType b3( set( B(k,j+2UL) ) );
10632  const SIMDType b4( set( B(k,j+3UL) ) );
10633  xmm1 += a1 * b1;
10634  xmm2 += a2 * b1;
10635  xmm3 += a1 * b2;
10636  xmm4 += a2 * b2;
10637  xmm5 += a1 * b3;
10638  xmm6 += a2 * b3;
10639  xmm7 += a1 * b4;
10640  xmm8 += a2 * b4;
10641  }
10642 
10643  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10644  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
10645  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
10646  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
10647  C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
10648  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
10649  C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
10650  C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
10651  }
10652 
10653  for( ; (j+3UL) <= jend; j+=3UL )
10654  {
10655  const size_t kbegin( ( IsLower_v<MT5> )
10656  ?( ( IsUpper_v<MT4> )
10657  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10658  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10659  :( IsUpper_v<MT4> ? i : 0UL ) );
10660  const size_t kend( ( IsUpper_v<MT5> )
10661  ?( ( IsLower_v<MT4> )
10662  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
10663  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
10664  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
10665 
10666  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10667 
10668  for( size_t k=kbegin; k<kend; ++k ) {
10669  const SIMDType a1( A.load(i ,k) );
10670  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10671  const SIMDType b1( set( B(k,j ) ) );
10672  const SIMDType b2( set( B(k,j+1UL) ) );
10673  const SIMDType b3( set( B(k,j+2UL) ) );
10674  xmm1 += a1 * b1;
10675  xmm2 += a2 * b1;
10676  xmm3 += a1 * b2;
10677  xmm4 += a2 * b2;
10678  xmm5 += a1 * b3;
10679  xmm6 += a2 * b3;
10680  }
10681 
10682  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10683  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + xmm2 * factor );
10684  C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
10685  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
10686  C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
10687  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
10688  }
10689 
10690  for( ; (j+2UL) <= jend; j+=2UL )
10691  {
10692  const size_t kbegin( ( IsLower_v<MT5> )
10693  ?( ( IsUpper_v<MT4> )
10694  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10695  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10696  :( IsUpper_v<MT4> ? i : 0UL ) );
10697  const size_t kend( ( IsUpper_v<MT5> )
10698  ?( ( IsLower_v<MT4> )
10699  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10700  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10701  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
10702 
10703  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10704  size_t k( kbegin );
10705 
10706  for( ; (k+2UL) <= kend; k+=2UL ) {
10707  const SIMDType a1( A.load(i ,k ) );
10708  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
10709  const SIMDType a3( A.load(i ,k+1UL) );
10710  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
10711  const SIMDType b1( set( B(k ,j ) ) );
10712  const SIMDType b2( set( B(k ,j+1UL) ) );
10713  const SIMDType b3( set( B(k+1UL,j ) ) );
10714  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
10715  xmm1 += a1 * b1;
10716  xmm2 += a2 * b1;
10717  xmm3 += a1 * b2;
10718  xmm4 += a2 * b2;
10719  xmm5 += a3 * b3;
10720  xmm6 += a4 * b3;
10721  xmm7 += a3 * b4;
10722  xmm8 += a4 * b4;
10723  }
10724 
10725  for( ; k<kend; ++k ) {
10726  const SIMDType a1( A.load(i ,k) );
10727  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10728  const SIMDType b1( set( B(k,j ) ) );
10729  const SIMDType b2( set( B(k,j+1UL) ) );
10730  xmm1 += a1 * b1;
10731  xmm2 += a2 * b1;
10732  xmm3 += a1 * b2;
10733  xmm4 += a2 * b2;
10734  }
10735 
10736  C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
10737  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
10738  C.store( i , j+1UL, C.load(i ,j+1UL) + (xmm3+xmm7) * factor );
10739  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
10740  }
10741 
10742  if( j < jend )
10743  {
10744  const size_t kbegin( ( IsLower_v<MT5> )
10745  ?( ( IsUpper_v<MT4> )
10746  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10747  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10748  :( IsUpper_v<MT4> ? i : 0UL ) );
10749  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
10750 
10751  SIMDType xmm1, xmm2, xmm3, xmm4;
10752  size_t k( kbegin );
10753 
10754  for( ; (k+2UL) <= kend; k+=2UL ) {
10755  const SIMDType b1( set( B(k ,j) ) );
10756  const SIMDType b2( set( B(k+1UL,j) ) );
10757  xmm1 += A.load(i ,k ) * b1;
10758  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
10759  xmm3 += A.load(i ,k+1UL) * b2;
10760  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
10761  }
10762 
10763  for( ; k<kend; ++k ) {
10764  const SIMDType b1( set( B(k,j) ) );
10765  xmm1 += A.load(i ,k) * b1;
10766  xmm2 += A.load(i+SIMDSIZE,k) * b1;
10767  }
10768 
10769  C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
10770  C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
10771  }
10772  }
10773 
10774  for( ; i<ipos; i+=SIMDSIZE )
10775  {
10776  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
10777  size_t j( UPP ? i : 0UL );
10778 
10779  for( ; (j+4UL) <= jend; j+=4UL )
10780  {
10781  const size_t kbegin( ( IsLower_v<MT5> )
10782  ?( ( IsUpper_v<MT4> )
10783  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10784  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10785  :( IsUpper_v<MT4> ? i : 0UL ) );
10786  const size_t kend( ( IsUpper_v<MT5> )
10787  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
10788  :( K ) );
10789 
10790  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10791  size_t k( kbegin );
10792 
10793  for( ; (k+2UL) <= kend; k+=2UL ) {
10794  const SIMDType a1( A.load(i,k ) );
10795  const SIMDType a2( A.load(i,k+1UL) );
10796  xmm1 += a1 * set( B(k ,j ) );
10797  xmm2 += a1 * set( B(k ,j+1UL) );
10798  xmm3 += a1 * set( B(k ,j+2UL) );
10799  xmm4 += a1 * set( B(k ,j+3UL) );
10800  xmm5 += a2 * set( B(k+1UL,j ) );
10801  xmm6 += a2 * set( B(k+1UL,j+1UL) );
10802  xmm7 += a2 * set( B(k+1UL,j+2UL) );
10803  xmm8 += a2 * set( B(k+1UL,j+3UL) );
10804  }
10805 
10806  for( ; k<kend; ++k ) {
10807  const SIMDType a1( A.load(i,k) );
10808  xmm1 += a1 * set( B(k,j ) );
10809  xmm2 += a1 * set( B(k,j+1UL) );
10810  xmm3 += a1 * set( B(k,j+2UL) );
10811  xmm4 += a1 * set( B(k,j+3UL) );
10812  }
10813 
10814  C.store( i, j , C.load(i,j ) + (xmm1+xmm5) * factor );
10815  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm6) * factor );
10816  C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm7) * factor );
10817  C.store( i, j+3UL, C.load(i,j+3UL) + (xmm4+xmm8) * factor );
10818  }
10819 
10820  for( ; (j+3UL) <= jend; j+=3UL )
10821  {
10822  const size_t kbegin( ( IsLower_v<MT5> )
10823  ?( ( IsUpper_v<MT4> )
10824  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10825  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10826  :( IsUpper_v<MT4> ? i : 0UL ) );
10827  const size_t kend( ( IsUpper_v<MT5> )
10828  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
10829  :( K ) );
10830 
10831  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10832  size_t k( kbegin );
10833 
10834  for( ; (k+2UL) <= kend; k+=2UL ) {
10835  const SIMDType a1( A.load(i,k ) );
10836  const SIMDType a2( A.load(i,k+1UL) );
10837  xmm1 += a1 * set( B(k ,j ) );
10838  xmm2 += a1 * set( B(k ,j+1UL) );
10839  xmm3 += a1 * set( B(k ,j+2UL) );
10840  xmm4 += a2 * set( B(k+1UL,j ) );
10841  xmm5 += a2 * set( B(k+1UL,j+1UL) );
10842  xmm6 += a2 * set( B(k+1UL,j+2UL) );
10843  }
10844 
10845  for( ; k<kend; ++k ) {
10846  const SIMDType a1( A.load(i,k) );
10847  xmm1 += a1 * set( B(k,j ) );
10848  xmm2 += a1 * set( B(k,j+1UL) );
10849  xmm3 += a1 * set( B(k,j+2UL) );
10850  }
10851 
10852  C.store( i, j , C.load(i,j ) + (xmm1+xmm4) * factor );
10853  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm5) * factor );
10854  C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm6) * factor );
10855  }
10856 
10857  for( ; (j+2UL) <= jend; j+=2UL )
10858  {
10859  const size_t kbegin( ( IsLower_v<MT5> )
10860  ?( ( IsUpper_v<MT4> )
10861  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10862  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10863  :( IsUpper_v<MT4> ? i : 0UL ) );
10864  const size_t kend( ( IsUpper_v<MT5> )
10865  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
10866  :( K ) );
10867 
10868  SIMDType xmm1, xmm2, xmm3, xmm4;
10869  size_t k( kbegin );
10870 
10871  for( ; (k+2UL) <= kend; k+=2UL ) {
10872  const SIMDType a1( A.load(i,k ) );
10873  const SIMDType a2( A.load(i,k+1UL) );
10874  xmm1 += a1 * set( B(k ,j ) );
10875  xmm2 += a1 * set( B(k ,j+1UL) );
10876  xmm3 += a2 * set( B(k+1UL,j ) );
10877  xmm4 += a2 * set( B(k+1UL,j+1UL) );
10878  }
10879 
10880  for( ; k<kend; ++k ) {
10881  const SIMDType a1( A.load(i,k) );
10882  xmm1 += a1 * set( B(k,j ) );
10883  xmm2 += a1 * set( B(k,j+1UL) );
10884  }
10885 
10886  C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
10887  C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm4) * factor );
10888  }
10889 
10890  if( j < jend )
10891  {
10892  const size_t kbegin( ( IsLower_v<MT5> )
10893  ?( ( IsUpper_v<MT4> )
10894  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10895  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10896  :( IsUpper_v<MT4> ? i : 0UL ) );
10897 
10898  SIMDType xmm1, xmm2;
10899  size_t k( kbegin );
10900 
10901  for( ; (k+2UL) <= K; k+=2UL ) {
10902  xmm1 += A.load(i,k ) * set( B(k ,j) );
10903  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
10904  }
10905 
10906  for( ; k<K; ++k ) {
10907  xmm1 += A.load(i,k) * set( B(k,j) );
10908  }
10909 
10910  C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
10911  }
10912  }
10913 
10914  for( ; remainder && i<M; ++i )
10915  {
10916  const size_t jend( LOW ? i+1UL : N );
10917  size_t j( UPP ? i : 0UL );
10918 
10919  for( ; (j+2UL) <= jend; j+=2UL )
10920  {
10921  const size_t kbegin( ( IsLower_v<MT5> )
10922  ?( ( IsUpper_v<MT4> )
10923  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10924  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10925  :( IsUpper_v<MT4> ? i : 0UL ) );
10926  const size_t kend( ( IsUpper_v<MT5> )
10927  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
10928  :( K ) );
10929 
10930  ElementType value1{};
10931  ElementType value2{};
10932 
10933  for( size_t k=kbegin; k<kend; ++k ) {
10934  value1 += A(i,k) * B(k,j );
10935  value2 += A(i,k) * B(k,j+1UL);
10936  }
10937 
10938  C(i,j ) += value1 * scalar;
10939  C(i,j+1UL) += value2 * scalar;
10940  }
10941 
10942  if( j < jend )
10943  {
10944  const size_t kbegin( ( IsLower_v<MT5> )
10945  ?( ( IsUpper_v<MT4> )
10946  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10947  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10948  :( IsUpper_v<MT4> ? i : 0UL ) );
10949 
10950  ElementType value{};
10951 
10952  for( size_t k=kbegin; k<K; ++k ) {
10953  value += A(i,k) * B(k,j);
10954  }
10955 
10956  C(i,j) += value * scalar;
10957  }
10958  }
10959  }
10960  //**********************************************************************************************
10961 
10962  //**Default addition assignment to dense matrices (large matrices)******************************
10976  template< typename MT3 // Type of the left-hand side target matrix
10977  , typename MT4 // Type of the left-hand side matrix operand
10978  , typename MT5 // Type of the right-hand side matrix operand
10979  , typename ST2 > // Type of the scalar value
10980  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10981  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10982  {
10983  selectDefaultAddAssignKernel( C, A, B, scalar );
10984  }
10985  //**********************************************************************************************
10986 
10987  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
11002  template< typename MT3 // Type of the left-hand side target matrix
11003  , typename MT4 // Type of the left-hand side matrix operand
11004  , typename MT5 // Type of the right-hand side matrix operand
11005  , typename ST2 > // Type of the scalar value
11006  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11007  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11008  {
11009  if( LOW )
11010  lmmm( C, A, B, scalar, ST2(1) );
11011  else if( UPP )
11012  ummm( C, A, B, scalar, ST2(1) );
11013  else
11014  mmm( C, A, B, scalar, ST2(1) );
11015  }
11016  //**********************************************************************************************
11017 
11018  //**BLAS-based addition assignment to dense matrices (default)**********************************
11032  template< typename MT3 // Type of the left-hand side target matrix
11033  , typename MT4 // Type of the left-hand side matrix operand
11034  , typename MT5 // Type of the right-hand side matrix operand
11035  , typename ST2 > // Type of the scalar value
11036  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11037  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
11038  {
11039  selectLargeAddAssignKernel( C, A, B, scalar );
11040  }
11041  //**********************************************************************************************
11042 
11043  //**BLAS-based addition assignment to dense matrices********************************************
11044 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
11045 
11058  template< typename MT3 // Type of the left-hand side target matrix
11059  , typename MT4 // Type of the left-hand side matrix operand
11060  , typename MT5 // Type of the right-hand side matrix operand
11061  , typename ST2 > // Type of the scalar value
11062  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11063  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
11064  {
11065  using ET = ElementType_t<MT3>;
11066 
11067  if( IsTriangular_v<MT4> ) {
11068  ResultType_t<MT3> tmp( serial( B ) );
11069  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
11070  addAssign( C, tmp );
11071  }
11072  else if( IsTriangular_v<MT5> ) {
11073  ResultType_t<MT3> tmp( serial( A ) );
11074  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
11075  addAssign( C, tmp );
11076  }
11077  else {
11078  gemm( C, A, B, ET(scalar), ET(1) );
11079  }
11080  }
11081 #endif
11082  //**********************************************************************************************
11083 
11084  //**Addition assignment to sparse matrices******************************************************
11085  // No special implementation for the addition assignment to sparse matrices.
11086  //**********************************************************************************************
11087 
11088  //**Subtraction assignment to dense matrices****************************************************
11100  template< typename MT // Type of the target dense matrix
11101  , bool SO > // Storage order of the target dense matrix
11102  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11103  {
11105 
11106  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11107  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11108 
11109  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
11110  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
11111 
11112  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
11113  return;
11114  }
11115 
11116  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
11117  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
11118 
11119  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11120  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11121  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11122  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11123  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11124  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11125 
11126  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
11127  }
11128  //**********************************************************************************************
11129 
11130  //**Subtraction assignment to dense matrices (kernel selection)*********************************
11141  template< typename MT3 // Type of the left-hand side target matrix
11142  , typename MT4 // Type of the left-hand side matrix operand
11143  , typename MT5 // Type of the right-hand side matrix operand
11144  , typename ST2 > // Type of the scalar value
11145  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11146  {
11147  if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
11148  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <= SIMDSIZE*10UL ) ||
11149  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <= SIMDSIZE*10UL ) ||
11150  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
11151  selectSmallSubAssignKernel( C, A, B, scalar );
11152  else
11153  selectBlasSubAssignKernel( C, A, B, scalar );
11154  }
11155  //**********************************************************************************************
11156 
11157  //**Default subtraction assignment to dense matrices********************************************
11171  template< typename MT3 // Type of the left-hand side target matrix
11172  , typename MT4 // Type of the left-hand side matrix operand
11173  , typename MT5 // Type of the right-hand side matrix operand
11174  , typename ST2 > // Type of the scalar value
11175  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11176  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11177  {
11178  const ResultType tmp( serial( A * B * scalar ) );
11179  subAssign( C, tmp );
11180  }
11181  //**********************************************************************************************
11182 
11183  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
11197  template< typename MT3 // Type of the left-hand side target matrix
11198  , typename MT4 // Type of the left-hand side matrix operand
11199  , typename MT5 // Type of the right-hand side matrix operand
11200  , typename ST2 > // Type of the scalar value
11201  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11202  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11203  {
11204  constexpr size_t block( BLOCK_SIZE );
11205 
11206  const size_t M( A.rows() );
11207  const size_t N( B.columns() );
11208 
11209  for( size_t ii=0UL; ii<M; ii+=block ) {
11210  const size_t iend( min( M, ii+block ) );
11211  for( size_t jj=0UL; jj<N; jj+=block ) {
11212  const size_t jend( min( N, jj+block ) );
11213  for( size_t i=ii; i<iend; ++i )
11214  {
11215  const size_t jbegin( ( IsUpper_v<MT4> )
11216  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
11217  :( jj ) );
11218  const size_t jpos( ( IsLower_v<MT4> )
11219  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
11220  :( jend ) );
11221 
11222  for( size_t j=jbegin; j<jpos; ++j ) {
11223  C(i,j) -= A(i,j) * B(j,j) * scalar;
11224  }
11225  }
11226  }
11227  }
11228  }
11229  //**********************************************************************************************
11230 
11231  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
11245  template< typename MT3 // Type of the left-hand side target matrix
11246  , typename MT4 // Type of the left-hand side matrix operand
11247  , typename MT5 // Type of the right-hand side matrix operand
11248  , typename ST2 > // Type of the scalar value
11249  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11250  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11251  {
11252  const size_t M( A.rows() );
11253  const size_t N( B.columns() );
11254 
11255  for( size_t j=0UL; j<N; ++j )
11256  {
11257  const size_t ibegin( ( IsLower_v<MT4> )
11258  ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
11259  :( 0UL ) );
11260  const size_t iend( ( IsUpper_v<MT4> )
11261  ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
11262  :( M ) );
11263  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
11264 
11265  const size_t inum( iend - ibegin );
11266  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
11267 
11268  for( size_t i=ibegin; i<ipos; i+=2UL ) {
11269  C(i ,j) -= A(i ,j) * B(j,j) * scalar;
11270  C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
11271  }
11272  if( ipos < iend ) {
11273  C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
11274  }
11275  }
11276  }
11277  //**********************************************************************************************
11278 
11279  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
11293  template< typename MT3 // Type of the left-hand side target matrix
11294  , typename MT4 // Type of the left-hand side matrix operand
11295  , typename MT5 // Type of the right-hand side matrix operand
11296  , typename ST2 > // Type of the scalar value
11297  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11298  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11299  {
11300  const size_t M( A.rows() );
11301  const size_t N( B.columns() );
11302 
11303  for( size_t i=0UL; i<M; ++i )
11304  {
11305  const size_t jbegin( ( IsUpper_v<MT5> )
11306  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
11307  :( 0UL ) );
11308  const size_t jend( ( IsLower_v<MT5> )
11309  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
11310  :( N ) );
11311  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
11312 
11313  const size_t jnum( jend - jbegin );
11314  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
11315 
11316  for( size_t j=jbegin; j<jpos; j+=2UL ) {
11317  C(i,j ) -= A(i,i) * B(i,j ) * scalar;
11318  C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
11319  }
11320  if( jpos < jend ) {
11321  C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
11322  }
11323  }
11324  }
11325  //**********************************************************************************************
11326 
11327  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
11341  template< typename MT3 // Type of the left-hand side target matrix
11342  , typename MT4 // Type of the left-hand side matrix operand
11343  , typename MT5 // Type of the right-hand side matrix operand
11344  , typename ST2 > // Type of the scalar value
11345  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11346  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11347  {
11348  constexpr size_t block( BLOCK_SIZE );
11349 
11350  const size_t M( A.rows() );
11351  const size_t N( B.columns() );
11352 
11353  for( size_t jj=0UL; jj<N; jj+=block ) {
11354  const size_t jend( min( N, jj+block ) );
11355  for( size_t ii=0UL; ii<M; ii+=block ) {
11356  const size_t iend( min( M, ii+block ) );
11357  for( size_t j=jj; j<jend; ++j )
11358  {
11359  const size_t ibegin( ( IsLower_v<MT5> )
11360  ?( max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
11361  :( ii ) );
11362  const size_t ipos( ( IsUpper_v<MT5> )
11363  ?( min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
11364  :( iend ) );
11365 
11366  for( size_t i=ibegin; i<ipos; ++i ) {
11367  C(i,j) -= A(i,i) * B(i,j) * scalar;
11368  }
11369  }
11370  }
11371  }
11372  }
11373  //**********************************************************************************************
11374 
11375  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
11389  template< typename MT3 // Type of the left-hand side target matrix
11390  , typename MT4 // Type of the left-hand side matrix operand
11391  , typename MT5 // Type of the right-hand side matrix operand
11392  , typename ST2 > // Type of the scalar value
11393  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11394  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11395  {
11396  for( size_t i=0UL; i<A.rows(); ++i ) {
11397  C(i,i) -= A(i,i) * B(i,i) * scalar;
11398  }
11399  }
11400  //**********************************************************************************************
11401 
11402  //**Default subtraction assignment to dense matrices (small matrices)***************************
11416  template< typename MT3 // Type of the left-hand side target matrix
11417  , typename MT4 // Type of the left-hand side matrix operand
11418  , typename MT5 // Type of the right-hand side matrix operand
11419  , typename ST2 > // Type of the scalar value
11420  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11421  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11422  {
11423  selectDefaultSubAssignKernel( C, A, B, scalar );
11424  }
11425  //**********************************************************************************************
11426 
11427  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
11442  template< typename MT3 // Type of the left-hand side target matrix
11443  , typename MT4 // Type of the left-hand side matrix operand
11444  , typename MT5 // Type of the right-hand side matrix operand
11445  , typename ST2 > // Type of the scalar value
11446  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11447  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11448  {
11449  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
11450 
11451  const size_t M( A.rows() );
11452  const size_t N( B.columns() );
11453  const size_t K( A.columns() );
11454 
11455  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
11456 
11457  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
11458  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
11459 
11460  const SIMDType factor( set( scalar ) );
11461 
11462  size_t j( 0UL );
11463 
11464  if( IsIntegral_v<ElementType> )
11465  {
11466  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
11467  for( size_t i=0UL; i<M; ++i )
11468  {
11469  const size_t kbegin( ( IsUpper_v<MT4> )
11470  ?( ( IsLower_v<MT5> )
11471  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11472  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11473  :( IsLower_v<MT5> ? j : 0UL ) );
11474  const size_t kend( ( IsLower_v<MT4> )
11475  ?( ( IsUpper_v<MT5> )
11476  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
11477  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
11478  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
11479 
11480  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11481 
11482  for( size_t k=kbegin; k<kend; ++k ) {
11483  const SIMDType a1( set( A(i,k) ) );
11484  xmm1 += a1 * B.load(k,j );
11485  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11486  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11487  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11488  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11489  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
11490  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
11491  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
11492  }
11493 
11494  C.store( i, j , C.load(i,j ) - xmm1 * factor );
11495  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
11496  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11497  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11498  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11499  C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
11500  C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
11501  C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
11502  }
11503  }
11504  }
11505 
11506  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
11507  {
11508  size_t i( 0UL );
11509 
11510  for( ; (i+2UL) <= M; i+=2UL )
11511  {
11512  const size_t kbegin( ( IsUpper_v<MT4> )
11513  ?( ( IsLower_v<MT5> )
11514  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11515  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11516  :( IsLower_v<MT5> ? j : 0UL ) );
11517  const size_t kend( ( IsLower_v<MT4> )
11518  ?( ( IsUpper_v<MT5> )
11519  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
11520  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11521  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
11522 
11523  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
11524 
11525  for( size_t k=kbegin; k<kend; ++k ) {
11526  const SIMDType a1( set( A(i ,k) ) );
11527  const SIMDType a2( set( A(i+1UL,k) ) );
11528  const SIMDType b1( B.load(k,j ) );
11529  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11530  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11531  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11532  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
11533  xmm1 += a1 * b1;
11534  xmm2 += a1 * b2;
11535  xmm3 += a1 * b3;
11536  xmm4 += a1 * b4;
11537  xmm5 += a1 * b5;
11538  xmm6 += a2 * b1;
11539  xmm7 += a2 * b2;
11540  xmm8 += a2 * b3;
11541  xmm9 += a2 * b4;
11542  xmm10 += a2 * b5;
11543  }
11544 
11545  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11546  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
11547  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11548  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11549  C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
11550  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
11551  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
11552  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
11553  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
11554  C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
11555  }
11556 
11557  if( i < M )
11558  {
11559  const size_t kbegin( ( IsUpper_v<MT4> )
11560  ?( ( IsLower_v<MT5> )
11561  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11562  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11563  :( IsLower_v<MT5> ? j : 0UL ) );
11564  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
11565 
11566  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
11567 
11568  for( size_t k=kbegin; k<kend; ++k ) {
11569  const SIMDType a1( set( A(i,k) ) );
11570  xmm1 += a1 * B.load(k,j );
11571  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11572  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11573  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11574  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11575  }
11576 
11577  C.store( i, j , C.load(i,j ) - xmm1 * factor );
11578  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
11579  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11580  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11581  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11582  }
11583  }
11584 
11585  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
11586  {
11587  size_t i( 0UL );
11588 
11589  for( ; (i+2UL) <= M; i+=2UL )
11590  {
11591  const size_t kbegin( ( IsUpper_v<MT4> )
11592  ?( ( IsLower_v<MT5> )
11593  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11594  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11595  :( IsLower_v<MT5> ? j : 0UL ) );
11596  const size_t kend( ( IsLower_v<MT4> )
11597  ?( ( IsUpper_v<MT5> )
11598  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
11599  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11600  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
11601 
11602  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11603 
11604  for( size_t k=kbegin; k<kend; ++k ) {
11605  const SIMDType a1( set( A(i ,k) ) );
11606  const SIMDType a2( set( A(i+1UL,k) ) );
11607  const SIMDType b1( B.load(k,j ) );
11608  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11609  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11610  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11611  xmm1 += a1 * b1;
11612  xmm2 += a1 * b2;
11613  xmm3 += a1 * b3;
11614  xmm4 += a1 * b4;
11615  xmm5 += a2 * b1;
11616  xmm6 += a2 * b2;
11617  xmm7 += a2 * b3;
11618  xmm8 += a2 * b4;
11619  }
11620 
11621  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11622  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
11623  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11624  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11625  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
11626  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
11627  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
11628  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
11629  }
11630 
11631  if( i < M )
11632  {
11633  const size_t kbegin( ( IsUpper_v<MT4> )
11634  ?( ( IsLower_v<MT5> )
11635  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11636  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11637  :( IsLower_v<MT5> ? j : 0UL ) );
11638  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
11639 
11640  SIMDType xmm1, xmm2, xmm3, xmm4;
11641 
11642  for( size_t k=kbegin; k<kend; ++k ) {
11643  const SIMDType a1( set( A(i,k) ) );
11644  xmm1 += a1 * B.load(k,j );
11645  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11646  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11647  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11648  }
11649 
11650  C.store( i, j , C.load(i,j ) - xmm1 * factor );
11651  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
11652  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11653  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11654  }
11655  }
11656 
11657  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
11658  {
11659  size_t i( 0UL );
11660 
11661  for( ; (i+2UL) <= M; i+=2UL )
11662  {
11663  const size_t kbegin( ( IsUpper_v<MT4> )
11664  ?( ( IsLower_v<MT5> )
11665  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11666  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11667  :( IsLower_v<MT5> ? j : 0UL ) );
11668  const size_t kend( ( IsLower_v<MT4> )
11669  ?( ( IsUpper_v<MT5> )
11670  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
11671  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11672  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
11673 
11674  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11675 
11676  for( size_t k=kbegin; k<kend; ++k ) {
11677  const SIMDType a1( set( A(i ,k) ) );
11678  const SIMDType a2( set( A(i+1UL,k) ) );
11679  const SIMDType b1( B.load(k,j ) );
11680  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11681  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11682  xmm1 += a1 * b1;
11683  xmm2 += a1 * b2;
11684  xmm3 += a1 * b3;
11685  xmm4 += a2 * b1;
11686  xmm5 += a2 * b2;
11687  xmm6 += a2 * b3;
11688  }
11689 
11690  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11691  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
11692  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11693  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
11694  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
11695  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
11696  }
11697 
11698  if( i < M )
11699  {
11700  const size_t kbegin( ( IsUpper_v<MT4> )
11701  ?( ( IsLower_v<MT5> )
11702  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11703  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11704  :( IsLower_v<MT5> ? j : 0UL ) );
11705  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
11706 
11707  SIMDType xmm1, xmm2, xmm3;
11708 
11709  for( size_t k=kbegin; k<kend; ++k ) {
11710  const SIMDType a1( set( A(i,k) ) );
11711  xmm1 += a1 * B.load(k,j );
11712  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11713  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11714  }
11715 
11716  C.store( i, j , C.load(i,j ) - xmm1 * factor );
11717  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
11718  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11719  }
11720  }
11721 
11722  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
11723  {
11724  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
11725  size_t i( LOW ? j : 0UL );
11726 
11727  for( ; (i+4UL) <= iend; i+=4UL )
11728  {
11729  const size_t kbegin( ( IsUpper_v<MT4> )
11730  ?( ( IsLower_v<MT5> )
11731  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11732  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11733  :( IsLower_v<MT5> ? j : 0UL ) );
11734  const size_t kend( ( IsLower_v<MT4> )
11735  ?( ( IsUpper_v<MT5> )
11736  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
11737  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
11738  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
11739 
11740  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11741 
11742  for( size_t k=kbegin; k<kend; ++k ) {
11743  const SIMDType a1( set( A(i ,k) ) );
11744  const SIMDType a2( set( A(i+1UL,k) ) );
11745  const SIMDType a3( set( A(i+2UL,k) ) );
11746  const SIMDType a4( set( A(i+3UL,k) ) );
11747  const SIMDType b1( B.load(k,j ) );
11748  const SIMDType b2( B.load(k,j+SIMDSIZE) );
11749  xmm1 += a1 * b1;
11750  xmm2 += a1 * b2;
11751  xmm3 += a2 * b1;
11752  xmm4 += a2 * b2;
11753  xmm5 += a3 * b1;
11754  xmm6 += a3 * b2;
11755  xmm7 += a4 * b1;
11756  xmm8 += a4 * b2;
11757  }
11758 
11759  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11760  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
11761  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
11762  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
11763  C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
11764  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
11765  C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
11766  C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
11767  }
11768 
11769  for( ; (i+3UL) <= iend; i+=3UL )
11770  {
11771  const size_t kbegin( ( IsUpper_v<MT4> )
11772  ?( ( IsLower_v<MT5> )
11773  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11774  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11775  :( IsLower_v<MT5> ? j : 0UL ) );
11776  const size_t kend( ( IsLower_v<MT4> )
11777  ?( ( IsUpper_v<MT5> )
11778  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
11779  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
11780  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
11781 
11782  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11783 
11784  for( size_t k=kbegin; k<kend; ++k ) {
11785  const SIMDType a1( set( A(i ,k) ) );
11786  const SIMDType a2( set( A(i+1UL,k) ) );
11787  const SIMDType a3( set( A(i+2UL,k) ) );
11788  const SIMDType b1( B.load(k,j ) );
11789  const SIMDType b2( B.load(k,j+SIMDSIZE) );
11790  xmm1 += a1 * b1;
11791  xmm2 += a1 * b2;
11792  xmm3 += a2 * b1;
11793  xmm4 += a2 * b2;
11794  xmm5 += a3 * b1;
11795  xmm6 += a3 * b2;
11796  }
11797 
11798  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11799  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
11800  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
11801  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
11802  C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
11803  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
11804  }
11805 
11806  for( ; (i+2UL) <= iend; i+=2UL )
11807  {
11808  const size_t kbegin( ( IsUpper_v<MT4> )
11809  ?( ( IsLower_v<MT5> )
11810  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11811  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11812  :( IsLower_v<MT5> ? j : 0UL ) );
11813  const size_t kend( ( IsLower_v<MT4> )
11814  ?( ( IsUpper_v<MT5> )
11815  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
11816  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11817  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
11818 
11819  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11820  size_t k( kbegin );
11821 
11822  for( ; (k+2UL) <= kend; k+=2UL ) {
11823  const SIMDType a1( set( A(i ,k ) ) );
11824  const SIMDType a2( set( A(i+1UL,k ) ) );
11825  const SIMDType a3( set( A(i ,k+1UL) ) );
11826  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
11827  const SIMDType b1( B.load(k ,j ) );
11828  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
11829  const SIMDType b3( B.load(k+1UL,j ) );
11830  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
11831  xmm1 += a1 * b1;
11832  xmm2 += a1 * b2;
11833  xmm3 += a2 * b1;
11834  xmm4 += a2 * b2;
11835  xmm5 += a3 * b3;
11836  xmm6 += a3 * b4;
11837  xmm7 += a4 * b3;
11838  xmm8 += a4 * b4;
11839  }
11840 
11841  for( ; k<kend; ++k ) {
11842  const SIMDType a1( set( A(i ,k) ) );
11843  const SIMDType a2( set( A(i+1UL,k) ) );
11844  const SIMDType b1( B.load(k,j ) );
11845  const SIMDType b2( B.load(k,j+SIMDSIZE) );
11846  xmm1 += a1 * b1;
11847  xmm2 += a1 * b2;
11848  xmm3 += a2 * b1;
11849  xmm4 += a2 * b2;
11850  }
11851 
11852  C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
11853  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - (xmm2+xmm6) * factor );
11854  C.store( i+1UL, j , C.load(i+1UL,j ) - (xmm3+xmm7) * factor );
11855  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - (xmm4+xmm8) * factor );
11856  }
11857 
11858  if( i < iend )
11859  {
11860  const size_t kbegin( ( IsUpper_v<MT4> )
11861  ?( ( IsLower_v<MT5> )
11862  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11863  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11864  :( IsLower_v<MT5> ? j : 0UL ) );
11865  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
11866 
11867  SIMDType xmm1, xmm2, xmm3, xmm4;
11868  size_t k( kbegin );
11869 
11870  for( ; (k+2UL) <= kend; k+=2UL ) {
11871  const SIMDType a1( set( A(i,k ) ) );
11872  const SIMDType a2( set( A(i,k+1UL) ) );
11873  xmm1 += a1 * B.load(k ,j );
11874  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
11875  xmm3 += a2 * B.load(k+1UL,j );
11876  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
11877  }
11878 
11879  for( ; k<kend; ++k ) {
11880  const SIMDType a1( set( A(i,k) ) );
11881  xmm1 += a1 * B.load(k,j );
11882  xmm2 += a1 * B.load(k,j+SIMDSIZE);
11883  }
11884 
11885  C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
11886  C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) - (xmm2+xmm4) * factor );
11887  }
11888  }
11889 
11890  for( ; j<jpos; j+=SIMDSIZE )
11891  {
11892  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
11893  size_t i( LOW ? j : 0UL );
11894 
11895  for( ; (i+4UL) <= iend; i+=4UL )
11896  {
11897  const size_t kbegin( ( IsUpper_v<MT4> )
11898  ?( ( IsLower_v<MT5> )
11899  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11900  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11901  :( IsLower_v<MT5> ? j : 0UL ) );
11902  const size_t kend( ( IsLower_v<MT4> )
11903  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
11904  :( K ) );
11905 
11906  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11907  size_t k( kbegin );
11908 
11909  for( ; (k+2UL) <= kend; k+=2UL ) {
11910  const SIMDType b1( B.load(k ,j) );
11911  const SIMDType b2( B.load(k+1UL,j) );
11912  xmm1 += set( A(i ,k ) ) * b1;
11913  xmm2 += set( A(i+1UL,k ) ) * b1;
11914  xmm3 += set( A(i+2UL,k ) ) * b1;
11915  xmm4 += set( A(i+3UL,k ) ) * b1;
11916  xmm5 += set( A(i ,k+1UL) ) * b2;
11917  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
11918  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
11919  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
11920  }
11921 
11922  for( ; k<kend; ++k ) {
11923  const SIMDType b1( B.load(k,j) );
11924  xmm1 += set( A(i ,k) ) * b1;
11925  xmm2 += set( A(i+1UL,k) ) * b1;
11926  xmm3 += set( A(i+2UL,k) ) * b1;
11927  xmm4 += set( A(i+3UL,k) ) * b1;
11928  }
11929 
11930  C.store( i , j, C.load(i ,j) - (xmm1+xmm5) * factor );
11931  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm6) * factor );
11932  C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm7) * factor );
11933  C.store( i+3UL, j, C.load(i+3UL,j) - (xmm4+xmm8) * factor );
11934  }
11935 
11936  for( ; (i+3UL) <= iend; i+=3UL )
11937  {
11938  const size_t kbegin( ( IsUpper_v<MT4> )
11939  ?( ( IsLower_v<MT5> )
11940  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11941  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11942  :( IsLower_v<MT5> ? j : 0UL ) );
11943  const size_t kend( ( IsLower_v<MT4> )
11944  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
11945  :( K ) );
11946 
11947  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11948  size_t k( kbegin );
11949 
11950  for( ; (k+2UL) <= kend; k+=2UL ) {
11951  const SIMDType b1( B.load(k ,j) );
11952  const SIMDType b2( B.load(k+1UL,j) );
11953  xmm1 += set( A(i ,k ) ) * b1;
11954  xmm2 += set( A(i+1UL,k ) ) * b1;
11955  xmm3 += set( A(i+2UL,k ) ) * b1;
11956  xmm4 += set( A(i ,k+1UL) ) * b2;
11957  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
11958  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
11959  }
11960 
11961  for( ; k<kend; ++k ) {
11962  const SIMDType b1( B.load(k,j) );
11963  xmm1 += set( A(i ,k) ) * b1;
11964  xmm2 += set( A(i+1UL,k) ) * b1;
11965  xmm3 += set( A(i+2UL,k) ) * b1;
11966  }
11967 
11968  C.store( i , j, C.load(i ,j) - (xmm1+xmm4) * factor );
11969  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm5) * factor );
11970  C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm6) * factor );
11971  }
11972 
11973  for( ; (i+2UL) <= iend; i+=2UL )
11974  {
11975  const size_t kbegin( ( IsUpper_v<MT4> )
11976  ?( ( IsLower_v<MT5> )
11977  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11978  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11979  :( IsLower_v<MT5> ? j : 0UL ) );
11980  const size_t kend( ( IsLower_v<MT4> )
11981  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
11982  :( K ) );
11983 
11984  SIMDType xmm1, xmm2, xmm3, xmm4;
11985  size_t k( kbegin );
11986 
11987  for( ; (k+2UL) <= kend; k+=2UL ) {
11988  const SIMDType b1( B.load(k ,j) );
11989  const SIMDType b2( B.load(k+1UL,j) );
11990  xmm1 += set( A(i ,k ) ) * b1;
11991  xmm2 += set( A(i+1UL,k ) ) * b1;
11992  xmm3 += set( A(i ,k+1UL) ) * b2;
11993  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
11994  }
11995 
11996  for( ; k<kend; ++k ) {
11997  const SIMDType b1( B.load(k,j) );
11998  xmm1 += set( A(i ,k) ) * b1;
11999  xmm2 += set( A(i+1UL,k) ) * b1;
12000  }
12001 
12002  C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
12003  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm4) * factor );
12004  }
12005 
12006  if( i < iend )
12007  {
12008  const size_t kbegin( ( IsUpper_v<MT4> )
12009  ?( ( IsLower_v<MT5> )
12010  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12011  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12012  :( IsLower_v<MT5> ? j : 0UL ) );
12013 
12014  SIMDType xmm1, xmm2;
12015  size_t k( kbegin );
12016 
12017  for( ; (k+2UL) <= K; k+=2UL ) {
12018  xmm1 += set( A(i,k ) ) * B.load(k ,j);
12019  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
12020  }
12021 
12022  for( ; k<K; ++k ) {
12023  xmm1 += set( A(i,k) ) * B.load(k,j);
12024  }
12025 
12026  C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
12027  }
12028  }
12029 
12030  for( ; remainder && j<N; ++j )
12031  {
12032  const size_t iend( UPP ? j+1UL : M );
12033  size_t i( LOW ? j : 0UL );
12034 
12035  for( ; (i+2UL) <= iend; i+=2UL )
12036  {
12037  const size_t kbegin( ( IsUpper_v<MT4> )
12038  ?( ( IsLower_v<MT5> )
12039  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12040  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12041  :( IsLower_v<MT5> ? j : 0UL ) );
12042  const size_t kend( ( IsLower_v<MT4> )
12043  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
12044  :( K ) );
12045 
12046  ElementType value1{};
12047  ElementType value2{};
12048 
12049  for( size_t k=kbegin; k<kend; ++k ) {
12050  value1 += A(i ,k) * B(k,j);
12051  value2 += A(i+1UL,k) * B(k,j);
12052  }
12053 
12054  C(i ,j) -= value1 * scalar;
12055  C(i+1UL,j) -= value2 * scalar;
12056  }
12057 
12058  if( i < iend )
12059  {
12060  const size_t kbegin( ( IsUpper_v<MT4> )
12061  ?( ( IsLower_v<MT5> )
12062  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12063  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12064  :( IsLower_v<MT5> ? j : 0UL ) );
12065 
12066  ElementType value{};
12067 
12068  for( size_t k=kbegin; k<K; ++k ) {
12069  value += A(i,k) * B(k,j);
12070  }
12071 
12072  C(i,j) -= value * scalar;
12073  }
12074  }
12075  }
12076  //**********************************************************************************************
12077 
12078  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
/*!\brief Vectorized default subtraction assignment kernel for small matrices and column-major
//        targets: C -= ( A * B ) * scalar.
//
// \param C The target left-hand side dense matrix (updated in place).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor applied to the product before it is subtracted from C.
// \return void
//
// This overload participates in overload resolution only if MT3 is a column-major matrix and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds. The rows of C are processed in blocks of
// SIMD vectors (8, 5, 4, 3, 2 and finally 1 vector per block), the columns in groups of up to
// four. For every tile the products A.load(i,k) * set( B(k,j) ) are accumulated over k and the
// scaled sum is subtracted from the corresponding elements of C. Rows in the range [ipos,M)
// are handled by a scalar clean-up loop whenever \a remainder is \a true.
*/
12093  template< typename MT3 // Type of the left-hand side target matrix
12094  , typename MT4 // Type of the left-hand side matrix operand
12095  , typename MT5 // Type of the right-hand side matrix operand
12096  , typename ST2 > // Type of the scalar value
12097  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12098  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12099  {
 // A scalar clean-up loop over the trailing rows is required unless both C and A are padded.
12100  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
12101 
12102  const size_t M( A.rows() );
12103  const size_t N( B.columns() );
12104  const size_t K( A.columns() );
12105 
 // Whenever one of the LOW/UPP flags is set the operation must be square.
12106  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
12107 
 // ipos is the end of the vectorized row range: M rounded down to a multiple of SIMDSIZE if a
 // remainder loop is needed (see the assertion below), M itself otherwise.
12108  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
12109  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
12110 
 // SIMD representation of the scaling factor (set() presumably broadcasts it to all lanes).
12111  const SIMDType factor( set( scalar ) );
12112 
 // Row index shared by all of the following loops; each loop continues where the previous stopped.
12113  size_t i( 0UL );
12114 
 // Integral element types only: tiles of 8 SIMD vectors (rows) x 1 column.
 // The loop is skipped entirely if LOW or UPP is set.
12115  if( IsIntegral_v<ElementType> )
12116  {
12117  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
12118  for( size_t j=0UL; j<N; ++j )
12119  {
 // [kbegin,kend) is narrowed according to the IsLower/IsUpper/IsStrictly* traits of A (MT4)
 // and B (MT5); presumably this skips the structurally zero part of triangular operands.
 // The same pattern is repeated for every tile below.
12120  const size_t kbegin( ( IsLower_v<MT5> )
12121  ?( ( IsUpper_v<MT4> )
12122  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12123  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12124  :( IsUpper_v<MT4> ? i : 0UL ) );
12125  const size_t kend( ( IsUpper_v<MT5> )
12126  ?( ( IsLower_v<MT4> )
12127  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
12128  :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
12129  :( IsLower_v<MT4> ? min( i+SIMDSIZE*8UL, K ) : K ) );
12130 
 // NOTE(review): the accumulators rely on a default-constructed SIMDType being zero - confirm.
12131  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12132 
12133  for( size_t k=kbegin; k<kend; ++k ) {
12134  const SIMDType b1( set( B(k,j) ) );
12135  xmm1 += A.load(i ,k) * b1;
12136  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12137  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12138  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12139  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12140  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
12141  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
12142  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
12143  }
12144 
 // Subtract the scaled accumulators from the eight row vectors of column j.
12145  C.store( i , j, C.load(i ,j) - xmm1 * factor );
12146  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
12147  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12148  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12149  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12150  C.store( i+SIMDSIZE*5UL, j, C.load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
12151  C.store( i+SIMDSIZE*6UL, j, C.load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
12152  C.store( i+SIMDSIZE*7UL, j, C.load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
12153  }
12154  }
12155  }
12156 
 // Tiles of 5 SIMD vectors (rows) x 2 columns; only executed if neither LOW nor UPP is set.
12157  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
12158  {
12159  size_t j( 0UL );
12160 
12161  for( ; (j+2UL) <= N; j+=2UL )
12162  {
12163  const size_t kbegin( ( IsLower_v<MT5> )
12164  ?( ( IsUpper_v<MT4> )
12165  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12166  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12167  :( IsUpper_v<MT4> ? i : 0UL ) );
12168  const size_t kend( ( IsUpper_v<MT5> )
12169  ?( ( IsLower_v<MT4> )
12170  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12171  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12172  :( IsLower_v<MT4> ? min( i+SIMDSIZE*5UL, K ) : K ) );
12173 
 // xmm1-xmm5 accumulate column j, xmm6-xmm10 accumulate column j+1.
12174  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
12175 
12176  for( size_t k=kbegin; k<kend; ++k ) {
12177  const SIMDType a1( A.load(i ,k) );
12178  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12179  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12180  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12181  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
12182  const SIMDType b1( set( B(k,j ) ) );
12183  const SIMDType b2( set( B(k,j+1UL) ) );
12184  xmm1 += a1 * b1;
12185  xmm2 += a2 * b1;
12186  xmm3 += a3 * b1;
12187  xmm4 += a4 * b1;
12188  xmm5 += a5 * b1;
12189  xmm6 += a1 * b2;
12190  xmm7 += a2 * b2;
12191  xmm8 += a3 * b2;
12192  xmm9 += a4 * b2;
12193  xmm10 += a5 * b2;
12194  }
12195 
12196  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12197  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
12198  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12199  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12200  C.store( i+SIMDSIZE*4UL, j , C.load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
12201  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
12202  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
12203  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
12204  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
12205  C.store( i+SIMDSIZE*4UL, j+1UL, C.load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
12206  }
12207 
 // Single left-over column (odd N) of the 5-vector row block.
12208  if( j < N )
12209  {
12210  const size_t kbegin( ( IsLower_v<MT5> )
12211  ?( ( IsUpper_v<MT4> )
12212  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12213  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12214  :( IsUpper_v<MT4> ? i : 0UL ) );
12215  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
12216 
12217  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
12218 
12219  for( size_t k=kbegin; k<kend; ++k ) {
12220  const SIMDType b1( set( B(k,j) ) );
12221  xmm1 += A.load(i ,k) * b1;
12222  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12223  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12224  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12225  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12226  }
12227 
12228  C.store( i , j, C.load(i ,j) - xmm1 * factor );
12229  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
12230  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12231  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12232  C.store( i+SIMDSIZE*4UL, j, C.load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12233  }
12234  }
12235 
 // Tiles of 4 SIMD vectors (rows) x 2 columns; only executed if neither LOW nor UPP is set.
12236  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
12237  {
12238  size_t j( 0UL );
12239 
12240  for( ; (j+2UL) <= N; j+=2UL )
12241  {
12242  const size_t kbegin( ( IsLower_v<MT5> )
12243  ?( ( IsUpper_v<MT4> )
12244  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12245  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12246  :( IsUpper_v<MT4> ? i : 0UL ) );
12247  const size_t kend( ( IsUpper_v<MT5> )
12248  ?( ( IsLower_v<MT4> )
12249  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12250  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12251  :( IsLower_v<MT4> ? min( i+SIMDSIZE*4UL, K ) : K ) );
12252 
 // xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1.
12253  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12254 
12255  for( size_t k=kbegin; k<kend; ++k ) {
12256  const SIMDType a1( A.load(i ,k) );
12257  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12258  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12259  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12260  const SIMDType b1( set( B(k,j ) ) );
12261  const SIMDType b2( set( B(k,j+1UL) ) );
12262  xmm1 += a1 * b1;
12263  xmm2 += a2 * b1;
12264  xmm3 += a3 * b1;
12265  xmm4 += a4 * b1;
12266  xmm5 += a1 * b2;
12267  xmm6 += a2 * b2;
12268  xmm7 += a3 * b2;
12269  xmm8 += a4 * b2;
12270  }
12271 
12272  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12273  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
12274  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12275  C.store( i+SIMDSIZE*3UL, j , C.load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12276  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
12277  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
12278  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
12279  C.store( i+SIMDSIZE*3UL, j+1UL, C.load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
12280  }
12281 
 // Single left-over column (odd N) of the 4-vector row block.
12282  if( j < N )
12283  {
12284  const size_t kbegin( ( IsLower_v<MT5> )
12285  ?( ( IsUpper_v<MT4> )
12286  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12287  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12288  :( IsUpper_v<MT4> ? i : 0UL ) );
12289  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
12290 
12291  SIMDType xmm1, xmm2, xmm3, xmm4;
12292 
12293  for( size_t k=kbegin; k<kend; ++k ) {
12294  const SIMDType b1( set( B(k,j) ) );
12295  xmm1 += A.load(i ,k) * b1;
12296  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12297  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12298  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12299  }
12300 
12301  C.store( i , j, C.load(i ,j) - xmm1 * factor );
12302  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
12303  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12304  C.store( i+SIMDSIZE*3UL, j, C.load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12305  }
12306  }
12307 
 // Tiles of 3 SIMD vectors (rows) x 2 columns; only executed if neither LOW nor UPP is set.
12308  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
12309  {
12310  size_t j( 0UL );
12311 
12312  for( ; (j+2UL) <= N; j+=2UL )
12313  {
12314  const size_t kbegin( ( IsLower_v<MT5> )
12315  ?( ( IsUpper_v<MT4> )
12316  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12317  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12318  :( IsUpper_v<MT4> ? i : 0UL ) );
12319  const size_t kend( ( IsUpper_v<MT5> )
12320  ?( ( IsLower_v<MT4> )
12321  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12322  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12323  :( IsLower_v<MT4> ? min( i+SIMDSIZE*3UL, K ) : K ) );
12324 
 // xmm1-xmm3 accumulate column j, xmm4-xmm6 accumulate column j+1.
12325  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12326 
12327  for( size_t k=kbegin; k<kend; ++k ) {
12328  const SIMDType a1( A.load(i ,k) );
12329  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12330  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12331  const SIMDType b1( set( B(k,j ) ) );
12332  const SIMDType b2( set( B(k,j+1UL) ) );
12333  xmm1 += a1 * b1;
12334  xmm2 += a2 * b1;
12335  xmm3 += a3 * b1;
12336  xmm4 += a1 * b2;
12337  xmm5 += a2 * b2;
12338  xmm6 += a3 * b2;
12339  }
12340 
12341  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12342  C.store( i+SIMDSIZE , j , C.load(i+SIMDSIZE ,j ) - xmm2 * factor );
12343  C.store( i+SIMDSIZE*2UL, j , C.load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12344  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
12345  C.store( i+SIMDSIZE , j+1UL, C.load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
12346  C.store( i+SIMDSIZE*2UL, j+1UL, C.load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
12347  }
12348 
 // Single left-over column (odd N) of the 3-vector row block.
12349  if( j < N )
12350  {
12351  const size_t kbegin( ( IsLower_v<MT5> )
12352  ?( ( IsUpper_v<MT4> )
12353  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12354  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12355  :( IsUpper_v<MT4> ? i : 0UL ) );
12356  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
12357 
12358  SIMDType xmm1, xmm2, xmm3;
12359 
12360  for( size_t k=kbegin; k<kend; ++k ) {
12361  const SIMDType b1( set( B(k,j) ) );
12362  xmm1 += A.load(i ,k) * b1;
12363  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12364  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12365  }
12366 
12367  C.store( i , j, C.load(i ,j) - xmm1 * factor );
12368  C.store( i+SIMDSIZE , j, C.load(i+SIMDSIZE ,j) - xmm2 * factor );
12369  C.store( i+SIMDSIZE*2UL, j, C.load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12370  }
12371  }
12372 
 // Blocks of 2 SIMD vectors (rows); executed unless both LOW and UPP are set. The column range
 // is [ UPP ? i : 0, LOW ? min(i+SIMDSIZE*2,N) : N ) and is processed 4, 3, 2 and 1 column(s)
 // at a time.
12373  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
12374  {
12375  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
12376  size_t j( UPP ? i : 0UL );
12377 
 // 2 SIMD vectors x 4 columns.
12378  for( ; (j+4UL) <= jend; j+=4UL )
12379  {
12380  const size_t kbegin( ( IsLower_v<MT5> )
12381  ?( ( IsUpper_v<MT4> )
12382  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12383  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12384  :( IsUpper_v<MT4> ? i : 0UL ) );
12385  const size_t kend( ( IsUpper_v<MT5> )
12386  ?( ( IsLower_v<MT4> )
12387  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
12388  :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
12389  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
12390 
 // Odd-numbered accumulators belong to row vector i, even-numbered ones to i+SIMDSIZE.
12391  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12392 
12393  for( size_t k=kbegin; k<kend; ++k ) {
12394  const SIMDType a1( A.load(i ,k) );
12395  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12396  const SIMDType b1( set( B(k,j ) ) );
12397  const SIMDType b2( set( B(k,j+1UL) ) );
12398  const SIMDType b3( set( B(k,j+2UL) ) );
12399  const SIMDType b4( set( B(k,j+3UL) ) );
12400  xmm1 += a1 * b1;
12401  xmm2 += a2 * b1;
12402  xmm3 += a1 * b2;
12403  xmm4 += a2 * b2;
12404  xmm5 += a1 * b3;
12405  xmm6 += a2 * b3;
12406  xmm7 += a1 * b4;
12407  xmm8 += a2 * b4;
12408  }
12409 
12410  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12411  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
12412  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
12413  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12414  C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
12415  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12416  C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
12417  C.store( i+SIMDSIZE, j+3UL, C.load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
12418  }
12419 
 // 2 SIMD vectors x 3 columns.
12420  for( ; (j+3UL) <= jend; j+=3UL )
12421  {
12422  const size_t kbegin( ( IsLower_v<MT5> )
12423  ?( ( IsUpper_v<MT4> )
12424  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12425  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12426  :( IsUpper_v<MT4> ? i : 0UL ) );
12427  const size_t kend( ( IsUpper_v<MT5> )
12428  ?( ( IsLower_v<MT4> )
12429  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
12430  :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
12431  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
12432 
12433  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12434 
12435  for( size_t k=kbegin; k<kend; ++k ) {
12436  const SIMDType a1( A.load(i ,k) );
12437  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12438  const SIMDType b1( set( B(k,j ) ) );
12439  const SIMDType b2( set( B(k,j+1UL) ) );
12440  const SIMDType b3( set( B(k,j+2UL) ) );
12441  xmm1 += a1 * b1;
12442  xmm2 += a2 * b1;
12443  xmm3 += a1 * b2;
12444  xmm4 += a2 * b2;
12445  xmm5 += a1 * b3;
12446  xmm6 += a2 * b3;
12447  }
12448 
12449  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12450  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - xmm2 * factor );
12451  C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
12452  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12453  C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
12454  C.store( i+SIMDSIZE, j+2UL, C.load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12455  }
12456 
 // 2 SIMD vectors x 2 columns. The k loop is unrolled by two: xmm1-xmm4 collect the
 // contributions of k, xmm5-xmm8 those of k+1; both halves are summed before the store.
12457  for( ; (j+2UL) <= jend; j+=2UL )
12458  {
12459  const size_t kbegin( ( IsLower_v<MT5> )
12460  ?( ( IsUpper_v<MT4> )
12461  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12462  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12463  :( IsUpper_v<MT4> ? i : 0UL ) );
12464  const size_t kend( ( IsUpper_v<MT5> )
12465  ?( ( IsLower_v<MT4> )
12466  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12467  :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12468  :( IsLower_v<MT4> ? min( i+SIMDSIZE*2UL, K ) : K ) );
12469 
12470  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12471  size_t k( kbegin );
12472 
12473  for( ; (k+2UL) <= kend; k+=2UL ) {
12474  const SIMDType a1( A.load(i ,k ) );
12475  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
12476  const SIMDType a3( A.load(i ,k+1UL) );
12477  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
12478  const SIMDType b1( set( B(k ,j ) ) );
12479  const SIMDType b2( set( B(k ,j+1UL) ) );
12480  const SIMDType b3( set( B(k+1UL,j ) ) );
12481  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
12482  xmm1 += a1 * b1;
12483  xmm2 += a2 * b1;
12484  xmm3 += a1 * b2;
12485  xmm4 += a2 * b2;
12486  xmm5 += a3 * b3;
12487  xmm6 += a4 * b3;
12488  xmm7 += a3 * b4;
12489  xmm8 += a4 * b4;
12490  }
12491 
 // At most one remaining k iteration after the unrolled loop.
12492  for( ; k<kend; ++k ) {
12493  const SIMDType a1( A.load(i ,k) );
12494  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12495  const SIMDType b1( set( B(k,j ) ) );
12496  const SIMDType b2( set( B(k,j+1UL) ) );
12497  xmm1 += a1 * b1;
12498  xmm2 += a2 * b1;
12499  xmm3 += a1 * b2;
12500  xmm4 += a2 * b2;
12501  }
12502 
12503  C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
12504  C.store( i+SIMDSIZE, j , C.load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
12505  C.store( i , j+1UL, C.load(i ,j+1UL) - (xmm3+xmm7) * factor );
12506  C.store( i+SIMDSIZE, j+1UL, C.load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
12507  }
12508 
 // 2 SIMD vectors x 1 left-over column (k loop unrolled by two).
12509  if( j < jend )
12510  {
12511  const size_t kbegin( ( IsLower_v<MT5> )
12512  ?( ( IsUpper_v<MT4> )
12513  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12514  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12515  :( IsUpper_v<MT4> ? i : 0UL ) );
12516  const size_t kend( ( IsLower_v<MT4> )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
12517 
12518  SIMDType xmm1, xmm2, xmm3, xmm4;
12519  size_t k( kbegin );
12520 
12521  for( ; (k+2UL) <= kend; k+=2UL ) {
12522  const SIMDType b1( set( B(k ,j) ) );
12523  const SIMDType b2( set( B(k+1UL,j) ) );
12524  xmm1 += A.load(i ,k ) * b1;
12525  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
12526  xmm3 += A.load(i ,k+1UL) * b2;
12527  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
12528  }
12529 
12530  for( ; k<kend; ++k ) {
12531  const SIMDType b1( set( B(k,j) ) );
12532  xmm1 += A.load(i ,k) * b1;
12533  xmm2 += A.load(i+SIMDSIZE,k) * b1;
12534  }
12535 
12536  C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
12537  C.store( i+SIMDSIZE, j, C.load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
12538  }
12539  }
12540 
 // Blocks of a single SIMD vector (rows) for all remaining rows below ipos. The column range
 // is [ UPP ? i : 0, LOW && UPP ? min(i+SIMDSIZE,N) : N ), again processed 4, 3, 2 and 1
 // column(s) at a time with the k loop unrolled by two.
12541  for( ; i<ipos; i+=SIMDSIZE )
12542  {
12543  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
12544  size_t j( UPP ? i : 0UL );
12545 
 // 1 SIMD vector x 4 columns.
12546  for( ; (j+4UL) <= jend; j+=4UL )
12547  {
12548  const size_t kbegin( ( IsLower_v<MT5> )
12549  ?( ( IsUpper_v<MT4> )
12550  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12551  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12552  :( IsUpper_v<MT4> ? i : 0UL ) );
12553  const size_t kend( ( IsUpper_v<MT5> )
12554  ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
12555  :( K ) );
12556 
12557  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12558  size_t k( kbegin );
12559 
12560  for( ; (k+2UL) <= kend; k+=2UL ) {
12561  const SIMDType a1( A.load(i,k ) );
12562  const SIMDType a2( A.load(i,k+1UL) );
12563  xmm1 += a1 * set( B(k ,j ) );
12564  xmm2 += a1 * set( B(k ,j+1UL) );
12565  xmm3 += a1 * set( B(k ,j+2UL) );
12566  xmm4 += a1 * set( B(k ,j+3UL) );
12567  xmm5 += a2 * set( B(k+1UL,j ) );
12568  xmm6 += a2 * set( B(k+1UL,j+1UL) );
12569  xmm7 += a2 * set( B(k+1UL,j+2UL) );
12570  xmm8 += a2 * set( B(k+1UL,j+3UL) );
12571  }
12572 
12573  for( ; k<kend; ++k ) {
12574  const SIMDType a1( A.load(i,k) );
12575  xmm1 += a1 * set( B(k,j ) );
12576  xmm2 += a1 * set( B(k,j+1UL) );
12577  xmm3 += a1 * set( B(k,j+2UL) );
12578  xmm4 += a1 * set( B(k,j+3UL) );
12579  }
12580 
12581  C.store( i, j , C.load(i,j ) - (xmm1+xmm5) * factor );
12582  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm6) * factor );
12583  C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm7) * factor );
12584  C.store( i, j+3UL, C.load(i,j+3UL) - (xmm4+xmm8) * factor );
12585  }
12586 
 // 1 SIMD vector x 3 columns.
12587  for( ; (j+3UL) <= jend; j+=3UL )
12588  {
12589  const size_t kbegin( ( IsLower_v<MT5> )
12590  ?( ( IsUpper_v<MT4> )
12591  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12592  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12593  :( IsUpper_v<MT4> ? i : 0UL ) );
12594  const size_t kend( ( IsUpper_v<MT5> )
12595  ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
12596  :( K ) );
12597 
12598  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12599  size_t k( kbegin );
12600 
12601  for( ; (k+2UL) <= kend; k+=2UL ) {
12602  const SIMDType a1( A.load(i,k ) );
12603  const SIMDType a2( A.load(i,k+1UL) );
12604  xmm1 += a1 * set( B(k ,j ) );
12605  xmm2 += a1 * set( B(k ,j+1UL) );
12606  xmm3 += a1 * set( B(k ,j+2UL) );
12607  xmm4 += a2 * set( B(k+1UL,j ) );
12608  xmm5 += a2 * set( B(k+1UL,j+1UL) );
12609  xmm6 += a2 * set( B(k+1UL,j+2UL) );
12610  }
12611 
12612  for( ; k<kend; ++k ) {
12613  const SIMDType a1( A.load(i,k) );
12614  xmm1 += a1 * set( B(k,j ) );
12615  xmm2 += a1 * set( B(k,j+1UL) );
12616  xmm3 += a1 * set( B(k,j+2UL) );
12617  }
12618 
12619  C.store( i, j , C.load(i,j ) - (xmm1+xmm4) * factor );
12620  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm5) * factor );
12621  C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm6) * factor );
12622  }
12623 
 // 1 SIMD vector x 2 columns.
12624  for( ; (j+2UL) <= jend; j+=2UL )
12625  {
12626  const size_t kbegin( ( IsLower_v<MT5> )
12627  ?( ( IsUpper_v<MT4> )
12628  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12629  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12630  :( IsUpper_v<MT4> ? i : 0UL ) );
12631  const size_t kend( ( IsUpper_v<MT5> )
12632  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
12633  :( K ) );
12634 
12635  SIMDType xmm1, xmm2, xmm3, xmm4;
12636  size_t k( kbegin );
12637 
12638  for( ; (k+2UL) <= kend; k+=2UL ) {
12639  const SIMDType a1( A.load(i,k ) );
12640  const SIMDType a2( A.load(i,k+1UL) );
12641  xmm1 += a1 * set( B(k ,j ) );
12642  xmm2 += a1 * set( B(k ,j+1UL) );
12643  xmm3 += a2 * set( B(k+1UL,j ) );
12644  xmm4 += a2 * set( B(k+1UL,j+1UL) );
12645  }
12646 
12647  for( ; k<kend; ++k ) {
12648  const SIMDType a1( A.load(i,k) );
12649  xmm1 += a1 * set( B(k,j ) );
12650  xmm2 += a1 * set( B(k,j+1UL) );
12651  }
12652 
12653  C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
12654  C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm4) * factor );
12655  }
12656 
 // 1 SIMD vector x 1 left-over column; the k range extends to K here.
12657  if( j < jend )
12658  {
12659  const size_t kbegin( ( IsLower_v<MT5> )
12660  ?( ( IsUpper_v<MT4> )
12661  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12662  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12663  :( IsUpper_v<MT4> ? i : 0UL ) );
12664 
12665  SIMDType xmm1, xmm2;
12666  size_t k( kbegin );
12667 
12668  for( ; (k+2UL) <= K; k+=2UL ) {
12669  xmm1 += A.load(i,k ) * set( B(k ,j) );
12670  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
12671  }
12672 
12673  for( ; k<K; ++k ) {
12674  xmm1 += A.load(i,k) * set( B(k,j) );
12675  }
12676 
12677  C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
12678  }
12679  }
12680 
 // Scalar clean-up for the rows in [ipos,M); only executed if a remainder is required. The
 // column range is [ UPP ? i : 0, LOW ? i+1 : N ) and is processed two columns at a time.
12681  for( ; remainder && i<M; ++i )
12682  {
12683  const size_t jend( LOW ? i+1UL : N );
12684  size_t j( UPP ? i : 0UL );
12685 
12686  for( ; (j+2UL) <= jend; j+=2UL )
12687  {
12688  const size_t kbegin( ( IsLower_v<MT5> )
12689  ?( ( IsUpper_v<MT4> )
12690  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12691  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12692  :( IsUpper_v<MT4> ? i : 0UL ) );
12693  const size_t kend( ( IsUpper_v<MT5> )
12694  ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
12695  :( K ) );
12696 
 // Value-initialized scalar accumulators for the elements (i,j) and (i,j+1).
12697  ElementType value1{};
12698  ElementType value2{};
12699 
12700  for( size_t k=kbegin; k<kend; ++k ) {
12701  value1 += A(i,k) * B(k,j );
12702  value2 += A(i,k) * B(k,j+1UL);
12703  }
12704 
12705  C(i,j ) -= value1 * scalar;
12706  C(i,j+1UL) -= value2 * scalar;
12707  }
12708 
 // Last single column of the scalar clean-up.
12709  if( j < jend )
12710  {
12711  const size_t kbegin( ( IsLower_v<MT5> )
12712  ?( ( IsUpper_v<MT4> )
12713  ?( max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12714  :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12715  :( IsUpper_v<MT4> ? i : 0UL ) );
12716 
12717  ElementType value{};
12718 
12719  for( size_t k=kbegin; k<K; ++k ) {
12720  value += A(i,k) * B(k,j);
12721  }
12722 
12723  C(i,j) -= value * scalar;
12724  }
12725  }
12726  }
12727  //**********************************************************************************************
12728 
12729  //**Default subtraction assignment to dense matrices (large matrices)***************************
/*!\brief Default subtraction assignment kernel for large matrices (non-vectorized fallback).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// This overload is selected whenever UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> does not
// hold. It simply forwards all arguments to selectDefaultSubAssignKernel().
*/
12743  template< typename MT3 // Type of the left-hand side target matrix
12744  , typename MT4 // Type of the left-hand side matrix operand
12745  , typename MT5 // Type of the right-hand side matrix operand
12746  , typename ST2 > // Type of the scalar value
12747  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12748  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12749  {
 // No dedicated large-matrix implementation for this case: reuse the default kernel.
12750  selectDefaultSubAssignKernel( C, A, B, scalar );
12751  }
12752  //**********************************************************************************************
12753 
12754  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
12769  template< typename MT3 // Type of the left-hand side target matrix
12770  , typename MT4 // Type of the left-hand side matrix operand
12771  , typename MT5 // Type of the right-hand side matrix operand
12772  , typename ST2 > // Type of the scalar value
12773  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12774  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12775  {
12776  if( LOW )
12777  lmmm( C, A, B, -scalar, ST2(1) );
12778  else if( UPP )
12779  ummm( C, A, B, -scalar, ST2(1) );
12780  else
12781  mmm( C, A, B, -scalar, ST2(1) );
12782  }
12783  //**********************************************************************************************
12784 
12785  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
12799  template< typename MT3 // Type of the left-hand side target matrix
12800  , typename MT4 // Type of the left-hand side matrix operand
12801  , typename MT5 // Type of the right-hand side matrix operand
12802  , typename ST2 > // Type of the scalar value
12803  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12804  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
12805  {
12806  selectLargeSubAssignKernel( C, A, B, scalar );
12807  }
12808  //**********************************************************************************************
12809 
12810  //**BLAS-based subtraction assignment to dense matrices*****************************************
12811 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
12812 
12825  template< typename MT3 // Type of the left-hand side target matrix
12826  , typename MT4 // Type of the left-hand side matrix operand
12827  , typename MT5 // Type of the right-hand side matrix operand
12828  , typename ST2 > // Type of the scalar value
12829  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12830  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
12831  {
12832  using ET = ElementType_t<MT3>;
12833 
12834  if( IsTriangular_v<MT4> ) {
12835  ResultType_t<MT3> tmp( serial( B ) );
12836  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
12837  subAssign( C, tmp );
12838  }
12839  else if( IsTriangular_v<MT5> ) {
12840  ResultType_t<MT3> tmp( serial( A ) );
12841  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
12842  subAssign( C, tmp );
12843  }
12844  else {
12845  gemm( C, A, B, ET(-scalar), ET(1) );
12846  }
12847  }
12848 #endif
12849  //**********************************************************************************************
12850 
12851  //**Subtraction assignment to sparse matrices***************************************************
12852  // No special implementation for the subtraction assignment to sparse matrices.
12853  //**********************************************************************************************
12854 
12855  //**Schur product assignment to dense matrices**************************************************
12867  template< typename MT // Type of the target dense matrix
12868  , bool SO > // Storage order of the target dense matrix
12869  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
12870  {
12872 
12876 
12877  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12878  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12879 
12880  const ResultType tmp( serial( rhs ) );
12881  schurAssign( ~lhs, tmp );
12882  }
12883  //**********************************************************************************************
12884 
12885  //**Schur product assignment to sparse matrices*************************************************
12886  // No special implementation for the Schur product assignment to sparse matrices.
12887  //**********************************************************************************************
12888 
12889  //**Multiplication assignment to dense matrices*************************************************
12890  // No special implementation for the multiplication assignment to dense matrices.
12891  //**********************************************************************************************
12892 
12893  //**Multiplication assignment to sparse matrices************************************************
12894  // No special implementation for the multiplication assignment to sparse matrices.
12895  //**********************************************************************************************
12896 
12897  //**SMP assignment to dense matrices************************************************************
12912  template< typename MT // Type of the target dense matrix
12913  , bool SO > // Storage order of the target dense matrix
12914  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
12915  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
12916  {
12918 
12919  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12920  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12921 
12922  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
12923  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
12924 
12925  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
12926  return;
12927  }
12928  else if( left.columns() == 0UL ) {
12929  reset( ~lhs );
12930  return;
12931  }
12932 
12933  LT A( left ); // Evaluation of the left-hand side dense matrix operand
12934  RT B( right ); // Evaluation of the right-hand side dense matrix operand
12935 
12936  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
12937  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
12938  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
12939  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
12940  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
12941  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
12942 
12943  smpAssign( ~lhs, A * B * rhs.scalar_ );
12944  }
12945  //**********************************************************************************************
12946 
12947  //**SMP assignment to sparse matrices***********************************************************
12962  template< typename MT // Type of the target sparse matrix
12963  , bool SO > // Storage order of the target sparse matrix
12964  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
12965  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
12966  {
12968 
12969  using TmpType = If_t< SO, ResultType, OppositeType >;
12970 
12977 
12978  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12979  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12980 
12981  const ForwardFunctor fwd;
12982 
12983  const TmpType tmp( rhs );
12984  smpAssign( ~lhs, fwd( tmp ) );
12985  }
12986  //**********************************************************************************************
12987 
12988  //**SMP addition assignment to dense matrices***************************************************
13003  template< typename MT // Type of the target dense matrix
13004  , bool SO > // Storage order of the target dense matrix
13005  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13006  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13007  {
13009 
13010  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13011  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13012 
13013  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13014  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13015 
13016  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
13017  return;
13018  }
13019 
13020  LT A( left ); // Evaluation of the left-hand side dense matrix operand
13021  RT B( right ); // Evaluation of the right-hand side dense matrix operand
13022 
13023  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13024  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13025  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13026  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13027  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
13028  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
13029 
13030  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
13031  }
13032  //**********************************************************************************************
13033 
13034  //**SMP addition assignment to sparse matrices**************************************************
13035  // No special implementation for the SMP addition assignment to sparse matrices.
13036  //**********************************************************************************************
13037 
13038  //**SMP subtraction assignment to dense matrices************************************************
13053  template< typename MT // Type of the target dense matrix
13054  , bool SO > // Storage order of the target dense matrix
13055  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13056  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13057  {
13059 
13060  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13061  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13062 
13063  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13064  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13065 
13066  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
13067  return;
13068  }
13069 
13070  LT A( left ); // Evaluation of the left-hand side dense matrix operand
13071  RT B( right ); // Evaluation of the right-hand side dense matrix operand
13072 
13073  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13074  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13075  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13076  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13077  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
13078  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
13079 
13080  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
13081  }
13082  //**********************************************************************************************
13083 
13084  //**SMP subtraction assignment to sparse matrices***********************************************
13085  // No special implementation for the SMP subtraction assignment to sparse matrices.
13086  //**********************************************************************************************
13087 
13088  //**SMP Schur product assignment to dense matrices**********************************************
13100  template< typename MT // Type of the target dense matrix
13101  , bool SO > // Storage order of the target dense matrix
13102  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13103  {
13105 
13109 
13110  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13111  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13112 
13113  const ResultType tmp( rhs );
13114  smpSchurAssign( ~lhs, tmp );
13115  }
13116  //**********************************************************************************************
13117 
13118  //**SMP Schur product assignment to sparse matrices*********************************************
13119  // No special implementation for the SMP Schur product assignment to sparse matrices.
13120  //**********************************************************************************************
13121 
13122  //**SMP multiplication assignment to dense matrices*********************************************
13123  // No special implementation for the SMP multiplication assignment to dense matrices.
13124  //**********************************************************************************************
13125 
13126  //**SMP multiplication assignment to sparse matrices********************************************
13127  // No special implementation for the SMP multiplication assignment to sparse matrices.
13128  //**********************************************************************************************
13129 
13130  //**Compile time checks*************************************************************************
13139  //**********************************************************************************************
13140 };
13142 //*************************************************************************************************
13143 
13144 
13145 
13146 
13147 //=================================================================================================
13148 //
13149 // GLOBAL BINARY ARITHMETIC OPERATORS
13150 //
13151 //=================================================================================================
13152 
13153 //*************************************************************************************************
13183 template< typename MT1 // Type of the left-hand side dense matrix
13184  , typename MT2 > // Type of the right-hand side dense matrix
13185 inline decltype(auto)
13186  operator*( const DenseMatrix<MT1,true>& lhs, const DenseMatrix<MT2,false>& rhs )
13187 {
13189 
13190  if( (~lhs).columns() != (~rhs).rows() ) {
13191  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
13192  }
13193 
13195  return ReturnType( ~lhs, ~rhs );
13196 }
13197 //*************************************************************************************************
13198 
13199 
13200 
13201 
13202 //=================================================================================================
13203 //
13204 // GLOBAL FUNCTIONS
13205 //
13206 //=================================================================================================
13207 
13208 //*************************************************************************************************
13233 template< typename MT1 // Type of the left-hand side dense matrix
13234  , typename MT2 // Type of the right-hand side dense matrix
13235  , bool SF // Symmetry flag
13236  , bool HF // Hermitian flag
13237  , bool LF // Lower flag
13238  , bool UF > // Upper flag
13239 inline decltype(auto) declsym( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13240 {
13242 
13243  if( !isSquare( dm ) ) {
13244  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
13245  }
13246 
13247  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
13248  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13249 }
13251 //*************************************************************************************************
13252 
13253 
13254 //*************************************************************************************************
13279 template< typename MT1 // Type of the left-hand side dense matrix
13280  , typename MT2 // Type of the right-hand side dense matrix
13281  , bool SF // Symmetry flag
13282  , bool HF // Hermitian flag
13283  , bool LF // Lower flag
13284  , bool UF > // Upper flag
13285 inline decltype(auto) declherm( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13286 {
13288 
13289  if( !isSquare( dm ) ) {
13290  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
13291  }
13292 
13293  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
13294  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13295 }
13297 //*************************************************************************************************
13298 
13299 
13300 //*************************************************************************************************
13325 template< typename MT1 // Type of the left-hand side dense matrix
13326  , typename MT2 // Type of the right-hand side dense matrix
13327  , bool SF // Symmetry flag
13328  , bool HF // Hermitian flag
13329  , bool LF // Lower flag
13330  , bool UF > // Upper flag
13331 inline decltype(auto) decllow( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13332 {
13334 
13335  if( !isSquare( dm ) ) {
13336  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
13337  }
13338 
13339  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
13340  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13341 }
13343 //*************************************************************************************************
13344 
13345 
13346 //*************************************************************************************************
13371 template< typename MT1 // Type of the left-hand side dense matrix
13372  , typename MT2 // Type of the right-hand side dense matrix
13373  , bool SF // Symmetry flag
13374  , bool HF // Hermitian flag
13375  , bool LF // Lower flag
13376  , bool UF > // Upper flag
13377 inline decltype(auto) declupp( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13378 {
13380 
13381  if( !isSquare( dm ) ) {
13382  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
13383  }
13384 
13385  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
13386  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13387 }
13389 //*************************************************************************************************
13390 
13391 
13392 //*************************************************************************************************
13417 template< typename MT1 // Type of the left-hand side dense matrix
13418  , typename MT2 // Type of the right-hand side dense matrix
13419  , bool SF // Symmetry flag
13420  , bool HF // Hermitian flag
13421  , bool LF // Lower flag
13422  , bool UF > // Upper flag
13423 inline decltype(auto) decldiag( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13424 {
13426 
13427  if( !isSquare( dm ) ) {
13428  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
13429  }
13430 
13431  using ReturnType = const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
13432  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13433 }
13435 //*************************************************************************************************
13436 
13437 
13438 
13439 
13440 //=================================================================================================
13441 //
13442 // SIZE SPECIALIZATIONS
13443 //
13444 //=================================================================================================
13445 
13446 //*************************************************************************************************
13448 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13449 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
13450  : public Size<MT1,0UL>
13451 {};
13452 
13453 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13454 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
13455  : public Size<MT2,1UL>
13456 {};
13458 //*************************************************************************************************
13459 
13460 
13461 
13462 
13463 //=================================================================================================
13464 //
13465 // ISALIGNED SPECIALIZATIONS
13466 //
13467 //=================================================================================================
13468 
13469 //*************************************************************************************************
13471 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13472 struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13473  : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
13474 {};
13476 //*************************************************************************************************
13477 
13478 } // namespace blaze
13479 
13480 #endif
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:152
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:426
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Data type constraint.
Header file for the generic min algorithm.
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:167
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:273
Header file for basic type definitions.
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:162
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:478
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:266
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias declaration for the If class template. The If_t alias declaration provides a convenien...
Definition: If.h:109
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:532
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:479
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:271
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:591
static constexpr bool smpAssignable
Compilation flag for SMP assignments.
Definition: CompressedMatrix.h:3113
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:270
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:522
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1002
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:596
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:158
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:412
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:422
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:431
Header file for the IsBLASCompatible type trait.
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.The ElementType_t alias declaration provide...
Definition: Aliases.h:170
static constexpr bool UPP
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:174
static constexpr bool LOW
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:173
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:172
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:402
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:268
Header file for the IsComplexDouble type trait.
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:157
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:157
Constraint on the data type.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:564
Header file for the DisableIf class template.
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
constexpr bool IsSIMDCombinable_v
Auxiliary variable template for the IsSIMDCombinable type trait.The IsSIMDCombinable_v variable templ...
Definition: IsSIMDCombinable.h:137
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatDMatMultExpr.h:290
Header file for the If class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:434
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
constexpr bool HasSIMDMult_v
Auxiliary variable template for the HasSIMDMult type trait.The HasSIMDMult_v variable template provid...
Definition: HasSIMDMult.h:189
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1147
Header file for the decllow trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:272
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:392
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr &> CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:167
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1002
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Header file for the IsStrictlyTriangular type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:552
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:161
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
Header file for the exception macros of the math module.
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1179
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:604
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:466
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:446
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:376
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:285
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:468
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
Header file for the MatScalarMultExpr base class.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:173
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatDMatMultExpr.h:297
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:421
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:312
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
static constexpr bool SYM
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:171
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:101
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:576
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3081
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:456
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:327
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:453
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:440
constexpr bool HasSIMDAdd_v
Auxiliary variable template for the HasSIMDAdd type trait. The HasSIMDAdd_v variable template provides...
Definition: HasSIMDAdd.h:188
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:269
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
If_t< IsExpression_v< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:279
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:156
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatDMatMultExpr.h:303
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:586
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1326
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:542
If_t< IsExpression_v< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:170
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:282
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:276
Header file for the IsExpression type trait class.
Header file for the function trace functionality.