TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
51 #include <blaze/math/dense/MMM.h>
52 #include <blaze/math/Exception.h>
58 #include <blaze/math/Functions.h>
66 #include <blaze/math/shims/Reset.h>
68 #include <blaze/math/SIMD.h>
115 #include <blaze/system/BLAS.h>
116 #include <blaze/system/Blocking.h>
117 #include <blaze/system/Debugging.h>
119 #include <blaze/system/Thresholds.h>
120 #include <blaze/util/Assert.h>
121 #include <blaze/util/Complex.h>
125 #include <blaze/util/DisableIf.h>
126 #include <blaze/util/EnableIf.h>
129 #include <blaze/util/InvalidType.h>
130 #include <blaze/util/mpl/And.h>
131 #include <blaze/util/mpl/Bool.h>
132 #include <blaze/util/mpl/If.h>
133 #include <blaze/util/mpl/Not.h>
134 #include <blaze/util/mpl/Or.h>
135 #include <blaze/util/TrueType.h>
136 #include <blaze/util/Types.h>
145 
146 
147 namespace blaze {
148 
149 //=================================================================================================
150 //
151 // CLASS TDMATDMATMULTEXPR
152 //
153 //=================================================================================================
154 
155 //*************************************************************************************************
162 template< typename MT1 // Type of the left-hand side dense matrix
163  , typename MT2 // Type of the right-hand side dense matrix
164  , bool SF // Symmetry flag
165  , bool HF // Hermitian flag
166  , bool LF // Lower flag
167  , bool UF > // Upper flag
168 class TDMatDMatMultExpr : public DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true >
169  , private MatMatMultExpr
170  , private Computation
171 {
172  private:
173  //**Type definitions****************************************************************************
180  //**********************************************************************************************
181 
182  //**********************************************************************************************
// Compile-time flag: true when the left operand type MT1 is a computation
// (IsComputation) or reports that it requires evaluation (RequiresEvaluation).
184  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
185  //**********************************************************************************************
186 
187  //**********************************************************************************************
// Compile-time flag: true when the right operand type MT2 is a computation
// (IsComputation) or reports that it requires evaluation (RequiresEvaluation).
189  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
190  //**********************************************************************************************
191 
192  //**********************************************************************************************
// Normalized structure flags derived from the SF/HF/LF/UF template flags:
//  - SYM  : SF is set and none of HF, LF, UF is set.
//  - HERM : HF is set and neither LF nor UF is set.
//  - LOW  : LF is set, or UF is set together with SF or HF.
//  - UPP  : UF is set, or LF is set together with SF or HF.
// As a consequence, SF or HF combined with LF or UF sets both LOW and UPP.
194  enum : bool {
195  SYM = ( SF && !( HF || LF || UF ) ),
196  HERM = ( HF && !( LF || UF ) ),
197  LOW = ( LF || ( ( SF || HF ) && UF ) ),
198  UPP = ( UF || ( ( SF || HF ) && LF ) )
199  };
200  //**********************************************************************************************
201 
202  //**********************************************************************************************
204 
// Helper trait whose value is true when at least one operand needs evaluation
// (evaluateLeft || evaluateRight). The template parameters T1, T2 and T3 do not
// take part in the visible expression.
208  template< typename T1, typename T2, typename T3 >
209  struct IsEvaluationRequired {
210  enum : bool { value = ( evaluateLeft || evaluateRight ) };
211  };
213  //**********************************************************************************************
214 
215  //**********************************************************************************************
217 
// Helper trait deciding whether a BLAS kernel may be used for the types T1, T2, T3.
// The conditions visible here require that none of SYM/HERM/LOW/UPP is set, that all
// three types are SIMD-enabled, and that T1 and T3 have the same element type.
// NOTE(review): the listing numbers jump (221 -> 223 -> 228 -> 233), so the opening
// `enum` line and further conditions appear to be absent from this listing; consult
// the original header for the complete expression.
220  template< typename T1, typename T2, typename T3 >
221  struct UseBlasKernel {
223  !SYM && !HERM && !LOW && !UPP &&
228  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
233  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
234  };
236  //**********************************************************************************************
237 
238  //**********************************************************************************************
240 
// Helper trait deciding whether the vectorized default kernel may be used for T1, T2, T3.
// The conditions visible here require useOptimizedKernels and SIMD-enabled T1, T2 and T3;
// a further type comparison involving ElementType_<T3> is only partially visible.
// NOTE(review): the listing numbers jump (245 -> 247 -> 250 -> 253), so parts of the
// expression, including its end, appear to be absent from this listing.
243  template< typename T1, typename T2, typename T3 >
244  struct UseVectorizedDefaultKernel {
245  enum : bool { value = useOptimizedKernels &&
247  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
250  , ElementType_<T3> >::value &&
253  };
255  //**********************************************************************************************
256 
257  //**********************************************************************************************
259 
// Functor type selected from the normalized structure flags (assuming IfTrue_<C,T,F>
// yields T when C is true and F otherwise -- TODO confirm against its definition):
//   HERM            -> DeclHerm
//   else SYM        -> DeclSym
//   else LOW && UPP -> DeclDiag
//   else LOW        -> DeclLow
//   else UPP        -> DeclUpp
//   otherwise       -> Noop
262  typedef IfTrue_< HERM
263  , DeclHerm
264  , IfTrue_< SYM
265  , DeclSym
266  , IfTrue_< LOW
267  , IfTrue_< UPP
268  , DeclDiag
269  , DeclLow >
270  , IfTrue_< UPP
271  , DeclUpp
272  , Noop > > > > ForwardFunctor;
274  //**********************************************************************************************
275 
276  public:
277  //**Type definitions****************************************************************************
280 
// ReturnType is the (const) element type returned by operator() and at();
// CompositeType is the (const) result type.
// NOTE(review): ElementType and ResultType are not declared in the visible part of
// this listing (listing numbers 281-285 are absent).
286  typedef const ElementType ReturnType;
287  typedef const ResultType CompositeType;
288 
// Type used to hold the left operand: chosen via If_ on IsExpression<MT1> between
// `const MT1` (a copy) and `const MT1&` (a reference). Presumably the by-value form
// is taken when MT1 is an expression -- confirm against the definition of If_.
290  typedef If_< IsExpression<MT1>, const MT1, const MT1& > LeftOperand;
291 
// Type used to hold the right operand: chosen via If_ on IsExpression<MT2> between
// `const MT2` (a copy) and `const MT2&` (a reference). Presumably the by-value form
// is taken when MT2 is an expression -- confirm against the definition of If_.
293  typedef If_< IsExpression<MT2>, const MT2, const MT2& > RightOperand;
294 
297 
300  //**********************************************************************************************
301 
302  //**Compilation flags***************************************************************************
// Compilation flag for SIMD support. The visible conditions exclude the case where both
// operands are diagonal and require both operand types to be SIMD-enabled.
// NOTE(review): the expression ends with `&&` and listing numbers 306-307 are absent,
// so its remaining conditions and closing brace are not visible in this listing.
304  enum : bool { simdEnabled = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
305  MT1::simdEnabled && MT2::simdEnabled &&
308 
// Compilation flag for SMP assignment: true only if neither operand requires evaluation
// and both operand types are themselves SMP-assignable.
310  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
311  !evaluateRight && MT2::smpAssignable };
312  //**********************************************************************************************
313 
314  //**SIMD properties*****************************************************************************
// Number of elements per SIMD vector, taken from SIMDTrait<ElementType>::size.
316  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
317  //**********************************************************************************************
318 
319  //**Constructor*********************************************************************************
// Constructor: stores both operands of the multiplication expression.
//   lhs  left-hand side operand
//   rhs  right-hand side operand
// Matching inner dimensions (lhs.columns() == rhs.rows()) are only checked through
// BLAZE_INTERNAL_ASSERT; no exception is thrown here.
325  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
326  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
327  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
328  {
329  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
330  }
331  //**********************************************************************************************
332 
333  //**Access operator*****************************************************************************
// Element access: computes element (i,j) of the product.
//   i  row index, expected to be smaller than lhs_.rows()
//   j  column index, expected to be smaller than rhs_.columns()
// The indices are only checked through BLAZE_INTERNAL_ASSERT.
// Returns the product element by value (ReturnType).
340  inline ReturnType operator()( size_t i, size_t j ) const {
341  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
342  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
343 
// Diagonal operands reduce the inner product to a single multiplication.
344  if( IsDiagonal<MT1>::value ) {
345  return lhs_(i,i) * rhs_(i,j);
346  }
347  else if( IsDiagonal<MT2>::value ) {
348  return lhs_(i,j) * rhs_(j,j);
349  }
// NOTE(review): listing number 350 is absent; the condition opening the branch that is
// closed at listing line 373 is therefore not visible here.
// 'begin' and 'end' restrict the inner index range according to the upper/lower
// (strictly) triangular traits of both operands.
351  const size_t begin( ( IsUpper<MT1>::value )
352  ?( ( IsLower<MT2>::value )
353  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
354  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
355  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
356  :( ( IsLower<MT2>::value )
357  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
358  :( 0UL ) ) );
359  const size_t end( ( IsLower<MT1>::value )
360  ?( ( IsUpper<MT2>::value )
361  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
362  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
363  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
364  :( ( IsUpper<MT2>::value )
365  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
366  :( lhs_.columns() ) ) );
367 
// Empty range: the result is a default-constructed element.
368  if( begin >= end ) return ElementType();
369 
370  const size_t n( end - begin );
371 
// Multiply only the sub-ranges [begin,end) of row i and column j.
372  return subvector( row( lhs_, i ), begin, n ) * subvector( column( rhs_, j ), begin, n );
373  }
// General case: product of the complete row i of lhs_ and column j of rhs_.
374  else {
375  return row( lhs_, i ) * column( rhs_, j );
376  }
377  }
378  //**********************************************************************************************
379 
380  //**At function*********************************************************************************
// Checked element access.
//   i  row index
//   j  column index
// Raises an out-of-range error through BLAZE_THROW_OUT_OF_RANGE when i >= lhs_.rows()
// or j >= rhs_.columns(); otherwise forwards to operator()(i,j).
388  inline ReturnType at( size_t i, size_t j ) const {
389  if( i >= lhs_.rows() ) {
390  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
391  }
392  if( j >= rhs_.columns() ) {
393  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
394  }
395  return (*this)(i,j);
396  }
397  //**********************************************************************************************
398 
399  //**Rows function*******************************************************************************
// Returns the number of rows of the product, i.e. the row count of the left operand.
404  inline size_t rows() const noexcept {
405  return lhs_.rows();
406  }
407  //**********************************************************************************************
408 
409  //**Columns function****************************************************************************
// Returns the number of columns of the product, i.e. the column count of the right operand.
414  inline size_t columns() const noexcept {
415  return rhs_.columns();
416  }
417  //**********************************************************************************************
418 
419  //**Left operand access*************************************************************************
// Returns the stored left-hand side operand.
424  inline LeftOperand leftOperand() const noexcept {
425  return lhs_;
426  }
427  //**********************************************************************************************
428 
429  //**Right operand access************************************************************************
// Returns the stored right-hand side operand.
434  inline RightOperand rightOperand() const noexcept {
435  return rhs_;
436  }
437  //**********************************************************************************************
438 
439  //**********************************************************************************************
// Returns whether the expression can alias with the given address.
//   alias  address to check
// True if either operand reports isAliased( alias ); note that the operands'
// isAliased() (not canAlias()) is queried here.
445  template< typename T >
446  inline bool canAlias( const T* alias ) const noexcept {
447  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
448  }
449  //**********************************************************************************************
450 
451  //**********************************************************************************************
// Returns whether the expression is aliased with the given address.
//   alias  address to check
// True if either operand reports isAliased( alias ).
457  template< typename T >
458  inline bool isAliased( const T* alias ) const noexcept {
459  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
460  }
461  //**********************************************************************************************
462 
463  //**********************************************************************************************
// Returns true only if both operands report isAligned().
468  inline bool isAligned() const noexcept {
469  return lhs_.isAligned() && rhs_.isAligned();
470  }
471  //**********************************************************************************************
472 
473  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments.
// Visible conditions: either BLAZE_BLAS_IS_PARALLEL is not set or the result has fewer
// than TDMATDMATMULT_THRESHOLD elements, and the result has at least
// SMP_TDMATDMATMULT_THRESHOLD elements.
// NOTE(review): the expression ends with `&&` and listing number 482 is absent, so the
// final condition of the return statement is not visible in this listing.
478  inline bool canSMPAssign() const noexcept {
479  return ( !BLAZE_BLAS_IS_PARALLEL ||
480  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
481  ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
483  }
484  //**********************************************************************************************
485 
486  private:
487  //**Member variables****************************************************************************
// Operands of the multiplication expression, initialized by the constructor.
488  LeftOperand lhs_; // Left-hand side operand
489  RightOperand rhs_; // Right-hand side operand
490  //**********************************************************************************************
491 
492  //**Assignment to dense matrices****************************************************************
// Assignment of the multiplication expression to a dense matrix.
//   lhs  target dense matrix (any storage order SO)
//   rhs  multiplication expression to be assigned
// Steps: return immediately for a target without rows or columns; reset the target when
// the inner dimension is zero; otherwise evaluate both operands and dispatch to
// selectAssignKernel().
// NOTE(review): listing number 509 is absent, and the types LT and RT used below are not
// declared in the visible part of this listing.
505  template< typename MT // Type of the target dense matrix
506  , bool SO > // Storage order of the target dense matrix
507  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
508  {
510 
511  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
512  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
513 
// Nothing to compute for an empty target; an empty inner dimension yields a reset target.
514  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
515  return;
516  }
517  else if( rhs.lhs_.columns() == 0UL ) {
518  reset( ~lhs );
519  return;
520  }
521 
522  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
523  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
524 
525  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
526  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
527  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
528  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
529  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
530  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
531 
532  TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
533  }
535  //**********************************************************************************************
536 
537  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = A * B.
//   C  target matrix
//   A  left-hand side operand
//   B  right-hand side operand
// Calls selectSmallAssignKernel() if any of the listed conditions holds, otherwise
// selectBlasAssignKernel(). The visible conditions are: outside debug mode, a row-major
// target with B.columns() <= SIMDSIZE*10 or a column-major target with
// A.rows() <= SIMDSIZE*10; or a target with fewer than TDMATDMATMULT_THRESHOLD elements.
// NOTE(review): listing number 553 is absent, so the opening `if(` and its first
// condition are not visible in this listing.
548  template< typename MT3 // Type of the left-hand side target matrix
549  , typename MT4 // Type of the left-hand side matrix operand
550  , typename MT5 > // Type of the right-hand side matrix operand
551  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
552  {
554  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
555  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
556  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
557  selectSmallAssignKernel( C, A, B );
558  else
559  selectBlasAssignKernel( C, A, B );
560  }
562  //**********************************************************************************************
563 
564  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (scalar) assignment kernel C = A * B for a row-major target.
//   C  row-major target dense matrix
//   A  left-hand side operand
//   B  right-hand side operand
// The kernel walks over the rows of C. For each row the first contributing k initializes
// the row (plain assignment plus resets outside the computed range), the remaining k
// values accumulate into it. With SYM or HERM, the elements below the diagonal are
// copied (conjugated for HERM) from the ones above it at the end.
// NOTE(review): listing numbers 581, 609, 614, 643 and 648 are absent, so the return-type
// line and the inner conditions of the jbegin/jend ternaries are not visible here.
578  template< typename MT3 // Type of the left-hand side target matrix
579  , typename MT4 // Type of the left-hand side matrix operand
580  , typename MT5 > // Type of the right-hand side matrix operand
582  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
583  {
584  const size_t M( A.rows() );
585  const size_t N( B.columns() );
586  const size_t K( A.columns() );
587 
588  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
589 
590  for( size_t i=0UL; i<M; ++i )
591  {
// Range of inner indices k for row i, narrowed by the triangular structure of A.
592  const size_t kbegin( ( IsUpper<MT4>::value )
593  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
594  :( 0UL ) );
595  const size_t kend( ( IsLower<MT4>::value )
596  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
597  :( K ) );
598  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
599 
// Strictly triangular A with an empty k range: the whole row of C is reset.
600  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
601  for( size_t j=0UL; j<N; ++j ) {
602  reset( (~C)(i,j) );
603  }
604  continue;
605  }
606 
// First contribution (k == kbegin): assigns instead of accumulating and resets the
// elements of the row outside [jbegin,jend) where the structure demands it.
607  {
608  const size_t jbegin( ( IsUpper<MT5>::value )
610  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
611  :( UPP ? max(i,kbegin) : kbegin ) )
612  :( UPP ? i : 0UL ) );
613  const size_t jend( ( IsLower<MT5>::value )
615  ?( LOW ? min(i+1UL,kbegin) : kbegin )
616  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
617  :( LOW ? i+1UL : N ) );
618 
619  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
620  for( size_t j=0UL; j<jbegin; ++j ) {
621  reset( (~C)(i,j) );
622  }
623  }
624  else if( IsStrictlyUpper<MT5>::value ) {
625  reset( (~C)(i,0UL) );
626  }
627  for( size_t j=jbegin; j<jend; ++j ) {
628  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
629  }
630  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
631  for( size_t j=jend; j<N; ++j ) {
632  reset( (~C)(i,j) );
633  }
634  }
635  else if( IsStrictlyLower<MT5>::value ) {
636  reset( (~C)(i,N-1UL) );
637  }
638  }
639 
// Remaining contributions: accumulate A(i,k) * B(k,j) into the row.
640  for( size_t k=kbegin+1UL; k<kend; ++k )
641  {
642  const size_t jbegin( ( IsUpper<MT5>::value )
644  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
645  :( SYM || HERM || UPP ? max( i, k ) : k ) )
646  :( SYM || HERM || UPP ? i : 0UL ) );
647  const size_t jend( ( IsLower<MT5>::value )
649  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
650  :( LOW ? min(i+1UL,k) : k ) )
651  :( LOW ? i+1UL : N ) );
652 
653  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
654  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
655 
656  for( size_t j=jbegin; j<jend; ++j ) {
657  (~C)(i,j) += A(i,k) * B(k,j);
658  }
// For a lower B the element at column jend is assigned (not accumulated) for this k.
659  if( IsLower<MT5>::value ) {
660  (~C)(i,jend) = A(i,k) * B(k,jend);
661  }
662  }
663  }
664 
// Symmetric/Hermitian result: fill the elements below the diagonal from their
// counterparts above it (conjugated in the Hermitian case).
665  if( SYM || HERM ) {
666  for( size_t i=1UL; i<M; ++i ) {
667  for( size_t j=0UL; j<i; ++j ) {
668  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
669  }
670  }
671  }
672  }
674  //**********************************************************************************************
675 
676  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (scalar) assignment kernel C = A * B for a column-major target; enabled only
// when neither A nor B is diagonal.
//   C  column-major target dense matrix
//   A  left-hand side operand
//   B  right-hand side operand
// The kernel walks over the columns of C. For each column the first contributing k
// initializes the column (plain assignment plus resets outside the computed range), the
// remaining k values accumulate into it. With SYM or HERM, the elements above the
// diagonal are copied (conjugated for HERM) from the ones below it at the end.
// NOTE(review): listing numbers 721, 726, 755 and 760 are absent, so the inner conditions
// of the ibegin/iend ternaries are not visible here.
690  template< typename MT3 // Type of the left-hand side target matrix
691  , typename MT4 // Type of the left-hand side matrix operand
692  , typename MT5 > // Type of the right-hand side matrix operand
693  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
694  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
695  {
696  const size_t M( A.rows() );
697  const size_t N( B.columns() );
698  const size_t K( A.columns() );
699 
700  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
701 
702  for( size_t j=0UL; j<N; ++j )
703  {
// Range of inner indices k for column j, narrowed by the triangular structure of B.
704  const size_t kbegin( ( IsLower<MT5>::value )
705  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
706  :( 0UL ) );
707  const size_t kend( ( IsUpper<MT5>::value )
708  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
709  :( K ) );
710  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
711 
// Strictly triangular B with an empty k range: the whole column of C is reset.
712  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
713  for( size_t i=0UL; i<M; ++i ) {
714  reset( (~C)(i,j) );
715  }
716  continue;
717  }
718 
// First contribution (k == kbegin): assigns instead of accumulating and resets the
// elements of the column outside [ibegin,iend) where the structure demands it.
719  {
720  const size_t ibegin( ( IsLower<MT4>::value )
722  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
723  :( LOW ? max(j,kbegin) : kbegin ) )
724  :( LOW ? j : 0UL ) );
725  const size_t iend( ( IsUpper<MT4>::value )
727  ?( UPP ? min(j+1UL,kbegin) : kbegin )
728  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
729  :( UPP ? j+1UL : M ) );
730 
731  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
732  for( size_t i=0UL; i<ibegin; ++i ) {
733  reset( (~C)(i,j) );
734  }
735  }
736  else if( IsStrictlyLower<MT4>::value ) {
737  reset( (~C)(0UL,j) );
738  }
739  for( size_t i=ibegin; i<iend; ++i ) {
740  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
741  }
742  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
743  for( size_t i=iend; i<M; ++i ) {
744  reset( (~C)(i,j) );
745  }
746  }
747  else if( IsStrictlyUpper<MT4>::value ) {
748  reset( (~C)(M-1UL,j) );
749  }
750  }
751 
// Remaining contributions: accumulate A(i,k) * B(k,j) into the column.
752  for( size_t k=kbegin+1UL; k<kend; ++k )
753  {
754  const size_t ibegin( ( IsLower<MT4>::value )
756  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
757  :( SYM || HERM || LOW ? max( j, k ) : k ) )
758  :( SYM || HERM || LOW ? j : 0UL ) );
759  const size_t iend( ( IsUpper<MT4>::value )
761  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
762  :( UPP ? min(j+1UL,k) : k ) )
763  :( UPP ? j+1UL : M ) );
764 
765  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
766  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
767 
768  for( size_t i=ibegin; i<iend; ++i ) {
769  (~C)(i,j) += A(i,k) * B(k,j);
770  }
// For an upper A the element at row iend is assigned (not accumulated) for this k.
771  if( IsUpper<MT4>::value ) {
772  (~C)(iend,j) = A(iend,k) * B(k,j);
773  }
774  }
775  }
776 
// Symmetric/Hermitian result: fill the elements above the diagonal from their
// counterparts below it (conjugated in the Hermitian case).
777  if( SYM || HERM ) {
778  for( size_t j=1UL; j<N; ++j ) {
779  for( size_t i=0UL; i<j; ++i ) {
780  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
781  }
782  }
783  }
784  }
786  //**********************************************************************************************
787 
788  //**Default assignment to row-major dense matrices (general/diagonal)***************************
// Default assignment kernel C = A * B for a row-major target, a non-diagonal A and a
// diagonal B (selected via EnableIf_).
//   C  row-major target dense matrix
//   A  left-hand side operand (not diagonal)
//   B  right-hand side operand (diagonal)
// Since only the diagonal of B is read, each element is C(i,j) = A(i,j) * B(j,j). The
// target is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements.
802  template< typename MT3 // Type of the left-hand side target matrix
803  , typename MT4 // Type of the left-hand side matrix operand
804  , typename MT5 > // Type of the right-hand side matrix operand
805  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
806  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
807  {
808  constexpr size_t block( BLOCK_SIZE );
809 
810  const size_t M( A.rows() );
811  const size_t N( B.columns() );
812 
813  for( size_t ii=0UL; ii<M; ii+=block ) {
814  const size_t iend( min( M, ii+block ) );
815  for( size_t jj=0UL; jj<N; jj+=block ) {
816  const size_t jend( min( N, jj+block ) );
817  for( size_t i=ii; i<iend; ++i )
818  {
// Column range of row i inside the current tile that is actually computed, narrowed
// by the triangular structure of A.
819  const size_t jbegin( ( IsUpper<MT4>::value )
820  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
821  :( jj ) );
822  const size_t jpos( ( IsLower<MT4>::value )
823  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
824  :( jend ) );
825 
// Upper A: reset the elements in front of the computed range.
826  if( IsUpper<MT4>::value ) {
827  for( size_t j=jj; j<jbegin; ++j ) {
828  reset( (~C)(i,j) );
829  }
830  }
831  for( size_t j=jbegin; j<jpos; ++j ) {
832  (~C)(i,j) = A(i,j) * B(j,j);
833  }
// Lower A: reset the elements behind the computed range.
834  if( IsLower<MT4>::value ) {
835  for( size_t j=jpos; j<jend; ++j ) {
836  reset( (~C)(i,j) );
837  }
838  }
839  }
840  }
841  }
842  }
844  //**********************************************************************************************
845 
846  //**Default assignment to column-major dense matrices (general/diagonal)************************
// Default assignment kernel C = A * B for a column-major target, a non-diagonal A and a
// diagonal B (selected via EnableIf_).
//   C  column-major target dense matrix
//   A  left-hand side operand (not diagonal)
//   B  right-hand side operand (diagonal)
// Each column j of C is computed as C(i,j) = A(i,j) * B(j,j); rows outside the range
// given by the triangular structure of A are reset.
860  template< typename MT3 // Type of the left-hand side target matrix
861  , typename MT4 // Type of the left-hand side matrix operand
862  , typename MT5 > // Type of the right-hand side matrix operand
863  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
864  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
865  {
866  const size_t M( A.rows() );
867  const size_t N( B.columns() );
868 
869  for( size_t j=0UL; j<N; ++j )
870  {
// Row range of column j that is actually computed.
871  const size_t ibegin( ( IsLower<MT4>::value )
872  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
873  :( 0UL ) );
874  const size_t iend( ( IsUpper<MT4>::value )
875  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
876  :( M ) );
877  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
878 
// Lower A: reset the rows above the computed range.
879  if( IsLower<MT4>::value ) {
880  for( size_t i=0UL; i<ibegin; ++i ) {
881  reset( (~C)(i,j) );
882  }
883  }
884  for( size_t i=ibegin; i<iend; ++i ) {
885  (~C)(i,j) = A(i,j) * B(j,j);
886  }
// Upper A: reset the rows below the computed range.
887  if( IsUpper<MT4>::value ) {
888  for( size_t i=iend; i<M; ++i ) {
889  reset( (~C)(i,j) );
890  }
891  }
892  }
893  }
895  //**********************************************************************************************
896 
897  //**Default assignment to row-major dense matrices (diagonal/general)***************************
// Default assignment kernel C = A * B for a row-major target; the body reads only the
// diagonal of A, i.e. C(i,j) = A(i,i) * B(i,j).
//   C  row-major target dense matrix
//   A  left-hand side operand (only A(i,i) is accessed)
//   B  right-hand side operand
// Columns outside the range given by the triangular structure of B are reset.
// NOTE(review): listing number 914 is absent, so the return-type line (presumably the
// EnableIf_ condition selecting a diagonal A and non-diagonal B) is not visible here.
911  template< typename MT3 // Type of the left-hand side target matrix
912  , typename MT4 // Type of the left-hand side matrix operand
913  , typename MT5 > // Type of the right-hand side matrix operand
915  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
916  {
917  const size_t M( A.rows() );
918  const size_t N( B.columns() );
919 
920  for( size_t i=0UL; i<M; ++i )
921  {
// Column range of row i that is actually computed.
922  const size_t jbegin( ( IsUpper<MT5>::value )
923  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
924  :( 0UL ) );
925  const size_t jend( ( IsLower<MT5>::value )
926  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
927  :( N ) );
928  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
929 
// Upper B: reset the columns in front of the computed range.
930  if( IsUpper<MT5>::value ) {
931  for( size_t j=0UL; j<jbegin; ++j ) {
932  reset( (~C)(i,j) );
933  }
934  }
935  for( size_t j=jbegin; j<jend; ++j ) {
936  (~C)(i,j) = A(i,i) * B(i,j);
937  }
// Lower B: reset the columns behind the computed range.
938  if( IsLower<MT5>::value ) {
939  for( size_t j=jend; j<N; ++j ) {
940  reset( (~C)(i,j) );
941  }
942  }
943  }
944  }
946  //**********************************************************************************************
947 
948  //**Default assignment to column-major dense matrices (diagonal/general)************************
// Default assignment kernel C = A * B for a column-major target, a diagonal A and a
// non-diagonal B (selected via EnableIf_).
//   C  column-major target dense matrix
//   A  left-hand side operand (diagonal)
//   B  right-hand side operand (not diagonal)
// Since only the diagonal of A is read, each element is C(i,j) = A(i,i) * B(i,j). The
// target is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements.
962  template< typename MT3 // Type of the left-hand side target matrix
963  , typename MT4 // Type of the left-hand side matrix operand
964  , typename MT5 > // Type of the right-hand side matrix operand
965  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
966  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
967  {
968  constexpr size_t block( BLOCK_SIZE );
969 
970  const size_t M( A.rows() );
971  const size_t N( B.columns() );
972 
973  for( size_t jj=0UL; jj<N; jj+=block ) {
974  const size_t jend( min( N, jj+block ) );
975  for( size_t ii=0UL; ii<M; ii+=block ) {
976  const size_t iend( min( M, ii+block ) );
977  for( size_t j=jj; j<jend; ++j )
978  {
// Row range of column j inside the current tile that is actually computed, narrowed
// by the triangular structure of B.
979  const size_t ibegin( ( IsLower<MT5>::value )
980  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
981  :( ii ) );
982  const size_t ipos( ( IsUpper<MT5>::value )
983  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
984  :( iend ) );
985 
// Lower B: reset the rows above the computed range.
986  if( IsLower<MT5>::value ) {
987  for( size_t i=ii; i<ibegin; ++i ) {
988  reset( (~C)(i,j) );
989  }
990  }
991  for( size_t i=ibegin; i<ipos; ++i ) {
992  (~C)(i,j) = A(i,i) * B(i,j);
993  }
// Upper B: reset the rows below the computed range.
994  if( IsUpper<MT5>::value ) {
995  for( size_t i=ipos; i<iend; ++i ) {
996  reset( (~C)(i,j) );
997  }
998  }
999  }
1000  }
1001  }
1002  }
1004  //**********************************************************************************************
1005 
1006  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default assignment kernel C = A * B for two diagonal operands (selected via EnableIf_).
//   C  target matrix
//   A  left-hand side operand (diagonal)
//   B  right-hand side operand (diagonal)
// The target is reset first; afterwards only its diagonal is written, with
// C(i,i) = A(i,i) * B(i,i) for every row index of A.
1020  template< typename MT3 // Type of the left-hand side target matrix
1021  , typename MT4 // Type of the left-hand side matrix operand
1022  , typename MT5 > // Type of the right-hand side matrix operand
1023  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
1024  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
1025  {
1026  reset( C );
1027 
1028  for( size_t i=0UL; i<A.rows(); ++i ) {
1029  C(i,i) = A(i,i) * B(i,i);
1030  }
1031  }
1033  //**********************************************************************************************
1034 
1035  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel (default variant): forwards unchanged to
// selectDefaultAssignKernel().
//   C  target matrix
//   A  left-hand side operand
//   B  right-hand side operand
// NOTE(review): listing number 1052 is absent, so the return-type line (and any
// enabling condition it carries) is not visible in this listing.
1049  template< typename MT3 // Type of the left-hand side target matrix
1050  , typename MT4 // Type of the left-hand side matrix operand
1051  , typename MT5 > // Type of the right-hand side matrix operand
1053  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1054  {
1055  selectDefaultAssignKernel( ~C, A, B );
1056  }
1058  //**********************************************************************************************
1059 
1060  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
1075  template< typename MT3 // Type of the left-hand side target matrix
1076  , typename MT4 // Type of the left-hand side matrix operand
1077  , typename MT5 > // Type of the right-hand side matrix operand
1079  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1080  {
1081  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
1082 
1083  const size_t M( A.rows() );
1084  const size_t N( B.columns() );
1085  const size_t K( A.columns() );
1086 
1087  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1088 
1089  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1090  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1091 
1092  if( LOW && UPP && N > SIMDSIZE*3UL ) {
1093  reset( ~C );
1094  }
1095 
1096  {
1097  size_t j( 0UL );
1098 
1100  {
1101  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1102  for( size_t i=0UL; i<M; ++i )
1103  {
1104  const size_t kbegin( ( IsUpper<MT4>::value )
1105  ?( ( IsLower<MT5>::value )
1106  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1107  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1108  :( IsLower<MT5>::value ? j : 0UL ) );
1109  const size_t kend( ( IsLower<MT4>::value )
1110  ?( ( IsUpper<MT5>::value )
1111  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
1112  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1113  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
1114 
1115  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1116 
1117  for( size_t k=kbegin; k<kend; ++k ) {
1118  const SIMDType a1( set( A(i,k) ) );
1119  xmm1 += a1 * B.load(k,j );
1120  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1121  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1122  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1123  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1124  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
1125  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
1126  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
1127  }
1128 
1129  (~C).store( i, j , xmm1 );
1130  (~C).store( i, j+SIMDSIZE , xmm2 );
1131  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1132  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1133  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1134  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1135  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1136  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
1137  }
1138  }
1139  }
1140 
1141  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
1142  {
1143  size_t i( 0UL );
1144 
1145  for( ; (i+2UL) <= M; i+=2UL )
1146  {
1147  const size_t kbegin( ( IsUpper<MT4>::value )
1148  ?( ( IsLower<MT5>::value )
1149  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1150  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1151  :( IsLower<MT5>::value ? j : 0UL ) );
1152  const size_t kend( ( IsLower<MT4>::value )
1153  ?( ( IsUpper<MT5>::value )
1154  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
1155  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1156  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
1157 
1158  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1159 
1160  for( size_t k=kbegin; k<kend; ++k ) {
1161  const SIMDType a1( set( A(i ,k) ) );
1162  const SIMDType a2( set( A(i+1UL,k) ) );
1163  const SIMDType b1( B.load(k,j ) );
1164  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1165  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1166  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1167  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
1168  xmm1 += a1 * b1;
1169  xmm2 += a1 * b2;
1170  xmm3 += a1 * b3;
1171  xmm4 += a1 * b4;
1172  xmm5 += a1 * b5;
1173  xmm6 += a2 * b1;
1174  xmm7 += a2 * b2;
1175  xmm8 += a2 * b3;
1176  xmm9 += a2 * b4;
1177  xmm10 += a2 * b5;
1178  }
1179 
1180  (~C).store( i , j , xmm1 );
1181  (~C).store( i , j+SIMDSIZE , xmm2 );
1182  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1183  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1184  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
1185  (~C).store( i+1UL, j , xmm6 );
1186  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
1187  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
1188  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
1189  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
1190  }
1191 
1192  if( i < M )
1193  {
1194  const size_t kbegin( ( IsUpper<MT4>::value )
1195  ?( ( IsLower<MT5>::value )
1196  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1197  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1198  :( IsLower<MT5>::value ? j : 0UL ) );
1199  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
1200 
1201  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1202 
1203  for( size_t k=kbegin; k<kend; ++k ) {
1204  const SIMDType a1( set( A(i,k) ) );
1205  xmm1 += a1 * B.load(k,j );
1206  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1207  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1208  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1209  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1210  }
1211 
1212  (~C).store( i, j , xmm1 );
1213  (~C).store( i, j+SIMDSIZE , xmm2 );
1214  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1215  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1216  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1217  }
1218  }
1219 
1220  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1221  {
1222  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
1223  size_t i( LOW ? j : 0UL );
1224 
1225  for( ; (i+2UL) <= iend; i+=2UL )
1226  {
1227  const size_t kbegin( ( IsUpper<MT4>::value )
1228  ?( ( IsLower<MT5>::value )
1229  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1230  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1231  :( IsLower<MT5>::value ? j : 0UL ) );
1232  const size_t kend( ( IsLower<MT4>::value )
1233  ?( ( IsUpper<MT5>::value )
1234  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1235  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1236  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
1237 
1238  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1239 
1240  for( size_t k=kbegin; k<kend; ++k ) {
1241  const SIMDType a1( set( A(i ,k) ) );
1242  const SIMDType a2( set( A(i+1UL,k) ) );
1243  const SIMDType b1( B.load(k,j ) );
1244  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1245  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1246  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1247  xmm1 += a1 * b1;
1248  xmm2 += a1 * b2;
1249  xmm3 += a1 * b3;
1250  xmm4 += a1 * b4;
1251  xmm5 += a2 * b1;
1252  xmm6 += a2 * b2;
1253  xmm7 += a2 * b3;
1254  xmm8 += a2 * b4;
1255  }
1256 
1257  (~C).store( i , j , xmm1 );
1258  (~C).store( i , j+SIMDSIZE , xmm2 );
1259  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1260  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1261  (~C).store( i+1UL, j , xmm5 );
1262  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1263  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1264  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1265  }
1266 
1267  if( i < iend )
1268  {
1269  const size_t kbegin( ( IsUpper<MT4>::value )
1270  ?( ( IsLower<MT5>::value )
1271  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1272  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1273  :( IsLower<MT5>::value ? j : 0UL ) );
1274  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1275 
1276  SIMDType xmm1, xmm2, xmm3, xmm4;
1277 
1278  for( size_t k=kbegin; k<kend; ++k ) {
1279  const SIMDType a1( set( A(i,k) ) );
1280  xmm1 += a1 * B.load(k,j );
1281  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1282  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1283  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1284  }
1285 
1286  (~C).store( i, j , xmm1 );
1287  (~C).store( i, j+SIMDSIZE , xmm2 );
1288  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1289  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1290  }
1291  }
1292 
1293  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1294  {
1295  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
1296  size_t i( LOW ? j : 0UL );
1297 
1298  for( ; (i+2UL) <= iend; i+=2UL )
1299  {
1300  const size_t kbegin( ( IsUpper<MT4>::value )
1301  ?( ( IsLower<MT5>::value )
1302  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1303  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1304  :( IsLower<MT5>::value ? j : 0UL ) );
1305  const size_t kend( ( IsLower<MT4>::value )
1306  ?( ( IsUpper<MT5>::value )
1307  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
1308  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1309  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
1310 
1311  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1312 
1313  for( size_t k=kbegin; k<kend; ++k ) {
1314  const SIMDType a1( set( A(i ,k) ) );
1315  const SIMDType a2( set( A(i+1UL,k) ) );
1316  const SIMDType b1( B.load(k,j ) );
1317  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1318  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1319  xmm1 += a1 * b1;
1320  xmm2 += a1 * b2;
1321  xmm3 += a1 * b3;
1322  xmm4 += a2 * b1;
1323  xmm5 += a2 * b2;
1324  xmm6 += a2 * b3;
1325  }
1326 
1327  (~C).store( i , j , xmm1 );
1328  (~C).store( i , j+SIMDSIZE , xmm2 );
1329  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1330  (~C).store( i+1UL, j , xmm4 );
1331  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
1332  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1333  }
1334 
1335  if( i < iend )
1336  {
1337  const size_t kbegin( ( IsUpper<MT4>::value )
1338  ?( ( IsLower<MT5>::value )
1339  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1340  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1341  :( IsLower<MT5>::value ? j : 0UL ) );
1342  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
1343 
1344  SIMDType xmm1, xmm2, xmm3;
1345 
1346  for( size_t k=kbegin; k<kend; ++k ) {
1347  const SIMDType a1( set( A(i,k) ) );
1348  xmm1 += a1 * B.load(k,j );
1349  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1350  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1351  }
1352 
1353  (~C).store( i, j , xmm1 );
1354  (~C).store( i, j+SIMDSIZE , xmm2 );
1355  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1356  }
1357  }
1358 
1359  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1360  {
1361  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
1362  size_t i( LOW ? j : 0UL );
1363 
1364  for( ; (i+2UL) <= iend; i+=2UL )
1365  {
1366  const size_t kbegin( ( IsUpper<MT4>::value )
1367  ?( ( IsLower<MT5>::value )
1368  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1369  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1370  :( IsLower<MT5>::value ? j : 0UL ) );
1371  const size_t kend( ( IsLower<MT4>::value )
1372  ?( ( IsUpper<MT5>::value )
1373  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1374  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1375  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1376 
1377  SIMDType xmm1, xmm2, xmm3, xmm4;
1378 
1379  for( size_t k=kbegin; k<kend; ++k ) {
1380  const SIMDType a1( set( A(i ,k) ) );
1381  const SIMDType a2( set( A(i+1UL,k) ) );
1382  const SIMDType b1( B.load(k,j ) );
1383  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1384  xmm1 += a1 * b1;
1385  xmm2 += a1 * b2;
1386  xmm3 += a2 * b1;
1387  xmm4 += a2 * b2;
1388  }
1389 
1390  (~C).store( i , j , xmm1 );
1391  (~C).store( i , j+SIMDSIZE, xmm2 );
1392  (~C).store( i+1UL, j , xmm3 );
1393  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1394  }
1395 
1396  if( i < iend )
1397  {
1398  const size_t kbegin( ( IsUpper<MT4>::value )
1399  ?( ( IsLower<MT5>::value )
1400  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1401  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1402  :( IsLower<MT5>::value ? j : 0UL ) );
1403  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1404 
1405  SIMDType xmm1, xmm2;
1406 
1407  for( size_t k=kbegin; k<kend; ++k ) {
1408  const SIMDType a1( set( A(i,k) ) );
1409  xmm1 += a1 * B.load(k,j );
1410  xmm2 += a1 * B.load(k,j+SIMDSIZE);
1411  }
1412 
1413  (~C).store( i, j , xmm1 );
1414  (~C).store( i, j+SIMDSIZE, xmm2 );
1415  }
1416  }
1417 
1418  for( ; j<jpos; j+=SIMDSIZE )
1419  {
1420  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
1421  size_t i( LOW ? j : 0UL );
1422 
1423  for( ; (i+2UL) <= iend; i+=2UL )
1424  {
1425  const size_t kbegin( ( IsUpper<MT4>::value )
1426  ?( ( IsLower<MT5>::value )
1427  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1428  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1429  :( IsLower<MT5>::value ? j : 0UL ) );
1430  const size_t kend( ( IsLower<MT4>::value )
1431  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1432  :( K ) );
1433 
1434  SIMDType xmm1, xmm2;
1435 
1436  for( size_t k=kbegin; k<kend; ++k ) {
1437  const SIMDType b1( B.load(k,j) );
1438  xmm1 += set( A(i ,k) ) * b1;
1439  xmm2 += set( A(i+1UL,k) ) * b1;
1440  }
1441 
1442  (~C).store( i , j, xmm1 );
1443  (~C).store( i+1UL, j, xmm2 );
1444  }
1445 
1446  if( i < iend )
1447  {
1448  const size_t kbegin( ( IsUpper<MT4>::value )
1449  ?( ( IsLower<MT5>::value )
1450  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1451  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1452  :( IsLower<MT5>::value ? j : 0UL ) );
1453 
1454  SIMDType xmm1;
1455 
1456  for( size_t k=kbegin; k<K; ++k ) {
1457  xmm1 += set( A(i,k) ) * B.load(k,j);
1458  }
1459 
1460  (~C).store( i, j, xmm1 );
1461  }
1462  }
1463 
1464  for( ; remainder && j<N; ++j )
1465  {
1466  size_t i( LOW && UPP ? j : 0UL );
1467 
1468  for( ; (i+2UL) <= M; i+=2UL )
1469  {
1470  const size_t kbegin( ( IsUpper<MT4>::value )
1471  ?( ( IsLower<MT5>::value )
1472  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1473  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1474  :( IsLower<MT5>::value ? j : 0UL ) );
1475  const size_t kend( ( IsLower<MT4>::value )
1476  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1477  :( K ) );
1478 
1479  ElementType value1 = ElementType();
1480  ElementType value2 = ElementType();
1481 
1482  for( size_t k=kbegin; k<kend; ++k ) {
1483  value1 += A(i ,k) * B(k,j);
1484  value2 += A(i+1UL,k) * B(k,j);
1485  }
1486 
1487  (~C)(i ,j) = value1;
1488  (~C)(i+1UL,j) = value2;
1489  }
1490 
1491  if( i < M )
1492  {
1493  const size_t kbegin( ( IsUpper<MT4>::value )
1494  ?( ( IsLower<MT5>::value )
1495  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1496  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1497  :( IsLower<MT5>::value ? j : 0UL ) );
1498 
1499  ElementType value = ElementType();
1500 
1501  for( size_t k=kbegin; k<K; ++k ) {
1502  value += A(i,k) * B(k,j);
1503  }
1504 
1505  (~C)(i,j) = value;
1506  }
1507  }
1508  }
1509 
1510  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1511  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1512  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1513  for( size_t j=0UL; j<jend; ++j ) {
1514  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
1515  }
1516  }
1517  }
1518  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1519  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1520  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1521  for( size_t i=0UL; i<iend; ++i ) {
1522  reset( (~C)(i,j) );
1523  }
1524  }
1525  }
1526  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1527  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1528  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1529  for( size_t j=0UL; j<jend; ++j ) {
1530  reset( (~C)(i,j) );
1531  }
1532  }
1533  }
1534  }
1536  //**********************************************************************************************
1537 
1538  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized small-matrix assignment kernel for a column-major target matrix.
//
// Each SIMD accumulator holds SIMDSIZE consecutive rows of one column of C. It is built
// from A.load(i,k) multiplied by the broadcast scalar set( B(k,j) ) and written back with
// (~C).store(). Rows are processed in panels of 8, 5, 4, 3, 2 and 1 SIMD vectors, followed
// by a scalar loop over the rows beyond ipos and a final fix-up pass that depends on the
// SYM/HERM/LOW/UPP flags.
//
// \param C The target left-hand side dense matrix (written via store() and operator()).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// NOTE(review): SYM, HERM, LOW, UPP, SIMDSIZE, SIMDType and ElementType are declared
// outside this block; their exact meaning should be confirmed against the class definition.
1553  template< typename MT3 // Type of the left-hand side target matrix
1554  , typename MT4 // Type of the left-hand side matrix operand
1555  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): listing line 1556 (presumably the return type / enabling condition) is not
// shown in this listing.
1557  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1558  {
// Scalar handling of leftover rows is required unless both C and A are padded.
1559  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1560 
1561  const size_t M( A.rows() );
1562  const size_t N( B.columns() );
1563  const size_t K( A.columns() );
1564 
1565  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1566 
// ipos is the end of the row range handled with full SIMD vectors. The mask form must equal
// M - ( M % SIMDSIZE ), as the assertion checks (assumes SIMDSIZE is a power of two - TODO confirm).
1567  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
1568  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1569 
// With both LOW and UPP set and more than three SIMD vectors of rows, C is cleared up front.
1570  if( LOW && UPP && M > SIMDSIZE*3UL ) {
1571  reset( ~C );
1572  }
1573 
1574  {
1575  size_t i( 0UL );
1576 
// NOTE(review): listing line 1577 (presumably the statement guarding this brace block) is
// not shown in this listing.
1578  {
// Panels of 8 SIMD vectors x 1 column; only taken when none of SYM/HERM/LOW/UPP is set.
1579  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1580  for( size_t j=0UL; j<N; ++j )
1581  {
// kbegin/kend narrow the k range according to the IsLower/IsUpper/IsStrictly* traits of the
// two operands. The same pattern, adapted to the panel size, repeats in every panel below.
1582  const size_t kbegin( ( IsLower<MT5>::value )
1583  ?( ( IsUpper<MT4>::value )
1584  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1585  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1586  :( IsUpper<MT4>::value ? i : 0UL ) );
1587  const size_t kend( ( IsUpper<MT5>::value )
1588  ?( ( IsLower<MT4>::value )
1589  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1590  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1591  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
1592 
// NOTE(review): the accumulators have no explicit initializer; this relies on SIMDType
// default construction yielding zero - confirm against the SIMD type definitions.
1593  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1594 
1595  for( size_t k=kbegin; k<kend; ++k ) {
1596  const SIMDType b1( set( B(k,j) ) );
1597  xmm1 += A.load(i ,k) * b1;
1598  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1599  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1600  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1601  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1602  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
1603  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
1604  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
1605  }
1606 
1607  (~C).store( i , j, xmm1 );
1608  (~C).store( i+SIMDSIZE , j, xmm2 );
1609  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1610  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1611  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1612  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1613  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1614  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
1615  }
1616  }
1617  }
1618 
// Panels of 5 SIMD vectors x 2 columns; only taken when none of SYM/HERM/LOW/UPP is set.
1619  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
1620  {
1621  size_t j( 0UL );
1622 
1623  for( ; (j+2UL) <= N; j+=2UL )
1624  {
1625  const size_t kbegin( ( IsLower<MT5>::value )
1626  ?( ( IsUpper<MT4>::value )
1627  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1628  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1629  :( IsUpper<MT4>::value ? i : 0UL ) );
1630  const size_t kend( ( IsUpper<MT5>::value )
1631  ?( ( IsLower<MT4>::value )
1632  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1633  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1634  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
1635 
1636  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1637 
1638  for( size_t k=kbegin; k<kend; ++k ) {
1639  const SIMDType a1( A.load(i ,k) );
1640  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1641  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1642  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1643  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1644  const SIMDType b1( set( B(k,j ) ) );
1645  const SIMDType b2( set( B(k,j+1UL) ) );
1646  xmm1 += a1 * b1;
1647  xmm2 += a2 * b1;
1648  xmm3 += a3 * b1;
1649  xmm4 += a4 * b1;
1650  xmm5 += a5 * b1;
1651  xmm6 += a1 * b2;
1652  xmm7 += a2 * b2;
1653  xmm8 += a3 * b2;
1654  xmm9 += a4 * b2;
1655  xmm10 += a5 * b2;
1656  }
1657 
1658  (~C).store( i , j , xmm1 );
1659  (~C).store( i+SIMDSIZE , j , xmm2 );
1660  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1661  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1662  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1663  (~C).store( i , j+1UL, xmm6 );
1664  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1665  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1666  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1667  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1668  }
1669 
// Single remaining column when the two-column loop above stops one short of N.
1670  if( j < N )
1671  {
1672  const size_t kbegin( ( IsLower<MT5>::value )
1673  ?( ( IsUpper<MT4>::value )
1674  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1675  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1676  :( IsUpper<MT4>::value ? i : 0UL ) );
1677  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
1678 
1679  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1680 
1681  for( size_t k=kbegin; k<kend; ++k ) {
1682  const SIMDType b1( set( B(k,j) ) );
1683  xmm1 += A.load(i ,k) * b1;
1684  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1685  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1686  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1687  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1688  }
1689 
1690  (~C).store( i , j, xmm1 );
1691  (~C).store( i+SIMDSIZE , j, xmm2 );
1692  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1693  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1694  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1695  }
1696  }
1697 
// Panels of 4 SIMD vectors x 2 columns; skipped when both LOW and UPP are set.
1698  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1699  {
// SYM/HERM/LOW: columns stop at min(i+SIMDSIZE*4UL,N); UPP: columns start at i.
1700  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
1701  size_t j( UPP ? i : 0UL );
1702 
1703  for( ; (j+2UL) <= jend; j+=2UL )
1704  {
1705  const size_t kbegin( ( IsLower<MT5>::value )
1706  ?( ( IsUpper<MT4>::value )
1707  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1708  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1709  :( IsUpper<MT4>::value ? i : 0UL ) );
1710  const size_t kend( ( IsUpper<MT5>::value )
1711  ?( ( IsLower<MT4>::value )
1712  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1713  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1714  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
1715 
1716  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1717 
1718  for( size_t k=kbegin; k<kend; ++k ) {
1719  const SIMDType a1( A.load(i ,k) );
1720  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1721  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1722  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1723  const SIMDType b1( set( B(k,j ) ) );
1724  const SIMDType b2( set( B(k,j+1UL) ) );
1725  xmm1 += a1 * b1;
1726  xmm2 += a2 * b1;
1727  xmm3 += a3 * b1;
1728  xmm4 += a4 * b1;
1729  xmm5 += a1 * b2;
1730  xmm6 += a2 * b2;
1731  xmm7 += a3 * b2;
1732  xmm8 += a4 * b2;
1733  }
1734 
1735  (~C).store( i , j , xmm1 );
1736  (~C).store( i+SIMDSIZE , j , xmm2 );
1737  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1738  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1739  (~C).store( i , j+1UL, xmm5 );
1740  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1741  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1742  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1743  }
1744 
// Single remaining column of this panel.
1745  if( j < jend )
1746  {
1747  const size_t kbegin( ( IsLower<MT5>::value )
1748  ?( ( IsUpper<MT4>::value )
1749  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1750  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1751  :( IsUpper<MT4>::value ? i : 0UL ) );
1752  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
1753 
1754  SIMDType xmm1, xmm2, xmm3, xmm4;
1755 
1756  for( size_t k=kbegin; k<kend; ++k ) {
1757  const SIMDType b1( set( B(k,j) ) );
1758  xmm1 += A.load(i ,k) * b1;
1759  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1760  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1761  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1762  }
1763 
1764  (~C).store( i , j, xmm1 );
1765  (~C).store( i+SIMDSIZE , j, xmm2 );
1766  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1767  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1768  }
1769  }
1770 
// Panels of 3 SIMD vectors x 2 columns.
1771  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1772  {
1773  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
1774  size_t j( UPP ? i : 0UL );
1775 
1776  for( ; (j+2UL) <= jend; j+=2UL )
1777  {
1778  const size_t kbegin( ( IsLower<MT5>::value )
1779  ?( ( IsUpper<MT4>::value )
1780  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1781  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1782  :( IsUpper<MT4>::value ? i : 0UL ) );
1783  const size_t kend( ( IsUpper<MT5>::value )
1784  ?( ( IsLower<MT4>::value )
1785  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1786  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1787  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
1788 
1789  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1790 
1791  for( size_t k=kbegin; k<kend; ++k ) {
1792  const SIMDType a1( A.load(i ,k) );
1793  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1794  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1795  const SIMDType b1( set( B(k,j ) ) );
1796  const SIMDType b2( set( B(k,j+1UL) ) );
1797  xmm1 += a1 * b1;
1798  xmm2 += a2 * b1;
1799  xmm3 += a3 * b1;
1800  xmm4 += a1 * b2;
1801  xmm5 += a2 * b2;
1802  xmm6 += a3 * b2;
1803  }
1804 
1805  (~C).store( i , j , xmm1 );
1806  (~C).store( i+SIMDSIZE , j , xmm2 );
1807  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1808  (~C).store( i , j+1UL, xmm4 );
1809  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1810  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1811  }
1812 
// Single remaining column of this panel.
1813  if( j < jend )
1814  {
1815  const size_t kbegin( ( IsLower<MT5>::value )
1816  ?( ( IsUpper<MT4>::value )
1817  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1818  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1819  :( IsUpper<MT4>::value ? i : 0UL ) );
1820  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
1821 
1822  SIMDType xmm1, xmm2, xmm3;
1823 
1824  for( size_t k=kbegin; k<kend; ++k ) {
1825  const SIMDType b1( set( B(k,j) ) );
1826  xmm1 += A.load(i ,k) * b1;
1827  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1828  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1829  }
1830 
1831  (~C).store( i , j, xmm1 );
1832  (~C).store( i+SIMDSIZE , j, xmm2 );
1833  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1834  }
1835  }
1836 
// Panels of 2 SIMD vectors x 2 columns.
1837  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1838  {
1839  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
1840  size_t j( UPP ? i : 0UL );
1841 
1842  for( ; (j+2UL) <= jend; j+=2UL )
1843  {
1844  const size_t kbegin( ( IsLower<MT5>::value )
1845  ?( ( IsUpper<MT4>::value )
1846  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1847  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1848  :( IsUpper<MT4>::value ? i : 0UL ) );
1849  const size_t kend( ( IsUpper<MT5>::value )
1850  ?( ( IsLower<MT4>::value )
1851  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1852  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1853  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
1854 
1855  SIMDType xmm1, xmm2, xmm3, xmm4;
1856 
1857  for( size_t k=kbegin; k<kend; ++k ) {
1858  const SIMDType a1( A.load(i ,k) );
1859  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1860  const SIMDType b1( set( B(k,j ) ) );
1861  const SIMDType b2( set( B(k,j+1UL) ) );
1862  xmm1 += a1 * b1;
1863  xmm2 += a2 * b1;
1864  xmm3 += a1 * b2;
1865  xmm4 += a2 * b2;
1866  }
1867 
1868  (~C).store( i , j , xmm1 );
1869  (~C).store( i+SIMDSIZE, j , xmm2 );
1870  (~C).store( i , j+1UL, xmm3 );
1871  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1872  }
1873 
// Single remaining column of this panel.
1874  if( j < jend )
1875  {
1876  const size_t kbegin( ( IsLower<MT5>::value )
1877  ?( ( IsUpper<MT4>::value )
1878  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1879  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1880  :( IsUpper<MT4>::value ? i : 0UL ) );
1881  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
1882 
1883  SIMDType xmm1, xmm2;
1884 
1885  for( size_t k=kbegin; k<kend; ++k ) {
1886  const SIMDType b1( set( B(k,j) ) );
1887  xmm1 += A.load(i ,k) * b1;
1888  xmm2 += A.load(i+SIMDSIZE,k) * b1;
1889  }
1890 
1891  (~C).store( i , j, xmm1 );
1892  (~C).store( i+SIMDSIZE, j, xmm2 );
1893  }
1894  }
1895 
// Panels of 1 SIMD vector x 2 columns; covers whatever is left below ipos.
1896  for( ; i<ipos; i+=SIMDSIZE )
1897  {
1898  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
1899  size_t j( UPP ? i : 0UL );
1900 
1901  for( ; (j+2UL) <= jend; j+=2UL )
1902  {
1903  const size_t kbegin( ( IsLower<MT5>::value )
1904  ?( ( IsUpper<MT4>::value )
1905  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1906  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1907  :( IsUpper<MT4>::value ? i : 0UL ) );
1908  const size_t kend( ( IsUpper<MT5>::value )
1909  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1910  :( K ) );
1911 
1912  SIMDType xmm1, xmm2;
1913 
1914  for( size_t k=kbegin; k<kend; ++k ) {
1915  const SIMDType a1( A.load(i,k) );
1916  xmm1 += a1 * set( B(k,j ) );
1917  xmm2 += a1 * set( B(k,j+1UL) );
1918  }
1919 
1920  (~C).store( i, j , xmm1 );
1921  (~C).store( i, j+1UL, xmm2 );
1922  }
1923 
// Single remaining column of this panel; note that k runs up to K here (no kend bound).
1924  if( j < jend )
1925  {
1926  const size_t kbegin( ( IsLower<MT5>::value )
1927  ?( ( IsUpper<MT4>::value )
1928  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1929  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1930  :( IsUpper<MT4>::value ? i : 0UL ) );
1931 
1932  SIMDType xmm1;
1933 
1934  for( size_t k=kbegin; k<K; ++k ) {
1935  xmm1 += A.load(i,k) * set( B(k,j) );
1936  }
1937 
1938  (~C).store( i, j, xmm1 );
1939  }
1940  }
1941 
// Scalar loop over the rows in [ipos, M); only entered when remainder is true.
1942  for( ; remainder && i<M; ++i )
1943  {
// Columns start at i only when both LOW and UPP are set, otherwise at 0.
1944  size_t j( LOW && UPP ? i : 0UL );
1945 
1946  for( ; (j+2UL) <= N; j+=2UL )
1947  {
1948  const size_t kbegin( ( IsLower<MT5>::value )
1949  ?( ( IsUpper<MT4>::value )
1950  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1951  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1952  :( IsUpper<MT4>::value ? i : 0UL ) );
1953  const size_t kend( ( IsUpper<MT5>::value )
1954  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1955  :( K ) );
1956 
1957  ElementType value1 = ElementType();
1958  ElementType value2 = ElementType();
1959 
1960  for( size_t k=kbegin; k<kend; ++k ) {
1961  value1 += A(i,k) * B(k,j );
1962  value2 += A(i,k) * B(k,j+1UL);
1963  }
1964 
1965  (~C)(i,j ) = value1;
1966  (~C)(i,j+1UL) = value2;
1967  }
1968 
// Single remaining column of this row.
1969  if( j < N )
1970  {
1971  const size_t kbegin( ( IsLower<MT5>::value )
1972  ?( ( IsUpper<MT4>::value )
1973  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1974  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1975  :( IsUpper<MT4>::value ? i : 0UL ) );
1976 
1977  ElementType value = ElementType();
1978 
1979  for( size_t k=kbegin; k<K; ++k ) {
1980  value += A(i,k) * B(k,j);
1981  }
1982 
1983  (~C)(i,j) = value;
1984  }
1985  }
1986  }
1987 
// Fix-up pass. iend/jend below are j (resp. i) rounded down to a multiple of SIMDSIZE*4UL.
// SYM/HERM: fill C(i,j) for i < iend from the mirrored element C(j,i) (conjugated for HERM).
1988  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
1989  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1990  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1991  for( size_t i=0UL; i<iend; ++i ) {
1992  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
1993  }
1994  }
1995  }
// LOW only: reset the elements C(i,j) with i < iend.
1996  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
1997  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1998  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1999  for( size_t i=0UL; i<iend; ++i ) {
2000  reset( (~C)(i,j) );
2001  }
2002  }
2003  }
// UPP only: reset the elements C(i,j) with j < jend.
2004  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
2005  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
2006  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
2007  for( size_t j=0UL; j<jend; ++j ) {
2008  reset( (~C)(i,j) );
2009  }
2010  }
2011  }
2012  }
2014  //**********************************************************************************************
2015 
2016  //**Default assignment to dense matrices (large matrices)***************************************
// Default large-matrix assignment kernel: performs no work of its own and forwards the
// target and both operands unchanged to selectDefaultAssignKernel().
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
2030  template< typename MT3 // Type of the left-hand side target matrix
2031  , typename MT4 // Type of the left-hand side matrix operand
2032  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): listing line 2033 (presumably the return type / enabling condition that
// separates this overload from the vectorized one) is not shown in this listing.
2034  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2035  {
2036  selectDefaultAssignKernel( C, A, B );
2037  }
2039  //**********************************************************************************************
2040 
2041  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Vectorized large-matrix assignment kernel: selects one of the smmm/hmmm/lmmm/ummm/mmm
// routines based on the SYM, HERM, LOW and UPP flags, tested in that order, so SYM wins
// over HERM, HERM over LOW and LOW over UPP when several are set.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// NOTE(review): ElementType(1) / ElementType(0) are presumably the scaling factors for the
// product and for the previous content of C - confirm against the MMM kernel declarations.
2056  template< typename MT3 // Type of the left-hand side target matrix
2057  , typename MT4 // Type of the left-hand side matrix operand
2058  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): listing line 2059 (presumably the return type / enabling condition) is not
// shown in this listing.
2060  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2061  {
2062  if( SYM )
2063  smmm( C, A, B, ElementType(1) );
2064  else if( HERM )
2065  hmmm( C, A, B, ElementType(1) );
2066  else if( LOW )
2067  lmmm( C, A, B, ElementType(1), ElementType(0) );
2068  else if( UPP )
2069  ummm( C, A, B, ElementType(1), ElementType(0) );
2070  else
2071  mmm( C, A, B, ElementType(1), ElementType(0) );
2072  }
2074  //**********************************************************************************************
2075 
2076  //**BLAS-based assignment to dense matrices (default)*******************************************
// Default BLAS assignment kernel: does not call any BLAS routine itself and forwards the
// target and both operands unchanged to selectLargeAssignKernel().
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
2090  template< typename MT3 // Type of the left-hand side target matrix
2091  , typename MT4 // Type of the left-hand side matrix operand
2092  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): listing line 2093 (presumably the return type / enabling condition that
// separates this overload from the BLAS-backed one) is not shown in this listing.
2094  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2095  {
2096  selectLargeAssignKernel( C, A, B );
2097  }
2099  //**********************************************************************************************
2100 
2101  //**BLAS-based assignment to dense matrices*****************************************************
// This overload is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to true.
2102 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2103 
// BLAS-based assignment kernel.
//  - If A is triangular: C is first assigned B, then trmm() is called with A on the left
//    (CblasLeft), choosing CblasLower or CblasUpper from IsLower<MT4>.
//  - Else if B is triangular: C is first assigned A, then trmm() is called with B on the
//    right (CblasRight), choosing CblasLower or CblasUpper from IsLower<MT5>.
//  - Otherwise gemm() is called with the factors ET(1) and ET(0).
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
2116  template< typename MT3 // Type of the left-hand side target matrix
2117  , typename MT4 // Type of the left-hand side matrix operand
2118  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): listing line 2119 (presumably the return type / enabling condition) is not
// shown in this listing.
2120  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2121  {
// ET is the element type of the target matrix; it is used for the scalar factors below.
2122  typedef ElementType_<MT3> ET;
2123 
2124  if( IsTriangular<MT4>::value ) {
2125  assign( C, B );
2126  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2127  }
2128  else if( IsTriangular<MT5>::value ) {
2129  assign( C, A );
2130  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2131  }
2132  else {
2133  gemm( C, A, B, ET(1), ET(0) );
2134  }
2135  }
2137 #endif
2138  //**********************************************************************************************
2139 
2140  //**Assignment to sparse matrices***************************************************************
// Assignment of the multiplication expression to a sparse matrix: the expression is first
// evaluated serially into a temporary of type TmpType, and the temporary (passed through
// the ForwardFunctor instance) is then assigned to the sparse target.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// NOTE(review): listing lines 2157-2166 are not shown here, so the definition of TmpType
// and anything else on those lines is not visible in this listing; ForwardFunctor is also
// declared outside this block.
2153  template< typename MT // Type of the target sparse matrix
2154  , bool SO > // Storage order of the target sparse matrix
2155  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2156  {
2158 
2160 
2167 
2168  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2169  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2170 
2171  const ForwardFunctor fwd;
2172 
// serial() is applied to the whole expression before it is materialized in tmp.
2173  const TmpType tmp( serial( rhs ) );
2174  assign( ~lhs, fwd( tmp ) );
2175  }
2177  //**********************************************************************************************
2178 
2179  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the multiplication expression to a dense matrix.
//
// Returns immediately when the target has no rows or columns or when the left operand has
// no columns. Otherwise both operands are evaluated serially into A and B, their sizes are
// checked with internal assertions, and the work is handed to selectAddAssignKernel().
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be added.
//
// NOTE(review): listing line 2196 is not shown in this listing.
2192  template< typename MT // Type of the target dense matrix
2193  , bool SO > // Storage order of the target dense matrix
2194  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2195  {
2197 
2198  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2199  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2200 
// Nothing to add for an empty target or an empty inner dimension.
2201  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2202  return;
2203  }
2204 
2205  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2206  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2207 
2208  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2209  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2210  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2211  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2212  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2213  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2214 
2215  TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
2216  }
2218  //**********************************************************************************************
2219 
2220  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment C += A*B: dispatches either to
// selectSmallAddAssignKernel() or to selectBlasAddAssignKernel().
// The visible conditions that select the small kernel are:
//  - non-debug build, row-major target and B.columns() <= SIMDSIZE*10
//  - non-debug build, column-major target and A.rows() <= SIMDSIZE*10
//  - C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD
// NOTE(review): the embedded listing number 2236 is missing; the line that opens the
// 'if(' and holds the first operand of the '||' chain is not visible here.
2231  template< typename MT3 // Type of the left-hand side target matrix
2232  , typename MT4 // Type of the left-hand side matrix operand
2233  , typename MT5 > // Type of the right-hand side matrix operand
2234  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2235  {
2237  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
2238  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
2239  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2240  selectSmallAddAssignKernel( C, A, B );
2241  else
2242  selectBlasAddAssignKernel( C, A, B );
2243  }
2245  //**********************************************************************************************
2246 
2247  //**Default addition assignment to row-major dense matrices (general/general)*******************
// Default (scalar) addition assignment C += A*B for a row-major target when neither
// operand is diagonal. Iterates over the rows i of C, then over the inner index k,
// and adds A(i,k) * B(k,j) to C(i,j) for the admissible columns j.
// LOW and UPP are declared outside this block; presumably they flag a lower/upper
// triangular result - confirm against the class definition.
// NOTE(review): the embedded listing numbers 2286 and 2291 are missing; one line of
// each nested conditional for 'jbegin' and 'jend' is not visible here.
2261  template< typename MT3 // Type of the left-hand side target matrix
2262  , typename MT4 // Type of the left-hand side matrix operand
2263  , typename MT5 > // Type of the right-hand side matrix operand
2264  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2265  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2266  {
2267  const size_t M( A.rows() );
2268  const size_t N( B.columns() );
2269  const size_t K( A.columns() );
2270 
2271  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2272 
2273  for( size_t i=0UL; i<M; ++i )
2274  {
// Range of k for row i, narrowed by the upper/lower structure of A (MT4).
2275  const size_t kbegin( ( IsUpper<MT4>::value )
2276  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2277  :( 0UL ) );
2278  const size_t kend( ( IsLower<MT4>::value )
2279  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2280  :( K ) );
2281  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2282 
2283  for( size_t k=kbegin; k<kend; ++k )
2284  {
// Range of j for row k of B, narrowed by the structure of B (MT5) and by UPP/LOW.
2285  const size_t jbegin( ( IsUpper<MT5>::value )
2287  ?( UPP ? max(i,k+1UL) : k+1UL )
2288  :( UPP ? max(i,k) : k ) )
2289  :( UPP ? i : 0UL ) );
2290  const size_t jend( ( IsLower<MT5>::value )
2292  ?( LOW ? min(i+1UL,k) : k )
2293  :( LOW ? min(i,k)+1UL : k+1UL ) )
2294  :( LOW ? i+1UL : N ) );
2295 
// With LOW or UPP the column range may be empty; skip it before computing jnum.
2296  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
2297  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2298 
// jpos = jbegin plus the largest even count not exceeding jnum (two columns per step).
2299  const size_t jnum( jend - jbegin );
2300  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2301 
2302  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2303  (~C)(i,j ) += A(i,k) * B(k,j );
2304  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2305  }
// Handle the single leftover column for an odd column count.
2306  if( jpos < jend ) {
2307  (~C)(i,jpos) += A(i,k) * B(k,jpos);
2308  }
2309  }
2310  }
2311  }
2313  //**********************************************************************************************
2314 
2315  //**Default addition assignment to column-major dense matrices (general/general)****************
// Default (scalar) addition assignment C += A*B for a column-major target when
// neither operand is diagonal. Iterates over the columns j of C, then over the inner
// index k, and adds A(i,k) * B(k,j) to C(i,j) for the admissible rows i.
// LOW and UPP are declared outside this block; presumably they flag a lower/upper
// triangular result - confirm against the class definition.
// NOTE(review): the embedded listing numbers 2354 and 2359 are missing; one line of
// each nested conditional for 'ibegin' and 'iend' is not visible here.
2329  template< typename MT3 // Type of the left-hand side target matrix
2330  , typename MT4 // Type of the left-hand side matrix operand
2331  , typename MT5 > // Type of the right-hand side matrix operand
2332  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2333  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2334  {
2335  const size_t M( A.rows() );
2336  const size_t N( B.columns() );
2337  const size_t K( A.columns() );
2338 
2339  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2340 
2341  for( size_t j=0UL; j<N; ++j )
2342  {
// Range of k for column j, narrowed by the lower/upper structure of B (MT5).
2343  const size_t kbegin( ( IsLower<MT5>::value )
2344  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2345  :( 0UL ) );
2346  const size_t kend( ( IsUpper<MT5>::value )
2347  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2348  :( K ) );
2349  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2350 
2351  for( size_t k=kbegin; k<kend; ++k )
2352  {
// Range of i for column k of A, narrowed by the structure of A (MT4) and by LOW/UPP.
2353  const size_t ibegin( ( IsLower<MT4>::value )
2355  ?( LOW ? max(j,k+1UL) : k+1UL )
2356  :( LOW ? max(j,k) : k ) )
2357  :( LOW ? j : 0UL ) );
2358  const size_t iend( ( IsUpper<MT4>::value )
2360  ?( UPP ? min(j+1UL,k) : k )
2361  :( UPP ? min(j,k)+1UL : k+1UL ) )
2362  :( UPP ? j+1UL : M ) );
2363 
// With LOW or UPP the row range may be empty; skip it before computing inum.
2364  if( ( LOW || UPP ) && ibegin >= iend ) continue;
2365  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2366 
// ipos = ibegin plus the largest even count not exceeding inum (two rows per step).
2367  const size_t inum( iend - ibegin );
2368  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2369 
2370  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2371  (~C)(i ,j) += A(i ,k) * B(k,j);
2372  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2373  }
// Handle the single leftover row for an odd row count.
2374  if( ipos < iend ) {
2375  (~C)(ipos,j) += A(ipos,k) * B(k,j);
2376  }
2377  }
2378  }
2379  }
2381  //**********************************************************************************************
2382 
2383  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
2397  template< typename MT3 // Type of the left-hand side target matrix
2398  , typename MT4 // Type of the left-hand side matrix operand
2399  , typename MT5 > // Type of the right-hand side matrix operand
2400  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2401  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2402  {
2403  constexpr size_t block( BLOCK_SIZE );
2404 
2405  const size_t M( A.rows() );
2406  const size_t N( B.columns() );
2407 
2408  for( size_t ii=0UL; ii<M; ii+=block ) {
2409  const size_t iend( min( M, ii+block ) );
2410  for( size_t jj=0UL; jj<N; jj+=block ) {
2411  const size_t jend( min( N, jj+block ) );
2412  for( size_t i=ii; i<iend; ++i )
2413  {
2414  const size_t jbegin( ( IsUpper<MT4>::value )
2415  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
2416  :( jj ) );
2417  const size_t jpos( ( IsLower<MT4>::value )
2418  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
2419  :( jend ) );
2420 
2421  for( size_t j=jbegin; j<jpos; ++j ) {
2422  (~C)(i,j) += A(i,j) * B(j,j);
2423  }
2424  }
2425  }
2426  }
2427  }
2429  //**********************************************************************************************
2430 
2431  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
2445  template< typename MT3 // Type of the left-hand side target matrix
2446  , typename MT4 // Type of the left-hand side matrix operand
2447  , typename MT5 > // Type of the right-hand side matrix operand
2448  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2449  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2450  {
2451  const size_t M( A.rows() );
2452  const size_t N( B.columns() );
2453 
2454  for( size_t j=0UL; j<N; ++j )
2455  {
2456  const size_t ibegin( ( IsLower<MT4>::value )
2457  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2458  :( 0UL ) );
2459  const size_t iend( ( IsUpper<MT4>::value )
2460  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
2461  :( M ) );
2462  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2463 
2464  const size_t inum( iend - ibegin );
2465  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2466 
2467  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2468  (~C)(i ,j) += A(i ,j) * B(j,j);
2469  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
2470  }
2471  if( ipos < iend ) {
2472  (~C)(ipos,j) += A(ipos,j) * B(j,j);
2473  }
2474  }
2475  }
2477  //**********************************************************************************************
2478 
2479  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
2493  template< typename MT3 // Type of the left-hand side target matrix
2494  , typename MT4 // Type of the left-hand side matrix operand
2495  , typename MT5 > // Type of the right-hand side matrix operand
2496  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2497  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2498  {
2499  const size_t M( A.rows() );
2500  const size_t N( B.columns() );
2501 
2502  for( size_t i=0UL; i<M; ++i )
2503  {
2504  const size_t jbegin( ( IsUpper<MT5>::value )
2505  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2506  :( 0UL ) );
2507  const size_t jend( ( IsLower<MT5>::value )
2508  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
2509  :( N ) );
2510  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2511 
2512  const size_t jnum( jend - jbegin );
2513  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2514 
2515  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2516  (~C)(i,j ) += A(i,i) * B(i,j );
2517  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
2518  }
2519  if( jpos < jend ) {
2520  (~C)(i,jpos) += A(i,i) * B(i,jpos);
2521  }
2522  }
2523  }
2525  //**********************************************************************************************
2526 
2527  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
2541  template< typename MT3 // Type of the left-hand side target matrix
2542  , typename MT4 // Type of the left-hand side matrix operand
2543  , typename MT5 > // Type of the right-hand side matrix operand
2544  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2545  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2546  {
2547  constexpr size_t block( BLOCK_SIZE );
2548 
2549  const size_t M( A.rows() );
2550  const size_t N( B.columns() );
2551 
2552  for( size_t jj=0UL; jj<N; jj+=block ) {
2553  const size_t jend( min( N, jj+block ) );
2554  for( size_t ii=0UL; ii<M; ii+=block ) {
2555  const size_t iend( min( M, ii+block ) );
2556  for( size_t j=jj; j<jend; ++j )
2557  {
2558  const size_t ibegin( ( IsLower<MT5>::value )
2559  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
2560  :( ii ) );
2561  const size_t ipos( ( IsUpper<MT5>::value )
2562  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
2563  :( iend ) );
2564 
2565  for( size_t i=ibegin; i<ipos; ++i ) {
2566  (~C)(i,j) += A(i,i) * B(i,j);
2567  }
2568  }
2569  }
2570  }
2571  }
2573  //**********************************************************************************************
2574 
2575  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2589  template< typename MT3 // Type of the left-hand side target matrix
2590  , typename MT4 // Type of the left-hand side matrix operand
2591  , typename MT5 > // Type of the right-hand side matrix operand
2592  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2593  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2594  {
2595  for( size_t i=0UL; i<A.rows(); ++i ) {
2596  C(i,i) += A(i,i) * B(i,i);
2597  }
2598  }
2600  //**********************************************************************************************
2601 
2602  //**Default addition assignment to dense matrices (small matrices)******************************
// Addition assignment kernel for small matrices (non-vectorized variant): simply
// forwards to selectDefaultAddAssignKernel().
// NOTE(review): the embedded listing number 2619 is missing; the line carrying the
// return type (and presumably the condition that enables this overload) is not
// visible here.
2616  template< typename MT3 // Type of the left-hand side target matrix
2617  , typename MT4 // Type of the left-hand side matrix operand
2618  , typename MT5 > // Type of the right-hand side matrix operand
2620  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2621  {
2622  selectDefaultAddAssignKernel( C, A, B );
2623  }
2625  //**********************************************************************************************
2626 
2627  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized addition assignment kernel C += A*B for small matrices and a row-major
// target. The columns of C are processed in panels of decreasing width
// (SIMDSIZE*8, *5, *4, *3, *2, *1 and finally single scalar columns). For every
// panel the current values of C are loaded into SIMD accumulators, the products of
// the broadcast element A(i,k) and the SIMD-loaded row k of B are added, and the
// accumulators are stored back to C.
// In every panel 'kbegin'/'kend' restrict the inner index k according to the
// upper/lower structure of A (MT4) and B (MT5).
// LOW and UPP are declared outside this block; presumably they flag a lower/upper
// triangular result - confirm against the class definition.
// NOTE(review): the embedded listing numbers 2645 and 2661 are missing; the line
// carrying the return type and the statement that opens the brace in front of the
// SIMDSIZE*8 panel loop are not visible here.
2642  template< typename MT3 // Type of the left-hand side target matrix
2643  , typename MT4 // Type of the left-hand side matrix operand
2644  , typename MT5 > // Type of the right-hand side matrix operand
2646  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2647  {
// A scalar remainder loop is required unless both C and B are padded.
2648  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
2649 
2650  const size_t M( A.rows() );
2651  const size_t N( B.columns() );
2652  const size_t K( A.columns() );
2653 
2654  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2655 
// jpos: end of the column range handled with SIMD operations. With a remainder it
// is N without its N % SIMDSIZE trailing columns (checked by the assertion below).
2656  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2657  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2658 
2659  size_t j( 0UL );
2660 
// Panels of SIMDSIZE*8 columns, one row of C at a time (only if neither LOW nor UPP).
2662  {
2663  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2664  for( size_t i=0UL; i<M; ++i )
2665  {
2666  const size_t kbegin( ( IsUpper<MT4>::value )
2667  ?( ( IsLower<MT5>::value )
2668  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2669  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2670  :( IsLower<MT5>::value ? j : 0UL ) );
2671  const size_t kend( ( IsLower<MT4>::value )
2672  ?( ( IsUpper<MT5>::value )
2673  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
2674  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2675  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
2676 
2677  SIMDType xmm1( (~C).load(i,j ) );
2678  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2679  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2680  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2681  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2682  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
2683  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
2684  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
2685 
2686  for( size_t k=kbegin; k<kend; ++k ) {
2687  const SIMDType a1( set( A(i,k) ) );
2688  xmm1 += a1 * B.load(k,j );
2689  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2690  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2691  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2692  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2693  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
2694  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
2695  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
2696  }
2697 
2698  (~C).store( i, j , xmm1 );
2699  (~C).store( i, j+SIMDSIZE , xmm2 );
2700  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2701  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2702  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2703  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
2704  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
2705  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
2706  }
2707  }
2708  }
2709 
// Panels of SIMDSIZE*5 columns, two rows of C at a time plus a possible last single row.
2710  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
2711  {
2712  size_t i( 0UL );
2713 
2714  for( ; (i+2UL) <= M; i+=2UL )
2715  {
2716  const size_t kbegin( ( IsUpper<MT4>::value )
2717  ?( ( IsLower<MT5>::value )
2718  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2719  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2720  :( IsLower<MT5>::value ? j : 0UL ) );
2721  const size_t kend( ( IsLower<MT4>::value )
2722  ?( ( IsUpper<MT5>::value )
2723  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
2724  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2725  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
2726 
2727  SIMDType xmm1 ( (~C).load(i ,j ) );
2728  SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
2729  SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
2730  SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
2731  SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
2732  SIMDType xmm6 ( (~C).load(i+1UL,j ) );
2733  SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
2734  SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2735  SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2736  SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
2737 
2738  for( size_t k=kbegin; k<kend; ++k ) {
2739  const SIMDType a1( set( A(i ,k) ) );
2740  const SIMDType a2( set( A(i+1UL,k) ) );
2741  const SIMDType b1( B.load(k,j ) );
2742  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2743  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2744  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2745  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
2746  xmm1 += a1 * b1;
2747  xmm2 += a1 * b2;
2748  xmm3 += a1 * b3;
2749  xmm4 += a1 * b4;
2750  xmm5 += a1 * b5;
2751  xmm6 += a2 * b1;
2752  xmm7 += a2 * b2;
2753  xmm8 += a2 * b3;
2754  xmm9 += a2 * b4;
2755  xmm10 += a2 * b5;
2756  }
2757 
2758  (~C).store( i , j , xmm1 );
2759  (~C).store( i , j+SIMDSIZE , xmm2 );
2760  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2761  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2762  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
2763  (~C).store( i+1UL, j , xmm6 );
2764  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
2765  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
2766  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
2767  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
2768  }
2769 
// Last single row of the panel for an odd number of rows.
2770  if( i < M )
2771  {
2772  const size_t kbegin( ( IsUpper<MT4>::value )
2773  ?( ( IsLower<MT5>::value )
2774  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2775  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2776  :( IsLower<MT5>::value ? j : 0UL ) );
2777  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
2778 
2779  SIMDType xmm1( (~C).load(i,j ) );
2780  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2781  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2782  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2783  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2784 
2785  for( size_t k=kbegin; k<kend; ++k ) {
2786  const SIMDType a1( set( A(i,k) ) );
2787  xmm1 += a1 * B.load(k,j );
2788  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2789  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2790  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2791  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2792  }
2793 
2794  (~C).store( i, j , xmm1 );
2795  (~C).store( i, j+SIMDSIZE , xmm2 );
2796  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2797  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2798  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2799  }
2800  }
2801 
// Panels of SIMDSIZE*4 columns, two rows of C at a time plus a possible last single row.
2802  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2803  {
2804  size_t i( 0UL );
2805 
2806  for( ; (i+2UL) <= M; i+=2UL )
2807  {
2808  const size_t kbegin( ( IsUpper<MT4>::value )
2809  ?( ( IsLower<MT5>::value )
2810  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2811  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2812  :( IsLower<MT5>::value ? j : 0UL ) );
2813  const size_t kend( ( IsLower<MT4>::value )
2814  ?( ( IsUpper<MT5>::value )
2815  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
2816  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2817  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
2818 
2819  SIMDType xmm1( (~C).load(i ,j ) );
2820  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2821  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2822  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
2823  SIMDType xmm5( (~C).load(i+1UL,j ) );
2824  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
2825  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2826  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2827 
2828  for( size_t k=kbegin; k<kend; ++k ) {
2829  const SIMDType a1( set( A(i ,k) ) );
2830  const SIMDType a2( set( A(i+1UL,k) ) );
2831  const SIMDType b1( B.load(k,j ) );
2832  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2833  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2834  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2835  xmm1 += a1 * b1;
2836  xmm2 += a1 * b2;
2837  xmm3 += a1 * b3;
2838  xmm4 += a1 * b4;
2839  xmm5 += a2 * b1;
2840  xmm6 += a2 * b2;
2841  xmm7 += a2 * b3;
2842  xmm8 += a2 * b4;
2843  }
2844 
2845  (~C).store( i , j , xmm1 );
2846  (~C).store( i , j+SIMDSIZE , xmm2 );
2847  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2848  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2849  (~C).store( i+1UL, j , xmm5 );
2850  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
2851  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
2852  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
2853  }
2854 
// Last single row of the panel for an odd number of rows.
2855  if( i < M )
2856  {
2857  const size_t kbegin( ( IsUpper<MT4>::value )
2858  ?( ( IsLower<MT5>::value )
2859  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2860  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2861  :( IsLower<MT5>::value ? j : 0UL ) );
2862  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
2863 
2864  SIMDType xmm1( (~C).load(i,j ) );
2865  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2866  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2867  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2868 
2869  for( size_t k=kbegin; k<kend; ++k ) {
2870  const SIMDType a1( set( A(i,k) ) );
2871  xmm1 += a1 * B.load(k,j );
2872  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2873  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2874  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2875  }
2876 
2877  (~C).store( i, j , xmm1 );
2878  (~C).store( i, j+SIMDSIZE , xmm2 );
2879  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2880  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2881  }
2882  }
2883 
// Panels of SIMDSIZE*3 columns, two rows of C at a time plus a possible last single row.
2884  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2885  {
2886  size_t i( 0UL );
2887 
2888  for( ; (i+2UL) <= M; i+=2UL )
2889  {
2890  const size_t kbegin( ( IsUpper<MT4>::value )
2891  ?( ( IsLower<MT5>::value )
2892  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2893  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2894  :( IsLower<MT5>::value ? j : 0UL ) );
2895  const size_t kend( ( IsLower<MT4>::value )
2896  ?( ( IsUpper<MT5>::value )
2897  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
2898  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2899  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
2900 
2901  SIMDType xmm1( (~C).load(i ,j ) );
2902  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2903  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2904  SIMDType xmm4( (~C).load(i+1UL,j ) );
2905  SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
2906  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2907 
2908  for( size_t k=kbegin; k<kend; ++k ) {
2909  const SIMDType a1( set( A(i ,k) ) );
2910  const SIMDType a2( set( A(i+1UL,k) ) );
2911  const SIMDType b1( B.load(k,j ) );
2912  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2913  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2914  xmm1 += a1 * b1;
2915  xmm2 += a1 * b2;
2916  xmm3 += a1 * b3;
2917  xmm4 += a2 * b1;
2918  xmm5 += a2 * b2;
2919  xmm6 += a2 * b3;
2920  }
2921 
2922  (~C).store( i , j , xmm1 );
2923  (~C).store( i , j+SIMDSIZE , xmm2 );
2924  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2925  (~C).store( i+1UL, j , xmm4 );
2926  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
2927  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
2928  }
2929 
// Last single row of the panel for an odd number of rows.
2930  if( i < M )
2931  {
2932  const size_t kbegin( ( IsUpper<MT4>::value )
2933  ?( ( IsLower<MT5>::value )
2934  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2935  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2936  :( IsLower<MT5>::value ? j : 0UL ) );
2937  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
2938 
2939  SIMDType xmm1( (~C).load(i,j ) );
2940  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2941  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2942 
2943  for( size_t k=kbegin; k<kend; ++k ) {
2944  const SIMDType a1( set( A(i,k) ) );
2945  xmm1 += a1 * B.load(k,j );
2946  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2947  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2948  }
2949 
2950  (~C).store( i, j , xmm1 );
2951  (~C).store( i, j+SIMDSIZE , xmm2 );
2952  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2953  }
2954  }
2955 
// Panels of SIMDSIZE*2 columns (skipped only if both LOW and UPP are set). The row
// range is limited by UPP (end) and LOW (start).
2956  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2957  {
2958  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
2959  size_t i( LOW ? j : 0UL );
2960 
2961  for( ; (i+2UL) <= iend; i+=2UL )
2962  {
2963  const size_t kbegin( ( IsUpper<MT4>::value )
2964  ?( ( IsLower<MT5>::value )
2965  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2966  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2967  :( IsLower<MT5>::value ? j : 0UL ) );
2968  const size_t kend( ( IsLower<MT4>::value )
2969  ?( ( IsUpper<MT5>::value )
2970  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
2971  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2972  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
2973 
2974  SIMDType xmm1( (~C).load(i ,j ) );
2975  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2976  SIMDType xmm3( (~C).load(i+1UL,j ) );
2977  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2978 
2979  for( size_t k=kbegin; k<kend; ++k ) {
2980  const SIMDType a1( set( A(i ,k) ) );
2981  const SIMDType a2( set( A(i+1UL,k) ) );
2982  const SIMDType b1( B.load(k,j ) );
2983  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2984  xmm1 += a1 * b1;
2985  xmm2 += a1 * b2;
2986  xmm3 += a2 * b1;
2987  xmm4 += a2 * b2;
2988  }
2989 
2990  (~C).store( i , j , xmm1 );
2991  (~C).store( i , j+SIMDSIZE, xmm2 );
2992  (~C).store( i+1UL, j , xmm3 );
2993  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
2994  }
2995 
// Last single row of the panel for an odd number of rows.
2996  if( i < iend )
2997  {
2998  const size_t kbegin( ( IsUpper<MT4>::value )
2999  ?( ( IsLower<MT5>::value )
3000  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3001  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3002  :( IsLower<MT5>::value ? j : 0UL ) );
3003  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
3004 
3005  SIMDType xmm1( (~C).load(i,j ) );
3006  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3007 
3008  for( size_t k=kbegin; k<kend; ++k ) {
3009  const SIMDType a1( set( A(i,k) ) );
3010  xmm1 += a1 * B.load(k,j );
3011  xmm2 += a1 * B.load(k,j+SIMDSIZE);
3012  }
3013 
3014  (~C).store( i, j , xmm1 );
3015  (~C).store( i, j+SIMDSIZE, xmm2 );
3016  }
3017  }
3018 
// Panels of a single SIMD vector (SIMDSIZE columns) for all remaining SIMD columns.
3019  for( ; j<jpos; j+=SIMDSIZE )
3020  {
3021  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
3022  size_t i( LOW ? j : 0UL );
3023 
3024  for( ; (i+2UL) <= iend; i+=2UL )
3025  {
3026  const size_t kbegin( ( IsUpper<MT4>::value )
3027  ?( ( IsLower<MT5>::value )
3028  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3029  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3030  :( IsLower<MT5>::value ? j : 0UL ) );
3031  const size_t kend( ( IsLower<MT4>::value )
3032  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3033  :( K ) );
3034 
3035  SIMDType xmm1( (~C).load(i ,j) );
3036  SIMDType xmm2( (~C).load(i+1UL,j) );
3037 
3038  for( size_t k=kbegin; k<kend; ++k ) {
3039  const SIMDType b1( B.load(k,j) );
3040  xmm1 += set( A(i ,k) ) * b1;
3041  xmm2 += set( A(i+1UL,k) ) * b1;
3042  }
3043 
3044  (~C).store( i , j, xmm1 );
3045  (~C).store( i+1UL, j, xmm2 );
3046  }
3047 
// Last single row of the panel; here k always runs up to K.
3048  if( i < iend )
3049  {
3050  const size_t kbegin( ( IsUpper<MT4>::value )
3051  ?( ( IsLower<MT5>::value )
3052  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3053  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3054  :( IsLower<MT5>::value ? j : 0UL ) );
3055 
3056  SIMDType xmm1( (~C).load(i,j) );
3057 
3058  for( size_t k=kbegin; k<K; ++k ) {
3059  xmm1 += set( A(i,k) ) * B.load(k,j);
3060  }
3061 
3062  (~C).store( i, j, xmm1 );
3063  }
3064  }
3065 
// Scalar processing of the columns from jpos to N (only if a remainder is required).
3066  for( ; remainder && j<N; ++j )
3067  {
3068  const size_t iend( UPP ? j+1UL : M );
3069  size_t i( LOW ? j : 0UL );
3070 
3071  for( ; (i+2UL) <= iend; i+=2UL )
3072  {
3073  const size_t kbegin( ( IsUpper<MT4>::value )
3074  ?( ( IsLower<MT5>::value )
3075  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3076  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3077  :( IsLower<MT5>::value ? j : 0UL ) );
3078  const size_t kend( ( IsLower<MT4>::value )
3079  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3080  :( K ) );
3081 
// NOTE(review): the second ';' after the value2 initialization is a stray empty statement.
3082  ElementType value1( (~C)(i ,j) );
3083  ElementType value2( (~C)(i+1UL,j) );;
3084 
3085  for( size_t k=kbegin; k<kend; ++k ) {
3086  value1 += A(i ,k) * B(k,j);
3087  value2 += A(i+1UL,k) * B(k,j);
3088  }
3089 
3090  (~C)(i ,j) = value1;
3091  (~C)(i+1UL,j) = value2;
3092  }
3093 
// Last single row of the scalar column; here k always runs up to K.
3094  if( i < iend )
3095  {
3096  const size_t kbegin( ( IsUpper<MT4>::value )
3097  ?( ( IsLower<MT5>::value )
3098  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3099  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3100  :( IsLower<MT5>::value ? j : 0UL ) );
3101 
3102  ElementType value( (~C)(i,j) );
3103 
3104  for( size_t k=kbegin; k<K; ++k ) {
3105  value += A(i,k) * B(k,j);
3106  }
3107 
3108  (~C)(i,j) = value;
3109  }
3110  }
3111  }
3113  //**********************************************************************************************
3114 
3115  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
3130  template< typename MT3 // Type of the left-hand side target matrix
3131  , typename MT4 // Type of the left-hand side matrix operand
3132  , typename MT5 > // Type of the right-hand side matrix operand
3134  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3135  {
3136  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
3137 
3138  const size_t M( A.rows() );
3139  const size_t N( B.columns() );
3140  const size_t K( A.columns() );
3141 
3142  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3143 
3144  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3145  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3146 
3147  size_t i( 0UL );
3148 
3150  {
3151  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3152  for( size_t j=0UL; j<N; ++j )
3153  {
3154  const size_t kbegin( ( IsLower<MT5>::value )
3155  ?( ( IsUpper<MT4>::value )
3156  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3157  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3158  :( IsUpper<MT4>::value ? i : 0UL ) );
3159  const size_t kend( ( IsUpper<MT5>::value )
3160  ?( ( IsLower<MT4>::value )
3161  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3162  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3163  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
3164 
3165  SIMDType xmm1( (~C).load(i ,j) );
3166  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3167  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3168  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3169  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3170  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3171  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3172  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3173 
3174  for( size_t k=kbegin; k<kend; ++k ) {
3175  const SIMDType b1( set( B(k,j) ) );
3176  xmm1 += A.load(i ,k) * b1;
3177  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3178  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3179  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3180  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3181  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
3182  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
3183  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
3184  }
3185 
3186  (~C).store( i , j, xmm1 );
3187  (~C).store( i+SIMDSIZE , j, xmm2 );
3188  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3189  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3190  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3191  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3192  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3193  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
3194  }
3195  }
3196  }
3197 
3198  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3199  {
3200  size_t j( 0UL );
3201 
3202  for( ; (j+2UL) <= N; j+=2UL )
3203  {
3204  const size_t kbegin( ( IsLower<MT5>::value )
3205  ?( ( IsUpper<MT4>::value )
3206  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3207  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3208  :( IsUpper<MT4>::value ? i : 0UL ) );
3209  const size_t kend( ( IsUpper<MT5>::value )
3210  ?( ( IsLower<MT4>::value )
3211  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3212  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3213  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
3214 
3215  SIMDType xmm1 ( (~C).load(i ,j ) );
3216  SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3217  SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3218  SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3219  SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3220  SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3221  SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3222  SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3223  SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3224  SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
3225 
3226  for( size_t k=kbegin; k<kend; ++k ) {
3227  const SIMDType a1( A.load(i ,k) );
3228  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3229  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3230  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3231  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3232  const SIMDType b1( set( B(k,j ) ) );
3233  const SIMDType b2( set( B(k,j+1UL) ) );
3234  xmm1 += a1 * b1;
3235  xmm2 += a2 * b1;
3236  xmm3 += a3 * b1;
3237  xmm4 += a4 * b1;
3238  xmm5 += a5 * b1;
3239  xmm6 += a1 * b2;
3240  xmm7 += a2 * b2;
3241  xmm8 += a3 * b2;
3242  xmm9 += a4 * b2;
3243  xmm10 += a5 * b2;
3244  }
3245 
3246  (~C).store( i , j , xmm1 );
3247  (~C).store( i+SIMDSIZE , j , xmm2 );
3248  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3249  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3250  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3251  (~C).store( i , j+1UL, xmm6 );
3252  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3253  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3254  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3255  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
3256  }
3257 
3258  if( j < N )
3259  {
3260  const size_t kbegin( ( IsLower<MT5>::value )
3261  ?( ( IsUpper<MT4>::value )
3262  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3263  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3264  :( IsUpper<MT4>::value ? i : 0UL ) );
3265  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
3266 
3267  SIMDType xmm1( (~C).load(i ,j) );
3268  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3269  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3270  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3271  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3272 
3273  for( size_t k=kbegin; k<kend; ++k ) {
3274  const SIMDType b1( set( B(k,j) ) );
3275  xmm1 += A.load(i ,k) * b1;
3276  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3277  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3278  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3279  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3280  }
3281 
3282  (~C).store( i , j, xmm1 );
3283  (~C).store( i+SIMDSIZE , j, xmm2 );
3284  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3285  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3286  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3287  }
3288  }
3289 
3290  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3291  {
3292  size_t j( 0UL );
3293 
3294  for( ; (j+2UL) <= N; j+=2UL )
3295  {
3296  const size_t kbegin( ( IsLower<MT5>::value )
3297  ?( ( IsUpper<MT4>::value )
3298  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3299  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3300  :( IsUpper<MT4>::value ? i : 0UL ) );
3301  const size_t kend( ( IsUpper<MT5>::value )
3302  ?( ( IsLower<MT4>::value )
3303  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3304  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3305  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
3306 
3307  SIMDType xmm1( (~C).load(i ,j ) );
3308  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3309  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3310  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3311  SIMDType xmm5( (~C).load(i ,j+1UL) );
3312  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3313  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3314  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3315 
3316  for( size_t k=kbegin; k<kend; ++k ) {
3317  const SIMDType a1( A.load(i ,k) );
3318  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3319  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3320  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3321  const SIMDType b1( set( B(k,j ) ) );
3322  const SIMDType b2( set( B(k,j+1UL) ) );
3323  xmm1 += a1 * b1;
3324  xmm2 += a2 * b1;
3325  xmm3 += a3 * b1;
3326  xmm4 += a4 * b1;
3327  xmm5 += a1 * b2;
3328  xmm6 += a2 * b2;
3329  xmm7 += a3 * b2;
3330  xmm8 += a4 * b2;
3331  }
3332 
3333  (~C).store( i , j , xmm1 );
3334  (~C).store( i+SIMDSIZE , j , xmm2 );
3335  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3336  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3337  (~C).store( i , j+1UL, xmm5 );
3338  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3339  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3340  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3341  }
3342 
3343  if( j < N )
3344  {
3345  const size_t kbegin( ( IsLower<MT5>::value )
3346  ?( ( IsUpper<MT4>::value )
3347  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3348  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3349  :( IsUpper<MT4>::value ? i : 0UL ) );
3350  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3351 
3352  SIMDType xmm1( (~C).load(i ,j) );
3353  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3354  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3355  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3356 
3357  for( size_t k=kbegin; k<kend; ++k ) {
3358  const SIMDType b1( set( B(k,j) ) );
3359  xmm1 += A.load(i ,k) * b1;
3360  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3361  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3362  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3363  }
3364 
3365  (~C).store( i , j, xmm1 );
3366  (~C).store( i+SIMDSIZE , j, xmm2 );
3367  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3368  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3369  }
3370  }
3371 
3372  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3373  {
3374  size_t j( 0UL );
3375 
3376  for( ; (j+2UL) <= N; j+=2UL )
3377  {
3378  const size_t kbegin( ( IsLower<MT5>::value )
3379  ?( ( IsUpper<MT4>::value )
3380  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3381  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3382  :( IsUpper<MT4>::value ? i : 0UL ) );
3383  const size_t kend( ( IsUpper<MT5>::value )
3384  ?( ( IsLower<MT4>::value )
3385  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3386  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3387  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
3388 
3389  SIMDType xmm1( (~C).load(i ,j ) );
3390  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3391  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3392  SIMDType xmm4( (~C).load(i ,j+1UL) );
3393  SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
3394  SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3395 
3396  for( size_t k=kbegin; k<kend; ++k ) {
3397  const SIMDType a1( A.load(i ,k) );
3398  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3399  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3400  const SIMDType b1( set( B(k,j ) ) );
3401  const SIMDType b2( set( B(k,j+1UL) ) );
3402  xmm1 += a1 * b1;
3403  xmm2 += a2 * b1;
3404  xmm3 += a3 * b1;
3405  xmm4 += a1 * b2;
3406  xmm5 += a2 * b2;
3407  xmm6 += a3 * b2;
3408  }
3409 
3410  (~C).store( i , j , xmm1 );
3411  (~C).store( i+SIMDSIZE , j , xmm2 );
3412  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3413  (~C).store( i , j+1UL, xmm4 );
3414  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
3415  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
3416  }
3417 
3418  if( j < N )
3419  {
3420  const size_t kbegin( ( IsLower<MT5>::value )
3421  ?( ( IsUpper<MT4>::value )
3422  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3423  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3424  :( IsUpper<MT4>::value ? i : 0UL ) );
3425  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
3426 
3427  SIMDType xmm1( (~C).load(i ,j) );
3428  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3429  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3430 
3431  for( size_t k=kbegin; k<kend; ++k ) {
3432  const SIMDType b1( set( B(k,j) ) );
3433  xmm1 += A.load(i ,k) * b1;
3434  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3435  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3436  }
3437 
3438  (~C).store( i , j, xmm1 );
3439  (~C).store( i+SIMDSIZE , j, xmm2 );
3440  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3441  }
3442  }
3443 
3444  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3445  {
3446  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
3447  size_t j( UPP ? i : 0UL );
3448 
3449  for( ; (j+2UL) <= jend; j+=2UL )
3450  {
3451  const size_t kbegin( ( IsLower<MT5>::value )
3452  ?( ( IsUpper<MT4>::value )
3453  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3454  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3455  :( IsUpper<MT4>::value ? i : 0UL ) );
3456  const size_t kend( ( IsUpper<MT5>::value )
3457  ?( ( IsLower<MT4>::value )
3458  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3459  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3460  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
3461 
3462  SIMDType xmm1( (~C).load(i ,j ) );
3463  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3464  SIMDType xmm3( (~C).load(i ,j+1UL) );
3465  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3466 
3467  for( size_t k=kbegin; k<kend; ++k ) {
3468  const SIMDType a1( A.load(i ,k) );
3469  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3470  const SIMDType b1( set( B(k,j ) ) );
3471  const SIMDType b2( set( B(k,j+1UL) ) );
3472  xmm1 += a1 * b1;
3473  xmm2 += a2 * b1;
3474  xmm3 += a1 * b2;
3475  xmm4 += a2 * b2;
3476  }
3477 
3478  (~C).store( i , j , xmm1 );
3479  (~C).store( i+SIMDSIZE, j , xmm2 );
3480  (~C).store( i , j+1UL, xmm3 );
3481  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3482  }
3483 
3484  if( j < jend )
3485  {
3486  const size_t kbegin( ( IsLower<MT5>::value )
3487  ?( ( IsUpper<MT4>::value )
3488  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3489  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3490  :( IsUpper<MT4>::value ? i : 0UL ) );
3491  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
3492 
3493  SIMDType xmm1( (~C).load(i ,j) );
3494  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3495 
3496  for( size_t k=kbegin; k<kend; ++k ) {
3497  const SIMDType b1( set( B(k,j) ) );
3498  xmm1 += A.load(i ,k) * b1;
3499  xmm2 += A.load(i+SIMDSIZE,k) * b1;
3500  }
3501 
3502  (~C).store( i , j, xmm1 );
3503  (~C).store( i+SIMDSIZE, j, xmm2 );
3504  }
3505  }
3506 
3507  for( ; i<ipos; i+=SIMDSIZE )
3508  {
3509  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
3510  size_t j( UPP ? i : 0UL );
3511 
3512  for( ; (j+2UL) <= jend; j+=2UL )
3513  {
3514  const size_t kbegin( ( IsLower<MT5>::value )
3515  ?( ( IsUpper<MT4>::value )
3516  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3517  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3518  :( IsUpper<MT4>::value ? i : 0UL ) );
3519  const size_t kend( ( IsUpper<MT5>::value )
3520  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3521  :( K ) );
3522 
3523  SIMDType xmm1( (~C).load(i,j ) );
3524  SIMDType xmm2( (~C).load(i,j+1UL) );
3525 
3526  for( size_t k=kbegin; k<kend; ++k ) {
3527  const SIMDType a1( A.load(i,k) );
3528  xmm1 += a1 * set( B(k,j ) );
3529  xmm2 += a1 * set( B(k,j+1UL) );
3530  }
3531 
3532  (~C).store( i, j , xmm1 );
3533  (~C).store( i, j+1UL, xmm2 );
3534  }
3535 
3536  if( j < jend )
3537  {
3538  const size_t kbegin( ( IsLower<MT5>::value )
3539  ?( ( IsUpper<MT4>::value )
3540  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3541  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3542  :( IsUpper<MT4>::value ? i : 0UL ) );
3543 
3544  SIMDType xmm1( (~C).load(i,j) );
3545 
3546  for( size_t k=kbegin; k<K; ++k ) {
3547  xmm1 += A.load(i,k) * set( B(k,j) );
3548  }
3549 
3550  (~C).store( i, j, xmm1 );
3551  }
3552  }
3553 
3554  for( ; remainder && i<M; ++i )
3555  {
3556  const size_t jend( LOW ? i+1UL : N );
3557  size_t j( UPP ? i : 0UL );
3558 
3559  for( ; (j+2UL) <= jend; j+=2UL )
3560  {
3561  const size_t kbegin( ( IsLower<MT5>::value )
3562  ?( ( IsUpper<MT4>::value )
3563  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3564  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3565  :( IsUpper<MT4>::value ? i : 0UL ) );
3566  const size_t kend( ( IsUpper<MT5>::value )
3567  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3568  :( K ) );
3569 
3570  ElementType value1( (~C)(i,j ) );
3571  ElementType value2( (~C)(i,j+1UL) );
3572 
3573  for( size_t k=kbegin; k<kend; ++k ) {
3574  value1 += A(i,k) * B(k,j );
3575  value2 += A(i,k) * B(k,j+1UL);
3576  }
3577 
3578  (~C)(i,j ) = value1;
3579  (~C)(i,j+1UL) = value2;
3580  }
3581 
3582  if( j < jend )
3583  {
3584  const size_t kbegin( ( IsLower<MT5>::value )
3585  ?( ( IsUpper<MT4>::value )
3586  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3587  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3588  :( IsUpper<MT4>::value ? i : 0UL ) );
3589 
3590  ElementType value( (~C)(i,j) );
3591 
3592  for( size_t k=kbegin; k<K; ++k ) {
3593  value += A(i,k) * B(k,j);
3594  }
3595 
3596  (~C)(i,j) = value;
3597  }
3598  }
3599  }
3601  //**********************************************************************************************
3602 
3603  //**Default addition assignment to dense matrices (large matrices)******************************
// Default large-matrix addition-assignment kernel: forwards C, A and B unchanged to
// selectDefaultAddAssignKernel().
// NOTE(review): the listing jumps from inner line 3619 to 3621; the missing line presumably
// holds the return type / overload constraint of this function -- confirm against the header.
3617  template< typename MT3 // Type of the left-hand side target matrix
3618  , typename MT4 // Type of the left-hand side matrix operand
3619  , typename MT5 > // Type of the right-hand side matrix operand
3621  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3622  {
3623  selectDefaultAddAssignKernel( C, A, B );
3624  }
3626  //**********************************************************************************************
3627 
3628  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Vectorized large-matrix addition-assignment kernel: picks one of the lmmm()/ummm()/mmm()
// routines based on the LOW and UPP flags and calls it with both scalar arguments set to
// ElementType(1).
// NOTE(review): the listing jumps from inner line 3645 to 3647; the missing line presumably
// holds the return type / overload constraint -- confirm against the header.
3643  template< typename MT3 // Type of the left-hand side target matrix
3644  , typename MT4 // Type of the left-hand side matrix operand
3645  , typename MT5 > // Type of the right-hand side matrix operand
3647  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3648  {
// LOW is tested first, so lmmm() is chosen whenever LOW is set, regardless of UPP.
// Presumably the two ElementType(1) arguments scale the product and the existing content
// of C (i.e. C = 1*A*B + 1*C) -- confirm against blaze/math/dense/MMM.h.
3649  if( LOW )
3650  lmmm( C, A, B, ElementType(1), ElementType(1) );
3651  else if( UPP )
3652  ummm( C, A, B, ElementType(1), ElementType(1) );
3653  else
3654  mmm( C, A, B, ElementType(1), ElementType(1) );
3655  }
3657  //**********************************************************************************************
3658 
3659  //**BLAS-based addition assignment to dense matrices (default)**********************************
// Default variant of the BLAS addition-assignment kernel: performs no BLAS call itself and
// forwards C, A and B unchanged to selectLargeAddAssignKernel().
// NOTE(review): the listing jumps from inner line 3675 to 3677; the missing line presumably
// holds the return type / overload constraint -- confirm against the header.
3673  template< typename MT3 // Type of the left-hand side target matrix
3674  , typename MT4 // Type of the left-hand side matrix operand
3675  , typename MT5 > // Type of the right-hand side matrix operand
3677  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3678  {
3679  selectLargeAddAssignKernel( C, A, B );
3680  }
3682  //**********************************************************************************************
3683 
3684  //**BLAS-based addition assignment to dense matrices********************************************
3685 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
3686 
// BLAS-based addition-assignment kernel. Uses trmm() on a temporary when either operand is
// triangular and gemm() directly on C otherwise.
// NOTE(review): the listing jumps from inner line 3701 to 3703; the missing line presumably
// holds the return type / overload constraint -- confirm against the header.
3699  template< typename MT3 // Type of the left-hand side target matrix
3700  , typename MT4 // Type of the left-hand side matrix operand
3701  , typename MT5 > // Type of the right-hand side matrix operand
3703  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3704  {
3705  typedef ElementType_<MT3> ET;
3706 
// Triangular A: copy B into a temporary of C's result type, apply trmm() with A on the
// left (CblasLeft), then add the temporary to C. Presumably trmm() overwrites tmp with
// A*tmp -- confirm against blaze/math/blas/trmm.h.
3707  if( IsTriangular<MT4>::value ) {
3708  ResultType_<MT3> tmp( serial( B ) );
3709  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3710  addAssign( C, tmp );
3711  }
// Triangular B (and A not triangular): same scheme with a copy of A and B applied from
// the right (CblasRight).
3712  else if( IsTriangular<MT5>::value ) {
3713  ResultType_<MT3> tmp( serial( A ) );
3714  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3715  addAssign( C, tmp );
3716  }
// Neither operand is triangular: gemm() is called on C directly with both scalar
// arguments set to ET(1).
3717  else {
3718  gemm( C, A, B, ET(1), ET(1) );
3719  }
3720  }
3722 #endif
3723  //**********************************************************************************************
3724 
3725  //**Addition assignment to sparse matrices******************************************************
3726  // No special implementation for the addition assignment to sparse matrices.
3727  //**********************************************************************************************
3728 
3729  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a TDMatDMatMultExpr to a dense matrix. Checks the dimensions,
// returns early for empty operands, evaluates both operands and hands the work to
// selectSubAssignKernel().
3742  template< typename MT // Type of the target dense matrix
3743  , bool SO > // Storage order of the target dense matrix
3744  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
3745  {
// NOTE(review): inner line 3746 is absent from this listing; whatever statement stood here
// is not visible -- confirm against the header.
3747 
3748  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3749  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3750 
// Nothing to do if the target has no rows or columns, or the left operand has no columns.
3751  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3752  return;
3753  }
3754 
3755  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3756  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3757 
// The evaluated operands must keep the dimensions of the originals and match the target.
3758  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3759  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3760  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3761  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3762  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3763  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3764 
3765  TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3766  }
3768  //**********************************************************************************************
3769 
3770  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Kernel selection for the subtraction assignment: chooses between
// selectSmallSubAssignKernel() and selectBlasSubAssignKernel().
3781  template< typename MT3 // Type of the left-hand side target matrix
3782  , typename MT4 // Type of the left-hand side matrix operand
3783  , typename MT5 > // Type of the right-hand side matrix operand
3784  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3785  {
// NOTE(review): inner line 3786 is absent from this listing; it presumably opens the
// `if(` condition with a first alternative -- confirm against the header.
// Visible alternatives that select the small kernel:
//  - not in debug mode, row-major C and B has at most SIMDSIZE*10 columns
//  - not in debug mode, column-major C and A has at most SIMDSIZE*10 rows
//  - the element count of C is below TDMATDMATMULT_THRESHOLD
3787  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
3788  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
3789  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
3790  selectSmallSubAssignKernel( C, A, B );
3791  else
3792  selectBlasSubAssignKernel( C, A, B );
3793  }
3795  //**********************************************************************************************
3796 
3797  //**Default subtraction assignment to row-major dense matrices (general/general)****************
// Default subtraction-assignment kernel for a row-major target, enabled when neither
// operand type is diagonal. For every row i and every inner index k it subtracts
// A(i,k) * B(k,j) from C(i,j) over a column range [jbegin,jend).
3811  template< typename MT3 // Type of the left-hand side target matrix
3812  , typename MT4 // Type of the left-hand side matrix operand
3813  , typename MT5 > // Type of the right-hand side matrix operand
3814  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
3815  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3816  {
3817  const size_t M( A.rows() );
3818  const size_t N( B.columns() );
3819  const size_t K( A.columns() );
3820 
// When LOW or UPP is set the product is expected to be square.
3821  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3822 
3823  for( size_t i=0UL; i<M; ++i )
3824  {
// Inner-index range for row i, narrowed by the compile-time structure of A:
// an upper A starts at i (i+1 if strictly upper), a lower A ends at i+1 (i if
// strictly lower); otherwise the full range [0,K) is used.
3825  const size_t kbegin( ( IsUpper<MT4>::value )
3826  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
3827  :( 0UL ) );
3828  const size_t kend( ( IsLower<MT4>::value )
3829  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
3830  :( K ) );
3831  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3832 
3833  for( size_t k=kbegin; k<kend; ++k )
3834  {
// Column range for this (i,k), narrowed by the structure of B and by UPP/LOW.
// NOTE(review): inner lines 3836 and 3841 are absent from this listing; each
// presumably holds the nested condition choosing between the two alternatives
// that follow (k+1 vs. k based bounds) -- confirm against the header.
3835  const size_t jbegin( ( IsUpper<MT5>::value )
3837  ?( UPP ? max(i,k+1UL) : k+1UL )
3838  :( UPP ? max(i,k) : k ) )
3839  :( UPP ? i : 0UL ) );
3840  const size_t jend( ( IsLower<MT5>::value )
3842  ?( LOW ? min(i+1UL,k) : k )
3843  :( LOW ? min(i,k)+1UL : k+1UL ) )
3844  :( LOW ? i+1UL : N ) );
3845 
// With LOW/UPP the combined bounds can describe an empty range; skip it.
3846  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
3847  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3848 
// jpos marks the end of the part processed two columns at a time: the mask
// size_t(-2) clears the lowest bit of jnum, rounding it down to an even count.
3849  const size_t jnum( jend - jbegin );
3850  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3851 
3852  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3853  (~C)(i,j ) -= A(i,k) * B(k,j );
3854  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3855  }
// Remaining single column when the range length is odd.
3856  if( jpos < jend ) {
3857  (~C)(i,jpos) -= A(i,k) * B(k,jpos);
3858  }
3859  }
3860  }
3861  }
3863  //**********************************************************************************************
3864 
3865  //**Default subtraction assignment to column-major dense matrices (general/general)*************
// Default subtraction-assignment kernel for a column-major target, enabled when neither
// operand type is diagonal. For every column j and every inner index k it subtracts
// A(i,k) * B(k,j) from C(i,j) over a row range [ibegin,iend).
3879  template< typename MT3 // Type of the left-hand side target matrix
3880  , typename MT4 // Type of the left-hand side matrix operand
3881  , typename MT5 > // Type of the right-hand side matrix operand
3882  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
3883  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3884  {
3885  const size_t M( A.rows() );
3886  const size_t N( B.columns() );
3887  const size_t K( A.columns() );
3888 
// When LOW or UPP is set the product is expected to be square.
3889  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3890 
3891  for( size_t j=0UL; j<N; ++j )
3892  {
// Inner-index range for column j, narrowed by the compile-time structure of B:
// a lower B starts at j (j+1 if strictly lower), an upper B ends at j+1 (j if
// strictly upper); otherwise the full range [0,K) is used.
3893  const size_t kbegin( ( IsLower<MT5>::value )
3894  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
3895  :( 0UL ) );
3896  const size_t kend( ( IsUpper<MT5>::value )
3897  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
3898  :( K ) );
3899  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3900 
3901  for( size_t k=kbegin; k<kend; ++k )
3902  {
// Row range for this (k,j), narrowed by the structure of A and by LOW/UPP.
// NOTE(review): inner lines 3904 and 3909 are absent from this listing; each
// presumably holds the nested condition choosing between the two alternatives
// that follow (k+1 vs. k based bounds) -- confirm against the header.
3903  const size_t ibegin( ( IsLower<MT4>::value )
3905  ?( LOW ? max(j,k+1UL) : k+1UL )
3906  :( LOW ? max(j,k) : k ) )
3907  :( LOW ? j : 0UL ) );
3908  const size_t iend( ( IsUpper<MT4>::value )
3910  ?( UPP ? min(j+1UL,k) : k )
3911  :( UPP ? min(j,k)+1UL : k+1UL ) )
3912  :( UPP ? j+1UL : M ) );
3913 
// With LOW/UPP the combined bounds can describe an empty range; skip it.
3914  if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
3915  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3916 
// ipos marks the end of the part processed two rows at a time: the mask
// size_t(-2) clears the lowest bit of inum, rounding it down to an even count.
3917  const size_t inum( iend - ibegin );
3918  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3919 
3920  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3921  (~C)(i ,j) -= A(i ,k) * B(k,j);
3922  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3923  }
// Remaining single row when the range length is odd.
3924  if( ipos < iend ) {
3925  (~C)(ipos,j) -= A(ipos,k) * B(k,j);
3926  }
3927  }
3928  }
3929  }
3931  //**********************************************************************************************
3932 
3933  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
3947  template< typename MT3 // Type of the left-hand side target matrix
3948  , typename MT4 // Type of the left-hand side matrix operand
3949  , typename MT5 > // Type of the right-hand side matrix operand
3950  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
3951  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3952  {
3953  constexpr size_t block( BLOCK_SIZE );
3954 
3955  const size_t M( A.rows() );
3956  const size_t N( B.columns() );
3957 
3958  for( size_t ii=0UL; ii<M; ii+=block ) {
3959  const size_t iend( min( M, ii+block ) );
3960  for( size_t jj=0UL; jj<N; jj+=block ) {
3961  const size_t jend( min( N, jj+block ) );
3962  for( size_t i=ii; i<iend; ++i )
3963  {
3964  const size_t jbegin( ( IsUpper<MT4>::value )
3965  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
3966  :( jj ) );
3967  const size_t jpos( ( IsLower<MT4>::value )
3968  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
3969  :( jend ) );
3970 
3971  for( size_t j=jbegin; j<jpos; ++j ) {
3972  (~C)(i,j) -= A(i,j) * B(j,j);
3973  }
3974  }
3975  }
3976  }
3977  }
3979  //**********************************************************************************************
3980 
3981  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
3995  template< typename MT3 // Type of the left-hand side target matrix
3996  , typename MT4 // Type of the left-hand side matrix operand
3997  , typename MT5 > // Type of the right-hand side matrix operand
3998  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
3999  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4000  {
4001  const size_t M( A.rows() );
4002  const size_t N( B.columns() );
4003 
4004  for( size_t j=0UL; j<N; ++j )
4005  {
4006  const size_t ibegin( ( IsLower<MT4>::value )
4007  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4008  :( 0UL ) );
4009  const size_t iend( ( IsUpper<MT4>::value )
4010  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
4011  :( M ) );
4012  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4013 
4014  const size_t inum( iend - ibegin );
4015  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4016 
4017  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4018  (~C)(i ,j) -= A(i ,j) * B(j,j);
4019  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
4020  }
4021  if( ipos < iend ) {
4022  (~C)(ipos,j) -= A(ipos,j) * B(j,j);
4023  }
4024  }
4025  }
4027  //**********************************************************************************************
4028 
4029  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
4043  template< typename MT3 // Type of the left-hand side target matrix
4044  , typename MT4 // Type of the left-hand side matrix operand
4045  , typename MT5 > // Type of the right-hand side matrix operand
4046  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4047  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4048  {
4049  const size_t M( A.rows() );
4050  const size_t N( B.columns() );
4051 
4052  for( size_t i=0UL; i<M; ++i )
4053  {
4054  const size_t jbegin( ( IsUpper<MT5>::value )
4055  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4056  :( 0UL ) );
4057  const size_t jend( ( IsLower<MT5>::value )
4058  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
4059  :( N ) );
4060  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4061 
4062  const size_t jnum( jend - jbegin );
4063  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4064 
4065  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4066  (~C)(i,j ) -= A(i,i) * B(i,j );
4067  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
4068  }
4069  if( jpos < jend ) {
4070  (~C)(i,jpos) -= A(i,i) * B(i,jpos);
4071  }
4072  }
4073  }
4075  //**********************************************************************************************
4076 
4077  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
4091  template< typename MT3 // Type of the left-hand side target matrix
4092  , typename MT4 // Type of the left-hand side matrix operand
4093  , typename MT5 > // Type of the right-hand side matrix operand
4094  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4095  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4096  {
4097  constexpr size_t block( BLOCK_SIZE );
4098 
4099  const size_t M( A.rows() );
4100  const size_t N( B.columns() );
4101 
4102  for( size_t jj=0UL; jj<N; jj+=block ) {
4103  const size_t jend( min( N, jj+block ) );
4104  for( size_t ii=0UL; ii<M; ii+=block ) {
4105  const size_t iend( min( M, ii+block ) );
4106  for( size_t j=jj; j<jend; ++j )
4107  {
4108  const size_t ibegin( ( IsLower<MT5>::value )
4109  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
4110  :( ii ) );
4111  const size_t ipos( ( IsUpper<MT5>::value )
4112  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
4113  :( iend ) );
4114 
4115  for( size_t i=ibegin; i<ipos; ++i ) {
4116  (~C)(i,j) -= A(i,i) * B(i,j);
4117  }
4118  }
4119  }
4120  }
4121  }
4123  //**********************************************************************************************
4124 
4125  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
4139  template< typename MT3 // Type of the left-hand side target matrix
4140  , typename MT4 // Type of the left-hand side matrix operand
4141  , typename MT5 > // Type of the right-hand side matrix operand
4142  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
4143  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4144  {
4145  for( size_t i=0UL; i<A.rows(); ++i ) {
4146  C(i,i) -= A(i,i) * B(i,i);
4147  }
4148  }
4150  //**********************************************************************************************
4151 
4152  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Default small-matrix subtraction-assignment kernel: forwards C, A and B unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): the listing jumps from inner line 4168 to 4170; the missing line presumably
// holds the return type / overload constraint of this function -- confirm against the header.
4166  template< typename MT3 // Type of the left-hand side target matrix
4167  , typename MT4 // Type of the left-hand side matrix operand
4168  , typename MT5 > // Type of the right-hand side matrix operand
4170  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4171  {
4172  selectDefaultSubAssignKernel( C, A, B );
4173  }
4175  //**********************************************************************************************
4176 
4177  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Vectorized small-matrix kernel for the subtraction assignment C -= A * B with a row-major
// target matrix C.
//
// C - target dense matrix (accessed through ~C); it is loaded, updated and stored in place.
// A - left-hand side operand; single elements A(i,k) are broadcast into a SIMD vector via set().
// B - right-hand side operand; SIMD vectors are read along its rows via load(k,j).
//
// The columns of C are processed in blocks of 8, 5, 4, 3, 2 and 1 SIMD vectors; the columns
// beyond jpos are handled by a scalar loop when 'remainder' is true. The IsUpper/IsLower/
// IsStrictly* queries on MT4 and MT5 are only used to shrink the k range [kbegin,kend).
// LOW and UPP are defined outside this excerpt (presumably compile-time flags describing a
// lower/upper result - TODO confirm): they disable the wide blocks and restrict the row range.
// NOTE(review): the return type line (4195) of this function is not part of this listing.
4192  template< typename MT3 // Type of the left-hand side target matrix
4193  , typename MT4 // Type of the left-hand side matrix operand
4194  , typename MT5 > // Type of the right-hand side matrix operand
4196  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4197  {
4198  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value ); // C or B unpadded: scalar tail loop needed
4199 
4200  const size_t M( A.rows() );
4201  const size_t N( B.columns() );
4202  const size_t K( A.columns() );
4203 
4204  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4205 
4206  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N ); // end of the SIMD-processed columns; equals N - N%SIMDSIZE if remainder (asserted below)
4207  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
4208 
4209  size_t j( 0UL );
4210 
// NOTE(review): the statement guarding the following brace block (line 4211) is missing from this listing.
4212  {
4213  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) { // blocks of 8 SIMD vectors, one row at a time
4214  for( size_t i=0UL; i<M; ++i )
4215  {
4216  const size_t kbegin( ( IsUpper<MT4>::value )
4217  ?( ( IsLower<MT5>::value )
4218  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4219  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4220  :( IsLower<MT5>::value ? j : 0UL ) );
4221  const size_t kend( ( IsLower<MT4>::value )
4222  ?( ( IsUpper<MT5>::value )
4223  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
4224  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4225  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
4226 
4227  SIMDType xmm1( (~C).load(i,j ) );
4228  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4229  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4230  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
4231  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
4232  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
4233  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
4234  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
4235 
4236  for( size_t k=kbegin; k<kend; ++k ) {
4237  const SIMDType a1( set( A(i,k) ) );
4238  xmm1 -= a1 * B.load(k,j );
4239  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
4240  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
4241  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
4242  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
4243  xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
4244  xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
4245  xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
4246  }
4247 
4248  (~C).store( i, j , xmm1 );
4249  (~C).store( i, j+SIMDSIZE , xmm2 );
4250  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4251  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
4252  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
4253  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
4254  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
4255  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
4256  }
4257  }
4258  }
4259 
4260  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL ) // blocks of 5 SIMD vectors, two rows at a time
4261  {
4262  size_t i( 0UL );
4263 
4264  for( ; (i+2UL) <= M; i+=2UL )
4265  {
4266  const size_t kbegin( ( IsUpper<MT4>::value )
4267  ?( ( IsLower<MT5>::value )
4268  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4269  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4270  :( IsLower<MT5>::value ? j : 0UL ) );
4271  const size_t kend( ( IsLower<MT4>::value )
4272  ?( ( IsUpper<MT5>::value )
4273  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
4274  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4275  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
4276 
4277  SIMDType xmm1 ( (~C).load(i ,j ) );
4278  SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
4279  SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
4280  SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
4281  SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
4282  SIMDType xmm6 ( (~C).load(i+1UL,j ) );
4283  SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
4284  SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
4285  SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
4286  SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
4287 
4288  for( size_t k=kbegin; k<kend; ++k ) {
4289  const SIMDType a1( set( A(i ,k) ) );
4290  const SIMDType a2( set( A(i+1UL,k) ) );
4291  const SIMDType b1( B.load(k,j ) );
4292  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4293  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4294  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
4295  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
4296  xmm1 -= a1 * b1;
4297  xmm2 -= a1 * b2;
4298  xmm3 -= a1 * b3;
4299  xmm4 -= a1 * b4;
4300  xmm5 -= a1 * b5;
4301  xmm6 -= a2 * b1;
4302  xmm7 -= a2 * b2;
4303  xmm8 -= a2 * b3;
4304  xmm9 -= a2 * b4;
4305  xmm10 -= a2 * b5;
4306  }
4307 
4308  (~C).store( i , j , xmm1 );
4309  (~C).store( i , j+SIMDSIZE , xmm2 );
4310  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
4311  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
4312  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
4313  (~C).store( i+1UL, j , xmm6 );
4314  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
4315  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
4316  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
4317  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
4318  }
4319 
4320  if( i < M ) // a single row is left over after the two-row loop
4321  {
4322  const size_t kbegin( ( IsUpper<MT4>::value )
4323  ?( ( IsLower<MT5>::value )
4324  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4325  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4326  :( IsLower<MT5>::value ? j : 0UL ) );
4327  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
4328 
4329  SIMDType xmm1( (~C).load(i,j ) );
4330  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4331  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4332  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
4333  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
4334 
4335  for( size_t k=kbegin; k<kend; ++k ) {
4336  const SIMDType a1( set( A(i,k) ) );
4337  xmm1 -= a1 * B.load(k,j );
4338  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
4339  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
4340  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
4341  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
4342  }
4343 
4344  (~C).store( i, j , xmm1 );
4345  (~C).store( i, j+SIMDSIZE , xmm2 );
4346  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4347  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
4348  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
4349  }
4350  }
4351 
4352  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL ) // blocks of 4 SIMD vectors, two rows at a time
4353  {
4354  size_t i( 0UL );
4355 
4356  for( ; (i+2UL) <= M; i+=2UL )
4357  {
4358  const size_t kbegin( ( IsUpper<MT4>::value )
4359  ?( ( IsLower<MT5>::value )
4360  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4361  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4362  :( IsLower<MT5>::value ? j : 0UL ) );
4363  const size_t kend( ( IsLower<MT4>::value )
4364  ?( ( IsUpper<MT5>::value )
4365  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
4366  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4367  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
4368 
4369  SIMDType xmm1( (~C).load(i ,j ) );
4370  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
4371  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
4372  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
4373  SIMDType xmm5( (~C).load(i+1UL,j ) );
4374  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
4375  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
4376  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
4377 
4378  for( size_t k=kbegin; k<kend; ++k ) {
4379  const SIMDType a1( set( A(i ,k) ) );
4380  const SIMDType a2( set( A(i+1UL,k) ) );
4381  const SIMDType b1( B.load(k,j ) );
4382  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4383  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4384  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
4385  xmm1 -= a1 * b1;
4386  xmm2 -= a1 * b2;
4387  xmm3 -= a1 * b3;
4388  xmm4 -= a1 * b4;
4389  xmm5 -= a2 * b1;
4390  xmm6 -= a2 * b2;
4391  xmm7 -= a2 * b3;
4392  xmm8 -= a2 * b4;
4393  }
4394 
4395  (~C).store( i , j , xmm1 );
4396  (~C).store( i , j+SIMDSIZE , xmm2 );
4397  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
4398  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
4399  (~C).store( i+1UL, j , xmm5 );
4400  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
4401  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
4402  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
4403  }
4404 
4405  if( i < M ) // a single row is left over after the two-row loop
4406  {
4407  const size_t kbegin( ( IsUpper<MT4>::value )
4408  ?( ( IsLower<MT5>::value )
4409  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4410  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4411  :( IsLower<MT5>::value ? j : 0UL ) );
4412  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
4413 
4414  SIMDType xmm1( (~C).load(i,j ) );
4415  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4416  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4417  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
4418 
4419  for( size_t k=kbegin; k<kend; ++k ) {
4420  const SIMDType a1( set( A(i,k) ) );
4421  xmm1 -= a1 * B.load(k,j );
4422  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
4423  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
4424  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
4425  }
4426 
4427  (~C).store( i, j , xmm1 );
4428  (~C).store( i, j+SIMDSIZE , xmm2 );
4429  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4430  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
4431  }
4432  }
4433 
4434  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL ) // blocks of 3 SIMD vectors, two rows at a time
4435  {
4436  size_t i( 0UL );
4437 
4438  for( ; (i+2UL) <= M; i+=2UL )
4439  {
4440  const size_t kbegin( ( IsUpper<MT4>::value )
4441  ?( ( IsLower<MT5>::value )
4442  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4443  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4444  :( IsLower<MT5>::value ? j : 0UL ) );
4445  const size_t kend( ( IsLower<MT4>::value )
4446  ?( ( IsUpper<MT5>::value )
4447  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
4448  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4449  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
4450 
4451  SIMDType xmm1( (~C).load(i ,j ) );
4452  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
4453  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
4454  SIMDType xmm4( (~C).load(i+1UL,j ) );
4455  SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
4456  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
4457 
4458  for( size_t k=kbegin; k<kend; ++k ) {
4459  const SIMDType a1( set( A(i ,k) ) );
4460  const SIMDType a2( set( A(i+1UL,k) ) );
4461  const SIMDType b1( B.load(k,j ) );
4462  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4463  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4464  xmm1 -= a1 * b1;
4465  xmm2 -= a1 * b2;
4466  xmm3 -= a1 * b3;
4467  xmm4 -= a2 * b1;
4468  xmm5 -= a2 * b2;
4469  xmm6 -= a2 * b3;
4470  }
4471 
4472  (~C).store( i , j , xmm1 );
4473  (~C).store( i , j+SIMDSIZE , xmm2 );
4474  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
4475  (~C).store( i+1UL, j , xmm4 );
4476  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
4477  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
4478  }
4479 
4480  if( i < M ) // a single row is left over after the two-row loop
4481  {
4482  const size_t kbegin( ( IsUpper<MT4>::value )
4483  ?( ( IsLower<MT5>::value )
4484  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4485  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4486  :( IsLower<MT5>::value ? j : 0UL ) );
4487  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
4488 
4489  SIMDType xmm1( (~C).load(i,j ) );
4490  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4491  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4492 
4493  for( size_t k=kbegin; k<kend; ++k ) {
4494  const SIMDType a1( set( A(i,k) ) );
4495  xmm1 -= a1 * B.load(k,j );
4496  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
4497  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
4498  }
4499 
4500  (~C).store( i, j , xmm1 );
4501  (~C).store( i, j+SIMDSIZE , xmm2 );
4502  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4503  }
4504  }
4505 
4506  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL ) // blocks of 2 SIMD vectors; skipped only if both LOW and UPP are set
4507  {
4508  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M ); // with UPP the rows stop at the end of this column block
4509  size_t i( LOW ? j : 0UL ); // with LOW the rows start at the first column of this block
4510 
4511  for( ; (i+2UL) <= iend; i+=2UL )
4512  {
4513  const size_t kbegin( ( IsUpper<MT4>::value )
4514  ?( ( IsLower<MT5>::value )
4515  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4516  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4517  :( IsLower<MT5>::value ? j : 0UL ) );
4518  const size_t kend( ( IsLower<MT4>::value )
4519  ?( ( IsUpper<MT5>::value )
4520  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
4521  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4522  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
4523 
4524  SIMDType xmm1( (~C).load(i ,j ) );
4525  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
4526  SIMDType xmm3( (~C).load(i+1UL,j ) );
4527  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
4528 
4529  for( size_t k=kbegin; k<kend; ++k ) {
4530  const SIMDType a1( set( A(i ,k) ) );
4531  const SIMDType a2( set( A(i+1UL,k) ) );
4532  const SIMDType b1( B.load(k,j ) );
4533  const SIMDType b2( B.load(k,j+SIMDSIZE) );
4534  xmm1 -= a1 * b1;
4535  xmm2 -= a1 * b2;
4536  xmm3 -= a2 * b1;
4537  xmm4 -= a2 * b2;
4538  }
4539 
4540  (~C).store( i , j , xmm1 );
4541  (~C).store( i , j+SIMDSIZE, xmm2 );
4542  (~C).store( i+1UL, j , xmm3 );
4543  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
4544  }
4545 
4546  if( i < iend ) // a single row is left over after the two-row loop
4547  {
4548  const size_t kbegin( ( IsUpper<MT4>::value )
4549  ?( ( IsLower<MT5>::value )
4550  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4551  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4552  :( IsLower<MT5>::value ? j : 0UL ) );
4553  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
4554 
4555  SIMDType xmm1( (~C).load(i,j ) );
4556  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
4557 
4558  for( size_t k=kbegin; k<kend; ++k ) {
4559  const SIMDType a1( set( A(i,k) ) );
4560  xmm1 -= a1 * B.load(k,j );
4561  xmm2 -= a1 * B.load(k,j+SIMDSIZE);
4562  }
4563 
4564  (~C).store( i, j , xmm1 );
4565  (~C).store( i, j+SIMDSIZE, xmm2 );
4566  }
4567  }
4568 
4569  for( ; j<jpos; j+=SIMDSIZE ) // remaining single SIMD vectors up to jpos
4570  {
4571  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
4572  size_t i( LOW ? j : 0UL );
4573 
4574  for( ; (i+2UL) <= iend; i+=2UL )
4575  {
4576  const size_t kbegin( ( IsUpper<MT4>::value )
4577  ?( ( IsLower<MT5>::value )
4578  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4579  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4580  :( IsLower<MT5>::value ? j : 0UL ) );
4581  const size_t kend( ( IsLower<MT4>::value )
4582  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4583  :( K ) );
4584 
4585  SIMDType xmm1( (~C).load(i ,j) );
4586  SIMDType xmm2( (~C).load(i+1UL,j) );
4587 
4588  for( size_t k=kbegin; k<kend; ++k ) {
4589  const SIMDType b1( B.load(k,j) );
4590  xmm1 -= set( A(i ,k) ) * b1;
4591  xmm2 -= set( A(i+1UL,k) ) * b1;
4592  }
4593 
4594  (~C).store( i , j, xmm1 );
4595  (~C).store( i+1UL, j, xmm2 );
4596  }
4597 
4598  if( i < iend ) // a single row is left over after the two-row loop
4599  {
4600  const size_t kbegin( ( IsUpper<MT4>::value )
4601  ?( ( IsLower<MT5>::value )
4602  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4603  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4604  :( IsLower<MT5>::value ? j : 0UL ) );
4605 
4606  SIMDType xmm1( (~C).load(i,j) );
4607 
4608  for( size_t k=kbegin; k<K; ++k ) {
4609  xmm1 -= set( A(i,k) ) * B.load(k,j);
4610  }
4611 
4612  (~C).store( i, j, xmm1 );
4613  }
4614  }
4615 
4616  for( ; remainder && j<N; ++j ) // scalar handling of the columns beyond jpos
4617  {
4618  const size_t iend( UPP ? j+1UL : M );
4619  size_t i( LOW ? j : 0UL );
4620 
4621  for( ; (i+2UL) <= iend; i+=2UL )
4622  {
4623  const size_t kbegin( ( IsUpper<MT4>::value )
4624  ?( ( IsLower<MT5>::value )
4625  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4626  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4627  :( IsLower<MT5>::value ? j : 0UL ) );
4628  const size_t kend( ( IsLower<MT4>::value )
4629  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4630  :( K ) );
4631 
4632  ElementType value1( (~C)(i ,j) );
4633  ElementType value2( (~C)(i+1UL,j) );
4634 
4635  for( size_t k=kbegin; k<kend; ++k ) {
4636  value1 -= A(i ,k) * B(k,j);
4637  value2 -= A(i+1UL,k) * B(k,j);
4638  }
4639 
4640  (~C)(i ,j) = value1;
4641  (~C)(i+1UL,j) = value2;
4642  }
4643 
4644  if( i < iend ) // a single row is left over after the two-row loop
4645  {
4646  const size_t kbegin( ( IsUpper<MT4>::value )
4647  ?( ( IsLower<MT5>::value )
4648  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4649  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4650  :( IsLower<MT5>::value ? j : 0UL ) );
4651 
4652  ElementType value( (~C)(i,j) );
4653 
4654  for( size_t k=kbegin; k<K; ++k ) {
4655  value -= A(i,k) * B(k,j);
4656  }
4657 
4658  (~C)(i,j) = value;
4659  }
4660  }
4661  }
4663  //**********************************************************************************************
4664 
4665  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Vectorized small-matrix kernel for the subtraction assignment C -= A * B with a column-major
// target matrix C.
//
// C - target dense matrix (accessed through ~C); it is loaded, updated and stored in place.
// A - left-hand side operand; SIMD vectors are read along its columns via load(i,k).
// B - right-hand side operand; single elements B(k,j) are broadcast into a SIMD vector via set().
//
// The rows of C are processed in blocks of 8, 5, 4, 3, 2 and 1 SIMD vectors; the rows beyond
// ipos are handled by a scalar loop when 'remainder' is true. The IsUpper/IsLower/IsStrictly*
// queries on MT4 and MT5 are only used to shrink the k range [kbegin,kend).
// LOW and UPP are defined outside this excerpt (presumably compile-time flags describing a
// lower/upper result - TODO confirm): they disable the wide blocks and restrict the column range.
// NOTE(review): the return type line (4683) of this function is not part of this listing.
4680  template< typename MT3 // Type of the left-hand side target matrix
4681  , typename MT4 // Type of the left-hand side matrix operand
4682  , typename MT5 > // Type of the right-hand side matrix operand
4684  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4685  {
4686  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value ); // C or A unpadded: scalar tail loop needed
4687 
4688  const size_t M( A.rows() );
4689  const size_t N( B.columns() );
4690  const size_t K( A.columns() );
4691 
4692  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4693 
4694  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M ); // end of the SIMD-processed rows; equals M - M%SIMDSIZE if remainder (asserted below)
4695  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
4696 
4697  size_t i( 0UL );
4698 
// NOTE(review): the statement guarding the following brace block (line 4699) is missing from this listing.
4700  {
4701  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) { // blocks of 8 SIMD vectors, one column at a time
4702  for( size_t j=0UL; j<N; ++j )
4703  {
4704  const size_t kbegin( ( IsLower<MT5>::value )
4705  ?( ( IsUpper<MT4>::value )
4706  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4707  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4708  :( IsUpper<MT4>::value ? i : 0UL ) );
4709  const size_t kend( ( IsUpper<MT5>::value )
4710  ?( ( IsLower<MT4>::value )
4711  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4712  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4713  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
4714 
4715  SIMDType xmm1( (~C).load(i ,j) );
4716  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4717  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4718  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
4719  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
4720  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
4721  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
4722  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
4723 
4724  for( size_t k=kbegin; k<kend; ++k ) {
4725  const SIMDType b1( set( B(k,j) ) );
4726  xmm1 -= A.load(i ,k) * b1;
4727  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
4728  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
4729  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
4730  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
4731  xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
4732  xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
4733  xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
4734  }
4735 
4736  (~C).store( i , j, xmm1 );
4737  (~C).store( i+SIMDSIZE , j, xmm2 );
4738  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4739  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
4740  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
4741  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
4742  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
4743  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
4744  }
4745  }
4746  }
4747 
4748  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL ) // blocks of 5 SIMD vectors, two columns at a time
4749  {
4750  size_t j( 0UL );
4751 
4752  for( ; (j+2UL) <= N; j+=2UL )
4753  {
4754  const size_t kbegin( ( IsLower<MT5>::value )
4755  ?( ( IsUpper<MT4>::value )
4756  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4757  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4758  :( IsUpper<MT4>::value ? i : 0UL ) );
4759  const size_t kend( ( IsUpper<MT5>::value )
4760  ?( ( IsLower<MT4>::value )
4761  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4762  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4763  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
4764 
4765  SIMDType xmm1 ( (~C).load(i ,j ) );
4766  SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
4767  SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
4768  SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
4769  SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
4770  SIMDType xmm6 ( (~C).load(i ,j+1UL) );
4771  SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
4772  SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4773  SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
4774  SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
4775 
4776  for( size_t k=kbegin; k<kend; ++k ) {
4777  const SIMDType a1( A.load(i ,k) );
4778  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4779  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4780  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4781  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
4782  const SIMDType b1( set( B(k,j ) ) );
4783  const SIMDType b2( set( B(k,j+1UL) ) );
4784  xmm1 -= a1 * b1;
4785  xmm2 -= a2 * b1;
4786  xmm3 -= a3 * b1;
4787  xmm4 -= a4 * b1;
4788  xmm5 -= a5 * b1;
4789  xmm6 -= a1 * b2;
4790  xmm7 -= a2 * b2;
4791  xmm8 -= a3 * b2;
4792  xmm9 -= a4 * b2;
4793  xmm10 -= a5 * b2;
4794  }
4795 
4796  (~C).store( i , j , xmm1 );
4797  (~C).store( i+SIMDSIZE , j , xmm2 );
4798  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4799  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
4800  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
4801  (~C).store( i , j+1UL, xmm6 );
4802  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
4803  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
4804  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
4805  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
4806  }
4807 
4808  if( j < N ) // a single column is left over after the two-column loop
4809  {
4810  const size_t kbegin( ( IsLower<MT5>::value )
4811  ?( ( IsUpper<MT4>::value )
4812  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4813  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4814  :( IsUpper<MT4>::value ? i : 0UL ) );
4815  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
4816 
4817  SIMDType xmm1( (~C).load(i ,j) );
4818  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4819  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4820  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
4821  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
4822 
4823  for( size_t k=kbegin; k<kend; ++k ) {
4824  const SIMDType b1( set( B(k,j) ) );
4825  xmm1 -= A.load(i ,k) * b1;
4826  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
4827  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
4828  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
4829  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
4830  }
4831 
4832  (~C).store( i , j, xmm1 );
4833  (~C).store( i+SIMDSIZE , j, xmm2 );
4834  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4835  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
4836  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
4837  }
4838  }
4839 
4840  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) // blocks of 4 SIMD vectors, two columns at a time
4841  {
4842  size_t j( 0UL );
4843 
4844  for( ; (j+2UL) <= N; j+=2UL )
4845  {
4846  const size_t kbegin( ( IsLower<MT5>::value )
4847  ?( ( IsUpper<MT4>::value )
4848  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4849  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4850  :( IsUpper<MT4>::value ? i : 0UL ) );
4851  const size_t kend( ( IsUpper<MT5>::value )
4852  ?( ( IsLower<MT4>::value )
4853  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4854  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4855  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
4856 
4857  SIMDType xmm1( (~C).load(i ,j ) );
4858  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
4859  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
4860  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
4861  SIMDType xmm5( (~C).load(i ,j+1UL) );
4862  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
4863  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4864  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
4865 
4866  for( size_t k=kbegin; k<kend; ++k ) {
4867  const SIMDType a1( A.load(i ,k) );
4868  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4869  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4870  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4871  const SIMDType b1( set( B(k,j ) ) );
4872  const SIMDType b2( set( B(k,j+1UL) ) );
4873  xmm1 -= a1 * b1;
4874  xmm2 -= a2 * b1;
4875  xmm3 -= a3 * b1;
4876  xmm4 -= a4 * b1;
4877  xmm5 -= a1 * b2;
4878  xmm6 -= a2 * b2;
4879  xmm7 -= a3 * b2;
4880  xmm8 -= a4 * b2;
4881  }
4882 
4883  (~C).store( i , j , xmm1 );
4884  (~C).store( i+SIMDSIZE , j , xmm2 );
4885  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4886  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
4887  (~C).store( i , j+1UL, xmm5 );
4888  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
4889  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
4890  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
4891  }
4892 
4893  if( j < N ) // a single column is left over after the two-column loop
4894  {
4895  const size_t kbegin( ( IsLower<MT5>::value )
4896  ?( ( IsUpper<MT4>::value )
4897  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4898  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4899  :( IsUpper<MT4>::value ? i : 0UL ) );
4900  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
4901 
4902  SIMDType xmm1( (~C).load(i ,j) );
4903  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4904  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4905  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
4906 
4907  for( size_t k=kbegin; k<kend; ++k ) {
4908  const SIMDType b1( set( B(k,j) ) );
4909  xmm1 -= A.load(i ,k) * b1;
4910  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
4911  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
4912  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
4913  }
4914 
4915  (~C).store( i , j, xmm1 );
4916  (~C).store( i+SIMDSIZE , j, xmm2 );
4917  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4918  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
4919  }
4920  }
4921 
4922  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL ) // blocks of 3 SIMD vectors, two columns at a time
4923  {
4924  size_t j( 0UL );
4925 
4926  for( ; (j+2UL) <= N; j+=2UL )
4927  {
4928  const size_t kbegin( ( IsLower<MT5>::value )
4929  ?( ( IsUpper<MT4>::value )
4930  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4931  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4932  :( IsUpper<MT4>::value ? i : 0UL ) );
4933  const size_t kend( ( IsUpper<MT5>::value )
4934  ?( ( IsLower<MT4>::value )
4935  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4936  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4937  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
4938 
4939  SIMDType xmm1( (~C).load(i ,j ) );
4940  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
4941  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
4942  SIMDType xmm4( (~C).load(i ,j+1UL) );
4943  SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
4944  SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4945 
4946  for( size_t k=kbegin; k<kend; ++k ) {
4947  const SIMDType a1( A.load(i ,k) );
4948  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4949  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4950  const SIMDType b1( set( B(k,j ) ) );
4951  const SIMDType b2( set( B(k,j+1UL) ) );
4952  xmm1 -= a1 * b1;
4953  xmm2 -= a2 * b1;
4954  xmm3 -= a3 * b1;
4955  xmm4 -= a1 * b2;
4956  xmm5 -= a2 * b2;
4957  xmm6 -= a3 * b2;
4958  }
4959 
4960  (~C).store( i , j , xmm1 );
4961  (~C).store( i+SIMDSIZE , j , xmm2 );
4962  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4963  (~C).store( i , j+1UL, xmm4 );
4964  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
4965  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
4966  }
4967 
4968  if( j < N ) // a single column is left over after the two-column loop
4969  {
4970  const size_t kbegin( ( IsLower<MT5>::value )
4971  ?( ( IsUpper<MT4>::value )
4972  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4973  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4974  :( IsUpper<MT4>::value ? i : 0UL ) );
4975  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
4976 
4977  SIMDType xmm1( (~C).load(i ,j) );
4978  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4979  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4980 
4981  for( size_t k=kbegin; k<kend; ++k ) {
4982  const SIMDType b1( set( B(k,j) ) );
4983  xmm1 -= A.load(i ,k) * b1;
4984  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
4985  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
4986  }
4987 
4988  (~C).store( i , j, xmm1 );
4989  (~C).store( i+SIMDSIZE , j, xmm2 );
4990  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4991  }
4992  }
4993 
4994  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) // blocks of 2 SIMD vectors; skipped only if both LOW and UPP are set
4995  {
4996  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N ); // with LOW the columns stop at the end of this row block
4997  size_t j( UPP ? i : 0UL ); // with UPP the columns start at the first row of this block
4998 
4999  for( ; (j+2UL) <= jend; j+=2UL )
5000  {
5001  const size_t kbegin( ( IsLower<MT5>::value )
5002  ?( ( IsUpper<MT4>::value )
5003  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5004  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5005  :( IsUpper<MT4>::value ? i : 0UL ) );
5006  const size_t kend( ( IsUpper<MT5>::value )
5007  ?( ( IsLower<MT4>::value )
5008  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5009  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5010  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
5011 
5012  SIMDType xmm1( (~C).load(i ,j ) );
5013  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
5014  SIMDType xmm3( (~C).load(i ,j+1UL) );
5015  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
5016 
5017  for( size_t k=kbegin; k<kend; ++k ) {
5018  const SIMDType a1( A.load(i ,k) );
5019  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5020  const SIMDType b1( set( B(k,j ) ) );
5021  const SIMDType b2( set( B(k,j+1UL) ) );
5022  xmm1 -= a1 * b1;
5023  xmm2 -= a2 * b1;
5024  xmm3 -= a1 * b2;
5025  xmm4 -= a2 * b2;
5026  }
5027 
5028  (~C).store( i , j , xmm1 );
5029  (~C).store( i+SIMDSIZE, j , xmm2 );
5030  (~C).store( i , j+1UL, xmm3 );
5031  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
5032  }
5033 
5034  if( j < jend ) // a single column is left over after the two-column loop
5035  {
5036  const size_t kbegin( ( IsLower<MT5>::value )
5037  ?( ( IsUpper<MT4>::value )
5038  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5039  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5040  :( IsUpper<MT4>::value ? i : 0UL ) );
5041  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
5042 
5043  SIMDType xmm1( (~C).load(i ,j) );
5044  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
5045 
5046  for( size_t k=kbegin; k<kend; ++k ) {
5047  const SIMDType b1( set( B(k,j) ) );
5048  xmm1 -= A.load(i ,k) * b1;
5049  xmm2 -= A.load(i+SIMDSIZE,k) * b1;
5050  }
5051 
5052  (~C).store( i , j, xmm1 );
5053  (~C).store( i+SIMDSIZE, j, xmm2 );
5054  }
5055  }
5056 
5057  for( ; i<ipos; i+=SIMDSIZE ) // remaining single SIMD vectors up to ipos
5058  {
5059  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
5060  size_t j( UPP ? i : 0UL );
5061 
5062  for( ; (j+2UL) <= jend; j+=2UL )
5063  {
5064  const size_t kbegin( ( IsLower<MT5>::value )
5065  ?( ( IsUpper<MT4>::value )
5066  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5067  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5068  :( IsUpper<MT4>::value ? i : 0UL ) );
5069  const size_t kend( ( IsUpper<MT5>::value )
5070  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
5071  :( K ) );
5072 
5073  SIMDType xmm1( (~C).load(i,j ) );
5074  SIMDType xmm2( (~C).load(i,j+1UL) );
5075 
5076  for( size_t k=kbegin; k<kend; ++k ) {
5077  const SIMDType a1( A.load(i,k) );
5078  xmm1 -= a1 * set( B(k,j ) );
5079  xmm2 -= a1 * set( B(k,j+1UL) );
5080  }
5081 
5082  (~C).store( i, j , xmm1 );
5083  (~C).store( i, j+1UL, xmm2 );
5084  }
5085 
5086  if( j < jend ) // a single column is left over after the two-column loop
5087  {
5088  const size_t kbegin( ( IsLower<MT5>::value )
5089  ?( ( IsUpper<MT4>::value )
5090  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5091  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5092  :( IsUpper<MT4>::value ? i : 0UL ) );
5093 
5094  SIMDType xmm1( (~C).load(i,j) );
5095 
5096  for( size_t k=kbegin; k<K; ++k ) {
5097  xmm1 -= A.load(i,k) * set( B(k,j) );
5098  }
5099 
5100  (~C).store( i, j, xmm1 );
5101  }
5102  }
5103 
5104  for( ; remainder && i<M; ++i ) // scalar handling of the rows beyond ipos
5105  {
5106  const size_t jend( LOW ? i+1UL : N );
5107  size_t j( UPP ? i : 0UL );
5108 
5109  for( ; (j+2UL) <= jend; j+=2UL )
5110  {
5111  const size_t kbegin( ( IsLower<MT5>::value )
5112  ?( ( IsUpper<MT4>::value )
5113  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5114  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5115  :( IsUpper<MT4>::value ? i : 0UL ) );
5116  const size_t kend( ( IsUpper<MT5>::value )
5117  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
5118  :( K ) );
5119 
5120  ElementType value1( (~C)(i,j ) );
5121  ElementType value2( (~C)(i,j+1UL) );
5122 
5123  for( size_t k=kbegin; k<kend; ++k ) {
5124  value1 -= A(i,k) * B(k,j );
5125  value2 -= A(i,k) * B(k,j+1UL);
5126  }
5127 
5128  (~C)(i,j ) = value1;
5129  (~C)(i,j+1UL) = value2;
5130  }
5131 
5132  if( j < jend ) // a single column is left over after the two-column loop
5133  {
5134  const size_t kbegin( ( IsLower<MT5>::value )
5135  ?( ( IsUpper<MT4>::value )
5136  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5137  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5138  :( IsUpper<MT4>::value ? i : 0UL ) );
5139 
5140  ElementType value( (~C)(i,j) );
5141 
5142  for( size_t k=kbegin; k<K; ++k ) {
5143  value -= A(i,k) * B(k,j);
5144  }
5145 
5146  (~C)(i,j) = value;
5147  }
5148  }
5149  }
5151  //**********************************************************************************************
5152 
5153  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Fallback large-matrix kernel for the subtraction assignment: it does no work of its own
// and forwards C, A and B unchanged to selectDefaultSubAssignKernel().
// NOTE(review): the return-type/SFINAE line (listing line 5170) is missing from this listing,
// so the condition under which this overload is selected cannot be seen here.
5167  template< typename MT3 // Type of the left-hand side target matrix
5168  , typename MT4 // Type of the left-hand side matrix operand
5169  , typename MT5 > // Type of the right-hand side matrix operand
5171  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5172  {
5173  selectDefaultSubAssignKernel( C, A, B );
5174  }
5176  //**********************************************************************************************
5177 
5178  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix kernel for the subtraction assignment that dispatches on the structure flags
// of the expression: lmmm() when LOW is set, ummm() when only UPP is set, mmm() otherwise.
// All three receive the factors ElementType(-1) and ElementType(1).
// NOTE(review): presumably these act as alpha = -1 and beta = 1 (C = C - A*B); confirm
// against the kernels in <blaze/math/dense/MMM.h>. The return-type/SFINAE line (listing
// line 5196) is missing from this listing.
5193  template< typename MT3 // Type of the left-hand side target matrix
5194  , typename MT4 // Type of the left-hand side matrix operand
5195  , typename MT5 > // Type of the right-hand side matrix operand
5197  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5198  {
// LOW is tested first, so an expression flagged both LOW and UPP takes the lmmm() path.
5199  if( LOW )
5200  lmmm( C, A, B, ElementType(-1), ElementType(1) );
5201  else if( UPP )
5202  ummm( C, A, B, ElementType(-1), ElementType(1) );
5203  else
5204  mmm( C, A, B, ElementType(-1), ElementType(1) );
5205  }
5207  //**********************************************************************************************
5208 
5209  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback for the BLAS-based subtraction assignment: forwards C, A and B unchanged to
// selectLargeSubAssignKernel().
// NOTE(review): the return-type/SFINAE line (listing line 5226) is missing from this listing,
// so the selection condition (presumably "BLAS kernel not applicable") cannot be confirmed here.
5223  template< typename MT3 // Type of the left-hand side target matrix
5224  , typename MT4 // Type of the left-hand side matrix operand
5225  , typename MT5 > // Type of the right-hand side matrix operand
5227  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5228  {
5229  selectLargeSubAssignKernel( C, A, B );
5230  }
5232  //**********************************************************************************************
5233 
5234  //**BLAS-based subtraction assignment to dense matrices*****************************************
5235 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
5236 
// BLAS-based kernel for the subtraction assignment (only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled, see the surrounding #if).
// Three cases are distinguished:
//  - A is triangular: B is copied into a temporary, trmm() is applied with A from the left,
//    and the temporary is subtracted from C via subAssign().
//  - B is triangular: A is copied into a temporary, trmm() is applied with B from the right,
//    and the temporary is subtracted from C via subAssign().
//  - otherwise: a single gemm() call with the factors ET(-1) and ET(1) operates directly on C.
// NOTE(review): the return-type/SFINAE line (listing line 5252) is missing from this listing.
5249  template< typename MT3 // Type of the left-hand side target matrix
5250  , typename MT4 // Type of the left-hand side matrix operand
5251  , typename MT5 > // Type of the right-hand side matrix operand
5253  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5254  {
5255  typedef ElementType_<MT3> ET;
5256 
5257  if( IsTriangular<MT4>::value ) {
// trmm() works on 'tmp', hence the serial copy of B before the product is formed.
5258  ResultType_<MT3> tmp( serial( B ) );
5259  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
5260  subAssign( C, tmp );
5261  }
5262  else if( IsTriangular<MT5>::value ) {
// Mirror case: the temporary starts as A and the triangular B is applied from the right.
5263  ResultType_<MT3> tmp( serial( A ) );
5264  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
5265  subAssign( C, tmp );
5266  }
5267  else {
5268  gemm( C, A, B, ET(-1), ET(1) );
5269  }
5270  }
5272 #endif
5273  //**********************************************************************************************
5274 
5275  //**Subtraction assignment to sparse matrices***************************************************
5276  // No special implementation for the subtraction assignment to sparse matrices.
5277  //**********************************************************************************************
5278 
5279  //**Multiplication assignment to dense matrices*************************************************
5280  // No special implementation for the multiplication assignment to dense matrices.
5281  //**********************************************************************************************
5282 
5283  //**Multiplication assignment to sparse matrices************************************************
5284  // No special implementation for the multiplication assignment to sparse matrices.
5285  //**********************************************************************************************
5286 
5287  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the matrix product 'rhs' to the dense matrix 'lhs'.
// Empty targets return immediately; if the inner dimension (rhs.lhs_.columns()) is zero the
// target is reset() instead. Otherwise both operands are bound to LT/RT objects and the
// product A * B is handed to the nested smpAssign() call.
// NOTE(review): the return-type line (listing 5305), the statement at listing line 5309 and
// the LT/RT type definitions are not part of this listing.
5303  template< typename MT // Type of the target dense matrix
5304  , bool SO > // Storage order of the target dense matrix
5306  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5307  {
5309 
5310  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5311  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5312 
// Nothing to compute for an empty target; an empty inner dimension only requires a reset.
5313  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
5314  return;
5315  }
5316  else if( rhs.lhs_.columns() == 0UL ) {
5317  reset( ~lhs );
5318  return;
5319  }
5320 
5321  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5322  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5323 
5324  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5325  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5326  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5327  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5328  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5329  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5330 
5331  smpAssign( ~lhs, A * B );
5332  }
5334  //**********************************************************************************************
5335 
5336  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the matrix product 'rhs' to the sparse matrix 'lhs'.
// The expression is first materialized into a temporary of type TmpType; the temporary is then
// passed through a ForwardFunctor instance and assigned to the target via smpAssign().
// NOTE(review): the return-type line, the TmpType definition and the statements at listing
// lines 5354-5367 are not part of this listing.
5352  template< typename MT // Type of the target sparse matrix
5353  , bool SO > // Storage order of the target sparse matrix
5355  smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
5356  {
5358 
5360 
5367 
5368  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5369  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5370 
5371  const ForwardFunctor fwd;
5372 
5373  const TmpType tmp( rhs );
5374  smpAssign( ~lhs, fwd( tmp ) );
5375  }
5377  //**********************************************************************************************
5378 
5379  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the matrix product 'rhs' to a dense matrix 'lhs'.
// Returns without touching the target if the target is empty or the inner dimension
// (rhs.lhs_.columns()) is zero; otherwise binds both operands to LT/RT objects and forwards
// A * B to smpAddAssign().
// NOTE(review): the function signature (listing lines 5397-5398) and the statement at listing
// line 5401 are missing from this listing; the parameter names 'lhs'/'rhs' are taken from the
// body below.
5395  template< typename MT // Type of the target dense matrix
5396  , bool SO > // Storage order of the target dense matrix
5399  {
5401 
5402  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5403  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5404 
// In all three degenerate cases there is nothing to add, so the target is left as it is.
5405  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5406  return;
5407  }
5408 
5409  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5410  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5411 
5412  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5413  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5414  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5415  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5416  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5417  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5418 
5419  smpAddAssign( ~lhs, A * B );
5420  }
5422  //**********************************************************************************************
5423 
5424  //**SMP addition assignment to sparse matrices**************************************************
5425  // No special implementation for the SMP addition assignment to sparse matrices.
5426  //**********************************************************************************************
5427 
5428  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the matrix product 'rhs' to a dense matrix 'lhs'.
// Returns without touching the target if the target is empty or the inner dimension
// (rhs.lhs_.columns()) is zero; otherwise binds both operands to LT/RT objects and forwards
// A * B to smpSubAssign().
// NOTE(review): the function signature (listing lines 5446-5447) and the statement at listing
// line 5450 are missing from this listing; the parameter names 'lhs'/'rhs' are taken from the
// body below.
5444  template< typename MT // Type of the target dense matrix
5445  , bool SO > // Storage order of the target dense matrix
5448  {
5450 
5451  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5452  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5453 
// In all three degenerate cases there is nothing to subtract, so the target is left as it is.
5454  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
5455  return;
5456  }
5457 
5458  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
5459  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
5460 
5461  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
5462  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
5463  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
5464  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
5465  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5466  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
5467 
5468  smpSubAssign( ~lhs, A * B );
5469  }
5471  //**********************************************************************************************
5472 
5473  //**SMP subtraction assignment to sparse matrices***********************************************
5474  // No special implementation for the SMP subtraction assignment to sparse matrices.
5475  //**********************************************************************************************
5476 
5477  //**SMP multiplication assignment to dense matrices*********************************************
5478  // No special implementation for the SMP multiplication assignment to dense matrices.
5479  //**********************************************************************************************
5480 
5481  //**SMP multiplication assignment to sparse matrices********************************************
5482  // No special implementation for the SMP multiplication assignment to sparse matrices.
5483  //**********************************************************************************************
5484 
5485  //**Compile time checks*************************************************************************
5493  //**********************************************************************************************
5494 };
5495 //*************************************************************************************************
5496 
5497 
5498 
5499 
5500 //=================================================================================================
5501 //
5502 // DMATSCALARMULTEXPR SPECIALIZATION
5503 //
5504 //=================================================================================================
5505 
5506 //*************************************************************************************************
5514 template< typename MT1 // Type of the left-hand side dense matrix
5515  , typename MT2 // Type of the right-hand side dense matrix
5516  , bool SF // Symmetry flag
5517  , bool HF // Hermitian flag
5518  , bool LF // Lower flag
5519  , bool UF // Upper flag
5520  , typename ST > // Type of the right-hand side scalar value
5521 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
5522  : public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true >
5523  , private MatScalarMultExpr
5524  , private Computation
5525 {
5526  private:
5527  //**Type definitions****************************************************************************
5530 
// Shorthand types derived from the wrapped multiplication expression and its two operands.
// NOTE(review): the typedef of MMM itself (listing line 5529) is missing from this listing;
// presumably it names TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> - confirm.
5531  typedef ResultType_<MMM> RES;
5532  typedef ResultType_<MT1> RT1;
5533  typedef ResultType_<MT2> RT2;
5534  typedef ElementType_<RT1> ET1;
5535  typedef ElementType_<RT2> ET2;
5536  typedef CompositeType_<MT1> CT1;
5537  typedef CompositeType_<MT2> CT2;
5538  //**********************************************************************************************
5539 
5540  //**********************************************************************************************
// Compile-time switch: true if the left operand type MT1 is a computation or requires evaluation.
5542  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
5543  //**********************************************************************************************
5544 
5545  //**********************************************************************************************
// Compile-time switch: true if the right operand type MT2 is a computation or requires evaluation.
5547  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
5548  //**********************************************************************************************
5549 
5550  //**********************************************************************************************
// Effective structure flags derived from the template flags SF/HF/LF/UF:
//  - SYM  : SF is set and none of HF, LF, UF is set.
//  - HERM : HF is set and neither LF nor UF is set.
//  - LOW  : LF is set, or UF is combined with SF or HF.
//  - UPP  : UF is set, or LF is combined with SF or HF.
// As a consequence, combining a symmetry/Hermitian flag with a single triangular flag sets
// both LOW and UPP.
5552  enum : bool {
5553  SYM = ( SF && !( HF || LF || UF ) ),
5554  HERM = ( HF && !( LF || UF ) ),
5555  LOW = ( LF || ( ( SF || HF ) && UF ) ),
5556  UPP = ( UF || ( ( SF || HF ) && LF ) )
5557  };
5558  //**********************************************************************************************
5559 
5560  //**********************************************************************************************
5562 
// Helper trait whose 'value' is true if at least one of the two operands has to be evaluated
// (evaluateLeft || evaluateRight). The template parameters T1..T3 are not used in the
// computation of 'value'.
5565  template< typename T1, typename T2, typename T3 >
5566  struct IsEvaluationRequired {
5567  enum : bool { value = ( evaluateLeft || evaluateRight ) };
5568  };
5569  //**********************************************************************************************
5570 
5571  //**********************************************************************************************
5573 
// Helper trait deciding whether the BLAS kernel may be used for the types T1..T4.
// The visible conditions require that none of the structure flags SYM/HERM/LOW/UPP is set,
// that T1, T2 and T3 are SIMD-enabled, and that T1 and T3 share the same element type.
// NOTE(review): several lines of this definition (including the 'enum : bool { value = ...'
// opener and further conditions) are missing from this listing; do not treat the visible
// conditions as the complete predicate.
5575  template< typename T1, typename T2, typename T3, typename T4 >
5576  struct UseBlasKernel {
5578  !SYM && !HERM && !LOW && !UPP &&
5583  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
5588  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
5590  };
5591  //**********************************************************************************************
5592 
5593  //**********************************************************************************************
5595 
// Helper trait deciding whether the vectorized default kernel may be used for the types T1..T4.
// The visible conditions require useOptimizedKernels and that T1, T2 and T3 are SIMD-enabled.
// NOTE(review): several lines of this definition are missing from this listing (the remaining
// conditions and the closing of the enum); the visible conditions are not the full predicate.
5597  template< typename T1, typename T2, typename T3, typename T4 >
5598  struct UseVectorizedDefaultKernel {
5599  enum : bool { value = useOptimizedKernels &&
5603  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
5607  , T4 >::value &&
5610  };
5611  //**********************************************************************************************
5612 
5613  //**********************************************************************************************
5615 
// Selects the functor type matching the structure flags, evaluated in this priority order:
//   HERM          -> DeclHerm
//   SYM           -> DeclSym
//   LOW and UPP   -> DeclDiag
//   LOW only      -> DeclLow
//   UPP only      -> DeclUpp
//   none          -> Noop
5617  typedef IfTrue_< HERM
5618  , DeclHerm
5619  , IfTrue_< SYM
5620  , DeclSym
5621  , IfTrue_< LOW
5622  , IfTrue_< UPP
5623  , DeclDiag
5624  , DeclLow >
5625  , IfTrue_< UPP
5626  , DeclUpp
5627  , Noop > > > > ForwardFunctor;
5628  //**********************************************************************************************
5629 
5630  public:
5631  //**Type definitions****************************************************************************
// Public type aliases of the scaled expression.
// NOTE(review): further typedefs referenced in this class (e.g. ElementType, LeftOperand, LT,
// RT) sit on listing lines that are missing from this listing.
// ResultType: result of multiplying the product's result type RES with the scalar type ST.
5633  typedef MultTrait_<RES,ST> ResultType;
// ReturnType: type of a single element access.
5638  typedef const ElementType ReturnType;
// CompositeType: the expression is represented by its (const) result type when used as operand.
5639  typedef const ResultType CompositeType;
5640 
5643 
// RightOperand: the scalar is stored by value as type ST.
5645  typedef ST RightOperand;
5646 
5649 
5652  //**********************************************************************************************
5653 
5654  //**Compilation flags***************************************************************************
// simdEnabled: the visible conditions exclude the diagonal/diagonal case and require both
// operand types to be SIMD-enabled.
// NOTE(review): the remaining conditions and the closing of this enum (listing lines
// 5658-5660) are missing from this listing.
5656  enum : bool { simdEnabled = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
5657  MT1::simdEnabled && MT2::simdEnabled &&
5661 
// smpAssignable: true only if neither operand needs evaluation and both are SMP-assignable.
5663  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
5664  !evaluateRight && MT2::smpAssignable };
5665  //**********************************************************************************************
5666 
5667  //**SIMD properties*****************************************************************************
// SIMDSIZE: taken from SIMDTrait<ElementType>::size (presumably the number of elements
// packed within a single SIMD vector - confirm against the SIMDTrait documentation).
5669  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
5670  //**********************************************************************************************
5671 
5672  //**Constructor*********************************************************************************
// Constructs the scaled expression from the wrapped matrix multiplication expression 'matrix'
// and the scaling factor 'scalar'; both are stored in the members matrix_ and scalar_.
5678  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
5679  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
5680  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
5681  {}
5682  //**********************************************************************************************
5683 
5684  //**Access operator*****************************************************************************
5691  inline ResultType operator()( size_t i, size_t j ) const {
5692  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
5693  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
5694  return matrix_(i,j) * scalar_;
5695  }
5696  //**********************************************************************************************
5697 
5698  //**At function*********************************************************************************
5706  inline ReturnType at( size_t i, size_t j ) const {
5707  if( i >= matrix_.rows() ) {
5708  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
5709  }
5710  if( j >= matrix_.columns() ) {
5711  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
5712  }
5713  return (*this)(i,j);
5714  }
5715  //**********************************************************************************************
5716 
5717  //**Rows function*******************************************************************************
// Returns the number of rows of the wrapped multiplication expression (scaling does not
// change the dimensions).
5722  inline size_t rows() const {
5723  return matrix_.rows();
5724  }
5725  //**********************************************************************************************
5726 
5727  //**Columns function****************************************************************************
// Returns the number of columns of the wrapped multiplication expression (scaling does not
// change the dimensions).
5732  inline size_t columns() const {
5733  return matrix_.columns();
5734  }
5735  //**********************************************************************************************
5736 
5737  //**Left operand access*************************************************************************
// Returns the left-hand side operand of the scaling, i.e. the wrapped matrix multiplication
// expression stored in matrix_.
5742  inline LeftOperand leftOperand() const {
5743  return matrix_;
5744  }
5745  //**********************************************************************************************
5746 
5747  //**Right operand access************************************************************************
// Returns the right-hand side operand of the scaling, i.e. the scalar stored in scalar_.
5752  inline RightOperand rightOperand() const {
5753  return scalar_;
5754  }
5755  //**********************************************************************************************
5756 
5757  //**********************************************************************************************
// Reports whether the expression can alias with the given address. The query is delegated
// entirely to the wrapped multiplication expression; the scalar takes no part in it.
5763  template< typename T >
5764  inline bool canAlias( const T* alias ) const {
5765  return matrix_.canAlias( alias );
5766  }
5767  //**********************************************************************************************
5768 
5769  //**********************************************************************************************
// Reports whether the expression is aliased with the given address. The query is delegated
// entirely to the wrapped multiplication expression; the scalar takes no part in it.
5775  template< typename T >
5776  inline bool isAliased( const T* alias ) const {
5777  return matrix_.isAliased( alias );
5778  }
5779  //**********************************************************************************************
5780 
5781  //**********************************************************************************************
// Reports the alignment state of the wrapped multiplication expression (delegates to
// matrix_.isAligned()).
5786  inline bool isAligned() const {
5787  return matrix_.isAligned();
5788  }
5789  //**********************************************************************************************
5790 
5791  //**********************************************************************************************
5796  inline bool canSMPAssign() const noexcept {
5797  return ( !BLAZE_BLAS_IS_PARALLEL ||
5798  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
5799  ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD );
5800  }
5801  //**********************************************************************************************
5802 
5803  private:
5804  //**Member variables****************************************************************************
// Wrapped matrix multiplication expression (initialized from the constructor argument 'matrix').
5805  LeftOperand matrix_;
// Scaling factor applied to the product (initialized from the constructor argument 'scalar').
5806  RightOperand scalar_;
5807  //**********************************************************************************************
5808 
5809  //**Assignment to dense matrices****************************************************************
// Assignment of the scaled matrix product 'rhs' to the dense matrix 'lhs'.
// The two operands of the wrapped multiplication are extracted, degenerate sizes are handled
// up front (empty target: return; empty inner dimension: reset the target), both operands are
// evaluated serially into LT/RT objects, and the work is passed to selectAssignKernel()
// together with the scalar.
// NOTE(review): the statement at listing line 5825 and the LT/RT type definitions are not part
// of this listing.
5821  template< typename MT // Type of the target dense matrix
5822  , bool SO > // Storage order of the target dense matrix
5823  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
5824  {
5826 
5827  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
5828  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
5829 
5830  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
5831  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
5832 
// Nothing to compute for an empty target; an empty inner dimension only requires a reset.
5833  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
5834  return;
5835  }
5836  else if( left.columns() == 0UL ) {
5837  reset( ~lhs );
5838  return;
5839  }
5840 
5841  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
5842  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
5843 
5844  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
5845  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
5846  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
5847  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
5848  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
5849  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
5850 
5851  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
5852  }
5853  //**********************************************************************************************
5854 
5855  //**Assignment to dense matrices (kernel selection)*********************************************
// Chooses between the small-matrix kernel and the BLAS kernel path for C = (A*B)*scalar.
// Among the visible conditions, the small kernel is taken when (outside of debug mode) a
// row-major target has at most SIMDSIZE*10 columns in B or a column-major target has at most
// SIMDSIZE*10 rows in A, or when the number of target elements is below
// TDMATDMATMULT_THRESHOLD. Otherwise selectBlasAssignKernel() is called.
// NOTE(review): the opening 'if(' line with its first condition (listing line 5872) is missing
// from this listing.
5866  template< typename MT3 // Type of the left-hand side target matrix
5867  , typename MT4 // Type of the left-hand side matrix operand
5868  , typename MT5 // Type of the right-hand side matrix operand
5869  , typename ST2 > // Type of the scalar value
5870  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5871  {
5873  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
5874  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
5875  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
5876  selectSmallAssignKernel( C, A, B, scalar );
5877  else
5878  selectBlasAssignKernel( C, A, B, scalar );
5879  }
5880  //**********************************************************************************************
5881 
5882  //**Default assignment to row-major dense matrices (general/general)****************************
// Default kernel for the assignment of a scaled matrix product to a row-major dense matrix
// (general/general case, see the section header above).
// For every row i of the target the kernel proceeds in three steps:
//   1. initialize the row with the first product term A(i,kbegin) * B(kbegin,j),
//   2. accumulate the remaining terms A(i,k) * B(k,j) for k in (kbegin, kend),
//   3. multiply the computed part of the row by 'scalar'.
// The k and j ranges are narrowed according to the triangular structure of A (MT4) and B (MT5)
// and to the structure flags SYM/HERM/LOW/UPP; elements outside the computed range are reset().
// For SYM/HERM only one triangle is computed and the other one is mirrored at the end.
// NOTE(review): the return-type/SFINAE line (listing 5900) and the inner condition lines at
// listing 5928, 5933, 5962, 5967, 5985 and 5988 are missing from this listing.
5896  template< typename MT3 // Type of the left-hand side target matrix
5897  , typename MT4 // Type of the left-hand side matrix operand
5898  , typename MT5 // Type of the right-hand side matrix operand
5899  , typename ST2 > // Type of the scalar value
5901  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5902  {
5903  const size_t M( A.rows() );
5904  const size_t N( B.columns() );
5905  const size_t K( A.columns() );
5906 
5907  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5908 
5909  for( size_t i=0UL; i<M; ++i )
5910  {
// Range of k restricted by the triangular structure of A in row i.
5911  const size_t kbegin( ( IsUpper<MT4>::value )
5912  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
5913  :( 0UL ) );
5914  const size_t kend( ( IsLower<MT4>::value )
5915  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
5916  :( K ) );
5917  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5918 
// Empty k range for a strictly triangular A: the whole target row is reset.
5919  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
5920  for( size_t j=0UL; j<N; ++j ) {
5921  reset( (~C)(i,j) );
5922  }
5923  continue;
5924  }
5925 
// Step 1: initialize the row with the first term (k == kbegin) and reset what lies outside.
5926  {
5927  const size_t jbegin( ( IsUpper<MT5>::value )
5929  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
5930  :( UPP ? max(i,kbegin) : kbegin ) )
5931  :( UPP ? i : 0UL ) );
5932  const size_t jend( ( IsLower<MT5>::value )
5934  ?( LOW ? min(i+1UL,kbegin) : kbegin )
5935  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
5936  :( LOW ? i+1UL : N ) );
5937 
5938  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
5939  for( size_t j=0UL; j<jbegin; ++j ) {
5940  reset( (~C)(i,j) );
5941  }
5942  }
5943  else if( IsStrictlyUpper<MT5>::value ) {
5944  reset( (~C)(i,0UL) );
5945  }
5946  for( size_t j=jbegin; j<jend; ++j ) {
5947  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
5948  }
5949  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
5950  for( size_t j=jend; j<N; ++j ) {
5951  reset( (~C)(i,j) );
5952  }
5953  }
5954  else if( IsStrictlyLower<MT5>::value ) {
5955  reset( (~C)(i,N-1UL) );
5956  }
5957  }
5958 
// Step 2: accumulate the remaining terms of the dot products.
5959  for( size_t k=kbegin+1UL; k<kend; ++k )
5960  {
5961  const size_t jbegin( ( IsUpper<MT5>::value )
5963  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
5964  :( SYM || HERM || UPP ? max( i, k ) : k ) )
5965  :( SYM || HERM || UPP ? i : 0UL ) );
5966  const size_t jend( ( IsLower<MT5>::value )
5968  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
5969  :( LOW ? min(i+1UL,k) : k ) )
5970  :( LOW ? i+1UL : N ) );
5971 
5972  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
5973  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5974 
5975  for( size_t j=jbegin; j<jend; ++j ) {
5976  (~C)(i,j) += A(i,k) * B(k,j);
5977  }
// For a lower B, column jend is assigned (not accumulated) in this iteration.
5978  if( IsLower<MT5>::value ) {
5979  (~C)(i,jend) = A(i,k) * B(k,jend);
5980  }
5981  }
5982 
// Step 3: scale the computed part of the row.
5983  {
5984  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
5986  :( SYM || HERM || UPP ? i : 0UL ) );
5987  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
5989  :( LOW ? i+1UL : N ) );
5990 
5991  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
5992  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5993 
5994  for( size_t j=jbegin; j<jend; ++j ) {
5995  (~C)(i,j) *= scalar;
5996  }
5997  }
5998  }
5999 
// SYM/HERM: fill the lower triangle from the upper one (conjugated for HERM).
6000  if( SYM || HERM ) {
6001  for( size_t i=1UL; i<M; ++i ) {
6002  for( size_t j=0UL; j<i; ++j ) {
6003  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
6004  }
6005  }
6006  }
6007  }
6008  //**********************************************************************************************
6009 
6010  //**Default assignment to column-major dense matrices (general/general)*************************
// Default kernel for the assignment of a scaled matrix product to a column-major dense matrix
// (general/general case; enabled only if neither MT4 nor MT5 is diagonal).
// For every column j of the target the kernel proceeds in three steps:
//   1. initialize the column with the first product term A(i,kbegin) * B(kbegin,j),
//   2. accumulate the remaining terms A(i,k) * B(k,j) for k in (kbegin, kend),
//   3. multiply the computed part of the column by 'scalar'.
// The k and i ranges are narrowed according to the triangular structure of A (MT4) and B (MT5)
// and to the structure flags SYM/HERM/LOW/UPP; elements outside the computed range are reset().
// For SYM/HERM only one triangle is computed and the other one is mirrored at the end.
// NOTE(review): the inner condition lines at listing 6056, 6061, 6090, 6095, 6113 and 6116 are
// missing from this listing.
6024  template< typename MT3 // Type of the left-hand side target matrix
6025  , typename MT4 // Type of the left-hand side matrix operand
6026  , typename MT5 // Type of the right-hand side matrix operand
6027  , typename ST2 > // Type of the scalar value
6028  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6029  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6030  {
6031  const size_t M( A.rows() );
6032  const size_t N( B.columns() );
6033  const size_t K( A.columns() );
6034 
6035  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6036 
6037  for( size_t j=0UL; j<N; ++j )
6038  {
// Range of k restricted by the triangular structure of B in column j.
6039  const size_t kbegin( ( IsLower<MT5>::value )
6040  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6041  :( 0UL ) );
6042  const size_t kend( ( IsUpper<MT5>::value )
6043  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
6044  :( K ) );
6045  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
6046 
// Empty k range for a strictly triangular B: the whole target column is reset.
6047  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
6048  for( size_t i=0UL; i<M; ++i ) {
6049  reset( (~C)(i,j) );
6050  }
6051  continue;
6052  }
6053 
// Step 1: initialize the column with the first term (k == kbegin) and reset what lies outside.
6054  {
6055  const size_t ibegin( ( IsLower<MT4>::value )
6057  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
6058  :( LOW ? max(j,kbegin) : kbegin ) )
6059  :( LOW ? j : 0UL ) );
6060  const size_t iend( ( IsUpper<MT4>::value )
6062  ?( UPP ? min(j+1UL,kbegin) : kbegin )
6063  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
6064  :( UPP ? j+1UL : M ) );
6065 
6066  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
6067  for( size_t i=0UL; i<ibegin; ++i ) {
6068  reset( (~C)(i,j) );
6069  }
6070  }
6071  else if( IsStrictlyLower<MT4>::value ) {
6072  reset( (~C)(0UL,j) );
6073  }
6074  for( size_t i=ibegin; i<iend; ++i ) {
6075  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6076  }
6077  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
6078  for( size_t i=iend; i<M; ++i ) {
6079  reset( (~C)(i,j) );
6080  }
6081  }
6082  else if( IsStrictlyUpper<MT4>::value ) {
6083  reset( (~C)(M-1UL,j) );
6084  }
6085  }
6086 
// Step 2: accumulate the remaining terms of the dot products.
6087  for( size_t k=kbegin+1UL; k<kend; ++k )
6088  {
6089  const size_t ibegin( ( IsLower<MT4>::value )
6091  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
6092  :( SYM || HERM || LOW ? max( j, k ) : k ) )
6093  :( SYM || HERM || LOW ? j : 0UL ) );
6094  const size_t iend( ( IsUpper<MT4>::value )
6096  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
6097  :( UPP ? min(j+1UL,k) : k ) )
6098  :( UPP ? j+1UL : M ) );
6099 
6100  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
6101  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6102 
6103  for( size_t i=ibegin; i<iend; ++i ) {
6104  (~C)(i,j) += A(i,k) * B(k,j);
6105  }
// For an upper A, row iend is assigned (not accumulated) in this iteration.
6106  if( IsUpper<MT4>::value ) {
6107  (~C)(iend,j) = A(iend,k) * B(k,j);
6108  }
6109  }
6110 
// Step 3: scale the computed part of the column.
6111  {
6112  const size_t ibegin( ( ( IsLower<MT4>::value && IsLower<MT5>::value ) )
6114  :( SYM || HERM || LOW ? j : 0UL ) );
6115  const size_t iend( ( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) )
6117  :( UPP ? j+1UL : M ) );
6118 
6119  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
6120  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6121 
6122  for( size_t i=ibegin; i<iend; ++i ) {
6123  (~C)(i,j) *= scalar;
6124  }
6125  }
6126  }
6127 
// SYM/HERM: fill the upper triangle from the lower one (conjugated for HERM).
6128  if( SYM || HERM ) {
6129  for( size_t j=1UL; j<N; ++j ) {
6130  for( size_t i=0UL; i<j; ++i ) {
6131  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
6132  }
6133  }
6134  }
6135  }
6136  //**********************************************************************************************
6137 
6138  //**Default assignment to row-major dense matrices (general/diagonal)***************************
6152  template< typename MT3 // Type of the left-hand side target matrix
6153  , typename MT4 // Type of the left-hand side matrix operand
6154  , typename MT5 // Type of the right-hand side matrix operand
6155  , typename ST2 > // Type of the scalar value
6156  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6157  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6158  {
6159  constexpr size_t block( BLOCK_SIZE );
6160 
6161  const size_t M( A.rows() );
6162  const size_t N( B.columns() );
6163 
6164  for( size_t ii=0UL; ii<M; ii+=block ) {
6165  const size_t iend( min( M, ii+block ) );
6166  for( size_t jj=0UL; jj<N; jj+=block ) {
6167  const size_t jend( min( N, jj+block ) );
6168  for( size_t i=ii; i<iend; ++i )
6169  {
6170  const size_t jbegin( ( IsUpper<MT4>::value )
6171  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
6172  :( jj ) );
6173  const size_t jpos( ( IsLower<MT4>::value )
6174  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
6175  :( jend ) );
6176 
6177  if( IsUpper<MT4>::value ) {
6178  for( size_t j=jj; j<jbegin; ++j ) {
6179  reset( (~C)(i,j) );
6180  }
6181  }
6182  for( size_t j=jbegin; j<jpos; ++j ) {
6183  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6184  }
6185  if( IsLower<MT4>::value ) {
6186  for( size_t j=jpos; j<jend; ++j ) {
6187  reset( (~C)(i,j) );
6188  }
6189  }
6190  }
6191  }
6192  }
6193  }
6194  //**********************************************************************************************
6195 
6196  //**Default assignment to column-major dense matrices (general/diagonal)************************
6210  template< typename MT3 // Type of the left-hand side target matrix
6211  , typename MT4 // Type of the left-hand side matrix operand
6212  , typename MT5 // Type of the right-hand side matrix operand
6213  , typename ST2 > // Type of the scalar value
6214  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6215  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6216  {
6217  const size_t M( A.rows() );
6218  const size_t N( B.columns() );
6219 
6220  for( size_t j=0UL; j<N; ++j )
6221  {
6222  const size_t ibegin( ( IsLower<MT4>::value )
6223  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6224  :( 0UL ) );
6225  const size_t iend( ( IsUpper<MT4>::value )
6226  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
6227  :( M ) );
6228  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6229 
6230  if( IsLower<MT4>::value ) {
6231  for( size_t i=0UL; i<ibegin; ++i ) {
6232  reset( (~C)(i,j) );
6233  }
6234  }
6235  for( size_t i=ibegin; i<iend; ++i ) {
6236  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6237  }
6238  if( IsUpper<MT4>::value ) {
6239  for( size_t i=iend; i<M; ++i ) {
6240  reset( (~C)(i,j) );
6241  }
6242  }
6243  }
6244  }
6245  //**********************************************************************************************
6246 
6247  //**Default assignment to row-major dense matrices (diagonal/general)***************************
6261  template< typename MT3 // Type of the left-hand side target matrix
6262  , typename MT4 // Type of the left-hand side matrix operand
6263  , typename MT5 // Type of the right-hand side matrix operand
6264  , typename ST2 > // Type of the scalar value
6266  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6267  {
6268  const size_t M( A.rows() );
6269  const size_t N( B.columns() );
6270 
6271  for( size_t i=0UL; i<M; ++i )
6272  {
6273  const size_t jbegin( ( IsUpper<MT5>::value )
6274  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6275  :( 0UL ) );
6276  const size_t jend( ( IsLower<MT5>::value )
6277  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
6278  :( N ) );
6279  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6280 
6281  if( IsUpper<MT5>::value ) {
6282  for( size_t j=0UL; j<jbegin; ++j ) {
6283  reset( (~C)(i,j) );
6284  }
6285  }
6286  for( size_t j=jbegin; j<jend; ++j ) {
6287  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6288  }
6289  if( IsLower<MT5>::value ) {
6290  for( size_t j=jend; j<N; ++j ) {
6291  reset( (~C)(i,j) );
6292  }
6293  }
6294  }
6295  }
6296  //**********************************************************************************************
6297 
6298  //**Default assignment to column-major dense matrices (diagonal/general)************************
6312  template< typename MT3 // Type of the left-hand side target matrix
6313  , typename MT4 // Type of the left-hand side matrix operand
6314  , typename MT5 // Type of the right-hand side matrix operand
6315  , typename ST2 > // Type of the scalar value
6316  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6317  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6318  {
6319  constexpr size_t block( BLOCK_SIZE );
6320 
6321  const size_t M( A.rows() );
6322  const size_t N( B.columns() );
6323 
6324  for( size_t jj=0UL; jj<N; jj+=block ) {
6325  const size_t jend( min( N, jj+block ) );
6326  for( size_t ii=0UL; ii<M; ii+=block ) {
6327  const size_t iend( min( M, ii+block ) );
6328  for( size_t j=jj; j<jend; ++j )
6329  {
6330  const size_t ibegin( ( IsLower<MT5>::value )
6331  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
6332  :( ii ) );
6333  const size_t ipos( ( IsUpper<MT5>::value )
6334  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
6335  :( iend ) );
6336 
6337  if( IsLower<MT5>::value ) {
6338  for( size_t i=ii; i<ibegin; ++i ) {
6339  reset( (~C)(i,j) );
6340  }
6341  }
6342  for( size_t i=ibegin; i<ipos; ++i ) {
6343  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6344  }
6345  if( IsUpper<MT5>::value ) {
6346  for( size_t i=ipos; i<iend; ++i ) {
6347  reset( (~C)(i,j) );
6348  }
6349  }
6350  }
6351  }
6352  }
6353  }
6354  //**********************************************************************************************
6355 
6356  //**Default assignment to dense matrices (diagonal/diagonal)************************************
6370  template< typename MT3 // Type of the left-hand side target matrix
6371  , typename MT4 // Type of the left-hand side matrix operand
6372  , typename MT5 // Type of the right-hand side matrix operand
6373  , typename ST2 > // Type of the scalar value
6374  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6375  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6376  {
6377  reset( C );
6378 
6379  for( size_t i=0UL; i<A.rows(); ++i ) {
6380  C(i,i) = A(i,i) * B(i,i) * scalar;
6381  }
6382  }
6383  //**********************************************************************************************
6384 
6385  //**Default assignment to dense matrices (small matrices)***************************************
// Non-vectorized small-matrix kernel: forwards all four arguments unchanged to
// selectDefaultAssignKernel(), which picks the overload matching the operand types.
//
// NOTE(review): the line carrying the return type of this overload (listing line 6403) is
// missing from this extract. Presumably it holds the SFINAE condition that separates this
// fallback from the vectorized selectSmallAssignKernel() overloads - confirm against the
// upstream header before compiling.
6399  template< typename MT3 // Type of the left-hand side target matrix
6400  , typename MT4 // Type of the left-hand side matrix operand
6401  , typename MT5 // Type of the right-hand side matrix operand
6402  , typename ST2 > // Type of the scalar value
6404  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6405  {
6406  selectDefaultAssignKernel( C, A, B, scalar );
6407  }
6408  //**********************************************************************************************
6409 
6410  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Vectorized small-matrix kernel for a row-major target: writes (A*B)*scalar into C.
//
// C      - target dense matrix (row-major overload), accessed through (~C)
// A      - left-hand side operand, read element-wise as A(i,k)
// B      - right-hand side operand, read through SIMD loads B.load(k,j)
// scalar - scaling factor applied to every computed element
//
// The columns of C are processed in panels of 8, 5, 4, 3, 2 and 1 times SIMDSIZE columns,
// followed by a scalar loop for the columns that do not fill a SIMD vector. The class-level
// flags SYM, HERM, LOW and UPP restrict which rows of a panel are computed; the skipped
// parts are mirrored or reset at the end of the function.
//
// NOTE(review): the line carrying the return type / enable-if condition (listing line 6429)
// and listing line 6452 (directly in front of the first unrolled loop) are missing from this
// extract - confirm both against the upstream header.
6425  template< typename MT3 // Type of the left-hand side target matrix
6426  , typename MT4 // Type of the left-hand side matrix operand
6427  , typename MT5 // Type of the right-hand side matrix operand
6428  , typename ST2 > // Type of the scalar value
6430  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6431  {
// A scalar clean-up loop over the trailing columns is needed unless both C and B are padded.
6432  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
6433 
6434  const size_t M( A.rows() );
6435  const size_t N( B.columns() );
6436  const size_t K( A.columns() );
6437 
6438  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6439 
// End of the SIMD-processed column range: N rounded down to a multiple of SIMDSIZE when a
// remainder loop is required (checked by the assertion below), N itself otherwise.
6440  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
6441  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
6442 
// Scaling factor as a SIMD operand; every accumulator is multiplied by it when it is stored.
6443  const SIMDType factor( set( scalar ) );
6444 
// In the LOW && UPP case the panel loops below only write rows j..iend of each column panel,
// so for N > SIMDSIZE*3UL the whole target is cleared first.
// NOTE(review): for smaller N no up-front reset happens - confirm that every element of C is
// still written in that case.
6445  if( LOW && UPP && N > SIMDSIZE*3UL ) {
6446  reset( ~C );
6447  }
6448 
6449  {
6450  size_t j( 0UL );
6451 
6453  {
// Panels of 8*SIMDSIZE columns, one row per iteration; only taken for results without any
// declared structure (none of SYM, HERM, LOW, UPP).
6454  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6455  for( size_t i=0UL; i<M; ++i )
6456  {
// kbegin/kend narrow the k range according to the compile-time structure (upper/lower,
// strictly or not) of A (MT4) and B (MT5). The same pattern recurs in every panel loop below.
6457  const size_t kbegin( ( IsUpper<MT4>::value )
6458  ?( ( IsLower<MT5>::value )
6459  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6460  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6461  :( IsLower<MT5>::value ? j : 0UL ) );
6462  const size_t kend( ( IsLower<MT4>::value )
6463  ?( ( IsUpper<MT5>::value )
6464  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
6465  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6466  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
6467 
// The accumulators rely on the default constructor of SIMDType for their initial value
// (presumably zero - confirm against the SIMD type definitions).
6468  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6469 
6470  for( size_t k=kbegin; k<kend; ++k ) {
6471  const SIMDType a1( set( A(i,k) ) );
6472  xmm1 += a1 * B.load(k,j );
6473  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6474  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6475  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6476  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6477  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
6478  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
6479  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
6480  }
6481 
6482  (~C).store( i, j , xmm1 * factor );
6483  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6484  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6485  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
6486  (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
6487  (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
6488  (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
6489  (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
6490  }
6491  }
6492  }
6493 
// Panels of 5*SIMDSIZE columns, two rows per iteration plus one trailing row if M is odd;
// again only for results without declared structure.
6494  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
6495  {
6496  size_t i( 0UL );
6497 
6498  for( ; (i+2UL) <= M; i+=2UL )
6499  {
6500  const size_t kbegin( ( IsUpper<MT4>::value )
6501  ?( ( IsLower<MT5>::value )
6502  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6503  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6504  :( IsLower<MT5>::value ? j : 0UL ) );
6505  const size_t kend( ( IsLower<MT4>::value )
6506  ?( ( IsUpper<MT5>::value )
6507  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
6508  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6509  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
6510 
6511  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6512 
6513  for( size_t k=kbegin; k<kend; ++k ) {
6514  const SIMDType a1( set( A(i ,k) ) );
6515  const SIMDType a2( set( A(i+1UL,k) ) );
6516  const SIMDType b1( B.load(k,j ) );
6517  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6518  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6519  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6520  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
6521  xmm1 += a1 * b1;
6522  xmm2 += a1 * b2;
6523  xmm3 += a1 * b3;
6524  xmm4 += a1 * b4;
6525  xmm5 += a1 * b5;
6526  xmm6 += a2 * b1;
6527  xmm7 += a2 * b2;
6528  xmm8 += a2 * b3;
6529  xmm9 += a2 * b4;
6530  xmm10 += a2 * b5;
6531  }
6532 
6533  (~C).store( i , j , xmm1 * factor );
6534  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
6535  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
6536  (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
6537  (~C).store( i , j+SIMDSIZE*4UL, xmm5 * factor );
6538  (~C).store( i+1UL, j , xmm6 * factor );
6539  (~C).store( i+1UL, j+SIMDSIZE , xmm7 * factor );
6540  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
6541  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
6542  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
6543  }
6544 
6545  if( i < M )
6546  {
6547  const size_t kbegin( ( IsUpper<MT4>::value )
6548  ?( ( IsLower<MT5>::value )
6549  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6550  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6551  :( IsLower<MT5>::value ? j : 0UL ) );
6552  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
6553 
6554  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6555 
6556  for( size_t k=kbegin; k<kend; ++k ) {
6557  const SIMDType a1( set( A(i,k) ) );
6558  xmm1 += a1 * B.load(k,j );
6559  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6560  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6561  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6562  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6563  }
6564 
6565  (~C).store( i, j , xmm1 * factor );
6566  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6567  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6568  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
6569  (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
6570  }
6571  }
6572 
// Panels of 4*SIMDSIZE columns; skipped when the result is both LOW and UPP. From here on the
// row range of a panel is restricted: it starts at row j for LOW and ends at
// min(j+SIMDSIZE*4UL,M) for SYM, HERM or UPP.
6573  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6574  {
6575  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
6576  size_t i( LOW ? j : 0UL );
6577 
6578  for( ; (i+2UL) <= iend; i+=2UL )
6579  {
6580  const size_t kbegin( ( IsUpper<MT4>::value )
6581  ?( ( IsLower<MT5>::value )
6582  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6583  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6584  :( IsLower<MT5>::value ? j : 0UL ) );
6585  const size_t kend( ( IsLower<MT4>::value )
6586  ?( ( IsUpper<MT5>::value )
6587  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
6588  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6589  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
6590 
6591  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6592 
6593  for( size_t k=kbegin; k<kend; ++k ) {
6594  const SIMDType a1( set( A(i ,k) ) );
6595  const SIMDType a2( set( A(i+1UL,k) ) );
6596  const SIMDType b1( B.load(k,j ) );
6597  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6598  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6599  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6600  xmm1 += a1 * b1;
6601  xmm2 += a1 * b2;
6602  xmm3 += a1 * b3;
6603  xmm4 += a1 * b4;
6604  xmm5 += a2 * b1;
6605  xmm6 += a2 * b2;
6606  xmm7 += a2 * b3;
6607  xmm8 += a2 * b4;
6608  }
6609 
6610  (~C).store( i , j , xmm1 * factor );
6611  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
6612  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
6613  (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
6614  (~C).store( i+1UL, j , xmm5 * factor );
6615  (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
6616  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
6617  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
6618  }
6619 
6620  if( i < iend )
6621  {
6622  const size_t kbegin( ( IsUpper<MT4>::value )
6623  ?( ( IsLower<MT5>::value )
6624  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6625  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6626  :( IsLower<MT5>::value ? j : 0UL ) );
6627  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
6628 
6629  SIMDType xmm1, xmm2, xmm3, xmm4;
6630 
6631  for( size_t k=kbegin; k<kend; ++k ) {
6632  const SIMDType a1( set( A(i,k) ) );
6633  xmm1 += a1 * B.load(k,j );
6634  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6635  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6636  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6637  }
6638 
6639  (~C).store( i, j , xmm1 * factor );
6640  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6641  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6642  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
6643  }
6644  }
6645 
// Panels of 3*SIMDSIZE columns, two rows per iteration plus one trailing row.
6646  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
6647  {
6648  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
6649  size_t i( LOW ? j : 0UL );
6650 
6651  for( ; (i+2UL) <= iend; i+=2UL )
6652  {
6653  const size_t kbegin( ( IsUpper<MT4>::value )
6654  ?( ( IsLower<MT5>::value )
6655  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6656  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6657  :( IsLower<MT5>::value ? j : 0UL ) );
6658  const size_t kend( ( IsLower<MT4>::value )
6659  ?( ( IsUpper<MT5>::value )
6660  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
6661  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6662  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
6663 
6664  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6665 
6666  for( size_t k=kbegin; k<kend; ++k ) {
6667  const SIMDType a1( set( A(i ,k) ) );
6668  const SIMDType a2( set( A(i+1UL,k) ) );
6669  const SIMDType b1( B.load(k,j ) );
6670  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6671  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6672  xmm1 += a1 * b1;
6673  xmm2 += a1 * b2;
6674  xmm3 += a1 * b3;
6675  xmm4 += a2 * b1;
6676  xmm5 += a2 * b2;
6677  xmm6 += a2 * b3;
6678  }
6679 
6680  (~C).store( i , j , xmm1 * factor );
6681  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
6682  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
6683  (~C).store( i+1UL, j , xmm4 * factor );
6684  (~C).store( i+1UL, j+SIMDSIZE , xmm5 * factor );
6685  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
6686  }
6687 
6688  if( i < iend )
6689  {
6690  const size_t kbegin( ( IsUpper<MT4>::value )
6691  ?( ( IsLower<MT5>::value )
6692  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6693  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6694  :( IsLower<MT5>::value ? j : 0UL ) );
6695  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
6696 
6697  SIMDType xmm1, xmm2, xmm3;
6698 
6699  for( size_t k=kbegin; k<kend; ++k ) {
6700  const SIMDType a1( set( A(i,k) ) );
6701  xmm1 += a1 * B.load(k,j );
6702  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6703  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6704  }
6705 
6706  (~C).store( i, j , xmm1 * factor );
6707  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6708  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6709  }
6710  }
6711 
// Panels of 2*SIMDSIZE columns, two rows per iteration plus one trailing row.
6712  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6713  {
6714  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
6715  size_t i( LOW ? j : 0UL );
6716 
6717  for( ; (i+2UL) <= iend; i+=2UL )
6718  {
6719  const size_t kbegin( ( IsUpper<MT4>::value )
6720  ?( ( IsLower<MT5>::value )
6721  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6722  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6723  :( IsLower<MT5>::value ? j : 0UL ) );
6724  const size_t kend( ( IsLower<MT4>::value )
6725  ?( ( IsUpper<MT5>::value )
6726  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
6727  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6728  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
6729 
6730  SIMDType xmm1, xmm2, xmm3, xmm4;
6731 
6732  for( size_t k=kbegin; k<kend; ++k ) {
6733  const SIMDType a1( set( A(i ,k) ) );
6734  const SIMDType a2( set( A(i+1UL,k) ) );
6735  const SIMDType b1( B.load(k,j ) );
6736  const SIMDType b2( B.load(k,j+SIMDSIZE) );
6737  xmm1 += a1 * b1;
6738  xmm2 += a1 * b2;
6739  xmm3 += a2 * b1;
6740  xmm4 += a2 * b2;
6741  }
6742 
6743  (~C).store( i , j , xmm1 * factor );
6744  (~C).store( i , j+SIMDSIZE, xmm2 * factor );
6745  (~C).store( i+1UL, j , xmm3 * factor );
6746  (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
6747  }
6748 
6749  if( i < iend )
6750  {
6751  const size_t kbegin( ( IsUpper<MT4>::value )
6752  ?( ( IsLower<MT5>::value )
6753  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6754  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6755  :( IsLower<MT5>::value ? j : 0UL ) );
6756  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
6757 
6758  SIMDType xmm1, xmm2;
6759 
6760  for( size_t k=kbegin; k<kend; ++k ) {
6761  const SIMDType a1( set( A(i,k) ) );
6762  xmm1 += a1 * B.load(k,j );
6763  xmm2 += a1 * B.load(k,j+SIMDSIZE);
6764  }
6765 
6766  (~C).store( i, j , xmm1 * factor );
6767  (~C).store( i, j+SIMDSIZE, xmm2 * factor );
6768  }
6769  }
6770 
// Remaining single SIMD-wide column panels below jpos, two rows per iteration plus one
// trailing row.
6771  for( ; j<jpos; j+=SIMDSIZE )
6772  {
6773  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
6774  size_t i( LOW ? j : 0UL );
6775 
6776  for( ; (i+2UL) <= iend; i+=2UL )
6777  {
6778  const size_t kbegin( ( IsUpper<MT4>::value )
6779  ?( ( IsLower<MT5>::value )
6780  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6781  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6782  :( IsLower<MT5>::value ? j : 0UL ) );
6783  const size_t kend( ( IsLower<MT4>::value )
6784  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6785  :( K ) );
6786 
6787  SIMDType xmm1, xmm2;
6788 
6789  for( size_t k=kbegin; k<kend; ++k ) {
6790  const SIMDType b1( B.load(k,j) );
6791  xmm1 += set( A(i ,k) ) * b1;
6792  xmm2 += set( A(i+1UL,k) ) * b1;
6793  }
6794 
6795  (~C).store( i , j, xmm1 * factor );
6796  (~C).store( i+1UL, j, xmm2 * factor );
6797  }
6798 
6799  if( i < iend )
6800  {
6801  const size_t kbegin( ( IsUpper<MT4>::value )
6802  ?( ( IsLower<MT5>::value )
6803  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6804  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6805  :( IsLower<MT5>::value ? j : 0UL ) );
6806 
6807  SIMDType xmm1;
6808 
6809  for( size_t k=kbegin; k<K; ++k ) {
6810  xmm1 += set( A(i,k) ) * B.load(k,j);
6811  }
6812 
6813  (~C).store( i, j, xmm1 * factor );
6814  }
6815  }
6816 
// Scalar clean-up for the columns jpos..N-1; only active when 'remainder' is true. Elements
// are accumulated in ElementType and scaled by 'scalar' directly.
6817  for( ; remainder && j<N; ++j )
6818  {
6819  size_t i( LOW && UPP ? j : 0UL );
6820 
6821  for( ; (i+2UL) <= M; i+=2UL )
6822  {
6823  const size_t kbegin( ( IsUpper<MT4>::value )
6824  ?( ( IsLower<MT5>::value )
6825  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6826  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6827  :( IsLower<MT5>::value ? j : 0UL ) );
6828  const size_t kend( ( IsLower<MT4>::value )
6829  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6830  :( K ) );
6831 
6832  ElementType value1 = ElementType();
6833  ElementType value2 = ElementType();
6834 
6835  for( size_t k=kbegin; k<kend; ++k ) {
6836  value1 += A(i ,k) * B(k,j);
6837  value2 += A(i+1UL,k) * B(k,j);
6838  }
6839 
6840  (~C)(i ,j) = value1 * scalar;
6841  (~C)(i+1UL,j) = value2 * scalar;
6842  }
6843 
6844  if( i < M )
6845  {
6846  const size_t kbegin( ( IsUpper<MT4>::value )
6847  ?( ( IsLower<MT5>::value )
6848  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6849  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6850  :( IsLower<MT5>::value ? j : 0UL ) );
6851 
6852  ElementType value = ElementType();
6853 
6854  for( size_t k=kbegin; k<K; ++k ) {
6855  value += A(i,k) * B(k,j);
6856  }
6857 
6858  (~C)(i,j) = value * scalar;
6859  }
6860  }
6861  }
6862 
// Post-processing of the parts skipped by the restricted panel loops, in units of
// 4*SIMDSIZE (jend/iend are i resp. j rounded down to a multiple of 4*SIMDSIZE):
//  - SYM/HERM: element (i,j) with j < jend is copied from (j,i), conjugated for HERM
//  - LOW only: element (i,j) with i < iend is reset
//  - UPP only: element (i,j) with j < jend is reset
6863  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
6864  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
6865  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
6866  for( size_t j=0UL; j<jend; ++j ) {
6867  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
6868  }
6869  }
6870  }
6871  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
6872  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
6873  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6874  for( size_t i=0UL; i<iend; ++i ) {
6875  reset( (~C)(i,j) );
6876  }
6877  }
6878  }
6879  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
6880  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
6881  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
6882  for( size_t j=0UL; j<jend; ++j ) {
6883  reset( (~C)(i,j) );
6884  }
6885  }
6886  }
6887  }
6888  //**********************************************************************************************
6889 
6890  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
6905  template< typename MT3 // Type of the left-hand side target matrix
6906  , typename MT4 // Type of the left-hand side matrix operand
6907  , typename MT5 // Type of the right-hand side matrix operand
6908  , typename ST2 > // Type of the scalar value
6910  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6911  {
6912  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
6913 
6914  const size_t M( A.rows() );
6915  const size_t N( B.columns() );
6916  const size_t K( A.columns() );
6917 
6918  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6919 
6920  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
6921  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
6922 
6923  const SIMDType factor( set( scalar ) );
6924 
6925  if( LOW && UPP && M > SIMDSIZE*3UL ) {
6926  reset( ~C );
6927  }
6928 
6929  {
6930  size_t i( 0UL );
6931 
6933  {
6934  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6935  for( size_t j=0UL; j<N; ++j )
6936  {
6937  const size_t kbegin( ( IsLower<MT5>::value )
6938  ?( ( IsUpper<MT4>::value )
6939  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6940  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6941  :( IsUpper<MT4>::value ? i : 0UL ) );
6942  const size_t kend( ( IsUpper<MT5>::value )
6943  ?( ( IsLower<MT4>::value )
6944  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
6945  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
6946  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
6947 
6948  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6949 
6950  for( size_t k=kbegin; k<kend; ++k ) {
6951  const SIMDType b1( set( B(k,j) ) );
6952  xmm1 += A.load(i ,k) * b1;
6953  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6954  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6955  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6956  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6957  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
6958  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
6959  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
6960  }
6961 
6962  (~C).store( i , j, xmm1 * factor );
6963  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
6964  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
6965  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
6966  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
6967  (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
6968  (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
6969  (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
6970  }
6971  }
6972  }
6973 
6974  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
6975  {
6976  size_t j( 0UL );
6977 
6978  for( ; (j+2UL) <= N; j+=2UL )
6979  {
6980  const size_t kbegin( ( IsLower<MT5>::value )
6981  ?( ( IsUpper<MT4>::value )
6982  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6983  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6984  :( IsUpper<MT4>::value ? i : 0UL ) );
6985  const size_t kend( ( IsUpper<MT5>::value )
6986  ?( ( IsLower<MT4>::value )
6987  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6988  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6989  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
6990 
6991  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6992 
6993  for( size_t k=kbegin; k<kend; ++k ) {
6994  const SIMDType a1( A.load(i ,k) );
6995  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6996  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6997  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6998  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6999  const SIMDType b1( set( B(k,j ) ) );
7000  const SIMDType b2( set( B(k,j+1UL) ) );
7001  xmm1 += a1 * b1;
7002  xmm2 += a2 * b1;
7003  xmm3 += a3 * b1;
7004  xmm4 += a4 * b1;
7005  xmm5 += a5 * b1;
7006  xmm6 += a1 * b2;
7007  xmm7 += a2 * b2;
7008  xmm8 += a3 * b2;
7009  xmm9 += a4 * b2;
7010  xmm10 += a5 * b2;
7011  }
7012 
7013  (~C).store( i , j , xmm1 * factor );
7014  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
7015  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
7016  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
7017  (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
7018  (~C).store( i , j+1UL, xmm6 * factor );
7019  (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
7020  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
7021  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
7022  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
7023  }
7024 
7025  if( j < N )
7026  {
7027  const size_t kbegin( ( IsLower<MT5>::value )
7028  ?( ( IsUpper<MT4>::value )
7029  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7030  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7031  :( IsUpper<MT4>::value ? i : 0UL ) );
7032  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
7033 
7034  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7035 
7036  for( size_t k=kbegin; k<kend; ++k ) {
7037  const SIMDType b1( set( B(k,j) ) );
7038  xmm1 += A.load(i ,k) * b1;
7039  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7040  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7041  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7042  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7043  }
7044 
7045  (~C).store( i , j, xmm1 * factor );
7046  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
7047  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
7048  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
7049  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
7050  }
7051  }
7052 
7053  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7054  {
7055  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
7056  size_t j( UPP ? i : 0UL );
7057 
7058  for( ; (j+2UL) <= jend; j+=2UL )
7059  {
7060  const size_t kbegin( ( IsLower<MT5>::value )
7061  ?( ( IsUpper<MT4>::value )
7062  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7063  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7064  :( IsUpper<MT4>::value ? i : 0UL ) );
7065  const size_t kend( ( IsUpper<MT5>::value )
7066  ?( ( IsLower<MT4>::value )
7067  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7068  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7069  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
7070 
7071  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7072 
7073  for( size_t k=kbegin; k<kend; ++k ) {
7074  const SIMDType a1( A.load(i ,k) );
7075  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7076  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7077  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7078  const SIMDType b1( set( B(k,j ) ) );
7079  const SIMDType b2( set( B(k,j+1UL) ) );
7080  xmm1 += a1 * b1;
7081  xmm2 += a2 * b1;
7082  xmm3 += a3 * b1;
7083  xmm4 += a4 * b1;
7084  xmm5 += a1 * b2;
7085  xmm6 += a2 * b2;
7086  xmm7 += a3 * b2;
7087  xmm8 += a4 * b2;
7088  }
7089 
7090  (~C).store( i , j , xmm1 * factor );
7091  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
7092  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
7093  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
7094  (~C).store( i , j+1UL, xmm5 * factor );
7095  (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
7096  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
7097  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
7098  }
7099 
7100  if( j < jend )
7101  {
7102  const size_t kbegin( ( IsLower<MT5>::value )
7103  ?( ( IsUpper<MT4>::value )
7104  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7105  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7106  :( IsUpper<MT4>::value ? i : 0UL ) );
7107  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
7108 
7109  SIMDType xmm1, xmm2, xmm3, xmm4;
7110 
7111  for( size_t k=kbegin; k<kend; ++k ) {
7112  const SIMDType b1( set( B(k,j) ) );
7113  xmm1 += A.load(i ,k) * b1;
7114  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7115  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7116  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7117  }
7118 
7119  (~C).store( i , j, xmm1 * factor );
7120  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
7121  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
7122  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
7123  }
7124  }
7125 
7126  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
7127  {
7128  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
7129  size_t j( UPP ? i : 0UL );
7130 
7131  for( ; (j+2UL) <= jend; j+=2UL )
7132  {
7133  const size_t kbegin( ( IsLower<MT5>::value )
7134  ?( ( IsUpper<MT4>::value )
7135  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7136  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7137  :( IsUpper<MT4>::value ? i : 0UL ) );
7138  const size_t kend( ( IsUpper<MT5>::value )
7139  ?( ( IsLower<MT4>::value )
7140  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7141  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7142  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
7143 
7144  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7145 
7146  for( size_t k=kbegin; k<kend; ++k ) {
7147  const SIMDType a1( A.load(i ,k) );
7148  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7149  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7150  const SIMDType b1( set( B(k,j ) ) );
7151  const SIMDType b2( set( B(k,j+1UL) ) );
7152  xmm1 += a1 * b1;
7153  xmm2 += a2 * b1;
7154  xmm3 += a3 * b1;
7155  xmm4 += a1 * b2;
7156  xmm5 += a2 * b2;
7157  xmm6 += a3 * b2;
7158  }
7159 
7160  (~C).store( i , j , xmm1 * factor );
7161  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
7162  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
7163  (~C).store( i , j+1UL, xmm4 * factor );
7164  (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
7165  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
7166  }
7167 
7168  if( j < jend )
7169  {
7170  const size_t kbegin( ( IsLower<MT5>::value )
7171  ?( ( IsUpper<MT4>::value )
7172  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7173  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7174  :( IsUpper<MT4>::value ? i : 0UL ) );
7175  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
7176 
7177  SIMDType xmm1, xmm2, xmm3;
7178 
7179  for( size_t k=kbegin; k<kend; ++k ) {
7180  const SIMDType b1( set( B(k,j) ) );
7181  xmm1 += A.load(i ,k) * b1;
7182  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7183  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7184  }
7185 
7186  (~C).store( i , j, xmm1 * factor );
7187  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
7188  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
7189  }
7190  }
7191 
7192  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7193  {
7194  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
7195  size_t j( UPP ? i : 0UL );
7196 
7197  for( ; (j+2UL) <= jend; j+=2UL )
7198  {
7199  const size_t kbegin( ( IsLower<MT5>::value )
7200  ?( ( IsUpper<MT4>::value )
7201  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7202  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7203  :( IsUpper<MT4>::value ? i : 0UL ) );
7204  const size_t kend( ( IsUpper<MT5>::value )
7205  ?( ( IsLower<MT4>::value )
7206  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7207  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7208  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
7209 
7210  SIMDType xmm1, xmm2, xmm3, xmm4;
7211 
7212  for( size_t k=kbegin; k<kend; ++k ) {
7213  const SIMDType a1( A.load(i ,k) );
7214  const SIMDType a2( A.load(i+SIMDSIZE,k) );
7215  const SIMDType b1( set( B(k,j ) ) );
7216  const SIMDType b2( set( B(k,j+1UL) ) );
7217  xmm1 += a1 * b1;
7218  xmm2 += a2 * b1;
7219  xmm3 += a1 * b2;
7220  xmm4 += a2 * b2;
7221  }
7222 
7223  (~C).store( i , j , xmm1 * factor );
7224  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
7225  (~C).store( i , j+1UL, xmm3 * factor );
7226  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
7227  }
7228 
7229  if( j < jend )
7230  {
7231  const size_t kbegin( ( IsLower<MT5>::value )
7232  ?( ( IsUpper<MT4>::value )
7233  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7234  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7235  :( IsUpper<MT4>::value ? i : 0UL ) );
7236  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
7237 
7238  SIMDType xmm1, xmm2;
7239 
7240  for( size_t k=kbegin; k<kend; ++k ) {
7241  const SIMDType b1( set( B(k,j) ) );
7242  xmm1 += A.load(i ,k) * b1;
7243  xmm2 += A.load(i+SIMDSIZE,k) * b1;
7244  }
7245 
7246  (~C).store( i , j, xmm1 * factor );
7247  (~C).store( i+SIMDSIZE, j, xmm2 * factor );
7248  }
7249  }
7250 
7251  for( ; i<ipos; i+=SIMDSIZE )
7252  {
7253  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
7254  size_t j( UPP ? i : 0UL );
7255 
7256  for( ; (j+2UL) <= jend; j+=2UL )
7257  {
7258  const size_t kbegin( ( IsLower<MT5>::value )
7259  ?( ( IsUpper<MT4>::value )
7260  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7261  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7262  :( IsUpper<MT4>::value ? i : 0UL ) );
7263  const size_t kend( ( IsUpper<MT5>::value )
7264  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7265  :( K ) );
7266 
7267  SIMDType xmm1, xmm2;
7268 
7269  for( size_t k=kbegin; k<kend; ++k ) {
7270  const SIMDType a1( A.load(i,k) );
7271  xmm1 += a1 * set( B(k,j ) );
7272  xmm2 += a1 * set( B(k,j+1UL) );
7273  }
7274 
7275  (~C).store( i, j , xmm1 * factor );
7276  (~C).store( i, j+1UL, xmm2 * factor );
7277  }
7278 
7279  if( j < jend )
7280  {
7281  const size_t kbegin( ( IsLower<MT5>::value )
7282  ?( ( IsUpper<MT4>::value )
7283  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7284  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7285  :( IsUpper<MT4>::value ? i : 0UL ) );
7286 
7287  SIMDType xmm1;
7288 
7289  for( size_t k=kbegin; k<K; ++k ) {
7290  xmm1 += A.load(i,k) * set( B(k,j) );
7291  }
7292 
7293  (~C).store( i, j, xmm1 * factor );
7294  }
7295  }
7296 
7297  for( ; remainder && i<M; ++i )
7298  {
7299  size_t j( LOW && UPP ? i : 0UL );
7300 
7301  for( ; (j+2UL) <= N; j+=2UL )
7302  {
7303  const size_t kbegin( ( IsLower<MT5>::value )
7304  ?( ( IsUpper<MT4>::value )
7305  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7306  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7307  :( IsUpper<MT4>::value ? i : 0UL ) );
7308  const size_t kend( ( IsUpper<MT5>::value )
7309  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7310  :( K ) );
7311 
7312  ElementType value1 = ElementType();
7313  ElementType value2 = ElementType();
7314 
7315  for( size_t k=kbegin; k<kend; ++k ) {
7316  value1 += A(i,k) * B(k,j );
7317  value2 += A(i,k) * B(k,j+1UL);
7318  }
7319 
7320  (~C)(i,j ) = value1 * scalar;
7321  (~C)(i,j+1UL) = value2 * scalar;
7322  }
7323 
7324  if( j < N )
7325  {
7326  const size_t kbegin( ( IsLower<MT5>::value )
7327  ?( ( IsUpper<MT4>::value )
7328  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7329  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7330  :( IsUpper<MT4>::value ? i : 0UL ) );
7331 
7332  ElementType value = ElementType();
7333 
7334  for( size_t k=kbegin; k<K; ++k ) {
7335  value += A(i,k) * B(k,j);
7336  }
7337 
7338  (~C)(i,j) = value * scalar;
7339  }
7340  }
7341  }
7342 
7343  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
7344  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
7345  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
7346  for( size_t i=0UL; i<iend; ++i ) {
7347  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
7348  }
7349  }
7350  }
7351  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
7352  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
7353  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
7354  for( size_t i=0UL; i<iend; ++i ) {
7355  reset( (~C)(i,j) );
7356  }
7357  }
7358  }
7359  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
7360  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
7361  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
7362  for( size_t j=0UL; j<jend; ++j ) {
7363  reset( (~C)(i,j) );
7364  }
7365  }
7366  }
7367  }
7368  //**********************************************************************************************
7369 
7370  //**Default assignment to dense matrices (large matrices)***************************************
// Fallback overload of the large-matrix assignment kernel.
// It performs no computation of its own: all four arguments are passed on unchanged to
// selectDefaultAssignKernel().
//   C      - target matrix that receives the result
//   A      - left-hand side multiplication operand
//   B      - right-hand side multiplication operand
//   scalar - scaling factor
// NOTE(review): the return-type line of this overload (listing line 7388) is not part of this
// excerpt; presumably it carries the condition that separates this overload from the vectorized
// one below - confirm against the full header.
7384  template< typename MT3 // Type of the left-hand side target matrix
7385  , typename MT4 // Type of the left-hand side matrix operand
7386  , typename MT5 // Type of the right-hand side matrix operand
7387  , typename ST2 > // Type of the scalar value
7389  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7390  {
// No dedicated large-matrix algorithm in this overload: reuse the default assignment kernel.
7391  selectDefaultAssignKernel( C, A, B, scalar );
7392  }
7393  //**********************************************************************************************
7394 
7395  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Large-matrix assignment kernel that dispatches to one of the external multiplication kernels
// smmm(), hmmm(), lmmm(), ummm() or mmm().
//   C      - target matrix that receives the result
//   A      - left-hand side multiplication operand
//   B      - right-hand side multiplication operand
//   scalar - scaling factor handed to the selected kernel
// The flags SYM, HERM, LOW and UPP are declared outside this excerpt and are tested in exactly
// this order, so SYM wins over HERM, HERM over LOW, and LOW over UPP.
// NOTE(review): presumably these flags describe the structure of the result (symmetric,
// Hermitian, lower, upper) - confirm against the class declaration.
// NOTE(review): the return-type line (listing line 7414) is not part of this excerpt.
7410  template< typename MT3 // Type of the left-hand side target matrix
7411  , typename MT4 // Type of the left-hand side matrix operand
7412  , typename MT5 // Type of the right-hand side matrix operand
7413  , typename ST2 > // Type of the scalar value
7415  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7416  {
// smmm() and hmmm() take four arguments; lmmm(), ummm() and mmm() additionally receive ST2(0).
// NOTE(review): the extra zero is presumably the factor applied to the previous content of C
// (zero for a plain assignment) - confirm against the kernel declarations.
7417  if( SYM )
7418  smmm( C, A, B, scalar );
7419  else if( HERM )
7420  hmmm( C, A, B, scalar );
7421  else if( LOW )
7422  lmmm( C, A, B, scalar, ST2(0) );
7423  else if( UPP )
7424  ummm( C, A, B, scalar, ST2(0) );
7425  else
7426  mmm( C, A, B, scalar, ST2(0) );
7427  }
7428  //**********************************************************************************************
7429 
7430  //**BLAS-based assignment to dense matrices (default)*******************************************
// Fallback overload of the BLAS-based assignment kernel.
// It does not call any BLAS routine: all four arguments are passed on unchanged to
// selectLargeAssignKernel().
//   C      - target matrix that receives the result
//   A      - left-hand side multiplication operand
//   B      - right-hand side multiplication operand
//   scalar - scaling factor
// NOTE(review): the return-type line (listing line 7448) is not part of this excerpt; presumably
// it restricts this overload to the cases the BLAS overload below does not handle - confirm.
7444  template< typename MT3 // Type of the left-hand side target matrix
7445  , typename MT4 // Type of the left-hand side matrix operand
7446  , typename MT5 // Type of the right-hand side matrix operand
7447  , typename ST2 > // Type of the scalar value
7449  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7450  {
// Delegate to the large-matrix kernel selection.
7451  selectLargeAssignKernel( C, A, B, scalar );
7452  }
7453  //**********************************************************************************************
7454 
7455  //**BLAS-based assignment to dense matrices*****************************************************
7456 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7457 
// BLAS-based assignment kernel; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled (see the surrounding #if).
//   C      - target matrix that receives the result
//   A      - left-hand side multiplication operand
//   B      - right-hand side multiplication operand
//   scalar - scaling factor; converted to the element type of C before it is passed on
// Three cases are distinguished at compile time:
//   - A is triangular: B is copied into C, then trmm() is called with C, A and CblasLeft.
//   - otherwise, B is triangular: A is copied into C, then trmm() is called with C, B and CblasRight.
//   - otherwise: gemm() is called with C, A, B, the converted scalar and a zero of the same type.
// NOTE(review): trmm() presumably multiplies C in place by the triangular operand, and the zero
// passed to gemm() is presumably the factor for the previous content of C - confirm against the
// BLAS wrapper declarations.
// NOTE(review): the return-type line (listing line 7474) is not part of this excerpt.
7470  template< typename MT3 // Type of the left-hand side target matrix
7471  , typename MT4 // Type of the left-hand side matrix operand
7472  , typename MT5 // Type of the right-hand side matrix operand
7473  , typename ST2 > // Type of the scalar value
7475  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7476  {
// Element type of the target; the scalar is converted to this type for the BLAS calls.
7477  typedef ElementType_<MT3> ET;
7478 
// Left operand triangular: start from a copy of B and apply A from the left.
7479  if( IsTriangular<MT4>::value ) {
7480  assign( C, B );
7481  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7482  }
// Right operand triangular: start from a copy of A and apply B from the right.
7483  else if( IsTriangular<MT5>::value ) {
7484  assign( C, A );
7485  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7486  }
// Neither operand triangular: general matrix-matrix multiplication.
7487  else {
7488  gemm( C, A, B, ET(scalar), ET(0) );
7489  }
7490  }
7491 #endif
7492  //**********************************************************************************************
7493 
7494  //**Assignment to sparse matrices***************************************************************
// Assignment of the scaled multiplication expression to a sparse matrix.
//   lhs - target sparse matrix
//   rhs - scaled multiplication expression to be assigned
// The expression is first evaluated into a temporary of type TmpType via serial(); the temporary
// is then passed through a ForwardFunctor object and assigned to the target.
// NOTE(review): several lines of the original body (listing lines 7510-7519) are not part of this
// excerpt; TmpType is presumably declared there - confirm against the full header.
7506  template< typename MT // Type of the target sparse matrix
7507  , bool SO > // Storage order of the target sparse matrix
7508  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7509  {
7511 
7513 
7520 
// The target is expected to already have the dimensions of the expression.
7521  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7522  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7523 
7524  const ForwardFunctor fwd;
7525 
// Evaluate the whole expression into a temporary, then assign the forwarded temporary.
7526  const TmpType tmp( serial( rhs ) );
7527  assign( ~lhs, fwd( tmp ) );
7528  }
7529  //**********************************************************************************************
7530 
7531  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the scaled multiplication expression to a dense matrix.
//   lhs - target dense matrix
//   rhs - scaled multiplication expression to be added
// Steps: fetch both operands of the wrapped multiplication, return early if the target or the
// shared dimension is empty, evaluate both operands via serial(), and hand the evaluated
// operands together with the scalar to selectAddAssignKernel().
// NOTE(review): MMM, LT and RT are declared outside this excerpt, and listing line 7547 of the
// original body is not visible here.
7543  template< typename MT // Type of the target dense matrix
7544  , bool SO > // Storage order of the target dense matrix
7545  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7546  {
7548 
// The target is expected to already have the dimensions of the expression.
7549  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7550  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7551 
// Operands of the multiplication expression wrapped by this scaled expression.
7552  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7553  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7554 
// Nothing to add if the target has no elements or the shared dimension is zero.
7555  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7556  return;
7557  }
7558 
7559  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7560  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7561 
// Sanity checks: evaluation must not change any dimension, and A/B must match the target.
7562  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7563  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7564  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7565  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7566  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7567  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7568 
// Kernel selection and execution with the evaluated operands and the stored scalar.
7569  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
7570  }
7571  //**********************************************************************************************
7572 
7573  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment: chooses between the small-matrix kernel and the
// BLAS-based kernel.
//   C      - target matrix
//   A      - left-hand side multiplication operand
//   B      - right-hand side multiplication operand
//   scalar - scaling factor
// NOTE(review): the opening of the condition (listing line 7590) is not part of this excerpt, so
// the complete condition cannot be stated from here. The visible alternatives are listed below.
7584  template< typename MT3 // Type of the left-hand side target matrix
7585  , typename MT4 // Type of the left-hand side matrix operand
7586  , typename MT5 // Type of the right-hand side matrix operand
7587  , typename ST2 > // Type of the scalar value
7588  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7589  {
// Visible alternatives that select the small kernel:
//  - outside debug mode, a row-major target with at most SIMDSIZE*10 columns in B,
//  - outside debug mode, a column-major target with at most SIMDSIZE*10 rows in A,
//  - a target with fewer than TDMATDMATMULT_THRESHOLD elements.
7591  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
7592  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
7593  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7594  selectSmallAddAssignKernel( C, A, B, scalar );
// Everything else goes to the BLAS-based kernel selection.
7595  else
7596  selectBlasAddAssignKernel( C, A, B, scalar );
7597  }
7598  //**********************************************************************************************
7599 
7600  //**Default addition assignment to dense matrices (general/general)*****************************
7614  template< typename MT3 // Type of the left-hand side target matrix
7615  , typename MT4 // Type of the left-hand side matrix operand
7616  , typename MT5 // Type of the right-hand side matrix operand
7617  , typename ST2 > // Type of the scalar value
7618  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
7619  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7620  {
7621  const ResultType tmp( serial( A * B * scalar ) );
7622  addAssign( C, tmp );
7623  }
7624  //**********************************************************************************************
7625 
7626  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
7640  template< typename MT3 // Type of the left-hand side target matrix
7641  , typename MT4 // Type of the left-hand side matrix operand
7642  , typename MT5 // Type of the right-hand side matrix operand
7643  , typename ST2 > // Type of the scalar value
7644  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7645  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7646  {
7647  constexpr size_t block( BLOCK_SIZE );
7648 
7649  const size_t M( A.rows() );
7650  const size_t N( B.columns() );
7651 
7652  for( size_t ii=0UL; ii<M; ii+=block ) {
7653  const size_t iend( min( M, ii+block ) );
7654  for( size_t jj=0UL; jj<N; jj+=block ) {
7655  const size_t jend( min( N, jj+block ) );
7656  for( size_t i=ii; i<iend; ++i )
7657  {
7658  const size_t jbegin( ( IsUpper<MT4>::value )
7659  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
7660  :( jj ) );
7661  const size_t jpos( ( IsLower<MT4>::value )
7662  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
7663  :( jend ) );
7664 
7665  for( size_t j=jbegin; j<jpos; ++j ) {
7666  (~C)(i,j) += A(i,j) * B(j,j) * scalar;
7667  }
7668  }
7669  }
7670  }
7671  }
7672  //**********************************************************************************************
7673 
7674  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
7688  template< typename MT3 // Type of the left-hand side target matrix
7689  , typename MT4 // Type of the left-hand side matrix operand
7690  , typename MT5 // Type of the right-hand side matrix operand
7691  , typename ST2 > // Type of the scalar value
7692  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7693  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7694  {
7695  const size_t M( A.rows() );
7696  const size_t N( B.columns() );
7697 
7698  for( size_t j=0UL; j<N; ++j )
7699  {
7700  const size_t ibegin( ( IsLower<MT4>::value )
7701  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
7702  :( 0UL ) );
7703  const size_t iend( ( IsUpper<MT4>::value )
7704  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
7705  :( M ) );
7706  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7707 
7708  const size_t inum( iend - ibegin );
7709  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7710 
7711  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7712  (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
7713  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
7714  }
7715  if( ipos < iend ) {
7716  (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
7717  }
7718  }
7719  }
7720  //**********************************************************************************************
7721 
7722  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
7736  template< typename MT3 // Type of the left-hand side target matrix
7737  , typename MT4 // Type of the left-hand side matrix operand
7738  , typename MT5 // Type of the right-hand side matrix operand
7739  , typename ST2 > // Type of the scalar value
7740  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
7741  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7742  {
7743  const size_t M( A.rows() );
7744  const size_t N( B.columns() );
7745 
7746  for( size_t i=0UL; i<M; ++i )
7747  {
7748  const size_t jbegin( ( IsUpper<MT5>::value )
7749  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
7750  :( 0UL ) );
7751  const size_t jend( ( IsLower<MT5>::value )
7752  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
7753  :( N ) );
7754  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7755 
7756  const size_t jnum( jend - jbegin );
7757  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
7758 
7759  for( size_t j=jbegin; j<jpos; j+=2UL ) {
7760  (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
7761  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
7762  }
7763  if( jpos < jend ) {
7764  (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
7765  }
7766  }
7767  }
7768  //**********************************************************************************************
7769 
7770  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
7784  template< typename MT3 // Type of the left-hand side target matrix
7785  , typename MT4 // Type of the left-hand side matrix operand
7786  , typename MT5 // Type of the right-hand side matrix operand
7787  , typename ST2 > // Type of the scalar value
7788  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
7789  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7790  {
7791  constexpr size_t block( BLOCK_SIZE );
7792 
7793  const size_t M( A.rows() );
7794  const size_t N( B.columns() );
7795 
7796  for( size_t jj=0UL; jj<N; jj+=block ) {
7797  const size_t jend( min( N, jj+block ) );
7798  for( size_t ii=0UL; ii<M; ii+=block ) {
7799  const size_t iend( min( M, ii+block ) );
7800  for( size_t j=jj; j<jend; ++j )
7801  {
7802  const size_t ibegin( ( IsLower<MT5>::value )
7803  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
7804  :( ii ) );
7805  const size_t ipos( ( IsUpper<MT5>::value )
7806  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
7807  :( iend ) );
7808 
7809  for( size_t i=ibegin; i<ipos; ++i ) {
7810  (~C)(i,j) += A(i,i) * B(i,j) * scalar;
7811  }
7812  }
7813  }
7814  }
7815  }
7816  //**********************************************************************************************
7817 
7818  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
7832  template< typename MT3 // Type of the left-hand side target matrix
7833  , typename MT4 // Type of the left-hand side matrix operand
7834  , typename MT5 // Type of the right-hand side matrix operand
7835  , typename ST2 > // Type of the scalar value
7836  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
7837  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7838  {
7839  for( size_t i=0UL; i<A.rows(); ++i ) {
7840  C(i,i) += A(i,i) * B(i,i) * scalar;
7841  }
7842  }
7843  //**********************************************************************************************
7844 
7845  //**Default addition assignment to dense matrices (small matrices)******************************
// Fallback overload of the small-matrix addition assignment kernel.
// It performs no computation of its own: all four arguments are passed on unchanged to
// selectDefaultAddAssignKernel().
//   C      - target matrix the scaled product is added to
//   A      - left-hand side multiplication operand
//   B      - right-hand side multiplication operand
//   scalar - scaling factor
// NOTE(review): the return-type line (listing line 7863) is not part of this excerpt; presumably
// it carries the condition that separates this overload from the vectorized ones - confirm.
7859  template< typename MT3 // Type of the left-hand side target matrix
7860  , typename MT4 // Type of the left-hand side matrix operand
7861  , typename MT5 // Type of the right-hand side matrix operand
7862  , typename ST2 > // Type of the scalar value
7864  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7865  {
// No dedicated small-matrix algorithm in this overload: reuse the default kernel.
7866  selectDefaultAddAssignKernel( C, A, B, scalar );
7867  }
7868  //**********************************************************************************************
7869 
7870  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
7885  template< typename MT3 // Type of the left-hand side target matrix
7886  , typename MT4 // Type of the left-hand side matrix operand
7887  , typename MT5 // Type of the right-hand side matrix operand
7888  , typename ST2 > // Type of the scalar value
7890  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7891  {
7892  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
7893 
7894  const size_t M( A.rows() );
7895  const size_t N( B.columns() );
7896  const size_t K( A.columns() );
7897 
7898  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7899 
7900  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
7901  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
7902 
7903  const SIMDType factor( set( scalar ) );
7904 
7905  size_t j( 0UL );
7906 
7908  {
7909  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7910  for( size_t i=0UL; i<M; ++i )
7911  {
7912  const size_t kbegin( ( IsUpper<MT4>::value )
7913  ?( ( IsLower<MT5>::value )
7914  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7915  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7916  :( IsLower<MT5>::value ? j : 0UL ) );
7917  const size_t kend( ( IsLower<MT4>::value )
7918  ?( ( IsUpper<MT5>::value )
7919  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
7920  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
7921  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
7922 
7923  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7924 
7925  for( size_t k=kbegin; k<kend; ++k ) {
7926  const SIMDType a1( set( A(i,k) ) );
7927  xmm1 += a1 * B.load(k,j );
7928  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7929  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7930  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7931  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7932  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7933  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7934  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
7935  }
7936 
7937  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
7938  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
7939  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
7940  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
7941  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
7942  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
7943  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
7944  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
7945  }
7946  }
7947  }
7948 
7949  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7950  {
7951  size_t i( 0UL );
7952 
7953  for( ; (i+2UL) <= M; i+=2UL )
7954  {
7955  const size_t kbegin( ( IsUpper<MT4>::value )
7956  ?( ( IsLower<MT5>::value )
7957  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7958  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7959  :( IsLower<MT5>::value ? j : 0UL ) );
7960  const size_t kend( ( IsLower<MT4>::value )
7961  ?( ( IsUpper<MT5>::value )
7962  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
7963  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
7964  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
7965 
7966  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7967 
7968  for( size_t k=kbegin; k<kend; ++k ) {
7969  const SIMDType a1( set( A(i ,k) ) );
7970  const SIMDType a2( set( A(i+1UL,k) ) );
7971  const SIMDType b1( B.load(k,j ) );
7972  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7973  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7974  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7975  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7976  xmm1 += a1 * b1;
7977  xmm2 += a1 * b2;
7978  xmm3 += a1 * b3;
7979  xmm4 += a1 * b4;
7980  xmm5 += a1 * b5;
7981  xmm6 += a2 * b1;
7982  xmm7 += a2 * b2;
7983  xmm8 += a2 * b3;
7984  xmm9 += a2 * b4;
7985  xmm10 += a2 * b5;
7986  }
7987 
7988  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7989  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
7990  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
7991  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
7992  (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
7993  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm6 * factor );
7994  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
7995  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
7996  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
7997  (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
7998  }
7999 
8000  if( i < M )
8001  {
8002  const size_t kbegin( ( IsUpper<MT4>::value )
8003  ?( ( IsLower<MT5>::value )
8004  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8005  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8006  :( IsLower<MT5>::value ? j : 0UL ) );
8007  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
8008 
8009  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8010 
8011  for( size_t k=kbegin; k<kend; ++k ) {
8012  const SIMDType a1( set( A(i,k) ) );
8013  xmm1 += a1 * B.load(k,j );
8014  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8015  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8016  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8017  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
8018  }
8019 
8020  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8021  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
8022  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
8023  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
8024  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
8025  }
8026  }
8027 
8028  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
8029  {
8030  size_t i( 0UL );
8031 
8032  for( ; (i+2UL) <= M; i+=2UL )
8033  {
8034  const size_t kbegin( ( IsUpper<MT4>::value )
8035  ?( ( IsLower<MT5>::value )
8036  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8037  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8038  :( IsLower<MT5>::value ? j : 0UL ) );
8039  const size_t kend( ( IsLower<MT4>::value )
8040  ?( ( IsUpper<MT5>::value )
8041  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
8042  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8043  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
8044 
8045  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8046 
8047  for( size_t k=kbegin; k<kend; ++k ) {
8048  const SIMDType a1( set( A(i ,k) ) );
8049  const SIMDType a2( set( A(i+1UL,k) ) );
8050  const SIMDType b1( B.load(k,j ) );
8051  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8052  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8053  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
8054  xmm1 += a1 * b1;
8055  xmm2 += a1 * b2;
8056  xmm3 += a1 * b3;
8057  xmm4 += a1 * b4;
8058  xmm5 += a2 * b1;
8059  xmm6 += a2 * b2;
8060  xmm7 += a2 * b3;
8061  xmm8 += a2 * b4;
8062  }
8063 
8064  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8065  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
8066  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
8067  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
8068  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8069  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
8070  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
8071  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
8072  }
8073 
8074  if( i < M )
8075  {
8076  const size_t kbegin( ( IsUpper<MT4>::value )
8077  ?( ( IsLower<MT5>::value )
8078  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8079  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8080  :( IsLower<MT5>::value ? j : 0UL ) );
8081  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
8082 
8083  SIMDType xmm1, xmm2, xmm3, xmm4;
8084 
8085  for( size_t k=kbegin; k<kend; ++k ) {
8086  const SIMDType a1( set( A(i,k) ) );
8087  xmm1 += a1 * B.load(k,j );
8088  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8089  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8090  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8091  }
8092 
8093  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8094  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
8095  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
8096  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
8097  }
8098  }
8099 
8100  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8101  {
8102  size_t i( 0UL );
8103 
8104  for( ; (i+2UL) <= M; i+=2UL )
8105  {
8106  const size_t kbegin( ( IsUpper<MT4>::value )
8107  ?( ( IsLower<MT5>::value )
8108  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8109  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8110  :( IsLower<MT5>::value ? j : 0UL ) );
8111  const size_t kend( ( IsLower<MT4>::value )
8112  ?( ( IsUpper<MT5>::value )
8113  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
8114  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8115  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
8116 
8117  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8118 
8119  for( size_t k=kbegin; k<kend; ++k ) {
8120  const SIMDType a1( set( A(i ,k) ) );
8121  const SIMDType a2( set( A(i+1UL,k) ) );
8122  const SIMDType b1( B.load(k,j ) );
8123  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8124  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8125  xmm1 += a1 * b1;
8126  xmm2 += a1 * b2;
8127  xmm3 += a1 * b3;
8128  xmm4 += a2 * b1;
8129  xmm5 += a2 * b2;
8130  xmm6 += a2 * b3;
8131  }
8132 
8133  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8134  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
8135  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
8136  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm4 * factor );
8137  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
8138  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
8139  }
8140 
8141  if( i < M )
8142  {
8143  const size_t kbegin( ( IsUpper<MT4>::value )
8144  ?( ( IsLower<MT5>::value )
8145  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8146  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8147  :( IsLower<MT5>::value ? j : 0UL ) );
8148  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
8149 
8150  SIMDType xmm1, xmm2, xmm3;
8151 
8152  for( size_t k=kbegin; k<kend; ++k ) {
8153  const SIMDType a1( set( A(i,k) ) );
8154  xmm1 += a1 * B.load(k,j );
8155  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8156  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8157  }
8158 
8159  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8160  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
8161  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
8162  }
8163  }
8164 
8165  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8166  {
8167  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
8168  size_t i( LOW ? j : 0UL );
8169 
8170  for( ; (i+2UL) <= iend; i+=2UL )
8171  {
8172  const size_t kbegin( ( IsUpper<MT4>::value )
8173  ?( ( IsLower<MT5>::value )
8174  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8175  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8176  :( IsLower<MT5>::value ? j : 0UL ) );
8177  const size_t kend( ( IsLower<MT4>::value )
8178  ?( ( IsUpper<MT5>::value )
8179  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
8180  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8181  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
8182 
8183  SIMDType xmm1, xmm2, xmm3, xmm4;
8184 
8185  for( size_t k=kbegin; k<kend; ++k ) {
8186  const SIMDType a1( set( A(i ,k) ) );
8187  const SIMDType a2( set( A(i+1UL,k) ) );
8188  const SIMDType b1( B.load(k,j ) );
8189  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8190  xmm1 += a1 * b1;
8191  xmm2 += a1 * b2;
8192  xmm3 += a2 * b1;
8193  xmm4 += a2 * b2;
8194  }
8195 
8196  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8197  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
8198  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8199  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
8200  }
8201 
8202  if( i < iend )
8203  {
8204  const size_t kbegin( ( IsUpper<MT4>::value )
8205  ?( ( IsLower<MT5>::value )
8206  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8207  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8208  :( IsLower<MT5>::value ? j : 0UL ) );
8209  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8210 
8211  SIMDType xmm1, xmm2;
8212 
8213  for( size_t k=kbegin; k<kend; ++k ) {
8214  const SIMDType a1( set( A(i,k) ) );
8215  xmm1 += a1 * B.load(k,j );
8216  xmm2 += a1 * B.load(k,j+SIMDSIZE);
8217  }
8218 
8219  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8220  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + xmm2 * factor );
8221  }
8222  }
8223 
8224  for( ; j<jpos; j+=SIMDSIZE )
8225  {
8226  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
8227  size_t i( LOW ? j : 0UL );
8228 
8229  for( ; (i+2UL) <= iend; i+=2UL )
8230  {
8231  const size_t kbegin( ( IsUpper<MT4>::value )
8232  ?( ( IsLower<MT5>::value )
8233  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8234  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8235  :( IsLower<MT5>::value ? j : 0UL ) );
8236  const size_t kend( ( IsLower<MT4>::value )
8237  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8238  :( K ) );
8239 
8240  SIMDType xmm1, xmm2;
8241 
8242  for( size_t k=kbegin; k<kend; ++k ) {
8243  const SIMDType b1( B.load(k,j) );
8244  xmm1 += set( A(i ,k) ) * b1;
8245  xmm2 += set( A(i+1UL,k) ) * b1;
8246  }
8247 
8248  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8249  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
8250  }
8251 
8252  if( i < iend )
8253  {
8254  const size_t kbegin( ( IsUpper<MT4>::value )
8255  ?( ( IsLower<MT5>::value )
8256  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8257  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8258  :( IsLower<MT5>::value ? j : 0UL ) );
8259 
8260  SIMDType xmm1;
8261 
8262  for( size_t k=kbegin; k<K; ++k ) {
8263  xmm1 += set( A(i,k) ) * B.load(k,j);
8264  }
8265 
8266  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
8267  }
8268  }
8269 
8270  for( ; remainder && j<N; ++j )
8271  {
8272  const size_t iend( UPP ? j+1UL : M );
8273  size_t i( LOW ? j : 0UL );
8274 
8275  for( ; (i+2UL) <= iend; i+=2UL )
8276  {
8277  const size_t kbegin( ( IsUpper<MT4>::value )
8278  ?( ( IsLower<MT5>::value )
8279  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8280  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8281  :( IsLower<MT5>::value ? j : 0UL ) );
8282  const size_t kend( ( IsLower<MT4>::value )
8283  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8284  :( K ) );
8285 
8286  ElementType value1 = ElementType();
8287  ElementType value2 = ElementType();
8288 
8289  for( size_t k=kbegin; k<kend; ++k ) {
8290  value1 += A(i ,k) * B(k,j);
8291  value2 += A(i+1UL,k) * B(k,j);
8292  }
8293 
8294  (~C)(i ,j) += value1 * scalar;
8295  (~C)(i+1UL,j) += value2 * scalar;
8296  }
8297 
8298  if( i < iend )
8299  {
8300  const size_t kbegin( ( IsUpper<MT4>::value )
8301  ?( ( IsLower<MT5>::value )
8302  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8303  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8304  :( IsLower<MT5>::value ? j : 0UL ) );
8305 
8306  ElementType value = ElementType();
8307 
8308  for( size_t k=kbegin; k<K; ++k ) {
8309  value += A(i,k) * B(k,j);
8310  }
8311 
8312  (~C)(i,j) += value * scalar;
8313  }
8314  }
8315  }
8316  //**********************************************************************************************
8317 
8318  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
      /*!\brief Vectorized small-matrix kernel for the scaled addition assignment to a column-major
      //        dense matrix.
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      //
      // For every processed element the kernel accumulates the products A(i,k)*B(k,j) over k,
      // multiplies the accumulated value by \a scalar and adds the result to the current content
      // of C. The rows of C are traversed in panels of 8, 5, 4, 3, 2 and 1 SIMD vectors (in that
      // order); A is read with SIMD loads along the row index and the elements of B are broadcast
      // via set(). Rows beyond \c ipos are handled by a final scalar loop. In all tiers the
      // k-range [kbegin,kend) is narrowed according to the IsLower/IsUpper/IsStrictly* traits of
      // the two operands.
      //
      // NOTE(review): the accumulators (xmm1, ...) are default-constructed before being used with
      // operator+=; this assumes SIMDType default-constructs to zero -- confirm.
      // NOTE(review): LOW and UPP are not declared in this function; presumably they are flags of
      // the enclosing expression class describing a lower/upper result -- confirm.
      // NOTE(review): listing line 8337 (the line between the template header and the function
      // name, presumably the return type / enable-if condition) is absent from this listing.
      */
8333  template< typename MT3 // Type of the left-hand side target matrix
8334  , typename MT4 // Type of the left-hand side matrix operand
8335  , typename MT5 // Type of the right-hand side matrix operand
8336  , typename ST2 > // Type of the scalar value
8338  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8339  {
      // A scalar remainder loop is required unless both C and A are padded
8340  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
8341 
8342  const size_t M( A.rows() );
8343  const size_t N( B.columns() );
8344  const size_t K( A.columns() );
8345 
8346  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
8347 
      // End of the row range handled by the SIMD tiers: M rounded down to a multiple of SIMDSIZE
      // if a remainder loop is needed, M otherwise
8348  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
8349  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
8350 
      // The scaling factor broadcast to all SIMD lanes
8351  const SIMDType factor( set( scalar ) );
8352 
8353  size_t i( 0UL );
8354 
      // Tier 1: panels of 8 SIMD vectors, one column of C at a time (only if neither LOW nor UPP).
      // NOTE(review): listing line 8355 is absent; the extra pair of braces suggests that a
      // guarding statement preceded this block -- confirm against the original header.
8356  {
8357  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8358  for( size_t j=0UL; j<N; ++j )
8359  {
8360  const size_t kbegin( ( IsLower<MT5>::value )
8361  ?( ( IsUpper<MT4>::value )
8362  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8363  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8364  :( IsUpper<MT4>::value ? i : 0UL ) );
8365  const size_t kend( ( IsUpper<MT5>::value )
8366  ?( ( IsLower<MT4>::value )
8367  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
8368  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
8369  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
8370 
8371  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8372 
8373  for( size_t k=kbegin; k<kend; ++k ) {
8374  const SIMDType b1( set( B(k,j) ) );
8375  xmm1 += A.load(i ,k) * b1;
8376  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8377  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8378  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8379  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8380  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
8381  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
8382  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
8383  }
8384 
8385  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8386  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8387  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8388  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
8389  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
8390  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
8391  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
8392  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
8393  }
8394  }
8395  }
8396 
      // Tier 2: panels of 5 SIMD vectors, two columns of C at a time plus a single trailing column
8397  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
8398  {
8399  size_t j( 0UL );
8400 
8401  for( ; (j+2UL) <= N; j+=2UL )
8402  {
8403  const size_t kbegin( ( IsLower<MT5>::value )
8404  ?( ( IsUpper<MT4>::value )
8405  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8406  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8407  :( IsUpper<MT4>::value ? i : 0UL ) );
8408  const size_t kend( ( IsUpper<MT5>::value )
8409  ?( ( IsLower<MT4>::value )
8410  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8411  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8412  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
8413 
8414  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8415 
8416  for( size_t k=kbegin; k<kend; ++k ) {
8417  const SIMDType a1( A.load(i ,k) );
8418  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8419  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8420  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8421  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
8422  const SIMDType b1( set( B(k,j ) ) );
8423  const SIMDType b2( set( B(k,j+1UL) ) );
8424  xmm1 += a1 * b1;
8425  xmm2 += a2 * b1;
8426  xmm3 += a3 * b1;
8427  xmm4 += a4 * b1;
8428  xmm5 += a5 * b1;
8429  xmm6 += a1 * b2;
8430  xmm7 += a2 * b2;
8431  xmm8 += a3 * b2;
8432  xmm9 += a4 * b2;
8433  xmm10 += a5 * b2;
8434  }
8435 
8436  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8437  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
8438  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
8439  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
8440  (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
8441  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
8442  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
8443  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
8444  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
8445  (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
8446  }
8447 
8448  if( j < N )
8449  {
8450  const size_t kbegin( ( IsLower<MT5>::value )
8451  ?( ( IsUpper<MT4>::value )
8452  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8453  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8454  :( IsUpper<MT4>::value ? i : 0UL ) );
8455  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
8456 
8457  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8458 
8459  for( size_t k=kbegin; k<kend; ++k ) {
8460  const SIMDType b1( set( B(k,j) ) );
8461  xmm1 += A.load(i ,k) * b1;
8462  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8463  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8464  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8465  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8466  }
8467 
8468  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8469  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8470  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8471  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
8472  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
8473  }
8474  }
8475 
      // Tier 3: panels of 4 SIMD vectors, two columns of C at a time plus a single trailing column
8476  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
8477  {
8478  size_t j( 0UL );
8479 
8480  for( ; (j+2UL) <= N; j+=2UL )
8481  {
8482  const size_t kbegin( ( IsLower<MT5>::value )
8483  ?( ( IsUpper<MT4>::value )
8484  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8485  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8486  :( IsUpper<MT4>::value ? i : 0UL ) );
8487  const size_t kend( ( IsUpper<MT5>::value )
8488  ?( ( IsLower<MT4>::value )
8489  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8490  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8491  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
8492 
8493  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8494 
8495  for( size_t k=kbegin; k<kend; ++k ) {
8496  const SIMDType a1( A.load(i ,k) );
8497  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8498  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8499  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8500  const SIMDType b1( set( B(k,j ) ) );
8501  const SIMDType b2( set( B(k,j+1UL) ) );
8502  xmm1 += a1 * b1;
8503  xmm2 += a2 * b1;
8504  xmm3 += a3 * b1;
8505  xmm4 += a4 * b1;
8506  xmm5 += a1 * b2;
8507  xmm6 += a2 * b2;
8508  xmm7 += a3 * b2;
8509  xmm8 += a4 * b2;
8510  }
8511 
8512  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8513  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
8514  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
8515  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
8516  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
8517  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
8518  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
8519  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
8520  }
8521 
8522  if( j < N )
8523  {
8524  const size_t kbegin( ( IsLower<MT5>::value )
8525  ?( ( IsUpper<MT4>::value )
8526  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8527  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8528  :( IsUpper<MT4>::value ? i : 0UL ) );
8529  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
8530 
8531  SIMDType xmm1, xmm2, xmm3, xmm4;
8532 
8533  for( size_t k=kbegin; k<kend; ++k ) {
8534  const SIMDType b1( set( B(k,j) ) );
8535  xmm1 += A.load(i ,k) * b1;
8536  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8537  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8538  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8539  }
8540 
8541  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8542  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8543  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8544  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
8545  }
8546  }
8547 
      // Tier 4: panels of 3 SIMD vectors, two columns of C at a time plus a single trailing column
8548  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
8549  {
8550  size_t j( 0UL );
8551 
8552  for( ; (j+2UL) <= N; j+=2UL )
8553  {
8554  const size_t kbegin( ( IsLower<MT5>::value )
8555  ?( ( IsUpper<MT4>::value )
8556  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8557  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8558  :( IsUpper<MT4>::value ? i : 0UL ) );
8559  const size_t kend( ( IsUpper<MT5>::value )
8560  ?( ( IsLower<MT4>::value )
8561  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8562  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8563  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
8564 
8565  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8566 
8567  for( size_t k=kbegin; k<kend; ++k ) {
8568  const SIMDType a1( A.load(i ,k) );
8569  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8570  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8571  const SIMDType b1( set( B(k,j ) ) );
8572  const SIMDType b2( set( B(k,j+1UL) ) );
8573  xmm1 += a1 * b1;
8574  xmm2 += a2 * b1;
8575  xmm3 += a3 * b1;
8576  xmm4 += a1 * b2;
8577  xmm5 += a2 * b2;
8578  xmm6 += a3 * b2;
8579  }
8580 
8581  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8582  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
8583  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
8584  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
8585  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
8586  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
8587  }
8588 
8589  if( j < N )
8590  {
8591  const size_t kbegin( ( IsLower<MT5>::value )
8592  ?( ( IsUpper<MT4>::value )
8593  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8594  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8595  :( IsUpper<MT4>::value ? i : 0UL ) );
8596  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
8597 
8598  SIMDType xmm1, xmm2, xmm3;
8599 
8600  for( size_t k=kbegin; k<kend; ++k ) {
8601  const SIMDType b1( set( B(k,j) ) );
8602  xmm1 += A.load(i ,k) * b1;
8603  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8604  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8605  }
8606 
8607  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8608  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8609  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8610  }
8611  }
8612 
      // Tier 5: panels of 2 SIMD vectors; skipped only if both LOW and UPP are set. The column
      // range [j,jend) is restricted depending on LOW and UPP.
8613  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
8614  {
8615  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
8616  size_t j( UPP ? i : 0UL );
8617 
8618  for( ; (j+2UL) <= jend; j+=2UL )
8619  {
8620  const size_t kbegin( ( IsLower<MT5>::value )
8621  ?( ( IsUpper<MT4>::value )
8622  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8623  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8624  :( IsUpper<MT4>::value ? i : 0UL ) );
8625  const size_t kend( ( IsUpper<MT5>::value )
8626  ?( ( IsLower<MT4>::value )
8627  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8628  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8629  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
8630 
8631  SIMDType xmm1, xmm2, xmm3, xmm4;
8632 
8633  for( size_t k=kbegin; k<kend; ++k ) {
8634  const SIMDType a1( A.load(i ,k) );
8635  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8636  const SIMDType b1( set( B(k,j ) ) );
8637  const SIMDType b2( set( B(k,j+1UL) ) );
8638  xmm1 += a1 * b1;
8639  xmm2 += a2 * b1;
8640  xmm3 += a1 * b2;
8641  xmm4 += a2 * b2;
8642  }
8643 
8644  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8645  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
8646  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
8647  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
8648  }
8649 
8650  if( j < jend )
8651  {
8652  const size_t kbegin( ( IsLower<MT5>::value )
8653  ?( ( IsUpper<MT4>::value )
8654  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8655  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8656  :( IsUpper<MT4>::value ? i : 0UL ) );
8657  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
8658 
8659  SIMDType xmm1, xmm2;
8660 
8661  for( size_t k=kbegin; k<kend; ++k ) {
8662  const SIMDType b1( set( B(k,j) ) );
8663  xmm1 += A.load(i ,k) * b1;
8664  xmm2 += A.load(i+SIMDSIZE,k) * b1;
8665  }
8666 
8667  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8668  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + xmm2 * factor );
8669  }
8670  }
8671 
      // Tier 6: single SIMD vectors up to ipos
8672  for( ; i<ipos; i+=SIMDSIZE )
8673  {
8674  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
8675  size_t j( UPP ? i : 0UL );
8676 
8677  for( ; (j+2UL) <= jend; j+=2UL )
8678  {
8679  const size_t kbegin( ( IsLower<MT5>::value )
8680  ?( ( IsUpper<MT4>::value )
8681  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8682  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8683  :( IsUpper<MT4>::value ? i : 0UL ) );
8684  const size_t kend( ( IsUpper<MT5>::value )
8685  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8686  :( K ) );
8687 
8688  SIMDType xmm1, xmm2;
8689 
8690  for( size_t k=kbegin; k<kend; ++k ) {
8691  const SIMDType a1( A.load(i,k) );
8692  xmm1 += a1 * set( B(k,j ) );
8693  xmm2 += a1 * set( B(k,j+1UL) );
8694  }
8695 
8696  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8697  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
8698  }
8699 
8700  if( j < jend )
8701  {
8702  const size_t kbegin( ( IsLower<MT5>::value )
8703  ?( ( IsUpper<MT4>::value )
8704  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8705  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8706  :( IsUpper<MT4>::value ? i : 0UL ) );
8707 
8708  SIMDType xmm1;
8709 
8710  for( size_t k=kbegin; k<K; ++k ) {
8711  xmm1 += A.load(i,k) * set( B(k,j) );
8712  }
8713 
8714  (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
8715  }
8716  }
8717 
      // Scalar remainder: rows in [ipos,M) are processed element-wise (only if 'remainder' is set)
8718  for( ; remainder && i<M; ++i )
8719  {
8720  const size_t jend( LOW ? i+1UL : N );
8721  size_t j( UPP ? i : 0UL );
8722 
8723  for( ; (j+2UL) <= jend; j+=2UL )
8724  {
8725  const size_t kbegin( ( IsLower<MT5>::value )
8726  ?( ( IsUpper<MT4>::value )
8727  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8728  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8729  :( IsUpper<MT4>::value ? i : 0UL ) );
8730  const size_t kend( ( IsUpper<MT5>::value )
8731  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8732  :( K ) );
8733 
8734  ElementType value1 = ElementType();
8735  ElementType value2 = ElementType();
8736 
8737  for( size_t k=kbegin; k<kend; ++k ) {
8738  value1 += A(i,k) * B(k,j );
8739  value2 += A(i,k) * B(k,j+1UL);
8740  }
8741 
8742  (~C)(i,j ) += value1 * scalar;
8743  (~C)(i,j+1UL) += value2 * scalar;
8744  }
8745 
8746  if( j < jend )
8747  {
8748  const size_t kbegin( ( IsLower<MT5>::value )
8749  ?( ( IsUpper<MT4>::value )
8750  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8751  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8752  :( IsUpper<MT4>::value ? i : 0UL ) );
8753 
8754  ElementType value = ElementType();
8755 
8756  for( size_t k=kbegin; k<K; ++k ) {
8757  value += A(i,k) * B(k,j);
8758  }
8759 
8760  (~C)(i,j) += value * scalar;
8761  }
8762  }
8763  }
8764  //**********************************************************************************************
8765 
8766  //**Default addition assignment to dense matrices (large matrices)******************************
      /*!\brief Default large-matrix kernel for the scaled addition assignment.
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      //
      // This overload has no dedicated large-matrix implementation: it forwards all four
      // arguments unchanged to selectDefaultAddAssignKernel().
      //
      // NOTE(review): listing line 8784 (presumably the return type / overload-selection
      // condition) is absent from this listing -- confirm against the original header.
      */
8780  template< typename MT3 // Type of the left-hand side target matrix
8781  , typename MT4 // Type of the left-hand side matrix operand
8782  , typename MT5 // Type of the right-hand side matrix operand
8783  , typename ST2 > // Type of the scalar value
8785  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8786  {
8787  selectDefaultAddAssignKernel( C, A, B, scalar );
8788  }
8789  //**********************************************************************************************
8790 
8791  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
      /*!\brief Vectorized large-matrix kernel for the scaled addition assignment.
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      //
      // Dispatches to one of the three matrix multiplication routines: lmmm() if LOW is set,
      // otherwise ummm() if UPP is set, otherwise mmm(). In all cases \a scalar and ST2(1) are
      // passed as the two trailing arguments.
      //
      // NOTE(review): the trailing arguments presumably act as the factors for the product and
      // for the existing content of C, respectively (i.e. C = scalar*A*B + 1*C) -- confirm
      // against <blaze/math/dense/MMM.h>.
      // NOTE(review): listing line 8810 (presumably the return type / overload-selection
      // condition) is absent from this listing.
      */
8806  template< typename MT3 // Type of the left-hand side target matrix
8807  , typename MT4 // Type of the left-hand side matrix operand
8808  , typename MT5 // Type of the right-hand side matrix operand
8809  , typename ST2 > // Type of the scalar value
8811  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8812  {
8813  if( LOW )
8814  lmmm( C, A, B, scalar, ST2(1) );
8815  else if( UPP )
8816  ummm( C, A, B, scalar, ST2(1) );
8817  else
8818  mmm( C, A, B, scalar, ST2(1) );
8819  }
8820  //**********************************************************************************************
8821 
8822  //**BLAS-based addition assignment to dense matrices (default)**********************************
      /*!\brief Default (non-BLAS) fallback for the BLAS-based scaled addition assignment.
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      //
      // This overload does not call any BLAS routine: it forwards all four arguments unchanged
      // to selectLargeAddAssignKernel().
      //
      // NOTE(review): listing line 8840 (presumably the return type / overload-selection
      // condition) is absent from this listing -- confirm against the original header.
      */
8836  template< typename MT3 // Type of the left-hand side target matrix
8837  , typename MT4 // Type of the left-hand side matrix operand
8838  , typename MT5 // Type of the right-hand side matrix operand
8839  , typename ST2 > // Type of the scalar value
8841  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8842  {
8843  selectLargeAddAssignKernel( C, A, B, scalar );
8844  }
8845  //**********************************************************************************************
8846 
8847  //**BLAS-based addition assignment to dense matrices********************************************
      // The following overload is only compiled if both BLAZE_BLAS_MODE and
      // BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to non-zero.
8848 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
8849 
      /*!\brief BLAS-based kernel for the scaled addition assignment.
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      //
      // Three cases are distinguished:
      //  - A is triangular: a temporary is created from B, trmm() is called on it with A as
      //    left-side (CblasLeft) operand, and the temporary is added to C via addAssign().
      //  - otherwise, B is triangular: a temporary is created from A, trmm() is called on it with
      //    B as right-side (CblasRight) operand, and the temporary is added to C.
      //  - otherwise: gemm() is called directly on C with the factors ET(scalar) and ET(1).
      // The scalar is converted to the element type of C (ET) before it is passed on.
      //
      // NOTE(review): trmm() presumably overwrites the temporary with the scaled triangular
      // product, and gemm() presumably computes C = scalar*A*B + 1*C -- confirm against
      // <blaze/math/blas/trmm.h> and <blaze/math/blas/gemm.h>.
      // NOTE(review): listing line 8866 (presumably the return type / overload-selection
      // condition) is absent from this listing.
      */
8862  template< typename MT3 // Type of the left-hand side target matrix
8863  , typename MT4 // Type of the left-hand side matrix operand
8864  , typename MT5 // Type of the right-hand side matrix operand
8865  , typename ST2 > // Type of the scalar value
8867  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8868  {
8869  typedef ElementType_<MT3> ET;
8870 
8871  if( IsTriangular<MT4>::value ) {
8872  ResultType_<MT3> tmp( serial( B ) );
8873  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8874  addAssign( C, tmp );
8875  }
8876  else if( IsTriangular<MT5>::value ) {
8877  ResultType_<MT3> tmp( serial( A ) );
8878  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8879  addAssign( C, tmp );
8880  }
8881  else {
8882  gemm( C, A, B, ET(scalar), ET(1) );
8883  }
8884  }
8885 #endif
8886  //**********************************************************************************************
8887 
8888  //**Addition assignment to sparse matrices******************************************************
8889  // No special implementation for the addition assignment to sparse matrices.
8890  //**********************************************************************************************
8891 
8892  //**Subtraction assignment to dense matrices****************************************************
      /*!\brief Subtraction assignment of a scaled matrix product to a dense matrix.
      //
      // \param lhs The target left-hand side dense matrix.
      // \param rhs The right-hand side scaled multiplication expression to be subtracted.
      // \return void
      //
      // The function extracts the two operands of the wrapped multiplication expression
      // (rhs.matrix_), returns immediately if the target has no rows or no columns or if the
      // left operand has no columns, evaluates both operands via serial() into objects of type
      // LT and RT, and finally dispatches to selectSubAssignKernel() together with rhs.scalar_.
      //
      // NOTE(review): LT, RT and MMM are declared outside this function; presumably they are the
      // evaluation types of the operands and the type of the wrapped product -- confirm.
      // NOTE(review): listing line 8908 (first statement of the body) is absent from this listing.
      */
8904  template< typename MT // Type of the target dense matrix
8905  , bool SO > // Storage order of the target dense matrix
8906  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8907  {
8909 
8910  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8911  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8912 
8913  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8914  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8915 
      // Nothing to subtract for empty targets or an empty inner dimension
8916  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8917  return;
8918  }
8919 
8920  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
8921  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
8922 
8923  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8924  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8925  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8926  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8927  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8928  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8929 
8930  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
8931  }
8932  //**********************************************************************************************
8933 
8934  //**Subtraction assignment to dense matrices (kernel selection)*********************************
      /*!\brief Selection of the kernel for the scaled subtraction assignment to a dense matrix.
      //
      // \param C The target left-hand side dense matrix.
      // \param A The left-hand side multiplication operand.
      // \param B The right-hand side multiplication operand.
      // \param scalar The scaling factor.
      // \return void
      //
      // Among the visible conditions, the small kernel (selectSmallSubAssignKernel()) is chosen if
      //  - debug mode is off, C is row-major and B has at most SIMDSIZE*10 columns, or
      //  - debug mode is off, C is column-major and A has at most SIMDSIZE*10 rows, or
      //  - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
      // Otherwise selectBlasSubAssignKernel() is called.
      //
      // NOTE(review): listing line 8951 (the opening of the if-condition, possibly containing a
      // further alternative) is absent from this listing -- confirm against the original header.
      */
8945  template< typename MT3 // Type of the left-hand side target matrix
8946  , typename MT4 // Type of the left-hand side matrix operand
8947  , typename MT5 // Type of the right-hand side matrix operand
8948  , typename ST2 > // Type of the scalar value
8949  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8950  {
8952  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
8953  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
8954  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
8955  selectSmallSubAssignKernel( C, A, B, scalar );
8956  else
8957  selectBlasSubAssignKernel( C, A, B, scalar );
8958  }
8959  //**********************************************************************************************
8960 
8961  //**Default subtraction assignment to dense matrices********************************************
8975  template< typename MT3 // Type of the left-hand side target matrix
8976  , typename MT4 // Type of the left-hand side matrix operand
8977  , typename MT5 // Type of the right-hand side matrix operand
8978  , typename ST2 > // Type of the scalar value
8979  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
8980  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8981  {
8982  const ResultType tmp( serial( A * B * scalar ) );
8983  subAssign( C, tmp );
8984  }
8985  //**********************************************************************************************
8986 
8987  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
9001  template< typename MT3 // Type of the left-hand side target matrix
9002  , typename MT4 // Type of the left-hand side matrix operand
9003  , typename MT5 // Type of the right-hand side matrix operand
9004  , typename ST2 > // Type of the scalar value
9005  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
9006  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9007  {
9008  constexpr size_t block( BLOCK_SIZE );
9009 
9010  const size_t M( A.rows() );
9011  const size_t N( B.columns() );
9012 
9013  for( size_t ii=0UL; ii<M; ii+=block ) {
9014  const size_t iend( min( M, ii+block ) );
9015  for( size_t jj=0UL; jj<N; jj+=block ) {
9016  const size_t jend( min( N, jj+block ) );
9017  for( size_t i=ii; i<iend; ++i )
9018  {
9019  const size_t jbegin( ( IsUpper<MT4>::value )
9020  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
9021  :( jj ) );
9022  const size_t jpos( ( IsLower<MT4>::value )
9023  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
9024  :( jend ) );
9025 
9026  for( size_t j=jbegin; j<jpos; ++j ) {
9027  (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
9028  }
9029  }
9030  }
9031  }
9032  }
9033  //**********************************************************************************************
9034 
9035  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
// Default subtraction assignment kernel for a column-major target when only the right-hand
// side operand B is diagonal (enabled for Not<IsDiagonal<MT4>> and IsDiagonal<MT5>).
//
// Performs C(i,j) -= A(i,j) * B(j,j) * scalar for every visited element; since only the
// diagonal of B is read, each column j of A is scaled by the single element B(j,j).
//
// \param C The target left-hand side dense matrix (accessed via the ~ operator).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
9049  template< typename MT3 // Type of the left-hand side target matrix
9050  , typename MT4 // Type of the left-hand side matrix operand
9051  , typename MT5 // Type of the right-hand side matrix operand
9052  , typename ST2 > // Type of the scalar value
9053  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
9054  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9055  {
9056  const size_t M( A.rows() );
9057  const size_t N( B.columns() );
9058 
9059  for( size_t j=0UL; j<N; ++j )
9060  {
      // Restrict the row range of column j according to the compile-time structure of A:
      // a lower A starts at the diagonal (one below it if strictly lower), an upper A ends
      // at the diagonal (one above it if strictly upper).
9061  const size_t ibegin( ( IsLower<MT4>::value )
9062  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
9063  :( 0UL ) );
9064  const size_t iend( ( IsUpper<MT4>::value )
9065  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
9066  :( M ) );
9067  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
9068 
      // ipos is ibegin plus the row count rounded down to an even number (mask with ~1),
      // so the loop below can safely handle two rows per iteration.
9069  const size_t inum( iend - ibegin );
9070  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
9071 
9072  for( size_t i=ibegin; i<ipos; i+=2UL ) {
9073  (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
9074  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
9075  }
      // Odd number of rows: update the single remaining row.
9076  if( ipos < iend ) {
9077  (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
9078  }
9079  }
9080  }
9081  //**********************************************************************************************
9082 
9083  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
// Default subtraction assignment kernel for a row-major target when only the left-hand
// side operand A is diagonal (enabled for IsDiagonal<MT4> and Not<IsDiagonal<MT5>>).
//
// Performs C(i,j) -= A(i,i) * B(i,j) * scalar for every visited element; since only the
// diagonal of A is read, each row i of B is scaled by the single element A(i,i).
//
// \param C The target left-hand side dense matrix (accessed via the ~ operator).
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
9097  template< typename MT3 // Type of the left-hand side target matrix
9098  , typename MT4 // Type of the left-hand side matrix operand
9099  , typename MT5 // Type of the right-hand side matrix operand
9100  , typename ST2 > // Type of the scalar value
9101  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
9102  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9103  {
9104  const size_t M( A.rows() );
9105  const size_t N( B.columns() );
9106 
9107  for( size_t i=0UL; i<M; ++i )
9108  {
      // Restrict the column range of row i according to the compile-time structure of B:
      // an upper B starts at the diagonal (one past it if strictly upper), a lower B ends
      // at the diagonal (one before it if strictly lower).
9109  const size_t jbegin( ( IsUpper<MT5>::value )
9110  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
9111  :( 0UL ) );
9112  const size_t jend( ( IsLower<MT5>::value )
9113  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
9114  :( N ) );
9115  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
9116 
      // jpos is jbegin plus the column count rounded down to an even number (mask with ~1),
      // so the loop below can safely handle two columns per iteration.
9117  const size_t jnum( jend - jbegin );
9118  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
9119 
9120  for( size_t j=jbegin; j<jpos; j+=2UL ) {
9121  (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
9122  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
9123  }
      // Odd number of columns: update the single remaining column.
9124  if( jpos < jend ) {
9125  (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
9126  }
9127  }
9128  }
9129  //**********************************************************************************************
9130 
9131  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
// Default subtraction assignment kernel for a column-major target when only the left-hand
// side operand A is diagonal (enabled for IsDiagonal<MT4> and Not<IsDiagonal<MT5>>).
//
// Performs C(i,j) -= A(i,i) * B(i,j) * scalar for every visited element. The iteration
// space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements (column tiles outermost).
//
// \param C The target left-hand side dense matrix (accessed via the ~ operator).
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
9145  template< typename MT3 // Type of the left-hand side target matrix
9146  , typename MT4 // Type of the left-hand side matrix operand
9147  , typename MT5 // Type of the right-hand side matrix operand
9148  , typename ST2 > // Type of the scalar value
9149  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
9150  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9151  {
      // Tile edge length; BLOCK_SIZE is defined outside this block.
9152  constexpr size_t block( BLOCK_SIZE );
9153 
9154  const size_t M( A.rows() );
9155  const size_t N( B.columns() );
9156 
9157  for( size_t jj=0UL; jj<N; jj+=block ) {
9158  const size_t jend( min( N, jj+block ) );
9159  for( size_t ii=0UL; ii<M; ii+=block ) {
9160  const size_t iend( min( M, ii+block ) );
9161  for( size_t j=jj; j<jend; ++j )
9162  {
      // Clip the row range of the current tile to the structurally relevant part of
      // column j of B: a lower B starts at the diagonal (one below it if strictly lower),
      // an upper B ends at the diagonal (one above it if strictly upper).
9163  const size_t ibegin( ( IsLower<MT5>::value )
9164  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
9165  :( ii ) );
9166  const size_t ipos( ( IsUpper<MT5>::value )
9167  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
9168  :( iend ) );
9169 
      // The loop is simply skipped if the clipped range is empty (ibegin >= ipos).
9170  for( size_t i=ibegin; i<ipos; ++i ) {
9171  (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
9172  }
9173  }
9174  }
9175  }
9176  }
9177  //**********************************************************************************************
9178 
9179  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default subtraction assignment kernel for the case that both operands are diagonal
// (enabled for IsDiagonal<MT4> and IsDiagonal<MT5>).
//
// Only the diagonal of the target is touched: C(i,i) -= A(i,i) * B(i,i) * scalar for
// i in [0, A.rows()). Unlike the sibling kernels, C is taken directly as MT3& and is
// therefore accessed without the ~ operator.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
9193  template< typename MT3 // Type of the left-hand side target matrix
9194  , typename MT4 // Type of the left-hand side matrix operand
9195  , typename MT5 // Type of the right-hand side matrix operand
9196  , typename ST2 > // Type of the scalar value
9197  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
9198  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9199  {
9200  for( size_t i=0UL; i<A.rows(); ++i ) {
9201  C(i,i) -= A(i,i) * B(i,i) * scalar;
9202  }
9203  }
9204  //**********************************************************************************************
9205 
9206  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Subtraction assignment kernel for small matrices that simply forwards all four arguments
// unchanged to selectDefaultSubAssignKernel().
//
// NOTE(review): the listing line between the template header and the function name (9224)
// is absent from this extract; presumably it carries the return type and the condition
// that selects this overload over the vectorized ones - confirm against the original header.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
9220  template< typename MT3 // Type of the left-hand side target matrix
9221  , typename MT4 // Type of the left-hand side matrix operand
9222  , typename MT5 // Type of the right-hand side matrix operand
9223  , typename ST2 > // Type of the scalar value
9225  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9226  {
9227  selectDefaultSubAssignKernel( C, A, B, scalar );
9228  }
9229  //**********************************************************************************************
9230 
9231  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Vectorized subtraction assignment kernel for small matrices and a row-major target.
//
// Computes C -= ( A * B ) * scalar. The columns of C are processed in panels that are
// 8, 5, 4, 3, 2 and finally 1 SIMD vector(s) wide; in all panels narrower than 8 vectors
// two rows are handled per iteration, followed by a single-row tail. For every output
// vector the products A(i,k) * B.load(k,j) are accumulated over k in SIMD registers and
// the scaled sum is subtracted from the vector loaded from C. Columns that do not fill a
// complete SIMD vector are handled by a scalar loop at the end.
//
// NOTE(review): the listing line between the template header and the function name (9250)
// is absent from this extract; presumably it carries the return type and the enabling
// condition of this overload - confirm against the original header.
// NOTE(review): LOW and UPP are not declared in this block; they look like compile-time
// flags of the enclosing expression marking a lower/upper result - confirm.
//
// \param C The target left-hand side dense matrix (accessed via the ~ operator).
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
9246  template< typename MT3 // Type of the left-hand side target matrix
9247  , typename MT4 // Type of the left-hand side matrix operand
9248  , typename MT5 // Type of the right-hand side matrix operand
9249  , typename ST2 > // Type of the scalar value
9251  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9252  {
      // A scalar remainder loop is required unless both C and B are padded.
9253  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
9254 
9255  const size_t M( A.rows() );
9256  const size_t N( B.columns() );
9257  const size_t K( A.columns() );
9258 
9259  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
9260 
      // End of the vectorized column range: with a remainder, N is rounded down to a multiple
      // of SIMDSIZE (the mask relies on SIMDSIZE being a power of two; the assertion below
      // cross-checks the result against the modulo form), otherwise all N columns are used.
9261  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
9262  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
9263 
      // The scaling factor as SIMD vector, built via set().
9264  const SIMDType factor( set( scalar ) );
9265 
      // Current column; shared by all panel loops below, each continues where the previous stopped.
9266  size_t j( 0UL );
9267 
      // NOTE(review): the statement guarding the following brace block (listing line 9268) is
      // absent from this extract - confirm the condition against the original header.
9269  {
      // Panels of 8 SIMD vectors, one row per iteration (only if neither LOW nor UPP is set).
9270  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
9271  for( size_t i=0UL; i<M; ++i )
9272  {
      // Limit the k range to the part that can contribute according to the compile-time
      // structure of A (upper/lower, strict or not, relative to row i) and of B
      // (lower/upper, relative to the columns of the current panel).
9273  const size_t kbegin( ( IsUpper<MT4>::value )
9274  ?( ( IsLower<MT5>::value )
9275  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9276  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9277  :( IsLower<MT5>::value ? j : 0UL ) );
9278  const size_t kend( ( IsLower<MT4>::value )
9279  ?( ( IsUpper<MT5>::value )
9280  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
9281  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
9282  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
9283 
      // Accumulators; presumably zero-initialized by SIMDType's default constructor - confirm.
9284  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9285 
9286  for( size_t k=kbegin; k<kend; ++k ) {
9287  const SIMDType a1( set( A(i,k) ) );
9288  xmm1 += a1 * B.load(k,j );
9289  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9290  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9291  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9292  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9293  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
9294  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
9295  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
9296  }
9297 
      // Subtract the scaled sums from the current values of C and write them back.
9298  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9299  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9300  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9301  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
9302  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
9303  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
9304  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
9305  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
9306  }
9307  }
9308  }
9309 
      // Panels of 5 SIMD vectors, two rows per iteration (only if neither LOW nor UPP is set).
9310  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
9311  {
9312  size_t i( 0UL );
9313 
9314  for( ; (i+2UL) <= M; i+=2UL )
9315  {
      // Same k range restriction as above; the upper bound now has to cover rows i and i+1.
9316  const size_t kbegin( ( IsUpper<MT4>::value )
9317  ?( ( IsLower<MT5>::value )
9318  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9319  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9320  :( IsLower<MT5>::value ? j : 0UL ) );
9321  const size_t kend( ( IsLower<MT4>::value )
9322  ?( ( IsUpper<MT5>::value )
9323  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
9324  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9325  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
9326 
      // xmm1..xmm5 accumulate row i, xmm6..xmm10 accumulate row i+1.
9327  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
9328 
9329  for( size_t k=kbegin; k<kend; ++k ) {
9330  const SIMDType a1( set( A(i ,k) ) );
9331  const SIMDType a2( set( A(i+1UL,k) ) );
9332  const SIMDType b1( B.load(k,j ) );
9333  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9334  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9335  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9336  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
9337  xmm1 += a1 * b1;
9338  xmm2 += a1 * b2;
9339  xmm3 += a1 * b3;
9340  xmm4 += a1 * b4;
9341  xmm5 += a1 * b5;
9342  xmm6 += a2 * b1;
9343  xmm7 += a2 * b2;
9344  xmm8 += a2 * b3;
9345  xmm9 += a2 * b4;
9346  xmm10 += a2 * b5;
9347  }
9348 
9349  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9350  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
9351  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
9352  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
9353  (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
9354  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm6 * factor );
9355  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
9356  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
9357  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
9358  (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
9359  }
9360 
      // Odd number of rows: process the single remaining row of this panel.
9361  if( i < M )
9362  {
9363  const size_t kbegin( ( IsUpper<MT4>::value )
9364  ?( ( IsLower<MT5>::value )
9365  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9366  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9367  :( IsLower<MT5>::value ? j : 0UL ) );
9368  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
9369 
9370  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
9371 
9372  for( size_t k=kbegin; k<kend; ++k ) {
9373  const SIMDType a1( set( A(i,k) ) );
9374  xmm1 += a1 * B.load(k,j );
9375  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9376  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9377  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9378  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9379  }
9380 
9381  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9382  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9383  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9384  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
9385  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
9386  }
9387  }
9388 
      // Panels of 4 SIMD vectors, two rows per iteration (only if neither LOW nor UPP is set).
9389  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
9390  {
9391  size_t i( 0UL );
9392 
9393  for( ; (i+2UL) <= M; i+=2UL )
9394  {
9395  const size_t kbegin( ( IsUpper<MT4>::value )
9396  ?( ( IsLower<MT5>::value )
9397  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9398  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9399  :( IsLower<MT5>::value ? j : 0UL ) );
9400  const size_t kend( ( IsLower<MT4>::value )
9401  ?( ( IsUpper<MT5>::value )
9402  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
9403  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9404  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
9405 
      // xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
9406  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9407 
9408  for( size_t k=kbegin; k<kend; ++k ) {
9409  const SIMDType a1( set( A(i ,k) ) );
9410  const SIMDType a2( set( A(i+1UL,k) ) );
9411  const SIMDType b1( B.load(k,j ) );
9412  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9413  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9414  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9415  xmm1 += a1 * b1;
9416  xmm2 += a1 * b2;
9417  xmm3 += a1 * b3;
9418  xmm4 += a1 * b4;
9419  xmm5 += a2 * b1;
9420  xmm6 += a2 * b2;
9421  xmm7 += a2 * b3;
9422  xmm8 += a2 * b4;
9423  }
9424 
9425  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9426  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
9427  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
9428  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
9429  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
9430  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
9431  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
9432  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
9433  }
9434 
      // Odd number of rows: process the single remaining row of this panel.
9435  if( i < M )
9436  {
9437  const size_t kbegin( ( IsUpper<MT4>::value )
9438  ?( ( IsLower<MT5>::value )
9439  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9440  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9441  :( IsLower<MT5>::value ? j : 0UL ) );
9442  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
9443 
9444  SIMDType xmm1, xmm2, xmm3, xmm4;
9445 
9446  for( size_t k=kbegin; k<kend; ++k ) {
9447  const SIMDType a1( set( A(i,k) ) );
9448  xmm1 += a1 * B.load(k,j );
9449  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9450  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9451  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9452  }
9453 
9454  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9455  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9456  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9457  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
9458  }
9459  }
9460 
      // Panels of 3 SIMD vectors, two rows per iteration (only if neither LOW nor UPP is set).
9461  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
9462  {
9463  size_t i( 0UL );
9464 
9465  for( ; (i+2UL) <= M; i+=2UL )
9466  {
9467  const size_t kbegin( ( IsUpper<MT4>::value )
9468  ?( ( IsLower<MT5>::value )
9469  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9470  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9471  :( IsLower<MT5>::value ? j : 0UL ) );
9472  const size_t kend( ( IsLower<MT4>::value )
9473  ?( ( IsUpper<MT5>::value )
9474  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
9475  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9476  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
9477 
      // xmm1..xmm3 accumulate row i, xmm4..xmm6 accumulate row i+1.
9478  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9479 
9480  for( size_t k=kbegin; k<kend; ++k ) {
9481  const SIMDType a1( set( A(i ,k) ) );
9482  const SIMDType a2( set( A(i+1UL,k) ) );
9483  const SIMDType b1( B.load(k,j ) );
9484  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9485  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9486  xmm1 += a1 * b1;
9487  xmm2 += a1 * b2;
9488  xmm3 += a1 * b3;
9489  xmm4 += a2 * b1;
9490  xmm5 += a2 * b2;
9491  xmm6 += a2 * b3;
9492  }
9493 
9494  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9495  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
9496  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
9497  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm4 * factor );
9498  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
9499  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
9500  }
9501 
      // Odd number of rows: process the single remaining row of this panel.
9502  if( i < M )
9503  {
9504  const size_t kbegin( ( IsUpper<MT4>::value )
9505  ?( ( IsLower<MT5>::value )
9506  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9507  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9508  :( IsLower<MT5>::value ? j : 0UL ) );
9509  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
9510 
9511  SIMDType xmm1, xmm2, xmm3;
9512 
9513  for( size_t k=kbegin; k<kend; ++k ) {
9514  const SIMDType a1( set( A(i,k) ) );
9515  xmm1 += a1 * B.load(k,j );
9516  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9517  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9518  }
9519 
9520  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9521  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9522  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9523  }
9524  }
9525 
      // Panels of 2 SIMD vectors, two rows per iteration (skipped only if both LOW and UPP
      // are set). With UPP the row range ends at min(j+SIMDSIZE*2,M); with LOW it starts at
      // row j instead of row 0.
9526  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
9527  {
9528  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
9529  size_t i( LOW ? j : 0UL );
9530 
9531  for( ; (i+2UL) <= iend; i+=2UL )
9532  {
9533  const size_t kbegin( ( IsUpper<MT4>::value )
9534  ?( ( IsLower<MT5>::value )
9535  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9536  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9537  :( IsLower<MT5>::value ? j : 0UL ) );
9538  const size_t kend( ( IsLower<MT4>::value )
9539  ?( ( IsUpper<MT5>::value )
9540  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
9541  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9542  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
9543 
      // xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1.
9544  SIMDType xmm1, xmm2, xmm3, xmm4;
9545 
9546  for( size_t k=kbegin; k<kend; ++k ) {
9547  const SIMDType a1( set( A(i ,k) ) );
9548  const SIMDType a2( set( A(i+1UL,k) ) );
9549  const SIMDType b1( B.load(k,j ) );
9550  const SIMDType b2( B.load(k,j+SIMDSIZE) );
9551  xmm1 += a1 * b1;
9552  xmm2 += a1 * b2;
9553  xmm3 += a2 * b1;
9554  xmm4 += a2 * b2;
9555  }
9556 
9557  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9558  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
9559  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
9560  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
9561  }
9562 
      // Odd number of rows in [start,iend): process the single remaining row of this panel.
9563  if( i < iend )
9564  {
9565  const size_t kbegin( ( IsUpper<MT4>::value )
9566  ?( ( IsLower<MT5>::value )
9567  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9568  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9569  :( IsLower<MT5>::value ? j : 0UL ) );
9570  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
9571 
9572  SIMDType xmm1, xmm2;
9573 
9574  for( size_t k=kbegin; k<kend; ++k ) {
9575  const SIMDType a1( set( A(i,k) ) );
9576  xmm1 += a1 * B.load(k,j );
9577  xmm2 += a1 * B.load(k,j+SIMDSIZE);
9578  }
9579 
9580  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9581  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - xmm2 * factor );
9582  }
9583  }
9584 
      // Remaining single SIMD vector panels, two rows per iteration (always enabled). The row
      // range is limited to min(j+SIMDSIZE,M) only if both LOW and UPP are set; with LOW it
      // starts at row j.
9585  for( ; j<jpos; j+=SIMDSIZE )
9586  {
9587  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
9588  size_t i( LOW ? j : 0UL );
9589 
9590  for( ; (i+2UL) <= iend; i+=2UL )
9591  {
9592  const size_t kbegin( ( IsUpper<MT4>::value )
9593  ?( ( IsLower<MT5>::value )
9594  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9595  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9596  :( IsLower<MT5>::value ? j : 0UL ) );
9597  const size_t kend( ( IsLower<MT4>::value )
9598  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
9599  :( K ) );
9600 
9601  SIMDType xmm1, xmm2;
9602 
9603  for( size_t k=kbegin; k<kend; ++k ) {
9604  const SIMDType b1( B.load(k,j) );
9605  xmm1 += set( A(i ,k) ) * b1;
9606  xmm2 += set( A(i+1UL,k) ) * b1;
9607  }
9608 
9609  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9610  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
9611  }
9612 
      // Single remaining row; the k loop runs up to K without a structure-based upper bound.
9613  if( i < iend )
9614  {
9615  const size_t kbegin( ( IsUpper<MT4>::value )
9616  ?( ( IsLower<MT5>::value )
9617  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9618  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9619  :( IsLower<MT5>::value ? j : 0UL ) );
9620 
9621  SIMDType xmm1;
9622 
9623  for( size_t k=kbegin; k<K; ++k ) {
9624  xmm1 += set( A(i,k) ) * B.load(k,j);
9625  }
9626 
9627  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
9628  }
9629  }
9630 
      // Scalar remainder: the columns in [jpos,N) that do not fill a complete SIMD vector are
      // computed element-wise (two rows per iteration plus a single-row tail). With UPP only
      // the rows up to and including row j are visited; with LOW the rows start at row j.
9631  for( ; remainder && j<N; ++j )
9632  {
9633  const size_t iend( UPP ? j+1UL : M );
9634  size_t i( LOW ? j : 0UL );
9635 
9636  for( ; (i+2UL) <= iend; i+=2UL )
9637  {
9638  const size_t kbegin( ( IsUpper<MT4>::value )
9639  ?( ( IsLower<MT5>::value )
9640  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9641  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9642  :( IsLower<MT5>::value ? j : 0UL ) );
9643  const size_t kend( ( IsLower<MT4>::value )
9644  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
9645  :( K ) );
9646 
      // Value-initialized scalar accumulators for rows i and i+1.
9647  ElementType value1 = ElementType();
9648  ElementType value2 = ElementType();
9649 
9650  for( size_t k=kbegin; k<kend; ++k ) {
9651  value1 += A(i ,k) * B(k,j);
9652  value2 += A(i+1UL,k) * B(k,j);
9653  }
9654 
9655  (~C)(i ,j) -= value1 * scalar;
9656  (~C)(i+1UL,j) -= value2 * scalar;
9657  }
9658 
      // Single remaining row of the scalar remainder column.
9659  if( i < iend )
9660  {
9661  const size_t kbegin( ( IsUpper<MT4>::value )
9662  ?( ( IsLower<MT5>::value )
9663  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9664  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9665  :( IsLower<MT5>::value ? j : 0UL ) );
9666 
9667  ElementType value = ElementType();
9668 
9669  for( size_t k=kbegin; k<K; ++k ) {
9670  value += A(i,k) * B(k,j);
9671  }
9672 
9673  (~C)(i,j) -= value * scalar;
9674  }
9675  }
9676  }
9677  //**********************************************************************************************
9678 
9679  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
9694  template< typename MT3 // Type of the left-hand side target matrix
9695  , typename MT4 // Type of the left-hand side matrix operand
9696  , typename MT5 // Type of the right-hand side matrix operand
9697  , typename ST2 > // Type of the scalar value
9699  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9700  {
9701  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
9702 
9703  const size_t M( A.rows() );
9704  const size_t N( B.columns() );
9705  const size_t K( A.columns() );
9706 
9707  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
9708 
9709  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
9710  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
9711 
9712  const SIMDType factor( set( scalar ) );
9713 
9714  size_t i( 0UL );
9715 
9717  {
9718  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
9719  for( size_t j=0UL; j<N; ++j )
9720  {
9721  const size_t kbegin( ( IsLower<MT5>::value )
9722  ?( ( IsUpper<MT4>::value )
9723  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9724  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9725  :( IsUpper<MT4>::value ? i : 0UL ) );
9726  const size_t kend( ( IsUpper<MT5>::value )
9727  ?( ( IsLower<MT4>::value )
9728  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
9729  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
9730  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
9731 
9732  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9733 
9734  for( size_t k=kbegin; k<kend; ++k ) {
9735  const SIMDType b1( set( B(k,j) ) );
9736  xmm1 += A.load(i ,k) * b1;
9737  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9738  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9739  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
9740  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
9741  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
9742  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
9743  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
9744  }
9745 
9746  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9747  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
9748  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
9749  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
9750  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
9751  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
9752  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
9753  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
9754  }
9755  }
9756  }
9757 
9758  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
9759  {
9760  size_t j( 0UL );
9761 
9762  for( ; (j+2UL) <= N; j+=2UL )
9763  {
9764  const size_t kbegin( ( IsLower<MT5>::value )
9765  ?( ( IsUpper<MT4>::value )
9766  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9767  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9768  :( IsUpper<MT4>::value ? i : 0UL ) );
9769  const size_t kend( ( IsUpper<MT5>::value )
9770  ?( ( IsLower<MT4>::value )
9771  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
9772  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
9773  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
9774 
9775  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
9776 
9777  for( size_t k=kbegin; k<kend; ++k ) {
9778  const SIMDType a1( A.load(i ,k) );
9779  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
9780  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
9781  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
9782  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
9783  const SIMDType b1( set( B(k,j ) ) );
9784  const SIMDType b2( set( B(k,j+1UL) ) );
9785  xmm1 += a1 * b1;
9786  xmm2 += a2 * b1;
9787  xmm3 += a3 * b1;
9788  xmm4 += a4 * b1;
9789  xmm5 += a5 * b1;
9790  xmm6 += a1 * b2;
9791  xmm7 += a2 * b2;
9792  xmm8 += a3 * b2;
9793  xmm9 += a4 * b2;
9794  xmm10 += a5 * b2;
9795  }
9796 
9797  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9798  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
9799  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
9800  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
9801  (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
9802  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
9803  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
9804  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
9805  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
9806  (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
9807  }
9808 
9809  if( j < N )
9810  {
9811  const size_t kbegin( ( IsLower<MT5>::value )
9812  ?( ( IsUpper<MT4>::value )
9813  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9814  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9815  :( IsUpper<MT4>::value ? i : 0UL ) );
9816  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
9817 
9818  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
9819 
9820  for( size_t k=kbegin; k<kend; ++k ) {
9821  const SIMDType b1( set( B(k,j) ) );
9822  xmm1 += A.load(i ,k) * b1;
9823  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9824  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9825  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
9826  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
9827  }
9828 
9829  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9830  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
9831  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
9832  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
9833  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
9834  }
9835  }
9836 
9837  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
9838  {
9839  size_t j( 0UL );
9840 
9841  for( ; (j+2UL) <= N; j+=2UL )
9842  {
9843  const size_t kbegin( ( IsLower<MT5>::value )
9844  ?( ( IsUpper<MT4>::value )
9845  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9846  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9847  :( IsUpper<MT4>::value ? i : 0UL ) );
9848  const size_t kend( ( IsUpper<MT5>::value )
9849  ?( ( IsLower<MT4>::value )
9850  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
9851  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
9852  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
9853 
9854  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9855 
9856  for( size_t k=kbegin; k<kend; ++k ) {
9857  const SIMDType a1( A.load(i ,k) );
9858  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
9859  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
9860  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
9861  const SIMDType b1( set( B(k,j ) ) );
9862  const SIMDType b2( set( B(k,j+1UL) ) );
9863  xmm1 += a1 * b1;
9864  xmm2 += a2 * b1;
9865  xmm3 += a3 * b1;
9866  xmm4 += a4 * b1;
9867  xmm5 += a1 * b2;
9868  xmm6 += a2 * b2;
9869  xmm7 += a3 * b2;
9870  xmm8 += a4 * b2;
9871  }
9872 
9873  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9874  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
9875  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
9876  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
9877  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
9878  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
9879  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
9880  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
9881  }
9882 
9883  if( j < N )
9884  {
9885  const size_t kbegin( ( IsLower<MT5>::value )
9886  ?( ( IsUpper<MT4>::value )
9887  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9888  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9889  :( IsUpper<MT4>::value ? i : 0UL ) );
9890  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
9891 
9892  SIMDType xmm1, xmm2, xmm3, xmm4;
9893 
9894  for( size_t k=kbegin; k<kend; ++k ) {
9895  const SIMDType b1( set( B(k,j) ) );
9896  xmm1 += A.load(i ,k) * b1;
9897  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9898  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9899  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
9900  }
9901 
9902  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9903  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
9904  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
9905  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
9906  }
9907  }
9908 
9909  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
9910  {
9911  size_t j( 0UL );
9912 
9913  for( ; (j+2UL) <= N; j+=2UL )
9914  {
9915  const size_t kbegin( ( IsLower<MT5>::value )
9916  ?( ( IsUpper<MT4>::value )
9917  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9918  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9919  :( IsUpper<MT4>::value ? i : 0UL ) );
9920  const size_t kend( ( IsUpper<MT5>::value )
9921  ?( ( IsLower<MT4>::value )
9922  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
9923  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
9924  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
9925 
9926  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9927 
9928  for( size_t k=kbegin; k<kend; ++k ) {
9929  const SIMDType a1( A.load(i ,k) );
9930  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
9931  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
9932  const SIMDType b1( set( B(k,j ) ) );
9933  const SIMDType b2( set( B(k,j+1UL) ) );
9934  xmm1 += a1 * b1;
9935  xmm2 += a2 * b1;
9936  xmm3 += a3 * b1;
9937  xmm4 += a1 * b2;
9938  xmm5 += a2 * b2;
9939  xmm6 += a3 * b2;
9940  }
9941 
9942  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9943  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
9944  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
9945  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
9946  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
9947  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
9948  }
9949 
9950  if( j < N )
9951  {
9952  const size_t kbegin( ( IsLower<MT5>::value )
9953  ?( ( IsUpper<MT4>::value )
9954  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9955  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9956  :( IsUpper<MT4>::value ? i : 0UL ) );
9957  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
9958 
9959  SIMDType xmm1, xmm2, xmm3;
9960 
9961  for( size_t k=kbegin; k<kend; ++k ) {
9962  const SIMDType b1( set( B(k,j) ) );
9963  xmm1 += A.load(i ,k) * b1;
9964  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
9965  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
9966  }
9967 
9968  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9969  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
9970  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
9971  }
9972  }
9973 
9974  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
9975  {
9976  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
9977  size_t j( UPP ? i : 0UL );
9978 
9979  for( ; (j+2UL) <= jend; j+=2UL )
9980  {
9981  const size_t kbegin( ( IsLower<MT5>::value )
9982  ?( ( IsUpper<MT4>::value )
9983  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9984  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9985  :( IsUpper<MT4>::value ? i : 0UL ) );
9986  const size_t kend( ( IsUpper<MT5>::value )
9987  ?( ( IsLower<MT4>::value )
9988  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
9989  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
9990  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
9991 
9992  SIMDType xmm1, xmm2, xmm3, xmm4;
9993 
9994  for( size_t k=kbegin; k<kend; ++k ) {
9995  const SIMDType a1( A.load(i ,k) );
9996  const SIMDType a2( A.load(i+SIMDSIZE,k) );
9997  const SIMDType b1( set( B(k,j ) ) );
9998  const SIMDType b2( set( B(k,j+1UL) ) );
9999  xmm1 += a1 * b1;
10000  xmm2 += a2 * b1;
10001  xmm3 += a1 * b2;
10002  xmm4 += a2 * b2;
10003  }
10004 
10005  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10006  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
10007  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10008  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
10009  }
10010 
10011  if( j < jend )
10012  {
10013  const size_t kbegin( ( IsLower<MT5>::value )
10014  ?( ( IsUpper<MT4>::value )
10015  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10016  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10017  :( IsUpper<MT4>::value ? i : 0UL ) );
10018  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
10019 
10020  SIMDType xmm1, xmm2;
10021 
10022  for( size_t k=kbegin; k<kend; ++k ) {
10023  const SIMDType b1( set( B(k,j) ) );
10024  xmm1 += A.load(i ,k) * b1;
10025  xmm2 += A.load(i+SIMDSIZE,k) * b1;
10026  }
10027 
10028  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10029  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - xmm2 * factor );
10030  }
10031  }
10032 
10033  for( ; i<ipos; i+=SIMDSIZE )
10034  {
10035  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
10036  size_t j( UPP ? i : 0UL );
10037 
10038  for( ; (j+2UL) <= jend; j+=2UL )
10039  {
10040  const size_t kbegin( ( IsLower<MT5>::value )
10041  ?( ( IsUpper<MT4>::value )
10042  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10043  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10044  :( IsUpper<MT4>::value ? i : 0UL ) );
10045  const size_t kend( ( IsUpper<MT5>::value )
10046  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10047  :( K ) );
10048 
10049  SIMDType xmm1, xmm2;
10050 
10051  for( size_t k=kbegin; k<kend; ++k ) {
10052  const SIMDType a1( A.load(i,k) );
10053  xmm1 += a1 * set( B(k,j ) );
10054  xmm2 += a1 * set( B(k,j+1UL) );
10055  }
10056 
10057  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10058  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
10059  }
10060 
10061  if( j < jend )
10062  {
10063  const size_t kbegin( ( IsLower<MT5>::value )
10064  ?( ( IsUpper<MT4>::value )
10065  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10066  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10067  :( IsUpper<MT4>::value ? i : 0UL ) );
10068 
10069  SIMDType xmm1;
10070 
10071  for( size_t k=kbegin; k<K; ++k ) {
10072  xmm1 += A.load(i,k) * set( B(k,j) );
10073  }
10074 
10075  (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10076  }
10077  }
10078 
10079  for( ; remainder && i<M; ++i )
10080  {
10081  const size_t jend( LOW ? i+1UL : N );
10082  size_t j( UPP ? i : 0UL );
10083 
10084  for( ; (j+2UL) <= jend; j+=2UL )
10085  {
10086  const size_t kbegin( ( IsLower<MT5>::value )
10087  ?( ( IsUpper<MT4>::value )
10088  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10089  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10090  :( IsUpper<MT4>::value ? i : 0UL ) );
10091  const size_t kend( ( IsUpper<MT5>::value )
10092  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10093  :( K ) );
10094 
10095  ElementType value1 = ElementType();
10096  ElementType value2 = ElementType();
10097 
10098  for( size_t k=kbegin; k<kend; ++k ) {
10099  value1 += A(i,k) * B(k,j );
10100  value2 += A(i,k) * B(k,j+1UL);
10101  }
10102 
10103  (~C)(i,j ) -= value1 * scalar;
10104  (~C)(i,j+1UL) -= value2 * scalar;
10105  }
10106 
10107  if( j < jend )
10108  {
10109  const size_t kbegin( ( IsLower<MT5>::value )
10110  ?( ( IsUpper<MT4>::value )
10111  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10112  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10113  :( IsUpper<MT4>::value ? i : 0UL ) );
10114 
10115  ElementType value = ElementType();
10116 
10117  for( size_t k=kbegin; k<K; ++k ) {
10118  value += A(i,k) * B(k,j);
10119  }
10120 
10121  (~C)(i,j) -= value * scalar;
10122  }
10123  }
10124  }
10125  //**********************************************************************************************
10126 
10127  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix subtraction-assignment kernel, default variant. It performs no computation of
// its own: C, A, B and scalar are handed unchanged to selectDefaultSubAssignKernel().
// NOTE(review): the return-type line (listing line 10145) is absent from this listing, so the
// condition that selects this overload over the vectorized one is not visible here.
10141  template< typename MT3 // Type of the left-hand side target matrix
10142  , typename MT4 // Type of the left-hand side matrix operand
10143  , typename MT5 // Type of the right-hand side matrix operand
10144  , typename ST2 > // Type of the scalar value
10146  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10147  {
       // Pure forwarding call; the argument order is kept as received.
10148  selectDefaultSubAssignKernel( C, A, B, scalar );
10149  }
10150  //**********************************************************************************************
10151 
10152  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix subtraction-assignment kernel, vectorized variant. Chooses one of the three
// routines lmmm / ummm / mmm from the LOW and UPP flags and calls it with the negated scalar
// and ST2(1) as trailing arguments.
// NOTE(review): presumably the last two arguments are the scale factor of the product and the
// scale factor of C, which would make the call compute C -= scalar * A * B -- confirm against
// blaze/math/dense/MMM.h. The return-type line (listing line 10171) is absent from this listing.
10167  template< typename MT3 // Type of the left-hand side target matrix
10168  , typename MT4 // Type of the left-hand side matrix operand
10169  , typename MT5 // Type of the right-hand side matrix operand
10170  , typename ST2 > // Type of the scalar value
10172  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10173  {
       // LOW is tested first, so when both LOW and UPP are set the lmmm branch is taken.
10174  if( LOW )
10175  lmmm( C, A, B, -scalar, ST2(1) );
10176  else if( UPP )
10177  ummm( C, A, B, -scalar, ST2(1) );
10178  else
10179  mmm( C, A, B, -scalar, ST2(1) );
10180  }
10181  //**********************************************************************************************
10182 
10183  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// BLAS-kernel entry point, default variant. It does not call any BLAS routine: C, A, B and
// scalar are handed unchanged to selectLargeSubAssignKernel().
// NOTE(review): the return-type line (listing line 10201) is absent from this listing, so the
// condition under which this overload is chosen instead of the BLAS-backed one is not visible.
10197  template< typename MT3 // Type of the left-hand side target matrix
10198  , typename MT4 // Type of the left-hand side matrix operand
10199  , typename MT5 // Type of the right-hand side matrix operand
10200  , typename ST2 > // Type of the scalar value
10202  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10203  {
       // Pure forwarding call to the large-matrix kernel.
10204  selectLargeSubAssignKernel( C, A, B, scalar );
10205  }
10206  //**********************************************************************************************
10207 
10208  //**BLAS-based subtraction assignment to dense matrices******************************************
10209 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
10210 
// BLAS-backed subtraction-assignment kernel; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero. Three cases are distinguished:
//   - MT4 (type of A) is triangular: B is copied into a temporary, trmm() is applied with A on
//     the left, and the temporary is subtracted from C.
//   - otherwise MT5 (type of B) is triangular: A is copied into a temporary, trmm() is applied
//     with B on the right, and the temporary is subtracted from C.
//   - otherwise: a single gemm() call with ET(-scalar) and ET(1).
// NOTE(review): the return-type line (listing line 10227) is absent from this listing.
10223  template< typename MT3 // Type of the left-hand side target matrix
10224  , typename MT4 // Type of the left-hand side matrix operand
10225  , typename MT5 // Type of the right-hand side matrix operand
10226  , typename ST2 > // Type of the scalar value
10228  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10229  {
       // Scalars are converted to the element type of the target before being passed on.
10230  typedef ElementType_<MT3> ET;
10231 
10232  if( IsTriangular<MT4>::value ) {
       // The trmm branches pass the positive scalar; the subtraction happens in subAssign().
10233  ResultType_<MT3> tmp( serial( B ) );
10234  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10235  subAssign( C, tmp );
10236  }
10237  else if( IsTriangular<MT5>::value ) {
10238  ResultType_<MT3> tmp( serial( A ) );
10239  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10240  subAssign( C, tmp );
10241  }
10242  else {
       // Here the sign is folded into the scalar instead; presumably gemm() accumulates into C
       // using the trailing ET(1) -- confirm against blaze/math/blas/gemm.h.
10243  gemm( C, A, B, ET(-scalar), ET(1) );
10244  }
10245  }
10246 #endif
10247  //**********************************************************************************************
10248 
10249  //**Subtraction assignment to sparse matrices***************************************************
10250  // No special implementation for the subtraction assignment to sparse matrices.
10251  //**********************************************************************************************
10252 
10253  //**Multiplication assignment to dense matrices*************************************************
10254  // No special implementation for the multiplication assignment to dense matrices.
10255  //**********************************************************************************************
10256 
10257  //**Multiplication assignment to sparse matrices************************************************
10258  // No special implementation for the multiplication assignment to sparse matrices.
10259  //**********************************************************************************************
10260 
10261  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a scaled matrix product (rhs) to a dense matrix (lhs).
// The two operands of the wrapped product are evaluated into A and B, and the expression
// A * B * rhs.scalar_ is then passed to smpAssign() for the same target.
// Early exits: a target with zero rows or columns is left untouched; if the left operand has
// zero columns the target is reset and nothing else is done.
// NOTE(review): listing lines 10278 (return type) and 10281 are absent from this listing.
10276  template< typename MT // Type of the target dense matrix
10277  , bool SO > // Storage order of the target dense matrix
10279  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
10280  {
10282 
10283  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
10284  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
10285 
       // Operands of the product wrapped by this scaled expression.
10286  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
10287  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
10288 
10289  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
10290  return;
10291  }
10292  else if( left.columns() == 0UL ) {
       // Zero inner dimension: the target is reset rather than left as it was.
10293  reset( ~lhs );
10294  return;
10295  }
10296 
10297  LT A( left ); // Evaluation of the left-hand side dense matrix operand
10298  RT B( right ); // Evaluation of the right-hand side dense matrix operand
10299 
10300  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
10301  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
10302  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
10303  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
10304  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
10305  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
10306 
       // Rebuild the scaled product from the evaluated operands and forward it.
10307  smpAssign( ~lhs, A * B * rhs.scalar_ );
10308  }
10309  //**********************************************************************************************
10310 
10311  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the scaled product to a sparse target (per the section banner above).
// The visible body evaluates rhs into a temporary of type TmpType, passes that temporary
// through a ForwardFunctor instance, and hands the result to smpAssign() for the target.
// NOTE(review): the signature (listing lines 10328-10329) and the lines that define TmpType and
// ForwardFunctor (10331-10340) are absent from this listing; their exact types cannot be
// confirmed from here.
10326  template< typename MT // Type of the target sparse matrix
10327  , bool SO > // Storage order of the target sparse matrix
10330  {
10332 
10334 
10341 
10342  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
10343  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
10344 
10345  const ForwardFunctor fwd;
10346 
       // The full expression is evaluated first; only the evaluated temporary is assigned.
10347  const TmpType tmp( rhs );
10348  smpAssign( ~lhs, fwd( tmp ) );
10349  }
10350  //**********************************************************************************************
10351 
10352  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the scaled product to a dense target (per the section banner).
// The two operands of the wrapped product are evaluated into A and B, and the expression
// A * B * rhs.scalar_ is then passed to smpAddAssign() for the same target.
// Nothing is done if the target has zero rows or columns or the left operand has zero columns.
// NOTE(review): the signature lines (listing lines 10369-10370) and line 10372 are absent from
// this listing.
10367  template< typename MT // Type of the target dense matrix
10368  , bool SO > // Storage order of the target dense matrix
10371  {
10373 
10374  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
10375  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
10376 
       // Operands of the product wrapped by this scaled expression.
10377  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
10378  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
10379 
       // Unlike plain assignment, a zero inner dimension needs no reset of the target here.
10380  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
10381  return;
10382  }
10383 
10384  LT A( left ); // Evaluation of the left-hand side dense matrix operand
10385  RT B( right ); // Evaluation of the right-hand side dense matrix operand
10386 
10387  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
10388  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
10389  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
10390  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
10391  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
10392  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
10393 
       // Rebuild the scaled product from the evaluated operands and forward it.
10394  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
10395  }
10396  //**********************************************************************************************
10397 
10398  //**SMP addition assignment to sparse matrices**************************************************
10399  // No special implementation for the SMP addition assignment to sparse matrices.
10400  //**********************************************************************************************
10401 
10402  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the scaled product to a dense target (per the section banner).
// The two operands of the wrapped product are evaluated into A and B, and the expression
// A * B * rhs.scalar_ is then passed to smpSubAssign() for the same target.
// Nothing is done if the target has zero rows or columns or the left operand has zero columns.
// NOTE(review): the signature lines (listing lines 10419-10420) and line 10422 are absent from
// this listing.
10417  template< typename MT // Type of the target dense matrix
10418  , bool SO > // Storage order of the target dense matrix
10421  {
10423 
10424  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
10425  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
10426 
       // Operands of the product wrapped by this scaled expression.
10427  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
10428  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
10429 
       // Unlike plain assignment, a zero inner dimension needs no reset of the target here.
10430  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
10431  return;
10432  }
10433 
10434  LT A( left ); // Evaluation of the left-hand side dense matrix operand
10435  RT B( right ); // Evaluation of the right-hand side dense matrix operand
10436 
10437  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
10438  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
10439  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
10440  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
10441  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
10442  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
10443 
       // Rebuild the scaled product from the evaluated operands and forward it.
10444  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
10445  }
10446  //**********************************************************************************************
10447 
10448  //**SMP subtraction assignment to sparse matrices***********************************************
10449  // No special implementation for the SMP subtraction assignment to sparse matrices.
10450  //**********************************************************************************************
10451 
10452  //**SMP multiplication assignment to dense matrices*********************************************
10453  // No special implementation for the SMP multiplication assignment to dense matrices.
10454  //**********************************************************************************************
10455 
10456  //**SMP multiplication assignment to sparse matrices********************************************
10457  // No special implementation for the SMP multiplication assignment to sparse matrices.
10458  //**********************************************************************************************
10459 
10460  //**Compile time checks*************************************************************************
// Compile-time constraint applied to the pair (ST, RightOperand) through the same-type
// constraint macro. NOTE(review): listing lines 10461-10467, which precede this line in the
// compile-time check section, are absent from this listing.
10468  BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE( ST, RightOperand );
10469  //**********************************************************************************************
10470 };
10472 //*************************************************************************************************
10473 
10474 
10475 
10476 
10477 //=================================================================================================
10478 //
10479 // GLOBAL BINARY ARITHMETIC OPERATORS
10480 //
10481 //=================================================================================================
10482 
10483 //*************************************************************************************************
// Global binary operator from the "GLOBAL BINARY ARITHMETIC OPERATORS" section.
// The visible body validates the operand sizes: if the number of columns of lhs differs from
// the number of rows of rhs, BLAZE_THROW_INVALID_ARGUMENT is invoked with the message
// "Matrix sizes do not match".
// NOTE(review): the signature (listing lines 10515-10516) and the return statement (10524) are
// absent from this listing; presumably this is the multiplication operator that builds the
// TDMatDMatMultExpr -- confirm against the full header.
10513 template< typename T1 // Type of the left-hand side dense matrix
10514  , typename T2 > // Type of the right-hand side dense matrix
10517 {
10519 
       // Inner dimensions must agree for the product to be defined.
10520  if( (~lhs).columns() != (~rhs).rows() ) {
10521  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
10522  }
10523 
10525 }
10526 //*************************************************************************************************
10527 
10528 
10529 
10530 
10531 //=================================================================================================
10532 //
10533 // GLOBAL FUNCTIONS
10534 //
10535 //=================================================================================================
10536 
10537 //*************************************************************************************************
// Re-declares a multiplication expression dm with the symmetry flag forced to true.
// A non-square dm is rejected via BLAZE_THROW_INVALID_ARGUMENT; otherwise a new
// TDMatDMatMultExpr is built from dm's operands with HF, LF and UF carried over unchanged.
// NOTE(review): the signature (listing lines 10568-10569) is absent from this listing, so the
// function name is not visible here; presumably declsym() -- confirm against the full header.
10562 template< typename MT1 // Type of the left-hand side dense matrix
10563  , typename MT2 // Type of the right-hand side dense matrix
10564  , bool SF // Symmetry flag
10565  , bool HF // Hermitian flag
10566  , bool LF // Lower flag
10567  , bool UF > // Upper flag
10570 {
10572 
10573  if( !isSquare( dm ) ) {
10574  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
10575  }
10576 
       // Only the first flag changes (SF -> true); the incoming SF value is ignored.
10577  return TDMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>( dm.leftOperand(), dm.rightOperand() );
10578 }
10580 //*************************************************************************************************
10581 
10582 
10583 //*************************************************************************************************
// Re-declares a multiplication expression dm with the Hermitian flag forced to true.
// A non-square dm is rejected via BLAZE_THROW_INVALID_ARGUMENT; otherwise a new
// TDMatDMatMultExpr is built from dm's operands with SF, LF and UF carried over unchanged.
// NOTE(review): the signature (listing lines 10614-10615) is absent from this listing, so the
// function name is not visible here; presumably declherm() -- confirm against the full header.
10608 template< typename MT1 // Type of the left-hand side dense matrix
10609  , typename MT2 // Type of the right-hand side dense matrix
10610  , bool SF // Symmetry flag
10611  , bool HF // Hermitian flag
10612  , bool LF // Lower flag
10613  , bool UF > // Upper flag
10616 {
10618 
10619  if( !isSquare( dm ) ) {
10620  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
10621  }
10622 
       // Only the second flag changes (HF -> true); the incoming HF value is ignored.
10623  return TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>( dm.leftOperand(), dm.rightOperand() );
10624 }
10626 //*************************************************************************************************
10627 
10628 
10629 //*************************************************************************************************
// Re-declares a multiplication expression dm with the lower flag forced to true.
// A non-square dm is rejected via BLAZE_THROW_INVALID_ARGUMENT; otherwise a new
// TDMatDMatMultExpr is built from dm's operands with SF, HF and UF carried over unchanged.
// NOTE(review): the signature (listing lines 10660-10661) is absent from this listing, so the
// function name is not visible here; presumably decllow() -- confirm against the full header.
10654 template< typename MT1 // Type of the left-hand side dense matrix
10655  , typename MT2 // Type of the right-hand side dense matrix
10656  , bool SF // Symmetry flag
10657  , bool HF // Hermitian flag
10658  , bool LF // Lower flag
10659  , bool UF > // Upper flag
10662 {
10664 
10665  if( !isSquare( dm ) ) {
10666  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
10667  }
10668 
       // Only the third flag changes (LF -> true); the incoming LF value is ignored.
10669  return TDMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>( dm.leftOperand(), dm.rightOperand() );
10670 }
10672 //*************************************************************************************************
10673 
10674 
10675 //*************************************************************************************************
// Re-declares a multiplication expression dm with the upper flag forced to true.
// A non-square dm is rejected via BLAZE_THROW_INVALID_ARGUMENT; otherwise a new
// TDMatDMatMultExpr is built from dm's operands with SF, HF and LF carried over unchanged.
// NOTE(review): the signature (listing lines 10706-10707) is absent from this listing, so the
// function name is not visible here; presumably declupp() -- confirm against the full header.
10700 template< typename MT1 // Type of the left-hand side dense matrix
10701  , typename MT2 // Type of the right-hand side dense matrix
10702  , bool SF // Symmetry flag
10703  , bool HF // Hermitian flag
10704  , bool LF // Lower flag
10705  , bool UF > // Upper flag
10708 {
10710 
10711  if( !isSquare( dm ) ) {
10712  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
10713  }
10714 
       // Only the fourth flag changes (UF -> true); the incoming UF value is ignored.
10715  return TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>( dm.leftOperand(), dm.rightOperand() );
10716 }
10718 //*************************************************************************************************
10719 
10720 
10721 //*************************************************************************************************
// Re-declares a multiplication expression dm with both the lower and the upper flag forced to
// true. A non-square dm is rejected via BLAZE_THROW_INVALID_ARGUMENT; otherwise a new
// TDMatDMatMultExpr is built from dm's operands with SF and HF carried over unchanged.
// NOTE(review): the signature (listing lines 10752-10753) is absent from this listing, so the
// function name is not visible here; presumably decldiag() -- confirm against the full header.
10746 template< typename MT1 // Type of the left-hand side dense matrix
10747  , typename MT2 // Type of the right-hand side dense matrix
10748  , bool SF // Symmetry flag
10749  , bool HF // Hermitian flag
10750  , bool LF // Lower flag
10751  , bool UF > // Upper flag
10754 {
10756 
10757  if( !isSquare( dm ) ) {
10758  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
10759  }
10760 
       // Diagonal is expressed as lower AND upper: both LF and UF become true.
10761  return TDMatDMatMultExpr<MT1,MT2,SF,HF,true,true>( dm.leftOperand(), dm.rightOperand() );
10762 }
10764 //*************************************************************************************************
10765 
10766 
10767 
10768 
10769 //=================================================================================================
10770 //
10771 // ROWS SPECIALIZATIONS
10772 //
10773 //=================================================================================================
10774 
10775 //*************************************************************************************************
// Rows trait for the product expression: inherits from Rows<MT1>, i.e. the row-count trait of
// the left-hand side operand type is reused for the product.
10777 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10778 struct Rows< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> > : public Rows<MT1>
10779 {};
10781 //*************************************************************************************************
10782 
10783 
10784 
10785 
10786 //=================================================================================================
10787 //
10788 // COLUMNS SPECIALIZATIONS
10789 //
10790 //=================================================================================================
10791 
10792 //*************************************************************************************************
// Columns trait for the product expression: inherits from Columns<MT2>, i.e. the column-count
// trait of the right-hand side operand type is reused for the product.
10794 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10795 struct Columns< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> > : public Columns<MT2>
10796 {};
10798 //*************************************************************************************************
10799 
10800 
10801 
10802 
10803 //=================================================================================================
10804 //
10805 // ISALIGNED SPECIALIZATIONS
10806 //
10807 //=================================================================================================
10808 
10809 //*************************************************************************************************
// IsAligned trait for the product expression: true only if IsAligned holds for both operand
// types MT1 and MT2. The SF/HF/LF/UF flags do not take part in the result.
10811 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10812 struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10813  : public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
10814 {};
10816 //*************************************************************************************************
10817 
10818 
10819 
10820 
10821 //=================================================================================================
10822 //
10823 // ISSYMMETRIC SPECIALIZATIONS
10824 //
10825 //=================================================================================================
10826 
10827 //*************************************************************************************************
// IsSymmetric trait for the product expression. The result is true if any of the following
// holds:
//   - the symmetry flag SF is set;
//   - the Hermitian flag HF is set and the element type of the expression is a built-in type
//     (the element type is queried on an instantiation with the fixed flags
//     <false,true,false,false>, not on the specialized type itself);
//   - both the lower flag LF and the upper flag UF are set.
10829 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10830 struct IsSymmetric< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10831  : public BoolConstant< Or< Bool<SF>
10832  , And< Bool<HF>
10833  , IsBuiltin< ElementType_< TDMatDMatMultExpr<MT1,MT2,false,true,false,false> > > >
10834  , And< Bool<LF>, Bool<UF> > >::value >
10835 {};
10837 //*************************************************************************************************
10838 
10839 
10840 
10841 
10842 //=================================================================================================
10843 //
10844 // ISHERMITIAN SPECIALIZATIONS
10845 //
10846 //=================================================================================================
10847 
10848 //*************************************************************************************************
// IsHermitian trait for the product expression: this partial specialization matches only
// expressions whose Hermitian flag (fourth template argument) is true and yields TrueType for
// them, regardless of SF, LF and UF. Expressions with HF == false are not covered here.
10850 template< typename MT1, typename MT2, bool SF, bool LF, bool UF >
10851 struct IsHermitian< TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
10852  : public TrueType
10853 {};
10855 //*************************************************************************************************
10856 
10857 
10858 
10859 
10860 //=================================================================================================
10861 //
10862 // ISLOWER SPECIALIZATIONS
10863 //
10864 //=================================================================================================
10865 
10866 //*************************************************************************************************
// IsLower trait for the product expression. The result is true if any of the following holds:
//   - the lower flag LF is set;
//   - both operand types are lower;
//   - SF or HF is set and both operand types are upper.
10868 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10869 struct IsLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10870  : public BoolConstant< Or< Bool<LF>
10871  , And< IsLower<MT1>, IsLower<MT2> >
10872  , And< Or< Bool<SF>, Bool<HF> >
10873  , IsUpper<MT1>, IsUpper<MT2> > >::value >
10874 {};
10876 //*************************************************************************************************
10877 
10878 
10879 
10880 
10881 //=================================================================================================
10882 //
10883 // ISUNILOWER SPECIALIZATIONS
10884 //
10885 //=================================================================================================
10886 
10887 //*************************************************************************************************
// IsUniLower trait for the product expression. The result is true if either
//   - both operand types are unilower, or
//   - SF or HF is set and both operand types are uniupper.
// The LF and UF flags do not take part in the result.
10889 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10890 struct IsUniLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10891  : public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
10892  , And< Or< Bool<SF>, Bool<HF> >
10893  , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
10894 {};
10896 //*************************************************************************************************
10897 
10898 
10899 
10900 
10901 //=================================================================================================
10902 //
10903 // ISSTRICTLYLOWER SPECIALIZATIONS
10904 //
10905 //=================================================================================================
10906 
10907 //*************************************************************************************************
// IsStrictlyLower trait for the product expression. The result is true if any of the following
// holds:
//   - MT1 is strictly lower and MT2 is lower;
//   - MT2 is strictly lower and MT1 is lower;
//   - SF or HF is set and one operand type is strictly upper while the other is upper.
// The LF and UF flags do not take part in the result.
10909 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10910 struct IsStrictlyLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10911  : public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
10912  , And< IsStrictlyLower<MT2>, IsLower<MT1> >
10913  , And< Or< Bool<SF>, Bool<HF> >
10914  , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
10915  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
10916 {};
10918 //*************************************************************************************************
10919 
10920 
10921 
10922 
10923 //=================================================================================================
10924 //
10925 // ISUPPER SPECIALIZATIONS
10926 //
10927 //=================================================================================================
10928 
10929 //*************************************************************************************************
// IsUpper trait for the product expression. The result is true if any of the following holds:
//   - the upper flag UF is set;
//   - both operand types are upper;
//   - SF or HF is set and both operand types are lower.
10931 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10932 struct IsUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10933  : public BoolConstant< Or< Bool<UF>
10934  , And< IsUpper<MT1>, IsUpper<MT2> >
10935  , And< Or< Bool<SF>, Bool<HF> >
10936  , IsLower<MT1>, IsLower<MT2> > >::value >
10937 {};
10939 //*************************************************************************************************
10940 
10941 
10942 
10943 
10944 //=================================================================================================
10945 //
10946 // ISUNIUPPER SPECIALIZATIONS
10947 //
10948 //=================================================================================================
10949 
10950 //*************************************************************************************************
// IsUniUpper trait for the product expression. The result is true if either
//   - both operand types are uniupper, or
//   - SF or HF is set and both operand types are unilower.
// The LF and UF flags do not take part in the result.
10952 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10953 struct IsUniUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10954  : public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
10955  , And< Or< Bool<SF>, Bool<HF> >
10956  , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
10957 {};
10959 //*************************************************************************************************
10960 
10961 
10962 
10963 
10964 //=================================================================================================
10965 //
10966 // ISSTRICTLYUPPER SPECIALIZATIONS
10967 //
10968 //=================================================================================================
10969 
10970 //*************************************************************************************************
// IsStrictlyUpper trait for the product expression. The result is true if any of the following
// holds:
//   - MT1 is strictly upper and MT2 is upper;
//   - MT2 is strictly upper and MT1 is upper;
//   - SF or HF is set and one operand type is strictly lower while the other is lower.
// The LF and UF flags do not take part in the result.
10972 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
10973 struct IsStrictlyUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10974  : public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
10975  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
10976  , And< Or< Bool<SF>, Bool<HF> >
10977  , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
10978  , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
10979 {};
10981 //*************************************************************************************************
10982 
10983 
10984 
10985 
10986 //=================================================================================================
10987 //
10988 // EXPRESSION TRAIT SPECIALIZATIONS
10989 //
10990 //=================================================================================================
10991 
10992 //*************************************************************************************************
// Specialization of TDMatDVecMultExprTrait for a TDMatDMatMultExpr on the left and a type VT on
// the right. Only the tail of the public member declaration is visible: it ends in an
// INVALID_TYPE alternative.
// NOTE(review): listing lines 10999-11002, which hold the start of that declaration (its name
// and selection condition), are absent from this listing -- confirm against the full header.
10994 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF, typename VT >
10995 struct TDMatDVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, VT >
10996 {
10997  public:
10998  //**********************************************************************************************
11003  , INVALID_TYPE >;
11004  //**********************************************************************************************
11005 };
11007 //*************************************************************************************************
11008 
11009 
11010 //*************************************************************************************************
// TDMatSVecMultExprTrait specialization for a TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> as matrix
// operand (first template argument) and VT as vector operand (second template argument).
11012 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF, typename VT >
11013 struct TDMatSVecMultExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, VT >
11014 {
11015  public:
11016  //**********************************************************************************************
       // NOTE(review): listing lines 11017-11020 are absent from this rendered view. They presumably
       // open the nested 'Type' alias; only its last alternative, INVALID_TYPE, is visible below.
       // Consult the original header before modifying this definition.
11021  , INVALID_TYPE >;
11022  //**********************************************************************************************
11023 };
11025 //*************************************************************************************************
11026 
11027 
11028 //*************************************************************************************************
// TDVecTDMatMultExprTrait specialization for VT as vector operand (first template argument) and a
// TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> as matrix operand (second template argument).
11030 template< typename VT, typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
11031 struct TDVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
11032 {
11033  public:
11034  //**********************************************************************************************
       // NOTE(review): listing lines 11035-11038 are absent from this rendered view. They presumably
       // open the nested 'Type' alias; only its last alternative, INVALID_TYPE, is visible below.
       // Consult the original header before modifying this definition.
11039  , INVALID_TYPE >;
11040  //**********************************************************************************************
11041 };
11043 //*************************************************************************************************
11044 
11045 
11046 //*************************************************************************************************
// TSVecTDMatMultExprTrait specialization for VT as vector operand (first template argument) and a
// TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> as matrix operand (second template argument).
11048 template< typename VT, typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
11049 struct TSVecTDMatMultExprTrait< VT, TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
11050 {
11051  public:
11052  //**********************************************************************************************
       // NOTE(review): listing lines 11053-11056 are absent from this rendered view. They presumably
       // open the nested 'Type' alias; only its last alternative, INVALID_TYPE, is visible below.
       // Consult the original header before modifying this definition.
11057  , INVALID_TYPE >;
11058  //**********************************************************************************************
11059 };
11061 //*************************************************************************************************
11062 
11063 
11064 //*************************************************************************************************
// TDMatDeclSymExprTrait specialization for TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, i.e. the trait
// queried for a declsym operation applied to this multiplication expression.
11066 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
11067 struct TDMatDeclSymExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
11068 {
11069  public:
11070  //**********************************************************************************************
       // NOTE(review): listing lines 11071-11073 are absent from this rendered view. They presumably
       // open the nested 'Type' alias; only its last alternative, INVALID_TYPE, is visible below.
       // Consult the original header before modifying this definition.
11074  , INVALID_TYPE >;
11075  //**********************************************************************************************
11076 };
11078 //*************************************************************************************************
11079 
11080 
11081 //*************************************************************************************************
// TDMatDeclHermExprTrait specialization for TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, i.e. the trait
// queried for a declherm operation applied to this multiplication expression.
11083 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
11084 struct TDMatDeclHermExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
11085 {
11086  public:
11087  //**********************************************************************************************
       // NOTE(review): listing lines 11088-11090 are absent from this rendered view. They presumably
       // open the nested 'Type' alias; only its last alternative, INVALID_TYPE, is visible below.
       // Consult the original header before modifying this definition.
11091  , INVALID_TYPE >;
11092  //**********************************************************************************************
11093 };
11095 //*************************************************************************************************
11096 
11097 
11098 //*************************************************************************************************
// TDMatDeclLowExprTrait specialization for TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, i.e. the trait
// queried for a decllow operation applied to this multiplication expression.
11100 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
11101 struct TDMatDeclLowExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
11102 {
11103  public:
11104  //**********************************************************************************************
       // NOTE(review): listing lines 11105-11107 are absent from this rendered view. They presumably
       // open the nested 'Type' alias; only its last alternative, INVALID_TYPE, is visible below.
       // Consult the original header before modifying this definition.
11108  , INVALID_TYPE >;
11109  //**********************************************************************************************
11110 };
11112 //*************************************************************************************************
11113 
11114 
11115 //*************************************************************************************************
// TDMatDeclUppExprTrait specialization for TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, i.e. the trait
// queried for a declupp operation applied to this multiplication expression.
11117 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
11118 struct TDMatDeclUppExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
11119 {
11120  public:
11121  //**********************************************************************************************
       // NOTE(review): listing lines 11122-11124 are absent from this rendered view. They presumably
       // open the nested 'Type' alias; only its last alternative, INVALID_TYPE, is visible below.
       // Consult the original header before modifying this definition.
11125  , INVALID_TYPE >;
11126  //**********************************************************************************************
11127 };
11129 //*************************************************************************************************
11130 
11131 
11132 //*************************************************************************************************
// TDMatDeclDiagExprTrait specialization for TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, i.e. the trait
// queried for a decldiag operation applied to this multiplication expression.
11134 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
11135 struct TDMatDeclDiagExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
11136 {
11137  public:
11138  //**********************************************************************************************
       // NOTE(review): listing lines 11139-11141 are absent from this rendered view. They presumably
       // open the nested 'Type' alias; only its last alternative, INVALID_TYPE, is visible below.
       // Consult the original header before modifying this definition.
11142  , INVALID_TYPE >;
11143  //**********************************************************************************************
11144 };
11146 //*************************************************************************************************
11147 
11148 
11149 //*************************************************************************************************
// SubmatrixExprTrait specialization for TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>; the additional
// bool parameter AF is forwarded unchanged as second trait argument.
11151 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF, bool AF >
11152 struct SubmatrixExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, AF >
11153 {
11154  public:
11155  //**********************************************************************************************
       // NOTE(review): listing lines 11156-11157 are absent from this rendered view, so the member
       // definition of this specialization (presumably the nested 'Type' alias) is not visible here.
       // Consult the original header before modifying this definition.
11158  //**********************************************************************************************
11159 };
11161 //*************************************************************************************************
11162 
11163 
11164 //*************************************************************************************************
// RowExprTrait specialization for TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>.
// The nested 'Type' is the multiplication expression type obtained from the row expression type
// of the (const) left-hand side operand MT1 and the complete right-hand side operand MT2.
11166 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
11167 struct RowExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
11168 {
11169  public:
11170  //**********************************************************************************************
       // Only MT1 is restricted to a single row; MT2 enters the multiplication unchanged.
11171  using Type = MultExprTrait_< RowExprTrait_<const MT1>, MT2 >;
11172  //**********************************************************************************************
11173 };
11175 //*************************************************************************************************
11176 
11177 
11178 //*************************************************************************************************
// ColumnExprTrait specialization for TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>.
11180 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
11181 struct ColumnExprTrait< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
11182 {
11183  public:
11184  //**********************************************************************************************
       // NOTE(review): listing line 11185 is absent from this rendered view, so the member
       // definition of this specialization (presumably the nested 'Type' alias) is not visible here.
       // Consult the original header before modifying this definition.
11186  //**********************************************************************************************
11187 };
11189 //*************************************************************************************************
11190 
11191 } // namespace blaze
11192 
11193 #endif
typename SubmatrixExprTrait< MT, AF >::Type SubmatrixExprTrait_
Auxiliary alias declaration for the SubmatrixExprTrait type trait.The SubmatrixExprTrait_ alias decla...
Definition: SubmatrixExprTrait.h:134
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Evaluation of the expression type of a dense matrix declherm operation.Via this type trait it is poss...
Definition: TDMatDeclHermExprTrait.h:75
Compile time check for row vector types.This type trait tests whether or not the given template argum...
Definition: IsRowVector.h:80
const DMatForEachExpr< MT, Conj, SO > conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatForEachExpr.h:1214
Header file for auxiliary alias declarations.
Data type constraint.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Evaluation of the expression type of a dense matrix decllow operation.Via this type trait it is possi...
Definition: TDMatDeclLowExprTrait.h:75
Header file for kernel specific block sizes.
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:195
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:488
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:281
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:560
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the ColumnExprTrait class template.
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:489
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:194
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:633
Header file for the dense matrix multiplication kernels.
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:299
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:290
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:550
Header file for the IsRowVector type trait.
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:66
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1755
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:163
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Evaluation of the expression type of a sparse vector/transpose dense matrix multiplication. Via this type trait it is possible to evaluate the resulting expression type of a sparse vector/transpose dense matrix multiplication. Given the transpose sparse vector type VT and the column-major dense matrix type MT, the nested type Type corresponds to the resulting expression type. In case either VT is not a transpose sparse vector type or MT is not a column-major dense matrix type, the resulting data type Type is set to INVALID_TYPE.
Definition: TSVecTDMatMultExprTrait.h:81
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
DisableIf_< IsSymmetric< MT >, const DMatDeclSymExpr< MT, SO > > declsym(const DenseMatrix< MT, SO > &dm)
Declares the given non-symmetric dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:841
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the TDMatDeclDiagExprTrait class template.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:424
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:434
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:198
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:285
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1802
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:71
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:119
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:176
Constraint on the data type.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:283
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:414
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Expression object for transpose dense matrix-dense matrix multiplications.The TDMatDMatMultExpr class...
Definition: Forward.h:138
Constraint on the data type.
typename MultExprTrait< T1, T2 >::Type MultExprTrait_
Auxiliary alias declaration for the MultExprTrait class template.The MultExprTrait_ alias declaration...
Definition: MultExprTrait.h:344
Header file for the MultExprTrait class template.
DisableIf_< IsHermitian< MT >, const DMatDeclHermExpr< MT, SO > > declherm(const DenseMatrix< MT, SO > &dm)
Declares the given non-Hermitian dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:841
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:179
Header file for the DisableIf class template.
Compile time check for dense vector types.This type trait tests whether or not the given template par...
Definition: IsDenseVector.h:78
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:446
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:83
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Header file for the TSVecTDMatMultExprTrait class template.
Evaluation of the expression type of a dense matrix declupp operation.Via this type trait it is possi...
Definition: TDMatDeclUppExprTrait.h:75
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:123
Header file for the TDMatSVecMultExprTrait class template.
Header file for the TDMatDeclHermExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:293
Header file for the Not class template.
Header file for the TDMatDeclUppExprTrait class template.
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:404
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:177
Evaluation of the expression type of a dense matrix decldiag operation.Via this type trait it is poss...
Definition: TDMatDeclDiagExprTrait.h:75
Compile time check for sparse vector types.This type trait tests whether or not the given template pa...
Definition: IsSparseVector.h:78
Evaluation of the expression type of a submatrix operation. Via this type trait it is possible to...
Definition: SubmatrixExprTrait.h:80
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:128
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Compile time check for column vector types.This type trait tests whether or not the given template ar...
Definition: IsColumnVector.h:80
Evaluation of the expression type of a dense matrix declsym operation.Via this type trait it is possi...
Definition: TDMatDeclSymExprTrait.h:75
Constraints on the storage order of matrix types.
typename TDVecDMatMultExprTrait< VT, MT >::Type TDVecDMatMultExprTrait_
Auxiliary alias declaration for the TDVecDMatMultExprTrait class template.The TDVecDMatMultExprTrait_...
Definition: TDVecDMatMultExprTrait.h:119
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:196
Header file for the exception macros of the math module.
DisableIf_< IsLower< MT >, const DMatDeclLowExpr< MT, SO > > decllow(const DenseMatrix< MT, SO > &dm)
Declares the given non-lower dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:842
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Evaluation of the expression type of a row operation. Via this type trait it is possible to evalu...
Definition: RowExprTrait.h:79
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:632
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:260
Header file for the DeclDiag functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:478
Compile time check for dense matrix types.This type trait tests whether or not the given template par...
Definition: IsDenseMatrix.h:78
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:284
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT >, IsDeclExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:128
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:458
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:388
Header file for the conjugate shim.
Header file for the IsNumeric type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Evaluation of the expression type of a transpose dense matrix/sparse vector multiplication. Via this type trait it is possible to evaluate the resulting expression type of a transpose dense matrix/sparse vector multiplication. Given the column-major dense matrix type MT and the non-transpose sparse vector type VT, the nested type Type corresponds to the resulting expression type. In case either MT is not a column-major dense matrix type or VT is not a non-transpose sparse vector type, the resulting data type Type is set to INVALID_TYPE.
Definition: TDMatSVecMultExprTrait.h:79
Header file for the IsSIMDCombinable type trait.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:197
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TDMatDMatMultExpr< MT1, MT2, SF, HF, LF, UF > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:279
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:287
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:83
Utility type for generic codes.
Header file for the TDMatDeclLowExprTrait class template.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:325
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:93
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:282
Compile time type negation. The Not class template negates the given compile time condition. In case the given condition would evaluate to true, the nested member enumeration is set to false and vice versa:
Definition: Not.h:70
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
Header file for the TDMatDeclSymExprTrait class template.
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename TDMatDVecMultExprTrait< MT, VT >::Type TDMatDVecMultExprTrait_
Auxiliary alias declaration for the TDMatDVecMultExprTrait class template.The TDMatDVecMultExprTrait_...
Definition: TDMatDVecMultExprTrait.h:120
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:223
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:468
Evaluation of the expression type of a dense vector/transpose dense matrix multiplication. Via this type trait it is possible to evaluate the resulting expression type of a dense vector/transpose dense matrix multiplication. Given the transpose dense vector type VT and the column-major dense matrix type MT, the nested type Type corresponds to the resulting expression type. In case either VT is not a transpose dense vector type or MT is not a column-major dense matrix type, the resulting data type Type is set to INVALID_TYPE.
Definition: TDVecTDMatMultExprTrait.h:79
Compile time check for complex types. This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:174
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:340
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:296
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
Header file for the TDMatDVecMultExprTrait class template.
Evaluation of the expression type of a transpose dense matrix/dense vector multiplication. Via this type trait it is possible to evaluate the resulting expression type of a transpose dense matrix/dense vector multiplication. Given the column-major dense matrix type MT and the non-transpose dense vector type VT, the nested type Type corresponds to the resulting expression type. In case either MT is not a column-major dense matrix type or VT is not a non-transpose dense vector type, the resulting data type Type is set to INVALID_TYPE.
Definition: TDMatDVecMultExprTrait.h:79
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:286
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix. The Columns type trait evaluates the num...
Definition: Columns.h:76
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Compile time evaluation of the number of rows of a matrix. The Rows type trait evaluates the number of...
Definition: Rows.h:76
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions. The RightOperand_ alias declaration provid...
Definition: Aliases.h:363
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:175
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:677
const DMatDMatMultExpr< T1, T2, false, false, false, false > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:7505
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
DisableIf_< IsDiagonal< MT >, const DMatDeclDiagExpr< MT, SO > > decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given non-diagonal dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:841
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:178
DisableIf_< IsUpper< MT >, const DMatDeclUppExpr< MT, SO > > declupp(const DenseMatrix< MT, SO > &dm)
Declares the given non-upper dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:842
Evaluation of the expression type of a column operation. Via this type trait it is possible to ev...
Definition: ColumnExprTrait.h:78
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.