DMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/dense/MMM.h>
54 #include <blaze/math/Exception.h>
69 #include <blaze/math/shims/Reset.h>
71 #include <blaze/math/SIMD.h>
102 #include <blaze/math/views/Check.h>
103 #include <blaze/system/BLAS.h>
104 #include <blaze/system/Blocking.h>
105 #include <blaze/system/Debugging.h>
107 #include <blaze/system/Thresholds.h>
110 #include <blaze/util/Assert.h>
111 #include <blaze/util/Complex.h>
114 #include <blaze/util/DisableIf.h>
115 #include <blaze/util/EnableIf.h>
118 #include <blaze/util/mpl/If.h>
119 #include <blaze/util/TrueType.h>
120 #include <blaze/util/Types.h>
129 
130 
131 namespace blaze {
132 
133 //=================================================================================================
134 //
135 // CLASS DMATDMATMULTEXPR
136 //
137 //=================================================================================================
138 
139 //*************************************************************************************************
146 template< typename MT1 // Type of the left-hand side dense matrix
147  , typename MT2 // Type of the right-hand side dense matrix
148  , bool SF // Symmetry flag
149  , bool HF // Hermitian flag
150  , bool LF // Lower flag
151  , bool UF > // Upper flag
153  : public MatMatMultExpr< DenseMatrix< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
154  , private Computation
155 {
156  private:
157  //**Type definitions****************************************************************************
164  //**********************************************************************************************
165 
166  //**********************************************************************************************
// Compile-time flag for the left-hand side operand: true when MT1 is a computation
// expression (IsComputation_v) or requires an evaluation (RequiresEvaluation_v).
168  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
169  //**********************************************************************************************
170 
171  //**********************************************************************************************
// Compile-time flag for the right-hand side operand: true when MT2 is a computation
// expression (IsComputation_v) or requires an evaluation (RequiresEvaluation_v).
173  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
174  //**********************************************************************************************
175 
176  //**********************************************************************************************
// Normalized structure flags derived from the SF/HF/LF/UF template arguments.
//   SYM  : SF is set and none of HF, LF, UF is set.
//   HERM : HF is set and neither LF nor UF is set.
//   LOW  : LF is set, or UF is combined with SF or HF.
//   UPP  : UF is set, or LF is combined with SF or HF.
// LOW and UPP can both be true at the same time (for instance SF together with LF).
177  static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
178  static constexpr bool HERM = ( HF && !( LF || UF ) );
179  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
180  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
181  //**********************************************************************************************
182 
183  //**********************************************************************************************
185 
// Helper variable template: true when T1 is a column-major matrix and at least one of
// T2 and T3 is symmetric.
// NOTE(review): presumably T1 is the target type and T2/T3 the operand types - confirm
// against the call sites.
190  template< typename T1, typename T2, typename T3 >
191  static constexpr bool CanExploitSymmetry_v =
192  ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
194  //**********************************************************************************************
195 
196  //**********************************************************************************************
198 
// Helper variable template: true when at least one operand has to be evaluated
// (evaluateLeft or evaluateRight) and CanExploitSymmetry_v<T1,T2,T3> does not hold.
202  template< typename T1, typename T2, typename T3 >
203  static constexpr bool IsEvaluationRequired_v =
204  ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
206  //**********************************************************************************************
207 
208  //**********************************************************************************************
210 
// Helper variable template deciding whether a BLAS kernel is applicable to the three
// given matrix types. All of the following must hold:
//  - BLAS mode and the BLAS matrix/matrix multiplication switch are both active;
//  - none of the SYM/HERM/LOW/UPP structure flags is set;
//  - all three types are contiguous; T1 offers mutable data access, T2 and T3 offer
//    constant data access;
//  - neither T2 nor T3 is diagonal;
//  - all three types are SIMD enabled;
//  - all three element types are BLAS compatible and identical to each other.
213  template< typename T1, typename T2, typename T3 >
214  static constexpr bool UseBlasKernel_v =
215  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
216  !SYM && !HERM && !LOW && !UPP &&
217  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
218  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
219  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
220  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
221  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
222  IsBLASCompatible_v< ElementType_t<T1> > &&
223  IsBLASCompatible_v< ElementType_t<T2> > &&
224  IsBLASCompatible_v< ElementType_t<T3> > &&
225  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
226  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
228  //**********************************************************************************************
229 
230  //**********************************************************************************************
232 
// Helper variable template deciding whether the vectorized default kernel is applicable.
// Requires: optimized kernels enabled, neither T2 nor T3 diagonal, all three types SIMD
// enabled, SIMD-combinable element types, and SIMD addition and multiplication available
// for the element types of T2 and T3.
// NOTE(review): the embedded listing numbers jump from 240 to 242 inside the
// IsSIMDCombinable_v argument list, so one source line appears to be missing from this
// listing - confirm against the original header.
235  template< typename T1, typename T2, typename T3 >
236  static constexpr bool UseVectorizedDefaultKernel_v =
237  ( useOptimizedKernels &&
238  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
239  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
240  IsSIMDCombinable_v< ElementType_t<T1>
242  , ElementType_t<T3> > &&
243  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
244  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
246  //**********************************************************************************************
247 
248  //**********************************************************************************************
250 
// Functor type selected from the normalized structure flags, tested in this order:
//   HERM         -> DeclHerm
//   SYM          -> DeclSym
//   LOW and UPP  -> DeclDiag
//   LOW only     -> DeclLow
//   UPP only     -> DeclUpp
//   none         -> Noop
253  using ForwardFunctor = If_t< HERM
254  , DeclHerm
255  , If_t< SYM
256  , DeclSym
257  , If_t< LOW
258  , If_t< UPP
259  , DeclDiag
260  , DeclLow >
261  , If_t< UPP
262  , DeclUpp
263  , Noop > > > >;
265  //**********************************************************************************************
266 
267  public:
268  //**Type definitions****************************************************************************
271 
274 
// Result type of the expression, selected via nested If_t on HERM, SYM, LOW and UPP; the
// innermost fallback is MultTrait<RT1,RT2>.
// NOTE(review): the embedded listing numbers skip 277, 279, 282, 283 and 285, so the
// alternatives chosen for the individual flags are not visible in this listing - confirm
// against the original header before relying on this declaration.
276  using ResultType = typename If_t< HERM
278  , If_t< SYM
280  , If_t< LOW
281  , If_t< UPP
284  , If_t< UPP
286  , MultTrait<RT1,RT2> > > > >::Type;
287 
// ReturnType: type returned by the element access functions (a const ElementType value).
// CompositeType: const ResultType.
292  using ReturnType = const ElementType;
293  using CompositeType = const ResultType;
294 
// Type of the left-hand side operand member: an expression type MT1 is held by const value,
// any other MT1 by const reference.
296  using LeftOperand = If_t< IsExpression_v<MT1>, const MT1, const MT1& >;
297 
// Type of the right-hand side operand member: an expression type MT2 is held by const value,
// any other MT2 by const reference.
299  using RightOperand = If_t< IsExpression_v<MT2>, const MT2, const MT2& >;
300 
303 
306  //**********************************************************************************************
307 
308  //**Compilation flags***************************************************************************
// Compilation flag for SIMD optimization: true when MT2 is not diagonal, both operand types
// are SIMD enabled, and SIMD addition and multiplication exist for the element types ET1/ET2.
310  static constexpr bool simdEnabled =
311  ( !IsDiagonal_v<MT2> &&
312  MT1::simdEnabled && MT2::simdEnabled &&
313  HasSIMDAdd_v<ET1,ET2> &&
314  HasSIMDMult_v<ET1,ET2> );
315 
// Compilation flag for SMP assignments.
// NOTE(review): the initializer (listing line 318) is missing from this listing, so the
// declaration is incomplete as shown - restore it from the original header.
317  static constexpr bool smpAssignable =
319  //**********************************************************************************************
320 
321  //**SIMD properties*****************************************************************************
// Number of elements per SIMD vector for ElementType, as reported by SIMDTrait.
323  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
324  //**********************************************************************************************
325 
326  //**Constructor*********************************************************************************
332  explicit inline DMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
333  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
334  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
335  {
336  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
337  }
338  //**********************************************************************************************
339 
340  //**Access operator*****************************************************************************
// Element access: computes entry (i,j) of the product lhs_ * rhs_ on demand.
//
// \param i Row index; must be less than lhs_.rows() (internal assertion only).
// \param j Column index; must be less than rhs_.columns() (internal assertion only).
// \return The computed element as ReturnType.
//
// Diagonal operands reduce the inner product to a single multiplication, triangular
// operands restrict it to the index range [begin,end), and all other operands use the
// complete row/column product.
347  inline ReturnType operator()( size_t i, size_t j ) const {
348  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
349  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
350 
// Diagonal left operand: only lhs_(i,i) is used from row i.
351  if( IsDiagonal_v<MT1> ) {
352  return lhs_(i,i) * rhs_(i,j);
353  }
// Diagonal right operand: only rhs_(j,j) is used from column j.
354  else if( IsDiagonal_v<MT2> ) {
355  return lhs_(i,j) * rhs_(j,j);
356  }
357  else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
// First inner index used: bounded below by i (i+1 if strictly upper) for an upper MT1
// and by j (j+1 if strictly lower) for a lower MT2; the larger bound wins if both apply.
358  const size_t begin( ( IsUpper_v<MT1> )
359  ?( ( IsLower_v<MT2> )
360  ?( max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
361  , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
362  :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
363  :( ( IsLower_v<MT2> )
364  ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
365  :( 0UL ) ) );
// One past the last inner index used: bounded above by i+1 (i if strictly lower) for a
// lower MT1 and by j+1 (j if strictly upper) for an upper MT2; the smaller bound wins.
366  const size_t end( ( IsLower_v<MT1> )
367  ?( ( IsUpper_v<MT2> )
368  ?( min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
369  , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
370  :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
371  :( ( IsUpper_v<MT2> )
372  ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
373  :( lhs_.columns() ) ) );
374 
// Empty range: a default-constructed element is returned.
375  if( begin >= end ) return ElementType();
376 
377  const size_t n( end - begin );
378 
// Product of the two sub-ranges [begin,begin+n) of row i of lhs_ and column j of rhs_.
379  return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
380  subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
381  }
382  else {
// General case: product of the complete row i of lhs_ and column j of rhs_.
383  return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
384  }
385  }
386  //**********************************************************************************************
387 
388  //**At function*********************************************************************************
396  inline ReturnType at( size_t i, size_t j ) const {
397  if( i >= lhs_.rows() ) {
398  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
399  }
400  if( j >= rhs_.columns() ) {
401  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
402  }
403  return (*this)(i,j);
404  }
405  //**********************************************************************************************
406 
407  //**Rows function*******************************************************************************
412  inline size_t rows() const noexcept {
413  return lhs_.rows();
414  }
415  //**********************************************************************************************
416 
417  //**Columns function****************************************************************************
422  inline size_t columns() const noexcept {
423  return rhs_.columns();
424  }
425  //**********************************************************************************************
426 
427  //**Left operand access*************************************************************************
432  inline LeftOperand leftOperand() const noexcept {
433  return lhs_;
434  }
435  //**********************************************************************************************
436 
437  //**Right operand access************************************************************************
442  inline RightOperand rightOperand() const noexcept {
443  return rhs_;
444  }
445  //**********************************************************************************************
446 
447  //**********************************************************************************************
453  template< typename T >
454  inline bool canAlias( const T* alias ) const noexcept {
455  return ( lhs_.canAlias( alias ) || rhs_.canAlias( alias ) );
456  }
457  //**********************************************************************************************
458 
459  //**********************************************************************************************
465  template< typename T >
466  inline bool isAliased( const T* alias ) const noexcept {
467  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
468  }
469  //**********************************************************************************************
470 
471  //**********************************************************************************************
476  inline bool isAligned() const noexcept {
477  return lhs_.isAligned() && rhs_.isAligned();
478  }
479  //**********************************************************************************************
480 
481  //**********************************************************************************************
// Reports whether the expression can be used in SMP assignments.
//
// As visible here, the result is true when
//  - BLAS mode or the BLAS matrix/matrix multiplication switch is off, or the result has
//    fewer than DMATDMATMULT_THRESHOLD elements, and
//  - the result has at least SMP_DMATDMATMULT_THRESHOLD elements, and
//  - neither operand type is diagonal.
// NOTE(review): the embedded listing numbers jump from 488 to 490 inside the first
// parenthesized condition, so one source line (possibly a further alternative of that
// disjunction) appears to be missing from this listing - confirm against the original header.
486  inline bool canSMPAssign() const noexcept {
487  return ( !BLAZE_BLAS_MODE ||
488  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
490  ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
491  ( rows() * columns() >= SMP_DMATDMATMULT_THRESHOLD ) &&
492  !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
493  }
494  //**********************************************************************************************
495 
496  private:
497  //**Member variables****************************************************************************
500  //**********************************************************************************************
501 
502  //**Assignment to dense matrices****************************************************************
// Assignment of the multiplication expression to a dense matrix.
//
// \param lhs The target dense matrix; its dimensions must match those of the expression
//            (internal assertions only).
// \param rhs The multiplication expression to be assigned.
//
// NOTE(review): the embedded listing numbers skip 518 (directly after the signature) and
// 520 (directly after the opening brace), so parts of this function - presumably a trailing
// return type and a leading statement - are missing from this listing. The types LT and RT
// used below are declared outside the visible source. Confirm against the original header.
515  template< typename MT // Type of the target dense matrix
516  , bool SO > // Storage order of the target dense matrix
517  friend inline auto assign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
519  {
521 
522  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
523  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
524 
// Empty target: nothing to compute. Empty inner dimension: the result is reset and no
// kernel is run.
525  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
526  return;
527  }
528  else if( rhs.lhs_.columns() == 0UL ) {
529  reset( ~lhs );
530  return;
531  }
532 
// Both operands are passed through serial() before being bound to LT/RT.
533  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
534  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
535 
536  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
537  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
538  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
539  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
540  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
541  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
542 
// The actual computation is delegated to the kernel selection.
543  DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
544  }
546  //**********************************************************************************************
547 
548  //**Assignment to dense matrices (kernel selection)*********************************************
559  template< typename MT3 // Type of the left-hand side target matrix
560  , typename MT4 // Type of the left-hand side matrix operand
561  , typename MT5 > // Type of the right-hand side matrix operand
562  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
563  {
564  if( ( IsDiagonal_v<MT5> ) ||
565  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
566  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
567  selectSmallAssignKernel( C, A, B );
568  else
569  selectBlasAssignKernel( C, A, B );
570  }
572  //**********************************************************************************************
573 
574  //**Default assignment to dense matrices (general/general)**************************************
// Default (non-vectorized) assignment kernel C = A * B for two non-diagonal operands.
//
// \param C The target matrix, written element-wise through C(i,j).
// \param A The left-hand side operand (M rows, K columns).
// \param B The right-hand side operand (N columns).
//
// Enabled only when neither MT4 nor MT5 is diagonal. Each row i of C is built in two steps:
// the first inner index kbegin initializes the row by plain assignment, the remaining inner
// indices accumulate into it. The IsUpper/IsLower traits of the operands and the
// SYM/HERM/LOW/UPP flags narrow the k and j ranges that are visited.
588  template< typename MT3 // Type of the left-hand side target matrix
589  , typename MT4 // Type of the left-hand side matrix operand
590  , typename MT5 > // Type of the right-hand side matrix operand
591  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
592  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
593  {
594  const size_t M( A.rows() );
595  const size_t N( B.columns() );
596  const size_t K( A.columns() );
597 
// Any structure flag implies a square result.
598  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
599 
600  for( size_t i=0UL; i<M; ++i )
601  {
// Inner index range [kbegin,kend) for row i, restricted by the triangular structure of MT4.
602  const size_t kbegin( ( IsUpper_v<MT4> )
603  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
604  :( 0UL ) );
605  const size_t kend( ( IsLower_v<MT4> )
606  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
607  :( K ) );
608  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
609 
// Empty inner range (tested for strictly triangular MT4 only): the whole row is reset.
610  if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
611  for( size_t j=0UL; j<N; ++j ) {
612  reset( C(i,j) );
613  }
614  continue;
615  }
616 
// First contribution (k == kbegin): elements in [jbegin,jend) are assigned, elements
// outside that range are reset where the structure traits or flags call for it.
617  {
618  const size_t jbegin( ( IsUpper_v<MT5> )
619  ?( ( IsStrictlyUpper_v<MT5> )
620  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
621  :( UPP ? max(i,kbegin) : kbegin ) )
622  :( UPP ? i : 0UL ) );
623  const size_t jend( ( IsLower_v<MT5> )
624  ?( ( IsStrictlyLower_v<MT5> )
625  ?( LOW ? min(i+1UL,kbegin) : kbegin )
626  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
627  :( LOW ? i+1UL : N ) );
628 
629  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
630  for( size_t j=0UL; j<jbegin; ++j ) {
631  reset( C(i,j) );
632  }
633  }
634  else if( IsStrictlyUpper_v<MT5> ) {
635  reset( C(i,0UL) );
636  }
637  for( size_t j=jbegin; j<jend; ++j ) {
638  C(i,j) = A(i,kbegin) * B(kbegin,j);
639  }
640  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
641  for( size_t j=jend; j<N; ++j ) {
642  reset( C(i,j) );
643  }
644  }
645  else if( IsStrictlyLower_v<MT5> ) {
646  reset( C(i,N-1UL) );
647  }
648  }
649 
// Remaining contributions (k > kbegin) are accumulated into the row.
650  for( size_t k=kbegin+1UL; k<kend; ++k )
651  {
652  const size_t jbegin( ( IsUpper_v<MT5> )
653  ?( ( IsStrictlyUpper_v<MT5> )
654  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
655  :( SYM || HERM || UPP ? max( i, k ) : k ) )
656  :( SYM || HERM || UPP ? i : 0UL ) );
657  const size_t jend( ( IsLower_v<MT5> )
658  ?( ( IsStrictlyLower_v<MT5> )
659  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
660  :( LOW ? min(i+1UL,k) : k ) )
661  :( LOW ? i+1UL : N ) );
662 
// With a structure flag set the column range may be empty or inverted; such k are skipped.
663  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
664  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
665 
666  for( size_t j=jbegin; j<jend; ++j ) {
667  C(i,j) += A(i,k) * B(k,j);
668  }
// For a lower MT5, element jend is assigned rather than accumulated.
// NOTE(review): presumably because column jend receives its first contribution at this
// k - confirm, in particular for the LOW case.
669  if( IsLower_v<MT5> ) {
670  C(i,jend) = A(i,k) * B(k,jend);
671  }
672  }
673  }
674 
// SYM/HERM: the strictly lower part is filled from the upper part (conjugated for HERM).
675  if( SYM || HERM ) {
676  for( size_t i=1UL; i<M; ++i ) {
677  for( size_t j=0UL; j<i; ++j ) {
678  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
679  }
680  }
681  }
682  }
684  //**********************************************************************************************
685 
686  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default assignment kernel C = A * B for a non-diagonal left operand and a diagonal right
// operand.
//
// \param C The target matrix, written element-wise through C(i,j).
// \param A The left-hand side operand.
// \param B The right-hand side (diagonal) operand; only B(j,j) is read.
//
// Each element is computed as C(i,j) = A(i,j) * B(j,j). For a triangular MT4 only the
// columns [jbegin,jend) of row i are computed and the remaining elements are reset.
// NOTE(review): the embedded listing numbers skip 706 directly after the opening brace,
// so one source line appears to be missing from this listing - confirm against the
// original header.
700  template< typename MT3 // Type of the left-hand side target matrix
701  , typename MT4 // Type of the left-hand side matrix operand
702  , typename MT5 > // Type of the right-hand side matrix operand
703  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
704  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
705  {
707 
708  const size_t M( A.rows() );
709  const size_t N( B.columns() );
710 
711  for( size_t i=0UL; i<M; ++i )
712  {
// Column range of row i that is computed, given the triangular structure of MT4.
713  const size_t jbegin( ( IsUpper_v<MT4> )
714  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
715  :( 0UL ) );
716  const size_t jend( ( IsLower_v<MT4> )
717  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
718  :( N ) );
719  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
720 
// Upper MT4: elements left of the computed range are reset.
721  if( IsUpper_v<MT4> ) {
722  for( size_t j=0UL; j<jbegin; ++j ) {
723  reset( C(i,j) );
724  }
725  }
726  for( size_t j=jbegin; j<jend; ++j ) {
727  C(i,j) = A(i,j) * B(j,j);
728  }
// Lower MT4: elements right of the computed range are reset.
729  if( IsLower_v<MT4> ) {
730  for( size_t j=jend; j<N; ++j ) {
731  reset( C(i,j) );
732  }
733  }
734  }
735  }
737  //**********************************************************************************************
738 
739  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default assignment kernel C = A * B for a diagonal left operand and a non-diagonal right
// operand.
//
// \param C The target matrix, written element-wise through C(i,j).
// \param A The left-hand side (diagonal) operand; only A(i,i) is read.
// \param B The right-hand side operand.
//
// Each element is computed as C(i,j) = A(i,i) * B(i,j). For a triangular MT5 only the
// columns [jbegin,jend) of row i are computed and the remaining elements are reset.
// NOTE(review): the embedded listing numbers skip 759 directly after the opening brace,
// so one source line appears to be missing from this listing - confirm against the
// original header.
753  template< typename MT3 // Type of the left-hand side target matrix
754  , typename MT4 // Type of the left-hand side matrix operand
755  , typename MT5 > // Type of the right-hand side matrix operand
756  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
757  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
758  {
760 
761  const size_t M( A.rows() );
762  const size_t N( B.columns() );
763 
764  for( size_t i=0UL; i<M; ++i )
765  {
// Column range of row i that is computed, given the triangular structure of MT5.
766  const size_t jbegin( ( IsUpper_v<MT5> )
767  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
768  :( 0UL ) );
769  const size_t jend( ( IsLower_v<MT5> )
770  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
771  :( N ) );
772  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
773 
// Upper MT5: elements left of the computed range are reset.
774  if( IsUpper_v<MT5> ) {
775  for( size_t j=0UL; j<jbegin; ++j ) {
776  reset( C(i,j) );
777  }
778  }
779  for( size_t j=jbegin; j<jend; ++j ) {
780  C(i,j) = A(i,i) * B(i,j);
781  }
// Lower MT5: elements right of the computed range are reset.
782  if( IsLower_v<MT5> ) {
783  for( size_t j=jend; j<N; ++j ) {
784  reset( C(i,j) );
785  }
786  }
787  }
788  }
790  //**********************************************************************************************
791 
792  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default assignment kernel C = A * B for two diagonal operands.
//
// \param C The target matrix; it is reset first, then only its diagonal is written.
// \param A The left-hand side (diagonal) operand; only A(i,i) is read.
// \param B The right-hand side (diagonal) operand; only B(i,i) is read.
//
// NOTE(review): the embedded listing numbers skip 812 directly after the opening brace,
// so one source line appears to be missing from this listing - confirm against the
// original header.
806  template< typename MT3 // Type of the left-hand side target matrix
807  , typename MT4 // Type of the left-hand side matrix operand
808  , typename MT5 > // Type of the right-hand side matrix operand
809  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
810  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
811  {
813 
814  reset( C );
815 
// Only the diagonal elements are computed after the reset.
816  for( size_t i=0UL; i<A.rows(); ++i ) {
817  C(i,i) = A(i,i) * B(i,i);
818  }
819  }
821  //**********************************************************************************************
822 
823  //**Default assignment to dense matrices (small matrices)***************************************
836  template< typename MT3 // Type of the left-hand side target matrix
837  , typename MT4 // Type of the left-hand side matrix operand
838  , typename MT5 > // Type of the right-hand side matrix operand
839  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
840  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
841  {
842  selectDefaultAssignKernel( C, A, B );
843  }
845  //**********************************************************************************************
846 
847  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
862  template< typename MT3 // Type of the left-hand side target matrix
863  , typename MT4 // Type of the left-hand side matrix operand
864  , typename MT5 > // Type of the right-hand side matrix operand
865  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
866  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
867  {
868  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
869 
870  const size_t M( A.rows() );
871  const size_t N( B.columns() );
872  const size_t K( A.columns() );
873 
874  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
875 
876  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
877  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
878 
879  if( LOW && UPP && N > SIMDSIZE*3UL ) {
880  reset( C );
881  }
882 
883  {
884  size_t j( 0UL );
885 
886  if( IsIntegral_v<ElementType> )
887  {
888  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
889  for( size_t i=0UL; i<M; ++i )
890  {
891  const size_t kbegin( ( IsUpper_v<MT4> )
892  ?( ( IsLower_v<MT5> )
893  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
894  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
895  :( IsLower_v<MT5> ? j : 0UL ) );
896  const size_t kend( ( IsLower_v<MT4> )
897  ?( ( IsUpper_v<MT5> )
898  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
899  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
900  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
901 
902  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
903 
904  for( size_t k=kbegin; k<kend; ++k ) {
905  const SIMDType a1( set( A(i,k) ) );
906  xmm1 += a1 * B.load(k,j );
907  xmm2 += a1 * B.load(k,j+SIMDSIZE );
908  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
909  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
910  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
911  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
912  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
913  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
914  }
915 
916  C.store( i, j , xmm1 );
917  C.store( i, j+SIMDSIZE , xmm2 );
918  C.store( i, j+SIMDSIZE*2UL, xmm3 );
919  C.store( i, j+SIMDSIZE*3UL, xmm4 );
920  C.store( i, j+SIMDSIZE*4UL, xmm5 );
921  C.store( i, j+SIMDSIZE*5UL, xmm6 );
922  C.store( i, j+SIMDSIZE*6UL, xmm7 );
923  C.store( i, j+SIMDSIZE*7UL, xmm8 );
924  }
925  }
926  }
927 
928  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
929  {
930  size_t i( 0UL );
931 
932  for( ; (i+2UL) <= M; i+=2UL )
933  {
934  const size_t kbegin( ( IsUpper_v<MT4> )
935  ?( ( IsLower_v<MT5> )
936  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
937  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
938  :( IsLower_v<MT5> ? j : 0UL ) );
939  const size_t kend( ( IsLower_v<MT4> )
940  ?( ( IsUpper_v<MT5> )
941  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
942  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
943  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
944 
945  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
946 
947  for( size_t k=kbegin; k<kend; ++k ) {
948  const SIMDType a1( set( A(i ,k) ) );
949  const SIMDType a2( set( A(i+1UL,k) ) );
950  const SIMDType b1( B.load(k,j ) );
951  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
952  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
953  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
954  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
955  xmm1 += a1 * b1;
956  xmm2 += a1 * b2;
957  xmm3 += a1 * b3;
958  xmm4 += a1 * b4;
959  xmm5 += a1 * b5;
960  xmm6 += a2 * b1;
961  xmm7 += a2 * b2;
962  xmm8 += a2 * b3;
963  xmm9 += a2 * b4;
964  xmm10 += a2 * b5;
965  }
966 
967  C.store( i , j , xmm1 );
968  C.store( i , j+SIMDSIZE , xmm2 );
969  C.store( i , j+SIMDSIZE*2UL, xmm3 );
970  C.store( i , j+SIMDSIZE*3UL, xmm4 );
971  C.store( i , j+SIMDSIZE*4UL, xmm5 );
972  C.store( i+1UL, j , xmm6 );
973  C.store( i+1UL, j+SIMDSIZE , xmm7 );
974  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
975  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
976  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
977  }
978 
979  if( i < M )
980  {
981  const size_t kbegin( ( IsUpper_v<MT4> )
982  ?( ( IsLower_v<MT5> )
983  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
984  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
985  :( IsLower_v<MT5> ? j : 0UL ) );
986  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
987 
988  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
989 
990  for( size_t k=kbegin; k<kend; ++k ) {
991  const SIMDType a1( set( A(i,k) ) );
992  xmm1 += a1 * B.load(k,j );
993  xmm2 += a1 * B.load(k,j+SIMDSIZE );
994  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
995  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
996  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
997  }
998 
999  C.store( i, j , xmm1 );
1000  C.store( i, j+SIMDSIZE , xmm2 );
1001  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1002  C.store( i, j+SIMDSIZE*3UL, xmm4 );
1003  C.store( i, j+SIMDSIZE*4UL, xmm5 );
1004  }
1005  }
1006 
1007  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1008  {
1009  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
1010  size_t i( LOW ? j : 0UL );
1011 
1012  for( ; (i+2UL) <= iend; i+=2UL )
1013  {
1014  const size_t kbegin( ( IsUpper_v<MT4> )
1015  ?( ( IsLower_v<MT5> )
1016  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1017  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1018  :( IsLower_v<MT5> ? j : 0UL ) );
1019  const size_t kend( ( IsLower_v<MT4> )
1020  ?( ( IsUpper_v<MT5> )
1021  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1022  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1023  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
1024 
1025  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1026 
1027  for( size_t k=kbegin; k<kend; ++k ) {
1028  const SIMDType a1( set( A(i ,k) ) );
1029  const SIMDType a2( set( A(i+1UL,k) ) );
1030  const SIMDType b1( B.load(k,j ) );
1031  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1032  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1033  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1034  xmm1 += a1 * b1;
1035  xmm2 += a1 * b2;
1036  xmm3 += a1 * b3;
1037  xmm4 += a1 * b4;
1038  xmm5 += a2 * b1;
1039  xmm6 += a2 * b2;
1040  xmm7 += a2 * b3;
1041  xmm8 += a2 * b4;
1042  }
1043 
1044  C.store( i , j , xmm1 );
1045  C.store( i , j+SIMDSIZE , xmm2 );
1046  C.store( i , j+SIMDSIZE*2UL, xmm3 );
1047  C.store( i , j+SIMDSIZE*3UL, xmm4 );
1048  C.store( i+1UL, j , xmm5 );
1049  C.store( i+1UL, j+SIMDSIZE , xmm6 );
1050  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1051  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1052  }
1053 
1054  if( i < iend )
1055  {
1056  const size_t kbegin( ( IsUpper_v<MT4> )
1057  ?( ( IsLower_v<MT5> )
1058  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1059  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1060  :( IsLower_v<MT5> ? j : 0UL ) );
1061  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1062 
1063  SIMDType xmm1, xmm2, xmm3, xmm4;
1064 
1065  for( size_t k=kbegin; k<kend; ++k ) {
1066  const SIMDType a1( set( A(i,k) ) );
1067  xmm1 += a1 * B.load(k,j );
1068  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1069  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1070  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1071  }
1072 
1073  C.store( i, j , xmm1 );
1074  C.store( i, j+SIMDSIZE , xmm2 );
1075  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1076  C.store( i, j+SIMDSIZE*3UL, xmm4 );
1077  }
1078  }
1079 
1080  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1081  {
1082  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
1083  size_t i( LOW ? j : 0UL );
1084 
1085  for( ; (i+2UL) <= iend; i+=2UL )
1086  {
1087  const size_t kbegin( ( IsUpper_v<MT4> )
1088  ?( ( IsLower_v<MT5> )
1089  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1090  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1091  :( IsLower_v<MT5> ? j : 0UL ) );
1092  const size_t kend( ( IsLower_v<MT4> )
1093  ?( ( IsUpper_v<MT5> )
1094  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
1095  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1096  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
1097 
1098  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1099 
1100  for( size_t k=kbegin; k<kend; ++k ) {
1101  const SIMDType a1( set( A(i ,k) ) );
1102  const SIMDType a2( set( A(i+1UL,k) ) );
1103  const SIMDType b1( B.load(k,j ) );
1104  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1105  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1106  xmm1 += a1 * b1;
1107  xmm2 += a1 * b2;
1108  xmm3 += a1 * b3;
1109  xmm4 += a2 * b1;
1110  xmm5 += a2 * b2;
1111  xmm6 += a2 * b3;
1112  }
1113 
1114  C.store( i , j , xmm1 );
1115  C.store( i , j+SIMDSIZE , xmm2 );
1116  C.store( i , j+SIMDSIZE*2UL, xmm3 );
1117  C.store( i+1UL, j , xmm4 );
1118  C.store( i+1UL, j+SIMDSIZE , xmm5 );
1119  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1120  }
1121 
1122  if( i < iend )
1123  {
1124  const size_t kbegin( ( IsUpper_v<MT4> )
1125  ?( ( IsLower_v<MT5> )
1126  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1127  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1128  :( IsLower_v<MT5> ? j : 0UL ) );
1129  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
1130 
1131  SIMDType xmm1, xmm2, xmm3;
1132 
1133  for( size_t k=kbegin; k<kend; ++k ) {
1134  const SIMDType a1( set( A(i,k) ) );
1135  xmm1 += a1 * B.load(k,j );
1136  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1137  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1138  }
1139 
1140  C.store( i, j , xmm1 );
1141  C.store( i, j+SIMDSIZE , xmm2 );
1142  C.store( i, j+SIMDSIZE*2UL, xmm3 );
1143  }
1144  }
1145 
1146  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1147  {
1148  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
1149  size_t i( LOW ? j : 0UL );
1150 
1151  for( ; (i+4UL) <= iend; i+=4UL )
1152  {
1153  const size_t kbegin( ( IsUpper_v<MT4> )
1154  ?( ( IsLower_v<MT5> )
1155  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1156  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1157  :( IsLower_v<MT5> ? j : 0UL ) );
1158  const size_t kend( ( IsLower_v<MT4> )
1159  ?( ( IsUpper_v<MT5> )
1160  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
1161  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1162  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1163 
1164  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1165 
1166  for( size_t k=kbegin; k<kend; ++k ) {
1167  const SIMDType a1( set( A(i ,k) ) );
1168  const SIMDType a2( set( A(i+1UL,k) ) );
1169  const SIMDType a3( set( A(i+2UL,k) ) );
1170  const SIMDType a4( set( A(i+3UL,k) ) );
1171  const SIMDType b1( B.load(k,j ) );
1172  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1173  xmm1 += a1 * b1;
1174  xmm2 += a1 * b2;
1175  xmm3 += a2 * b1;
1176  xmm4 += a2 * b2;
1177  xmm5 += a3 * b1;
1178  xmm6 += a3 * b2;
1179  xmm7 += a4 * b1;
1180  xmm8 += a4 * b2;
1181  }
1182 
1183  C.store( i , j , xmm1 );
1184  C.store( i , j+SIMDSIZE, xmm2 );
1185  C.store( i+1UL, j , xmm3 );
1186  C.store( i+1UL, j+SIMDSIZE, xmm4 );
1187  C.store( i+2UL, j , xmm5 );
1188  C.store( i+2UL, j+SIMDSIZE, xmm6 );
1189  C.store( i+3UL, j , xmm7 );
1190  C.store( i+3UL, j+SIMDSIZE, xmm8 );
1191  }
1192 
1193  for( ; (i+3UL) <= iend; i+=3UL )
1194  {
1195  const size_t kbegin( ( IsUpper_v<MT4> )
1196  ?( ( IsLower_v<MT5> )
1197  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1198  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1199  :( IsLower_v<MT5> ? j : 0UL ) );
1200  const size_t kend( ( IsLower_v<MT4> )
1201  ?( ( IsUpper_v<MT5> )
1202  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
1203  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1204  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1205 
1206  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1207 
1208  for( size_t k=kbegin; k<kend; ++k ) {
1209  const SIMDType a1( set( A(i ,k) ) );
1210  const SIMDType a2( set( A(i+1UL,k) ) );
1211  const SIMDType a3( set( A(i+2UL,k) ) );
1212  const SIMDType b1( B.load(k,j ) );
1213  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1214  xmm1 += a1 * b1;
1215  xmm2 += a1 * b2;
1216  xmm3 += a2 * b1;
1217  xmm4 += a2 * b2;
1218  xmm5 += a3 * b1;
1219  xmm6 += a3 * b2;
1220  }
1221 
1222  C.store( i , j , xmm1 );
1223  C.store( i , j+SIMDSIZE, xmm2 );
1224  C.store( i+1UL, j , xmm3 );
1225  C.store( i+1UL, j+SIMDSIZE, xmm4 );
1226  C.store( i+2UL, j , xmm5 );
1227  C.store( i+2UL, j+SIMDSIZE, xmm6 );
1228  }
1229 
1230  for( ; (i+2UL) <= iend; i+=2UL )
1231  {
1232  const size_t kbegin( ( IsUpper_v<MT4> )
1233  ?( ( IsLower_v<MT5> )
1234  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1235  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1236  :( IsLower_v<MT5> ? j : 0UL ) );
1237  const size_t kend( ( IsLower_v<MT4> )
1238  ?( ( IsUpper_v<MT5> )
1239  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1240  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1241  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
1242 
1243  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1244  size_t k( kbegin );
1245 
1246  for( ; (k+2UL) <= kend; k+=2UL ) {
1247  const SIMDType a1( set( A(i ,k ) ) );
1248  const SIMDType a2( set( A(i+1UL,k ) ) );
1249  const SIMDType a3( set( A(i ,k+1UL) ) );
1250  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
1251  const SIMDType b1( B.load(k ,j ) );
1252  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
1253  const SIMDType b3( B.load(k+1UL,j ) );
1254  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
1255  xmm1 += a1 * b1;
1256  xmm2 += a1 * b2;
1257  xmm3 += a2 * b1;
1258  xmm4 += a2 * b2;
1259  xmm5 += a3 * b3;
1260  xmm6 += a3 * b4;
1261  xmm7 += a4 * b3;
1262  xmm8 += a4 * b4;
1263  }
1264 
1265  for( ; k<kend; ++k ) {
1266  const SIMDType a1( set( A(i ,k) ) );
1267  const SIMDType a2( set( A(i+1UL,k) ) );
1268  const SIMDType b1( B.load(k,j ) );
1269  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1270  xmm1 += a1 * b1;
1271  xmm2 += a1 * b2;
1272  xmm3 += a2 * b1;
1273  xmm4 += a2 * b2;
1274  }
1275 
1276  C.store( i , j , xmm1+xmm5 );
1277  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
1278  C.store( i+1UL, j , xmm3+xmm7 );
1279  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
1280  }
1281 
1282  if( i < iend )
1283  {
1284  const size_t kbegin( ( IsUpper_v<MT4> )
1285  ?( ( IsLower_v<MT5> )
1286  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1287  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1288  :( IsLower_v<MT5> ? j : 0UL ) );
1289  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1290 
1291  SIMDType xmm1, xmm2, xmm3, xmm4;
1292  size_t k( kbegin );
1293 
1294  for( ; (k+2UL) <= kend; k+=2UL ) {
1295  const SIMDType a1( set( A(i,k ) ) );
1296  const SIMDType a2( set( A(i,k+1UL) ) );
1297  xmm1 += a1 * B.load(k ,j );
1298  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
1299  xmm3 += a2 * B.load(k+1UL,j );
1300  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
1301  }
1302 
1303  for( ; k<kend; ++k ) {
1304  const SIMDType a1( set( A(i,k) ) );
1305  xmm1 += a1 * B.load(k,j );
1306  xmm2 += a1 * B.load(k,j+SIMDSIZE);
1307  }
1308 
1309  C.store( i, j , xmm1+xmm3 );
1310  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
1311  }
1312  }
1313 
1314  for( ; j<jpos; j+=SIMDSIZE )
1315  {
1316  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
1317  size_t i( LOW ? j : 0UL );
1318 
1319  for( ; (i+4UL) <= iend; i+=4UL )
1320  {
1321  const size_t kbegin( ( IsUpper_v<MT4> )
1322  ?( ( IsLower_v<MT5> )
1323  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1324  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1325  :( IsLower_v<MT5> ? j : 0UL ) );
1326  const size_t kend( ( IsLower_v<MT4> )
1327  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1328  :( K ) );
1329 
1330  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1331  size_t k( kbegin );
1332 
1333  for( ; (k+2UL) <= kend; k+=2UL ) {
1334  const SIMDType b1( B.load(k ,j) );
1335  const SIMDType b2( B.load(k+1UL,j) );
1336  xmm1 += set( A(i ,k ) ) * b1;
1337  xmm2 += set( A(i+1UL,k ) ) * b1;
1338  xmm3 += set( A(i+2UL,k ) ) * b1;
1339  xmm4 += set( A(i+3UL,k ) ) * b1;
1340  xmm5 += set( A(i ,k+1UL) ) * b2;
1341  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
1342  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
1343  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
1344  }
1345 
1346  for( ; k<kend; ++k ) {
1347  const SIMDType b1( B.load(k,j) );
1348  xmm1 += set( A(i ,k) ) * b1;
1349  xmm2 += set( A(i+1UL,k) ) * b1;
1350  xmm3 += set( A(i+2UL,k) ) * b1;
1351  xmm4 += set( A(i+3UL,k) ) * b1;
1352  }
1353 
1354  C.store( i , j, xmm1+xmm5 );
1355  C.store( i+1UL, j, xmm2+xmm6 );
1356  C.store( i+2UL, j, xmm3+xmm7 );
1357  C.store( i+3UL, j, xmm4+xmm8 );
1358  }
1359 
1360  for( ; (i+3UL) <= iend; i+=3UL )
1361  {
1362  const size_t kbegin( ( IsUpper_v<MT4> )
1363  ?( ( IsLower_v<MT5> )
1364  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1365  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1366  :( IsLower_v<MT5> ? j : 0UL ) );
1367  const size_t kend( ( IsLower_v<MT4> )
1368  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1369  :( K ) );
1370 
1371  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1372  size_t k( kbegin );
1373 
1374  for( ; (k+2UL) <= kend; k+=2UL ) {
1375  const SIMDType b1( B.load(k ,j) );
1376  const SIMDType b2( B.load(k+1UL,j) );
1377  xmm1 += set( A(i ,k ) ) * b1;
1378  xmm2 += set( A(i+1UL,k ) ) * b1;
1379  xmm3 += set( A(i+2UL,k ) ) * b1;
1380  xmm4 += set( A(i ,k+1UL) ) * b2;
1381  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
1382  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
1383  }
1384 
1385  for( ; k<kend; ++k ) {
1386  const SIMDType b1( B.load(k,j) );
1387  xmm1 += set( A(i ,k) ) * b1;
1388  xmm2 += set( A(i+1UL,k) ) * b1;
1389  xmm3 += set( A(i+2UL,k) ) * b1;
1390  }
1391 
1392  C.store( i , j, xmm1+xmm4 );
1393  C.store( i+1UL, j, xmm2+xmm5 );
1394  C.store( i+2UL, j, xmm3+xmm6 );
1395  }
1396 
1397  for( ; (i+2UL) <= iend; i+=2UL )
1398  {
1399  const size_t kbegin( ( IsUpper_v<MT4> )
1400  ?( ( IsLower_v<MT5> )
1401  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1402  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1403  :( IsLower_v<MT5> ? j : 0UL ) );
1404  const size_t kend( ( IsLower_v<MT4> )
1405  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1406  :( K ) );
1407 
1408  SIMDType xmm1, xmm2, xmm3, xmm4;
1409  size_t k( kbegin );
1410 
1411  for( ; (k+2UL) <= kend; k+=2UL ) {
1412  const SIMDType b1( B.load(k ,j) );
1413  const SIMDType b2( B.load(k+1UL,j) );
1414  xmm1 += set( A(i ,k ) ) * b1;
1415  xmm2 += set( A(i+1UL,k ) ) * b1;
1416  xmm3 += set( A(i ,k+1UL) ) * b2;
1417  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
1418  }
1419 
1420  for( ; k<kend; ++k ) {
1421  const SIMDType b1( B.load(k,j) );
1422  xmm1 += set( A(i ,k) ) * b1;
1423  xmm2 += set( A(i+1UL,k) ) * b1;
1424  }
1425 
1426  C.store( i , j, xmm1+xmm3 );
1427  C.store( i+1UL, j, xmm2+xmm4 );
1428  }
1429 
1430  if( i < iend )
1431  {
1432  const size_t kbegin( ( IsUpper_v<MT4> )
1433  ?( ( IsLower_v<MT5> )
1434  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1435  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1436  :( IsLower_v<MT5> ? j : 0UL ) );
1437 
1438  SIMDType xmm1, xmm2;
1439  size_t k( kbegin );
1440 
1441  for( ; (k+2UL) <= K; k+=2UL ) {
1442  xmm1 += set( A(i,k ) ) * B.load(k ,j);
1443  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
1444  }
1445 
1446  for( ; k<K; ++k ) {
1447  xmm1 += set( A(i,k) ) * B.load(k,j);
1448  }
1449 
1450  C.store( i, j, xmm1+xmm2 );
1451  }
1452  }
1453 
1454  for( ; remainder && j<N; ++j )
1455  {
1456  size_t i( LOW && UPP ? j : 0UL );
1457 
1458  for( ; (i+2UL) <= M; i+=2UL )
1459  {
1460  const size_t kbegin( ( IsUpper_v<MT4> )
1461  ?( ( IsLower_v<MT5> )
1462  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1463  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1464  :( IsLower_v<MT5> ? j : 0UL ) );
1465  const size_t kend( ( IsLower_v<MT4> )
1466  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1467  :( K ) );
1468 
1469  ElementType value1{};
1470  ElementType value2{};
1471 
1472  for( size_t k=kbegin; k<kend; ++k ) {
1473  value1 += A(i ,k) * B(k,j);
1474  value2 += A(i+1UL,k) * B(k,j);
1475  }
1476 
1477  C(i ,j) = value1;
1478  C(i+1UL,j) = value2;
1479  }
1480 
1481  if( i < M )
1482  {
1483  const size_t kbegin( ( IsUpper_v<MT4> )
1484  ?( ( IsLower_v<MT5> )
1485  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1486  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1487  :( IsLower_v<MT5> ? j : 0UL ) );
1488 
1489  ElementType value{};
1490 
1491  for( size_t k=kbegin; k<K; ++k ) {
1492  value += A(i,k) * B(k,j);
1493  }
1494 
1495  C(i,j) = value;
1496  }
1497  }
1498  }
1499 
1500  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1501  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1502  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1503  for( size_t j=0UL; j<jend; ++j ) {
1504  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
1505  }
1506  }
1507  }
1508  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1509  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1510  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1511  for( size_t i=0UL; i<iend; ++i ) {
1512  reset( C(i,j) );
1513  }
1514  }
1515  }
1516  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1517  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1518  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1519  for( size_t j=0UL; j<jend; ++j ) {
1520  reset( C(i,j) );
1521  }
1522  }
1523  }
1524  }
1526  //**********************************************************************************************
1527 
1528  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
1543  template< typename MT3 // Type of the left-hand side target matrix
1544  , typename MT4 // Type of the left-hand side matrix operand
1545  , typename MT5 > // Type of the right-hand side matrix operand
1546  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1547  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1548  {
1553 
1554  const ForwardFunctor fwd;
1555 
1556  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
1557  const OppositeType_t<MT4> tmp( serial( A ) );
1558  assign( C, fwd( tmp * B ) );
1559  }
1560  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
1561  const OppositeType_t<MT5> tmp( serial( B ) );
1562  assign( C, fwd( A * tmp ) );
1563  }
1564  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1565  const OppositeType_t<MT4> tmp( serial( A ) );
1566  assign( C, fwd( tmp * B ) );
1567  }
1568  else {
1569  const OppositeType_t<MT5> tmp( serial( B ) );
1570  assign( C, fwd( A * tmp ) );
1571  }
1572  }
1574  //**********************************************************************************************
1575 
1576  //**Default assignment to dense matrices (large matrices)***************************************
1589  template< typename MT3 // Type of the left-hand side target matrix
1590  , typename MT4 // Type of the left-hand side matrix operand
1591  , typename MT5 > // Type of the right-hand side matrix operand
1592  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1593  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1594  {
1595  selectDefaultAssignKernel( C, A, B );
1596  }
1598  //**********************************************************************************************
1599 
1600  //**Vectorized default assignment to dense matrices (large matrices)****************************
1614  template< typename MT3 // Type of the left-hand side target matrix
1615  , typename MT4 // Type of the left-hand side matrix operand
1616  , typename MT5 > // Type of the right-hand side matrix operand
1617  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1618  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1619  {
1620  if( SYM )
1621  smmm( C, A, B, ElementType(1) );
1622  else if( HERM )
1623  hmmm( C, A, B, ElementType(1) );
1624  else if( LOW )
1625  lmmm( C, A, B, ElementType(1), ElementType(0) );
1626  else if( UPP )
1627  ummm( C, A, B, ElementType(1), ElementType(0) );
1628  else
1629  mmm( C, A, B, ElementType(1), ElementType(0) );
1630  }
1632  //**********************************************************************************************
1633 
1634  //**BLAS-based assignment to dense matrices (default)*******************************************
1647  template< typename MT3 // Type of the left-hand side target matrix
1648  , typename MT4 // Type of the left-hand side matrix operand
1649  , typename MT5 > // Type of the right-hand side matrix operand
1650  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1651  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1652  {
1653  selectLargeAssignKernel( C, A, B );
1654  }
1656  //**********************************************************************************************
1657 
1658  //**BLAS-based assignment to dense matrices*****************************************************
1659 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1660 
1672  template< typename MT3 // Type of the left-hand side target matrix
1673  , typename MT4 // Type of the left-hand side matrix operand
1674  , typename MT5 > // Type of the right-hand side matrix operand
1675  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1676  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1677  {
1678  using ET = ElementType_t<MT3>;
1679 
1680  if( IsTriangular_v<MT4> ) {
1681  assign( C, B );
1682  trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
1683  }
1684  else if( IsTriangular_v<MT5> ) {
1685  assign( C, A );
1686  trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
1687  }
1688  else {
1689  gemm( C, A, B, ET(1), ET(0) );
1690  }
1691  }
1693 #endif
1694  //**********************************************************************************************
1695 
1696  //**Assignment to sparse matrices***************************************************************
1709  template< typename MT // Type of the target sparse matrix
1710  , bool SO > // Storage order of the target sparse matrix
1711  friend inline auto assign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
1712  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1713  {
1715 
1716  using TmpType = If_t< SO, OppositeType, ResultType >;
1717 
1724 
1725  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1726  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1727 
1728  const ForwardFunctor fwd;
1729 
1730  const TmpType tmp( serial( rhs ) );
1731  assign( ~lhs, fwd( tmp ) );
1732  }
1734  //**********************************************************************************************
1735 
1736  //**Restructuring assignment to column-major matrices*******************************************
1751  template< typename MT > // Type of the target matrix
1752  friend inline auto assign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
1753  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1754  {
1756 
1758 
1759  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1760  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1761 
1762  const ForwardFunctor fwd;
1763 
1764  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
1765  assign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
1766  else if( IsSymmetric_v<MT1> )
1767  assign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
1768  else
1769  assign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
1770  }
1772  //**********************************************************************************************
1773 
1774  //**Addition assignment to dense matrices*******************************************************
1787  template< typename MT // Type of the target dense matrix
1788  , bool SO > // Storage order of the target dense matrix
1789  friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
1790  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1791  {
1793 
1794  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1795  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1796 
1797  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1798  return;
1799  }
1800 
1801  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1802  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1803 
1804  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1805  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1806  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1807  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1808  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1809  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1810 
1811  DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1812  }
1814  //**********************************************************************************************
1815 
1816  //**Addition assignment to dense matrices (kernel selection)************************************
1827  template< typename MT3 // Type of the left-hand side target matrix
1828  , typename MT4 // Type of the left-hand side matrix operand
1829  , typename MT5 > // Type of the right-hand side matrix operand
1830  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1831  {
1832  if( ( IsDiagonal_v<MT5> ) ||
1833  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
1834  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
1835  selectSmallAddAssignKernel( C, A, B );
1836  else
1837  selectBlasAddAssignKernel( C, A, B );
1838  }
1840  //**********************************************************************************************
1841 
1842  //**Default addition assignment to dense matrices (general/general)*****************************
1856  template< typename MT3 // Type of the left-hand side target matrix
1857  , typename MT4 // Type of the left-hand side matrix operand
1858  , typename MT5 > // Type of the right-hand side matrix operand
1859  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1860  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1861  {
1862  const size_t M( A.rows() );
1863  const size_t N( B.columns() );
1864  const size_t K( A.columns() );
1865 
1866  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1867 
1868  for( size_t i=0UL; i<M; ++i )
1869  {
1870  const size_t kbegin( ( IsUpper_v<MT4> )
1871  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1872  :( 0UL ) );
1873  const size_t kend( ( IsLower_v<MT4> )
1874  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
1875  :( K ) );
1876  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1877 
1878  for( size_t k=kbegin; k<kend; ++k )
1879  {
1880  const size_t jbegin( ( IsUpper_v<MT5> )
1881  ?( ( IsStrictlyUpper_v<MT5> )
1882  ?( UPP ? max(i,k+1UL) : k+1UL )
1883  :( UPP ? max(i,k) : k ) )
1884  :( UPP ? i : 0UL ) );
1885  const size_t jend( ( IsLower_v<MT5> )
1886  ?( ( IsStrictlyLower_v<MT5> )
1887  ?( LOW ? min(i+1UL,k) : k )
1888  :( LOW ? min(i,k)+1UL : k+1UL ) )
1889  :( LOW ? i+1UL : N ) );
1890 
1891  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
1892  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1893 
1894  const size_t jnum( jend - jbegin );
1895  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1896 
1897  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1898  C(i,j ) += A(i,k) * B(k,j );
1899  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1900  }
1901  if( jpos < jend ) {
1902  C(i,jpos) += A(i,k) * B(k,jpos);
1903  }
1904  }
1905  }
1906  }
1908  //**********************************************************************************************
1909 
1910  //**Default addition assignment to dense matrices (general/diagonal)****************************
1924  template< typename MT3 // Type of the left-hand side target matrix
1925  , typename MT4 // Type of the left-hand side matrix operand
1926  , typename MT5 > // Type of the right-hand side matrix operand
1927  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1928  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1929  {
1931 
1932  const size_t M( A.rows() );
1933  const size_t N( B.columns() );
1934 
1935  for( size_t i=0UL; i<M; ++i )
1936  {
1937  const size_t jbegin( ( IsUpper_v<MT4> )
1938  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1939  :( 0UL ) );
1940  const size_t jend( ( IsLower_v<MT4> )
1941  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
1942  :( N ) );
1943  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1944 
1945  const size_t jnum( jend - jbegin );
1946  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1947 
1948  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1949  C(i,j ) += A(i,j ) * B(j ,j );
1950  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
1951  }
1952  if( jpos < jend ) {
1953  C(i,jpos) += A(i,jpos) * B(jpos,jpos);
1954  }
1955  }
1956  }
1958  //**********************************************************************************************
1959 
1960  //**Default addition assignment to dense matrices (diagonal/general)****************************
1974  template< typename MT3 // Type of the left-hand side target matrix
1975  , typename MT4 // Type of the left-hand side matrix operand
1976  , typename MT5 > // Type of the right-hand side matrix operand
1977  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1978  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1979  {
1981 
1982  const size_t M( A.rows() );
1983  const size_t N( B.columns() );
1984 
1985  for( size_t i=0UL; i<M; ++i )
1986  {
1987  const size_t jbegin( ( IsUpper_v<MT5> )
1988  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
1989  :( 0UL ) );
1990  const size_t jend( ( IsLower_v<MT5> )
1991  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
1992  :( N ) );
1993  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1994 
1995  const size_t jnum( jend - jbegin );
1996  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1997 
1998  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1999  C(i,j ) += A(i,i) * B(i,j );
2000  C(i,j+1UL) += A(i,i) * B(i,j+1UL);
2001  }
2002  if( jpos < jend ) {
2003  C(i,jpos) += A(i,i) * B(i,jpos);
2004  }
2005  }
2006  }
2008  //**********************************************************************************************
2009 
2010  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
2024  template< typename MT3 // Type of the left-hand side target matrix
2025  , typename MT4 // Type of the left-hand side matrix operand
2026  , typename MT5 > // Type of the right-hand side matrix operand
2027  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2028  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2029  {
2031 
2032  for( size_t i=0UL; i<A.rows(); ++i ) {
2033  C(i,i) += A(i,i) * B(i,i);
2034  }
2035  }
2037  //**********************************************************************************************
2038 
2039  //**Default addition assignment to dense matrices (small matrices)******************************
2053  template< typename MT3 // Type of the left-hand side target matrix
2054  , typename MT4 // Type of the left-hand side matrix operand
2055  , typename MT5 > // Type of the right-hand side matrix operand
2056  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2057  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2058  {
2059  selectDefaultAddAssignKernel( C, A, B );
2060  }
2062  //**********************************************************************************************
2063 
2064  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
2079  template< typename MT3 // Type of the left-hand side target matrix
2080  , typename MT4 // Type of the left-hand side matrix operand
2081  , typename MT5 > // Type of the right-hand side matrix operand
2082  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2083  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2084  {
2085  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
2086 
2087  const size_t M( A.rows() );
2088  const size_t N( B.columns() );
2089  const size_t K( A.columns() );
2090 
2091  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2092 
2093  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2094  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2095 
2096  size_t j( 0UL );
2097 
2098  if( IsIntegral_v<ElementType> )
2099  {
2100  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2101  for( size_t i=0UL; i<M; ++i )
2102  {
2103  const size_t kbegin( ( IsUpper_v<MT4> )
2104  ?( ( IsLower_v<MT5> )
2105  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2106  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2107  :( IsLower_v<MT5> ? j : 0UL ) );
2108  const size_t kend( ( IsLower_v<MT4> )
2109  ?( ( IsUpper_v<MT5> )
2110  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
2111  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2112  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
2113 
2114  SIMDType xmm1( C.load(i,j ) );
2115  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2116  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2117  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
2118  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
2119  SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
2120  SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
2121  SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
2122 
2123  for( size_t k=kbegin; k<kend; ++k ) {
2124  const SIMDType a1( set( A(i,k) ) );
2125  xmm1 += a1 * B.load(k,j );
2126  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2127  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2128  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2129  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2130  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
2131  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
2132  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
2133  }
2134 
2135  C.store( i, j , xmm1 );
2136  C.store( i, j+SIMDSIZE , xmm2 );
2137  C.store( i, j+SIMDSIZE*2UL, xmm3 );
2138  C.store( i, j+SIMDSIZE*3UL, xmm4 );
2139  C.store( i, j+SIMDSIZE*4UL, xmm5 );
2140  C.store( i, j+SIMDSIZE*5UL, xmm6 );
2141  C.store( i, j+SIMDSIZE*6UL, xmm7 );
2142  C.store( i, j+SIMDSIZE*7UL, xmm8 );
2143  }
2144  }
2145  }
2146 
2147  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
2148  {
2149  size_t i( 0UL );
2150 
2151  for( ; (i+2UL) <= M; i+=2UL )
2152  {
2153  const size_t kbegin( ( IsUpper_v<MT4> )
2154  ?( ( IsLower_v<MT5> )
2155  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2156  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2157  :( IsLower_v<MT5> ? j : 0UL ) );
2158  const size_t kend( ( IsLower_v<MT4> )
2159  ?( ( IsUpper_v<MT5> )
2160  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
2161  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2162  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
2163 
2164  SIMDType xmm1 ( C.load(i ,j ) );
2165  SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
2166  SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
2167  SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
2168  SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
2169  SIMDType xmm6 ( C.load(i+1UL,j ) );
2170  SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
2171  SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
2172  SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
2173  SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
2174 
2175  for( size_t k=kbegin; k<kend; ++k ) {
2176  const SIMDType a1( set( A(i ,k) ) );
2177  const SIMDType a2( set( A(i+1UL,k) ) );
2178  const SIMDType b1( B.load(k,j ) );
2179  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2180  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2181  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2182  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
2183  xmm1 += a1 * b1;
2184  xmm2 += a1 * b2;
2185  xmm3 += a1 * b3;
2186  xmm4 += a1 * b4;
2187  xmm5 += a1 * b5;
2188  xmm6 += a2 * b1;
2189  xmm7 += a2 * b2;
2190  xmm8 += a2 * b3;
2191  xmm9 += a2 * b4;
2192  xmm10 += a2 * b5;
2193  }
2194 
2195  C.store( i , j , xmm1 );
2196  C.store( i , j+SIMDSIZE , xmm2 );
2197  C.store( i , j+SIMDSIZE*2UL, xmm3 );
2198  C.store( i , j+SIMDSIZE*3UL, xmm4 );
2199  C.store( i , j+SIMDSIZE*4UL, xmm5 );
2200  C.store( i+1UL, j , xmm6 );
2201  C.store( i+1UL, j+SIMDSIZE , xmm7 );
2202  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
2203  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
2204  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
2205  }
2206 
2207  if( i < M )
2208  {
2209  const size_t kbegin( ( IsUpper_v<MT4> )
2210  ?( ( IsLower_v<MT5> )
2211  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2212  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2213  :( IsLower_v<MT5> ? j : 0UL ) );
2214  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
2215 
2216  SIMDType xmm1( C.load(i,j ) );
2217  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2218  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2219  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
2220  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
2221 
2222  for( size_t k=kbegin; k<kend; ++k ) {
2223  const SIMDType a1( set( A(i,k) ) );
2224  xmm1 += a1 * B.load(k,j );
2225  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2226  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2227  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2228  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2229  }
2230 
2231  C.store( i, j , xmm1 );
2232  C.store( i, j+SIMDSIZE , xmm2 );
2233  C.store( i, j+SIMDSIZE*2UL, xmm3 );
2234  C.store( i, j+SIMDSIZE*3UL, xmm4 );
2235  C.store( i, j+SIMDSIZE*4UL, xmm5 );
2236  }
2237  }
2238 
2239  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2240  {
2241  size_t i( 0UL );
2242 
2243  for( ; (i+2UL) <= M; i+=2UL )
2244  {
2245  const size_t kbegin( ( IsUpper_v<MT4> )
2246  ?( ( IsLower_v<MT5> )
2247  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2248  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2249  :( IsLower_v<MT5> ? j : 0UL ) );
2250  const size_t kend( ( IsLower_v<MT4> )
2251  ?( ( IsUpper_v<MT5> )
2252  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
2253  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2254  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
2255 
2256  SIMDType xmm1( C.load(i ,j ) );
2257  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
2258  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
2259  SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
2260  SIMDType xmm5( C.load(i+1UL,j ) );
2261  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
2262  SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
2263  SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
2264 
2265  for( size_t k=kbegin; k<kend; ++k ) {
2266  const SIMDType a1( set( A(i ,k) ) );
2267  const SIMDType a2( set( A(i+1UL,k) ) );
2268  const SIMDType b1( B.load(k,j ) );
2269  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2270  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2271  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2272  xmm1 += a1 * b1;
2273  xmm2 += a1 * b2;
2274  xmm3 += a1 * b3;
2275  xmm4 += a1 * b4;
2276  xmm5 += a2 * b1;
2277  xmm6 += a2 * b2;
2278  xmm7 += a2 * b3;
2279  xmm8 += a2 * b4;
2280  }
2281 
2282  C.store( i , j , xmm1 );
2283  C.store( i , j+SIMDSIZE , xmm2 );
2284  C.store( i , j+SIMDSIZE*2UL, xmm3 );
2285  C.store( i , j+SIMDSIZE*3UL, xmm4 );
2286  C.store( i+1UL, j , xmm5 );
2287  C.store( i+1UL, j+SIMDSIZE , xmm6 );
2288  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
2289  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
2290  }
2291 
2292  if( i < M )
2293  {
2294  const size_t kbegin( ( IsUpper_v<MT4> )
2295  ?( ( IsLower_v<MT5> )
2296  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2297  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2298  :( IsLower_v<MT5> ? j : 0UL ) );
2299  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
2300 
2301  SIMDType xmm1( C.load(i,j ) );
2302  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2303  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2304  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
2305 
2306  for( size_t k=kbegin; k<kend; ++k ) {
2307  const SIMDType a1( set( A(i,k) ) );
2308  xmm1 += a1 * B.load(k,j );
2309  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2310  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2311  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2312  }
2313 
2314  C.store( i, j , xmm1 );
2315  C.store( i, j+SIMDSIZE , xmm2 );
2316  C.store( i, j+SIMDSIZE*2UL, xmm3 );
2317  C.store( i, j+SIMDSIZE*3UL, xmm4 );
2318  }
2319  }
2320 
2321  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2322  {
2323  size_t i( 0UL );
2324 
2325  for( ; (i+2UL) <= M; i+=2UL )
2326  {
2327  const size_t kbegin( ( IsUpper_v<MT4> )
2328  ?( ( IsLower_v<MT5> )
2329  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2330  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2331  :( IsLower_v<MT5> ? j : 0UL ) );
2332  const size_t kend( ( IsLower_v<MT4> )
2333  ?( ( IsUpper_v<MT5> )
2334  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
2335  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2336  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
2337 
2338  SIMDType xmm1( C.load(i ,j ) );
2339  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
2340  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
2341  SIMDType xmm4( C.load(i+1UL,j ) );
2342  SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
2343  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
2344 
2345  for( size_t k=kbegin; k<kend; ++k ) {
2346  const SIMDType a1( set( A(i ,k) ) );
2347  const SIMDType a2( set( A(i+1UL,k) ) );
2348  const SIMDType b1( B.load(k,j ) );
2349  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2350  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2351  xmm1 += a1 * b1;
2352  xmm2 += a1 * b2;
2353  xmm3 += a1 * b3;
2354  xmm4 += a2 * b1;
2355  xmm5 += a2 * b2;
2356  xmm6 += a2 * b3;
2357  }
2358 
2359  C.store( i , j , xmm1 );
2360  C.store( i , j+SIMDSIZE , xmm2 );
2361  C.store( i , j+SIMDSIZE*2UL, xmm3 );
2362  C.store( i+1UL, j , xmm4 );
2363  C.store( i+1UL, j+SIMDSIZE , xmm5 );
2364  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
2365  }
2366 
2367  if( i < M )
2368  {
2369  const size_t kbegin( ( IsUpper_v<MT4> )
2370  ?( ( IsLower_v<MT5> )
2371  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2372  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2373  :( IsLower_v<MT5> ? j : 0UL ) );
2374  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
2375 
2376  SIMDType xmm1( C.load(i,j ) );
2377  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
2378  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
2379 
2380  for( size_t k=kbegin; k<kend; ++k ) {
2381  const SIMDType a1( set( A(i,k) ) );
2382  xmm1 += a1 * B.load(k,j );
2383  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2384  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2385  }
2386 
2387  C.store( i, j , xmm1 );
2388  C.store( i, j+SIMDSIZE , xmm2 );
2389  C.store( i, j+SIMDSIZE*2UL, xmm3 );
2390  }
2391  }
2392 
2393  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2394  {
2395  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
2396  size_t i( LOW ? j : 0UL );
2397 
2398  for( ; (i+4UL) <= iend; i+=4UL )
2399  {
2400  const size_t kbegin( ( IsUpper_v<MT4> )
2401  ?( ( IsLower_v<MT5> )
2402  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2403  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2404  :( IsLower_v<MT5> ? j : 0UL ) );
2405  const size_t kend( ( IsLower_v<MT4> )
2406  ?( ( IsUpper_v<MT5> )
2407  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
2408  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
2409  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
2410 
2411  SIMDType xmm1( C.load(i ,j ) );
2412  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
2413  SIMDType xmm3( C.load(i+1UL,j ) );
2414  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
2415  SIMDType xmm5( C.load(i+2UL,j ) );
2416  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
2417  SIMDType xmm7( C.load(i+3UL,j ) );
2418  SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
2419 
2420  for( size_t k=kbegin; k<kend; ++k ) {
2421  const SIMDType a1( set( A(i ,k) ) );
2422  const SIMDType a2( set( A(i+1UL,k) ) );
2423  const SIMDType a3( set( A(i+2UL,k) ) );
2424  const SIMDType a4( set( A(i+3UL,k) ) );
2425  const SIMDType b1( B.load(k,j ) );
2426  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2427  xmm1 += a1 * b1;
2428  xmm2 += a1 * b2;
2429  xmm3 += a2 * b1;
2430  xmm4 += a2 * b2;
2431  xmm5 += a3 * b1;
2432  xmm6 += a3 * b2;
2433  xmm7 += a4 * b1;
2434  xmm8 += a4 * b2;
2435  }
2436 
2437  C.store( i , j , xmm1 );
2438  C.store( i , j+SIMDSIZE, xmm2 );
2439  C.store( i+1UL, j , xmm3 );
2440  C.store( i+1UL, j+SIMDSIZE, xmm4 );
2441  C.store( i+2UL, j , xmm5 );
2442  C.store( i+2UL, j+SIMDSIZE, xmm6 );
2443  C.store( i+3UL, j , xmm7 );
2444  C.store( i+3UL, j+SIMDSIZE, xmm8 );
2445  }
2446 
2447  for( ; (i+3UL) <= iend; i+=3UL )
2448  {
2449  const size_t kbegin( ( IsUpper_v<MT4> )
2450  ?( ( IsLower_v<MT5> )
2451  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2452  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2453  :( IsLower_v<MT5> ? j : 0UL ) );
2454  const size_t kend( ( IsLower_v<MT4> )
2455  ?( ( IsUpper_v<MT5> )
2456  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
2457  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
2458  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
2459 
2460  SIMDType xmm1( C.load(i ,j ) );
2461  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
2462  SIMDType xmm3( C.load(i+1UL,j ) );
2463  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
2464  SIMDType xmm5( C.load(i+2UL,j ) );
2465  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
2466 
2467  for( size_t k=kbegin; k<kend; ++k ) {
2468  const SIMDType a1( set( A(i ,k) ) );
2469  const SIMDType a2( set( A(i+1UL,k) ) );
2470  const SIMDType a3( set( A(i+2UL,k) ) );
2471  const SIMDType b1( B.load(k,j ) );
2472  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2473  xmm1 += a1 * b1;
2474  xmm2 += a1 * b2;
2475  xmm3 += a2 * b1;
2476  xmm4 += a2 * b2;
2477  xmm5 += a3 * b1;
2478  xmm6 += a3 * b2;
2479  }
2480 
2481  C.store( i , j , xmm1 );
2482  C.store( i , j+SIMDSIZE, xmm2 );
2483  C.store( i+1UL, j , xmm3 );
2484  C.store( i+1UL, j+SIMDSIZE, xmm4 );
2485  C.store( i+2UL, j , xmm5 );
2486  C.store( i+2UL, j+SIMDSIZE, xmm6 );
2487  }
2488 
2489  for( ; (i+2UL) <= iend; i+=2UL )
2490  {
2491  const size_t kbegin( ( IsUpper_v<MT4> )
2492  ?( ( IsLower_v<MT5> )
2493  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2494  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2495  :( IsLower_v<MT5> ? j : 0UL ) );
2496  const size_t kend( ( IsLower_v<MT4> )
2497  ?( ( IsUpper_v<MT5> )
2498  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
2499  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2500  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
2501 
2502  SIMDType xmm1( C.load(i ,j ) );
2503  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
2504  SIMDType xmm3( C.load(i+1UL,j ) );
2505  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
2506  SIMDType xmm5, xmm6, xmm7, xmm8;
2507  size_t k( kbegin );
2508 
2509  for( ; (k+2UL) <= kend; k+=2UL ) {
2510  const SIMDType a1( set( A(i ,k ) ) );
2511  const SIMDType a2( set( A(i+1UL,k ) ) );
2512  const SIMDType a3( set( A(i ,k+1UL) ) );
2513  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
2514  const SIMDType b1( B.load(k ,j ) );
2515  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
2516  const SIMDType b3( B.load(k+1UL,j ) );
2517  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
2518  xmm1 += a1 * b1;
2519  xmm2 += a1 * b2;
2520  xmm3 += a2 * b1;
2521  xmm4 += a2 * b2;
2522  xmm5 += a3 * b3;
2523  xmm6 += a3 * b4;
2524  xmm7 += a4 * b3;
2525  xmm8 += a4 * b4;
2526  }
2527 
2528  for( ; k<kend; ++k ) {
2529  const SIMDType a1( set( A(i ,k) ) );
2530  const SIMDType a2( set( A(i+1UL,k) ) );
2531  const SIMDType b1( B.load(k,j ) );
2532  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2533  xmm1 += a1 * b1;
2534  xmm2 += a1 * b2;
2535  xmm3 += a2 * b1;
2536  xmm4 += a2 * b2;
2537  }
2538 
2539  C.store( i , j , xmm1+xmm5 );
2540  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
2541  C.store( i+1UL, j , xmm3+xmm7 );
2542  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
2543  }
2544 
2545  if( i < iend )
2546  {
2547  const size_t kbegin( ( IsUpper_v<MT4> )
2548  ?( ( IsLower_v<MT5> )
2549  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2550  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2551  :( IsLower_v<MT5> ? j : 0UL ) );
2552  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
2553 
2554  SIMDType xmm1( C.load(i,j ) );
2555  SIMDType xmm2( C.load(i,j+SIMDSIZE) );
2556  SIMDType xmm3, xmm4;
2557  size_t k( kbegin );
2558 
2559  for( ; (k+2UL) <= kend; k+=2UL ) {
2560  const SIMDType a1( set( A(i,k ) ) );
2561  const SIMDType a2( set( A(i,k+1UL) ) );
2562  xmm1 += a1 * B.load(k ,j );
2563  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
2564  xmm3 += a2 * B.load(k+1UL,j );
2565  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
2566  }
2567 
2568  for( ; k<kend; ++k ) {
2569  const SIMDType a1( set( A(i,k) ) );
2570  xmm1 += a1 * B.load(k,j );
2571  xmm2 += a1 * B.load(k,j+SIMDSIZE);
2572  }
2573 
2574  C.store( i, j , xmm1+xmm3 );
2575  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
2576  }
2577  }
2578 
2579  for( ; j<jpos; j+=SIMDSIZE )
2580  {
2581  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
2582  size_t i( LOW ? j : 0UL );
2583 
2584  for( ; (i+4UL) <= iend; i+=4UL )
2585  {
2586  const size_t kbegin( ( IsUpper_v<MT4> )
2587  ?( ( IsLower_v<MT5> )
2588  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2589  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2590  :( IsLower_v<MT5> ? j : 0UL ) );
2591  const size_t kend( ( IsLower_v<MT4> )
2592  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
2593  :( K ) );
2594 
2595  SIMDType xmm1( C.load(i ,j) );
2596  SIMDType xmm2( C.load(i+1UL,j) );
2597  SIMDType xmm3( C.load(i+2UL,j) );
2598  SIMDType xmm4( C.load(i+3UL,j) );
2599  SIMDType xmm5, xmm6, xmm7, xmm8;
2600  size_t k( kbegin );
2601 
2602  for( ; (k+2UL) <= kend; k+=2UL ) {
2603  const SIMDType b1( B.load(k ,j) );
2604  const SIMDType b2( B.load(k+1UL,j) );
2605  xmm1 += set( A(i ,k ) ) * b1;
2606  xmm2 += set( A(i+1UL,k ) ) * b1;
2607  xmm3 += set( A(i+2UL,k ) ) * b1;
2608  xmm4 += set( A(i+3UL,k ) ) * b1;
2609  xmm5 += set( A(i ,k+1UL) ) * b2;
2610  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
2611  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
2612  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
2613  }
2614 
2615  for( ; k<kend; ++k ) {
2616  const SIMDType b1( B.load(k,j) );
2617  xmm1 += set( A(i ,k) ) * b1;
2618  xmm2 += set( A(i+1UL,k) ) * b1;
2619  xmm3 += set( A(i+2UL,k) ) * b1;
2620  xmm4 += set( A(i+3UL,k) ) * b1;
2621  }
2622 
2623  C.store( i , j, xmm1+xmm5 );
2624  C.store( i+1UL, j, xmm2+xmm6 );
2625  C.store( i+2UL, j, xmm3+xmm7 );
2626  C.store( i+3UL, j, xmm4+xmm8 );
2627  }
2628 
2629  for( ; (i+3UL) <= iend; i+=3UL )
2630  {
2631  const size_t kbegin( ( IsUpper_v<MT4> )
2632  ?( ( IsLower_v<MT5> )
2633  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2634  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2635  :( IsLower_v<MT5> ? j : 0UL ) );
2636  const size_t kend( ( IsLower_v<MT4> )
2637  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
2638  :( K ) );
2639 
2640  SIMDType xmm1( C.load(i ,j) );
2641  SIMDType xmm2( C.load(i+1UL,j) );
2642  SIMDType xmm3( C.load(i+2UL,j) );
2643  SIMDType xmm4, xmm5, xmm6;
2644  size_t k( kbegin );
2645 
2646  for( ; (k+2UL) <= kend; k+=2UL ) {
2647  const SIMDType b1( B.load(k ,j) );
2648  const SIMDType b2( B.load(k+1UL,j) );
2649  xmm1 += set( A(i ,k ) ) * b1;
2650  xmm2 += set( A(i+1UL,k ) ) * b1;
2651  xmm3 += set( A(i+2UL,k ) ) * b1;
2652  xmm4 += set( A(i ,k+1UL) ) * b2;
2653  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
2654  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
2655  }
2656 
2657  for( ; k<kend; ++k ) {
2658  const SIMDType b1( B.load(k,j) );
2659  xmm1 += set( A(i ,k) ) * b1;
2660  xmm2 += set( A(i+1UL,k) ) * b1;
2661  xmm3 += set( A(i+2UL,k) ) * b1;
2662  }
2663 
2664  C.store( i , j, xmm1+xmm4 );
2665  C.store( i+1UL, j, xmm2+xmm5 );
2666  C.store( i+2UL, j, xmm3+xmm6 );
2667  }
2668 
2669  for( ; (i+2UL) <= iend; i+=2UL )
2670  {
2671  const size_t kbegin( ( IsUpper_v<MT4> )
2672  ?( ( IsLower_v<MT5> )
2673  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2674  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2675  :( IsLower_v<MT5> ? j : 0UL ) );
2676  const size_t kend( ( IsLower_v<MT4> )
2677  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2678  :( K ) );
2679 
2680  SIMDType xmm1( C.load(i ,j) );
2681  SIMDType xmm2( C.load(i+1UL,j) );
2682  SIMDType xmm3, xmm4;
2683  size_t k( kbegin );
2684 
2685  for( ; (k+2UL) <= kend; k+=2UL ) {
2686  const SIMDType b1( B.load(k ,j) );
2687  const SIMDType b2( B.load(k+1UL,j) );
2688  xmm1 += set( A(i ,k ) ) * b1;
2689  xmm2 += set( A(i+1UL,k ) ) * b1;
2690  xmm3 += set( A(i ,k+1UL) ) * b2;
2691  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
2692  }
2693 
2694  for( ; k<kend; ++k ) {
2695  const SIMDType b1( B.load(k,j) );
2696  xmm1 += set( A(i ,k) ) * b1;
2697  xmm2 += set( A(i+1UL,k) ) * b1;
2698  }
2699 
2700  C.store( i , j, xmm1+xmm3 );
2701  C.store( i+1UL, j, xmm2+xmm4 );
2702  }
2703 
2704  if( i < iend )
2705  {
2706  const size_t kbegin( ( IsUpper_v<MT4> )
2707  ?( ( IsLower_v<MT5> )
2708  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2709  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2710  :( IsLower_v<MT5> ? j : 0UL ) );
2711 
2712  SIMDType xmm1( C.load(i,j) );
2713  SIMDType xmm2;
2714  size_t k( kbegin );
2715 
2716  for( ; (k+2UL) <= K; k+=2UL ) {
2717  xmm1 += set( A(i,k ) ) * B.load(k ,j);
2718  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
2719  }
2720 
2721  for( ; k<K; ++k ) {
2722  xmm1 += set( A(i,k) ) * B.load(k,j);
2723  }
2724 
2725  C.store( i, j, xmm1+xmm2 );
2726  }
2727  }
2728 
2729  for( ; remainder && j<N; ++j )
2730  {
2731  const size_t iend( UPP ? j+1UL : M );
2732  size_t i( LOW ? j : 0UL );
2733 
2734  for( ; (i+2UL) <= iend; i+=2UL )
2735  {
2736  const size_t kbegin( ( IsUpper_v<MT4> )
2737  ?( ( IsLower_v<MT5> )
2738  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2739  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2740  :( IsLower_v<MT5> ? j : 0UL ) );
2741  const size_t kend( ( IsLower_v<MT4> )
2742  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2743  :( K ) );
2744 
2745  ElementType value1( C(i ,j) );
2746  ElementType value2( C(i+1UL,j) );;
2747 
2748  for( size_t k=kbegin; k<kend; ++k ) {
2749  value1 += A(i ,k) * B(k,j);
2750  value2 += A(i+1UL,k) * B(k,j);
2751  }
2752 
2753  C(i ,j) = value1;
2754  C(i+1UL,j) = value2;
2755  }
2756 
2757  if( i < iend )
2758  {
2759  const size_t kbegin( ( IsUpper_v<MT4> )
2760  ?( ( IsLower_v<MT5> )
2761  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2762  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2763  :( IsLower_v<MT5> ? j : 0UL ) );
2764 
2765  ElementType value( C(i,j) );
2766 
2767  for( size_t k=kbegin; k<K; ++k ) {
2768  value += A(i,k) * B(k,j);
2769  }
2770 
2771  C(i,j) = value;
2772  }
2773  }
2774  }
2776  //**********************************************************************************************
2777 
2778  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2793  template< typename MT3 // Type of the left-hand side target matrix
2794  , typename MT4 // Type of the left-hand side matrix operand
2795  , typename MT5 > // Type of the right-hand side matrix operand
2796  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2797  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2798  {
2803 
2804  const ForwardFunctor fwd;
2805 
2806  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
2807  const OppositeType_t<MT4> tmp( serial( A ) );
2808  addAssign( C, fwd( tmp * B ) );
2809  }
2810  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
2811  const OppositeType_t<MT5> tmp( serial( B ) );
2812  addAssign( C, fwd( A * tmp ) );
2813  }
2814  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2815  const OppositeType_t<MT4> tmp( serial( A ) );
2816  addAssign( C, fwd( tmp * B ) );
2817  }
2818  else {
2819  const OppositeType_t<MT5> tmp( serial( B ) );
2820  addAssign( C, fwd( A * tmp ) );
2821  }
2822  }
2824  //**********************************************************************************************
2825 
2826  //**Default addition assignment to dense matrices (large matrices)******************************
2840  template< typename MT3 // Type of the left-hand side target matrix
2841  , typename MT4 // Type of the left-hand side matrix operand
2842  , typename MT5 > // Type of the right-hand side matrix operand
2843  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2844  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2845  {
2846  selectDefaultAddAssignKernel( C, A, B );
2847  }
2849  //**********************************************************************************************
2850 
2851  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
2866  template< typename MT3 // Type of the left-hand side target matrix
2867  , typename MT4 // Type of the left-hand side matrix operand
2868  , typename MT5 > // Type of the right-hand side matrix operand
2869  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2870  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2871  {
2872  if( LOW )
2873  lmmm( C, A, B, ElementType(1), ElementType(1) );
2874  else if( UPP )
2875  ummm( C, A, B, ElementType(1), ElementType(1) );
2876  else
2877  mmm( C, A, B, ElementType(1), ElementType(1) );
2878  }
2880  //**********************************************************************************************
2881 
2882  //**BLAS-based addition assignment to dense matrices (default)**********************************
2896  template< typename MT3 // Type of the left-hand side target matrix
2897  , typename MT4 // Type of the left-hand side matrix operand
2898  , typename MT5 > // Type of the right-hand side matrix operand
2899  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2900  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2901  {
2902  selectLargeAddAssignKernel( C, A, B );
2903  }
2905  //**********************************************************************************************
2906 
2907  //**BLAS-based addition assignment to dense matrices********************************************
2908 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2909 
2922  template< typename MT3 // Type of the left-hand side target matrix
2923  , typename MT4 // Type of the left-hand side matrix operand
2924  , typename MT5 > // Type of the right-hand side matrix operand
2925  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2926  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2927  {
2928  using ET = ElementType_t<MT3>;
2929 
2930  if( IsTriangular_v<MT4> ) {
2931  ResultType_t<MT3> tmp( serial( B ) );
2932  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2933  addAssign( C, tmp );
2934  }
2935  else if( IsTriangular_v<MT5> ) {
2936  ResultType_t<MT3> tmp( serial( A ) );
2937  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2938  addAssign( C, tmp );
2939  }
2940  else {
2941  gemm( C, A, B, ET(1), ET(1) );
2942  }
2943  }
2945 #endif
2946  //**********************************************************************************************
2947 
2948  //**Restructuring addition assignment to column-major matrices**********************************
2963  template< typename MT > // Type of the target matrix
2964  friend inline auto addAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
2965  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2966  {
2968 
2970 
2971  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2972  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2973 
2974  const ForwardFunctor fwd;
2975 
2976  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
2977  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
2978  else if( IsSymmetric_v<MT1> )
2979  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
2980  else
2981  addAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
2982  }
2984  //**********************************************************************************************
2985 
2986  //**Addition assignment to sparse matrices******************************************************
2987  // No special implementation for the addition assignment to sparse matrices.
2988  //**********************************************************************************************
2989 
2990  //**Subtraction assignment to dense matrices****************************************************
3003  template< typename MT // Type of the target dense matrix
3004  , bool SO > // Storage order of the target dense matrix
3005  friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
3006  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3007  {
3009 
3010  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3011  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3012 
3013  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3014  return;
3015  }
3016 
3017  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3018  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3019 
3020  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3021  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3022  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3023  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3024  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3025  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3026 
3027  DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3028  }
3030  //**********************************************************************************************
3031 
3032  //**Subtraction assignment to dense matrices (kernel selection)*********************************
3043  template< typename MT3 // Type of the left-hand side target matrix
3044  , typename MT4 // Type of the left-hand side matrix operand
3045  , typename MT5 > // Type of the right-hand side matrix operand
3046  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3047  {
3048  if( ( IsDiagonal_v<MT5> ) ||
3049  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
3050  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
3051  selectSmallSubAssignKernel( C, A, B );
3052  else
3053  selectBlasSubAssignKernel( C, A, B );
3054  }
3056  //**********************************************************************************************
3057 
3058  //**Default subtraction assignment to dense matrices (general/general)**************************
3072  template< typename MT3 // Type of the left-hand side target matrix
3073  , typename MT4 // Type of the left-hand side matrix operand
3074  , typename MT5 > // Type of the right-hand side matrix operand
3075  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3076  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3077  {
3078  const size_t M( A.rows() );
3079  const size_t N( B.columns() );
3080  const size_t K( A.columns() );
3081 
3082  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3083 
3084  for( size_t i=0UL; i<M; ++i )
3085  {
3086  const size_t kbegin( ( IsUpper_v<MT4> )
3087  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3088  :( 0UL ) );
3089  const size_t kend( ( IsLower_v<MT4> )
3090  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3091  :( K ) );
3092  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3093 
3094  for( size_t k=kbegin; k<kend; ++k )
3095  {
3096  const size_t jbegin( ( IsUpper_v<MT5> )
3097  ?( ( IsStrictlyUpper_v<MT5> )
3098  ?( UPP ? max(i,k+1UL) : k+1UL )
3099  :( UPP ? max(i,k) : k ) )
3100  :( UPP ? i : 0UL ) );
3101  const size_t jend( ( IsLower_v<MT5> )
3102  ?( ( IsStrictlyLower_v<MT5> )
3103  ?( LOW ? min(i+1UL,k) : k )
3104  :( LOW ? min(i,k)+1UL : k+1UL ) )
3105  :( LOW ? i+1UL : N ) );
3106 
3107  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
3108  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3109 
3110  const size_t jnum( jend - jbegin );
3111  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3112 
3113  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3114  C(i,j ) -= A(i,k) * B(k,j );
3115  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3116  }
3117  if( jpos < jend ) {
3118  C(i,jpos) -= A(i,k) * B(k,jpos);
3119  }
3120  }
3121  }
3122  }
3124  //**********************************************************************************************
3125 
3126  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
3140  template< typename MT3 // Type of the left-hand side target matrix
3141  , typename MT4 // Type of the left-hand side matrix operand
3142  , typename MT5 > // Type of the right-hand side matrix operand
3143  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3144  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3145  {
3147 
3148  const size_t M( A.rows() );
3149  const size_t N( B.columns() );
3150 
3151  for( size_t i=0UL; i<M; ++i )
3152  {
3153  const size_t jbegin( ( IsUpper_v<MT4> )
3154  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3155  :( 0UL ) );
3156  const size_t jend( ( IsLower_v<MT4> )
3157  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3158  :( N ) );
3159  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3160 
3161  const size_t jnum( jend - jbegin );
3162  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3163 
3164  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3165  C(i,j ) -= A(i,j ) * B(j ,j );
3166  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
3167  }
3168  if( jpos < jend ) {
3169  C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
3170  }
3171  }
3172  }
3174  //**********************************************************************************************
3175 
3176  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
3190  template< typename MT3 // Type of the left-hand side target matrix
3191  , typename MT4 // Type of the left-hand side matrix operand
3192  , typename MT5 > // Type of the right-hand side matrix operand
3193  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3194  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3195  {
3197 
3198  const size_t M( A.rows() );
3199  const size_t N( B.columns() );
3200 
3201  for( size_t i=0UL; i<M; ++i )
3202  {
3203  const size_t jbegin( ( IsUpper_v<MT5> )
3204  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
3205  :( 0UL ) );
3206  const size_t jend( ( IsLower_v<MT5> )
3207  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
3208  :( N ) );
3209  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3210 
3211  const size_t jnum( jend - jbegin );
3212  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3213 
3214  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3215  C(i,j ) -= A(i,i) * B(i,j );
3216  C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
3217  }
3218  if( jpos < jend ) {
3219  C(i,jpos) -= A(i,i) * B(i,jpos);
3220  }
3221  }
3222  }
3224  //**********************************************************************************************
3225 
3226  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
3240  template< typename MT3 // Type of the left-hand side target matrix
3241  , typename MT4 // Type of the left-hand side matrix operand
3242  , typename MT5 > // Type of the right-hand side matrix operand
3243  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3244  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3245  {
3247 
3248  for( size_t i=0UL; i<A.rows(); ++i ) {
3249  C(i,i) -= A(i,i) * B(i,i);
3250  }
3251  }
3253  //**********************************************************************************************
3254 
3255  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix subtraction assignment kernel for the non-vectorized case.
//
// Participates in overload resolution only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5>
// is false (DisableIf_t) and does nothing but forward all three arguments unchanged to
// selectDefaultSubAssignKernel().
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
3269  template< typename MT3 // Type of the left-hand side target matrix
3270  , typename MT4 // Type of the left-hand side matrix operand
3271  , typename MT5 > // Type of the right-hand side matrix operand
3272  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3273  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3274  {
3275  selectDefaultSubAssignKernel( C, A, B );
3276  }
3278  //**********************************************************************************************
3279 
3280  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Vectorized small-matrix kernel computing C -= A * B for a row-major target C.
// Enabled when MT3 is row-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds.
//
// C: target matrix; read with C.load() and written back with C.store().
// A: left-hand side operand; read element-wise and broadcast with set().
// B: right-hand side operand; read with B.load().
//
// The columns of C are walked in tiers of decreasing width: 8, 5, 4, 3, 2 and finally 1
// SIMD vector(s) per step, followed by a scalar loop for the columns from jpos up to N.
// Each tier loads the affected part of C into registers, subtracts A(i,k) * B(k,j..) for
// k in [kbegin,kend) and stores the registers back. The shared column index j is carried
// from one tier to the next, so each tier continues where the previous one stopped.
3295  template< typename MT3 // Type of the left-hand side target matrix
3296  , typename MT4 // Type of the left-hand side matrix operand
3297  , typename MT5 > // Type of the right-hand side matrix operand
3298  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3299  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3300  {
      // A scalar remainder loop is needed unless both C and B are padded
3301  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3302 
3303  const size_t M( A.rows() );
3304  const size_t N( B.columns() );
3305  const size_t K( A.columns() );
3306 
3307  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3308 
      // End of the vectorized column range: N rounded down to a multiple of SIMDSIZE when a
      // remainder loop is required (see the assertion below), otherwise all of N
3309  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3310  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3311 
3312  size_t j( 0UL );
3313 
      // Tier 1: eight SIMD vectors per row, one row at a time. Only used for integral
      // element types and only when neither LOW nor UPP is set.
3314  if( IsIntegral_v<ElementType> )
3315  {
3316  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3317  for( size_t i=0UL; i<M; ++i )
3318  {
      // [kbegin,kend) is the k-range that is accumulated for this block. It is narrowed
      // according to the IsUpper_v/IsLower_v (and strict) traits of MT4 and MT5; the same
      // pattern is repeated in every tier below with the block's row/column extent.
3319  const size_t kbegin( ( IsUpper_v<MT4> )
3320  ?( ( IsLower_v<MT5> )
3321  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3322  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3323  :( IsLower_v<MT5> ? j : 0UL ) );
3324  const size_t kend( ( IsLower_v<MT4> )
3325  ?( ( IsUpper_v<MT5> )
3326  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
3327  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3328  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
3329 
3330  SIMDType xmm1( C.load(i,j ) );
3331  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3332  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3333  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3334  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3335  SIMDType xmm6( C.load(i,j+SIMDSIZE*5UL) );
3336  SIMDType xmm7( C.load(i,j+SIMDSIZE*6UL) );
3337  SIMDType xmm8( C.load(i,j+SIMDSIZE*7UL) );
3338 
3339  for( size_t k=kbegin; k<kend; ++k ) {
3340  const SIMDType a1( set( A(i,k) ) );
3341  xmm1 -= a1 * B.load(k,j );
3342  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3343  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3344  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3345  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3346  xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
3347  xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
3348  xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
3349  }
3350 
3351  C.store( i, j , xmm1 );
3352  C.store( i, j+SIMDSIZE , xmm2 );
3353  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3354  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3355  C.store( i, j+SIMDSIZE*4UL, xmm5 );
3356  C.store( i, j+SIMDSIZE*5UL, xmm6 );
3357  C.store( i, j+SIMDSIZE*6UL, xmm7 );
3358  C.store( i, j+SIMDSIZE*7UL, xmm8 );
3359  }
3360  }
3361  }
3362 
      // Tier 2: five SIMD vectors per row, two rows at a time (only when neither LOW nor
      // UPP is set)
3363  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3364  {
3365  size_t i( 0UL );
3366 
3367  for( ; (i+2UL) <= M; i+=2UL )
3368  {
3369  const size_t kbegin( ( IsUpper_v<MT4> )
3370  ?( ( IsLower_v<MT5> )
3371  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3372  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3373  :( IsLower_v<MT5> ? j : 0UL ) );
3374  const size_t kend( ( IsLower_v<MT4> )
3375  ?( ( IsUpper_v<MT5> )
3376  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
3377  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3378  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
3379 
3380  SIMDType xmm1 ( C.load(i ,j ) );
3381  SIMDType xmm2 ( C.load(i ,j+SIMDSIZE ) );
3382  SIMDType xmm3 ( C.load(i ,j+SIMDSIZE*2UL) );
3383  SIMDType xmm4 ( C.load(i ,j+SIMDSIZE*3UL) );
3384  SIMDType xmm5 ( C.load(i ,j+SIMDSIZE*4UL) );
3385  SIMDType xmm6 ( C.load(i+1UL,j ) );
3386  SIMDType xmm7 ( C.load(i+1UL,j+SIMDSIZE ) );
3387  SIMDType xmm8 ( C.load(i+1UL,j+SIMDSIZE*2UL) );
3388  SIMDType xmm9 ( C.load(i+1UL,j+SIMDSIZE*3UL) );
3389  SIMDType xmm10( C.load(i+1UL,j+SIMDSIZE*4UL) );
3390 
3391  for( size_t k=kbegin; k<kend; ++k ) {
3392  const SIMDType a1( set( A(i ,k) ) );
3393  const SIMDType a2( set( A(i+1UL,k) ) );
3394  const SIMDType b1( B.load(k,j ) );
3395  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3396  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3397  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3398  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3399  xmm1 -= a1 * b1;
3400  xmm2 -= a1 * b2;
3401  xmm3 -= a1 * b3;
3402  xmm4 -= a1 * b4;
3403  xmm5 -= a1 * b5;
3404  xmm6 -= a2 * b1;
3405  xmm7 -= a2 * b2;
3406  xmm8 -= a2 * b3;
3407  xmm9 -= a2 * b4;
3408  xmm10 -= a2 * b5;
3409  }
3410 
3411  C.store( i , j , xmm1 );
3412  C.store( i , j+SIMDSIZE , xmm2 );
3413  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3414  C.store( i , j+SIMDSIZE*3UL, xmm4 );
3415  C.store( i , j+SIMDSIZE*4UL, xmm5 );
3416  C.store( i+1UL, j , xmm6 );
3417  C.store( i+1UL, j+SIMDSIZE , xmm7 );
3418  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3419  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3420  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
3421  }
3422 
      // Single leftover row when M is odd
3423  if( i < M )
3424  {
3425  const size_t kbegin( ( IsUpper_v<MT4> )
3426  ?( ( IsLower_v<MT5> )
3427  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3428  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3429  :( IsLower_v<MT5> ? j : 0UL ) );
3430  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
3431 
3432  SIMDType xmm1( C.load(i,j ) );
3433  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3434  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3435  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3436  SIMDType xmm5( C.load(i,j+SIMDSIZE*4UL) );
3437 
3438  for( size_t k=kbegin; k<kend; ++k ) {
3439  const SIMDType a1( set( A(i,k) ) );
3440  xmm1 -= a1 * B.load(k,j );
3441  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3442  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3443  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3444  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3445  }
3446 
3447  C.store( i, j , xmm1 );
3448  C.store( i, j+SIMDSIZE , xmm2 );
3449  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3450  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3451  C.store( i, j+SIMDSIZE*4UL, xmm5 );
3452  }
3453  }
3454 
      // Tier 3: four SIMD vectors per row, two rows at a time (only when neither LOW nor
      // UPP is set)
3455  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3456  {
3457  size_t i( 0UL );
3458 
3459  for( ; (i+2UL) <= M; i+=2UL )
3460  {
3461  const size_t kbegin( ( IsUpper_v<MT4> )
3462  ?( ( IsLower_v<MT5> )
3463  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3464  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3465  :( IsLower_v<MT5> ? j : 0UL ) );
3466  const size_t kend( ( IsLower_v<MT4> )
3467  ?( ( IsUpper_v<MT5> )
3468  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3469  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3470  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
3471 
3472  SIMDType xmm1( C.load(i ,j ) );
3473  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3474  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3475  SIMDType xmm4( C.load(i ,j+SIMDSIZE*3UL) );
3476  SIMDType xmm5( C.load(i+1UL,j ) );
3477  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE ) );
3478  SIMDType xmm7( C.load(i+1UL,j+SIMDSIZE*2UL) );
3479  SIMDType xmm8( C.load(i+1UL,j+SIMDSIZE*3UL) );
3480 
3481  for( size_t k=kbegin; k<kend; ++k ) {
3482  const SIMDType a1( set( A(i ,k) ) );
3483  const SIMDType a2( set( A(i+1UL,k) ) );
3484  const SIMDType b1( B.load(k,j ) );
3485  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3486  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3487  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3488  xmm1 -= a1 * b1;
3489  xmm2 -= a1 * b2;
3490  xmm3 -= a1 * b3;
3491  xmm4 -= a1 * b4;
3492  xmm5 -= a2 * b1;
3493  xmm6 -= a2 * b2;
3494  xmm7 -= a2 * b3;
3495  xmm8 -= a2 * b4;
3496  }
3497 
3498  C.store( i , j , xmm1 );
3499  C.store( i , j+SIMDSIZE , xmm2 );
3500  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3501  C.store( i , j+SIMDSIZE*3UL, xmm4 );
3502  C.store( i+1UL, j , xmm5 );
3503  C.store( i+1UL, j+SIMDSIZE , xmm6 );
3504  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3505  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3506  }
3507 
      // Single leftover row when M is odd
3508  if( i < M )
3509  {
3510  const size_t kbegin( ( IsUpper_v<MT4> )
3511  ?( ( IsLower_v<MT5> )
3512  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3513  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3514  :( IsLower_v<MT5> ? j : 0UL ) );
3515  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
3516 
3517  SIMDType xmm1( C.load(i,j ) );
3518  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3519  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3520  SIMDType xmm4( C.load(i,j+SIMDSIZE*3UL) );
3521 
3522  for( size_t k=kbegin; k<kend; ++k ) {
3523  const SIMDType a1( set( A(i,k) ) );
3524  xmm1 -= a1 * B.load(k,j );
3525  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3526  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3527  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3528  }
3529 
3530  C.store( i, j , xmm1 );
3531  C.store( i, j+SIMDSIZE , xmm2 );
3532  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3533  C.store( i, j+SIMDSIZE*3UL, xmm4 );
3534  }
3535  }
3536 
      // Tier 4: three SIMD vectors per row, two rows at a time (only when neither LOW nor
      // UPP is set)
3537  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3538  {
3539  size_t i( 0UL );
3540 
3541  for( ; (i+2UL) <= M; i+=2UL )
3542  {
3543  const size_t kbegin( ( IsUpper_v<MT4> )
3544  ?( ( IsLower_v<MT5> )
3545  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3546  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3547  :( IsLower_v<MT5> ? j : 0UL ) );
3548  const size_t kend( ( IsLower_v<MT4> )
3549  ?( ( IsUpper_v<MT5> )
3550  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
3551  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3552  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
3553 
3554  SIMDType xmm1( C.load(i ,j ) );
3555  SIMDType xmm2( C.load(i ,j+SIMDSIZE ) );
3556  SIMDType xmm3( C.load(i ,j+SIMDSIZE*2UL) );
3557  SIMDType xmm4( C.load(i+1UL,j ) );
3558  SIMDType xmm5( C.load(i+1UL,j+SIMDSIZE ) );
3559  SIMDType xmm6( C.load(i+1UL,j+SIMDSIZE*2UL) );
3560 
3561  for( size_t k=kbegin; k<kend; ++k ) {
3562  const SIMDType a1( set( A(i ,k) ) );
3563  const SIMDType a2( set( A(i+1UL,k) ) );
3564  const SIMDType b1( B.load(k,j ) );
3565  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3566  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3567  xmm1 -= a1 * b1;
3568  xmm2 -= a1 * b2;
3569  xmm3 -= a1 * b3;
3570  xmm4 -= a2 * b1;
3571  xmm5 -= a2 * b2;
3572  xmm6 -= a2 * b3;
3573  }
3574 
3575  C.store( i , j , xmm1 );
3576  C.store( i , j+SIMDSIZE , xmm2 );
3577  C.store( i , j+SIMDSIZE*2UL, xmm3 );
3578  C.store( i+1UL, j , xmm4 );
3579  C.store( i+1UL, j+SIMDSIZE , xmm5 );
3580  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
3581  }
3582 
      // Single leftover row when M is odd
3583  if( i < M )
3584  {
3585  const size_t kbegin( ( IsUpper_v<MT4> )
3586  ?( ( IsLower_v<MT5> )
3587  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3588  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3589  :( IsLower_v<MT5> ? j : 0UL ) );
3590  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
3591 
3592  SIMDType xmm1( C.load(i,j ) );
3593  SIMDType xmm2( C.load(i,j+SIMDSIZE ) );
3594  SIMDType xmm3( C.load(i,j+SIMDSIZE*2UL) );
3595 
3596  for( size_t k=kbegin; k<kend; ++k ) {
3597  const SIMDType a1( set( A(i,k) ) );
3598  xmm1 -= a1 * B.load(k,j );
3599  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3600  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3601  }
3602 
3603  C.store( i, j , xmm1 );
3604  C.store( i, j+SIMDSIZE , xmm2 );
3605  C.store( i, j+SIMDSIZE*2UL, xmm3 );
3606  }
3607  }
3608 
      // Tier 5: two SIMD vectors per row; skipped only when both LOW and UPP are set.
      // The row range is limited to [j, ...) when LOW is set and to
      // [..., min(j+SIMDSIZE*2,M)) when UPP is set. Rows are taken in groups of 4, 3, 2, 1.
3609  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3610  {
3611  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
3612  size_t i( LOW ? j : 0UL );
3613 
3614  for( ; (i+4UL) <= iend; i+=4UL )
3615  {
3616  const size_t kbegin( ( IsUpper_v<MT4> )
3617  ?( ( IsLower_v<MT5> )
3618  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3619  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3620  :( IsLower_v<MT5> ? j : 0UL ) );
3621  const size_t kend( ( IsLower_v<MT4> )
3622  ?( ( IsUpper_v<MT5> )
3623  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
3624  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
3625  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3626 
3627  SIMDType xmm1( C.load(i ,j ) );
3628  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3629  SIMDType xmm3( C.load(i+1UL,j ) );
3630  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3631  SIMDType xmm5( C.load(i+2UL,j ) );
3632  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
3633  SIMDType xmm7( C.load(i+3UL,j ) );
3634  SIMDType xmm8( C.load(i+3UL,j+SIMDSIZE) );
3635 
3636  for( size_t k=kbegin; k<kend; ++k ) {
3637  const SIMDType a1( set( A(i ,k) ) );
3638  const SIMDType a2( set( A(i+1UL,k) ) );
3639  const SIMDType a3( set( A(i+2UL,k) ) );
3640  const SIMDType a4( set( A(i+3UL,k) ) );
3641  const SIMDType b1( B.load(k,j ) );
3642  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3643  xmm1 -= a1 * b1;
3644  xmm2 -= a1 * b2;
3645  xmm3 -= a2 * b1;
3646  xmm4 -= a2 * b2;
3647  xmm5 -= a3 * b1;
3648  xmm6 -= a3 * b2;
3649  xmm7 -= a4 * b1;
3650  xmm8 -= a4 * b2;
3651  }
3652 
3653  C.store( i , j , xmm1 );
3654  C.store( i , j+SIMDSIZE, xmm2 );
3655  C.store( i+1UL, j , xmm3 );
3656  C.store( i+1UL, j+SIMDSIZE, xmm4 );
3657  C.store( i+2UL, j , xmm5 );
3658  C.store( i+2UL, j+SIMDSIZE, xmm6 );
3659  C.store( i+3UL, j , xmm7 );
3660  C.store( i+3UL, j+SIMDSIZE, xmm8 );
3661  }
3662 
3663  for( ; (i+3UL) <= iend; i+=3UL )
3664  {
3665  const size_t kbegin( ( IsUpper_v<MT4> )
3666  ?( ( IsLower_v<MT5> )
3667  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3668  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3669  :( IsLower_v<MT5> ? j : 0UL ) );
3670  const size_t kend( ( IsLower_v<MT4> )
3671  ?( ( IsUpper_v<MT5> )
3672  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
3673  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
3674  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3675 
3676  SIMDType xmm1( C.load(i ,j ) );
3677  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3678  SIMDType xmm3( C.load(i+1UL,j ) );
3679  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
3680  SIMDType xmm5( C.load(i+2UL,j ) );
3681  SIMDType xmm6( C.load(i+2UL,j+SIMDSIZE) );
3682 
3683  for( size_t k=kbegin; k<kend; ++k ) {
3684  const SIMDType a1( set( A(i ,k) ) );
3685  const SIMDType a2( set( A(i+1UL,k) ) );
3686  const SIMDType a3( set( A(i+2UL,k) ) );
3687  const SIMDType b1( B.load(k,j ) );
3688  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3689  xmm1 -= a1 * b1;
3690  xmm2 -= a1 * b2;
3691  xmm3 -= a2 * b1;
3692  xmm4 -= a2 * b2;
3693  xmm5 -= a3 * b1;
3694  xmm6 -= a3 * b2;
3695  }
3696 
3697  C.store( i , j , xmm1 );
3698  C.store( i , j+SIMDSIZE, xmm2 );
3699  C.store( i+1UL, j , xmm3 );
3700  C.store( i+1UL, j+SIMDSIZE, xmm4 );
3701  C.store( i+2UL, j , xmm5 );
3702  C.store( i+2UL, j+SIMDSIZE, xmm6 );
3703  }
3704 
3705  for( ; (i+2UL) <= iend; i+=2UL )
3706  {
3707  const size_t kbegin( ( IsUpper_v<MT4> )
3708  ?( ( IsLower_v<MT5> )
3709  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3710  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3711  :( IsLower_v<MT5> ? j : 0UL ) );
3712  const size_t kend( ( IsLower_v<MT4> )
3713  ?( ( IsUpper_v<MT5> )
3714  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
3715  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3716  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
3717 
3718  SIMDType xmm1( C.load(i ,j ) );
3719  SIMDType xmm2( C.load(i ,j+SIMDSIZE) );
3720  SIMDType xmm3( C.load(i+1UL,j ) );
3721  SIMDType xmm4( C.load(i+1UL,j+SIMDSIZE) );
      // Secondary accumulators for the odd k of each unrolled pair; they are added to
      // xmm1..xmm4 when storing.
      // NOTE(review): relies on a default-constructed SIMDType being zero - confirm.
3722  SIMDType xmm5, xmm6, xmm7, xmm8;
3723  size_t k( kbegin );
3724 
3725  for( ; (k+2UL) <= kend; k+=2UL ) {
3726  const SIMDType a1( set( A(i ,k ) ) );
3727  const SIMDType a2( set( A(i+1UL,k ) ) );
3728  const SIMDType a3( set( A(i ,k+1UL) ) );
3729  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
3730  const SIMDType b1( B.load(k ,j ) );
3731  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
3732  const SIMDType b3( B.load(k+1UL,j ) );
3733  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
3734  xmm1 -= a1 * b1;
3735  xmm2 -= a1 * b2;
3736  xmm3 -= a2 * b1;
3737  xmm4 -= a2 * b2;
3738  xmm5 -= a3 * b3;
3739  xmm6 -= a3 * b4;
3740  xmm7 -= a4 * b3;
3741  xmm8 -= a4 * b4;
3742  }
3743 
3744  for( ; k<kend; ++k ) {
3745  const SIMDType a1( set( A(i ,k) ) );
3746  const SIMDType a2( set( A(i+1UL,k) ) );
3747  const SIMDType b1( B.load(k,j ) );
3748  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3749  xmm1 -= a1 * b1;
3750  xmm2 -= a1 * b2;
3751  xmm3 -= a2 * b1;
3752  xmm4 -= a2 * b2;
3753  }
3754 
3755  C.store( i , j , xmm1+xmm5 );
3756  C.store( i , j+SIMDSIZE, xmm2+xmm6 );
3757  C.store( i+1UL, j , xmm3+xmm7 );
3758  C.store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
3759  }
3760 
3761  if( i < iend )
3762  {
3763  const size_t kbegin( ( IsUpper_v<MT4> )
3764  ?( ( IsLower_v<MT5> )
3765  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3766  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3767  :( IsLower_v<MT5> ? j : 0UL ) );
3768  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
3769 
3770  SIMDType xmm1( C.load(i,j ) );
3771  SIMDType xmm2( C.load(i,j+SIMDSIZE) );
3772  SIMDType xmm3, xmm4;
3773  size_t k( kbegin );
3774 
3775  for( ; (k+2UL) <= kend; k+=2UL ) {
3776  const SIMDType a1( set( A(i,k ) ) );
3777  const SIMDType a2( set( A(i,k+1UL) ) );
3778  xmm1 -= a1 * B.load(k ,j );
3779  xmm2 -= a1 * B.load(k ,j+SIMDSIZE);
3780  xmm3 -= a2 * B.load(k+1UL,j );
3781  xmm4 -= a2 * B.load(k+1UL,j+SIMDSIZE);
3782  }
3783 
3784  for( ; k<kend; ++k ) {
3785  const SIMDType a1( set( A(i,k) ) );
3786  xmm1 -= a1 * B.load(k,j );
3787  xmm2 -= a1 * B.load(k,j+SIMDSIZE);
3788  }
3789 
3790  C.store( i, j , xmm1+xmm3 );
3791  C.store( i, j+SIMDSIZE, xmm2+xmm4 );
3792  }
3793  }
3794 
      // Tier 6: one SIMD vector per row for the remaining vectorizable columns. Rows are
      // again taken in groups of 4, 3, 2, 1 with k unrolled by two.
3795  for( ; j<jpos; j+=SIMDSIZE )
3796  {
3797  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
3798  size_t i( LOW ? j : 0UL );
3799 
3800  for( ; (i+4UL) <= iend; i+=4UL )
3801  {
3802  const size_t kbegin( ( IsUpper_v<MT4> )
3803  ?( ( IsLower_v<MT5> )
3804  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3805  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3806  :( IsLower_v<MT5> ? j : 0UL ) );
3807  const size_t kend( ( IsLower_v<MT4> )
3808  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
3809  :( K ) );
3810 
3811  SIMDType xmm1( C.load(i ,j) );
3812  SIMDType xmm2( C.load(i+1UL,j) );
3813  SIMDType xmm3( C.load(i+2UL,j) );
3814  SIMDType xmm4( C.load(i+3UL,j) );
3815  SIMDType xmm5, xmm6, xmm7, xmm8;
3816  size_t k( kbegin );
3817 
3818  for( ; (k+2UL) <= kend; k+=2UL ) {
3819  const SIMDType b1( B.load(k ,j) );
3820  const SIMDType b2( B.load(k+1UL,j) );
3821  xmm1 -= set( A(i ,k ) ) * b1;
3822  xmm2 -= set( A(i+1UL,k ) ) * b1;
3823  xmm3 -= set( A(i+2UL,k ) ) * b1;
3824  xmm4 -= set( A(i+3UL,k ) ) * b1;
3825  xmm5 -= set( A(i ,k+1UL) ) * b2;
3826  xmm6 -= set( A(i+1UL,k+1UL) ) * b2;
3827  xmm7 -= set( A(i+2UL,k+1UL) ) * b2;
3828  xmm8 -= set( A(i+3UL,k+1UL) ) * b2;
3829  }
3830 
3831  for( ; k<kend; ++k ) {
3832  const SIMDType b1( B.load(k,j) );
3833  xmm1 -= set( A(i ,k) ) * b1;
3834  xmm2 -= set( A(i+1UL,k) ) * b1;
3835  xmm3 -= set( A(i+2UL,k) ) * b1;
3836  xmm4 -= set( A(i+3UL,k) ) * b1;
3837  }
3838 
3839  C.store( i , j, xmm1+xmm5 );
3840  C.store( i+1UL, j, xmm2+xmm6 );
3841  C.store( i+2UL, j, xmm3+xmm7 );
3842  C.store( i+3UL, j, xmm4+xmm8 );
3843  }
3844 
3845  for( ; (i+3UL) <= iend; i+=3UL )
3846  {
3847  const size_t kbegin( ( IsUpper_v<MT4> )
3848  ?( ( IsLower_v<MT5> )
3849  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3850  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3851  :( IsLower_v<MT5> ? j : 0UL ) );
3852  const size_t kend( ( IsLower_v<MT4> )
3853  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
3854  :( K ) );
3855 
3856  SIMDType xmm1( C.load(i ,j) );
3857  SIMDType xmm2( C.load(i+1UL,j) );
3858  SIMDType xmm3( C.load(i+2UL,j) );
3859  SIMDType xmm4, xmm5, xmm6;
3860  size_t k( kbegin );
3861 
3862  for( ; (k+2UL) <= kend; k+=2UL ) {
3863  const SIMDType b1( B.load(k ,j) );
3864  const SIMDType b2( B.load(k+1UL,j) );
3865  xmm1 -= set( A(i ,k ) ) * b1;
3866  xmm2 -= set( A(i+1UL,k ) ) * b1;
3867  xmm3 -= set( A(i+2UL,k ) ) * b1;
3868  xmm4 -= set( A(i ,k+1UL) ) * b2;
3869  xmm5 -= set( A(i+1UL,k+1UL) ) * b2;
3870  xmm6 -= set( A(i+2UL,k+1UL) ) * b2;
3871  }
3872 
3873  for( ; k<kend; ++k ) {
3874  const SIMDType b1( B.load(k,j) );
3875  xmm1 -= set( A(i ,k) ) * b1;
3876  xmm2 -= set( A(i+1UL,k) ) * b1;
3877  xmm3 -= set( A(i+2UL,k) ) * b1;
3878  }
3879 
3880  C.store( i , j, xmm1+xmm4 );
3881  C.store( i+1UL, j, xmm2+xmm5 );
3882  C.store( i+2UL, j, xmm3+xmm6 );
3883  }
3884 
3885  for( ; (i+2UL) <= iend; i+=2UL )
3886  {
3887  const size_t kbegin( ( IsUpper_v<MT4> )
3888  ?( ( IsLower_v<MT5> )
3889  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3890  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3891  :( IsLower_v<MT5> ? j : 0UL ) );
3892  const size_t kend( ( IsLower_v<MT4> )
3893  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3894  :( K ) );
3895 
3896  SIMDType xmm1( C.load(i ,j) );
3897  SIMDType xmm2( C.load(i+1UL,j) );
3898  SIMDType xmm3, xmm4;
3899  size_t k( kbegin );
3900 
3901  for( ; (k+2UL) <= kend; k+=2UL ) {
3902  const SIMDType b1( B.load(k ,j) );
3903  const SIMDType b2( B.load(k+1UL,j) );
3904  xmm1 -= set( A(i ,k ) ) * b1;
3905  xmm2 -= set( A(i+1UL,k ) ) * b1;
3906  xmm3 -= set( A(i ,k+1UL) ) * b2;
3907  xmm4 -= set( A(i+1UL,k+1UL) ) * b2;
3908  }
3909 
3910  for( ; k<kend; ++k ) {
3911  const SIMDType b1( B.load(k,j) );
3912  xmm1 -= set( A(i ,k) ) * b1;
3913  xmm2 -= set( A(i+1UL,k) ) * b1;
3914  }
3915 
3916  C.store( i , j, xmm1+xmm3 );
3917  C.store( i+1UL, j, xmm2+xmm4 );
3918  }
3919 
      // Last single row: no kend is computed here, k simply runs up to K
3920  if( i < iend )
3921  {
3922  const size_t kbegin( ( IsUpper_v<MT4> )
3923  ?( ( IsLower_v<MT5> )
3924  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3925  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3926  :( IsLower_v<MT5> ? j : 0UL ) );
3927 
3928  SIMDType xmm1( C.load(i,j) );
3929  SIMDType xmm2;
3930  size_t k( kbegin );
3931 
3932  for( ; (k+2UL) <= K; k+=2UL ) {
3933  xmm1 -= set( A(i,k ) ) * B.load(k ,j);
3934  xmm2 -= set( A(i,k+1UL) ) * B.load(k+1UL,j);
3935  }
3936 
3937  for( ; k<K; ++k ) {
3938  xmm1 -= set( A(i,k) ) * B.load(k,j);
3939  }
3940 
3941  C.store( i, j, xmm1+xmm2 );
3942  }
3943  }
3944 
      // Scalar remainder: columns from jpos up to N are processed element-wise, two rows
      // at a time plus one optional leftover row. Only runs when 'remainder' is true.
3945  for( ; remainder && j<N; ++j )
3946  {
3947  const size_t iend( UPP ? j+1UL : M );
3948  size_t i( LOW ? j : 0UL );
3949 
3950  for( ; (i+2UL) <= iend; i+=2UL )
3951  {
3952  const size_t kbegin( ( IsUpper_v<MT4> )
3953  ?( ( IsLower_v<MT5> )
3954  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3955  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3956  :( IsLower_v<MT5> ? j : 0UL ) );
3957  const size_t kend( ( IsLower_v<MT4> )
3958  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3959  :( K ) );
3960 
3961  ElementType value1( C(i ,j) );
3962  ElementType value2( C(i+1UL,j) );
3963 
3964  for( size_t k=kbegin; k<kend; ++k ) {
3965  value1 -= A(i ,k) * B(k,j);
3966  value2 -= A(i+1UL,k) * B(k,j);
3967  }
3968 
3969  C(i ,j) = value1;
3970  C(i+1UL,j) = value2;
3971  }
3972 
3973  if( i < iend )
3974  {
3975  const size_t kbegin( ( IsUpper_v<MT4> )
3976  ?( ( IsLower_v<MT5> )
3977  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3978  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3979  :( IsLower_v<MT5> ? j : 0UL ) );
3980 
3981  ElementType value( C(i,j) );
3982 
3983  for( size_t k=kbegin; k<K; ++k ) {
3984  value -= A(i,k) * B(k,j);
3985  }
3986 
3987  C(i,j) = value;
3988  }
3989  }
3990  }
3992  //**********************************************************************************************
3993 
3994  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
4009  template< typename MT3 // Type of the left-hand side target matrix
4010  , typename MT4 // Type of the left-hand side matrix operand
4011  , typename MT5 > // Type of the right-hand side matrix operand
4012  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4013  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4014  {
4019 
4020  const ForwardFunctor fwd;
4021 
4022  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
4023  const OppositeType_t<MT4> tmp( serial( A ) );
4024  subAssign( C, fwd( tmp * B ) );
4025  }
4026  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
4027  const OppositeType_t<MT5> tmp( serial( B ) );
4028  subAssign( C, fwd( A * tmp ) );
4029  }
4030  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
4031  const OppositeType_t<MT4> tmp( serial( A ) );
4032  subAssign( C, fwd( tmp * B ) );
4033  }
4034  else {
4035  const OppositeType_t<MT5> tmp( serial( B ) );
4036  subAssign( C, fwd( A * tmp ) );
4037  }
4038  }
4040  //**********************************************************************************************
4041 
4042  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix subtraction assignment kernel for the non-vectorized case.
//
// Participates in overload resolution only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5>
// is false (DisableIf_t) and does nothing but forward all three arguments unchanged to
// selectDefaultSubAssignKernel().
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
4056  template< typename MT3 // Type of the left-hand side target matrix
4057  , typename MT4 // Type of the left-hand side matrix operand
4058  , typename MT5 > // Type of the right-hand side matrix operand
4059  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4060  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4061  {
4062  selectDefaultSubAssignKernel( C, A, B );
4063  }
4065  //**********************************************************************************************
4066 
4067  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
4082  template< typename MT3 // Type of the left-hand side target matrix
4083  , typename MT4 // Type of the left-hand side matrix operand
4084  , typename MT5 > // Type of the right-hand side matrix operand
4085  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4086  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4087  {
4088  if( LOW )
4089  lmmm( C, A, B, ElementType(-1), ElementType(1) );
4090  else if( UPP )
4091  ummm( C, A, B, ElementType(-1), ElementType(1) );
4092  else
4093  mmm( C, A, B, ElementType(-1), ElementType(1) );
4094  }
4096  //**********************************************************************************************
4097 
4098  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Subtraction assignment kernel used when the BLAS kernel is not applicable.
//
// Participates in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5> is false
// (DisableIf_t) and does nothing but forward all three arguments unchanged to
// selectLargeSubAssignKernel().
//
// C: target matrix, A: left-hand side operand, B: right-hand side operand.
4112  template< typename MT3 // Type of the left-hand side target matrix
4113  , typename MT4 // Type of the left-hand side matrix operand
4114  , typename MT5 > // Type of the right-hand side matrix operand
4115  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4116  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4117  {
4118  selectLargeSubAssignKernel( C, A, B );
4119  }
4121  //**********************************************************************************************
4122 
4123  //**BLAS-based subtraction assignment to dense matrices*****************************************
4124 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4125 
4138  template< typename MT3 // Type of the left-hand side target matrix
4139  , typename MT4 // Type of the left-hand side matrix operand
4140  , typename MT5 > // Type of the right-hand side matrix operand
4141  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4142  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4143  {
4144  using ET = ElementType_t<MT3>;
4145 
4146  if( IsTriangular_v<MT4> ) {
4147  ResultType_t<MT3> tmp( serial( B ) );
4148  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4149  subAssign( C, tmp );
4150  }
4151  else if( IsTriangular_v<MT5> ) {
4152  ResultType_t<MT3> tmp( serial( A ) );
4153  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4154  subAssign( C, tmp );
4155  }
4156  else {
4157  gemm( C, A, B, ET(-1), ET(1) );
4158  }
4159  }
4161 #endif
4162  //**********************************************************************************************
4163 
4164  //**Restructuring subtraction assignment to column-major matrices*******************************
4179  template< typename MT > // Type of the target matrix
4180  friend inline auto subAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4181  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4182  {
4184 
4186 
4187  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4188  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4189 
4190  const ForwardFunctor fwd;
4191 
4192  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4193  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4194  else if( IsSymmetric_v<MT1> )
4195  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4196  else
4197  subAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4198  }
4200  //**********************************************************************************************
4201 
4202  //**Subtraction assignment to sparse matrices***************************************************
4203  // No special implementation for the subtraction assignment to sparse matrices.
4204  //**********************************************************************************************
4205 
4206  //**Schur product assignment to dense matrices**************************************************
4219  template< typename MT // Type of the target dense matrix
4220  , bool SO > // Storage order of the target dense matrix
4221  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4222  {
4224 
4228 
4229  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4230  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4231 
4232  const ResultType tmp( serial( rhs ) );
4233  schurAssign( ~lhs, tmp );
4234  }
4236  //**********************************************************************************************
4237 
4238  //**Schur product assignment to sparse matrices*************************************************
4239  // No special implementation for the Schur product assignment to sparse matrices.
4240  //**********************************************************************************************
4241 
4242  //**Multiplication assignment to dense matrices*************************************************
4243  // No special implementation for the multiplication assignment to dense matrices.
4244  //**********************************************************************************************
4245 
4246  //**Multiplication assignment to sparse matrices************************************************
4247  // No special implementation for the multiplication assignment to sparse matrices.
4248  //**********************************************************************************************
4249 
4250  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the dense matrix-dense matrix multiplication expression to a dense
// matrix. This overload only participates in overload resolution if
// IsEvaluationRequired_v<MT,MT1,MT2> is true. Both operands are evaluated into local
// objects (A, B) and the product A * B is handed to smpAssign().
//   lhs  The target left-hand side dense matrix.
//   rhs  The right-hand side multiplication expression.
4265  template< typename MT // Type of the target dense matrix
4266  , bool SO > // Storage order of the target dense matrix
4267  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4268  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4269  {
4271 
4272  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4273  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4274 
// Empty target: nothing to do. Zero inner dimension: the target is reset instead of
// running a multiplication.
4275  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4276  return;
4277  }
4278  else if( rhs.lhs_.columns() == 0UL ) {
4279  reset( ~lhs );
4280  return;
4281  }
4282 
4283  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4284  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4285 
4286  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4287  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4288  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4289  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4290  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4291  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4292 
4293  smpAssign( ~lhs, A * B ); // Assignment of the product of the evaluated operands
4294  }
4296  //**********************************************************************************************
4297 
4298  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the dense matrix-dense matrix multiplication expression to a sparse
// matrix. This overload only participates in overload resolution if
// IsEvaluationRequired_v<MT,MT1,MT2> is true. The expression is evaluated into a dense
// temporary, which is then passed through the ForwardFunctor and assigned to the target.
//   lhs  The target left-hand side sparse matrix.
//   rhs  The right-hand side multiplication expression.
4313  template< typename MT // Type of the target sparse matrix
4314  , bool SO > // Storage order of the target sparse matrix
4315  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4316  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4317  {
4319 
// The temporary is of type OppositeType if SO is true and of type ResultType otherwise.
4320  using TmpType = If_t< SO, OppositeType, ResultType >;
4321 
4328 
4329  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4330  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4331 
4332  const ForwardFunctor fwd;
4333 
4334  const TmpType tmp( rhs ); // Evaluation of the expression into the temporary
4335  smpAssign( ~lhs, fwd( tmp ) );
4336  }
4338  //**********************************************************************************************
4339 
4340  //**Restructuring SMP assignment to column-major matrices***************************************
// Restructuring SMP assignment of the multiplication expression to a column-major matrix.
// This overload only participates in overload resolution if
// CanExploitSymmetry_v<MT,MT1,MT2> is true. The multiplication is re-expressed with the
// symmetric operand(s) transposed, wrapped by the ForwardFunctor, and assigned again.
//   lhs  The target left-hand side (column-major) matrix.
//   rhs  The right-hand side multiplication expression.
4355  template< typename MT > // Type of the target matrix
4356  friend inline auto smpAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4357  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4358  {
4360 
4362 
4363  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4364  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4365 
4366  const ForwardFunctor fwd;
4367 
// Both operands symmetric: transpose both. Only the left operand symmetric: transpose
// the left one. Otherwise: transpose the right one.
4368  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4369  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4370  else if( IsSymmetric_v<MT1> )
4371  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4372  else
4373  smpAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4374  }
4376  //**********************************************************************************************
4377 
4378  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the dense matrix-dense matrix multiplication expression to a
// dense matrix. This overload only participates in overload resolution if
// IsEvaluationRequired_v<MT,MT1,MT2> is true. Both operands are evaluated into local
// objects (A, B) and the product A * B is handed to smpAddAssign().
//   lhs  The target left-hand side dense matrix.
//   rhs  The right-hand side multiplication expression to be added.
4394  template< typename MT // Type of the target dense matrix
4395  , bool SO > // Storage order of the target dense matrix
4396  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4397  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4398  {
4400 
4401  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4402  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4403 
// Early exit if the target is empty or the inner dimension is zero; the target is left
// untouched in both cases.
4404  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4405  return;
4406  }
4407 
4408  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4409  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4410 
4411  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4412  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4413  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4414  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4415  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4416  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4417 
4418  smpAddAssign( ~lhs, A * B ); // Addition assignment of the product of the evaluated operands
4419  }
4421  //**********************************************************************************************
4422 
4423  //**Restructuring SMP addition assignment to column-major matrices******************************
// Restructuring SMP addition assignment of the multiplication expression to a column-major
// matrix. This overload only participates in overload resolution if
// CanExploitSymmetry_v<MT,MT1,MT2> is true. The multiplication is re-expressed with the
// symmetric operand(s) transposed, wrapped by the ForwardFunctor, and added to the target.
//   lhs  The target left-hand side (column-major) matrix.
//   rhs  The right-hand side multiplication expression to be added.
4438  template< typename MT > // Type of the target matrix
4439  friend inline auto smpAddAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4440  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4441  {
4443 
4445 
4446  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4447  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4448 
4449  const ForwardFunctor fwd;
4450 
// Both operands symmetric: transpose both. Only the left operand symmetric: transpose
// the left one. Otherwise: transpose the right one.
4451  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4452  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4453  else if( IsSymmetric_v<MT1> )
4454  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4455  else
4456  smpAddAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4457  }
4459  //**********************************************************************************************
4460 
4461  //**SMP addition assignment to sparse matrices**************************************************
4462  // No special implementation for the SMP addition assignment to sparse matrices.
4463  //**********************************************************************************************
4464 
4465  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the dense matrix-dense matrix multiplication expression to
// a dense matrix. This overload only participates in overload resolution if
// IsEvaluationRequired_v<MT,MT1,MT2> is true. Both operands are evaluated into local
// objects (A, B) and the product A * B is handed to smpSubAssign().
//   lhs  The target left-hand side dense matrix.
//   rhs  The right-hand side multiplication expression to be subtracted.
4481  template< typename MT // Type of the target dense matrix
4482  , bool SO > // Storage order of the target dense matrix
4483  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4484  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4485  {
4487 
4488  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4489  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4490 
// Early exit if the target is empty or the inner dimension is zero; the target is left
// untouched in both cases.
4491  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4492  return;
4493  }
4494 
4495  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4496  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4497 
4498  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4499  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4500  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4501  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4502  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4503  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4504 
4505  smpSubAssign( ~lhs, A * B ); // Subtraction assignment of the product of the evaluated operands
4506  }
4508  //**********************************************************************************************
4509 
4510  //**Restructuring SMP subtraction assignment to column-major matrices***************************
// Restructuring SMP subtraction assignment of the multiplication expression to a
// column-major matrix. This overload only participates in overload resolution if
// CanExploitSymmetry_v<MT,MT1,MT2> is true. The multiplication is re-expressed with the
// symmetric operand(s) transposed, wrapped by the ForwardFunctor, and subtracted from the
// target.
//   lhs  The target left-hand side (column-major) matrix.
//   rhs  The right-hand side multiplication expression to be subtracted.
4525  template< typename MT > // Type of the target matrix
4526  friend inline auto smpSubAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4527  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4528  {
4530 
4532 
4533  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4534  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4535 
4536  const ForwardFunctor fwd;
4537 
// Both operands symmetric: transpose both. Only the left operand symmetric: transpose
// the left one. Otherwise: transpose the right one.
4538  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4539  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4540  else if( IsSymmetric_v<MT1> )
4541  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4542  else
4543  smpSubAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4544  }
4546  //**********************************************************************************************
4547 
4548  //**SMP subtraction assignment to sparse matrices***********************************************
4549  // No special implementation for the SMP subtraction assignment to sparse matrices.
4550  //**********************************************************************************************
4551 
4552  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the dense matrix-dense matrix multiplication expression
// to a dense matrix. The expression is first evaluated into a temporary of type ResultType
// (without the serial() wrapper used by the non-SMP schurAssign()); the SMP Schur product
// assignment is then carried out with that temporary.
//   lhs  The target left-hand side dense matrix.
//   rhs  The right-hand side multiplication expression.
4565  template< typename MT // Type of the target dense matrix
4566  , bool SO > // Storage order of the target dense matrix
4567  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4568  {
4570 
4574 
4575  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4576  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4577 
4578  const ResultType tmp( rhs ); // Evaluation of the product into a temporary
4579  smpSchurAssign( ~lhs, tmp ); // SMP Schur product assignment of the evaluated temporary
4580  }
4582  //**********************************************************************************************
4583 
4584  //**SMP Schur product assignment to sparse matrices*********************************************
4585  // No special implementation for the SMP Schur product assignment to sparse matrices.
4586  //**********************************************************************************************
4587 
4588  //**SMP multiplication assignment to dense matrices*********************************************
4589  // No special implementation for the SMP multiplication assignment to dense matrices.
4590  //**********************************************************************************************
4591 
4592  //**SMP multiplication assignment to sparse matrices********************************************
4593  // No special implementation for the SMP multiplication assignment to sparse matrices.
4594  //**********************************************************************************************
4595 
4596  //**Compile time checks*************************************************************************
4604  //**********************************************************************************************
4605 };
4606 //*************************************************************************************************
4607 
4608 
4609 
4610 
4611 //=================================================================================================
4612 //
4613 // DMATSCALARMULTEXPR SPECIALIZATION
4614 //
4615 //=================================================================================================
4616 
4617 //*************************************************************************************************
4625 template< typename MT1 // Type of the left-hand side dense matrix
4626  , typename MT2 // Type of the right-hand side dense matrix
4627  , bool SF // Symmetry flag
4628  , bool HF // Hermitian flag
4629  , bool LF // Lower flag
4630  , bool UF // Upper flag
4631  , typename ST > // Type of the right-hand side scalar value
4632 class DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4633  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4634  , private Computation
4635 {
4636  private:
4637  //**Type definitions****************************************************************************
4639  using MMM = DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>; // Type of the wrapped matrix multiplication expression
4640 
4641  using RES = ResultType_t<MMM>; // ResultType_t of the multiplication expression
4642  using RT1 = ResultType_t<MT1>; // ResultType_t of the left-hand side matrix operand
4643  using RT2 = ResultType_t<MT2>; // ResultType_t of the right-hand side matrix operand
4644  using ET1 = ElementType_t<RT1>; // ElementType_t of RT1
4645  using ET2 = ElementType_t<RT2>; // ElementType_t of RT2
4646  using CT1 = CompositeType_t<MT1>; // CompositeType_t of the left-hand side matrix operand
4647  using CT2 = CompositeType_t<MT2>; // CompositeType_t of the right-hand side matrix operand
4648  //**********************************************************************************************
4649 
4650  //**********************************************************************************************
// Compile time flag: true if the left-hand side operand type is a computation or requires
// an evaluation. It selects the type LT used to hold the left operand in the kernels.
4652  static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
4653  //**********************************************************************************************
4654 
4655  //**********************************************************************************************
// Compile time flag: true if the right-hand side operand type is a computation or requires
// an evaluation. It selects the type RT used to hold the right operand in the kernels.
4657  static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
4658  //**********************************************************************************************
4659 
4660  //**********************************************************************************************
// Structure flags derived from the template flags SF, HF, LF and UF. They are used below
// to select the ForwardFunctor and ResultType and to restrict the kernel loop ranges.
4661  static constexpr bool SYM = ( SF && !( HF || LF || UF ) ); // SF set, no other flag set
4662  static constexpr bool HERM = ( HF && !( LF || UF ) ); // HF set, neither LF nor UF set
4663  static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) ); // LF set, or UF combined with SF/HF
4664  static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) ); // UF set, or LF combined with SF/HF
4665  //**********************************************************************************************
4666 
4667  //**********************************************************************************************
4669 
// Helper variable template: true if T1 is a column-major matrix type and at least one of
// T2 and T3 is a symmetric matrix type. It disables the assign() overload in this class
// and is part of IsEvaluationRequired_v.
4673  template< typename T1, typename T2, typename T3 >
4674  static constexpr bool CanExploitSymmetry_v =
4675  ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
4676  //**********************************************************************************************
4677 
4678  //**********************************************************************************************
4680 
// Helper variable template: true if at least one operand has to be evaluated
// (evaluateLeft or evaluateRight) and CanExploitSymmetry_v<T1,T2,T3> is false.
4683  template< typename T1, typename T2, typename T3 >
4684  static constexpr bool IsEvaluationRequired_v =
4685  ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
4686  //**********************************************************************************************
4687 
4688  //**********************************************************************************************
4690 
// Helper variable template: true only if all of the conditions below hold for the target
// type T1, the operand types T2 and T3 and the scalar type T4:
//  - BLAS mode and the BLAS matrix/matrix multiplication are enabled,
//  - none of the structure flags SYM, HERM, LOW, UPP is set,
//  - all three matrix types are contiguous with the required (mutable/const) data access,
//  - neither operand is diagonal,
//  - all three matrix types are SIMD enabled,
//  - all three element types are BLAS compatible and identical,
//  - the combination "builtin element type with complex scalar" is excluded.
4692  template< typename T1, typename T2, typename T3, typename T4 >
4693  static constexpr bool UseBlasKernel_v =
4694  ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4695  !SYM && !HERM && !LOW && !UPP &&
4696  IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4697  IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4698  IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4699  !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4700  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4701  IsBLASCompatible_v< ElementType_t<T1> > &&
4702  IsBLASCompatible_v< ElementType_t<T2> > &&
4703  IsBLASCompatible_v< ElementType_t<T3> > &&
4704  IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4705  IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4706  !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
4707  //**********************************************************************************************
4708 
4709  //**********************************************************************************************
4711 
// Helper variable template: true only if optimized kernels are enabled, the right-hand
// side operand type T3 is not diagonal, all three matrix types are SIMD enabled, the three
// element types and the scalar type T4 are SIMD combinable, and SIMD addition and
// multiplication are available for the element types of T2 and T3. When true, the
// non-vectorized selectSmallAssignKernel() overload is disabled.
4713  template< typename T1, typename T2, typename T3, typename T4 >
4714  static constexpr bool UseVectorizedDefaultKernel_v =
4715  ( useOptimizedKernels &&
4716  !IsDiagonal_v<T3> &&
4717  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4718  IsSIMDCombinable_v< ElementType_t<T1>
4719  , ElementType_t<T2>
4720  , ElementType_t<T3>
4721  , T4 > &&
4722  HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4723  HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
4724  //**********************************************************************************************
4725 
4726  //**********************************************************************************************
4728 
// Functor type selected from the structure flags, in this order of precedence:
// HERM -> DeclHerm, SYM -> DeclSym, LOW && UPP -> DeclDiag, LOW -> DeclLow,
// UPP -> DeclUpp, otherwise Noop.
4730  using ForwardFunctor = If_t< HERM
4731  , DeclHerm
4732  , If_t< SYM
4733  , DeclSym
4734  , If_t< LOW
4735  , If_t< UPP
4736  , DeclDiag
4737  , DeclLow >
4738  , If_t< UPP
4739  , DeclUpp
4740  , Noop > > > >;
4741  //**********************************************************************************************
4742 
4743  public:
4744  //**Type definitions****************************************************************************
4746  using This = DMatScalarMultExpr<MMM,ST,false>; // Type of this DMatScalarMultExpr instance
4747 
4749  using BaseType = DenseMatrix<This,false>; // DenseMatrix base class type of this expression
4750 
// Result type: the trait applied to MultTrait_t<RES,ST> is selected from the structure
// flags in the same order of precedence as for ForwardFunctor (HERM, SYM, LOW && UPP,
// LOW, UPP); without any flag the plain MultTrait<RES,ST> is used.
4752  using ResultType = typename If_t< HERM
4753  , DeclHermTrait< MultTrait_t<RES,ST> >
4754  , If_t< SYM
4755  , DeclSymTrait< MultTrait_t<RES,ST> >
4756  , If_t< LOW
4757  , If_t< UPP
4758  , DeclDiagTrait< MultTrait_t<RES,ST> >
4759  , DeclLowTrait< MultTrait_t<RES,ST> > >
4760  , If_t< UPP
4761  , DeclUppTrait< MultTrait_t<RES,ST> >
4762  , MultTrait<RES,ST> > > > >::Type;
4763 
4764  using OppositeType = OppositeType_t<ResultType>; // OppositeType_t of the result type
4765  using TransposeType = TransposeType_t<ResultType>; // TransposeType_t of the result type
4766  using ElementType = ElementType_t<ResultType>; // ElementType_t of the result type
4767  using SIMDType = SIMDTrait_t<ElementType>; // SIMDTrait_t of the element type
4768  using ReturnType = const ElementType; // Return type of operator() and at()
4769  using CompositeType = const ResultType; // Constant result type
4770 
4772  using LeftOperand = const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>; // Return type of leftOperand()
4773 
4775  using RightOperand = ST; // Return type of rightOperand()
4776 
4778  using LT = If_t< evaluateLeft, const RT1, CT1 >; // const RT1 if evaluateLeft is true, CT1 otherwise
4779 
4781  using RT = If_t< evaluateRight, const RT2, CT2 >; // const RT2 if evaluateRight is true, CT2 otherwise
4782  //**********************************************************************************************
4783 
4784  //**Compilation flags***************************************************************************
// Compilation flag for SIMD: true only if the right-hand side operand type is not
// diagonal, both operand types are SIMD enabled, the element types and the scalar type are
// SIMD combinable, and SIMD addition and multiplication exist for the element types.
4786  static constexpr bool simdEnabled =
4787  ( !IsDiagonal_v<MT2> &&
4788  MT1::simdEnabled && MT2::simdEnabled &&
4789  IsSIMDCombinable_v<ET1,ET2,ST> &&
4790  HasSIMDAdd_v<ET1,ET2> &&
4791  HasSIMDMult_v<ET1,ET2> );
4792 
// Compilation flag for SMP assignments: true only if neither operand has to be evaluated
// and both operand types are themselves SMP assignable.
4794  static constexpr bool smpAssignable =
4795  ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4796  //**********************************************************************************************
4797 
4798  //**SIMD properties*****************************************************************************
// Value of SIMDTrait<ElementType>::size; used in the kernel selection of
// selectAssignKernel().
4800  static constexpr size_t SIMDSIZE = SIMDTrait<ElementType>::size;
4801  //**********************************************************************************************
4802 
4803  //**Constructor*********************************************************************************
// Constructor: stores the given matrix multiplication expression and the scalar.
//   matrix  The left-hand side dense matrix multiplication expression.
//   scalar  The right-hand side scalar of the multiplication expression.
4809  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4810  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4811  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4812  {}
4813  //**********************************************************************************************
4814 
4815  //**Access operator*****************************************************************************
// 2D access to the matrix elements: returns matrix_(i,j) multiplied by the scalar.
// The indices are only checked via internal assertions.
//   i  Row access index; asserted to be smaller than matrix_.rows().
//   j  Column access index; asserted to be smaller than matrix_.columns().
4822  inline ReturnType operator()( size_t i, size_t j ) const {
4823  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4824  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4825  return matrix_(i,j) * scalar_;
4826  }
4827  //**********************************************************************************************
4828 
4829  //**At function*********************************************************************************
// Checked access to the matrix elements: validates both indices and then forwards to
// operator().
//   i  Row access index.
//   j  Column access index.
// Raises an out-of-range error via BLAZE_THROW_OUT_OF_RANGE if i >= matrix_.rows() or
// j >= matrix_.columns().
4837  inline ReturnType at( size_t i, size_t j ) const {
4838  if( i >= matrix_.rows() ) {
4839  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4840  }
4841  if( j >= matrix_.columns() ) {
4842  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4843  }
4844  return (*this)(i,j);
4845  }
4846  //**********************************************************************************************
4847 
4848  //**Rows function*******************************************************************************
// Returns the number of rows of the wrapped matrix multiplication expression.
4853  inline size_t rows() const {
4854  return matrix_.rows();
4855  }
4856  //**********************************************************************************************
4857 
4858  //**Columns function****************************************************************************
// Returns the number of columns of the wrapped matrix multiplication expression.
4863  inline size_t columns() const {
4864  return matrix_.columns();
4865  }
4866  //**********************************************************************************************
4867 
4868  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the stored matrix multiplication expression.
4873  inline LeftOperand leftOperand() const {
4874  return matrix_;
4875  }
4876  //**********************************************************************************************
4877 
4878  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the stored scalar.
4883  inline RightOperand rightOperand() const {
4884  return scalar_;
4885  }
4886  //**********************************************************************************************
4887 
4888  //**********************************************************************************************
// Returns whether the expression can alias with the given address; the query is forwarded
// unchanged to the wrapped matrix multiplication expression.
//   alias  The alias to be checked.
4894  template< typename T >
4895  inline bool canAlias( const T* alias ) const {
4896  return matrix_.canAlias( alias );
4897  }
4898  //**********************************************************************************************
4899 
4900  //**********************************************************************************************
// Returns whether the expression is aliased with the given address; the query is forwarded
// unchanged to the wrapped matrix multiplication expression.
//   alias  The alias to be checked.
4906  template< typename T >
4907  inline bool isAliased( const T* alias ) const {
4908  return matrix_.isAliased( alias );
4909  }
4910  //**********************************************************************************************
4911 
4912  //**********************************************************************************************
// Returns the result of isAligned() of the wrapped matrix multiplication expression.
4917  inline bool isAligned() const {
4918  return matrix_.isAligned();
4919  }
4920  //**********************************************************************************************
4921 
4922  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments. The number of elements
// (rows() * columns()) must reach SMP_DMATDMATMULT_THRESHOLD, and in addition at least one
// of the conditions in the first parenthesized group must hold (BLAS mode disabled, BLAS
// matrix/matrix multiplication disabled, or fewer elements than DMATDMATMULT_THRESHOLD).
// NOTE(review): the embedded listing numbers skip from 4929 to 4931, so one line of the
// first group is not visible in this view - confirm against the full source.
4927  inline bool canSMPAssign() const noexcept {
4928  return ( !BLAZE_BLAS_MODE ||
4929  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4931  ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
4932  ( rows() * columns() >= SMP_DMATDMATMULT_THRESHOLD );
4933  }
4934  //**********************************************************************************************
4935 
4936  private:
4937  //**Member variables****************************************************************************
4940  //**********************************************************************************************
4941 
4942  //**Assignment to dense matrices****************************************************************
// Assignment of the scaled dense matrix-dense matrix multiplication expression to a dense
// matrix. This overload is disabled if CanExploitSymmetry_v<MT,MT1,MT2> is true. The two
// operands of the wrapped multiplication are evaluated serially into local objects (A, B)
// and the computation is delegated to selectAssignKernel() together with the scalar.
//   lhs  The target left-hand side dense matrix.
//   rhs  The right-hand side scaled multiplication expression.
4954  template< typename MT // Type of the target dense matrix
4955  , bool SO > // Storage order of the target dense matrix
4956  friend inline auto assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4957  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4958  {
4960 
4961  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4962  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4963 
4964  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() ); // Left operand of the wrapped multiplication
4965  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() ); // Right operand of the wrapped multiplication
4966 
// Empty target: nothing to do. Zero inner dimension: the target is reset instead of
// running a multiplication.
4967  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4968  return;
4969  }
4970  else if( left.columns() == 0UL ) {
4971  reset( ~lhs );
4972  return;
4973  }
4974 
4975  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4976  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4977 
4978  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4979  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4980  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4981  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4982  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4983  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4984 
4985  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4986  }
4987  //**********************************************************************************************
4988 
4989  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = A * B * scalar. The small-matrix kernel is used
// if the right-hand side operand type is diagonal, if (outside of debug mode) B has at
// most SIMDSIZE*10 columns, or if the target has fewer than DMATDMATMULT_THRESHOLD
// elements; otherwise the BLAS-based kernel selection is used.
//   C       The target left-hand side dense matrix.
//   A       The left-hand side multiplication operand.
//   B       The right-hand side multiplication operand.
//   scalar  The scaling factor.
5000  template< typename MT3 // Type of the left-hand side target matrix
5001  , typename MT4 // Type of the left-hand side matrix operand
5002  , typename MT5 // Type of the right-hand side matrix operand
5003  , typename ST2 > // Type of the scalar value
5004  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5005  {
5006  if( ( IsDiagonal_v<MT5> ) ||
5007  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
5008  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5009  selectSmallAssignKernel( C, A, B, scalar );
5010  else
5011  selectBlasAssignKernel( C, A, B, scalar );
5012  }
5013  //**********************************************************************************************
5014 
5015  //**Default assignment to dense matrices (general/general)**************************************
// Default (non-vectorized) kernel for the assignment C = A * B * scalar where neither
// operand type is diagonal. The target is processed row by row: the first contribution
// (k == kbegin) is assigned, the remaining contributions are accumulated, and the row is
// finally scaled. All loop ranges are narrowed according to the triangular properties of
// the operand types and the structure flags SYM, HERM, LOW and UPP. For SYM or HERM the
// part of C below the diagonal is filled in from the part above it at the end.
//   C       The target left-hand side dense matrix.
//   A       The left-hand side multiplication operand.
//   B       The right-hand side multiplication operand.
//   scalar  The scaling factor.
5029  template< typename MT3 // Type of the left-hand side target matrix
5030  , typename MT4 // Type of the left-hand side matrix operand
5031  , typename MT5 // Type of the right-hand side matrix operand
5032  , typename ST2 > // Type of the scalar value
5033  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5034  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5035  {
5036  const size_t M( A.rows() );
5037  const size_t N( B.columns() );
5038  const size_t K( A.columns() );
5039 
// Any structure flag implies a square result.
5040  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5041 
5042  for( size_t i=0UL; i<M; ++i )
5043  {
// Range [kbegin,kend) of the inner index for row i, narrowed if A is upper or lower
// (strictly) triangular.
5044  const size_t kbegin( ( IsUpper_v<MT4> )
5045  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5046  :( 0UL ) );
5047  const size_t kend( ( IsLower_v<MT4> )
5048  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
5049  :( K ) );
5050  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5051 
// Strictly triangular A with an empty inner range: the whole row i of C is reset.
5052  if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
5053  for( size_t j=0UL; j<N; ++j ) {
5054  reset( C(i,j) );
5055  }
5056  continue;
5057  }
5058 
// Step 1: initialize row i with the contribution of k == kbegin (plain assignment) and
// reset the elements outside of [jbegin,jend) in the cases handled below.
5059  {
5060  const size_t jbegin( ( IsUpper_v<MT5> )
5061  ?( ( IsStrictlyUpper_v<MT5> )
5062  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
5063  :( UPP ? max(i,kbegin) : kbegin ) )
5064  :( UPP ? i : 0UL ) );
5065  const size_t jend( ( IsLower_v<MT5> )
5066  ?( ( IsStrictlyLower_v<MT5> )
5067  ?( LOW ? min(i+1UL,kbegin) : kbegin )
5068  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
5069  :( LOW ? i+1UL : N ) );
5070 
5071  if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5072  for( size_t j=0UL; j<jbegin; ++j ) {
5073  reset( C(i,j) );
5074  }
5075  }
5076  else if( IsStrictlyUpper_v<MT5> ) {
5077  reset( C(i,0UL) );
5078  }
5079  for( size_t j=jbegin; j<jend; ++j ) {
5080  C(i,j) = A(i,kbegin) * B(kbegin,j);
5081  }
5082  if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5083  for( size_t j=jend; j<N; ++j ) {
5084  reset( C(i,j) );
5085  }
5086  }
5087  else if( IsStrictlyLower_v<MT5> ) {
5088  reset( C(i,N-1UL) );
5089  }
5090  }
5091 
// Step 2: accumulate the contributions of the remaining inner indices.
5092  for( size_t k=kbegin+1UL; k<kend; ++k )
5093  {
5094  const size_t jbegin( ( IsUpper_v<MT5> )
5095  ?( ( IsStrictlyUpper_v<MT5> )
5096  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
5097  :( SYM || HERM || UPP ? max( i, k ) : k ) )
5098  :( SYM || HERM || UPP ? i : 0UL ) );
5099  const size_t jend( ( IsLower_v<MT5> )
5100  ?( ( IsStrictlyLower_v<MT5> )
5101  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
5102  :( LOW ? min(i+1UL,k) : k ) )
5103  :( LOW ? i+1UL : N ) );
5104 
// With a structure flag the column range may be inverted; such a k is skipped.
5105  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
5106  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5107 
5108  for( size_t j=jbegin; j<jend; ++j ) {
5109  C(i,j) += A(i,k) * B(k,j);
5110  }
// For a lower B the element at column jend is assigned (not accumulated) for this k.
5111  if( IsLower_v<MT5> ) {
5112  C(i,jend) = A(i,k) * B(k,jend);
5113  }
5114  }
5115 
// Step 3: scale the computed elements of row i by the scalar.
5116  {
5117  const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5118  ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
5119  :( SYM || HERM || UPP ? i : 0UL ) );
5120  const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
5121  ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
5122  :( LOW ? i+1UL : N ) );
5123 
5124  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
5125  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5126 
5127  for( size_t j=jbegin; j<jend; ++j ) {
5128  C(i,j) *= scalar;
5129  }
5130  }
5131  }
5132 
// For SYM/HERM each element below the diagonal is copied from its mirror element above
// the diagonal (conjugated for HERM).
5133  if( SYM || HERM ) {
5134  for( size_t i=1UL; i<M; ++i ) {
5135  for( size_t j=0UL; j<i; ++j ) {
5136  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5137  }
5138  }
5139  }
5140  }
5141  //**********************************************************************************************
5142 
5143  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default kernel for the assignment C = A * B * scalar where only the right-hand side
// operand type is diagonal. Each target element is computed directly as
// A(i,j) * B(j,j) * scalar; for an upper or lower (strictly) triangular A the column range
// is narrowed and the elements outside of it are reset.
//   C       The target left-hand side dense matrix.
//   A       The left-hand side multiplication operand.
//   B       The right-hand side (diagonal) multiplication operand.
//   scalar  The scaling factor.
5157  template< typename MT3 // Type of the left-hand side target matrix
5158  , typename MT4 // Type of the left-hand side matrix operand
5159  , typename MT5 // Type of the right-hand side matrix operand
5160  , typename ST2 > // Type of the scalar value
5161  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5162  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5163  {
5165 
5166  const size_t M( A.rows() );
5167  const size_t N( B.columns() );
5168 
5169  for( size_t i=0UL; i<M; ++i )
5170  {
// Column range [jbegin,jend) of row i, derived from the triangular properties of A.
5171  const size_t jbegin( ( IsUpper_v<MT4> )
5172  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5173  :( 0UL ) );
5174  const size_t jend( ( IsLower_v<MT4> )
5175  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
5176  :( N ) );
5177  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5178 
// Upper A: reset the elements in front of the range.
5179  if( IsUpper_v<MT4> ) {
5180  for( size_t j=0UL; j<jbegin; ++j ) {
5181  reset( C(i,j) );
5182  }
5183  }
5184  for( size_t j=jbegin; j<jend; ++j ) {
5185  C(i,j) = A(i,j) * B(j,j) * scalar;
5186  }
// Lower A: reset the elements behind the range.
5187  if( IsLower_v<MT4> ) {
5188  for( size_t j=jend; j<N; ++j ) {
5189  reset( C(i,j) );
5190  }
5191  }
5192  }
5193  }
5194  //**********************************************************************************************
5195 
5196  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default kernel for the assignment C = A * B * scalar where only the left-hand side
// operand type is diagonal. Each target element is computed directly as
// A(i,i) * B(i,j) * scalar; for an upper or lower (strictly) triangular B the column range
// is narrowed and the elements outside of it are reset.
//   C       The target left-hand side dense matrix.
//   A       The left-hand side (diagonal) multiplication operand.
//   B       The right-hand side multiplication operand.
//   scalar  The scaling factor.
5210  template< typename MT3 // Type of the left-hand side target matrix
5211  , typename MT4 // Type of the left-hand side matrix operand
5212  , typename MT5 // Type of the right-hand side matrix operand
5213  , typename ST2 > // Type of the scalar value
5214  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5215  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5216  {
5218 
5219  const size_t M( A.rows() );
5220  const size_t N( B.columns() );
5221 
5222  for( size_t i=0UL; i<M; ++i )
5223  {
// Column range [jbegin,jend) of row i, derived from the triangular properties of B.
5224  const size_t jbegin( ( IsUpper_v<MT5> )
5225  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
5226  :( 0UL ) );
5227  const size_t jend( ( IsLower_v<MT5> )
5228  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
5229  :( N ) );
5230  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5231 
// Upper B: reset the elements in front of the range.
5232  if( IsUpper_v<MT5> ) {
5233  for( size_t j=0UL; j<jbegin; ++j ) {
5234  reset( C(i,j) );
5235  }
5236  }
5237  for( size_t j=jbegin; j<jend; ++j ) {
5238  C(i,j) = A(i,i) * B(i,j) * scalar;
5239  }
// Lower B: reset the elements behind the range.
5240  if( IsLower_v<MT5> ) {
5241  for( size_t j=jend; j<N; ++j ) {
5242  reset( C(i,j) );
5243  }
5244  }
5245  }
5246  }
5247  //**********************************************************************************************
5248 
5249  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default assignment kernel for a scaled matrix product of two diagonal operands
// (enabled for IsDiagonal_v<MT4> && IsDiagonal_v<MT5>).
//
//   C      : target matrix; reset as a whole, then its diagonal is written
//   A      : left-hand operand; only A(i,i) is read
//   B      : right-hand operand; only B(i,i) is read
//   scalar : factor multiplied into every diagonal element
5263  template< typename MT3 // Type of the left-hand side target matrix
5264  , typename MT4 // Type of the left-hand side matrix operand
5265  , typename MT5 // Type of the right-hand side matrix operand
5266  , typename ST2 > // Type of the scalar value
5267  static inline auto selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5268  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5269  {
5271 
// All elements are reset first, so only the diagonal has to be assigned afterwards.
5272  reset( C );
5273 
5274  for( size_t i=0UL; i<A.rows(); ++i ) {
5275  C(i,i) = A(i,i) * B(i,i) * scalar;
5276  }
5277  }
5278  //**********************************************************************************************
5279 
5280  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel used when the vectorized default kernel is not applicable
// (this overload is disabled for UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>).
// It forwards all arguments unchanged to selectDefaultAssignKernel().
5294  template< typename MT3 // Type of the left-hand side target matrix
5295  , typename MT4 // Type of the left-hand side matrix operand
5296  , typename MT5 // Type of the right-hand side matrix operand
5297  , typename ST2 > // Type of the scalar value
5298  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5299  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5300  {
5301  selectDefaultAssignKernel( C, A, B, scalar );
5302  }
5303  //**********************************************************************************************
5304 
5305  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Vectorized small-matrix assignment kernel for a row-major target matrix (enabled for
// IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>).
//
//   C      : target matrix, written via C.store() (SIMD part) and C(i,j) (scalar remainder)
//   A      : left-hand operand, read element-wise and broadcast with set()
//   B      : right-hand operand, read via B.load() (SIMD part) and B(k,j) (scalar remainder)
//   scalar : factor multiplied into every result before it is stored
//
// The columns of C are processed in panels of decreasing width (8, 5, 4, 3, 2 and 1 times
// SIMDSIZE), followed by a scalar loop over the columns that do not fill a SIMD vector.
// In every panel the k-range [kbegin,kend) is narrowed according to the IsUpper/IsLower
// (and strict) type traits of A and B.
//
// NOTE(review): SYM, HERM, LOW and UPP are flags declared outside this block - presumably
// derived from the enclosing class's SF/HF/LF/UF template flags; confirm.
// NOTE(review): the xmm accumulators are used with '+=' without an explicit initializer, so
// this relies on SIMDType's default constructor producing a zero vector - confirm.
5320  template< typename MT3 // Type of the left-hand side target matrix
5321  , typename MT4 // Type of the left-hand side matrix operand
5322  , typename MT5 // Type of the right-hand side matrix operand
5323  , typename ST2 > // Type of the scalar value
5324  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5325  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5326  {
// A scalar remainder loop is needed unless both the target and the right-hand operand are padded.
5327  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5328 
5329  const size_t M( A.rows() );
5330  const size_t N( B.columns() );
5331  const size_t K( A.columns() );
5332 
5333  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5334 
// End of the SIMD-processed columns: N rounded down to a multiple of SIMDSIZE when a
// remainder loop is required (see the assertion below), otherwise N itself.
5335  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
5336  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
5337 
// The scalar broadcast into a SIMD vector; multiplied into every accumulator before storing.
5338  const SIMDType factor( set( scalar ) );
5339 
// With both LOW and UPP set, C is reset up front for N > 3*SIMDSIZE.
5340  if( LOW && UPP && N > SIMDSIZE*3UL ) {
5341  reset( C );
5342  }
5343 
5344  {
5345  size_t j( 0UL );
5346 
// Panels of 8*SIMDSIZE columns, one row at a time: only for integral element types and only
// when none of SYM/HERM/LOW/UPP is set.
5347  if( IsIntegral_v<ElementType> )
5348  {
5349  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5350  for( size_t i=0UL; i<M; ++i )
5351  {
5352  const size_t kbegin( ( IsUpper_v<MT4> )
5353  ?( ( IsLower_v<MT5> )
5354  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5355  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5356  :( IsLower_v<MT5> ? j : 0UL ) );
5357  const size_t kend( ( IsLower_v<MT4> )
5358  ?( ( IsUpper_v<MT5> )
5359  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5360  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5361  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
5362 
5363  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5364 
5365  for( size_t k=kbegin; k<kend; ++k ) {
5366  const SIMDType a1( set( A(i,k) ) );
5367  xmm1 += a1 * B.load(k,j );
5368  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5369  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5370  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5371  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5372  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
5373  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
5374  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
5375  }
5376 
5377  C.store( i, j , xmm1 * factor );
5378  C.store( i, j+SIMDSIZE , xmm2 * factor );
5379  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5380  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5381  C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
5382  C.store( i, j+SIMDSIZE*5UL, xmm6 * factor );
5383  C.store( i, j+SIMDSIZE*6UL, xmm7 * factor );
5384  C.store( i, j+SIMDSIZE*7UL, xmm8 * factor );
5385  }
5386  }
5387  }
5388 
// Panels of 5*SIMDSIZE columns, two rows at a time plus one trailing row; skipped when any
// of SYM/HERM/LOW/UPP is set.
5389  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5390  {
5391  size_t i( 0UL );
5392 
5393  for( ; (i+2UL) <= M; i+=2UL )
5394  {
5395  const size_t kbegin( ( IsUpper_v<MT4> )
5396  ?( ( IsLower_v<MT5> )
5397  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5398  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5399  :( IsLower_v<MT5> ? j : 0UL ) );
5400  const size_t kend( ( IsLower_v<MT4> )
5401  ?( ( IsUpper_v<MT5> )
5402  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
5403  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5404  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
5405 
5406  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5407 
5408  for( size_t k=kbegin; k<kend; ++k ) {
5409  const SIMDType a1( set( A(i ,k) ) );
5410  const SIMDType a2( set( A(i+1UL,k) ) );
5411  const SIMDType b1( B.load(k,j ) );
5412  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5413  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5414  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5415  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5416  xmm1 += a1 * b1;
5417  xmm2 += a1 * b2;
5418  xmm3 += a1 * b3;
5419  xmm4 += a1 * b4;
5420  xmm5 += a1 * b5;
5421  xmm6 += a2 * b1;
5422  xmm7 += a2 * b2;
5423  xmm8 += a2 * b3;
5424  xmm9 += a2 * b4;
5425  xmm10 += a2 * b5;
5426  }
5427 
5428  C.store( i , j , xmm1 * factor );
5429  C.store( i , j+SIMDSIZE , xmm2 * factor );
5430  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5431  C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
5432  C.store( i , j+SIMDSIZE*4UL, xmm5 * factor );
5433  C.store( i+1UL, j , xmm6 * factor );
5434  C.store( i+1UL, j+SIMDSIZE , xmm7 * factor );
5435  C.store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
5436  C.store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
5437  C.store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
5438  }
5439 
// Single trailing row when M is odd.
5440  if( i < M )
5441  {
5442  const size_t kbegin( ( IsUpper_v<MT4> )
5443  ?( ( IsLower_v<MT5> )
5444  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5445  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5446  :( IsLower_v<MT5> ? j : 0UL ) );
5447  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
5448 
5449  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5450 
5451  for( size_t k=kbegin; k<kend; ++k ) {
5452  const SIMDType a1( set( A(i,k) ) );
5453  xmm1 += a1 * B.load(k,j );
5454  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5455  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5456  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5457  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5458  }
5459 
5460  C.store( i, j , xmm1 * factor );
5461  C.store( i, j+SIMDSIZE , xmm2 * factor );
5462  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5463  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5464  C.store( i, j+SIMDSIZE*4UL, xmm5 * factor );
5465  }
5466  }
5467 
// Panels of 4*SIMDSIZE columns (skipped when both LOW and UPP are set). From here on the row
// range depends on the flags: it starts at j for LOW and ends at min(j+panel width, M) for
// SYM, HERM or UPP; the rows left out are handled by the post-processing at the end.
5468  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5469  {
5470  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
5471  size_t i( LOW ? j : 0UL );
5472 
5473  for( ; (i+2UL) <= iend; i+=2UL )
5474  {
5475  const size_t kbegin( ( IsUpper_v<MT4> )
5476  ?( ( IsLower_v<MT5> )
5477  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5478  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5479  :( IsLower_v<MT5> ? j : 0UL ) );
5480  const size_t kend( ( IsLower_v<MT4> )
5481  ?( ( IsUpper_v<MT5> )
5482  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5483  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5484  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
5485 
5486  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5487 
5488  for( size_t k=kbegin; k<kend; ++k ) {
5489  const SIMDType a1( set( A(i ,k) ) );
5490  const SIMDType a2( set( A(i+1UL,k) ) );
5491  const SIMDType b1( B.load(k,j ) );
5492  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5493  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5494  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5495  xmm1 += a1 * b1;
5496  xmm2 += a1 * b2;
5497  xmm3 += a1 * b3;
5498  xmm4 += a1 * b4;
5499  xmm5 += a2 * b1;
5500  xmm6 += a2 * b2;
5501  xmm7 += a2 * b3;
5502  xmm8 += a2 * b4;
5503  }
5504 
5505  C.store( i , j , xmm1 * factor );
5506  C.store( i , j+SIMDSIZE , xmm2 * factor );
5507  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5508  C.store( i , j+SIMDSIZE*3UL, xmm4 * factor );
5509  C.store( i+1UL, j , xmm5 * factor );
5510  C.store( i+1UL, j+SIMDSIZE , xmm6 * factor );
5511  C.store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
5512  C.store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
5513  }
5514 
5515  if( i < iend )
5516  {
5517  const size_t kbegin( ( IsUpper_v<MT4> )
5518  ?( ( IsLower_v<MT5> )
5519  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5520  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5521  :( IsLower_v<MT5> ? j : 0UL ) );
5522  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
5523 
5524  SIMDType xmm1, xmm2, xmm3, xmm4;
5525 
5526  for( size_t k=kbegin; k<kend; ++k ) {
5527  const SIMDType a1( set( A(i,k) ) );
5528  xmm1 += a1 * B.load(k,j );
5529  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5530  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5531  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5532  }
5533 
5534  C.store( i, j , xmm1 * factor );
5535  C.store( i, j+SIMDSIZE , xmm2 * factor );
5536  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5537  C.store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5538  }
5539  }
5540 
// Panels of 3*SIMDSIZE columns, two rows at a time plus one trailing row.
5541  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5542  {
5543  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
5544  size_t i( LOW ? j : 0UL );
5545 
5546  for( ; (i+2UL) <= iend; i+=2UL )
5547  {
5548  const size_t kbegin( ( IsUpper_v<MT4> )
5549  ?( ( IsLower_v<MT5> )
5550  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5551  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5552  :( IsLower_v<MT5> ? j : 0UL ) );
5553  const size_t kend( ( IsLower_v<MT4> )
5554  ?( ( IsUpper_v<MT5> )
5555  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
5556  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5557  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
5558 
5559  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5560 
5561  for( size_t k=kbegin; k<kend; ++k ) {
5562  const SIMDType a1( set( A(i ,k) ) );
5563  const SIMDType a2( set( A(i+1UL,k) ) );
5564  const SIMDType b1( B.load(k,j ) );
5565  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5566  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5567  xmm1 += a1 * b1;
5568  xmm2 += a1 * b2;
5569  xmm3 += a1 * b3;
5570  xmm4 += a2 * b1;
5571  xmm5 += a2 * b2;
5572  xmm6 += a2 * b3;
5573  }
5574 
5575  C.store( i , j , xmm1 * factor );
5576  C.store( i , j+SIMDSIZE , xmm2 * factor );
5577  C.store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5578  C.store( i+1UL, j , xmm4 * factor );
5579  C.store( i+1UL, j+SIMDSIZE , xmm5 * factor );
5580  C.store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
5581  }
5582 
5583  if( i < iend )
5584  {
5585  const size_t kbegin( ( IsUpper_v<MT4> )
5586  ?( ( IsLower_v<MT5> )
5587  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5588  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5589  :( IsLower_v<MT5> ? j : 0UL ) );
5590  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
5591 
5592  SIMDType xmm1, xmm2, xmm3;
5593 
5594  for( size_t k=kbegin; k<kend; ++k ) {
5595  const SIMDType a1( set( A(i,k) ) );
5596  xmm1 += a1 * B.load(k,j );
5597  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5598  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5599  }
5600 
5601  C.store( i, j , xmm1 * factor );
5602  C.store( i, j+SIMDSIZE , xmm2 * factor );
5603  C.store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5604  }
5605  }
5606 
// Panels of 2*SIMDSIZE columns; rows are taken in groups of 4, then 3, then 2, then 1.
5607  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5608  {
5609  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
5610  size_t i( LOW ? j : 0UL );
5611 
5612  for( ; (i+4UL) <= iend; i+=4UL )
5613  {
5614  const size_t kbegin( ( IsUpper_v<MT4> )
5615  ?( ( IsLower_v<MT5> )
5616  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5617  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5618  :( IsLower_v<MT5> ? j : 0UL ) );
5619  const size_t kend( ( IsLower_v<MT4> )
5620  ?( ( IsUpper_v<MT5> )
5621  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
5622  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
5623  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5624 
5625  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5626 
5627  for( size_t k=kbegin; k<kend; ++k ) {
5628  const SIMDType a1( set( A(i ,k) ) );
5629  const SIMDType a2( set( A(i+1UL,k) ) );
5630  const SIMDType a3( set( A(i+2UL,k) ) );
5631  const SIMDType a4( set( A(i+3UL,k) ) );
5632  const SIMDType b1( B.load(k,j ) );
5633  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5634  xmm1 += a1 * b1;
5635  xmm2 += a1 * b2;
5636  xmm3 += a2 * b1;
5637  xmm4 += a2 * b2;
5638  xmm5 += a3 * b1;
5639  xmm6 += a3 * b2;
5640  xmm7 += a4 * b1;
5641  xmm8 += a4 * b2;
5642  }
5643 
5644  C.store( i , j , xmm1 * factor );
5645  C.store( i , j+SIMDSIZE, xmm2 * factor );
5646  C.store( i+1UL, j , xmm3 * factor );
5647  C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
5648  C.store( i+2UL, j , xmm5 * factor );
5649  C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
5650  C.store( i+3UL, j , xmm7 * factor );
5651  C.store( i+3UL, j+SIMDSIZE, xmm8 * factor );
5652  }
5653 
5654  for( ; (i+3UL) <= iend; i+=3UL )
5655  {
5656  const size_t kbegin( ( IsUpper_v<MT4> )
5657  ?( ( IsLower_v<MT5> )
5658  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5659  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5660  :( IsLower_v<MT5> ? j : 0UL ) );
5661  const size_t kend( ( IsLower_v<MT4> )
5662  ?( ( IsUpper_v<MT5> )
5663  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
5664  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
5665  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5666 
5667  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5668 
5669  for( size_t k=kbegin; k<kend; ++k ) {
5670  const SIMDType a1( set( A(i ,k) ) );
5671  const SIMDType a2( set( A(i+1UL,k) ) );
5672  const SIMDType a3( set( A(i+2UL,k) ) );
5673  const SIMDType b1( B.load(k,j ) );
5674  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5675  xmm1 += a1 * b1;
5676  xmm2 += a1 * b2;
5677  xmm3 += a2 * b1;
5678  xmm4 += a2 * b2;
5679  xmm5 += a3 * b1;
5680  xmm6 += a3 * b2;
5681  }
5682 
5683  C.store( i , j , xmm1 * factor );
5684  C.store( i , j+SIMDSIZE, xmm2 * factor );
5685  C.store( i+1UL, j , xmm3 * factor );
5686  C.store( i+1UL, j+SIMDSIZE, xmm4 * factor );
5687  C.store( i+2UL, j , xmm5 * factor );
5688  C.store( i+2UL, j+SIMDSIZE, xmm6 * factor );
5689  }
5690 
// Two rows at a time with the k-loop unrolled by two: xmm1-4 accumulate the even steps,
// xmm5-8 the odd steps, and both halves are summed before the store.
5691  for( ; (i+2UL) <= iend; i+=2UL )
5692  {
5693  const size_t kbegin( ( IsUpper_v<MT4> )
5694  ?( ( IsLower_v<MT5> )
5695  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5696  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5697  :( IsLower_v<MT5> ? j : 0UL ) );
5698  const size_t kend( ( IsLower_v<MT4> )
5699  ?( ( IsUpper_v<MT5> )
5700  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
5701  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5702  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
5703 
5704  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5705  size_t k( kbegin );
5706 
5707  for( ; (k+2UL) <= kend; k+=2UL ) {
5708  const SIMDType a1( set( A(i ,k ) ) );
5709  const SIMDType a2( set( A(i+1UL,k ) ) );
5710  const SIMDType a3( set( A(i ,k+1UL) ) );
5711  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
5712  const SIMDType b1( B.load(k ,j ) );
5713  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
5714  const SIMDType b3( B.load(k+1UL,j ) );
5715  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
5716  xmm1 += a1 * b1;
5717  xmm2 += a1 * b2;
5718  xmm3 += a2 * b1;
5719  xmm4 += a2 * b2;
5720  xmm5 += a3 * b3;
5721  xmm6 += a3 * b4;
5722  xmm7 += a4 * b3;
5723  xmm8 += a4 * b4;
5724  }
5725 
5726  for( ; k<kend; ++k ) {
5727  const SIMDType a1( set( A(i ,k) ) );
5728  const SIMDType a2( set( A(i+1UL,k) ) );
5729  const SIMDType b1( B.load(k,j ) );
5730  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5731  xmm1 += a1 * b1;
5732  xmm2 += a1 * b2;
5733  xmm3 += a2 * b1;
5734  xmm4 += a2 * b2;
5735  }
5736 
5737  C.store( i , j , (xmm1+xmm5) * factor );
5738  C.store( i , j+SIMDSIZE, (xmm2+xmm6) * factor );
5739  C.store( i+1UL, j , (xmm3+xmm7) * factor );
5740  C.store( i+1UL, j+SIMDSIZE, (xmm4+xmm8) * factor );
5741  }
5742 
5743  if( i < iend )
5744  {
5745  const size_t kbegin( ( IsUpper_v<MT4> )
5746  ?( ( IsLower_v<MT5> )
5747  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5748  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5749  :( IsLower_v<MT5> ? j : 0UL ) );
5750  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
5751 
5752  SIMDType xmm1, xmm2, xmm3, xmm4;
5753  size_t k( kbegin );
5754 
5755  for( ; (k+2UL) <= kend; k+=2UL ) {
5756  const SIMDType a1( set( A(i,k ) ) );
5757  const SIMDType a2( set( A(i,k+1UL) ) );
5758  xmm1 += a1 * B.load(k ,j );
5759  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
5760  xmm3 += a2 * B.load(k+1UL,j );
5761  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
5762  }
5763 
5764  for( ; k<kend; ++k ) {
5765  const SIMDType a1( set( A(i,k) ) );
5766  xmm1 += a1 * B.load(k,j );
5767  xmm2 += a1 * B.load(k,j+SIMDSIZE);
5768  }
5769 
5770  C.store( i, j , (xmm1+xmm3) * factor );
5771  C.store( i, j+SIMDSIZE, (xmm2+xmm4) * factor );
5772  }
5773  }
5774 
// Remaining single SIMDSIZE-wide column panels up to jpos; rows in groups of 4, 3, 2 and 1,
// each with the k-loop unrolled by two.
5775  for( ; j<jpos; j+=SIMDSIZE )
5776  {
5777  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
5778  size_t i( LOW ? j : 0UL );
5779 
5780  for( ; (i+4UL) <= iend; i+=4UL )
5781  {
5782  const size_t kbegin( ( IsUpper_v<MT4> )
5783  ?( ( IsLower_v<MT5> )
5784  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5785  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5786  :( IsLower_v<MT5> ? j : 0UL ) );
5787  const size_t kend( ( IsLower_v<MT4> )
5788  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
5789  :( K ) );
5790 
5791  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5792  size_t k( kbegin );
5793 
5794  for( ; (k+2UL) <= kend; k+=2UL ) {
5795  const SIMDType b1( B.load(k ,j) );
5796  const SIMDType b2( B.load(k+1UL,j) );
5797  xmm1 += set( A(i ,k ) ) * b1;
5798  xmm2 += set( A(i+1UL,k ) ) * b1;
5799  xmm3 += set( A(i+2UL,k ) ) * b1;
5800  xmm4 += set( A(i+3UL,k ) ) * b1;
5801  xmm5 += set( A(i ,k+1UL) ) * b2;
5802  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
5803  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
5804  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
5805  }
5806 
5807  for( ; k<kend; ++k ) {
5808  const SIMDType b1( B.load(k,j) );
5809  xmm1 += set( A(i ,k) ) * b1;
5810  xmm2 += set( A(i+1UL,k) ) * b1;
5811  xmm3 += set( A(i+2UL,k) ) * b1;
5812  xmm4 += set( A(i+3UL,k) ) * b1;
5813  }
5814 
5815  C.store( i , j, (xmm1+xmm5) * factor );
5816  C.store( i+1UL, j, (xmm2+xmm6) * factor );
5817  C.store( i+2UL, j, (xmm3+xmm7) * factor );
5818  C.store( i+3UL, j, (xmm4+xmm8) * factor );
5819  }
5820 
5821  for( ; (i+3UL) <= iend; i+=3UL )
5822  {
5823  const size_t kbegin( ( IsUpper_v<MT4> )
5824  ?( ( IsLower_v<MT5> )
5825  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5826  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5827  :( IsLower_v<MT5> ? j : 0UL ) );
5828  const size_t kend( ( IsLower_v<MT4> )
5829  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
5830  :( K ) );
5831 
5832  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5833  size_t k( kbegin );
5834 
5835  for( ; (k+2UL) <= kend; k+=2UL ) {
5836  const SIMDType b1( B.load(k ,j) );
5837  const SIMDType b2( B.load(k+1UL,j) );
5838  xmm1 += set( A(i ,k ) ) * b1;
5839  xmm2 += set( A(i+1UL,k ) ) * b1;
5840  xmm3 += set( A(i+2UL,k ) ) * b1;
5841  xmm4 += set( A(i ,k+1UL) ) * b2;
5842  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
5843  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
5844  }
5845 
5846  for( ; k<kend; ++k ) {
5847  const SIMDType b1( B.load(k,j) );
5848  xmm1 += set( A(i ,k) ) * b1;
5849  xmm2 += set( A(i+1UL,k) ) * b1;
5850  xmm3 += set( A(i+2UL,k) ) * b1;
5851  }
5852 
5853  C.store( i , j, (xmm1+xmm4) * factor );
5854  C.store( i+1UL, j, (xmm2+xmm5) * factor );
5855  C.store( i+2UL, j, (xmm3+xmm6) * factor );
5856  }
5857 
5858  for( ; (i+2UL) <= iend; i+=2UL )
5859  {
5860  const size_t kbegin( ( IsUpper_v<MT4> )
5861  ?( ( IsLower_v<MT5> )
5862  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5863  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5864  :( IsLower_v<MT5> ? j : 0UL ) );
5865  const size_t kend( ( IsLower_v<MT4> )
5866  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5867  :( K ) );
5868 
5869  SIMDType xmm1, xmm2, xmm3, xmm4;
5870  size_t k( kbegin );
5871 
5872  for( ; (k+2UL) <= kend; k+=2UL ) {
5873  const SIMDType b1( B.load(k ,j) );
5874  const SIMDType b2( B.load(k+1UL,j) );
5875  xmm1 += set( A(i ,k ) ) * b1;
5876  xmm2 += set( A(i+1UL,k ) ) * b1;
5877  xmm3 += set( A(i ,k+1UL) ) * b2;
5878  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
5879  }
5880 
5881  for( ; k<kend; ++k ) {
5882  const SIMDType b1( B.load(k,j) );
5883  xmm1 += set( A(i ,k) ) * b1;
5884  xmm2 += set( A(i+1UL,k) ) * b1;
5885  }
5886 
5887  C.store( i , j, (xmm1+xmm3) * factor );
5888  C.store( i+1UL, j, (xmm2+xmm4) * factor );
5889  }
5890 
// Last single row of the panel: the k-range runs from kbegin up to K.
5891  if( i < iend )
5892  {
5893  const size_t kbegin( ( IsUpper_v<MT4> )
5894  ?( ( IsLower_v<MT5> )
5895  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5896  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5897  :( IsLower_v<MT5> ? j : 0UL ) );
5898 
5899  SIMDType xmm1, xmm2;
5900  size_t k( kbegin );
5901 
5902  for( ; (k+2UL) <= K; k+=2UL ) {
5903  xmm1 += set( A(i,k ) ) * B.load(k ,j);
5904  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
5905  }
5906 
5907  for( ; k<K; ++k ) {
5908  xmm1 += set( A(i,k) ) * B.load(k,j);
5909  }
5910 
5911  C.store( i, j, (xmm1+xmm2) * factor );
5912  }
5913  }
5914 
// Scalar loop over the columns [jpos,N) that do not fill a SIMD vector; only executed when
// 'remainder' is true. Rows start at j when both LOW and UPP are set, otherwise at 0.
5915  for( ; remainder && j<N; ++j )
5916  {
5917  size_t i( LOW && UPP ? j : 0UL );
5918 
5919  for( ; (i+2UL) <= M; i+=2UL )
5920  {
5921  const size_t kbegin( ( IsUpper_v<MT4> )
5922  ?( ( IsLower_v<MT5> )
5923  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5924  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5925  :( IsLower_v<MT5> ? j : 0UL ) );
5926  const size_t kend( ( IsLower_v<MT4> )
5927  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5928  :( K ) );
5929 
5930  ElementType value1{};
5931  ElementType value2{};
5932 
5933  for( size_t k=kbegin; k<kend; ++k ) {
5934  value1 += A(i ,k) * B(k,j);
5935  value2 += A(i+1UL,k) * B(k,j);
5936  }
5937 
5938  C(i ,j) = value1 * scalar;
5939  C(i+1UL,j) = value2 * scalar;
5940  }
5941 
5942  if( i < M )
5943  {
5944  const size_t kbegin( ( IsUpper_v<MT4> )
5945  ?( ( IsLower_v<MT5> )
5946  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5947  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5948  :( IsLower_v<MT5> ? j : 0UL ) );
5949 
5950  ElementType value{};
5951 
5952  for( size_t k=kbegin; k<K; ++k ) {
5953  value += A(i,k) * B(k,j);
5954  }
5955 
5956  C(i,j) = value * scalar;
5957  }
5958  }
5959  }
5960 
// Post-processing of the elements the panel loops left out. 'jend'/'iend' below is the
// largest multiple of 4*SIMDSIZE that does not exceed the current row/column index.
//  - SYM/HERM: C(i,j) is filled from C(j,i) (conjugated for HERM) for rows i >= 4*SIMDSIZE.
//  - LOW only: C(i,j) is reset for columns j >= 4*SIMDSIZE and rows i < iend.
//  - UPP only: C(i,j) is reset for rows i >= 4*SIMDSIZE and columns j < jend.
5961  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
5962  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
5963  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
5964  for( size_t j=0UL; j<jend; ++j ) {
5965  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5966  }
5967  }
5968  }
5969  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
5970  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
5971  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
5972  for( size_t i=0UL; i<iend; ++i ) {
5973  reset( C(i,j) );
5974  }
5975  }
5976  }
5977  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
5978  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
5979  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
5980  for( size_t j=0UL; j<jend; ++j ) {
5981  reset( C(i,j) );
5982  }
5983  }
5984  }
5985  }
5986  //**********************************************************************************************
5987 
5988  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized small-matrix assignment kernel for a column-major target matrix (enabled for
// IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>).
//
// No multiplication is performed here: one of the two operands is evaluated serially into a
// temporary of its OppositeType_t (presumably the opposite storage order - confirm), and the
// resulting scaled product expression is handed to assign() through the forward functor.
//
// Choice of the operand that is converted:
//   - only MT5 is resizable             -> A is converted
//   - only MT4 is resizable             -> B is converted
//   - otherwise                         -> the operand with fewer elements (A on a tie)
6003  template< typename MT3 // Type of the left-hand side target matrix
6004  , typename MT4 // Type of the left-hand side matrix operand
6005  , typename MT5 // Type of the right-hand side matrix operand
6006  , typename ST2 > // Type of the scalar value
6007  static inline auto selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6008  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6009  {
6014 
6015  const ForwardFunctor fwd;
6016 
6017  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
6018  const OppositeType_t<MT4> tmp( serial( A ) );
6019  assign( C, fwd( tmp * B ) * scalar );
6020  }
6021  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
6022  const OppositeType_t<MT5> tmp( serial( B ) );
6023  assign( C, fwd( A * tmp ) * scalar );
6024  }
// Both or neither operand resizable: decide by element count.
6025  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6026  const OppositeType_t<MT4> tmp( serial( A ) );
6027  assign( C, fwd( tmp * B ) * scalar );
6028  }
6029  else {
6030  const OppositeType_t<MT5> tmp( serial( B ) );
6031  assign( C, fwd( A * tmp ) * scalar );
6032  }
6033  }
6034  //**********************************************************************************************
6035 
6036  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel used when the vectorized default kernel is not applicable
// (this overload is disabled for UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>).
// It forwards all arguments unchanged to selectDefaultAssignKernel().
6050  template< typename MT3 // Type of the left-hand side target matrix
6051  , typename MT4 // Type of the left-hand side matrix operand
6052  , typename MT5 // Type of the right-hand side matrix operand
6053  , typename ST2 > // Type of the scalar value
6054  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6055  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6056  {
6057  selectDefaultAssignKernel( C, A, B, scalar );
6058  }
6059  //**********************************************************************************************
6060 
6061  //**Vectorized default assignment to dense matrices (large matrices)****************************
6076  template< typename MT3 // Type of the left-hand side target matrix
6077  , typename MT4 // Type of the left-hand side matrix operand
6078  , typename MT5 // Type of the right-hand side matrix operand
6079  , typename ST2 > // Type of the scalar value
6080  static inline auto selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6081  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6082  {
6083  if( SYM )
6084  smmm( C, A, B, scalar );
6085  else if( HERM )
6086  hmmm( C, A, B, scalar );
6087  else if( LOW )
6088  lmmm( C, A, B, scalar, ST2(0) );
6089  else if( UPP )
6090  ummm( C, A, B, scalar, ST2(0) );
6091  else
6092  mmm( C, A, B, scalar, ST2(0) );
6093  }
6094  //**********************************************************************************************
6095 
6096  //**BLAS-based assignment to dense matrices (default)*******************************************
// Fallback for the BLAS-based assignment: selected when the BLAS kernel cannot be used
// (this overload is disabled for UseBlasKernel_v<MT3,MT4,MT5,ST2>).
// It forwards all arguments unchanged to selectLargeAssignKernel().
6110  template< typename MT3 // Type of the left-hand side target matrix
6111  , typename MT4 // Type of the left-hand side matrix operand
6112  , typename MT5 // Type of the right-hand side matrix operand
6113  , typename ST2 > // Type of the scalar value
6114  static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6115  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6116  {
6117  selectLargeAssignKernel( C, A, B, scalar );
6118  }
6119  //**********************************************************************************************
6120 
6121  //**BLAS-based assignment to dense matrices*****************************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION

/*!\brief BLAS-based assignment of a scaled dense matrix-dense matrix product.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
// \return void
//
// The scaling factor is converted to the element type of the target matrix. If one of the
// operands is triangular, the other operand is first assigned to \a C and \a C is then
// multiplied in place via trmm() (from the left for a triangular \a A, from the right for a
// triangular \a B; a triangular \a A takes precedence). Otherwise gemm() is called with the
// converted scaling factor and a second factor of zero.
*/
template< typename MT3    // Type of the left-hand side target matrix
        , typename MT4    // Type of the left-hand side matrix operand
        , typename MT5    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline auto selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
{
   using ET = ElementType_t<MT3>;

   const ET alpha = ET(scalar);

   if( IsTriangular_v<MT4> )
   {
      const auto uplo = IsLower_v<MT4> ? CblasLower : CblasUpper;
      assign( C, B );
      trmm( C, A, CblasLeft, uplo, alpha );
      return;
   }

   if( IsTriangular_v<MT5> )
   {
      const auto uplo = IsLower_v<MT5> ? CblasLower : CblasUpper;
      assign( C, A );
      trmm( C, B, CblasRight, uplo, alpha );
      return;
   }

   gemm( C, A, B, alpha, ET(0) );
}
#endif
6158  //**********************************************************************************************
6159 
6160  //**Assignment to sparse matrices***************************************************************
// Assignment of a scaled dense matrix product to a sparse matrix (disabled when
// CanExploitSymmetry_v<MT,MT1,MT2> holds).
//
//   lhs : target sparse matrix
//   rhs : scaled multiplication expression to be assigned
//
// The expression is evaluated serially into a dense temporary, which is then assigned to the
// target through the forward functor.
6172  template< typename MT // Type of the target sparse matrix
6173  , bool SO > // Storage order of the target sparse matrix
6174  friend inline auto assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6175  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6176  {
6178 
// Temporary type follows the storage-order flag of the target: OppositeType if SO is set,
// ResultType otherwise.
6179  using TmpType = If_t< SO, OppositeType, ResultType >;
6180 
6187 
6188  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6189  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6190 
6191  const ForwardFunctor fwd;
6192 
6193  const TmpType tmp( serial( rhs ) );
6194  assign( ~lhs, fwd( tmp ) );
6195  }
6196  //**********************************************************************************************
6197 
6198  //**Restructuring assignment to column-major matrices*******************************************
// Restructuring assignment of a scaled dense matrix product to a column-major matrix
// (enabled only when CanExploitSymmetry_v<MT,MT1,MT2> holds).
//
//   lhs : target column-major matrix
//   rhs : scaled multiplication expression to be assigned
//
// The product is rebuilt with the symmetric operand(s) wrapped in trans() and re-assigned
// together with the original scalar:
//   - both operands symmetric   -> trans(left) * trans(right)
//   - only MT1 symmetric        -> trans(left) * right
//   - otherwise                 -> left * trans(right)
6212  template< typename MT > // Type of the target matrix
6213  friend inline auto assign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
6214  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6215  {
6217 
6219 
6220  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6221  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6222 
6223  const ForwardFunctor fwd;
6224 
// The operands of the wrapped multiplication expression.
6225  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6226  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6227 
6228  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
6229  assign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
6230  else if( IsSymmetric_v<MT1> )
6231  assign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
6232  else
6233  assign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
6234  }
6235  //**********************************************************************************************
6236 
6237  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of a scaled dense matrix product to a dense matrix (disabled when
// CanExploitSymmetry_v<MT,MT1,MT2> holds).
//
//   lhs : target dense matrix
//   rhs : scaled multiplication expression to be added
//
// Both operands of the wrapped multiplication are evaluated serially and passed, together
// with the scalar, to selectAddAssignKernel().
6249  template< typename MT // Type of the target dense matrix
6250  , bool SO > // Storage order of the target dense matrix
6251  friend inline auto addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6252  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6253  {
6255 
6256  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6257  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6258 
6259  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6260  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6261 
// Nothing to add if the target is empty or the inner dimension of the product is zero.
6262  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6263  return;
6264  }
6265 
6266  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6267  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6268 
6269  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6270  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6271  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6272  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6273  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6274  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6275 
6276  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
6277  }
6278  //**********************************************************************************************
6279 
6280  //**Addition assignment to dense matrices (kernel selection)************************************
6291  template< typename MT3 // Type of the left-hand side target matrix
6292  , typename MT4 // Type of the left-hand side matrix operand
6293  , typename MT5 // Type of the right-hand side matrix operand
6294  , typename ST2 > // Type of the scalar value
6295  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6296  {
6297  if( ( IsDiagonal_v<MT5> ) ||
6298  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
6299  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6300  selectSmallAddAssignKernel( C, A, B, scalar );
6301  else
6302  selectBlasAddAssignKernel( C, A, B, scalar );
6303  }
6304  //**********************************************************************************************
6305 
6306  //**Default addition assignment to dense matrices (general/general)*****************************
6320  template< typename MT3 // Type of the left-hand side target matrix
6321  , typename MT4 // Type of the left-hand side matrix operand
6322  , typename MT5 // Type of the right-hand side matrix operand
6323  , typename ST2 > // Type of the scalar value
6324  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6325  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6326  {
6327  const ResultType tmp( serial( A * B * scalar ) );
6328  addAssign( C, tmp );
6329  }
6330  //**********************************************************************************************
6331 
6332  //**Default addition assignment to dense matrices (general/diagonal)****************************
6346  template< typename MT3 // Type of the left-hand side target matrix
6347  , typename MT4 // Type of the left-hand side matrix operand
6348  , typename MT5 // Type of the right-hand side matrix operand
6349  , typename ST2 > // Type of the scalar value
6350  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6351  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6352  {
6354 
6355  const size_t M( A.rows() );
6356  const size_t N( B.columns() );
6357 
6358  for( size_t i=0UL; i<M; ++i )
6359  {
6360  const size_t jbegin( ( IsUpper_v<MT4> )
6361  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
6362  :( 0UL ) );
6363  const size_t jend( ( IsLower_v<MT4> )
6364  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
6365  :( N ) );
6366  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6367 
6368  const size_t jnum( jend - jbegin );
6369  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6370 
6371  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6372  C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6373  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6374  }
6375  if( jpos < jend ) {
6376  C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
6377  }
6378  }
6379  }
6380  //**********************************************************************************************
6381 
6382  //**Default addition assignment to dense matrices (diagonal/general)****************************
6396  template< typename MT3 // Type of the left-hand side target matrix
6397  , typename MT4 // Type of the left-hand side matrix operand
6398  , typename MT5 // Type of the right-hand side matrix operand
6399  , typename ST2 > // Type of the scalar value
6400  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6401  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6402  {
6404 
6405  const size_t M( A.rows() );
6406  const size_t N( B.columns() );
6407 
6408  for( size_t i=0UL; i<M; ++i )
6409  {
6410  const size_t jbegin( ( IsUpper_v<MT5> )
6411  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
6412  :( 0UL ) );
6413  const size_t jend( ( IsLower_v<MT5> )
6414  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
6415  :( N ) );
6416  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6417 
6418  const size_t jnum( jend - jbegin );
6419  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6420 
6421  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6422  C(i,j ) += A(i,i) * B(i,j ) * scalar;
6423  C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
6424  }
6425  if( jpos < jend ) {
6426  C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
6427  }
6428  }
6429  }
6430  //**********************************************************************************************
6431 
6432  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
6446  template< typename MT3 // Type of the left-hand side target matrix
6447  , typename MT4 // Type of the left-hand side matrix operand
6448  , typename MT5 // Type of the right-hand side matrix operand
6449  , typename ST2 > // Type of the scalar value
6450  static inline auto selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6451  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6452  {
6454 
6455  for( size_t i=0UL; i<A.rows(); ++i ) {
6456  C(i,i) += A(i,i) * B(i,i) * scalar;
6457  }
6458  }
6459  //**********************************************************************************************
6460 
6461  //**Default addition assignment to dense matrices (small matrices)******************************
6475  template< typename MT3 // Type of the left-hand side target matrix
6476  , typename MT4 // Type of the left-hand side matrix operand
6477  , typename MT5 // Type of the right-hand side matrix operand
6478  , typename ST2 > // Type of the scalar value
6479  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6480  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6481  {
6482  selectDefaultAddAssignKernel( C, A, B, scalar );
6483  }
6484  //**********************************************************************************************
6485 
6486  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized small-matrix addition assignment kernel (C += s*A*B) for row-major targets.
// Enabled only if MT3 is a row-major matrix type and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> evaluates to true.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// The columns of C are processed in panels of 8 (integral element types only), 5, 4, 3, 2
// and 1 SIMD vectors, in that order; each panel loop picks up at the column index j where
// the previous one stopped. Columns in [jpos,N) are handled by a final scalar loop. Within
// each panel, the k range [kbegin,kend) is narrowed according to the upper/lower properties
// of MT4 and MT5, products are accumulated in SIMD registers, scaled by the broadcast
// scalar and added to the loaded elements of C.
6501  template< typename MT3 // Type of the left-hand side target matrix
6502  , typename MT4 // Type of the left-hand side matrix operand
6503  , typename MT5 // Type of the right-hand side matrix operand
6504  , typename ST2 > // Type of the scalar value
6505  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6506  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6507  {
// A scalar remainder loop is required unless both C and B are padded
6508  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
6509 
6510  const size_t M( A.rows() );
6511  const size_t N( B.columns() );
6512  const size_t K( A.columns() );
6513 
6514  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6515 
// End of the column range handled by SIMD panels; the assertion below checks that, with a
// remainder, this equals N rounded down to a multiple of SIMDSIZE
6516  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
6517  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
6518 
// Scaling factor broadcast via set() for use in the SIMD stores
6519  const SIMDType factor( set( scalar ) );
6520 
6521  size_t j( 0UL );
6522 
// Panel of 8 SIMD vectors, one row at a time (integral element types, neither LOW nor UPP)
// NOTE(review): the accumulators throughout this kernel are default-constructed and used
// with += directly; this presumably relies on SIMDType default-initializing to zero - confirm.
6523  if( IsIntegral_v<ElementType> )
6524  {
6525  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6526  for( size_t i=0UL; i<M; ++i )
6527  {
6528  const size_t kbegin( ( IsUpper_v<MT4> )
6529  ?( ( IsLower_v<MT5> )
6530  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6531  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6532  :( IsLower_v<MT5> ? j : 0UL ) );
6533  const size_t kend( ( IsLower_v<MT4> )
6534  ?( ( IsUpper_v<MT5> )
6535  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
6536  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
6537  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
6538 
6539  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6540 
6541  for( size_t k=kbegin; k<kend; ++k ) {
6542  const SIMDType a1( set( A(i,k) ) );
6543  xmm1 += a1 * B.load(k,j );
6544  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6545  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6546  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6547  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6548  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
6549  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
6550  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
6551  }
6552 
6553  C.store( i, j , C.load(i,j ) + xmm1 * factor );
6554  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
6555  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6556  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6557  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
6558  C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
6559  C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
6560  C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
6561  }
6562  }
6563  }
6564 
// Panel of 5 SIMD vectors: two rows at a time, then a single leftover row (neither LOW nor UPP)
6565  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
6566  {
6567  size_t i( 0UL );
6568 
6569  for( ; (i+2UL) <= M; i+=2UL )
6570  {
6571  const size_t kbegin( ( IsUpper_v<MT4> )
6572  ?( ( IsLower_v<MT5> )
6573  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6574  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6575  :( IsLower_v<MT5> ? j : 0UL ) );
6576  const size_t kend( ( IsLower_v<MT4> )
6577  ?( ( IsUpper_v<MT5> )
6578  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
6579  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6580  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
6581 
6582  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6583 
6584  for( size_t k=kbegin; k<kend; ++k ) {
6585  const SIMDType a1( set( A(i ,k) ) );
6586  const SIMDType a2( set( A(i+1UL,k) ) );
6587  const SIMDType b1( B.load(k,j ) );
6588  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6589  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6590  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6591  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
6592  xmm1 += a1 * b1;
6593  xmm2 += a1 * b2;
6594  xmm3 += a1 * b3;
6595  xmm4 += a1 * b4;
6596  xmm5 += a1 * b5;
6597  xmm6 += a2 * b1;
6598  xmm7 += a2 * b2;
6599  xmm8 += a2 * b3;
6600  xmm9 += a2 * b4;
6601  xmm10 += a2 * b5;
6602  }
6603 
6604  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6605  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
6606  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6607  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
6608  C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
6609  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
6610  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
6611  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
6612  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
6613  C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
6614  }
6615 
6616  if( i < M )
6617  {
6618  const size_t kbegin( ( IsUpper_v<MT4> )
6619  ?( ( IsLower_v<MT5> )
6620  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6621  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6622  :( IsLower_v<MT5> ? j : 0UL ) );
6623  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
6624 
6625  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6626 
6627  for( size_t k=kbegin; k<kend; ++k ) {
6628  const SIMDType a1( set( A(i,k) ) );
6629  xmm1 += a1 * B.load(k,j );
6630  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6631  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6632  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6633  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6634  }
6635 
6636  C.store( i, j , C.load(i,j ) + xmm1 * factor );
6637  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
6638  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6639  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6640  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
6641  }
6642  }
6643 
// Panel of 4 SIMD vectors: two rows at a time, then a single leftover row (neither LOW nor UPP)
6644  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6645  {
6646  size_t i( 0UL );
6647 
6648  for( ; (i+2UL) <= M; i+=2UL )
6649  {
6650  const size_t kbegin( ( IsUpper_v<MT4> )
6651  ?( ( IsLower_v<MT5> )
6652  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6653  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6654  :( IsLower_v<MT5> ? j : 0UL ) );
6655  const size_t kend( ( IsLower_v<MT4> )
6656  ?( ( IsUpper_v<MT5> )
6657  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
6658  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6659  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
6660 
6661  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6662 
6663  for( size_t k=kbegin; k<kend; ++k ) {
6664  const SIMDType a1( set( A(i ,k) ) );
6665  const SIMDType a2( set( A(i+1UL,k) ) );
6666  const SIMDType b1( B.load(k,j ) );
6667  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6668  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6669  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6670  xmm1 += a1 * b1;
6671  xmm2 += a1 * b2;
6672  xmm3 += a1 * b3;
6673  xmm4 += a1 * b4;
6674  xmm5 += a2 * b1;
6675  xmm6 += a2 * b2;
6676  xmm7 += a2 * b3;
6677  xmm8 += a2 * b4;
6678  }
6679 
6680  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6681  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
6682  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6683  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
6684  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
6685  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
6686  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
6687  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
6688  }
6689 
6690  if( i < M )
6691  {
6692  const size_t kbegin( ( IsUpper_v<MT4> )
6693  ?( ( IsLower_v<MT5> )
6694  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6695  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6696  :( IsLower_v<MT5> ? j : 0UL ) );
6697  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
6698 
6699  SIMDType xmm1, xmm2, xmm3, xmm4;
6700 
6701  for( size_t k=kbegin; k<kend; ++k ) {
6702  const SIMDType a1( set( A(i,k) ) );
6703  xmm1 += a1 * B.load(k,j );
6704  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6705  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6706  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6707  }
6708 
6709  C.store( i, j , C.load(i,j ) + xmm1 * factor );
6710  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
6711  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6712  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6713  }
6714  }
6715 
// Panel of 3 SIMD vectors: two rows at a time, then a single leftover row (neither LOW nor UPP)
6716  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
6717  {
6718  size_t i( 0UL );
6719 
6720  for( ; (i+2UL) <= M; i+=2UL )
6721  {
6722  const size_t kbegin( ( IsUpper_v<MT4> )
6723  ?( ( IsLower_v<MT5> )
6724  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6725  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6726  :( IsLower_v<MT5> ? j : 0UL ) );
6727  const size_t kend( ( IsLower_v<MT4> )
6728  ?( ( IsUpper_v<MT5> )
6729  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
6730  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6731  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
6732 
6733  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6734 
6735  for( size_t k=kbegin; k<kend; ++k ) {
6736  const SIMDType a1( set( A(i ,k) ) );
6737  const SIMDType a2( set( A(i+1UL,k) ) );
6738  const SIMDType b1( B.load(k,j ) );
6739  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6740  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6741  xmm1 += a1 * b1;
6742  xmm2 += a1 * b2;
6743  xmm3 += a1 * b3;
6744  xmm4 += a2 * b1;
6745  xmm5 += a2 * b2;
6746  xmm6 += a2 * b3;
6747  }
6748 
6749  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6750  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) + xmm2 * factor );
6751  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6752  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
6753  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
6754  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
6755  }
6756 
6757  if( i < M )
6758  {
6759  const size_t kbegin( ( IsUpper_v<MT4> )
6760  ?( ( IsLower_v<MT5> )
6761  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6762  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6763  :( IsLower_v<MT5> ? j : 0UL ) );
6764  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
6765 
6766  SIMDType xmm1, xmm2, xmm3;
6767 
6768  for( size_t k=kbegin; k<kend; ++k ) {
6769  const SIMDType a1( set( A(i,k) ) );
6770  xmm1 += a1 * B.load(k,j );
6771  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6772  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6773  }
6774 
6775  C.store( i, j , C.load(i,j ) + xmm1 * factor );
6776  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) + xmm2 * factor );
6777  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6778  }
6779  }
6780 
// Panel of 2 SIMD vectors (skipped if both LOW and UPP): rows in blocks of 4, 3, 2 and 1.
// The row range starts at j for LOW and ends at min(j+SIMDSIZE*2,M) for UPP.
6781  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6782  {
6783  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
6784  size_t i( LOW ? j : 0UL );
6785 
6786  for( ; (i+4UL) <= iend; i+=4UL )
6787  {
6788  const size_t kbegin( ( IsUpper_v<MT4> )
6789  ?( ( IsLower_v<MT5> )
6790  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6791  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6792  :( IsLower_v<MT5> ? j : 0UL ) );
6793  const size_t kend( ( IsLower_v<MT4> )
6794  ?( ( IsUpper_v<MT5> )
6795  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
6796  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
6797  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
6798 
6799  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6800 
6801  for( size_t k=kbegin; k<kend; ++k ) {
6802  const SIMDType a1( set( A(i ,k) ) );
6803  const SIMDType a2( set( A(i+1UL,k) ) );
6804  const SIMDType a3( set( A(i+2UL,k) ) );
6805  const SIMDType a4( set( A(i+3UL,k) ) );
6806  const SIMDType b1( B.load(k,j ) );
6807  const SIMDType b2( B.load(k,j+SIMDSIZE) );
6808  xmm1 += a1 * b1;
6809  xmm2 += a1 * b2;
6810  xmm3 += a2 * b1;
6811  xmm4 += a2 * b2;
6812  xmm5 += a3 * b1;
6813  xmm6 += a3 * b2;
6814  xmm7 += a4 * b1;
6815  xmm8 += a4 * b2;
6816  }
6817 
6818  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6819  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
6820  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
6821  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
6822  C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
6823  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
6824  C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
6825  C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
6826  }
6827 
6828  for( ; (i+3UL) <= iend; i+=3UL )
6829  {
6830  const size_t kbegin( ( IsUpper_v<MT4> )
6831  ?( ( IsLower_v<MT5> )
6832  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6833  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6834  :( IsLower_v<MT5> ? j : 0UL ) );
6835  const size_t kend( ( IsLower_v<MT4> )
6836  ?( ( IsUpper_v<MT5> )
6837  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
6838  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
6839  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
6840 
6841  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6842 
6843  for( size_t k=kbegin; k<kend; ++k ) {
6844  const SIMDType a1( set( A(i ,k) ) );
6845  const SIMDType a2( set( A(i+1UL,k) ) );
6846  const SIMDType a3( set( A(i+2UL,k) ) );
6847  const SIMDType b1( B.load(k,j ) );
6848  const SIMDType b2( B.load(k,j+SIMDSIZE) );
6849  xmm1 += a1 * b1;
6850  xmm2 += a1 * b2;
6851  xmm3 += a2 * b1;
6852  xmm4 += a2 * b2;
6853  xmm5 += a3 * b1;
6854  xmm6 += a3 * b2;
6855  }
6856 
6857  C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6858  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + xmm2 * factor );
6859  C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
6860  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
6861  C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
6862  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
6863  }
6864 
// Two rows with the k loop unrolled by two: even and odd k contributions are kept in
// separate accumulators and summed right before the store
6865  for( ; (i+2UL) <= iend; i+=2UL )
6866  {
6867  const size_t kbegin( ( IsUpper_v<MT4> )
6868  ?( ( IsLower_v<MT5> )
6869  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6870  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6871  :( IsLower_v<MT5> ? j : 0UL ) );
6872  const size_t kend( ( IsLower_v<MT4> )
6873  ?( ( IsUpper_v<MT5> )
6874  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
6875  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6876  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
6877 
6878  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6879  size_t k( kbegin );
6880 
6881  for( ; (k+2UL) <= kend; k+=2UL ) {
6882  const SIMDType a1( set( A(i ,k ) ) );
6883  const SIMDType a2( set( A(i+1UL,k ) ) );
6884  const SIMDType a3( set( A(i ,k+1UL) ) );
6885  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
6886  const SIMDType b1( B.load(k ,j ) );
6887  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
6888  const SIMDType b3( B.load(k+1UL,j ) );
6889  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
6890  xmm1 += a1 * b1;
6891  xmm2 += a1 * b2;
6892  xmm3 += a2 * b1;
6893  xmm4 += a2 * b2;
6894  xmm5 += a3 * b3;
6895  xmm6 += a3 * b4;
6896  xmm7 += a4 * b3;
6897  xmm8 += a4 * b4;
6898  }
6899 
6900  for( ; k<kend; ++k ) {
6901  const SIMDType a1( set( A(i ,k) ) );
6902  const SIMDType a2( set( A(i+1UL,k) ) );
6903  const SIMDType b1( B.load(k,j ) );
6904  const SIMDType b2( B.load(k,j+SIMDSIZE) );
6905  xmm1 += a1 * b1;
6906  xmm2 += a1 * b2;
6907  xmm3 += a2 * b1;
6908  xmm4 += a2 * b2;
6909  }
6910 
6911  C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
6912  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) + (xmm2+xmm6) * factor );
6913  C.store( i+1UL, j , C.load(i+1UL,j ) + (xmm3+xmm7) * factor );
6914  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) + (xmm4+xmm8) * factor );
6915  }
6916 
6917  if( i < iend )
6918  {
6919  const size_t kbegin( ( IsUpper_v<MT4> )
6920  ?( ( IsLower_v<MT5> )
6921  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6922  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6923  :( IsLower_v<MT5> ? j : 0UL ) );
6924  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
6925 
6926  SIMDType xmm1, xmm2, xmm3, xmm4;
6927  size_t k( kbegin );
6928 
6929  for( ; (k+2UL) <= kend; k+=2UL ) {
6930  const SIMDType a1( set( A(i,k ) ) );
6931  const SIMDType a2( set( A(i,k+1UL) ) );
6932  xmm1 += a1 * B.load(k ,j );
6933  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
6934  xmm3 += a2 * B.load(k+1UL,j );
6935  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
6936  }
6937 
6938  for( ; k<kend; ++k ) {
6939  const SIMDType a1( set( A(i,k) ) );
6940  xmm1 += a1 * B.load(k,j );
6941  xmm2 += a1 * B.load(k,j+SIMDSIZE);
6942  }
6943 
6944  C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
6945  C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) + (xmm2+xmm4) * factor );
6946  }
6947  }
6948 
// Panel of a single SIMD vector: rows in blocks of 4, 3, 2 and 1, k loop unrolled by two.
// NOTE(review): iend is clipped only if LOW && UPP here, whereas the 2-vector panel above
// and the scalar remainder loop below clip for UPP alone - confirm this is intended.
6949  for( ; j<jpos; j+=SIMDSIZE )
6950  {
6951  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
6952  size_t i( LOW ? j : 0UL );
6953 
6954  for( ; (i+4UL) <= iend; i+=4UL )
6955  {
6956  const size_t kbegin( ( IsUpper_v<MT4> )
6957  ?( ( IsLower_v<MT5> )
6958  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6959  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6960  :( IsLower_v<MT5> ? j : 0UL ) );
6961  const size_t kend( ( IsLower_v<MT4> )
6962  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
6963  :( K ) );
6964 
6965  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6966  size_t k( kbegin );
6967 
6968  for( ; (k+2UL) <= kend; k+=2UL ) {
6969  const SIMDType b1( B.load(k ,j) );
6970  const SIMDType b2( B.load(k+1UL,j) );
6971  xmm1 += set( A(i ,k ) ) * b1;
6972  xmm2 += set( A(i+1UL,k ) ) * b1;
6973  xmm3 += set( A(i+2UL,k ) ) * b1;
6974  xmm4 += set( A(i+3UL,k ) ) * b1;
6975  xmm5 += set( A(i ,k+1UL) ) * b2;
6976  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
6977  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
6978  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
6979  }
6980 
6981  for( ; k<kend; ++k ) {
6982  const SIMDType b1( B.load(k,j) );
6983  xmm1 += set( A(i ,k) ) * b1;
6984  xmm2 += set( A(i+1UL,k) ) * b1;
6985  xmm3 += set( A(i+2UL,k) ) * b1;
6986  xmm4 += set( A(i+3UL,k) ) * b1;
6987  }
6988 
6989  C.store( i , j, C.load(i ,j) + (xmm1+xmm5) * factor );
6990  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm6) * factor );
6991  C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm7) * factor );
6992  C.store( i+3UL, j, C.load(i+3UL,j) + (xmm4+xmm8) * factor );
6993  }
6994 
6995  for( ; (i+3UL) <= iend; i+=3UL )
6996  {
6997  const size_t kbegin( ( IsUpper_v<MT4> )
6998  ?( ( IsLower_v<MT5> )
6999  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7000  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7001  :( IsLower_v<MT5> ? j : 0UL ) );
7002  const size_t kend( ( IsLower_v<MT4> )
7003  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
7004  :( K ) );
7005 
7006  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7007  size_t k( kbegin );
7008 
7009  for( ; (k+2UL) <= kend; k+=2UL ) {
7010  const SIMDType b1( B.load(k ,j) );
7011  const SIMDType b2( B.load(k+1UL,j) );
7012  xmm1 += set( A(i ,k ) ) * b1;
7013  xmm2 += set( A(i+1UL,k ) ) * b1;
7014  xmm3 += set( A(i+2UL,k ) ) * b1;
7015  xmm4 += set( A(i ,k+1UL) ) * b2;
7016  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
7017  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
7018  }
7019 
7020  for( ; k<kend; ++k ) {
7021  const SIMDType b1( B.load(k,j) );
7022  xmm1 += set( A(i ,k) ) * b1;
7023  xmm2 += set( A(i+1UL,k) ) * b1;
7024  xmm3 += set( A(i+2UL,k) ) * b1;
7025  }
7026 
7027  C.store( i , j, C.load(i ,j) + (xmm1+xmm4) * factor );
7028  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm5) * factor );
7029  C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm6) * factor );
7030  }
7031 
7032  for( ; (i+2UL) <= iend; i+=2UL )
7033  {
7034  const size_t kbegin( ( IsUpper_v<MT4> )
7035  ?( ( IsLower_v<MT5> )
7036  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7037  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7038  :( IsLower_v<MT5> ? j : 0UL ) );
7039  const size_t kend( ( IsLower_v<MT4> )
7040  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7041  :( K ) );
7042 
7043  SIMDType xmm1, xmm2, xmm3, xmm4;
7044  size_t k( kbegin );
7045 
7046  for( ; (k+2UL) <= kend; k+=2UL ) {
7047  const SIMDType b1( B.load(k ,j) );
7048  const SIMDType b2( B.load(k+1UL,j) );
7049  xmm1 += set( A(i ,k ) ) * b1;
7050  xmm2 += set( A(i+1UL,k ) ) * b1;
7051  xmm3 += set( A(i ,k+1UL) ) * b2;
7052  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
7053  }
7054 
7055  for( ; k<kend; ++k ) {
7056  const SIMDType b1( B.load(k,j) );
7057  xmm1 += set( A(i ,k) ) * b1;
7058  xmm2 += set( A(i+1UL,k) ) * b1;
7059  }
7060 
7061  C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
7062  C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm4) * factor );
7063  }
7064 
// Single leftover row: the k range runs from kbegin up to K (no kend is computed here)
7065  if( i < iend )
7066  {
7067  const size_t kbegin( ( IsUpper_v<MT4> )
7068  ?( ( IsLower_v<MT5> )
7069  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7070  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7071  :( IsLower_v<MT5> ? j : 0UL ) );
7072 
7073  SIMDType xmm1, xmm2;
7074  size_t k( kbegin );
7075 
7076  for( ; (k+2UL) <= K; k+=2UL ) {
7077  xmm1 += set( A(i,k ) ) * B.load(k ,j);
7078  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
7079  }
7080 
7081  for( ; k<K; ++k ) {
7082  xmm1 += set( A(i,k) ) * B.load(k,j);
7083  }
7084 
7085  C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
7086  }
7087  }
7088 
// Scalar loop over the columns in [jpos,N) (only if remainder is true): rows are processed
// in pairs plus one leftover row, using plain element access instead of SIMD loads/stores
7089  for( ; remainder && j<N; ++j )
7090  {
7091  const size_t iend( UPP ? j+1UL : M );
7092  size_t i( LOW ? j : 0UL );
7093 
7094  for( ; (i+2UL) <= iend; i+=2UL )
7095  {
7096  const size_t kbegin( ( IsUpper_v<MT4> )
7097  ?( ( IsLower_v<MT5> )
7098  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7099  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7100  :( IsLower_v<MT5> ? j : 0UL ) );
7101  const size_t kend( ( IsLower_v<MT4> )
7102  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7103  :( K ) );
7104 
7105  ElementType value1{};
7106  ElementType value2{};
7107 
7108  for( size_t k=kbegin; k<kend; ++k ) {
7109  value1 += A(i ,k) * B(k,j);
7110  value2 += A(i+1UL,k) * B(k,j);
7111  }
7112 
7113  C(i ,j) += value1 * scalar;
7114  C(i+1UL,j) += value2 * scalar;
7115  }
7116 
7117  if( i < iend )
7118  {
7119  const size_t kbegin( ( IsUpper_v<MT4> )
7120  ?( ( IsLower_v<MT5> )
7121  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7122  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7123  :( IsLower_v<MT5> ? j : 0UL ) );
7124 
7125  ElementType value{};
7126 
7127  for( size_t k=kbegin; k<K; ++k ) {
7128  value += A(i,k) * B(k,j);
7129  }
7130 
7131  C(i,j) += value * scalar;
7132  }
7133  }
7134  }
7135  //**********************************************************************************************
7136 
7137  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
7152  template< typename MT3 // Type of the left-hand side target matrix
7153  , typename MT4 // Type of the left-hand side matrix operand
7154  , typename MT5 // Type of the right-hand side matrix operand
7155  , typename ST2 > // Type of the scalar value
7156  static inline auto selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7157  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7158  {
7163 
7164  const ForwardFunctor fwd;
7165 
7166  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7167  const OppositeType_t<MT4> tmp( serial( A ) );
7168  addAssign( C, fwd( tmp * B ) * scalar );
7169  }
7170  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7171  const OppositeType_t<MT5> tmp( serial( B ) );
7172  addAssign( C, fwd( A * tmp ) * scalar );
7173  }
7174  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7175  const OppositeType_t<MT4> tmp( serial( A ) );
7176  addAssign( C, fwd( tmp * B ) * scalar );
7177  }
7178  else {
7179  const OppositeType_t<MT5> tmp( serial( B ) );
7180  addAssign( C, fwd( A * tmp ) * scalar );
7181  }
7182  }
7183  //**********************************************************************************************
7184 
7185  //**Default addition assignment to dense matrices (large matrices)******************************
7199  template< typename MT3 // Type of the left-hand side target matrix
7200  , typename MT4 // Type of the left-hand side matrix operand
7201  , typename MT5 // Type of the right-hand side matrix operand
7202  , typename ST2 > // Type of the scalar value
7203  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7204  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7205  {
7206  selectDefaultAddAssignKernel( C, A, B, scalar );
7207  }
7208  //**********************************************************************************************
7209 
7210  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
7225  template< typename MT3 // Type of the left-hand side target matrix
7226  , typename MT4 // Type of the left-hand side matrix operand
7227  , typename MT5 // Type of the right-hand side matrix operand
7228  , typename ST2 > // Type of the scalar value
7229  static inline auto selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7230  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7231  {
7232  if( LOW )
7233  lmmm( C, A, B, scalar, ST2(1) );
7234  else if( UPP )
7235  ummm( C, A, B, scalar, ST2(1) );
7236  else
7237  mmm( C, A, B, scalar, ST2(1) );
7238  }
7239  //**********************************************************************************************
7240 
7241  //**BLAS-based addition assignment to dense matrices (default)**********************************
7255  template< typename MT3 // Type of the left-hand side target matrix
7256  , typename MT4 // Type of the left-hand side matrix operand
7257  , typename MT5 // Type of the right-hand side matrix operand
7258  , typename ST2 > // Type of the scalar value
7259  static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7260  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7261  {
7262  selectLargeAddAssignKernel( C, A, B, scalar );
7263  }
7264  //**********************************************************************************************
7265 
7266  //**BLAS-based addition assignment to dense matrices********************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION

/*!\brief BLAS-based addition assignment of a scaled matrix product (\f$ C+=s*A*B \f$).
//
// \param C The target left-hand side matrix (type \a MT3).
// \param A The left-hand side multiplication operand (type \a MT4).
// \param B The right-hand side multiplication operand (type \a MT5).
// \param scalar The scaling factor (type \a ST2).
// \return void
//
// This overload takes part in overload resolution only if \c UseBlasKernel_v<MT3,MT4,MT5,ST2>
// is \c true. The scaling factor is converted to the element type of \a C before it is passed
// to the BLAS wrapper. If one operand is triangular (the left operand is checked first), a
// temporary of type \c ResultType_t<MT3> is initialized from the other operand, updated via
// trmm() and then added to \a C. Otherwise gemm() is called directly on \a C with \c ET(1) as
// last argument.
*/
template< typename MT3, typename MT4, typename MT5, typename ST2 >
static inline auto selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
   -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
{
   using ET = ElementType_t<MT3>;

   if( IsTriangular_v<MT4> )
   {
      // Triangular left operand: start from a copy of B, apply A from the left
      const auto uplo( IsLower_v<MT4> ? CblasLower : CblasUpper );
      ResultType_t<MT3> product( serial( B ) );
      trmm( product, A, CblasLeft, uplo, ET(scalar) );
      addAssign( C, product );
   }
   else if( IsTriangular_v<MT5> )
   {
      // Triangular right operand: start from a copy of A, apply B from the right
      const auto uplo( IsLower_v<MT5> ? CblasLower : CblasUpper );
      ResultType_t<MT3> product( serial( A ) );
      trmm( product, B, CblasRight, uplo, ET(scalar) );
      addAssign( C, product );
   }
   else
   {
      gemm( C, A, B, ET(scalar), ET(1) );
   }
}
#endif
7305  //**********************************************************************************************
7306 
7307  //**Restructuring addition assignment to column-major matrices**********************************
7321  template< typename MT > // Type of the target matrix
7322  friend inline auto addAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7323  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7324  {
7326 
7328 
7329  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7330  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7331 
7332  const ForwardFunctor fwd;
7333 
7334  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7335  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7336 
7337  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
7338  addAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
7339  else if( IsSymmetric_v<MT1> )
7340  addAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
7341  else
7342  addAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
7343  }
7344  //**********************************************************************************************
7345 
7346  //**Addition assignment to sparse matrices******************************************************
7347  // No special implementation for the addition assignment to sparse matrices.
7348  //**********************************************************************************************
7349 
7350  //**Subtraction assignment to dense matrices****************************************************
7362  template< typename MT // Type of the target dense matrix
7363  , bool SO > // Storage order of the target dense matrix
7364  friend inline auto subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7365  -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7366  {
7368 
7369  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7370  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7371 
7372  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7373  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7374 
7375  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7376  return;
7377  }
7378 
7379  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7380  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7381 
7382  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7383  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7384  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7385  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7386  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7387  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7388 
7389  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
7390  }
7391  //**********************************************************************************************
7392 
7393  //**Subtraction assignment to dense matrices (kernel selection)*********************************
7404  template< typename MT3 // Type of the left-hand side target matrix
7405  , typename MT4 // Type of the left-hand side matrix operand
7406  , typename MT5 // Type of the right-hand side matrix operand
7407  , typename ST2 > // Type of the scalar value
7408  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7409  {
7410  if( ( IsDiagonal_v<MT5> ) ||
7411  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
7412  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
7413  selectSmallSubAssignKernel( C, A, B, scalar );
7414  else
7415  selectBlasSubAssignKernel( C, A, B, scalar );
7416  }
7417  //**********************************************************************************************
7418 
7419  //**Default subtraction assignment to dense matrices (general/general)**************************
7433  template< typename MT3 // Type of the left-hand side target matrix
7434  , typename MT4 // Type of the left-hand side matrix operand
7435  , typename MT5 // Type of the right-hand side matrix operand
7436  , typename ST2 > // Type of the scalar value
7437  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7438  -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7439  {
7440  const ResultType tmp( serial( A * B * scalar ) );
7441  subAssign( C, tmp );
7442  }
7443  //**********************************************************************************************
7444 
7445  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
7459  template< typename MT3 // Type of the left-hand side target matrix
7460  , typename MT4 // Type of the left-hand side matrix operand
7461  , typename MT5 // Type of the right-hand side matrix operand
7462  , typename ST2 > // Type of the scalar value
7463  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7464  -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7465  {
7467 
7468  const size_t M( A.rows() );
7469  const size_t N( B.columns() );
7470 
7471  for( size_t i=0UL; i<M; ++i )
7472  {
7473  const size_t jbegin( ( IsUpper_v<MT4> )
7474  ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7475  :( 0UL ) );
7476  const size_t jend( ( IsLower_v<MT4> )
7477  ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
7478  :( N ) );
7479  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7480 
7481  const size_t jnum( jend - jbegin );
7482  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
7483 
7484  for( size_t j=jbegin; j<jpos; j+=2UL ) {
7485  C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7486  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
7487  }
7488  if( jpos < jend ) {
7489  C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
7490  }
7491  }
7492  }
7493  //**********************************************************************************************
7494 
7495  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
7509  template< typename MT3 // Type of the left-hand side target matrix
7510  , typename MT4 // Type of the left-hand side matrix operand
7511  , typename MT5 // Type of the right-hand side matrix operand
7512  , typename ST2 > // Type of the scalar value
7513  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7514  -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7515  {
7517 
7518  const size_t M( A.rows() );
7519  const size_t N( B.columns() );
7520 
7521  for( size_t i=0UL; i<M; ++i )
7522  {
7523  const size_t jbegin( ( IsUpper_v<MT5> )
7524  ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
7525  :( 0UL ) );
7526  const size_t jend( ( IsLower_v<MT5> )
7527  ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
7528  :( N ) );
7529  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7530 
7531  const size_t jnum( jend - jbegin );
7532  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
7533 
7534  for( size_t j=jbegin; j<jpos; j+=2UL ) {
7535  C(i,j ) -= A(i,i) * B(i,j ) * scalar;
7536  C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
7537  }
7538  if( jpos < jend ) {
7539  C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
7540  }
7541  }
7542  }
7543  //**********************************************************************************************
7544 
7545  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
7559  template< typename MT3 // Type of the left-hand side target matrix
7560  , typename MT4 // Type of the left-hand side matrix operand
7561  , typename MT5 // Type of the right-hand side matrix operand
7562  , typename ST2 > // Type of the scalar value
7563  static inline auto selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7564  -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7565  {
7567 
7568  for( size_t i=0UL; i<A.rows(); ++i ) {
7569  C(i,i) -= A(i,i) * B(i,i) * scalar;
7570  }
7571  }
7572  //**********************************************************************************************
7573 
7574  //**Default subtraction assignment to dense matrices (small matrices)***************************
7588  template< typename MT3 // Type of the left-hand side target matrix
7589  , typename MT4 // Type of the left-hand side matrix operand
7590  , typename MT5 // Type of the right-hand side matrix operand
7591  , typename ST2 > // Type of the scalar value
7592  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7593  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7594  {
7595  selectDefaultSubAssignKernel( C, A, B, scalar );
7596  }
7597  //**********************************************************************************************
7598 
7599  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
7614  template< typename MT3 // Type of the left-hand side target matrix
7615  , typename MT4 // Type of the left-hand side matrix operand
7616  , typename MT5 // Type of the right-hand side matrix operand
7617  , typename ST2 > // Type of the scalar value
7618  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7619  -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7620  {
7621  constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
7622 
7623  const size_t M( A.rows() );
7624  const size_t N( B.columns() );
7625  const size_t K( A.columns() );
7626 
7627  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7628 
7629  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
7630  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
7631 
7632  const SIMDType factor( set( scalar ) );
7633 
7634  size_t j( 0UL );
7635 
7636  if( IsIntegral_v<ElementType> )
7637  {
7638  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7639  for( size_t i=0UL; i<M; ++i )
7640  {
7641  const size_t kbegin( ( IsUpper_v<MT4> )
7642  ?( ( IsLower_v<MT5> )
7643  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7644  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7645  :( IsLower_v<MT5> ? j : 0UL ) );
7646  const size_t kend( ( IsLower_v<MT4> )
7647  ?( ( IsUpper_v<MT5> )
7648  ?( min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
7649  :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7650  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*8UL, K ) : K ) );
7651 
7652  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7653 
7654  for( size_t k=kbegin; k<kend; ++k ) {
7655  const SIMDType a1( set( A(i,k) ) );
7656  xmm1 += a1 * B.load(k,j );
7657  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7658  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7659  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7660  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7661  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7662  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7663  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
7664  }
7665 
7666  C.store( i, j , C.load(i,j ) - xmm1 * factor );
7667  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
7668  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7669  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7670  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
7671  C.store( i, j+SIMDSIZE*5UL, C.load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
7672  C.store( i, j+SIMDSIZE*6UL, C.load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
7673  C.store( i, j+SIMDSIZE*7UL, C.load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
7674  }
7675  }
7676  }
7677 
7678  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7679  {
7680  size_t i( 0UL );
7681 
7682  for( ; (i+2UL) <= M; i+=2UL )
7683  {
7684  const size_t kbegin( ( IsUpper_v<MT4> )
7685  ?( ( IsLower_v<MT5> )
7686  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7687  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7688  :( IsLower_v<MT5> ? j : 0UL ) );
7689  const size_t kend( ( IsLower_v<MT4> )
7690  ?( ( IsUpper_v<MT5> )
7691  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
7692  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7693  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*5UL, K ) : K ) );
7694 
7695  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7696 
7697  for( size_t k=kbegin; k<kend; ++k ) {
7698  const SIMDType a1( set( A(i ,k) ) );
7699  const SIMDType a2( set( A(i+1UL,k) ) );
7700  const SIMDType b1( B.load(k,j ) );
7701  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7702  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7703  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7704  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7705  xmm1 += a1 * b1;
7706  xmm2 += a1 * b2;
7707  xmm3 += a1 * b3;
7708  xmm4 += a1 * b4;
7709  xmm5 += a1 * b5;
7710  xmm6 += a2 * b1;
7711  xmm7 += a2 * b2;
7712  xmm8 += a2 * b3;
7713  xmm9 += a2 * b4;
7714  xmm10 += a2 * b5;
7715  }
7716 
7717  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7718  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
7719  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7720  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
7721  C.store( i , j+SIMDSIZE*4UL, C.load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
7722  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
7723  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
7724  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
7725  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
7726  C.store( i+1UL, j+SIMDSIZE*4UL, C.load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
7727  }
7728 
7729  if( i < M )
7730  {
7731  const size_t kbegin( ( IsUpper_v<MT4> )
7732  ?( ( IsLower_v<MT5> )
7733  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7734  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7735  :( IsLower_v<MT5> ? j : 0UL ) );
7736  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
7737 
7738  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7739 
7740  for( size_t k=kbegin; k<kend; ++k ) {
7741  const SIMDType a1( set( A(i,k) ) );
7742  xmm1 += a1 * B.load(k,j );
7743  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7744  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7745  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7746  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7747  }
7748 
7749  C.store( i, j , C.load(i,j ) - xmm1 * factor );
7750  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
7751  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7752  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7753  C.store( i, j+SIMDSIZE*4UL, C.load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
7754  }
7755  }
7756 
7757  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7758  {
7759  size_t i( 0UL );
7760 
7761  for( ; (i+2UL) <= M; i+=2UL )
7762  {
7763  const size_t kbegin( ( IsUpper_v<MT4> )
7764  ?( ( IsLower_v<MT5> )
7765  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7766  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7767  :( IsLower_v<MT5> ? j : 0UL ) );
7768  const size_t kend( ( IsLower_v<MT4> )
7769  ?( ( IsUpper_v<MT5> )
7770  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
7771  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7772  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*4UL, K ) : K ) );
7773 
7774  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7775 
7776  for( size_t k=kbegin; k<kend; ++k ) {
7777  const SIMDType a1( set( A(i ,k) ) );
7778  const SIMDType a2( set( A(i+1UL,k) ) );
7779  const SIMDType b1( B.load(k,j ) );
7780  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7781  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7782  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7783  xmm1 += a1 * b1;
7784  xmm2 += a1 * b2;
7785  xmm3 += a1 * b3;
7786  xmm4 += a1 * b4;
7787  xmm5 += a2 * b1;
7788  xmm6 += a2 * b2;
7789  xmm7 += a2 * b3;
7790  xmm8 += a2 * b4;
7791  }
7792 
7793  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7794  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
7795  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7796  C.store( i , j+SIMDSIZE*3UL, C.load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
7797  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
7798  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
7799  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
7800  C.store( i+1UL, j+SIMDSIZE*3UL, C.load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
7801  }
7802 
7803  if( i < M )
7804  {
7805  const size_t kbegin( ( IsUpper_v<MT4> )
7806  ?( ( IsLower_v<MT5> )
7807  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7808  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7809  :( IsLower_v<MT5> ? j : 0UL ) );
7810  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
7811 
7812  SIMDType xmm1, xmm2, xmm3, xmm4;
7813 
7814  for( size_t k=kbegin; k<kend; ++k ) {
7815  const SIMDType a1( set( A(i,k) ) );
7816  xmm1 += a1 * B.load(k,j );
7817  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7818  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7819  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7820  }
7821 
7822  C.store( i, j , C.load(i,j ) - xmm1 * factor );
7823  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
7824  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7825  C.store( i, j+SIMDSIZE*3UL, C.load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7826  }
7827  }
7828 
7829  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
7830  {
7831  size_t i( 0UL );
7832 
7833  for( ; (i+2UL) <= M; i+=2UL )
7834  {
7835  const size_t kbegin( ( IsUpper_v<MT4> )
7836  ?( ( IsLower_v<MT5> )
7837  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7838  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7839  :( IsLower_v<MT5> ? j : 0UL ) );
7840  const size_t kend( ( IsLower_v<MT4> )
7841  ?( ( IsUpper_v<MT5> )
7842  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
7843  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7844  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*3UL, K ) : K ) );
7845 
7846  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7847 
7848  for( size_t k=kbegin; k<kend; ++k ) {
7849  const SIMDType a1( set( A(i ,k) ) );
7850  const SIMDType a2( set( A(i+1UL,k) ) );
7851  const SIMDType b1( B.load(k,j ) );
7852  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7853  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7854  xmm1 += a1 * b1;
7855  xmm2 += a1 * b2;
7856  xmm3 += a1 * b3;
7857  xmm4 += a2 * b1;
7858  xmm5 += a2 * b2;
7859  xmm6 += a2 * b3;
7860  }
7861 
7862  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7863  C.store( i , j+SIMDSIZE , C.load(i ,j+SIMDSIZE ) - xmm2 * factor );
7864  C.store( i , j+SIMDSIZE*2UL, C.load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7865  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
7866  C.store( i+1UL, j+SIMDSIZE , C.load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
7867  C.store( i+1UL, j+SIMDSIZE*2UL, C.load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
7868  }
7869 
7870  if( i < M )
7871  {
7872  const size_t kbegin( ( IsUpper_v<MT4> )
7873  ?( ( IsLower_v<MT5> )
7874  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7875  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7876  :( IsLower_v<MT5> ? j : 0UL ) );
7877  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
7878 
7879  SIMDType xmm1, xmm2, xmm3;
7880 
7881  for( size_t k=kbegin; k<kend; ++k ) {
7882  const SIMDType a1( set( A(i,k) ) );
7883  xmm1 += a1 * B.load(k,j );
7884  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7885  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7886  }
7887 
7888  C.store( i, j , C.load(i,j ) - xmm1 * factor );
7889  C.store( i, j+SIMDSIZE , C.load(i,j+SIMDSIZE ) - xmm2 * factor );
7890  C.store( i, j+SIMDSIZE*2UL, C.load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7891  }
7892  }
7893 
7894  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
7895  {
7896  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
7897  size_t i( LOW ? j : 0UL );
7898 
7899  for( ; (i+4UL) <= iend; i+=4UL )
7900  {
7901  const size_t kbegin( ( IsUpper_v<MT4> )
7902  ?( ( IsLower_v<MT5> )
7903  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7904  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7905  :( IsLower_v<MT5> ? j : 0UL ) );
7906  const size_t kend( ( IsLower_v<MT4> )
7907  ?( ( IsUpper_v<MT5> )
7908  ?( min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
7909  :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
7910  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
7911 
7912  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7913 
7914  for( size_t k=kbegin; k<kend; ++k ) {
7915  const SIMDType a1( set( A(i ,k) ) );
7916  const SIMDType a2( set( A(i+1UL,k) ) );
7917  const SIMDType a3( set( A(i+2UL,k) ) );
7918  const SIMDType a4( set( A(i+3UL,k) ) );
7919  const SIMDType b1( B.load(k,j ) );
7920  const SIMDType b2( B.load(k,j+SIMDSIZE) );
7921  xmm1 += a1 * b1;
7922  xmm2 += a1 * b2;
7923  xmm3 += a2 * b1;
7924  xmm4 += a2 * b2;
7925  xmm5 += a3 * b1;
7926  xmm6 += a3 * b2;
7927  xmm7 += a4 * b1;
7928  xmm8 += a4 * b2;
7929  }
7930 
7931  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7932  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
7933  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
7934  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
7935  C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
7936  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
7937  C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
7938  C.store( i+3UL, j+SIMDSIZE, C.load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
7939  }
7940 
7941  for( ; (i+3UL) <= iend; i+=3UL )
7942  {
7943  const size_t kbegin( ( IsUpper_v<MT4> )
7944  ?( ( IsLower_v<MT5> )
7945  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7946  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7947  :( IsLower_v<MT5> ? j : 0UL ) );
7948  const size_t kend( ( IsLower_v<MT4> )
7949  ?( ( IsUpper_v<MT5> )
7950  ?( min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
7951  :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
7952  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
7953 
7954  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7955 
7956  for( size_t k=kbegin; k<kend; ++k ) {
7957  const SIMDType a1( set( A(i ,k) ) );
7958  const SIMDType a2( set( A(i+1UL,k) ) );
7959  const SIMDType a3( set( A(i+2UL,k) ) );
7960  const SIMDType b1( B.load(k,j ) );
7961  const SIMDType b2( B.load(k,j+SIMDSIZE) );
7962  xmm1 += a1 * b1;
7963  xmm2 += a1 * b2;
7964  xmm3 += a2 * b1;
7965  xmm4 += a2 * b2;
7966  xmm5 += a3 * b1;
7967  xmm6 += a3 * b2;
7968  }
7969 
7970  C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7971  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - xmm2 * factor );
7972  C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
7973  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
7974  C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
7975  C.store( i+2UL, j+SIMDSIZE, C.load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
7976  }
7977 
7978  for( ; (i+2UL) <= iend; i+=2UL )
7979  {
7980  const size_t kbegin( ( IsUpper_v<MT4> )
7981  ?( ( IsLower_v<MT5> )
7982  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7983  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7984  :( IsLower_v<MT5> ? j : 0UL ) );
7985  const size_t kend( ( IsLower_v<MT4> )
7986  ?( ( IsUpper_v<MT5> )
7987  ?( min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
7988  :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7989  :( IsUpper_v<MT5> ? min( j+SIMDSIZE*2UL, K ) : K ) );
7990 
7991  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7992  size_t k( kbegin );
7993 
7994  for( ; (k+2UL) <= kend; k+=2UL ) {
7995  const SIMDType a1( set( A(i ,k ) ) );
7996  const SIMDType a2( set( A(i+1UL,k ) ) );
7997  const SIMDType a3( set( A(i ,k+1UL) ) );
7998  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
7999  const SIMDType b1( B.load(k ,j ) );
8000  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
8001  const SIMDType b3( B.load(k+1UL,j ) );
8002  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
8003  xmm1 += a1 * b1;
8004  xmm2 += a1 * b2;
8005  xmm3 += a2 * b1;
8006  xmm4 += a2 * b2;
8007  xmm5 += a3 * b3;
8008  xmm6 += a3 * b4;
8009  xmm7 += a4 * b3;
8010  xmm8 += a4 * b4;
8011  }
8012 
8013  for( ; k<kend; ++k ) {
8014  const SIMDType a1( set( A(i ,k) ) );
8015  const SIMDType a2( set( A(i+1UL,k) ) );
8016  const SIMDType b1( B.load(k,j ) );
8017  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8018  xmm1 += a1 * b1;
8019  xmm2 += a1 * b2;
8020  xmm3 += a2 * b1;
8021  xmm4 += a2 * b2;
8022  }
8023 
8024  C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
8025  C.store( i , j+SIMDSIZE, C.load(i ,j+SIMDSIZE) - (xmm2+xmm6) * factor );
8026  C.store( i+1UL, j , C.load(i+1UL,j ) - (xmm3+xmm7) * factor );
8027  C.store( i+1UL, j+SIMDSIZE, C.load(i+1UL,j+SIMDSIZE) - (xmm4+xmm8) * factor );
8028  }
8029 
8030  if( i < iend )
8031  {
8032  const size_t kbegin( ( IsUpper_v<MT4> )
8033  ?( ( IsLower_v<MT5> )
8034  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8035  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8036  :( IsLower_v<MT5> ? j : 0UL ) );
8037  const size_t kend( ( IsUpper_v<MT5> )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8038 
8039  SIMDType xmm1, xmm2, xmm3, xmm4;
8040  size_t k( kbegin );
8041 
8042  for( ; (k+2UL) <= kend; k+=2UL ) {
8043  const SIMDType a1( set( A(i,k ) ) );
8044  const SIMDType a2( set( A(i,k+1UL) ) );
8045  xmm1 += a1 * B.load(k ,j );
8046  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
8047  xmm3 += a2 * B.load(k+1UL,j );
8048  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
8049  }
8050 
8051  for( ; k<kend; ++k ) {
8052  const SIMDType a1( set( A(i,k) ) );
8053  xmm1 += a1 * B.load(k,j );
8054  xmm2 += a1 * B.load(k,j+SIMDSIZE);
8055  }
8056 
8057  C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
8058  C.store( i, j+SIMDSIZE, C.load(i,j+SIMDSIZE) - (xmm2+xmm4) * factor );
8059  }
8060  }
8061 
8062  for( ; j<jpos; j+=SIMDSIZE )
8063  {
8064  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
8065  size_t i( LOW ? j : 0UL );
8066 
8067  for( ; (i+4UL) <= iend; i+=4UL )
8068  {
8069  const size_t kbegin( ( IsUpper_v<MT4> )
8070  ?( ( IsLower_v<MT5> )
8071  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8072  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8073  :( IsLower_v<MT5> ? j : 0UL ) );
8074  const size_t kend( ( IsLower_v<MT4> )
8075  ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8076  :( K ) );
8077 
8078  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8079  size_t k( kbegin );
8080 
8081  for( ; (k+2UL) <= kend; k+=2UL ) {
8082  const SIMDType b1( B.load(k ,j) );
8083  const SIMDType b2( B.load(k+1UL,j) );
8084  xmm1 += set( A(i ,k ) ) * b1;
8085  xmm2 += set( A(i+1UL,k ) ) * b1;
8086  xmm3 += set( A(i+2UL,k ) ) * b1;
8087  xmm4 += set( A(i+3UL,k ) ) * b1;
8088  xmm5 += set( A(i ,k+1UL) ) * b2;
8089  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
8090  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
8091  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
8092  }
8093 
8094  for( ; k<kend; ++k ) {
8095  const SIMDType b1( B.load(k,j) );
8096  xmm1 += set( A(i ,k) ) * b1;
8097  xmm2 += set( A(i+1UL,k) ) * b1;
8098  xmm3 += set( A(i+2UL,k) ) * b1;
8099  xmm4 += set( A(i+3UL,k) ) * b1;
8100  }
8101 
8102  C.store( i , j, C.load(i ,j) - (xmm1+xmm5) * factor );
8103  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm6) * factor );
8104  C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm7) * factor );
8105  C.store( i+3UL, j, C.load(i+3UL,j) - (xmm4+xmm8) * factor );
8106  }
8107 
8108  for( ; (i+3UL) <= iend; i+=3UL )
8109  {
8110  const size_t kbegin( ( IsUpper_v<MT4> )
8111  ?( ( IsLower_v<MT5> )
8112  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8113  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8114  :( IsLower_v<MT5> ? j : 0UL ) );
8115  const size_t kend( ( IsLower_v<MT4> )
8116  ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
8117  :( K ) );
8118 
8119  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8120  size_t k( kbegin );
8121 
8122  for( ; (k+2UL) <= kend; k+=2UL ) {
8123  const SIMDType b1( B.load(k ,j) );
8124  const SIMDType b2( B.load(k+1UL,j) );
8125  xmm1 += set( A(i ,k ) ) * b1;
8126  xmm2 += set( A(i+1UL,k ) ) * b1;
8127  xmm3 += set( A(i+2UL,k ) ) * b1;
8128  xmm4 += set( A(i ,k+1UL) ) * b2;
8129  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
8130  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
8131  }
8132 
8133  for( ; k<kend; ++k ) {
8134  const SIMDType b1( B.load(k,j) );
8135  xmm1 += set( A(i ,k) ) * b1;
8136  xmm2 += set( A(i+1UL,k) ) * b1;
8137  xmm3 += set( A(i+2UL,k) ) * b1;
8138  }
8139 
8140  C.store( i , j, C.load(i ,j) - (xmm1+xmm4) * factor );
8141  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm5) * factor );
8142  C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm6) * factor );
8143  }
8144 
8145  for( ; (i+2UL) <= iend; i+=2UL )
8146  {
8147  const size_t kbegin( ( IsUpper_v<MT4> )
8148  ?( ( IsLower_v<MT5> )
8149  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8150  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8151  :( IsLower_v<MT5> ? j : 0UL ) );
8152  const size_t kend( ( IsLower_v<MT4> )
8153  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8154  :( K ) );
8155 
8156  SIMDType xmm1, xmm2, xmm3, xmm4;
8157  size_t k( kbegin );
8158 
8159  for( ; (k+2UL) <= kend; k+=2UL ) {
8160  const SIMDType b1( B.load(k ,j) );
8161  const SIMDType b2( B.load(k+1UL,j) );
8162  xmm1 += set( A(i ,k ) ) * b1;
8163  xmm2 += set( A(i+1UL,k ) ) * b1;
8164  xmm3 += set( A(i ,k+1UL) ) * b2;
8165  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
8166  }
8167 
8168  for( ; k<kend; ++k ) {
8169  const SIMDType b1( B.load(k,j) );
8170  xmm1 += set( A(i ,k) ) * b1;
8171  xmm2 += set( A(i+1UL,k) ) * b1;
8172  }
8173 
8174  C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
8175  C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm4) * factor );
8176  }
8177 
8178  if( i < iend )
8179  {
8180  const size_t kbegin( ( IsUpper_v<MT4> )
8181  ?( ( IsLower_v<MT5> )
8182  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8183  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8184  :( IsLower_v<MT5> ? j : 0UL ) );
8185 
8186  SIMDType xmm1, xmm2;
8187  size_t k( kbegin );
8188 
8189  for( ; (k+2UL) <= K; k+=2UL ) {
8190  xmm1 += set( A(i,k ) ) * B.load(k ,j);
8191  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
8192  }
8193 
8194  for( ; k<K; ++k ) {
8195  xmm1 += set( A(i,k) ) * B.load(k,j);
8196  }
8197 
8198  C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
8199  }
8200  }
8201 
8202  for( ; remainder && j<N; ++j )
8203  {
8204  const size_t iend( UPP ? j+1UL : M );
8205  size_t i( LOW ? j : 0UL );
8206 
8207  for( ; (i+2UL) <= iend; i+=2UL )
8208  {
8209  const size_t kbegin( ( IsUpper_v<MT4> )
8210  ?( ( IsLower_v<MT5> )
8211  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8212  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8213  :( IsLower_v<MT5> ? j : 0UL ) );
8214  const size_t kend( ( IsLower_v<MT4> )
8215  ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8216  :( K ) );
8217 
8218  ElementType value1{};
8219  ElementType value2{};
8220 
8221  for( size_t k=kbegin; k<kend; ++k ) {
8222  value1 += A(i ,k) * B(k,j);
8223  value2 += A(i+1UL,k) * B(k,j);
8224  }
8225 
8226  C(i ,j) -= value1 * scalar;
8227  C(i+1UL,j) -= value2 * scalar;
8228  }
8229 
8230  if( i < iend )
8231  {
8232  const size_t kbegin( ( IsUpper_v<MT4> )
8233  ?( ( IsLower_v<MT5> )
8234  ?( max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8235  :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8236  :( IsLower_v<MT5> ? j : 0UL ) );
8237 
8238  ElementType value{};
8239 
8240  for( size_t k=kbegin; k<K; ++k ) {
8241  value += A(i,k) * B(k,j);
8242  }
8243 
8244  C(i,j) -= value * scalar;
8245  }
8246  }
8247  }
8248  //**********************************************************************************************
8249 
8250  //**********************************************************************************************
8251  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
8265  template< typename MT3 // Type of the left-hand side target matrix
8266  , typename MT4 // Type of the left-hand side matrix operand
8267  , typename MT5 // Type of the right-hand side matrix operand
8268  , typename ST2 > // Type of the scalar value
8269  static inline auto selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8270  -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8271  {
8276 
8277  const ForwardFunctor fwd;
8278 
8279  if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
8280  const OppositeType_t<MT4> tmp( serial( A ) );
8281  subAssign( C, fwd( tmp * B ) * scalar );
8282  }
8283  else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
8284  const OppositeType_t<MT5> tmp( serial( B ) );
8285  subAssign( C, fwd( A * tmp ) * scalar );
8286  }
8287  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
8288  const OppositeType_t<MT4> tmp( serial( A ) );
8289  subAssign( C, fwd( tmp * B ) * scalar );
8290  }
8291  else {
8292  const OppositeType_t<MT5> tmp( serial( B ) );
8293  subAssign( C, fwd( A * tmp ) * scalar );
8294  }
8295  }
8296  //**********************************************************************************************
8297 
8298  //**Default subtraction assignment to dense matrices (large matrices)***************************
8312  template< typename MT3 // Type of the left-hand side target matrix
8313  , typename MT4 // Type of the left-hand side matrix operand
8314  , typename MT5 // Type of the right-hand side matrix operand
8315  , typename ST2 > // Type of the scalar value
8316  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8317  -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8318  {
8319  selectDefaultSubAssignKernel( C, A, B, scalar );
8320  }
8321  //**********************************************************************************************
8322 
8323  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
8338  template< typename MT3 // Type of the left-hand side target matrix
8339  , typename MT4 // Type of the left-hand side matrix operand
8340  , typename MT5 // Type of the right-hand side matrix operand
8341  , typename ST2 > // Type of the scalar value
8342  static inline auto selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8343  -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8344  {
8345  if( LOW )
8346  lmmm( C, A, B, -scalar, ST2(1) );
8347  else if( UPP )
8348  ummm( C, A, B, -scalar, ST2(1) );
8349  else
8350  mmm( C, A, B, -scalar, ST2(1) );
8351  }
8352  //**********************************************************************************************
8353 
8354  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
8368  template< typename MT3 // Type of the left-hand side target matrix
8369  , typename MT4 // Type of the left-hand side matrix operand
8370  , typename MT5 // Type of the right-hand side matrix operand
8371  , typename ST2 > // Type of the scalar value
8372  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8373  -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8374  {
8375  selectLargeSubAssignKernel( C, A, B, scalar );
8376  }
8377  //**********************************************************************************************
8378 
8379  //**BLAS-based subtraction assignment to dense matrices*****************************************
8380 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
8381 
8394  template< typename MT3 // Type of the left-hand side target matrix
8395  , typename MT4 // Type of the left-hand side matrix operand
8396  , typename MT5 // Type of the right-hand side matrix operand
8397  , typename ST2 > // Type of the scalar value
8398  static inline auto selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8399  -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8400  {
8401  using ET = ElementType_t<MT3>;
8402 
8403  if( IsTriangular_v<MT4> ) {
8404  ResultType_t<MT3> tmp( serial( B ) );
8405  trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(scalar) );
8406  subAssign( C, tmp );
8407  }
8408  else if( IsTriangular_v<MT5> ) {
8409  ResultType_t<MT3> tmp( serial( A ) );
8410  trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(scalar) );
8411  subAssign( C, tmp );
8412  }
8413  else {
8414  gemm( C, A, B, ET(-scalar), ET(1) );
8415  }
8416  }
8417 #endif
8418  //**********************************************************************************************
8419 
8420  //**Restructuring subtraction assignment to column-major matrices*******************************
8434  template< typename MT > // Type of the target matrix
8435  friend inline auto subAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8436  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8437  {
8439 
8441 
8442  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8443  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8444 
8445  const ForwardFunctor fwd;
8446 
8447  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8448  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8449 
8450  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8451  subAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8452  else if( IsSymmetric_v<MT1> )
8453  subAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8454  else
8455  subAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8456  }
8457  //**********************************************************************************************
8458 
8459  //**Subtraction assignment to sparse matrices***************************************************
8460  // No special implementation for the subtraction assignment to sparse matrices.
8461  //**********************************************************************************************
8462 
8463  //**Schur product assignment to dense matrices**************************************************
8475  template< typename MT // Type of the target dense matrix
8476  , bool SO > // Storage order of the target dense matrix
8477  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8478  {
8480 
8484 
8485  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8486  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8487 
8488  const ResultType tmp( serial( rhs ) );
8489  schurAssign( ~lhs, tmp );
8490  }
8491  //**********************************************************************************************
8492 
8493  //**Schur product assignment to sparse matrices*************************************************
8494  // No special implementation for the Schur product assignment to sparse matrices.
8495  //**********************************************************************************************
8496 
8497  //**Multiplication assignment to dense matrices*************************************************
8498  // No special implementation for the multiplication assignment to dense matrices.
8499  //**********************************************************************************************
8500 
8501  //**Multiplication assignment to sparse matrices************************************************
8502  // No special implementation for the multiplication assignment to sparse matrices.
8503  //**********************************************************************************************
8504 
8505  //**SMP assignment to dense matrices************************************************************
8520  template< typename MT // Type of the target dense matrix
8521  , bool SO > // Storage order of the target dense matrix
8522  friend inline auto smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8523  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8524  {
8526 
8527  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8528  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8529 
8530  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8531  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8532 
8533  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
8534  return;
8535  }
8536  else if( left.columns() == 0UL ) {
8537  reset( ~lhs );
8538  return;
8539  }
8540 
8541  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8542  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8543 
8544  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8545  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8546  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8547  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8548  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8549  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8550 
8551  smpAssign( ~lhs, A * B * rhs.scalar_ );
8552  }
8553  //**********************************************************************************************
8554 
8555  //**SMP assignment to sparse matrices***********************************************************
8570  template< typename MT // Type of the target sparse matrix
8571  , bool SO > // Storage order of the target sparse matrix
8572  friend inline auto smpAssign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8573  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8574  {
8576 
8577  using TmpType = If_t< SO, OppositeType, ResultType >;
8578 
8585 
8586  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8587  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8588 
8589  const ForwardFunctor fwd;
8590 
8591  const TmpType tmp( rhs );
8592  smpAssign( ~lhs, fwd( tmp ) );
8593  }
8594  //**********************************************************************************************
8595 
8596  //**Restructuring SMP assignment to column-major matrices***************************************
8610  template< typename MT > // Type of the target matrix
8611  friend inline auto smpAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8612  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8613  {
8615 
8617 
8618  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8619  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8620 
8621  const ForwardFunctor fwd;
8622 
8623  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8624  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8625 
8626  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8627  smpAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8628  else if( IsSymmetric_v<MT1> )
8629  smpAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8630  else
8631  smpAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8632  }
8633  //**********************************************************************************************
8634 
8635  //**SMP addition assignment to dense matrices***************************************************
8650  template< typename MT // Type of the target dense matrix
8651  , bool SO > // Storage order of the target dense matrix
8652  friend inline auto smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8653  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8654  {
8656 
8657  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8658  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8659 
8660  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8661  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8662 
8663  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8664  return;
8665  }
8666 
8667  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8668  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8669 
8670  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8671  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8672  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8673  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8674  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8675  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8676 
8677  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
8678  }
8679  //**********************************************************************************************
8680 
8681  //**Restructuring SMP addition assignment to column-major matrices******************************
8695  template< typename MT > // Type of the target matrix
8696  friend inline auto smpAddAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8697  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8698  {
8700 
8702 
8703  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8704  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8705 
8706  const ForwardFunctor fwd;
8707 
8708  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8709  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8710 
8711  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8712  smpAddAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8713  else if( IsSymmetric_v<MT1> )
8714  smpAddAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8715  else
8716  smpAddAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8717  }
8718  //**********************************************************************************************
8719 
8720  //**SMP addition assignment to sparse matrices**************************************************
8721  // No special implementation for the SMP addition assignment to sparse matrices.
8722  //**********************************************************************************************
8723 
8724  //**SMP subtraction assignment to dense matrices************************************************
8739  template< typename MT // Type of the target dense matrix
8740  , bool SO > // Storage order of the target dense matrix
8741  friend inline auto smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8742  -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8743  {
8745 
8746  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8747  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8748 
8749  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8750  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8751 
8752  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8753  return;
8754  }
8755 
8756  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8757  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8758 
8759  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8760  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8761  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8762  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8763  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8764  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8765 
8766  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
8767  }
8768  //**********************************************************************************************
8769 
8770  //**Restructuring SMP subtraction assignment to column-major matrices***************************
8784  template< typename MT > // Type of the target matrix
8785  friend inline auto smpSubAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8786  -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8787  {
8789 
8791 
8792  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8793  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8794 
8795  const ForwardFunctor fwd;
8796 
8797  LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8798  RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8799 
8800  if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8801  smpSubAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
8802  else if( IsSymmetric_v<MT1> )
8803  smpSubAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
8804  else
8805  smpSubAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8806  }
8807  //**********************************************************************************************
8808 
8809  //**SMP subtraction assignment to sparse matrices***********************************************
8810  // No special implementation for the SMP subtraction assignment to sparse matrices.
8811  //**********************************************************************************************
8812 
8813  //**SMP Schur product assignment to dense matrices**********************************************
8825  template< typename MT // Type of the target dense matrix
8826  , bool SO > // Storage order of the target dense matrix
8827  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8828  {
8830 
8834 
8835  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8836  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8837 
8838  const ResultType tmp( rhs );
8839  smpSchurAssign( ~lhs, tmp );
8840  }
8841  //**********************************************************************************************
8842 
8843  //**SMP Schur product assignment to sparse matrices*********************************************
8844  // No special implementation for the SMP Schur product assignment to sparse matrices.
8845  //**********************************************************************************************
8846 
8847  //**SMP multiplication assignment to dense matrices*********************************************
8848  // No special implementation for the SMP multiplication assignment to dense matrices.
8849  //**********************************************************************************************
8850 
8851  //**SMP multiplication assignment to sparse matrices********************************************
8852  // No special implementation for the SMP multiplication assignment to sparse matrices.
8853  //**********************************************************************************************
8854 
8855  //**Compile time checks*************************************************************************
8864  //**********************************************************************************************
8865 };
8867 //*************************************************************************************************
8868 
8869 
8870 
8871 
8872 //=================================================================================================
8873 //
8874 // GLOBAL BINARY ARITHMETIC OPERATORS
8875 //
8876 //=================================================================================================
8877 
8878 //*************************************************************************************************
8905 template< typename MT1 // Type of the left-hand side dense matrix
8906  , typename MT2 > // Type of the right-hand side dense matrix
8907 inline decltype(auto)
8908  operator*( const DenseMatrix<MT1,false>& lhs, const DenseMatrix<MT2,false>& rhs )
8909 {
8911 
8912  if( (~lhs).columns() != (~rhs).rows() ) {
8913  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
8914  }
8915 
8917  return ReturnType( ~lhs, ~rhs );
8918 }
8919 //*************************************************************************************************
8920 
8921 
8922 
8923 
8924 //=================================================================================================
8925 //
8926 // GLOBAL FUNCTIONS
8927 //
8928 //=================================================================================================
8929 
8930 //*************************************************************************************************
8953 template< typename MT1 // Type of the left-hand side dense matrix
8954  , typename MT2 // Type of the right-hand side dense matrix
8955  , bool SF // Symmetry flag
8956  , bool HF // Hermitian flag
8957  , bool LF // Lower flag
8958  , bool UF > // Upper flag
8959 inline decltype(auto) declsym( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8960 {
8962 
8963  if( !isSquare( dm ) ) {
8964  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
8965  }
8966 
8967  using ReturnType = const DMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
8968  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8969 }
8971 //*************************************************************************************************
8972 
8973 
8974 //*************************************************************************************************
8997 template< typename MT1 // Type of the left-hand side dense matrix
8998  , typename MT2 // Type of the right-hand side dense matrix
8999  , bool SF // Symmetry flag
9000  , bool HF // Hermitian flag
9001  , bool LF // Lower flag
9002  , bool UF > // Upper flag
9003 inline decltype(auto) declherm( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9004 {
9006 
9007  if( !isSquare( dm ) ) {
9008  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
9009  }
9010 
9011  using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9012  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9013 }
9015 //*************************************************************************************************
9016 
9017 
9018 //*************************************************************************************************
9041 template< typename MT1 // Type of the left-hand side dense matrix
9042  , typename MT2 // Type of the right-hand side dense matrix
9043  , bool SF // Symmetry flag
9044  , bool HF // Hermitian flag
9045  , bool LF // Lower flag
9046  , bool UF > // Upper flag
9047 inline decltype(auto) decllow( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9048 {
9050 
9051  if( !isSquare( dm ) ) {
9052  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9053  }
9054 
9055  using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9056  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9057 }
9059 //*************************************************************************************************
9060 
9061 
9062 //*************************************************************************************************
9085 template< typename MT1 // Type of the left-hand side dense matrix
9086  , typename MT2 // Type of the right-hand side dense matrix
9087  , bool SF // Symmetry flag
9088  , bool HF // Hermitian flag
9089  , bool LF // Lower flag
9090  , bool UF > // Upper flag
9091 inline decltype(auto) declupp( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9092 {
9094 
9095  if( !isSquare( dm ) ) {
9096  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9097  }
9098 
9099  using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9100  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9101 }
9103 //*************************************************************************************************
9104 
9105 
9106 //*************************************************************************************************
9129 template< typename MT1 // Type of the left-hand side dense matrix
9130  , typename MT2 // Type of the right-hand side dense matrix
9131  , bool SF // Symmetry flag
9132  , bool HF // Hermitian flag
9133  , bool LF // Lower flag
9134  , bool UF > // Upper flag
9135 inline decltype(auto) decldiag( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9136 {
9138 
9139  if( !isSquare( dm ) ) {
9140  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
9141  }
9142 
9143  using ReturnType = const DMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
9144  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9145 }
9147 //*************************************************************************************************
9148 
9149 
9150 
9151 
9152 //=================================================================================================
9153 //
9154 // SIZE SPECIALIZATIONS
9155 //
9156 //=================================================================================================
9157 
9158 //*************************************************************************************************
9160 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9161 struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
9162  : public Size<MT1,0UL>
9163 {};
9164 
9165 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9166 struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
9167  : public Size<MT2,1UL>
9168 {};
9170 //*************************************************************************************************
9171 
9172 
9173 
9174 
9175 //=================================================================================================
9176 //
9177 // ISALIGNED SPECIALIZATIONS
9178 //
9179 //=================================================================================================
9180 
9181 //*************************************************************************************************
9183 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9184 struct IsAligned< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9185  : public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
9186 {};
9188 //*************************************************************************************************
9189 
9190 } // namespace blaze
9191 
9192 #endif
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:332
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:426
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Data type constraint.
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:422
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:292
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatDMatMultExpr.h:317
Header file for basic type definitions.
Header file for the SparseVector base class.
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias declaration for the If class template. The If_t alias declaration provides a convenien...
Definition: If.h:109
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:498
Header file for the declherm trait.
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:173
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:288
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatDMatMultExpr.h:310
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:532
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:591
static constexpr bool smpAssignable
Compilation flag for SMP assignments.
Definition: CompressedMatrix.h:3113
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:289
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:522
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:466
Header file for the IsIntegral type trait.
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
Header file for the DenseVector base class.
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatDMatMultExpr.h:177
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:163
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1002
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:596
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:158
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Expression object for dense matrix-dense matrix multiplications. The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:152
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:431
Header file for the IsBLASCompatible type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:442
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
Header file for the IsComplexDouble type trait.
Constraint on the data type.
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:168
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:564
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:302
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:159
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1147
Header file for the decllow trait.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:161
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatDMatMultExpr.h:179
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:486
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr &> CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:167
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:305
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:160
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1002
Header file for the IsLower type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:347
Header file for the IsAligned type trait.
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:158
Header file for the IsStrictlyTriangular type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:552
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:161
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
Header file for the exception macros of the math module.
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1179
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:604
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDMatMultExpr.h:291
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:468
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:162
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatDMatMultExpr.h:323
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
Header file for the MatScalarMultExpr base class.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:173
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:396
Header file for run time assertion macros.
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:290
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatDMatMultExpr.h:178
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:286
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:421
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
If_t< IsExpression_v< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:299
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:293
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:454
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:101
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:576
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:432
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3081
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:765
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:476
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
If_t< IsExpression_v< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:296
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:453
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:440
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:156
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:412
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatDMatMultExpr.h:180
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:586
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
Header file for the IsUpper type trait.
typename DisableIf< Condition, T >::Type DisableIf_t
Auxiliary type for the DisableIf class template. The DisableIf_t alias declaration provides a convenie...
Definition: DisableIf.h:138
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1326
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
Header file for the IsResizable type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:542
If_t< IsExpression_v< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:170
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:499
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.