TDMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/dense/MMM.h>
54 #include <blaze/math/Exception.h>
67 #include <blaze/math/shims/Reset.h>
69 #include <blaze/math/SIMD.h>
95 #include <blaze/system/BLAS.h>
96 #include <blaze/system/Blocking.h>
97 #include <blaze/system/Debugging.h>
102 #include <blaze/util/Assert.h>
103 #include <blaze/util/Complex.h>
106 #include <blaze/util/DisableIf.h>
107 #include <blaze/util/EnableIf.h>
110 #include <blaze/util/InvalidType.h>
111 #include <blaze/util/mpl/And.h>
112 #include <blaze/util/mpl/Bool.h>
113 #include <blaze/util/mpl/If.h>
114 #include <blaze/util/mpl/Not.h>
115 #include <blaze/util/mpl/Or.h>
116 #include <blaze/util/TrueType.h>
117 #include <blaze/util/Types.h>
126 
127 
128 namespace blaze {
129 
130 //=================================================================================================
131 //
132 // CLASS TDMATTDMATMULTEXPR
133 //
134 //=================================================================================================
135 
136 //*************************************************************************************************
143 template< typename MT1 // Type of the left-hand side dense matrix
144  , typename MT2 // Type of the right-hand side dense matrix
145  , bool SF // Symmetry flag
146  , bool HF // Hermitian flag
147  , bool LF // Lower flag
148  , bool UF > // Upper flag
149 class TDMatTDMatMultExpr
150  : public MatMatMultExpr< DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
151  , private Computation
152 {
153  private:
154  //**Type definitions****************************************************************************
161  //**********************************************************************************************
162 
163  //**********************************************************************************************
// Compile-time switch for the left-hand side operand: true when MT1 is itself a computation
// (IsComputation) or is flagged as requiring an intermediate evaluation (RequiresEvaluation).
165  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
166  //**********************************************************************************************
167 
168  //**********************************************************************************************
// Compile-time switch for the right-hand side operand: true when MT2 is itself a computation
// (IsComputation) or is flagged as requiring an intermediate evaluation (RequiresEvaluation).
170  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
171  //**********************************************************************************************
172 
173  //**********************************************************************************************
// Normalized structure flags derived from the SF/HF/LF/UF template arguments:
//  - SYM  is set only if SF is set and none of HF, LF, UF is set.
//  - HERM is set only if HF is set and neither LF nor UF is set.
//  - LOW  is set if LF is set, or if UF is combined with SF or HF.
//  - UPP  is set if UF is set, or if LF is combined with SF or HF.
// Consequently a symmetric/Hermitian flag combined with a triangular flag sets both LOW and UPP.
175  enum : bool {
176  SYM = ( SF && !( HF || LF || UF ) ),
177  HERM = ( HF && !( LF || UF ) ),
178  LOW = ( LF || ( ( SF || HF ) && UF ) ),
179  UPP = ( UF || ( ( SF || HF ) && LF ) )
180  };
181  //**********************************************************************************************
182 
183  //**********************************************************************************************
185 
// Helper trait over three types (T1, T2, T3). The visible part of the condition requires T1 to
// be a row-major matrix.
// NOTE(review): this listing drops original line 194, so whatever follows the trailing '&&'
// is not visible here -- confirm the full condition against the actual header.
191  template< typename T1, typename T2, typename T3 >
192  struct CanExploitSymmetry {
193  enum : bool { value = IsRowMajorMatrix<T1>::value &&
195  };
197  //**********************************************************************************************
198 
199  //**********************************************************************************************
201 
// Helper trait over three types (T1, T2, T3): its value is true when at least one of the two
// operands needs an intermediate evaluation (evaluateLeft / evaluateRight) and, at the same
// time, CanExploitSymmetry<T1,T2,T3> holds.
205  template< typename T1, typename T2, typename T3 >
206  struct IsEvaluationRequired {
207  enum : bool { value = ( evaluateLeft || evaluateRight ) &&
208  CanExploitSymmetry<T1,T2,T3>::value };
209  };
211  //**********************************************************************************************
212 
213  //**********************************************************************************************
215 
// Helper trait over three types (T1, T2, T3) with the visible requirements:
//  - BLAS mode and BLAS-based matrix/matrix multiplication are both enabled,
//  - none of the structure flags SYM, HERM, LOW, UPP is set,
//  - all three types are SIMD enabled,
//  - T1 and T3 have the same element type.
// NOTE(review): the listing drops original lines 222-225 and 227-230, so further conjuncts of
// this condition are not visible here -- confirm against the actual header.
218  template< typename T1, typename T2, typename T3 >
219  struct UseBlasKernel {
220  enum : bool { value = BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
221  !SYM && !HERM && !LOW && !UPP &&
226  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
231  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
232  };
234  //**********************************************************************************************
235 
236  //**********************************************************************************************
238 
// Helper trait over three types (T1, T2, T3). The visible conjuncts require useOptimizedKernels
// and that all three types are SIMD enabled; a further visible fragment involves the element
// type of T3.
// NOTE(review): the listing drops original lines 244, 246-247 and 249-250, so the expression
// is incomplete as shown -- confirm the full condition against the actual header.
241  template< typename T1, typename T2, typename T3 >
242  struct UseVectorizedDefaultKernel {
243  enum : bool { value = useOptimizedKernels &&
245  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
248  , ElementType_<T3> >::value &&
251  };
253  //**********************************************************************************************
254 
255  //**********************************************************************************************
257 
// Functor type selected from the normalized structure flags, checked in this order:
//   HERM          -> DeclHerm
//   SYM           -> DeclSym
//   LOW and UPP   -> DeclDiag
//   LOW only      -> DeclLow
//   UPP only      -> DeclUpp
//   none          -> Noop
260  using ForwardFunctor = IfTrue_< HERM
261  , DeclHerm
262  , IfTrue_< SYM
263  , DeclSym
264  , IfTrue_< LOW
265  , IfTrue_< UPP
266  , DeclDiag
267  , DeclLow >
268  , IfTrue_< UPP
269  , DeclUpp
270  , Noop > > > >;
272  //**********************************************************************************************
273 
274  public:
275  //**Type definitions****************************************************************************
278 
// Return type of the element access functions and the type used when this expression is an
// operand of another expression.
// NOTE(review): ElementType and ResultType are declared on lines this listing drops.
284  using ReturnType = const ElementType;
285  using CompositeType = const ResultType;
286 
// Storage type of the left-hand side operand: held by value if MT1 is an expression, otherwise
// held by reference-to-const.
288  using LeftOperand = If_< IsExpression<MT1>, const MT1, const MT1& >;
289 
// Storage type of the right-hand side operand: held by value if MT2 is an expression, otherwise
// held by reference-to-const.
291  using RightOperand = If_< IsExpression<MT2>, const MT2, const MT2& >;
292 
295 
298  //**********************************************************************************************
299 
300  //**Compilation flags***************************************************************************
// Compilation flag for SIMD support. The visible conjuncts require that MT1 is not diagonal and
// that both operand types are SIMD enabled.
// NOTE(review): the listing drops original lines 304-305, so the rest of the expression after
// the trailing '&&' is not visible here -- confirm against the actual header.
302  enum : bool { simdEnabled = !IsDiagonal<MT1>::value &&
303  MT1::simdEnabled && MT2::simdEnabled &&
306 
// Compilation flag for SMP assignment: set only if neither operand needs an intermediate
// evaluation and both operand types are themselves SMP assignable.
308  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
309  !evaluateRight && MT2::smpAssignable };
310  //**********************************************************************************************
311 
312  //**SIMD properties*****************************************************************************
// Size value reported by SIMDTrait for the element type of this expression; used below as the
// row step of the vectorized kernels.
314  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
315  //**********************************************************************************************
316 
317  //**Constructor*********************************************************************************
// Constructor: stores the two operands of the multiplication expression.
//
// \param lhs The left-hand side operand of the multiplication expression.
// \param rhs The right-hand side operand of the multiplication expression.
//
// The sizes are not validated here beyond an internal assertion that the number of columns of
// lhs equals the number of rows of rhs.
323  explicit inline TDMatTDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
324  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
325  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
326  {
327  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
328  }
329  //**********************************************************************************************
330 
331  //**Access operator*****************************************************************************
// Element access: computes the single element (i,j) of the product.
//
// \param i Row index; only checked by an internal assertion against lhs_.rows().
// \param j Column index; only checked by an internal assertion against rhs_.columns().
// \return The value of element (i,j) of the product.
//
// Depending on the compile-time structure of the operands the result is a single product
// (diagonal operand), a product of two sub-vectors restricted to the index range in which both
// operands can be nonzero, or the product of row i of lhs_ with column j of rhs_.
338  inline ReturnType operator()( size_t i, size_t j ) const {
339  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
340  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
341 
// Diagonal left operand: only lhs_(i,i) contributes to row i.
342  if( IsDiagonal<MT1>::value ) {
343  return lhs_(i,i) * rhs_(i,j);
344  }
// Diagonal right operand: only rhs_(j,j) contributes to column j.
345  else if( IsDiagonal<MT2>::value ) {
346  return lhs_(i,j) * rhs_(j,j);
347  }
// NOTE(review): the listing drops original line 348, which must open the branch that is closed
// at listing line 371 -- presumably a test for triangular operands; confirm against the header.
// First summation index, derived from an upper lhs_ (row i) and/or a lower rhs_ (column j).
349  const size_t begin( ( IsUpper<MT1>::value )
350  ?( ( IsLower<MT2>::value )
351  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
352  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
353  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
354  :( ( IsLower<MT2>::value )
355  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
356  :( 0UL ) ) );
// One-past-last summation index, derived from a lower lhs_ and/or an upper rhs_.
357  const size_t end( ( IsLower<MT1>::value )
358  ?( ( IsUpper<MT2>::value )
359  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
360  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
361  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
362  :( ( IsUpper<MT2>::value )
363  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
364  :( lhs_.columns() ) ) );
365 
// Empty index range: the element is a default-constructed ElementType.
366  if( begin >= end ) return ElementType();
367 
368  const size_t n( end - begin );
369 
370  return subvector( row( lhs_, i ), begin, n ) * subvector( column( rhs_, j ), begin, n );
371  }
// General case: product of the full row i with the full column j.
372  else {
373  return row( lhs_, i ) * column( rhs_, j );
374  }
375  }
376  //**********************************************************************************************
377 
378  //**At function*********************************************************************************
// Checked element access.
//
// \param i Row index; must be smaller than lhs_.rows().
// \param j Column index; must be smaller than rhs_.columns().
// \return The value of element (i,j), obtained via operator()(i,j).
//
// An out-of-range index is reported through BLAZE_THROW_OUT_OF_RANGE; the row index is checked
// before the column index.
386  inline ReturnType at( size_t i, size_t j ) const {
387  if( i >= lhs_.rows() ) {
388  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
389  }
390  if( j >= rhs_.columns() ) {
391  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
392  }
393  return (*this)(i,j);
394  }
395  //**********************************************************************************************
396 
397  //**Rows function*******************************************************************************
// Returns the number of rows of the product, which is the number of rows of the left-hand side
// operand.
402  inline size_t rows() const noexcept {
403  return lhs_.rows();
404  }
405  //**********************************************************************************************
406 
407  //**Columns function****************************************************************************
// Returns the number of columns of the product, which is the number of columns of the
// right-hand side operand.
412  inline size_t columns() const noexcept {
413  return rhs_.columns();
414  }
415  //**********************************************************************************************
416 
417  //**Left operand access*************************************************************************
// Returns the stored left-hand side operand (as LeftOperand, i.e. by value for expressions and
// by reference-to-const otherwise).
422  inline LeftOperand leftOperand() const noexcept {
423  return lhs_;
424  }
425  //**********************************************************************************************
426 
427  //**Right operand access************************************************************************
// Returns the stored right-hand side operand (as RightOperand, i.e. by value for expressions
// and by reference-to-const otherwise).
432  inline RightOperand rightOperand() const noexcept {
433  return rhs_;
434  }
435  //**********************************************************************************************
436 
437  //**********************************************************************************************
// Reports whether the expression can alias with the given address.
//
// \param alias The address to be checked.
// \return true if either operand reports isAliased( alias ), false otherwise.
//
// Note that this forwards to the operands' isAliased(), not to a canAlias() of the operands.
443  template< typename T >
444  inline bool canAlias( const T* alias ) const noexcept {
445  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
446  }
447  //**********************************************************************************************
448 
449  //**********************************************************************************************
// Reports whether the expression is aliased with the given address.
//
// \param alias The address to be checked.
// \return true if either operand reports isAliased( alias ), false otherwise.
455  template< typename T >
456  inline bool isAliased( const T* alias ) const noexcept {
457  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
458  }
459  //**********************************************************************************************
460 
461  //**********************************************************************************************
// Returns true only if both operands report themselves as aligned.
466  inline bool isAligned() const noexcept {
467  return lhs_.isAligned() && rhs_.isAligned();
468  }
469  //**********************************************************************************************
470 
471  //**********************************************************************************************
// Reports whether the expression can be used in SMP assignments.
//
// Visible conditions: the product must have at least SMP_TDMATTDMATMULT_THRESHOLD elements
// (rows() * columns()), and either BLAS mode or the BLAS matrix/matrix multiplication is
// disabled, or the product has fewer than TDMATTDMATMULT_THRESHOLD elements.
// NOTE(review): the listing drops original lines 479 and 482, so one alternative inside the
// parenthesized disjunction and the final conjunct after the trailing '&&' are not visible
// here -- confirm the full condition against the actual header.
476  inline bool canSMPAssign() const noexcept {
477  return ( !BLAZE_BLAS_MODE ||
478  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
480  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
481  ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
483  }
484  //**********************************************************************************************
485 
486  private:
487  //**Member variables****************************************************************************
490  //**********************************************************************************************
491 
492  //**Assignment to dense matrices****************************************************************
// Assignment of the multiplication expression to a dense matrix.
//
// \param lhs The target dense matrix; its size must already match the expression (asserted).
// \param rhs The multiplication expression to be assigned.
//
// Returns early for an empty target, resets the target if the inner dimension is zero, and
// otherwise evaluates both operands serially and hands them to selectAssignKernel().
// NOTE(review): the listing drops original lines 507 and 510; the return type / enabling
// condition and the declarations of LT and RT are not visible in this excerpt.
505  template< typename MT // Type of the target dense matrix
506  , bool SO > // Storage order of the target dense matrix
508  assign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
509  {
511 
512  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
513  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
514 
// Nothing to compute for an empty target.
515  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
516  return;
517  }
// Zero inner dimension: every element of the product is an empty sum, so the target is reset.
518  else if( rhs.lhs_.columns() == 0UL ) {
519  reset( ~lhs );
520  return;
521  }
522 
523  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
524  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
525 
// Sanity checks: the evaluated operands must have kept their sizes and fit the target.
526  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
527  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
528  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
529  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
530  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
531  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
532 
533  TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
534  }
536  //**********************************************************************************************
537 
538  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = A * B.
//
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
//
// The small-matrix kernel is chosen if A is diagonal, or if (outside debug mode) A has at most
// SIMDSIZE*10 rows, or if C has fewer than TDMATTDMATMULT_THRESHOLD elements. In every other
// case the call is forwarded to selectBlasAssignKernel(), which is not part of this excerpt.
549  template< typename MT3 // Type of the left-hand side target matrix
550  , typename MT4 // Type of the left-hand side matrix operand
551  , typename MT5 > // Type of the right-hand side matrix operand
552  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
553  {
554  if( ( IsDiagonal<MT4>::value ) ||
555  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
556  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
557  selectSmallAssignKernel( C, A, B );
558  else
559  selectBlasAssignKernel( C, A, B );
560  }
562  //**********************************************************************************************
563 
564  //**Default assignment to dense matrices (general/general)**************************************
// Default (scalar) assignment kernel for C = A * B, general/general case.
//
// \param C The target matrix.
// \param A The left-hand side operand (M x K).
// \param B The right-hand side operand (K x N).
//
// C is computed column by column. For each column j the first contributing k initializes the
// column by assignment, all further k accumulate into it. Index ranges are narrowed by the
// compile-time triangular structure of A and B and by the LOW/UPP/SYM/HERM flags.
// NOTE(review): the listing drops original line 581 (return type / enabling condition) and
// lines 609, 614, 643, 648 (the inner condition of the nested ternaries; the two alternatives
// differ by one index, which suggests a strictly-triangular test -- confirm against the header).
578  template< typename MT3 // Type of the left-hand side target matrix
579  , typename MT4 // Type of the left-hand side matrix operand
580  , typename MT5 > // Type of the right-hand side matrix operand
582  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
583  {
584  const size_t M( A.rows() );
585  const size_t N( B.columns() );
586  const size_t K( A.columns() );
587 
// Any structure flag implies a square result.
588  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
589 
590  for( size_t j=0UL; j<N; ++j )
591  {
// [kbegin,kend) is the range of k admitted by the lower/upper structure of B for column j.
592  const size_t kbegin( ( IsLower<MT5>::value )
593  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
594  :( 0UL ) );
595  const size_t kend( ( IsUpper<MT5>::value )
596  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
597  :( K ) );
598  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
599 
// Strictly triangular B with an empty k range: the whole column j of C is reset.
600  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
601  for( size_t i=0UL; i<M; ++i ) {
602  reset( C(i,j) );
603  }
604  continue;
605  }
606 
// First contribution (k == kbegin): initializes column j by assignment and resets the rows
// outside [ibegin,iend) where the structure of the operands or the LOW/UPP flags call for it.
607  {
608  const size_t ibegin( ( IsLower<MT4>::value )
610  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
611  :( LOW ? max(j,kbegin) : kbegin ) )
612  :( LOW ? j : 0UL ) );
613  const size_t iend( ( IsUpper<MT4>::value )
615  ?( UPP ? min(j+1UL,kbegin) : kbegin )
616  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
617  :( UPP ? j+1UL : M ) );
618 
619  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
620  for( size_t i=0UL; i<ibegin; ++i ) {
621  reset( C(i,j) );
622  }
623  }
624  else if( IsStrictlyLower<MT4>::value ) {
625  reset( C(0UL,j) );
626  }
627  for( size_t i=ibegin; i<iend; ++i ) {
628  C(i,j) = A(i,kbegin) * B(kbegin,j);
629  }
630  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
631  for( size_t i=iend; i<M; ++i ) {
632  reset( C(i,j) );
633  }
634  }
635  else if( IsStrictlyUpper<MT4>::value ) {
636  reset( C(M-1UL,j) );
637  }
638  }
639 
// Remaining contributions (k > kbegin) are accumulated into column j.
640  for( size_t k=kbegin+1UL; k<kend; ++k )
641  {
642  const size_t ibegin( ( IsLower<MT4>::value )
644  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
645  :( SYM || HERM || LOW ? max( j, k ) : k ) )
646  :( SYM || HERM || LOW ? j : 0UL ) );
647  const size_t iend( ( IsUpper<MT4>::value )
649  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
650  :( UPP ? min(j+1UL,k) : k ) )
651  :( UPP ? j+1UL : M ) );
652 
// With a structure flag set the restricted row range may be empty for this k; skip it.
653  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
654  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
655 
656  for( size_t i=ibegin; i<iend; ++i ) {
657  C(i,j) += A(i,k) * B(k,j);
658  }
// For an upper A the element in row iend is assigned (not accumulated) at this k.
659  if( IsUpper<MT4>::value ) {
660  C(iend,j) = A(iend,k) * B(k,j);
661  }
662  }
663  }
664 
// Symmetric/Hermitian result: the loops above restricted rows to i >= j, so the elements above
// the diagonal are filled from their mirrored counterparts (conjugated for HERM).
665  if( SYM || HERM ) {
666  for( size_t j=1UL; j<N; ++j ) {
667  for( size_t i=0UL; i<j; ++i ) {
668  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
669  }
670  }
671  }
672  }
674  //**********************************************************************************************
675 
676  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default assignment kernel for C = A * B, enabled when A is not diagonal and B is diagonal.
//
// \param C The target matrix.
// \param A The left-hand side (non-diagonal) operand.
// \param B The right-hand side diagonal operand.
//
// Column j of C is column j of A scaled by the diagonal element B(j,j). Rows outside the range
// admitted by the triangular structure of A are reset.
// NOTE(review): the listing drops original line 696 at the start of the body.
690  template< typename MT3 // Type of the left-hand side target matrix
691  , typename MT4 // Type of the left-hand side matrix operand
692  , typename MT5 > // Type of the right-hand side matrix operand
693  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
694  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
695  {
697 
698  const size_t M( A.rows() );
699  const size_t N( B.columns() );
700 
701  for( size_t j=0UL; j<N; ++j )
702  {
// [ibegin,iend) is the row range of column j admitted by the lower/upper structure of A.
703  const size_t ibegin( ( IsLower<MT4>::value )
704  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
705  :( 0UL ) );
706  const size_t iend( ( IsUpper<MT4>::value )
707  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
708  :( M ) );
709  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
710 
// Lower A: rows above the admitted range are reset.
711  if( IsLower<MT4>::value ) {
712  for( size_t i=0UL; i<ibegin; ++i ) {
713  reset( C(i,j) );
714  }
715  }
716  for( size_t i=ibegin; i<iend; ++i ) {
717  C(i,j) = A(i,j) * B(j,j);
718  }
// Upper A: rows below the admitted range are reset.
719  if( IsUpper<MT4>::value ) {
720  for( size_t i=iend; i<M; ++i ) {
721  reset( C(i,j) );
722  }
723  }
724  }
725  }
727  //**********************************************************************************************
728 
729  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default assignment kernel for C = A * B, diagonal/general case (per the section banner).
//
// \param C The target matrix.
// \param A The left-hand side operand; only its diagonal elements A(i,i) are read.
// \param B The right-hand side operand.
//
// Row i of B is scaled by the diagonal element A(i,i). The row range per column is derived from
// the triangular structure of B (MT5).
// NOTE(review): the listing drops original line 746 (return type / enabling condition) and
// line 749. The reset guards below test IsLower/IsUpper on MT4 while the range comes from MT5;
// assuming MT4 is diagonal here, both guards hold and the reset loops are simply empty whenever
// B does not narrow the range -- confirm against the enabling condition.
743  template< typename MT3 // Type of the left-hand side target matrix
744  , typename MT4 // Type of the left-hand side matrix operand
745  , typename MT5 > // Type of the right-hand side matrix operand
747  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
748  {
750 
751  const size_t M( A.rows() );
752  const size_t N( B.columns() );
753 
754  for( size_t j=0UL; j<N; ++j )
755  {
// [ibegin,iend) is the row range of column j admitted by the lower/upper structure of B.
756  const size_t ibegin( ( IsLower<MT5>::value )
757  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
758  :( 0UL ) );
759  const size_t iend( ( IsUpper<MT5>::value )
760  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
761  :( M ) );
762  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
763 
764  if( IsLower<MT4>::value ) {
765  for( size_t i=0UL; i<ibegin; ++i ) {
766  reset( C(i,j) );
767  }
768  }
769  for( size_t i=ibegin; i<iend; ++i ) {
770  C(i,j) = A(i,i) * B(i,j);
771  }
772  if( IsUpper<MT4>::value ) {
773  for( size_t i=iend; i<M; ++i ) {
774  reset( C(i,j) );
775  }
776  }
777  }
778  }
780  //**********************************************************************************************
781 
782  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default assignment kernel for C = A * B, enabled when both A and B are diagonal.
//
// \param C The target matrix.
// \param A The left-hand side diagonal operand.
// \param B The right-hand side diagonal operand.
//
// The whole target is reset first; afterwards only the diagonal elements are written, each as
// the product of the two corresponding diagonal elements.
// NOTE(review): the listing drops original line 802 at the start of the body.
796  template< typename MT3 // Type of the left-hand side target matrix
797  , typename MT4 // Type of the left-hand side matrix operand
798  , typename MT5 > // Type of the right-hand side matrix operand
799  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
800  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
801  {
803 
804  reset( C );
805 
806  for( size_t i=0UL; i<A.rows(); ++i ) {
807  C(i,i) = A(i,i) * B(i,i);
808  }
809  }
811  //**********************************************************************************************
812 
813  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel, default variant: simply forwards to
// selectDefaultAssignKernel() with the same arguments.
//
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
//
// NOTE(review): the listing drops original line 830 (return type / enabling condition), so the
// conditions under which this overload is selected are not visible here.
827  template< typename MT3 // Type of the left-hand side target matrix
828  , typename MT4 // Type of the left-hand side matrix operand
829  , typename MT5 > // Type of the right-hand side matrix operand
831  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
832  {
833  selectDefaultAssignKernel( C, A, B );
834  }
836  //**********************************************************************************************
837 
838  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Small-matrix assignment kernel for a row-major target (DenseMatrix<MT3,false>).
//
// \param C The row-major target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
//
// Instead of multiplying directly, one operand is first evaluated serially into a temporary of
// its opposite storage order; the product with that temporary is passed through the
// ForwardFunctor (which re-applies the declared structure) and assigned to the target.
// NOTE(review): the listing drops original lines 856, 859-862, 866 and 870, i.e. the return
// type / enabling condition and the conditions of the first two branches. Only the last two
// branch conditions are visible -- confirm the first two against the actual header.
853  template< typename MT3 // Type of the left-hand side target matrix
854  , typename MT4 // Type of the left-hand side matrix operand
855  , typename MT5 > // Type of the right-hand side matrix operand
857  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
858  {
863 
864  const ForwardFunctor fwd;
865 
// Branch 1 (condition not visible): convert B.
867  const OppositeType_<MT5> tmp( serial( B ) );
868  assign( ~C, fwd( A * tmp ) );
869  }
// Branch 2 (condition not visible): convert A.
871  const OppositeType_<MT4> tmp( serial( A ) );
872  assign( ~C, fwd( tmp * B ) );
873  }
// Branch 3: B has no more elements than A, so B is the operand that gets converted.
874  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
875  const OppositeType_<MT5> tmp( serial( B ) );
876  assign( ~C, fwd( A * tmp ) );
877  }
// Branch 4: otherwise A is the smaller operand and gets converted.
878  else {
879  const OppositeType_<MT4> tmp( serial( A ) );
880  assign( ~C, fwd( tmp * B ) );
881  }
882  }
884  //**********************************************************************************************
885 
886  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
901  template< typename MT3 // Type of the left-hand side target matrix
902  , typename MT4 // Type of the left-hand side matrix operand
903  , typename MT5 > // Type of the right-hand side matrix operand
905  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
906  {
907  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
908 
909  const size_t M( A.rows() );
910  const size_t N( B.columns() );
911  const size_t K( A.columns() );
912 
913  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
914 
915  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
916  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
917 
918  if( LOW && UPP && M > SIMDSIZE*3UL ) {
919  reset( ~C );
920  }
921 
922  {
923  size_t i( 0UL );
924 
926  {
927  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
928  for( size_t j=0UL; j<N; ++j )
929  {
930  const size_t kbegin( ( IsLower<MT5>::value )
931  ?( ( IsUpper<MT4>::value )
932  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
933  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
934  :( IsUpper<MT4>::value ? i : 0UL ) );
935  const size_t kend( ( IsUpper<MT5>::value )
936  ?( ( IsLower<MT4>::value )
937  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
938  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
939  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
940 
941  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
942 
943  for( size_t k=kbegin; k<kend; ++k ) {
944  const SIMDType b1( set( B(k,j) ) );
945  xmm1 += A.load(i ,k) * b1;
946  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
947  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
948  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
949  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
950  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
951  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
952  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
953  }
954 
955  (~C).store( i , j, xmm1 );
956  (~C).store( i+SIMDSIZE , j, xmm2 );
957  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
958  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
959  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
960  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
961  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
962  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
963  }
964  }
965  }
966 
967  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
968  {
969  size_t j( 0UL );
970 
971  for( ; (j+2UL) <= N; j+=2UL )
972  {
973  const size_t kbegin( ( IsLower<MT5>::value )
974  ?( ( IsUpper<MT4>::value )
975  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
976  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
977  :( IsUpper<MT4>::value ? i : 0UL ) );
978  const size_t kend( ( IsUpper<MT5>::value )
979  ?( ( IsLower<MT4>::value )
980  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
981  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
982  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
983 
984  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
985 
986  for( size_t k=kbegin; k<kend; ++k ) {
987  const SIMDType a1( A.load(i ,k) );
988  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
989  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
990  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
991  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
992  const SIMDType b1( set( B(k,j ) ) );
993  const SIMDType b2( set( B(k,j+1UL) ) );
994  xmm1 += a1 * b1;
995  xmm2 += a2 * b1;
996  xmm3 += a3 * b1;
997  xmm4 += a4 * b1;
998  xmm5 += a5 * b1;
999  xmm6 += a1 * b2;
1000  xmm7 += a2 * b2;
1001  xmm8 += a3 * b2;
1002  xmm9 += a4 * b2;
1003  xmm10 += a5 * b2;
1004  }
1005 
1006  (~C).store( i , j , xmm1 );
1007  (~C).store( i+SIMDSIZE , j , xmm2 );
1008  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1009  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1010  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1011  (~C).store( i , j+1UL, xmm6 );
1012  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1013  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1014  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1015  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1016  }
1017 
1018  if( j < N )
1019  {
1020  const size_t kbegin( ( IsLower<MT5>::value )
1021  ?( ( IsUpper<MT4>::value )
1022  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1023  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1024  :( IsUpper<MT4>::value ? i : 0UL ) );
1025  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
1026 
1027  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1028 
1029  for( size_t k=kbegin; k<kend; ++k ) {
1030  const SIMDType b1( set( B(k,j) ) );
1031  xmm1 += A.load(i ,k) * b1;
1032  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1033  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1034  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1035  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1036  }
1037 
1038  (~C).store( i , j, xmm1 );
1039  (~C).store( i+SIMDSIZE , j, xmm2 );
1040  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1041  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1042  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1043  }
1044  }
1045 
1046  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1047  {
1048  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
1049  size_t j( UPP ? i : 0UL );
1050 
1051  for( ; (j+2UL) <= jend; j+=2UL )
1052  {
1053  const size_t kbegin( ( IsLower<MT5>::value )
1054  ?( ( IsUpper<MT4>::value )
1055  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1056  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1057  :( IsUpper<MT4>::value ? i : 0UL ) );
1058  const size_t kend( ( IsUpper<MT5>::value )
1059  ?( ( IsLower<MT4>::value )
1060  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1061  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1062  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
1063 
1064  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1065 
1066  for( size_t k=kbegin; k<kend; ++k ) {
1067  const SIMDType a1( A.load(i ,k) );
1068  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1069  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1070  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1071  const SIMDType b1( set( B(k,j ) ) );
1072  const SIMDType b2( set( B(k,j+1UL) ) );
1073  xmm1 += a1 * b1;
1074  xmm2 += a2 * b1;
1075  xmm3 += a3 * b1;
1076  xmm4 += a4 * b1;
1077  xmm5 += a1 * b2;
1078  xmm6 += a2 * b2;
1079  xmm7 += a3 * b2;
1080  xmm8 += a4 * b2;
1081  }
1082 
1083  (~C).store( i , j , xmm1 );
1084  (~C).store( i+SIMDSIZE , j , xmm2 );
1085  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1086  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1087  (~C).store( i , j+1UL, xmm5 );
1088  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1089  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1090  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1091  }
1092 
1093  if( j < jend )
1094  {
1095  const size_t kbegin( ( IsLower<MT5>::value )
1096  ?( ( IsUpper<MT4>::value )
1097  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1098  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1099  :( IsUpper<MT4>::value ? i : 0UL ) );
1100  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
1101 
1102  SIMDType xmm1, xmm2, xmm3, xmm4;
1103 
1104  for( size_t k=kbegin; k<kend; ++k ) {
1105  const SIMDType b1( set( B(k,j) ) );
1106  xmm1 += A.load(i ,k) * b1;
1107  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1108  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1109  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1110  }
1111 
1112  (~C).store( i , j, xmm1 );
1113  (~C).store( i+SIMDSIZE , j, xmm2 );
1114  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1115  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1116  }
1117  }
1118 
1119  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1120  {
1121  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
1122  size_t j( UPP ? i : 0UL );
1123 
1124  for( ; (j+2UL) <= jend; j+=2UL )
1125  {
1126  const size_t kbegin( ( IsLower<MT5>::value )
1127  ?( ( IsUpper<MT4>::value )
1128  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1129  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1130  :( IsUpper<MT4>::value ? i : 0UL ) );
1131  const size_t kend( ( IsUpper<MT5>::value )
1132  ?( ( IsLower<MT4>::value )
1133  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1134  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1135  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
1136 
1137  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1138 
1139  for( size_t k=kbegin; k<kend; ++k ) {
1140  const SIMDType a1( A.load(i ,k) );
1141  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1142  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1143  const SIMDType b1( set( B(k,j ) ) );
1144  const SIMDType b2( set( B(k,j+1UL) ) );
1145  xmm1 += a1 * b1;
1146  xmm2 += a2 * b1;
1147  xmm3 += a3 * b1;
1148  xmm4 += a1 * b2;
1149  xmm5 += a2 * b2;
1150  xmm6 += a3 * b2;
1151  }
1152 
1153  (~C).store( i , j , xmm1 );
1154  (~C).store( i+SIMDSIZE , j , xmm2 );
1155  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1156  (~C).store( i , j+1UL, xmm4 );
1157  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1158  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1159  }
1160 
1161  if( j < jend )
1162  {
1163  const size_t kbegin( ( IsLower<MT5>::value )
1164  ?( ( IsUpper<MT4>::value )
1165  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1166  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1167  :( IsUpper<MT4>::value ? i : 0UL ) );
1168  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
1169 
1170  SIMDType xmm1, xmm2, xmm3;
1171 
1172  for( size_t k=kbegin; k<kend; ++k ) {
1173  const SIMDType b1( set( B(k,j) ) );
1174  xmm1 += A.load(i ,k) * b1;
1175  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1176  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1177  }
1178 
1179  (~C).store( i , j, xmm1 );
1180  (~C).store( i+SIMDSIZE , j, xmm2 );
1181  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1182  }
1183  }
1184 
1185  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1186  {
1187  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
1188  size_t j( UPP ? i : 0UL );
1189 
1190  for( ; (j+4UL) <= jend; j+=4UL )
1191  {
1192  const size_t kbegin( ( IsLower<MT5>::value )
1193  ?( ( IsUpper<MT4>::value )
1194  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1195  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1196  :( IsUpper<MT4>::value ? i : 0UL ) );
1197  const size_t kend( ( IsUpper<MT5>::value )
1198  ?( ( IsLower<MT4>::value )
1199  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
1200  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
1201  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
1202 
1203  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1204 
1205  for( size_t k=kbegin; k<kend; ++k ) {
1206  const SIMDType a1( A.load(i ,k) );
1207  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1208  const SIMDType b1( set( B(k,j ) ) );
1209  const SIMDType b2( set( B(k,j+1UL) ) );
1210  const SIMDType b3( set( B(k,j+2UL) ) );
1211  const SIMDType b4( set( B(k,j+3UL) ) );
1212  xmm1 += a1 * b1;
1213  xmm2 += a2 * b1;
1214  xmm3 += a1 * b2;
1215  xmm4 += a2 * b2;
1216  xmm5 += a1 * b3;
1217  xmm6 += a2 * b3;
1218  xmm7 += a1 * b4;
1219  xmm8 += a2 * b4;
1220  }
1221 
1222  (~C).store( i , j , xmm1 );
1223  (~C).store( i+SIMDSIZE, j , xmm2 );
1224  (~C).store( i , j+1UL, xmm3 );
1225  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1226  (~C).store( i , j+2UL, xmm5 );
1227  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
1228  (~C).store( i , j+3UL, xmm7 );
1229  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
1230  }
1231 
1232  for( ; (j+3UL) <= jend; j+=3UL )
1233  {
1234  const size_t kbegin( ( IsLower<MT5>::value )
1235  ?( ( IsUpper<MT4>::value )
1236  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1237  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1238  :( IsUpper<MT4>::value ? i : 0UL ) );
1239  const size_t kend( ( IsUpper<MT5>::value )
1240  ?( ( IsLower<MT4>::value )
1241  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
1242  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
1243  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
1244 
1245  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1246 
1247  for( size_t k=kbegin; k<kend; ++k ) {
1248  const SIMDType a1( A.load(i ,k) );
1249  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1250  const SIMDType b1( set( B(k,j ) ) );
1251  const SIMDType b2( set( B(k,j+1UL) ) );
1252  const SIMDType b3( set( B(k,j+2UL) ) );
1253  xmm1 += a1 * b1;
1254  xmm2 += a2 * b1;
1255  xmm3 += a1 * b2;
1256  xmm4 += a2 * b2;
1257  xmm5 += a1 * b3;
1258  xmm6 += a2 * b3;
1259  }
1260 
1261  (~C).store( i , j , xmm1 );
1262  (~C).store( i+SIMDSIZE, j , xmm2 );
1263  (~C).store( i , j+1UL, xmm3 );
1264  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1265  (~C).store( i , j+2UL, xmm5 );
1266  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
1267  }
1268 
1269  for( ; (j+2UL) <= jend; j+=2UL )
1270  {
1271  const size_t kbegin( ( IsLower<MT5>::value )
1272  ?( ( IsUpper<MT4>::value )
1273  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1274  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1275  :( IsUpper<MT4>::value ? i : 0UL ) );
1276  const size_t kend( ( IsUpper<MT5>::value )
1277  ?( ( IsLower<MT4>::value )
1278  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1279  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1280  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
1281 
1282  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1283  size_t k( kbegin );
1284 
1285  for( ; (k+2UL) <= kend; k+=2UL ) {
1286  const SIMDType a1( A.load(i ,k ) );
1287  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
1288  const SIMDType a3( A.load(i ,k+1UL) );
1289  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
1290  const SIMDType b1( set( B(k ,j ) ) );
1291  const SIMDType b2( set( B(k ,j+1UL) ) );
1292  const SIMDType b3( set( B(k+1UL,j ) ) );
1293  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
1294  xmm1 += a1 * b1;
1295  xmm2 += a2 * b1;
1296  xmm3 += a1 * b2;
1297  xmm4 += a2 * b2;
1298  xmm5 += a3 * b3;
1299  xmm6 += a4 * b3;
1300  xmm7 += a3 * b4;
1301  xmm8 += a4 * b4;
1302  }
1303 
1304  for( ; k<kend; ++k ) {
1305  const SIMDType a1( A.load(i ,k) );
1306  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1307  const SIMDType b1( set( B(k,j ) ) );
1308  const SIMDType b2( set( B(k,j+1UL) ) );
1309  xmm1 += a1 * b1;
1310  xmm2 += a2 * b1;
1311  xmm3 += a1 * b2;
1312  xmm4 += a2 * b2;
1313  }
1314 
1315  (~C).store( i , j , xmm1+xmm5 );
1316  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
1317  (~C).store( i , j+1UL, xmm3+xmm7 );
1318  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
1319  }
1320 
1321  if( j < jend )
1322  {
1323  const size_t kbegin( ( IsLower<MT5>::value )
1324  ?( ( IsUpper<MT4>::value )
1325  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1326  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1327  :( IsUpper<MT4>::value ? i : 0UL ) );
1328  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
1329 
1330  SIMDType xmm1, xmm2, xmm3, xmm4;
1331  size_t k( kbegin );
1332 
1333  for( ; (k+2UL) <= kend; k+=2UL ) {
1334  const SIMDType b1( set( B(k ,j) ) );
1335  const SIMDType b2( set( B(k+1UL,j) ) );
1336  xmm1 += A.load(i ,k ) * b1;
1337  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
1338  xmm3 += A.load(i ,k+1UL) * b2;
1339  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
1340  }
1341 
1342  for( ; k<kend; ++k ) {
1343  const SIMDType b1( set( B(k,j) ) );
1344  xmm1 += A.load(i ,k) * b1;
1345  xmm2 += A.load(i+SIMDSIZE,k) * b1;
1346  }
1347 
1348  (~C).store( i , j, xmm1+xmm3 );
1349  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
1350  }
1351  }
1352 
1353  for( ; i<ipos; i+=SIMDSIZE )
1354  {
1355  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
1356  size_t j( UPP ? i : 0UL );
1357 
1358  for( ; (j+4UL) <= jend; j+=4UL )
1359  {
1360  const size_t kbegin( ( IsLower<MT5>::value )
1361  ?( ( IsUpper<MT4>::value )
1362  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1363  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1364  :( IsUpper<MT4>::value ? i : 0UL ) );
1365  const size_t kend( ( IsUpper<MT5>::value )
1366  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
1367  :( K ) );
1368 
1369  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1370  size_t k( kbegin );
1371 
1372  for( ; (k+2UL) <= kend; k+=2UL ) {
1373  const SIMDType a1( A.load(i,k ) );
1374  const SIMDType a2( A.load(i,k+1UL) );
1375  xmm1 += a1 * set( B(k ,j ) );
1376  xmm2 += a1 * set( B(k ,j+1UL) );
1377  xmm3 += a1 * set( B(k ,j+2UL) );
1378  xmm4 += a1 * set( B(k ,j+3UL) );
1379  xmm5 += a2 * set( B(k+1UL,j ) );
1380  xmm6 += a2 * set( B(k+1UL,j+1UL) );
1381  xmm7 += a2 * set( B(k+1UL,j+2UL) );
1382  xmm8 += a2 * set( B(k+1UL,j+3UL) );
1383  }
1384 
1385  for( ; k<kend; ++k ) {
1386  const SIMDType a1( A.load(i,k) );
1387  xmm1 += a1 * set( B(k,j ) );
1388  xmm2 += a1 * set( B(k,j+1UL) );
1389  xmm3 += a1 * set( B(k,j+2UL) );
1390  xmm4 += a1 * set( B(k,j+3UL) );
1391  }
1392 
1393  (~C).store( i, j , xmm1+xmm5 );
1394  (~C).store( i, j+1UL, xmm2+xmm6 );
1395  (~C).store( i, j+2UL, xmm3+xmm7 );
1396  (~C).store( i, j+3UL, xmm4+xmm8 );
1397  }
1398 
1399  for( ; (j+3UL) <= jend; j+=3UL )
1400  {
1401  const size_t kbegin( ( IsLower<MT5>::value )
1402  ?( ( IsUpper<MT4>::value )
1403  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1404  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1405  :( IsUpper<MT4>::value ? i : 0UL ) );
1406  const size_t kend( ( IsUpper<MT5>::value )
1407  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
1408  :( K ) );
1409 
1410  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1411  size_t k( kbegin );
1412 
1413  for( ; (k+2UL) <= kend; k+=2UL ) {
1414  const SIMDType a1( A.load(i,k ) );
1415  const SIMDType a2( A.load(i,k+1UL) );
1416  xmm1 += a1 * set( B(k ,j ) );
1417  xmm2 += a1 * set( B(k ,j+1UL) );
1418  xmm3 += a1 * set( B(k ,j+2UL) );
1419  xmm4 += a2 * set( B(k+1UL,j ) );
1420  xmm5 += a2 * set( B(k+1UL,j+1UL) );
1421  xmm6 += a2 * set( B(k+1UL,j+2UL) );
1422  }
1423 
1424  for( ; k<kend; ++k ) {
1425  const SIMDType a1( A.load(i,k) );
1426  xmm1 += a1 * set( B(k,j ) );
1427  xmm2 += a1 * set( B(k,j+1UL) );
1428  xmm3 += a1 * set( B(k,j+2UL) );
1429  }
1430 
1431  (~C).store( i, j , xmm1+xmm4 );
1432  (~C).store( i, j+1UL, xmm2+xmm5 );
1433  (~C).store( i, j+2UL, xmm3+xmm6 );
1434  }
1435 
1436  for( ; (j+2UL) <= jend; j+=2UL )
1437  {
1438  const size_t kbegin( ( IsLower<MT5>::value )
1439  ?( ( IsUpper<MT4>::value )
1440  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1441  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1442  :( IsUpper<MT4>::value ? i : 0UL ) );
1443  const size_t kend( ( IsUpper<MT5>::value )
1444  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1445  :( K ) );
1446 
1447  SIMDType xmm1, xmm2, xmm3, xmm4;
1448  size_t k( kbegin );
1449 
1450  for( ; (k+2UL) <= kend; k+=2UL ) {
1451  const SIMDType a1( A.load(i,k ) );
1452  const SIMDType a2( A.load(i,k+1UL) );
1453  xmm1 += a1 * set( B(k ,j ) );
1454  xmm2 += a1 * set( B(k ,j+1UL) );
1455  xmm3 += a2 * set( B(k+1UL,j ) );
1456  xmm4 += a2 * set( B(k+1UL,j+1UL) );
1457  }
1458 
1459  for( ; k<kend; ++k ) {
1460  const SIMDType a1( A.load(i,k) );
1461  xmm1 += a1 * set( B(k,j ) );
1462  xmm2 += a1 * set( B(k,j+1UL) );
1463  }
1464 
1465  (~C).store( i, j , xmm1+xmm3 );
1466  (~C).store( i, j+1UL, xmm2+xmm4 );
1467  }
1468 
1469  if( j < jend )
1470  {
1471  const size_t kbegin( ( IsLower<MT5>::value )
1472  ?( ( IsUpper<MT4>::value )
1473  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1474  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1475  :( IsUpper<MT4>::value ? i : 0UL ) );
1476 
1477  SIMDType xmm1, xmm2;
1478  size_t k( kbegin );
1479 
1480  for( ; (k+2UL) <= K; k+=2UL ) {
1481  xmm1 += A.load(i,k ) * set( B(k ,j) );
1482  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
1483  }
1484 
1485  for( ; k<K; ++k ) {
1486  xmm1 += A.load(i,k) * set( B(k,j) );
1487  }
1488 
1489  (~C).store( i, j, xmm1+xmm2 );
1490  }
1491  }
1492 
1493  for( ; remainder && i<M; ++i )
1494  {
1495  size_t j( LOW && UPP ? i : 0UL );
1496 
1497  for( ; (j+2UL) <= N; j+=2UL )
1498  {
1499  const size_t kbegin( ( IsLower<MT5>::value )
1500  ?( ( IsUpper<MT4>::value )
1501  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1502  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1503  :( IsUpper<MT4>::value ? i : 0UL ) );
1504  const size_t kend( ( IsUpper<MT5>::value )
1505  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1506  :( K ) );
1507 
1508  ElementType value1 = ElementType();
1509  ElementType value2 = ElementType();
1510 
1511  for( size_t k=kbegin; k<kend; ++k ) {
1512  value1 += A(i,k) * B(k,j );
1513  value2 += A(i,k) * B(k,j+1UL);
1514  }
1515 
1516  (~C)(i,j ) = value1;
1517  (~C)(i,j+1UL) = value2;
1518  }
1519 
1520  if( j < N )
1521  {
1522  const size_t kbegin( ( IsLower<MT5>::value )
1523  ?( ( IsUpper<MT4>::value )
1524  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1525  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1526  :( IsUpper<MT4>::value ? i : 0UL ) );
1527 
1528  ElementType value = ElementType();
1529 
1530  for( size_t k=kbegin; k<K; ++k ) {
1531  value += A(i,k) * B(k,j);
1532  }
1533 
1534  (~C)(i,j) = value;
1535  }
1536  }
1537  }
1538 
1539  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
1540  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1541  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1542  for( size_t i=0UL; i<iend; ++i ) {
1543  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
1544  }
1545  }
1546  }
1547  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
1548  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1549  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1550  for( size_t i=0UL; i<iend; ++i ) {
1551  reset( (~C)(i,j) );
1552  }
1553  }
1554  }
1555  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
1556  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1557  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1558  for( size_t j=0UL; j<jend; ++j ) {
1559  reset( (~C)(i,j) );
1560  }
1561  }
1562  }
1563  }
1565  //**********************************************************************************************
1566 
1567  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel, default (non-vectorized) variant.
// It performs no work of its own: the call is forwarded unchanged to
// selectDefaultAssignKernel( C, A, B ).
// NOTE(review): the declaration line that carries the return type / enabling
// condition (listing line 1584) is not present in this listing - confirm it
// against the original header.
1581  template< typename MT3 // Type of the left-hand side target matrix
1582  , typename MT4 // Type of the left-hand side matrix operand
1583  , typename MT5 > // Type of the right-hand side matrix operand
1585  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1586  {
1587  selectDefaultAssignKernel( C, A, B );
1588  }
1590  //**********************************************************************************************
1591 
1592  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Large-matrix assignment kernel, vectorized variant.
// Dispatches to one of the matrix-matrix-multiplication routines according to
// the expression flags. The flags are tested in a fixed priority order:
// SYM -> smmm, HERM -> hmmm, LOW -> lmmm, UPP -> ummm, otherwise -> mmm.
// NOTE(review): ElementType(1) and ElementType(0) are presumably the scaling
// factors for the product and for the previous content of C (i.e. a plain
// assignment C = A*B) - confirm against blaze/math/dense/MMM.h.
// NOTE(review): the declaration line with the return type / enabling condition
// (listing line 1610) is not present in this listing.
1607  template< typename MT3 // Type of the left-hand side target matrix
1608  , typename MT4 // Type of the left-hand side matrix operand
1609  , typename MT5 > // Type of the right-hand side matrix operand
1611  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1612  {
// Symmetric and Hermitian results take precedence over the triangular flags.
1613  if( SYM )
1614  smmm( C, A, B, ElementType(1) );
1615  else if( HERM )
1616  hmmm( C, A, B, ElementType(1) );
1617  else if( LOW )
1618  lmmm( C, A, B, ElementType(1), ElementType(0) );
1619  else if( UPP )
1620  ummm( C, A, B, ElementType(1), ElementType(0) );
1621  else
1622  mmm( C, A, B, ElementType(1), ElementType(0) );
1623  }
1625  //**********************************************************************************************
1626 
1627  //**BLAS-based assignment to dense matrices (default)*******************************************
// BLAS assignment kernel, default variant.
// No BLAS routine is called here: the call is forwarded unchanged to
// selectLargeAssignKernel( C, A, B ).
// NOTE(review): the declaration line with the return type / enabling condition
// (listing line 1644) is not present in this listing - confirm it against the
// original header.
1641  template< typename MT3 // Type of the left-hand side target matrix
1642  , typename MT4 // Type of the left-hand side matrix operand
1643  , typename MT5 > // Type of the right-hand side matrix operand
1645  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1646  {
1647  selectLargeAssignKernel( C, A, B );
1648  }
1650  //**********************************************************************************************
1651 
1652  //**BLAS-based assignment to dense matrices*****************************************************
1653 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1654 
// BLAS-based assignment kernel. It is only compiled when both BLAZE_BLAS_MODE
// and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled (see the
// surrounding #if / #endif).
// Three cases are distinguished at compile time:
//  - A is triangular: C is first set to B, then trmm is applied with A on the
//    left (CblasLeft);
//  - B is triangular: C is first set to A, then trmm is applied with B on the
//    right (CblasRight);
//  - otherwise: a general gemm call is used.
// NOTE(review): the declaration line with the return type / enabling condition
// (listing line 1670) is not present in this listing.
1667  template< typename MT3 // Type of the left-hand side target matrix
1668  , typename MT4 // Type of the left-hand side matrix operand
1669  , typename MT5 > // Type of the right-hand side matrix operand
1671  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1672  {
1673  using ET = ElementType_<MT3>;
1674 
1675  if( IsTriangular<MT4>::value ) {
// Copy the other operand into C, then multiply in place by the triangular A.
// The Lower/Upper flag is chosen from IsLower<MT4>.
// NOTE(review): presumably trmm computes C = 1 * A * C - confirm in blas/trmm.h.
1676  assign( C, B );
1677  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1678  }
1679  else if( IsTriangular<MT5>::value ) {
// Mirror case: C starts as A and the triangular B is applied from the right.
1680  assign( C, A );
1681  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1682  }
1683  else {
// NOTE(review): ET(1) / ET(0) are presumably alpha / beta of the gemm call,
// i.e. C = A*B with the previous content of C ignored - confirm in blas/gemm.h.
1684  gemm( C, A, B, ET(1), ET(0) );
1685  }
1686  }
1688 #endif
1689  //**********************************************************************************************
1690 
1691  //**Assignment to sparse matrices***************************************************************
// Assignment of the multiplication expression to a sparse matrix.
// The expression is first evaluated serially into a temporary of type TmpType,
// and that temporary (passed through ForwardFunctor) is then assigned to the
// sparse target.
// NOTE(review): the return-type line (listing line 1706) and the lines between
// 1708 and 1720 (presumably the TmpType definition and compile-time checks) are
// not present in this listing; TmpType and ForwardFunctor are declared elsewhere.
1704  template< typename MT // Type of the target sparse matrix
1705  , bool SO > // Storage order of the target sparse matrix
1707  assign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1708  {
1710 
1712 
1719 
// The target must already have the dimensions of the expression.
1720  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1721  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1722 
1723  const ForwardFunctor fwd;
1724 
// serial( rhs ) requests a serial evaluation of the product into the temporary.
1725  const TmpType tmp( serial( rhs ) );
1726  assign( ~lhs, fwd( tmp ) );
1727  }
1729  //**********************************************************************************************
1730 
1731  //**Restructuring assignment to row-major matrices**********************************************
// Restructuring assignment of the multiplication expression to a row-major
// matrix (Matrix<MT,false>).
// Instead of evaluating lhs_ * rhs_ directly, the product is rebuilt with
// transposed operands and the rebuilt expression is assigned to the target:
//  - first branch:              trans( lhs_ ) * trans( rhs_ )
//  - IsSymmetric<MT1> branch:   trans( lhs_ ) * rhs_
//  - fallback:                  lhs_ * trans( rhs_ )
// NOTE(review): the leading 'if' condition of the first branch (listing line
// 1759) and the return-type line (listing line 1747) are not present in this
// listing. The first branch presumably requires both operands to be symmetric -
// confirm against the original header.
1746  template< typename MT > // Type of the target matrix
1748  assign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
1749  {
1751 
1753 
1754  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1755  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1756 
1757  const ForwardFunctor fwd;
1758 
1760  assign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
1761  else if( IsSymmetric<MT1>::value )
1762  assign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
1763  else
1764  assign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
1765  }
1767  //**********************************************************************************************
1768 
1769  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the multiplication expression to a dense matrix.
// Steps: check dimensions, return early for empty operands, evaluate both
// operands serially into A and B, re-check all dimensions, then hand over to
// selectAddAssignKernel for the actual computation.
// NOTE(review): the return-type line (listing line 1784) is not present in this
// listing; LT and RT are declared elsewhere in the class.
1782  template< typename MT // Type of the target dense matrix
1783  , bool SO > // Storage order of the target dense matrix
1785  addAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1786  {
1788 
1789  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1790  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1791 
// Nothing to add if the target has no rows or columns, or if the inner
// dimension of the product is zero.
1792  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1793  return;
1794  }
1795 
1796  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1797  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1798 
// The evaluated operands must keep the dimensions of the original operands and
// must be compatible with the target.
1799  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1800  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1801  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1802  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1803  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1804  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1805 
1806  TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1807  }
1809  //**********************************************************************************************
1810 
1811  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment C += A * B.
// The small-matrix kernel is chosen if any of the following holds:
//  - A is a diagonal matrix type,
//  - the build is not in debug mode and A has at most SIMDSIZE*10 rows,
//  - the number of elements of C is below TDMATTDMATMULT_THRESHOLD.
// In all other cases the BLAS add-assign kernel is used.
1822  template< typename MT3 // Type of the left-hand side target matrix
1823  , typename MT4 // Type of the left-hand side matrix operand
1824  , typename MT5 > // Type of the right-hand side matrix operand
1825  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1826  {
1827  if( ( IsDiagonal<MT4>::value ) ||
1828  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
1829  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1830  selectSmallAddAssignKernel( C, A, B );
1831  else
1832  selectBlasAddAssignKernel( C, A, B );
1833  }
1835  //**********************************************************************************************
1836 
1837  //**Default addition assignment to dense matrices (general/general)*****************************
// Default (scalar) addition assignment kernel C += A * B for the case that
// neither A nor B is a diagonal matrix type.
// Loop order is j (column of C/B), k (inner dimension), i (row of C/A). The k
// and i ranges are narrowed at compile time from the triangular structure of
// the operands and from the LOW/UPP flags of the expression, so that elements
// known to be zero are not visited.
// NOTE(review): one line of each of the nested ibegin / iend conditionals
// (listing lines 1876 and 1881) is not present in this listing; they presumably
// select the strictly-lower / strictly-upper case of A - confirm against the
// original header.
1851  template< typename MT3 // Type of the left-hand side target matrix
1852  , typename MT4 // Type of the left-hand side matrix operand
1853  , typename MT5 > // Type of the right-hand side matrix operand
1854  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
1855  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1856  {
1857  const size_t M( A.rows() );
1858  const size_t N( B.columns() );
1859  const size_t K( A.columns() );
1860 
// A lower or upper result is only valid for a square product.
1861  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1862 
1863  for( size_t j=0UL; j<N; ++j )
1864  {
// Range of k for column j of B: a lower B starts at the diagonal (one past it
// if strictly lower), an upper B ends at the diagonal (excluding it if strictly
// upper); a general B uses the full range [0,K).
1865  const size_t kbegin( ( IsLower<MT5>::value )
1866  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1867  :( 0UL ) );
1868  const size_t kend( ( IsUpper<MT5>::value )
1869  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1870  :( K ) );
1871  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1872 
1873  for( size_t k=kbegin; k<kend; ++k )
1874  {
// Range of i for column k of A, additionally clamped to row j when the result
// is lower (LOW: i >= j) or upper (UPP: i <= j).
1875  const size_t ibegin( ( IsLower<MT4>::value )
1877  ?( LOW ? max(j,k+1UL) : k+1UL )
1878  :( LOW ? max(j,k) : k ) )
1879  :( LOW ? j : 0UL ) );
1880  const size_t iend( ( IsUpper<MT4>::value )
1882  ?( UPP ? min(j+1UL,k) : k )
1883  :( UPP ? min(j,k)+1UL : k+1UL ) )
1884  :( UPP ? j+1UL : M ) );
1885 
// With the LOW/UPP clamping the range may be empty or inverted; skip it before
// the unsigned subtraction below.
1886  if( ( LOW || UPP ) && ibegin >= iend ) continue;
1887  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1888 
// inum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an
// even number; ipos is the end of the part handled two rows at a time.
1889  const size_t inum( iend - ibegin );
1890  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1891 
1892  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1893  C(i ,j) += A(i ,k) * B(k,j);
1894  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1895  }
// Remaining single row when the count is odd.
1896  if( ipos < iend ) {
1897  C(ipos,j) += A(ipos,k) * B(k,j);
1898  }
1899  }
1900  }
1901  }
1903  //**********************************************************************************************
1904 
1905  //**Default addition assignment to dense matrices (general/diagonal)****************************
// Default (scalar) addition assignment kernel C += A * B for a non-diagonal A
// and a diagonal B.
// Only the diagonal element B(j,j) is read for column j, so each column of C is
// updated as C(i,j) += A(i,j) * B(j,j). The row range is narrowed from the
// triangular structure of A.
// NOTE(review): listing line 1925 is not present in this listing (presumably a
// compile-time check on MT5) - confirm against the original header.
1919  template< typename MT3 // Type of the left-hand side target matrix
1920  , typename MT4 // Type of the left-hand side matrix operand
1921  , typename MT5 > // Type of the right-hand side matrix operand
1922  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
1923  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1924  {
1926 
1927  const size_t M( A.rows() );
1928  const size_t N( B.columns() );
1929 
1930  for( size_t j=0UL; j<N; ++j )
1931  {
// Rows of column j of A that need to be visited: a lower A starts at the
// diagonal (one past it if strictly lower), an upper A ends at the diagonal
// (excluding it if strictly upper); a general A uses all rows [0,M).
1932  const size_t ibegin( ( IsLower<MT4>::value )
1933  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
1934  :( 0UL ) );
1935  const size_t iend( ( IsUpper<MT4>::value )
1936  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
1937  :( M ) );
1938  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1939 
// Round the row count down to an even number for the two-row unrolled loop.
1940  const size_t inum( iend - ibegin );
1941  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1942 
1943  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1944  C(i ,j) += A(i ,j) * B(j,j);
1945  C(i+1UL,j) += A(i+1UL,j) * B(j,j);
1946  }
// Remaining single row when the count is odd.
1947  if( ipos < iend ) {
1948  C(ipos,j) += A(ipos,j) * B(j,j);
1949  }
1950  }
1951  }
1953  //**********************************************************************************************
1954 
1955  //**Default addition assignment to dense matrices (diagonal/general)****************************
// Default (scalar) addition assignment kernel C += A * B for a diagonal A and a
// non-diagonal B.
// Only the diagonal element A(i,i) is read for row i, so each element of C is
// updated as C(i,j) += A(i,i) * B(i,j). The row range per column is narrowed
// from the triangular structure of B.
// NOTE(review): listing line 1975 is not present in this listing (presumably a
// compile-time check on MT4) - confirm against the original header.
1969  template< typename MT3 // Type of the left-hand side target matrix
1970  , typename MT4 // Type of the left-hand side matrix operand
1971  , typename MT5 > // Type of the right-hand side matrix operand
1972  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
1973  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1974  {
1976 
1977  const size_t M( A.rows() );
1978  const size_t N( B.columns() );
1979 
1980  for( size_t j=0UL; j<N; ++j )
1981  {
// Rows of column j of B that need to be visited: a lower B starts at the
// diagonal (one past it if strictly lower), an upper B ends at the diagonal
// (excluding it if strictly upper); a general B uses all rows [0,M).
1982  const size_t ibegin( ( IsLower<MT5>::value )
1983  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1984  :( 0UL ) );
1985  const size_t iend( ( IsUpper<MT5>::value )
1986  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1987  :( M ) );
1988  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1989 
// Round the row count down to an even number for the two-row unrolled loop.
1990  const size_t inum( iend - ibegin );
1991  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1992 
1993  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1994  C(i ,j) += A(i ,i ) * B(i ,j);
1995  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
1996  }
// Remaining single row when the count is odd.
1997  if( ipos < iend ) {
1998  C(ipos,j) += A(ipos,ipos) * B(ipos,j);
1999  }
2000  }
2001  }
2003  //**********************************************************************************************
2004 
2005  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default (scalar) addition assignment kernel C += A * B for two diagonal
// operands.
// Only the diagonal is touched: C(i,i) += A(i,i) * B(i,i) for every row i of A.
// NOTE(review): listing line 2025 is not present in this listing (presumably
// compile-time checks on MT4 / MT5) - confirm against the original header.
2019  template< typename MT3 // Type of the left-hand side target matrix
2020  , typename MT4 // Type of the left-hand side matrix operand
2021  , typename MT5 > // Type of the right-hand side matrix operand
2022  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2023  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2024  {
2026 
2027  for( size_t i=0UL; i<A.rows(); ++i ) {
2028  C(i,i) += A(i,i) * B(i,i);
2029  }
2030  }
2032  //**********************************************************************************************
2033 
2034  //**Default addition assignment to dense matrices (small matrices)******************************
// Small-matrix addition assignment kernel, default (non-vectorized) variant.
// It performs no work of its own: the call is forwarded unchanged to
// selectDefaultAddAssignKernel( C, A, B ).
// NOTE(review): the declaration line with the return type / enabling condition
// (listing line 2051) is not present in this listing - confirm it against the
// original header.
2048  template< typename MT3 // Type of the left-hand side target matrix
2049  , typename MT4 // Type of the left-hand side matrix operand
2050  , typename MT5 > // Type of the right-hand side matrix operand
2052  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2053  {
2054  selectDefaultAddAssignKernel( C, A, B );
2055  }
2057  //**********************************************************************************************
2058 
2059  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Small-matrix addition assignment kernel for a row-major target
// (DenseMatrix<MT3,false>), vectorized variant.
// No element loops are written here. One of the two operands is converted into
// its opposite storage order (OppositeType_) by a serial evaluation, and the
// resulting mixed-storage-order product is added to C through addAssign.
// NOTE(review): the conditions of the first two branches (listing lines 2087
// and 2091), the return-type line (2077) and lines 2080-2083 are not present in
// this listing, so the criteria for those two branches cannot be stated from
// here - confirm against the original header.
2074  template< typename MT3 // Type of the left-hand side target matrix
2075  , typename MT4 // Type of the left-hand side matrix operand
2076  , typename MT5 > // Type of the right-hand side matrix operand
2078  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2079  {
2084 
2085  const ForwardFunctor fwd;
2086 
// First branch (condition not visible): convert B, keep A.
2088  const OppositeType_<MT5> tmp( serial( B ) );
2089  addAssign( ~C, fwd( A * tmp ) );
2090  }
// Second branch (condition not visible): convert A, keep B.
2092  const OppositeType_<MT4> tmp( serial( A ) );
2093  addAssign( ~C, fwd( tmp * B ) );
2094  }
// Otherwise convert whichever operand has fewer elements; on a tie B is
// converted.
2095  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2096  const OppositeType_<MT5> tmp( serial( B ) );
2097  addAssign( ~C, fwd( A * tmp ) );
2098  }
2099  else {
2100  const OppositeType_<MT4> tmp( serial( A ) );
2101  addAssign( ~C, fwd( tmp * B ) );
2102  }
2103  }
2105  //**********************************************************************************************
2106 
2107  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2122  template< typename MT3 // Type of the left-hand side target matrix
2123  , typename MT4 // Type of the left-hand side matrix operand
2124  , typename MT5 > // Type of the right-hand side matrix operand
2126  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2127  {
2128  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
2129 
2130  const size_t M( A.rows() );
2131  const size_t N( B.columns() );
2132  const size_t K( A.columns() );
2133 
2134  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2135 
2136  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
2137  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
2138 
2139  size_t i( 0UL );
2140 
2142  {
2143  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
2144  for( size_t j=0UL; j<N; ++j )
2145  {
2146  const size_t kbegin( ( IsLower<MT5>::value )
2147  ?( ( IsUpper<MT4>::value )
2148  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2149  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2150  :( IsUpper<MT4>::value ? i : 0UL ) );
2151  const size_t kend( ( IsUpper<MT5>::value )
2152  ?( ( IsLower<MT4>::value )
2153  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2154  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
2155  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
2156 
2157  SIMDType xmm1( (~C).load(i ,j) );
2158  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2159  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2160  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2161  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
2162  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
2163  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
2164  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
2165 
2166  for( size_t k=kbegin; k<kend; ++k ) {
2167  const SIMDType b1( set( B(k,j) ) );
2168  xmm1 += A.load(i ,k) * b1;
2169  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2170  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2171  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2172  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2173  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
2174  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
2175  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
2176  }
2177 
2178  (~C).store( i , j, xmm1 );
2179  (~C).store( i+SIMDSIZE , j, xmm2 );
2180  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2181  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2182  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
2183  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
2184  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
2185  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
2186  }
2187  }
2188  }
2189 
2190  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
2191  {
2192  size_t j( 0UL );
2193 
2194  for( ; (j+2UL) <= N; j+=2UL )
2195  {
2196  const size_t kbegin( ( IsLower<MT5>::value )
2197  ?( ( IsUpper<MT4>::value )
2198  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2199  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2200  :( IsUpper<MT4>::value ? i : 0UL ) );
2201  const size_t kend( ( IsUpper<MT5>::value )
2202  ?( ( IsLower<MT4>::value )
2203  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2204  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2205  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
2206 
2207  SIMDType xmm1 ( (~C).load(i ,j ) );
2208  SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
2209  SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
2210  SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
2211  SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
2212  SIMDType xmm6 ( (~C).load(i ,j+1UL) );
2213  SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
2214  SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2215  SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2216  SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
2217 
2218  for( size_t k=kbegin; k<kend; ++k ) {
2219  const SIMDType a1( A.load(i ,k) );
2220  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2221  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2222  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2223  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
2224  const SIMDType b1( set( B(k,j ) ) );
2225  const SIMDType b2( set( B(k,j+1UL) ) );
2226  xmm1 += a1 * b1;
2227  xmm2 += a2 * b1;
2228  xmm3 += a3 * b1;
2229  xmm4 += a4 * b1;
2230  xmm5 += a5 * b1;
2231  xmm6 += a1 * b2;
2232  xmm7 += a2 * b2;
2233  xmm8 += a3 * b2;
2234  xmm9 += a4 * b2;
2235  xmm10 += a5 * b2;
2236  }
2237 
2238  (~C).store( i , j , xmm1 );
2239  (~C).store( i+SIMDSIZE , j , xmm2 );
2240  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2241  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2242  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
2243  (~C).store( i , j+1UL, xmm6 );
2244  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
2245  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
2246  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
2247  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
2248  }
2249 
2250  if( j < N )
2251  {
2252  const size_t kbegin( ( IsLower<MT5>::value )
2253  ?( ( IsUpper<MT4>::value )
2254  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2255  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2256  :( IsUpper<MT4>::value ? i : 0UL ) );
2257  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
2258 
2259  SIMDType xmm1( (~C).load(i ,j) );
2260  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2261  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2262  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2263  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
2264 
2265  for( size_t k=kbegin; k<kend; ++k ) {
2266  const SIMDType b1( set( B(k,j) ) );
2267  xmm1 += A.load(i ,k) * b1;
2268  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2269  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2270  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2271  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2272  }
2273 
2274  (~C).store( i , j, xmm1 );
2275  (~C).store( i+SIMDSIZE , j, xmm2 );
2276  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2277  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2278  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
2279  }
2280  }
2281 
2282  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2283  {
2284  size_t j( 0UL );
2285 
2286  for( ; (j+2UL) <= N; j+=2UL )
2287  {
2288  const size_t kbegin( ( IsLower<MT5>::value )
2289  ?( ( IsUpper<MT4>::value )
2290  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2291  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2292  :( IsUpper<MT4>::value ? i : 0UL ) );
2293  const size_t kend( ( IsUpper<MT5>::value )
2294  ?( ( IsLower<MT4>::value )
2295  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2296  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2297  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
2298 
2299  SIMDType xmm1( (~C).load(i ,j ) );
2300  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2301  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2302  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
2303  SIMDType xmm5( (~C).load(i ,j+1UL) );
2304  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
2305  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2306  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2307 
2308  for( size_t k=kbegin; k<kend; ++k ) {
2309  const SIMDType a1( A.load(i ,k) );
2310  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2311  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2312  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2313  const SIMDType b1( set( B(k,j ) ) );
2314  const SIMDType b2( set( B(k,j+1UL) ) );
2315  xmm1 += a1 * b1;
2316  xmm2 += a2 * b1;
2317  xmm3 += a3 * b1;
2318  xmm4 += a4 * b1;
2319  xmm5 += a1 * b2;
2320  xmm6 += a2 * b2;
2321  xmm7 += a3 * b2;
2322  xmm8 += a4 * b2;
2323  }
2324 
2325  (~C).store( i , j , xmm1 );
2326  (~C).store( i+SIMDSIZE , j , xmm2 );
2327  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2328  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2329  (~C).store( i , j+1UL, xmm5 );
2330  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
2331  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2332  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
2333  }
2334 
2335  if( j < N )
2336  {
2337  const size_t kbegin( ( IsLower<MT5>::value )
2338  ?( ( IsUpper<MT4>::value )
2339  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2340  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2341  :( IsUpper<MT4>::value ? i : 0UL ) );
2342  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
2343 
2344  SIMDType xmm1( (~C).load(i ,j) );
2345  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2346  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2347  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2348 
2349  for( size_t k=kbegin; k<kend; ++k ) {
2350  const SIMDType b1( set( B(k,j) ) );
2351  xmm1 += A.load(i ,k) * b1;
2352  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2353  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2354  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2355  }
2356 
2357  (~C).store( i , j, xmm1 );
2358  (~C).store( i+SIMDSIZE , j, xmm2 );
2359  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2360  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2361  }
2362  }
2363 
2364  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2365  {
2366  size_t j( 0UL );
2367 
2368  for( ; (j+2UL) <= N; j+=2UL )
2369  {
2370  const size_t kbegin( ( IsLower<MT5>::value )
2371  ?( ( IsUpper<MT4>::value )
2372  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2373  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2374  :( IsUpper<MT4>::value ? i : 0UL ) );
2375  const size_t kend( ( IsUpper<MT5>::value )
2376  ?( ( IsLower<MT4>::value )
2377  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2378  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2379  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
2380 
2381  SIMDType xmm1( (~C).load(i ,j ) );
2382  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2383  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2384  SIMDType xmm4( (~C).load(i ,j+1UL) );
2385  SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
2386  SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2387 
2388  for( size_t k=kbegin; k<kend; ++k ) {
2389  const SIMDType a1( A.load(i ,k) );
2390  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2391  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2392  const SIMDType b1( set( B(k,j ) ) );
2393  const SIMDType b2( set( B(k,j+1UL) ) );
2394  xmm1 += a1 * b1;
2395  xmm2 += a2 * b1;
2396  xmm3 += a3 * b1;
2397  xmm4 += a1 * b2;
2398  xmm5 += a2 * b2;
2399  xmm6 += a3 * b2;
2400  }
2401 
2402  (~C).store( i , j , xmm1 );
2403  (~C).store( i+SIMDSIZE , j , xmm2 );
2404  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2405  (~C).store( i , j+1UL, xmm4 );
2406  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
2407  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2408  }
2409 
2410  if( j < N )
2411  {
2412  const size_t kbegin( ( IsLower<MT5>::value )
2413  ?( ( IsUpper<MT4>::value )
2414  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2415  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2416  :( IsUpper<MT4>::value ? i : 0UL ) );
2417  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
2418 
2419  SIMDType xmm1( (~C).load(i ,j) );
2420  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2421  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2422 
2423  for( size_t k=kbegin; k<kend; ++k ) {
2424  const SIMDType b1( set( B(k,j) ) );
2425  xmm1 += A.load(i ,k) * b1;
2426  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2427  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2428  }
2429 
2430  (~C).store( i , j, xmm1 );
2431  (~C).store( i+SIMDSIZE , j, xmm2 );
2432  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2433  }
2434  }
2435 
2436  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2437  {
2438  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
2439  size_t j( UPP ? i : 0UL );
2440 
2441  for( ; (j+4UL) <= jend; j+=4UL )
2442  {
2443  const size_t kbegin( ( IsLower<MT5>::value )
2444  ?( ( IsUpper<MT4>::value )
2445  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2446  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2447  :( IsUpper<MT4>::value ? i : 0UL ) );
2448  const size_t kend( ( IsUpper<MT5>::value )
2449  ?( ( IsLower<MT4>::value )
2450  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
2451  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
2452  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2453 
2454  SIMDType xmm1( (~C).load(i ,j ) );
2455  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2456  SIMDType xmm3( (~C).load(i ,j+1UL) );
2457  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2458  SIMDType xmm5( (~C).load(i ,j+2UL) );
2459  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
2460  SIMDType xmm7( (~C).load(i ,j+3UL) );
2461  SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
2462 
2463  for( size_t k=kbegin; k<kend; ++k ) {
2464  const SIMDType a1( A.load(i ,k) );
2465  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2466  const SIMDType b1( set( B(k,j ) ) );
2467  const SIMDType b2( set( B(k,j+1UL) ) );
2468  const SIMDType b3( set( B(k,j+2UL) ) );
2469  const SIMDType b4( set( B(k,j+3UL) ) );
2470  xmm1 += a1 * b1;
2471  xmm2 += a2 * b1;
2472  xmm3 += a1 * b2;
2473  xmm4 += a2 * b2;
2474  xmm5 += a1 * b3;
2475  xmm6 += a2 * b3;
2476  xmm7 += a1 * b4;
2477  xmm8 += a2 * b4;
2478  }
2479 
2480  (~C).store( i , j , xmm1 );
2481  (~C).store( i+SIMDSIZE, j , xmm2 );
2482  (~C).store( i , j+1UL, xmm3 );
2483  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2484  (~C).store( i , j+2UL, xmm5 );
2485  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2486  (~C).store( i , j+3UL, xmm7 );
2487  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
2488  }
2489 
2490  for( ; (j+3UL) <= jend; j+=3UL )
2491  {
2492  const size_t kbegin( ( IsLower<MT5>::value )
2493  ?( ( IsUpper<MT4>::value )
2494  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2495  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2496  :( IsUpper<MT4>::value ? i : 0UL ) );
2497  const size_t kend( ( IsUpper<MT5>::value )
2498  ?( ( IsLower<MT4>::value )
2499  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
2500  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
2501  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2502 
2503  SIMDType xmm1( (~C).load(i ,j ) );
2504  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2505  SIMDType xmm3( (~C).load(i ,j+1UL) );
2506  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2507  SIMDType xmm5( (~C).load(i ,j+2UL) );
2508  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
2509 
2510  for( size_t k=kbegin; k<kend; ++k ) {
2511  const SIMDType a1( A.load(i ,k) );
2512  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2513  const SIMDType b1( set( B(k,j ) ) );
2514  const SIMDType b2( set( B(k,j+1UL) ) );
2515  const SIMDType b3( set( B(k,j+2UL) ) );
2516  xmm1 += a1 * b1;
2517  xmm2 += a2 * b1;
2518  xmm3 += a1 * b2;
2519  xmm4 += a2 * b2;
2520  xmm5 += a1 * b3;
2521  xmm6 += a2 * b3;
2522  }
2523 
2524  (~C).store( i , j , xmm1 );
2525  (~C).store( i+SIMDSIZE, j , xmm2 );
2526  (~C).store( i , j+1UL, xmm3 );
2527  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2528  (~C).store( i , j+2UL, xmm5 );
2529  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2530  }
2531 
2532  for( ; (j+2UL) <= jend; j+=2UL )
2533  {
2534  const size_t kbegin( ( IsLower<MT5>::value )
2535  ?( ( IsUpper<MT4>::value )
2536  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2537  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2538  :( IsUpper<MT4>::value ? i : 0UL ) );
2539  const size_t kend( ( IsUpper<MT5>::value )
2540  ?( ( IsLower<MT4>::value )
2541  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2542  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2543  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2544 
2545  SIMDType xmm1( (~C).load(i ,j ) );
2546  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2547  SIMDType xmm3( (~C).load(i ,j+1UL) );
2548  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2549  SIMDType xmm5, xmm6, xmm7, xmm8;
2550  size_t k( kbegin );
2551 
2552  for( ; (k+2UL) < kend; k+=2UL ) {
2553  const SIMDType a1( A.load(i ,k ) );
2554  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2555  const SIMDType a3( A.load(i ,k+1UL) );
2556  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2557  const SIMDType b1( set( B(k ,j ) ) );
2558  const SIMDType b2( set( B(k ,j+1UL) ) );
2559  const SIMDType b3( set( B(k+1UL,j ) ) );
2560  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
2561  xmm1 += a1 * b1;
2562  xmm2 += a2 * b1;
2563  xmm3 += a1 * b2;
2564  xmm4 += a2 * b2;
2565  xmm5 += a3 * b3;
2566  xmm6 += a4 * b3;
2567  xmm7 += a3 * b4;
2568  xmm8 += a4 * b4;
2569  }
2570 
2571  for( ; k<kend; ++k ) {
2572  const SIMDType a1( A.load(i ,k) );
2573  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2574  const SIMDType b1( set( B(k,j ) ) );
2575  const SIMDType b2( set( B(k,j+1UL) ) );
2576  xmm1 += a1 * b1;
2577  xmm2 += a2 * b1;
2578  xmm3 += a1 * b2;
2579  xmm4 += a2 * b2;
2580  }
2581 
2582  (~C).store( i , j , xmm1+xmm5 );
2583  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
2584  (~C).store( i , j+1UL, xmm3+xmm7 );
2585  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2586  }
2587 
2588  if( j < jend )
2589  {
2590  const size_t kbegin( ( IsLower<MT5>::value )
2591  ?( ( IsUpper<MT4>::value )
2592  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2593  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2594  :( IsUpper<MT4>::value ? i : 0UL ) );
2595  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2596 
2597  SIMDType xmm1( (~C).load(i ,j) );
2598  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
2599  SIMDType xmm3, xmm4;
2600  size_t k( kbegin );
2601 
2602  for( ; (k+2UL) <= kend; k+=2UL ) {
2603  const SIMDType b1( set( B(k ,j) ) );
2604  const SIMDType b2( set( B(k+1UL,j) ) );
2605  xmm1 += A.load(i ,k ) * b1;
2606  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2607  xmm3 += A.load(i ,k+1UL) * b2;
2608  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2609  }
2610 
2611  for( ; k<kend; ++k ) {
2612  const SIMDType b1( set( B(k,j) ) );
2613  xmm1 += A.load(i ,k) * b1;
2614  xmm2 += A.load(i+SIMDSIZE,k) * b1;
2615  }
2616 
2617  (~C).store( i , j, xmm1+xmm3 );
2618  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
2619  }
2620  }
2621 
2622  for( ; i<ipos; i+=SIMDSIZE )
2623  {
2624  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
2625  size_t j( UPP ? i : 0UL );
2626 
2627  for( ; (j+4UL) <= jend; j+=4UL )
2628  {
2629  const size_t kbegin( ( IsLower<MT5>::value )
2630  ?( ( IsUpper<MT4>::value )
2631  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2632  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2633  :( IsUpper<MT4>::value ? i : 0UL ) );
2634  const size_t kend( ( IsUpper<MT5>::value )
2635  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
2636  :( K ) );
2637 
2638  SIMDType xmm1( (~C).load(i,j ) );
2639  SIMDType xmm2( (~C).load(i,j+1UL) );
2640  SIMDType xmm3( (~C).load(i,j+2UL) );
2641  SIMDType xmm4( (~C).load(i,j+3UL) );
2642  SIMDType xmm5, xmm6, xmm7, xmm8;
2643  size_t k( kbegin );
2644 
2645  for( ; (k+2UL) <= kend; k+=2UL ) {
2646  const SIMDType a1( A.load(i,k ) );
2647  const SIMDType a2( A.load(i,k+1UL) );
2648  xmm1 += a1 * set( B(k ,j ) );
2649  xmm2 += a1 * set( B(k ,j+1UL) );
2650  xmm3 += a1 * set( B(k ,j+2UL) );
2651  xmm4 += a1 * set( B(k ,j+3UL) );
2652  xmm5 += a2 * set( B(k+1UL,j ) );
2653  xmm6 += a2 * set( B(k+1UL,j+1UL) );
2654  xmm7 += a2 * set( B(k+1UL,j+2UL) );
2655  xmm8 += a2 * set( B(k+1UL,j+3UL) );
2656  }
2657 
2658  for( ; k<kend; ++k ) {
2659  const SIMDType a1( A.load(i,k) );
2660  xmm1 += a1 * set( B(k,j ) );
2661  xmm2 += a1 * set( B(k,j+1UL) );
2662  xmm3 += a1 * set( B(k,j+2UL) );
2663  xmm4 += a1 * set( B(k,j+3UL) );
2664  }
2665 
2666  (~C).store( i, j , xmm1+xmm5 );
2667  (~C).store( i, j+1UL, xmm2+xmm6 );
2668  (~C).store( i, j+2UL, xmm3+xmm7 );
2669  (~C).store( i, j+3UL, xmm4+xmm8 );
2670  }
2671 
2672  for( ; (j+3UL) <= jend; j+=3UL )
2673  {
2674  const size_t kbegin( ( IsLower<MT5>::value )
2675  ?( ( IsUpper<MT4>::value )
2676  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2677  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2678  :( IsUpper<MT4>::value ? i : 0UL ) );
2679  const size_t kend( ( IsUpper<MT5>::value )
2680  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
2681  :( K ) );
2682 
2683  SIMDType xmm1( (~C).load(i,j ) );
2684  SIMDType xmm2( (~C).load(i,j+1UL) );
2685  SIMDType xmm3( (~C).load(i,j+2UL) );
2686  SIMDType xmm4, xmm5, xmm6;
2687  size_t k( kbegin );
2688 
2689  for( ; (k+2UL) <= kend; k+=2UL ) {
2690  const SIMDType a1( A.load(i,k ) );
2691  const SIMDType a2( A.load(i,k+1UL) );
2692  xmm1 += a1 * set( B(k ,j ) );
2693  xmm2 += a1 * set( B(k ,j+1UL) );
2694  xmm3 += a1 * set( B(k ,j+2UL) );
2695  xmm4 += a2 * set( B(k+1UL,j ) );
2696  xmm5 += a2 * set( B(k+1UL,j+1UL) );
2697  xmm6 += a2 * set( B(k+1UL,j+2UL) );
2698  }
2699 
2700  for( ; k<kend; ++k ) {
2701  const SIMDType a1( A.load(i,k) );
2702  xmm1 += a1 * set( B(k,j ) );
2703  xmm2 += a1 * set( B(k,j+1UL) );
2704  xmm3 += a1 * set( B(k,j+2UL) );
2705  }
2706 
2707  (~C).store( i, j , xmm1+xmm4 );
2708  (~C).store( i, j+1UL, xmm2+xmm5 );
2709  (~C).store( i, j+2UL, xmm3+xmm6 );
2710  }
2711 
2712  for( ; (j+2UL) <= jend; j+=2UL )
2713  {
2714  const size_t kbegin( ( IsLower<MT5>::value )
2715  ?( ( IsUpper<MT4>::value )
2716  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2717  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2718  :( IsUpper<MT4>::value ? i : 0UL ) );
2719  const size_t kend( ( IsUpper<MT5>::value )
2720  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2721  :( K ) );
2722 
2723  SIMDType xmm1( (~C).load(i,j ) );
2724  SIMDType xmm2( (~C).load(i,j+1UL) );
2725  SIMDType xmm3, xmm4;
2726  size_t k( kbegin );
2727 
2728  for( ; (k+2UL) <= kend; k+=2UL ) {
2729  const SIMDType a1( A.load(i,k ) );
2730  const SIMDType a2( A.load(i,k+1UL) );
2731  xmm1 += a1 * set( B(k ,j ) );
2732  xmm2 += a1 * set( B(k ,j+1UL) );
2733  xmm3 += a2 * set( B(k+1UL,j ) );
2734  xmm4 += a2 * set( B(k+1UL,j+1UL) );
2735  }
2736 
2737  for( ; k<kend; ++k ) {
2738  const SIMDType a1( A.load(i,k) );
2739  xmm1 += a1 * set( B(k,j ) );
2740  xmm2 += a1 * set( B(k,j+1UL) );
2741  }
2742 
2743  (~C).store( i, j , xmm1+xmm3 );
2744  (~C).store( i, j+1UL, xmm2+xmm4 );
2745  }
2746 
2747  if( j < jend )
2748  {
2749  const size_t kbegin( ( IsLower<MT5>::value )
2750  ?( ( IsUpper<MT4>::value )
2751  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2752  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2753  :( IsUpper<MT4>::value ? i : 0UL ) );
2754 
2755  SIMDType xmm1( (~C).load(i,j) );
2756  SIMDType xmm2;
2757  size_t k( kbegin );
2758 
2759  for( ; (k+2UL) <= K; k+=2UL ) {
2760  xmm1 += A.load(i,k ) * set( B(k ,j) );
2761  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
2762  }
2763 
2764  for( ; k<K; ++k ) {
2765  xmm1 += A.load(i,k) * set( B(k,j) );
2766  }
2767 
2768  (~C).store( i, j, xmm1+xmm2 );
2769  }
2770  }
2771 
2772  for( ; remainder && i<M; ++i )
2773  {
2774  const size_t jend( LOW ? i+1UL : N );
2775  size_t j( UPP ? i : 0UL );
2776 
2777  for( ; (j+2UL) <= jend; j+=2UL )
2778  {
2779  const size_t kbegin( ( IsLower<MT5>::value )
2780  ?( ( IsUpper<MT4>::value )
2781  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2782  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2783  :( IsUpper<MT4>::value ? i : 0UL ) );
2784  const size_t kend( ( IsUpper<MT5>::value )
2785  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2786  :( K ) );
2787 
2788  ElementType value1( (~C)(i,j ) );
2789  ElementType value2( (~C)(i,j+1UL) );
2790 
2791  for( size_t k=kbegin; k<kend; ++k ) {
2792  value1 += A(i,k) * B(k,j );
2793  value2 += A(i,k) * B(k,j+1UL);
2794  }
2795 
2796  (~C)(i,j ) = value1;
2797  (~C)(i,j+1UL) = value2;
2798  }
2799 
2800  if( j < jend )
2801  {
2802  const size_t kbegin( ( IsLower<MT5>::value )
2803  ?( ( IsUpper<MT4>::value )
2804  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2805  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2806  :( IsUpper<MT4>::value ? i : 0UL ) );
2807 
2808  ElementType value( (~C)(i,j) );
2809 
2810  for( size_t k=kbegin; k<K; ++k ) {
2811  value += A(i,k) * B(k,j);
2812  }
2813 
2814  (~C)(i,j) = value;
2815  }
2816  }
2817  }
2819  //**********************************************************************************************
2820 
2821  //**Default addition assignment to dense matrices (large matrices)******************************
// Fallback large-matrix addition-assignment kernel: performs no work of its own and simply
// forwards C, A and B unchanged to selectDefaultAddAssignKernel().
// NOTE(review): listing line 2838 (between the template header and the function name) is not
// part of this extract; presumably it carried the return type / overload constraint - confirm
// against the original header.
2835  template< typename MT3 // Type of the left-hand side target matrix
2836  , typename MT4 // Type of the left-hand side matrix operand
2837  , typename MT5 > // Type of the right-hand side matrix operand
2839  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2840  {
2841  selectDefaultAddAssignKernel( C, A, B );
2842  }
2844  //**********************************************************************************************
2845 
2846  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Vectorized large-matrix addition-assignment kernel: picks one of three multiplication
// routines based on the LOW and UPP flags (LOW is tested first, so it wins if both are set).
// NOTE(review): listing line 2864 (return type / overload constraint) is not part of this
// extract - confirm against the original header.
2861  template< typename MT3 // Type of the left-hand side target matrix
2862  , typename MT4 // Type of the left-hand side matrix operand
2863  , typename MT5 > // Type of the right-hand side matrix operand
2865  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2866  {
// Both trailing arguments are ElementType(1) in every branch; presumably these are the scaling
// factors applied to the product and to the existing content of C - TODO confirm against
// blaze/math/dense/MMM.h.
2867  if( LOW )
2868  lmmm( C, A, B, ElementType(1), ElementType(1) );
2869  else if( UPP )
2870  ummm( C, A, B, ElementType(1), ElementType(1) );
2871  else
2872  mmm( C, A, B, ElementType(1), ElementType(1) );
2873  }
2875  //**********************************************************************************************
2876 
2877  //**BLAS-based addition assignment to dense matrices (default)**********************************
// Fallback BLAS addition-assignment kernel: does not call BLAS at all and forwards C, A and B
// unchanged to selectLargeAddAssignKernel().
// NOTE(review): listing line 2894 (return type / overload constraint) is not part of this
// extract - confirm against the original header.
2891  template< typename MT3 // Type of the left-hand side target matrix
2892  , typename MT4 // Type of the left-hand side matrix operand
2893  , typename MT5 > // Type of the right-hand side matrix operand
2895  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2896  {
2897  selectLargeAddAssignKernel( C, A, B );
2898  }
2900  //**********************************************************************************************
2901 
2902  //**BLAS-based addition assignment to dense matrices********************************************
2903 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2904 
// BLAS-backed addition-assignment kernel (only compiled when the surrounding #if enables BLAS
// matrix/matrix multiplication). Three cases, tested in this order:
//   1. A's type is triangular -> trmm with A on the left,
//   2. B's type is triangular -> trmm with B on the right,
//   3. otherwise              -> gemm directly on C.
// NOTE(review): listing line 2920 (return type / overload constraint) is not part of this
// extract - confirm against the original header.
2917  template< typename MT3 // Type of the left-hand side target matrix
2918  , typename MT4 // Type of the left-hand side matrix operand
2919  , typename MT5 > // Type of the right-hand side matrix operand
2921  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2922  {
2923  using ET = ElementType_<MT3>;
2924 
// Triangular paths go through a temporary of C's result type that starts as a copy of the
// non-triangular operand and is then added to C. Presumably trmm overwrites that temporary in
// place with the triangular product - TODO confirm against blaze/math/blas/trmm.h.
2925  if( IsTriangular<MT4>::value ) {
2926  ResultType_<MT3> tmp( serial( B ) );
2927  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2928  addAssign( C, tmp );
2929  }
2930  else if( IsTriangular<MT5>::value ) {
2931  ResultType_<MT3> tmp( serial( A ) );
2932  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2933  addAssign( C, tmp );
2934  }
2935  else {
// General case: no temporary, gemm receives C itself together with two ET(1) scalars.
2936  gemm( C, A, B, ET(1), ET(1) );
2937  }
2938  }
2940 #endif
2941  //**********************************************************************************************
2942 
2943  //**Restructuring addition assignment to row-major matrices*************************************
// Restructuring addition assignment of the multiplication expression to a row-major target
// (Matrix<MT,false>). Instead of evaluating the expression as written, it re-issues addAssign()
// on an equivalent product in which one or both operands are wrapped in trans().
// NOTE(review): listing lines 2959, 2962, 2964 and 2971 are not part of this extract. In
// particular the first addAssign() call below has lost its controlling `if` (line 2971), which
// is why an `else if` appears to follow a plain statement; presumably that condition covers the
// case where both operands are symmetric - confirm against the original header.
2958  template< typename MT > // Type of the target matrix
2960  addAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
2961  {
2963 
2965 
2966  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2967  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2968 
// fwd is applied to every rewritten product before it is handed on.
2969  const ForwardFunctor fwd;
2970 
// Variant 1: both operands transposed.
2972  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
// Variant 2: only the left operand is transposed (its type MT1 is symmetric).
2973  else if( IsSymmetric<MT1>::value )
2974  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Variant 3: only the right operand is transposed.
2975  else
2976  addAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
2977  }
2979  //**********************************************************************************************
2980 
2981  //**Addition assignment to sparse matrices******************************************************
2982  // No special implementation for the addition assignment to sparse matrices.
2983  //**********************************************************************************************
2984 
2985  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of the multiplication expression to a dense matrix of either storage
// order. Steps: check dimensions, bail out on empty problems, evaluate both operands, then hand
// over to selectSubAssignKernel().
// NOTE(review): listing lines 3000 and 3003 are not part of this extract (presumably the return
// type / constraint and a macro line) - confirm against the original header.
2998  template< typename MT // Type of the target dense matrix
2999  , bool SO > // Storage order of the target dense matrix
3001  subAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3002  {
3004 
3005  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3006  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3007 
// Nothing to subtract if the target has no rows or columns, or if the inner dimension
// (columns of the left operand) is zero.
3008  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3009  return;
3010  }
3011 
3012  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3013  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3014 
// The evaluated operands must keep the shapes of the originals and fit the target.
3015  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3016  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3017  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3018  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3019  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3020  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3021 
3022  TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3023  }
3025  //**********************************************************************************************
3026 
3027  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Kernel selection for the subtraction assignment C -= A * B.
// The small-matrix kernel is used when any of the following holds:
//   - the left operand type MT4 is diagonal,
//   - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//   - the element count of C (rows * columns) is below TDMATTDMATMULT_THRESHOLD.
// Otherwise the BLAS kernel is used.
3038  template< typename MT3 // Type of the left-hand side target matrix
3039  , typename MT4 // Type of the left-hand side matrix operand
3040  , typename MT5 > // Type of the right-hand side matrix operand
3041  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3042  {
3043  if( ( IsDiagonal<MT4>::value ) ||
3044  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
3045  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
3046  selectSmallSubAssignKernel( C, A, B );
3047  else
3048  selectBlasSubAssignKernel( C, A, B );
3049  }
3051  //**********************************************************************************************
3052 
3053  //**Default subtraction assignment to dense matrices (general/general)**************************
// Scalar subtraction-assignment kernel C -= A * B, enabled when neither operand type is
// diagonal (see the EnableIf_ constraint). C is processed one column j at a time; for every
// admissible k the update C(i,j) -= A(i,k) * B(k,j) is applied to a contiguous run of rows i.
// NOTE(review): listing lines 3092 and 3097 are not part of this extract, which leaves the
// ibegin/iend expressions below with unbalanced parentheses. Judging by the two alternatives
// that follow each gap they presumably test IsStrictlyLower<MT4> / IsStrictlyUpper<MT4> -
// confirm against the original header.
3067  template< typename MT3 // Type of the left-hand side target matrix
3068  , typename MT4 // Type of the left-hand side matrix operand
3069  , typename MT5 > // Type of the right-hand side matrix operand
3070  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
3071  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3072  {
3073  const size_t M( A.rows() );
3074  const size_t N( B.columns() );
3075  const size_t K( A.columns() );
3076 
// When LOW or UPP is set the row/column index clamping below relies on a square result.
3077  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3078 
3079  for( size_t j=0UL; j<N; ++j )
3080  {
// k range for column j, narrowed by the type of B: a lower B starts at j (j+1 if strictly
// lower), an upper B stops before j+1 (before j if strictly upper); otherwise 0..K.
3081  const size_t kbegin( ( IsLower<MT5>::value )
3082  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
3083  :( 0UL ) );
3084  const size_t kend( ( IsUpper<MT5>::value )
3085  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
3086  :( K ) );
3087  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3088 
3089  for( size_t k=kbegin; k<kend; ++k )
3090  {
// Row range for this (j,k): bounded by k through the lower/upper type of A, and additionally
// by j when LOW (rows start no earlier than j) or UPP (rows end no later than j+1) is set.
3091  const size_t ibegin( ( IsLower<MT4>::value )
3093  ?( LOW ? max(j,k+1UL) : k+1UL )
3094  :( LOW ? max(j,k) : k ) )
3095  :( LOW ? j : 0UL ) );
3096  const size_t iend( ( IsUpper<MT4>::value )
3098  ?( UPP ? min(j+1UL,k) : k )
3099  :( UPP ? min(j,k)+1UL : k+1UL ) )
3100  :( UPP ? j+1UL : M ) );
3101 
// With LOW or UPP the clamped range can be empty or inverted; skip it rather than trip the
// assertion that follows.
3102  if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
3103  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3104 
// ipos marks the end of the two-rows-per-iteration loop: ibegin plus inum with its lowest bit
// cleared (i.e. inum rounded down to an even count).
3105  const size_t inum( iend - ibegin );
3106  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3107 
3108  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3109  C(i ,j) -= A(i ,k) * B(k,j);
3110  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3111  }
// Odd row count: handle the single leftover row.
3112  if( ipos < iend ) {
3113  C(ipos,j) -= A(ipos,k) * B(k,j);
3114  }
3115  }
3116  }
3117  }
3119  //**********************************************************************************************
3120 
3121  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
// Scalar subtraction-assignment kernel for a non-diagonal A and a diagonal B (see the EnableIf_
// constraint). Only the diagonal element B(j,j) is read for column j, so the update reduces to
// C(i,j) -= A(i,j) * B(j,j) over the rows i permitted by the type of A.
// NOTE(review): listing line 3141 is not part of this extract (presumably a compile-time
// constraint macro) - confirm against the original header.
3135  template< typename MT3 // Type of the left-hand side target matrix
3136  , typename MT4 // Type of the left-hand side matrix operand
3137  , typename MT5 > // Type of the right-hand side matrix operand
3138  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
3139  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3140  {
3142 
3143  const size_t M( A.rows() );
3144  const size_t N( B.columns() );
3145 
3146  for( size_t j=0UL; j<N; ++j )
3147  {
// Row range for column j, narrowed by the type of A: a lower A starts at j (j+1 if strictly
// lower), an upper A stops before j+1 (before j if strictly upper); otherwise 0..M.
3148  const size_t ibegin( ( IsLower<MT4>::value )
3149  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
3150  :( 0UL ) );
3151  const size_t iend( ( IsUpper<MT4>::value )
3152  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
3153  :( M ) );
3154  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3155 
// ipos: end of the two-rows-per-iteration loop (inum with its lowest bit cleared).
3156  const size_t inum( iend - ibegin );
3157  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3158 
3159  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3160  C(i ,j) -= A(i ,j) * B(j,j);
3161  C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
3162  }
// Odd row count: handle the single leftover row.
3163  if( ipos < iend ) {
3164  C(ipos,j) -= A(ipos,j) * B(j,j);
3165  }
3166  }
3167  }
3169  //**********************************************************************************************
3170 
3171  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
// Scalar subtraction-assignment kernel for a diagonal A and a non-diagonal B (see the EnableIf_
// constraint). Only the diagonal element A(i,i) is read for row i, so the update reduces to
// C(i,j) -= A(i,i) * B(i,j) over the rows i permitted by the type of B.
// NOTE(review): listing line 3191 is not part of this extract (presumably a compile-time
// constraint macro) - confirm against the original header.
3185  template< typename MT3 // Type of the left-hand side target matrix
3186  , typename MT4 // Type of the left-hand side matrix operand
3187  , typename MT5 > // Type of the right-hand side matrix operand
3188  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
3189  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3190  {
3192 
3193  const size_t M( A.rows() );
3194  const size_t N( B.columns() );
3195 
3196  for( size_t j=0UL; j<N; ++j )
3197  {
// Row range for column j, narrowed by the type of B: a lower B starts at j (j+1 if strictly
// lower), an upper B stops before j+1 (before j if strictly upper); otherwise 0..M, where M
// is the row count of A.
3198  const size_t ibegin( ( IsLower<MT5>::value )
3199  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
3200  :( 0UL ) );
3201  const size_t iend( ( IsUpper<MT5>::value )
3202  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
3203  :( M ) );
3204  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3205 
// ipos: end of the two-rows-per-iteration loop (inum with its lowest bit cleared).
3206  const size_t inum( iend - ibegin );
3207  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3208 
3209  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3210  C(i ,j) -= A(i ,i ) * B(i ,j);
3211  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
3212  }
// Odd row count: handle the single leftover row.
3213  if( ipos < iend ) {
3214  C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
3215  }
3216  }
3217  }
3219  //**********************************************************************************************
3220 
3221  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Scalar subtraction-assignment kernel for two diagonal operands (see the EnableIf_
// constraint). Only diagonal elements are touched: C(i,i) -= A(i,i) * B(i,i) for every row
// index i of A.
// NOTE(review): listing line 3241 is not part of this extract (presumably a compile-time
// constraint macro) - confirm against the original header.
3235  template< typename MT3 // Type of the left-hand side target matrix
3236  , typename MT4 // Type of the left-hand side matrix operand
3237  , typename MT5 > // Type of the right-hand side matrix operand
3238  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
3239  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3240  {
3242 
3243  for( size_t i=0UL; i<A.rows(); ++i ) {
3244  C(i,i) -= A(i,i) * B(i,i);
3245  }
3246  }
3248  //**********************************************************************************************
3249 
3250  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Fallback small-matrix subtraction-assignment kernel: performs no work of its own and simply
// forwards C, A and B unchanged to selectDefaultSubAssignKernel().
// NOTE(review): listing line 3267 (return type / overload constraint) is not part of this
// extract - confirm against the original header.
3264  template< typename MT3 // Type of the left-hand side target matrix
3265  , typename MT4 // Type of the left-hand side matrix operand
3266  , typename MT5 > // Type of the right-hand side matrix operand
3268  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3269  {
3270  selectDefaultSubAssignKernel( C, A, B );
3271  }
3273  //**********************************************************************************************
3274 
3275  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Small-matrix subtraction assignment kernel for a row-major target (DenseMatrix<MT3,false>).
// Instead of computing the product itself, it copies one operand into its OppositeType_ (using
// serial()) and re-dispatches through subAssign() on the product expression wrapped by the
// ForwardFunctor.
// NOTE(review): listing lines 3293, 3296-3299, 3303 and 3307 are elided in this view. The return
// type and the conditions of the first two branches are therefore not visible; only their bodies
// and the two trailing branches can be read here.
3290  template< typename MT3 // Type of the left-hand side target matrix
3291  , typename MT4 // Type of the left-hand side matrix operand
3292  , typename MT5 > // Type of the right-hand side matrix operand
3294  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3295  {
3300 
3301  const ForwardFunctor fwd;
3302 
      // Branch with elided condition (listing line 3303): converts B.
3304  const OppositeType_<MT5> tmp( serial( B ) );
3305  subAssign( ~C, fwd( A * tmp ) );
3306  }
      // Branch with elided condition (listing line 3307): converts A.
3308  const OppositeType_<MT4> tmp( serial( A ) );
3309  subAssign( ~C, fwd( tmp * B ) );
3310  }
      // Otherwise the operand with the smaller (or equal) element count is the one converted.
3311  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3312  const OppositeType_<MT5> tmp( serial( B ) );
3313  subAssign( ~C, fwd( A * tmp ) );
3314  }
3315  else {
3316  const OppositeType_<MT4> tmp( serial( A ) );
3317  subAssign( ~C, fwd( tmp * B ) );
3318  }
3319  }
3321  //**********************************************************************************************
3322 
3323  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Small-matrix subtraction assignment kernel for a column-major target (DenseMatrix<MT3,true>):
// computes C -= A*B with explicit SIMD loads/stores.
//
// Structure: the row index i is advanced in blocks of 8, 5, 4, 3, 2 and finally 1 times SIMDSIZE
// rows; for each row block a small group of columns j of C is loaded into SIMD accumulators, the
// products over k are subtracted, and the accumulators are stored back. Rows at and beyond ipos
// are handled by a scalar loop at the end.
//
// LOW, UPP, SIMDSIZE, SIMDType and ElementType are defined outside this block. LOW/UPP are used
// here only to restrict the processed row/column ranges (presumably they flag a lower/upper
// triangular result -- confirm against the class definition).
// NOTE(review): listing lines 3341 (before the function name) and 3357 (before the first braced
// block) are elided in this view.
3338  template< typename MT3 // Type of the left-hand side target matrix
3339  , typename MT4 // Type of the left-hand side matrix operand
3340  , typename MT5 > // Type of the right-hand side matrix operand
3342  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3343  {
      // A scalar remainder loop is needed unless both C and A are padded types.
3344  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
3345 
3346  const size_t M( A.rows() );
3347  const size_t N( B.columns() );
3348  const size_t K( A.columns() );
3349 
3350  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3351 
      // ipos: end of the SIMD-processed row range. With a remainder it is M rounded down to a
      // multiple of SIMDSIZE (checked by the assertion below); otherwise all M rows are covered.
3352  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3353  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3354 
3355  size_t i( 0UL );
3356 
      // NOTE(review): the statement controlling this braced block (listing line 3357) is not
      // visible in this view. Inside: row blocks of 8*SIMDSIZE, one column of C at a time.
3358  {
3359  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3360  for( size_t j=0UL; j<N; ++j )
3361  {
      // kbegin/kend narrow the k range using the compile-time structure of the operands:
      // a lower B (MT5) raises the start to j (j+1 if strictly lower), an upper A (MT4) raises it
      // to i; an upper B lowers the end to j+1 (j if strictly upper), a lower A lowers it to the
      // end of the current row block. The same pattern recurs in every section below.
3362  const size_t kbegin( ( IsLower<MT5>::value )
3363  ?( ( IsUpper<MT4>::value )
3364  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3365  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3366  :( IsUpper<MT4>::value ? i : 0UL ) );
3367  const size_t kend( ( IsUpper<MT5>::value )
3368  ?( ( IsLower<MT4>::value )
3369  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3370  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3371  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
3372 
      // Accumulators start from the current contents of C, so the result is C - A*B.
3373  SIMDType xmm1( (~C).load(i ,j) );
3374  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3375  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3376  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3377  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3378  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3379  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3380  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3381 
3382  for( size_t k=kbegin; k<kend; ++k ) {
      // set() presumably broadcasts the scalar B(k,j) to all SIMD lanes -- confirm.
3383  const SIMDType b1( set( B(k,j) ) );
3384  xmm1 -= A.load(i ,k) * b1;
3385  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3386  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3387  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3388  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3389  xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
3390  xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
3391  xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
3392  }
3393 
3394  (~C).store( i , j, xmm1 );
3395  (~C).store( i+SIMDSIZE , j, xmm2 );
3396  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3397  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3398  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3399  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3400  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3401  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
3402  }
3403  }
3404  }
3405 
      // Row blocks of 5*SIMDSIZE (only when neither LOW nor UPP): two columns at a time, then a
      // single leftover column if N is odd.
3406  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3407  {
3408  size_t j( 0UL );
3409 
3410  for( ; (j+2UL) <= N; j+=2UL )
3411  {
3412  const size_t kbegin( ( IsLower<MT5>::value )
3413  ?( ( IsUpper<MT4>::value )
3414  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3415  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3416  :( IsUpper<MT4>::value ? i : 0UL ) );
3417  const size_t kend( ( IsUpper<MT5>::value )
3418  ?( ( IsLower<MT4>::value )
3419  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3420  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3421  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
3422 
3423  SIMDType xmm1 ( (~C).load(i ,j ) );
3424  SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3425  SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3426  SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3427  SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3428  SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3429  SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3430  SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3431  SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3432  SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
3433 
3434  for( size_t k=kbegin; k<kend; ++k ) {
      // Each loaded vector of A is reused for both columns j and j+1.
3435  const SIMDType a1( A.load(i ,k) );
3436  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3437  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3438  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3439  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3440  const SIMDType b1( set( B(k,j ) ) );
3441  const SIMDType b2( set( B(k,j+1UL) ) );
3442  xmm1 -= a1 * b1;
3443  xmm2 -= a2 * b1;
3444  xmm3 -= a3 * b1;
3445  xmm4 -= a4 * b1;
3446  xmm5 -= a5 * b1;
3447  xmm6 -= a1 * b2;
3448  xmm7 -= a2 * b2;
3449  xmm8 -= a3 * b2;
3450  xmm9 -= a4 * b2;
3451  xmm10 -= a5 * b2;
3452  }
3453 
3454  (~C).store( i , j , xmm1 );
3455  (~C).store( i+SIMDSIZE , j , xmm2 );
3456  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3457  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3458  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3459  (~C).store( i , j+1UL, xmm6 );
3460  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3461  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3462  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3463  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
3464  }
3465 
      // Leftover last column; here kend depends only on the structure of A.
3466  if( j < N )
3467  {
3468  const size_t kbegin( ( IsLower<MT5>::value )
3469  ?( ( IsUpper<MT4>::value )
3470  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3471  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3472  :( IsUpper<MT4>::value ? i : 0UL ) );
3473  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
3474 
3475  SIMDType xmm1( (~C).load(i ,j) );
3476  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3477  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3478  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3479  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3480 
3481  for( size_t k=kbegin; k<kend; ++k ) {
3482  const SIMDType b1( set( B(k,j) ) );
3483  xmm1 -= A.load(i ,k) * b1;
3484  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3485  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3486  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3487  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3488  }
3489 
3490  (~C).store( i , j, xmm1 );
3491  (~C).store( i+SIMDSIZE , j, xmm2 );
3492  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3493  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3494  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3495  }
3496  }
3497 
      // Row blocks of 4*SIMDSIZE (only when neither LOW nor UPP): two columns at a time, then a
      // single leftover column.
3498  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3499  {
3500  size_t j( 0UL );
3501 
3502  for( ; (j+2UL) <= N; j+=2UL )
3503  {
3504  const size_t kbegin( ( IsLower<MT5>::value )
3505  ?( ( IsUpper<MT4>::value )
3506  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3507  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3508  :( IsUpper<MT4>::value ? i : 0UL ) );
3509  const size_t kend( ( IsUpper<MT5>::value )
3510  ?( ( IsLower<MT4>::value )
3511  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3512  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3513  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
3514 
3515  SIMDType xmm1( (~C).load(i ,j ) );
3516  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3517  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3518  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3519  SIMDType xmm5( (~C).load(i ,j+1UL) );
3520  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3521  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3522  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3523 
3524  for( size_t k=kbegin; k<kend; ++k ) {
3525  const SIMDType a1( A.load(i ,k) );
3526  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3527  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3528  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3529  const SIMDType b1( set( B(k,j ) ) );
3530  const SIMDType b2( set( B(k,j+1UL) ) );
3531  xmm1 -= a1 * b1;
3532  xmm2 -= a2 * b1;
3533  xmm3 -= a3 * b1;
3534  xmm4 -= a4 * b1;
3535  xmm5 -= a1 * b2;
3536  xmm6 -= a2 * b2;
3537  xmm7 -= a3 * b2;
3538  xmm8 -= a4 * b2;
3539  }
3540 
3541  (~C).store( i , j , xmm1 );
3542  (~C).store( i+SIMDSIZE , j , xmm2 );
3543  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3544  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3545  (~C).store( i , j+1UL, xmm5 );
3546  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3547  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3548  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3549  }
3550 
3551  if( j < N )
3552  {
3553  const size_t kbegin( ( IsLower<MT5>::value )
3554  ?( ( IsUpper<MT4>::value )
3555  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3556  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3557  :( IsUpper<MT4>::value ? i : 0UL ) );
3558  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3559 
3560  SIMDType xmm1( (~C).load(i ,j) );
3561  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3562  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3563  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3564 
3565  for( size_t k=kbegin; k<kend; ++k ) {
3566  const SIMDType b1( set( B(k,j) ) );
3567  xmm1 -= A.load(i ,k) * b1;
3568  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3569  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3570  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3571  }
3572 
3573  (~C).store( i , j, xmm1 );
3574  (~C).store( i+SIMDSIZE , j, xmm2 );
3575  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3576  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3577  }
3578  }
3579 
      // Row blocks of 3*SIMDSIZE (only when neither LOW nor UPP): two columns at a time, then a
      // single leftover column.
3580  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3581  {
3582  size_t j( 0UL );
3583 
3584  for( ; (j+2UL) <= N; j+=2UL )
3585  {
3586  const size_t kbegin( ( IsLower<MT5>::value )
3587  ?( ( IsUpper<MT4>::value )
3588  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3589  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3590  :( IsUpper<MT4>::value ? i : 0UL ) );
3591  const size_t kend( ( IsUpper<MT5>::value )
3592  ?( ( IsLower<MT4>::value )
3593  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3594  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3595  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
3596 
3597  SIMDType xmm1( (~C).load(i ,j ) );
3598  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3599  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3600  SIMDType xmm4( (~C).load(i ,j+1UL) );
3601  SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
3602  SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3603 
3604  for( size_t k=kbegin; k<kend; ++k ) {
3605  const SIMDType a1( A.load(i ,k) );
3606  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3607  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3608  const SIMDType b1( set( B(k,j ) ) );
3609  const SIMDType b2( set( B(k,j+1UL) ) );
3610  xmm1 -= a1 * b1;
3611  xmm2 -= a2 * b1;
3612  xmm3 -= a3 * b1;
3613  xmm4 -= a1 * b2;
3614  xmm5 -= a2 * b2;
3615  xmm6 -= a3 * b2;
3616  }
3617 
3618  (~C).store( i , j , xmm1 );
3619  (~C).store( i+SIMDSIZE , j , xmm2 );
3620  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3621  (~C).store( i , j+1UL, xmm4 );
3622  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
3623  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
3624  }
3625 
3626  if( j < N )
3627  {
3628  const size_t kbegin( ( IsLower<MT5>::value )
3629  ?( ( IsUpper<MT4>::value )
3630  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3631  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3632  :( IsUpper<MT4>::value ? i : 0UL ) );
3633  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
3634 
3635  SIMDType xmm1( (~C).load(i ,j) );
3636  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3637  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3638 
3639  for( size_t k=kbegin; k<kend; ++k ) {
3640  const SIMDType b1( set( B(k,j) ) );
3641  xmm1 -= A.load(i ,k) * b1;
3642  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3643  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3644  }
3645 
3646  (~C).store( i , j, xmm1 );
3647  (~C).store( i+SIMDSIZE , j, xmm2 );
3648  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3649  }
3650  }
3651 
      // Row blocks of 2*SIMDSIZE (skipped only when LOW and UPP are both set). From here on the
      // column range is restricted: it starts at i when UPP is set and ends at the end of the row
      // block (capped at N) when LOW is set. Columns are processed 4, 3, 2 and finally 1 at a time.
3652  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3653  {
3654  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
3655  size_t j( UPP ? i : 0UL );
3656 
3657  for( ; (j+4UL) <= jend; j+=4UL )
3658  {
3659  const size_t kbegin( ( IsLower<MT5>::value )
3660  ?( ( IsUpper<MT4>::value )
3661  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3662  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3663  :( IsUpper<MT4>::value ? i : 0UL ) );
3664  const size_t kend( ( IsUpper<MT5>::value )
3665  ?( ( IsLower<MT4>::value )
3666  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
3667  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
3668  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
3669 
3670  SIMDType xmm1( (~C).load(i ,j ) );
3671  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3672  SIMDType xmm3( (~C).load(i ,j+1UL) );
3673  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3674  SIMDType xmm5( (~C).load(i ,j+2UL) );
3675  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
3676  SIMDType xmm7( (~C).load(i ,j+3UL) );
3677  SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
3678 
3679  for( size_t k=kbegin; k<kend; ++k ) {
3680  const SIMDType a1( A.load(i ,k) );
3681  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3682  const SIMDType b1( set( B(k,j ) ) );
3683  const SIMDType b2( set( B(k,j+1UL) ) );
3684  const SIMDType b3( set( B(k,j+2UL) ) );
3685  const SIMDType b4( set( B(k,j+3UL) ) );
3686  xmm1 -= a1 * b1;
3687  xmm2 -= a2 * b1;
3688  xmm3 -= a1 * b2;
3689  xmm4 -= a2 * b2;
3690  xmm5 -= a1 * b3;
3691  xmm6 -= a2 * b3;
3692  xmm7 -= a1 * b4;
3693  xmm8 -= a2 * b4;
3694  }
3695 
3696  (~C).store( i , j , xmm1 );
3697  (~C).store( i+SIMDSIZE, j , xmm2 );
3698  (~C).store( i , j+1UL, xmm3 );
3699  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3700  (~C).store( i , j+2UL, xmm5 );
3701  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
3702  (~C).store( i , j+3UL, xmm7 );
3703  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
3704  }
3705 
      // Three columns at a time.
3706  for( ; (j+3UL) <= jend; j+=3UL )
3707  {
3708  const size_t kbegin( ( IsLower<MT5>::value )
3709  ?( ( IsUpper<MT4>::value )
3710  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3711  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3712  :( IsUpper<MT4>::value ? i : 0UL ) );
3713  const size_t kend( ( IsUpper<MT5>::value )
3714  ?( ( IsLower<MT4>::value )
3715  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
3716  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
3717  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
3718 
3719  SIMDType xmm1( (~C).load(i ,j ) );
3720  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3721  SIMDType xmm3( (~C).load(i ,j+1UL) );
3722  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3723  SIMDType xmm5( (~C).load(i ,j+2UL) );
3724  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
3725 
3726  for( size_t k=kbegin; k<kend; ++k ) {
3727  const SIMDType a1( A.load(i ,k) );
3728  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3729  const SIMDType b1( set( B(k,j ) ) );
3730  const SIMDType b2( set( B(k,j+1UL) ) );
3731  const SIMDType b3( set( B(k,j+2UL) ) );
3732  xmm1 -= a1 * b1;
3733  xmm2 -= a2 * b1;
3734  xmm3 -= a1 * b2;
3735  xmm4 -= a2 * b2;
3736  xmm5 -= a1 * b3;
3737  xmm6 -= a2 * b3;
3738  }
3739 
3740  (~C).store( i , j , xmm1 );
3741  (~C).store( i+SIMDSIZE, j , xmm2 );
3742  (~C).store( i , j+1UL, xmm3 );
3743  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3744  (~C).store( i , j+2UL, xmm5 );
3745  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
3746  }
3747 
      // Two columns at a time, with the k loop additionally unrolled by two.
3748  for( ; (j+2UL) <= jend; j+=2UL )
3749  {
3750  const size_t kbegin( ( IsLower<MT5>::value )
3751  ?( ( IsUpper<MT4>::value )
3752  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3753  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3754  :( IsUpper<MT4>::value ? i : 0UL ) );
3755  const size_t kend( ( IsUpper<MT5>::value )
3756  ?( ( IsLower<MT4>::value )
3757  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3758  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3759  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
3760 
3761  SIMDType xmm1( (~C).load(i ,j ) );
3762  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3763  SIMDType xmm3( (~C).load(i ,j+1UL) );
3764  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
      // Second accumulator set for the k+1 terms; it is added to the first set on store.
      // NOTE(review): this relies on a default-constructed SIMDType being zero -- confirm.
3765  SIMDType xmm5, xmm6, xmm7, xmm8;
3766  size_t k( kbegin );
3767 
3768  for( ; (k+2UL) <= kend; k+=2UL ) {
3769  const SIMDType a1( A.load(i ,k ) );
3770  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
3771  const SIMDType a3( A.load(i ,k+1UL) );
3772  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
3773  const SIMDType b1( set( B(k ,j ) ) );
3774  const SIMDType b2( set( B(k ,j+1UL) ) );
3775  const SIMDType b3( set( B(k+1UL,j ) ) );
3776  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
3777  xmm1 -= a1 * b1;
3778  xmm2 -= a2 * b1;
3779  xmm3 -= a1 * b2;
3780  xmm4 -= a2 * b2;
3781  xmm5 -= a3 * b3;
3782  xmm6 -= a4 * b3;
3783  xmm7 -= a3 * b4;
3784  xmm8 -= a4 * b4;
3785  }
3786 
      // Odd trailing k, if any.
3787  for( ; k<kend; ++k ) {
3788  const SIMDType a1( A.load(i ,k) );
3789  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3790  const SIMDType b1( set( B(k,j ) ) );
3791  const SIMDType b2( set( B(k,j+1UL) ) );
3792  xmm1 -= a1 * b1;
3793  xmm2 -= a2 * b1;
3794  xmm3 -= a1 * b2;
3795  xmm4 -= a2 * b2;
3796  }
3797 
3798  (~C).store( i , j , xmm1+xmm5 );
3799  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
3800  (~C).store( i , j+1UL, xmm3+xmm7 );
3801  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
3802  }
3803 
      // Single leftover column; kend depends only on the structure of A.
3804  if( j < jend )
3805  {
3806  const size_t kbegin( ( IsLower<MT5>::value )
3807  ?( ( IsUpper<MT4>::value )
3808  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3809  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3810  :( IsUpper<MT4>::value ? i : 0UL ) );
3811  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
3812 
3813  SIMDType xmm1( (~C).load(i ,j) );
3814  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3815  SIMDType xmm3, xmm4;
3816  size_t k( kbegin );
3817 
3818  for( ; (k+2UL) <= kend; k+=2UL ) {
3819  const SIMDType b1( set( B(k ,j) ) );
3820  const SIMDType b2( set( B(k+1UL,j) ) );
3821  xmm1 -= A.load(i ,k ) * b1;
3822  xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
3823  xmm3 -= A.load(i ,k+1UL) * b2;
3824  xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
3825  }
3826 
3827  for( ; k<kend; ++k ) {
3828  const SIMDType b1( set( B(k,j) ) );
3829  xmm1 -= A.load(i ,k) * b1;
3830  xmm2 -= A.load(i+SIMDSIZE,k) * b1;
3831  }
3832 
3833  (~C).store( i , j, xmm1+xmm3 );
3834  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
3835  }
3836  }
3837 
      // Remaining full SIMD vectors, one vector of rows at a time. The column range is capped at
      // the end of the row block only when both LOW and UPP are set. In this section kend no
      // longer depends on the structure of A; the k loops are unrolled by two throughout.
3838  for( ; i<ipos; i+=SIMDSIZE )
3839  {
3840  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
3841  size_t j( UPP ? i : 0UL );
3842 
3843  for( ; (j+4UL) <= jend; j+=4UL )
3844  {
3845  const size_t kbegin( ( IsLower<MT5>::value )
3846  ?( ( IsUpper<MT4>::value )
3847  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3848  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3849  :( IsUpper<MT4>::value ? i : 0UL ) );
3850  const size_t kend( ( IsUpper<MT5>::value )
3851  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
3852  :( K ) );
3853 
3854  SIMDType xmm1( (~C).load(i,j ) );
3855  SIMDType xmm2( (~C).load(i,j+1UL) );
3856  SIMDType xmm3( (~C).load(i,j+2UL) );
3857  SIMDType xmm4( (~C).load(i,j+3UL) );
3858  SIMDType xmm5, xmm6, xmm7, xmm8;
3859  size_t k( kbegin );
3860 
3861  for( ; (k+2UL) <= kend; k+=2UL ) {
3862  const SIMDType a1( A.load(i,k ) );
3863  const SIMDType a2( A.load(i,k+1UL) );
3864  xmm1 -= a1 * set( B(k ,j ) );
3865  xmm2 -= a1 * set( B(k ,j+1UL) );
3866  xmm3 -= a1 * set( B(k ,j+2UL) );
3867  xmm4 -= a1 * set( B(k ,j+3UL) );
3868  xmm5 -= a2 * set( B(k+1UL,j ) );
3869  xmm6 -= a2 * set( B(k+1UL,j+1UL) );
3870  xmm7 -= a2 * set( B(k+1UL,j+2UL) );
3871  xmm8 -= a2 * set( B(k+1UL,j+3UL) );
3872  }
3873 
3874  for( ; k<kend; ++k ) {
3875  const SIMDType a1( A.load(i,k) );
3876  xmm1 -= a1 * set( B(k,j ) );
3877  xmm2 -= a1 * set( B(k,j+1UL) );
3878  xmm3 -= a1 * set( B(k,j+2UL) );
3879  xmm4 -= a1 * set( B(k,j+3UL) );
3880  }
3881 
3882  (~C).store( i, j , xmm1+xmm5 );
3883  (~C).store( i, j+1UL, xmm2+xmm6 );
3884  (~C).store( i, j+2UL, xmm3+xmm7 );
3885  (~C).store( i, j+3UL, xmm4+xmm8 );
3886  }
3887 
3888  for( ; (j+3UL) <= jend; j+=3UL )
3889  {
3890  const size_t kbegin( ( IsLower<MT5>::value )
3891  ?( ( IsUpper<MT4>::value )
3892  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3893  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3894  :( IsUpper<MT4>::value ? i : 0UL ) );
3895  const size_t kend( ( IsUpper<MT5>::value )
3896  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
3897  :( K ) );
3898 
3899  SIMDType xmm1( (~C).load(i,j ) );
3900  SIMDType xmm2( (~C).load(i,j+1UL) );
3901  SIMDType xmm3( (~C).load(i,j+2UL) );
3902  SIMDType xmm4, xmm5, xmm6;
3903  size_t k( kbegin );
3904 
3905  for( ; (k+2UL) <= kend; k+=2UL ) {
3906  const SIMDType a1( A.load(i,k ) );
3907  const SIMDType a2( A.load(i,k+1UL) );
3908  xmm1 -= a1 * set( B(k ,j ) );
3909  xmm2 -= a1 * set( B(k ,j+1UL) );
3910  xmm3 -= a1 * set( B(k ,j+2UL) );
3911  xmm4 -= a2 * set( B(k+1UL,j ) );
3912  xmm5 -= a2 * set( B(k+1UL,j+1UL) );
3913  xmm6 -= a2 * set( B(k+1UL,j+2UL) );
3914  }
3915 
3916  for( ; k<kend; ++k ) {
3917  const SIMDType a1( A.load(i,k) );
3918  xmm1 -= a1 * set( B(k,j ) );
3919  xmm2 -= a1 * set( B(k,j+1UL) );
3920  xmm3 -= a1 * set( B(k,j+2UL) );
3921  }
3922 
3923  (~C).store( i, j , xmm1+xmm4 );
3924  (~C).store( i, j+1UL, xmm2+xmm5 );
3925  (~C).store( i, j+2UL, xmm3+xmm6 );
3926  }
3927 
3928  for( ; (j+2UL) <= jend; j+=2UL )
3929  {
3930  const size_t kbegin( ( IsLower<MT5>::value )
3931  ?( ( IsUpper<MT4>::value )
3932  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3933  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3934  :( IsUpper<MT4>::value ? i : 0UL ) );
3935  const size_t kend( ( IsUpper<MT5>::value )
3936  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3937  :( K ) );
3938 
3939  SIMDType xmm1( (~C).load(i,j ) );
3940  SIMDType xmm2( (~C).load(i,j+1UL) );
3941  SIMDType xmm3, xmm4;
3942  size_t k( kbegin );
3943 
3944  for( ; (k+2UL) <= kend; k+=2UL ) {
3945  const SIMDType a1( A.load(i,k ) );
3946  const SIMDType a2( A.load(i,k+1UL) );
3947  xmm1 -= a1 * set( B(k ,j ) );
3948  xmm2 -= a1 * set( B(k ,j+1UL) );
3949  xmm3 -= a2 * set( B(k+1UL,j ) );
3950  xmm4 -= a2 * set( B(k+1UL,j+1UL) );
3951  }
3952 
3953  for( ; k<kend; ++k ) {
3954  const SIMDType a1( A.load(i,k) );
3955  xmm1 -= a1 * set( B(k,j ) );
3956  xmm2 -= a1 * set( B(k,j+1UL) );
3957  }
3958 
3959  (~C).store( i, j , xmm1+xmm3 );
3960  (~C).store( i, j+1UL, xmm2+xmm4 );
3961  }
3962 
      // Single leftover column: no kend is computed, the k loops run up to K.
3963  if( j < jend )
3964  {
3965  const size_t kbegin( ( IsLower<MT5>::value )
3966  ?( ( IsUpper<MT4>::value )
3967  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3968  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3969  :( IsUpper<MT4>::value ? i : 0UL ) );
3970 
3971  SIMDType xmm1( (~C).load(i,j) );
3972  SIMDType xmm2;
3973  size_t k( kbegin );
3974 
3975  for( ; (k+2UL) <= K; k+=2UL ) {
3976  xmm1 -= A.load(i,k ) * set( B(k ,j) );
3977  xmm2 -= A.load(i,k+1UL) * set( B(k+1UL,j) );
3978  }
3979 
3980  for( ; k<K; ++k ) {
3981  xmm1 -= A.load(i,k) * set( B(k,j) );
3982  }
3983 
3984  (~C).store( i, j, xmm1+xmm2 );
3985  }
3986  }
3987 
      // Scalar remainder: rows from ipos up to M, processed element-wise (two columns at a time,
      // then one). Only entered when 'remainder' is true. With LOW the columns stop after the
      // diagonal element (i+1); with UPP they start at i.
3988  for( ; remainder && i<M; ++i )
3989  {
3990  const size_t jend( LOW ? i+1UL : N );
3991  size_t j( UPP ? i : 0UL );
3992 
3993  for( ; (j+2UL) <= jend; j+=2UL )
3994  {
3995  const size_t kbegin( ( IsLower<MT5>::value )
3996  ?( ( IsUpper<MT4>::value )
3997  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3998  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3999  :( IsUpper<MT4>::value ? i : 0UL ) );
4000  const size_t kend( ( IsUpper<MT5>::value )
4001  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4002  :( K ) );
4003 
4004  ElementType value1( (~C)(i,j ) );
4005  ElementType value2( (~C)(i,j+1UL) );
4006 
4007  for( size_t k=kbegin; k<kend; ++k ) {
4008  value1 -= A(i,k) * B(k,j );
4009  value2 -= A(i,k) * B(k,j+1UL);
4010  }
4011 
4012  (~C)(i,j ) = value1;
4013  (~C)(i,j+1UL) = value2;
4014  }
4015 
4016  if( j < jend )
4017  {
4018  const size_t kbegin( ( IsLower<MT5>::value )
4019  ?( ( IsUpper<MT4>::value )
4020  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4021  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4022  :( IsUpper<MT4>::value ? i : 0UL ) );
4023 
4024  ElementType value( (~C)(i,j) );
4025 
4026  for( size_t k=kbegin; k<K; ++k ) {
4027  value -= A(i,k) * B(k,j);
4028  }
4029 
4030  (~C)(i,j) = value;
4031  }
4032  }
4033  }
4035  //**********************************************************************************************
4036 
4037  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix subtraction assignment kernel that does no work of its own: it forwards C, A and B
// unchanged to selectDefaultSubAssignKernel().
// NOTE(review): listing line 4054 (between the template header and the function name) is elided
// in this view; it presumably holds the return type / overload-selection condition -- confirm.
4051  template< typename MT3 // Type of the left-hand side target matrix
4052  , typename MT4 // Type of the left-hand side matrix operand
4053  , typename MT5 > // Type of the right-hand side matrix operand
4055  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4056  {
4057  selectDefaultSubAssignKernel( C, A, B );
4058  }
4060  //**********************************************************************************************
4061 
4062  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix subtraction assignment kernel that delegates to one of the lmmm/ummm/mmm routines,
// chosen by the LOW and UPP flags (both defined outside this block).
// NOTE(review): listing line 4080 (between the template header and the function name) is elided
// in this view; it presumably holds the return type / overload-selection condition -- confirm.
4077  template< typename MT3 // Type of the left-hand side target matrix
4078  , typename MT4 // Type of the left-hand side matrix operand
4079  , typename MT5 > // Type of the right-hand side matrix operand
4081  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4082  {
      // The trailing arguments are the scalars -1 and 1; presumably they scale the product and the
      // existing C respectively (C = -1*A*B + 1*C) -- confirm against the mmm() documentation.
      // LOW takes precedence over UPP when both are set.
4083  if( LOW )
4084  lmmm( C, A, B, ElementType(-1), ElementType(1) );
4085  else if( UPP )
4086  ummm( C, A, B, ElementType(-1), ElementType(1) );
4087  else
4088  mmm( C, A, B, ElementType(-1), ElementType(1) );
4089  }
4091  //**********************************************************************************************
4092 
4093  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// BLAS subtraction assignment kernel (default overload): performs no BLAS call and simply forwards
// C, A and B unchanged to selectLargeSubAssignKernel().
// NOTE(review): listing line 4110 (between the template header and the function name) is elided
// in this view; it presumably holds the return type / overload-selection condition -- confirm.
4107  template< typename MT3 // Type of the left-hand side target matrix
4108  , typename MT4 // Type of the left-hand side matrix operand
4109  , typename MT5 > // Type of the right-hand side matrix operand
4111  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4112  {
4113  selectLargeSubAssignKernel( C, A, B );
4114  }
4116  //**********************************************************************************************
4117 
4118  //**BLAS-based subtraction assignment to dense matrices*****************************************
4119 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4120 
// BLAS-backed subtraction assignment kernel. Depending on the compile-time structure of the
// operands it either goes through trmm() on a temporary followed by subAssign(), or calls gemm()
// directly on C.
// NOTE(review): listing line 4136 (between the template header and the function name) is elided
// in this view; it presumably holds the return type / overload-selection condition -- confirm.
4133  template< typename MT3 // Type of the left-hand side target matrix
4134  , typename MT4 // Type of the left-hand side matrix operand
4135  , typename MT5 > // Type of the right-hand side matrix operand
4137  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4138  {
4139  using ET = ElementType_<MT3>;
4140 
      // Triangular A: copy B into a temporary, apply trmm() with A on the left (presumably
      // tmp = A*tmp -- confirm against the trmm wrapper), then subtract the temporary from C.
4141  if( IsTriangular<MT4>::value ) {
4142  ResultType_<MT3> tmp( serial( B ) );
4143  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
4144  subAssign( C, tmp );
4145  }
      // Triangular B: same scheme with a copy of A and B applied from the right.
4146  else if( IsTriangular<MT5>::value ) {
4147  ResultType_<MT3> tmp( serial( A ) );
4148  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
4149  subAssign( C, tmp );
4150  }
      // General case: gemm() with the scalars -1 and 1 (presumably C = -1*A*B + 1*C -- confirm).
4151  else {
4152  gemm( C, A, B, ET(-1), ET(1) );
4153  }
4154  }
4156 #endif
4157  //**********************************************************************************************
4158 
4159  //**Restructuring subtraction assignment to row-major matrices**********************************
// Restructuring subtraction assignment of the product expression to a row-major target
// (Matrix<MT,false>): the product is rebuilt with one or both operands wrapped in trans() and
// passed, through the ForwardFunctor, to subAssign() again.
// NOTE(review): listing lines 4176, 4179, 4181 and 4188 are elided in this view. The return type
// and the condition of the first branch are therefore not visible here.
4175  template< typename MT > // Type of the target matrix
4177  subAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4178  {
4180 
4182 
4183  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4184  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4185 
4186  const ForwardFunctor fwd;
4187 
      // First branch (condition elided, listing line 4188): both operands are transposed.
4189  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
      // Symmetric left operand type: only the left operand is transposed.
4190  else if( IsSymmetric<MT1>::value )
4191  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
      // Otherwise only the right operand is transposed.
4192  else
4193  subAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4194  }
4196  //**********************************************************************************************
4197 
4198  //**Subtraction assignment to sparse matrices***************************************************
4199  // No special implementation for the subtraction assignment to sparse matrices.
4200  //**********************************************************************************************
4201 
4202  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of the product expression to a dense matrix: the expression is first
// evaluated into a temporary of type ResultType (via serial()), and that temporary is then
// Schur-assigned to the target.
// NOTE(review): listing lines 4219 and 4221-4223 are elided in this view, so the body may contain
// statements (e.g. compile-time checks) that are not visible here.
4215  template< typename MT // Type of the target dense matrix
4216  , bool SO > // Storage order of the target dense matrix
4217  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4218  {
4220 
4224 
4225  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4226  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4227 
4228  const ResultType tmp( serial( rhs ) );
4229  schurAssign( ~lhs, tmp );
4230  }
4232  //**********************************************************************************************
4233 
4234  //**Multiplication assignment to dense matrices*************************************************
4235  // No special implementation for the multiplication assignment to dense matrices.
4236  //**********************************************************************************************
4237 
4238  //**Multiplication assignment to sparse matrices************************************************
4239  // No special implementation for the multiplication assignment to sparse matrices.
4240  //**********************************************************************************************
4241 
4242  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the product expression to a dense matrix: handles the empty cases, evaluates
// both operands into LT/RT objects and hands the product of the evaluated operands to smpAssign().
// NOTE(review): listing lines 4260 (between the template header and the function name) and 4263
// are elided in this view; the return type / selection condition is therefore not visible here.
4258  template< typename MT // Type of the target dense matrix
4259  , bool SO > // Storage order of the target dense matrix
4261  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4262  {
4264 
4265  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4266  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4267 
      // Empty target: nothing to do. Empty inner dimension: the target is reset() instead of
      // being computed.
4268  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4269  return;
4270  }
4271  else if( rhs.lhs_.columns() == 0UL ) {
4272  reset( ~lhs );
4273  return;
4274  }
4275 
4276  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4277  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4278 
4279  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4280  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4281  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4282  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4283  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4284  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4285 
4286  smpAssign( ~lhs, A * B );
4287  }
4289  //**********************************************************************************************
4290 
4291  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the matrix product to a sparse matrix: the expression is first evaluated
// into a temporary of type TmpType, which is then passed through the forwarding functor and
// assigned to the target.
// NOTE(review): the function declaration (listing lines 4309-4310) and the definition of TmpType
// are absent from this listing; only the template header and the body are visible.
4307  template< typename MT // Type of the target sparse matrix
4308  , bool SO > // Storage order of the target sparse matrix
4311  {
4313 
4315 
4322 
4323  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4324  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4325 
4326  const ForwardFunctor fwd;
4327 
// Evaluate the complete expression once, then assign the temporary.
4328  const TmpType tmp( rhs );
4329  smpAssign( ~lhs, fwd( tmp ) );
4330  }
4332  //**********************************************************************************************
4333 
4334  //**Restructuring SMP assignment to row-major matrices******************************************
// Restructuring SMP assignment to a row-major matrix (Matrix<MT,false>): the product is rewritten
// with one or both operands transposed and the rewritten expression is assigned instead.
// NOTE(review): the declaration line (listing line 4350) and the leading `if` of the branch chain
// (listing line 4362) are absent from this listing, so the condition guarding the first
// smpAssign() call below is not visible.
4349  template< typename MT > // Type of the target matrix
4351  smpAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4352  {
4354 
4356 
4357  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4358  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4359 
4360  const ForwardFunctor fwd;
4361 
// First branch (condition not visible): both operands are transposed.
4363  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
// Symmetric left operand: only the left operand is transposed.
4364  else if( IsSymmetric<MT1>::value )
4365  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case: only the right operand is transposed.
4366  else
4367  smpAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4368  }
4370  //**********************************************************************************************
4371 
4372  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the matrix product to a dense matrix: both operands are evaluated
// into objects of type LT/RT and the product of the evaluated operands is passed to smpAddAssign().
// NOTE(review): the function declaration (listing lines 4390-4391) is absent from this listing;
// the name is taken from the call at the end of the body and the surrounding section banner.
4388  template< typename MT // Type of the target dense matrix
4389  , bool SO > // Storage order of the target dense matrix
4392  {
4394 
4395  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4396  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4397 
// Nothing is added if the target is empty or the inner dimension is zero.
4398  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4399  return;
4400  }
4401 
4402  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4403  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4404 
// Sanity checks: the evaluated operands must match the original operands and the target.
4405  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4406  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4407  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4408  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4409  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4410  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4411 
4412  smpAddAssign( ~lhs, A * B );
4413  }
4415  //**********************************************************************************************
4416 
4417  //**Restructuring SMP addition assignment to row-major matrices*********************************
// Restructuring SMP addition assignment: the product is rewritten with one or both operands
// transposed and the rewritten expression is added to the target.
// NOTE(review): the function declaration (listing lines 4434-4435) and the leading `if` of the
// branch chain (listing line 4446) are absent from this listing, so the condition guarding the
// first smpAddAssign() call below is not visible.
4433  template< typename MT > // Type of the target matrix
4436  {
4438 
4440 
4441  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4442  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4443 
4444  const ForwardFunctor fwd;
4445 
// First branch (condition not visible): both operands are transposed.
4447  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
// Symmetric left operand: only the left operand is transposed.
4448  else if( IsSymmetric<MT1>::value )
4449  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case: only the right operand is transposed.
4450  else
4451  smpAddAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4452  }
4454  //**********************************************************************************************
4455 
4456  //**SMP addition assignment to sparse matrices**************************************************
4457  // No special implementation for the SMP addition assignment to sparse matrices.
4458  //**********************************************************************************************
4459 
4460  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the matrix product to a dense matrix: both operands are evaluated
// into objects of type LT/RT and the product of the evaluated operands is passed to smpSubAssign().
// NOTE(review): the function declaration (listing lines 4478-4479) is absent from this listing;
// the name is taken from the call at the end of the body and the surrounding section banner.
4476  template< typename MT // Type of the target dense matrix
4477  , bool SO > // Storage order of the target dense matrix
4480  {
4482 
4483  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4484  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4485 
// Nothing is subtracted if the target is empty or the inner dimension is zero.
4486  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4487  return;
4488  }
4489 
4490  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4491  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4492 
// Sanity checks: the evaluated operands must match the original operands and the target.
4493  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4494  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4495  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4496  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4497  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4498  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4499 
4500  smpSubAssign( ~lhs, A * B );
4501  }
4503  //**********************************************************************************************
4504 
4505  //**Restructuring SMP subtraction assignment to row-major matrices******************************
// Restructuring SMP subtraction assignment: the product is rewritten with one or both operands
// transposed and the rewritten expression is subtracted from the target.
// NOTE(review): the function declaration (listing lines 4522-4523) and the leading `if` of the
// branch chain (listing line 4534) are absent from this listing, so the condition guarding the
// first smpSubAssign() call below is not visible.
4521  template< typename MT > // Type of the target matrix
4524  {
4526 
4528 
4529  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4530  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4531 
4532  const ForwardFunctor fwd;
4533 
// First branch (condition not visible): both operands are transposed.
4535  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
// Symmetric left operand: only the left operand is transposed.
4536  else if( IsSymmetric<MT1>::value )
4537  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case: only the right operand is transposed.
4538  else
4539  smpSubAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4540  }
4542  //**********************************************************************************************
4543 
4544  //**SMP subtraction assignment to sparse matrices***********************************************
4545  // No special implementation for the SMP subtraction assignment to sparse matrices.
4546  //**********************************************************************************************
4547 
4548  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the matrix product to a dense matrix: the product is evaluated
// into a temporary of type ResultType, which is then passed to smpSchurAssign() for the target.
// NOTE(review): listing lines 4566 and 4568-4570 are absent from this listing.
4562  template< typename MT // Type of the target dense matrix
4563  , bool SO > // Storage order of the target dense matrix
4564  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4565  {
4567 
4571 
4572  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4573  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4574 
// Evaluate the product once, then combine the temporary with the target.
4575  const ResultType tmp( rhs );
4576  smpSchurAssign( ~lhs, tmp );
4577  }
4579  //**********************************************************************************************
4580 
4581  //**SMP Schur product assignment to sparse matrices*********************************************
4582  // No special implementation for the SMP Schur product assignment to sparse matrices.
4583  //**********************************************************************************************
4584 
4585  //**SMP multiplication assignment to dense matrices*********************************************
4586  // No special implementation for the SMP multiplication assignment to dense matrices.
4587  //**********************************************************************************************
4588 
4589  //**SMP multiplication assignment to sparse matrices********************************************
4590  // No special implementation for the SMP multiplication assignment to sparse matrices.
4591  //**********************************************************************************************
4592 
4593  //**Compile time checks*************************************************************************
4601  //**********************************************************************************************
4602 };
4603 //*************************************************************************************************
4604 
4605 
4606 
4607 
4608 //=================================================================================================
4609 //
4610 // DMATSCALARMULTEXPR SPECIALIZATION
4611 //
4612 //=================================================================================================
4613 
4614 //*************************************************************************************************
4622 template< typename MT1 // Type of the left-hand side dense matrix
4623  , typename MT2 // Type of the right-hand side dense matrix
4624  , bool SF // Symmetry flag
4625  , bool HF // Hermitian flag
4626  , bool LF // Lower flag
4627  , bool UF // Upper flag
4628  , typename ST > // Type of the right-hand side scalar value
4629 class DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
4630  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
4631  , private Computation
4632 {
4633  private:
4634  //**Type definitions****************************************************************************
4637 
// Shorthand aliases for the result, element and composite types of the product and its operands.
// NOTE(review): MMM is used here but its alias (listing lines 4635-4636) is absent from this
// listing; presumably it names the wrapped TDMatTDMatMultExpr type - confirm.
4638  using RES = ResultType_<MMM>;
4639  using RT1 = ResultType_<MT1>;
4640  using RT2 = ResultType_<MT2>;
4641  using ET1 = ElementType_<RT1>;
4642  using ET2 = ElementType_<RT2>;
4643  using CT1 = CompositeType_<MT1>;
4644  using CT2 = CompositeType_<MT2>;
4645  //**********************************************************************************************
4646 
4647  //**********************************************************************************************
// Compile-time switch: true if the left-hand side operand type MT1 is a computation or requires
// an intermediate evaluation.
4649  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
4650  //**********************************************************************************************
4651 
4652  //**********************************************************************************************
// Compile-time switch: true if the right-hand side operand type MT2 is a computation or requires
// an intermediate evaluation.
4654  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
4655  //**********************************************************************************************
4656 
4657  //**********************************************************************************************
// Normalized structure flags derived from the SF/HF/LF/UF template flags:
//  - SYM:  only the symmetry flag is set.
//  - HERM: the Hermitian flag is set and neither triangular flag is set.
//  - LOW:  the lower flag is set, or a symmetric/Hermitian result is additionally declared upper.
//  - UPP:  the upper flag is set, or a symmetric/Hermitian result is additionally declared lower.
// LOW and UPP can both be true at the same time.
4659  enum : bool {
4660  SYM = ( SF && !( HF || LF || UF ) ),
4661  HERM = ( HF && !( LF || UF ) ),
4662  LOW = ( LF || ( ( SF || HF ) && UF ) ),
4663  UPP = ( UF || ( ( SF || HF ) && LF ) )
4664  };
4665  //**********************************************************************************************
4666 
4667  //**********************************************************************************************
4669 
// Compile-time helper whose value requires T1 to be a row-major matrix type.
// NOTE(review): the remainder of the condition (listing line 4677) is absent from this listing,
// so the full predicate cannot be stated from here.
4674  template< typename T1, typename T2, typename T3 >
4675  struct CanExploitSymmetry {
4676  enum : bool { value = IsRowMajorMatrix<T1>::value &&
4678  };
4679  //**********************************************************************************************
4680 
4681  //**********************************************************************************************
4683 
// Compile-time helper: an intermediate evaluation is required if at least one of the two operands
// has to be evaluated (evaluateLeft/evaluateRight) and CanExploitSymmetry does not hold for the
// given type triple.
4686  template< typename T1, typename T2, typename T3 >
4687  struct IsEvaluationRequired {
4688  enum : bool { value = ( evaluateLeft || evaluateRight ) &&
4689  !CanExploitSymmetry<T1,T2,T3>::value };
4690  };
4691  //**********************************************************************************************
4692 
4693  //**********************************************************************************************
4695 
// Compile-time helper deciding whether a BLAS kernel can be used. The visible conditions require
// BLAS mode with BLAS matrix/matrix multiplication enabled, none of the SYM/HERM/LOW/UPP flags,
// SIMD-enabled types T1..T3 and identical element types of T1 and T3.
// NOTE(review): several lines of the condition (listing lines 4701-4704, 4706-4709, 4711) are
// absent from this listing; the list above is therefore incomplete.
4697  template< typename T1, typename T2, typename T3, typename T4 >
4698  struct UseBlasKernel {
4699  enum : bool { value = BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4700  !SYM && !HERM && !LOW && !UPP &&
4705  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4710  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
4712  };
4713  //**********************************************************************************************
4714 
4715  //**********************************************************************************************
4717 
// Compile-time helper deciding whether the vectorized default kernel can be used. The visible
// conditions require useOptimizedKernels, SIMD-enabled types T1..T3, and available SIMD addition
// and multiplication for the tested element types.
// NOTE(review): listing lines 4722 and 4724-4726 are absent from this listing, so part of the
// condition (including the head of the expression that ends in `, T4 >::value`) is not visible.
// NOTE(review): HasSIMDAdd is queried with the element type of T2 twice and HasSIMDMult with the
// element type of T3 twice - confirm that testing each type against itself is intended.
4719  template< typename T1, typename T2, typename T3, typename T4 >
4720  struct UseVectorizedDefaultKernel {
4721  enum : bool { value = useOptimizedKernels &&
4723  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4727  , T4 >::value &&
4728  HasSIMDAdd< ElementType_<T2>, ElementType_<T2> >::value &&
4729  HasSIMDMult< ElementType_<T3>, ElementType_<T3> >::value };
4730  };
4731  //**********************************************************************************************
4732 
4733  //**********************************************************************************************
4735 
// Functor type used to forward the structure flags to re-dispatched expressions. Selection order:
// HERM -> DeclHerm, else SYM -> DeclSym, else LOW and UPP -> DeclDiag, LOW only -> DeclLow,
// UPP only -> DeclUpp, none of the flags -> Noop.
4737  using ForwardFunctor = IfTrue_< HERM
4738  , DeclHerm
4739  , IfTrue_< SYM
4740  , DeclSym
4741  , IfTrue_< LOW
4742  , IfTrue_< UPP
4743  , DeclDiag
4744  , DeclLow >
4745  , IfTrue_< UPP
4746  , DeclUpp
4747  , Noop > > > >;
4748  //**********************************************************************************************
4749 
4750  public:
4751  //**Type definitions****************************************************************************
// Public type aliases of the scaled matrix product expression.
// NOTE(review): several aliases (listing lines 4752, 4754-4757, 4761-4764) are absent from this
// listing, among them ElementType, which is used by ReturnType below.
// Result type: the product result type RES combined with the scalar type ST via MultTrait_.
4753  using ResultType = MultTrait_<RES,ST>;
// Element access returns a const value of the element type.
4758  using ReturnType = const ElementType;
// The expression is represented by a const ResultType when used as a composite.
4759  using CompositeType = const ResultType;
4760 
4763 
// The right-hand side operand is the scalar, held by value.
4765  using RightOperand = ST;
4766 
4769 
4772  //**********************************************************************************************
4773 
4774  //**Compilation flags***************************************************************************
// Compilation switch for SIMD use: the visible conditions require a non-diagonal MT1 and
// SIMD-enabled operand types.
// NOTE(review): the tail of this enum (listing lines 4778-4780) is absent from this listing.
4776  enum : bool { simdEnabled = !IsDiagonal<MT1>::value &&
4777  MT1::simdEnabled && MT2::simdEnabled &&
4781 
// Compilation switch for SMP assignment: both operands must be usable without evaluation and
// must themselves be SMP-assignable.
4783  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4784  !evaluateRight && MT2::smpAssignable };
4785  //**********************************************************************************************
4786 
4787  //**SIMD properties*****************************************************************************
// SIMD size constant, taken from SIMDTrait<ElementType>::size.
4789  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
4790  //**********************************************************************************************
4791 
4792  //**Constructor*********************************************************************************
// Constructor: stores the given matrix multiplication expression and the scalar factor.
//   matrix  The left-hand side matrix multiplication expression.
//   scalar  The right-hand side scalar of the multiplication expression.
4798  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4799  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4800  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4801  {}
4802  //**********************************************************************************************
4803 
4804  //**Access operator*****************************************************************************
// 2D element access: returns element (i,j) of the wrapped matrix expression multiplied by the
// scalar. The indices are only checked via internal assertions.
//   i  Row index, expected to be smaller than the number of rows.
//   j  Column index, expected to be smaller than the number of columns.
4811  inline ReturnType operator()( size_t i, size_t j ) const {
4812  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4813  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4814  return matrix_(i,j) * scalar_;
4815  }
4816  //**********************************************************************************************
4817 
4818  //**At function*********************************************************************************
// Checked element access: validates both indices against the dimensions of the wrapped matrix
// expression and then delegates to operator().
//   i  Row index; BLAZE_THROW_OUT_OF_RANGE is invoked if i >= rows.
//   j  Column index; BLAZE_THROW_OUT_OF_RANGE is invoked if j >= columns.
4826  inline ReturnType at( size_t i, size_t j ) const {
4827  if( i >= matrix_.rows() ) {
4828  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4829  }
4830  if( j >= matrix_.columns() ) {
4831  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4832  }
4833  return (*this)(i,j);
4834  }
4835  //**********************************************************************************************
4836 
4837  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, i.e. of the wrapped matrix expression.
4842  inline size_t rows() const {
4843  return matrix_.rows();
4844  }
4845  //**********************************************************************************************
4846 
4847  //**Columns function****************************************************************************
// Returns the number of columns of the expression, i.e. of the wrapped matrix expression.
4852  inline size_t columns() const {
4853  return matrix_.columns();
4854  }
4855  //**********************************************************************************************
4856 
4857  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the wrapped matrix multiplication expression.
4862  inline LeftOperand leftOperand() const {
4863  return matrix_;
4864  }
4865  //**********************************************************************************************
4866 
4867  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the scalar factor.
4872  inline RightOperand rightOperand() const {
4873  return scalar_;
4874  }
4875  //**********************************************************************************************
4876 
4877  //**********************************************************************************************
// Returns whether the expression can alias with the given address; the query is forwarded to the
// wrapped matrix expression.
//   alias  The address to be checked.
4883  template< typename T >
4884  inline bool canAlias( const T* alias ) const {
4885  return matrix_.canAlias( alias );
4886  }
4887  //**********************************************************************************************
4888 
4889  //**********************************************************************************************
// Returns whether the expression is aliased with the given address; the query is forwarded to
// the wrapped matrix expression.
//   alias  The address to be checked.
4895  template< typename T >
4896  inline bool isAliased( const T* alias ) const {
4897  return matrix_.isAliased( alias );
4898  }
4899  //**********************************************************************************************
4900 
4901  //**********************************************************************************************
// Returns the alignment property reported by the wrapped matrix expression.
4906  inline bool isAligned() const {
4907  return matrix_.isAligned();
4908  }
4909  //**********************************************************************************************
4910 
4911  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments. The element count of the result
// must reach SMP_TDMATTDMATMULT_THRESHOLD, and in addition at least one alternative of the first
// group must hold: BLAS mode is off, BLAS matrix/matrix multiplication is off, or the element
// count is below TDMATTDMATMULT_THRESHOLD.
// NOTE(review): one further alternative of the first group (listing line 4919) is absent from
// this listing.
4916  inline bool canSMPAssign() const noexcept {
4917  return ( !BLAZE_BLAS_MODE ||
4918  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4920  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4921  ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
4922  }
4923  //**********************************************************************************************
4924 
4925  private:
4926  //**Member variables****************************************************************************
// Left-hand side operand: the wrapped matrix multiplication expression.
4927  LeftOperand matrix_;
// Right-hand side operand: the scalar factor.
4928  RightOperand scalar_;
4929  //**********************************************************************************************
4930 
4931  //**Assignment to dense matrices****************************************************************
// Assignment of the scaled matrix product to a dense matrix: the two operands of the wrapped
// product are extracted, evaluated serially into objects of type LT/RT and passed, together with
// the scalar, to the kernel selection.
// NOTE(review): the declaration line between the template header and `assign(` (listing line
// 4945) is absent from this listing, so the return type / enabling condition is not visible.
4943  template< typename MT // Type of the target dense matrix
4944  , bool SO > // Storage order of the target dense matrix
4946  assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4947  {
4949 
4950  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4951  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4952 
// Operands of the wrapped matrix multiplication expression.
4953  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
4954  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
4955 
// An empty target needs no work; with an empty inner dimension the target is only reset.
4956  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4957  return;
4958  }
4959  else if( left.columns() == 0UL ) {
4960  reset( ~lhs );
4961  return;
4962  }
4963 
4964  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4965  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4966 
// Sanity checks: the evaluated operands must match the original operands and the target.
4967  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4968  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4969  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4970  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4971  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4972  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4973 
4974  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4975  }
4976  //**********************************************************************************************
4977 
4978  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = scalar * A * B.
// The small-matrix kernel is chosen if A is a diagonal matrix type, if (outside of debug mode)
// A has at most SIMDSIZE*10 rows, or if the target has fewer than TDMATTDMATMULT_THRESHOLD
// elements. In all other cases the BLAS kernel selection is used.
//   C       The target matrix.
//   A       The left-hand side multiplication operand.
//   B       The right-hand side multiplication operand.
//   scalar  The scaling factor.
4989  template< typename MT3 // Type of the left-hand side target matrix
4990  , typename MT4 // Type of the left-hand side matrix operand
4991  , typename MT5 // Type of the right-hand side matrix operand
4992  , typename ST2 > // Type of the scalar value
4993  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4994  {
4995  if( ( IsDiagonal<MT4>::value ) ||
4996  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
4997  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
4998  selectSmallAssignKernel( C, A, B, scalar );
4999  else
5000  selectBlasAssignKernel( C, A, B, scalar );
5001  }
5002  //**********************************************************************************************
5003 
5004  //**Default assignment to dense matrices (general/general)**************************************
// Default (non-vectorized) kernel for C = scalar * A * B, processing the target column by column.
// For each column j the first contributing k (kbegin) initializes the column, the remaining k
// values are accumulated, and the accumulated range is finally multiplied by the scalar.
// NOTE(review): the declaration line (listing line 5022) and several lines inside the index
// computations (listing lines 5050, 5055, 5084, 5089, 5107, 5110) are absent from this listing;
// the nested conditional expressions below are therefore shown incompletely.
//   C       The target matrix.
//   A       The left-hand side multiplication operand.
//   B       The right-hand side multiplication operand.
//   scalar  The scaling factor.
5018  template< typename MT3 // Type of the left-hand side target matrix
5019  , typename MT4 // Type of the left-hand side matrix operand
5020  , typename MT5 // Type of the right-hand side matrix operand
5021  , typename ST2 > // Type of the scalar value
5023  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5024  {
5025  const size_t M( A.rows() );
5026  const size_t N( B.columns() );
5027  const size_t K( A.columns() );
5028 
// Any declared structure of the result requires a square result.
5029  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5030 
5031  for( size_t j=0UL; j<N; ++j )
5032  {
// Range of k for column j, narrowed according to the triangular structure of B.
5033  const size_t kbegin( ( IsLower<MT5>::value )
5034  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5035  :( 0UL ) );
5036  const size_t kend( ( IsUpper<MT5>::value )
5037  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
5038  :( K ) );
5039  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5040 
// Empty k-range for a strictly triangular B: the whole column is reset.
5041  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
5042  for( size_t i=0UL; i<M; ++i ) {
5043  reset( C(i,j) );
5044  }
5045  continue;
5046  }
5047 
// Initialization of column j with the contribution of k == kbegin; elements outside of
// [ibegin,iend) are reset where the structure of the operands or the result demands it.
5048  {
5049  const size_t ibegin( ( IsLower<MT4>::value )
5051  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
5052  :( LOW ? max(j,kbegin) : kbegin ) )
5053  :( LOW ? j : 0UL ) );
5054  const size_t iend( ( IsUpper<MT4>::value )
5056  ?( UPP ? min(j+1UL,kbegin) : kbegin )
5057  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
5058  :( UPP ? j+1UL : M ) );
5059 
5060  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
5061  for( size_t i=0UL; i<ibegin; ++i ) {
5062  reset( C(i,j) );
5063  }
5064  }
5065  else if( IsStrictlyLower<MT4>::value ) {
5066  reset( C(0UL,j) );
5067  }
5068  for( size_t i=ibegin; i<iend; ++i ) {
5069  C(i,j) = A(i,kbegin) * B(kbegin,j);
5070  }
5071  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
5072  for( size_t i=iend; i<M; ++i ) {
5073  reset( C(i,j) );
5074  }
5075  }
5076  else if( IsStrictlyUpper<MT4>::value ) {
5077  reset( C(M-1UL,j) );
5078  }
5079  }
5080 
// Accumulation of the remaining k contributions.
5081  for( size_t k=kbegin+1UL; k<kend; ++k )
5082  {
5083  const size_t ibegin( ( IsLower<MT4>::value )
5085  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
5086  :( SYM || HERM || LOW ? max( j, k ) : k ) )
5087  :( SYM || HERM || LOW ? j : 0UL ) );
5088  const size_t iend( ( IsUpper<MT4>::value )
5090  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
5091  :( UPP ? min(j+1UL,k) : k ) )
5092  :( UPP ? j+1UL : M ) );
5093 
// With a structured result the row range can be empty for this k.
5094  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
5095  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5096 
5097  for( size_t i=ibegin; i<iend; ++i ) {
5098  C(i,j) += A(i,k) * B(k,j);
5099  }
// For an upper A the element at row iend is assigned (not accumulated) for this k.
5100  if( IsUpper<MT4>::value ) {
5101  C(iend,j) = A(iend,k) * B(k,j);
5102  }
5103  }
5104 
// Scaling of the computed part of column j.
5105  {
5106  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
5108  :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
5109  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
5111  :( UPP ? j+1UL : M ) );
5112 
5113  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
5114  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5115 
5116  for( size_t i=ibegin; i<iend; ++i ) {
5117  C(i,j) *= scalar;
5118  }
5119  }
5120  }
5121 
// Symmetric/Hermitian result: the elements above the diagonal (i<j) are filled from their
// mirrored counterparts C(j,i), conjugated in the Hermitian case.
5122  if( SYM || HERM ) {
5123  for( size_t j=1UL; j<N; ++j ) {
5124  for( size_t i=0UL; i<j; ++i ) {
5125  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5126  }
5127  }
5128  }
5129  }
5130  //**********************************************************************************************
5131 
5132  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default kernel for C = scalar * A * B with a non-diagonal A and a diagonal B (see the EnableIf_
// condition): only B(j,j) contributes to column j, so C(i,j) = A(i,j) * B(j,j) * scalar.
// NOTE(review): listing line 5153 is absent from this listing.
//   C       The target matrix.
//   A       The left-hand side multiplication operand.
//   B       The right-hand side (diagonal) multiplication operand.
//   scalar  The scaling factor.
5146  template< typename MT3 // Type of the left-hand side target matrix
5147  , typename MT4 // Type of the left-hand side matrix operand
5148  , typename MT5 // Type of the right-hand side matrix operand
5149  , typename ST2 > // Type of the scalar value
5150  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
5151  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5152  {
5154 
5155  const size_t M( A.rows() );
5156  const size_t N( B.columns() );
5157 
5158  for( size_t j=0UL; j<N; ++j )
5159  {
// Row range of column j, narrowed according to the triangular structure of A.
5160  const size_t ibegin( ( IsLower<MT4>::value )
5161  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
5162  :( 0UL ) );
5163  const size_t iend( ( IsUpper<MT4>::value )
5164  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
5165  :( M ) );
5166  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5167 
// Rows above the range are reset for a lower A.
5168  if( IsLower<MT4>::value ) {
5169  for( size_t i=0UL; i<ibegin; ++i ) {
5170  reset( C(i,j) );
5171  }
5172  }
5173  for( size_t i=ibegin; i<iend; ++i ) {
5174  C(i,j) = A(i,j) * B(j,j) * scalar;
5175  }
// Rows below the range are reset for an upper A.
5176  if( IsUpper<MT4>::value ) {
5177  for( size_t i=iend; i<M; ++i ) {
5178  reset( C(i,j) );
5179  }
5180  }
5181  }
5182  }
5183  //**********************************************************************************************
5184 
5185  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default kernel for C = scalar * A * B in which only the diagonal of A is read:
// C(i,j) = A(i,i) * B(i,j) * scalar, with the row range of each column taken from the
// triangular structure of B.
// NOTE(review): the declaration line with the enabling condition (listing line 5203) and listing
// line 5206 are absent from this listing; presumably this overload is selected for a diagonal A
// and a non-diagonal B - confirm.
// NOTE(review): the two reset guards below test MT4 while ibegin/iend are derived from MT5. If
// MT4 is a diagonal type both guards always hold and the loops are bounded by ibegin/iend alone;
// confirm whether MT5 was intended.
//   C       The target matrix.
//   A       The left-hand side multiplication operand.
//   B       The right-hand side multiplication operand.
//   scalar  The scaling factor.
5199  template< typename MT3 // Type of the left-hand side target matrix
5200  , typename MT4 // Type of the left-hand side matrix operand
5201  , typename MT5 // Type of the right-hand side matrix operand
5202  , typename ST2 > // Type of the scalar value
5204  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5205  {
5207 
5208  const size_t M( A.rows() );
5209  const size_t N( B.columns() );
5210 
5211  for( size_t j=0UL; j<N; ++j )
5212  {
// Row range of column j, narrowed according to the triangular structure of B.
5213  const size_t ibegin( ( IsLower<MT5>::value )
5214  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5215  :( 0UL ) );
5216  const size_t iend( ( IsUpper<MT5>::value )
5217  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
5218  :( M ) );
5219  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5220 
5221  if( IsLower<MT4>::value ) {
5222  for( size_t i=0UL; i<ibegin; ++i ) {
5223  reset( C(i,j) );
5224  }
5225  }
5226  for( size_t i=ibegin; i<iend; ++i ) {
5227  C(i,j) = A(i,i) * B(i,j) * scalar;
5228  }
5229  if( IsUpper<MT4>::value ) {
5230  for( size_t i=iend; i<M; ++i ) {
5231  reset( C(i,j) );
5232  }
5233  }
5234  }
5235  }
5236  //**********************************************************************************************
5237 
5238  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default kernel for C = scalar * A * B with two diagonal operands (see the EnableIf_ condition):
// the target is reset and only its diagonal is written, C(i,i) = A(i,i) * B(i,i) * scalar.
// NOTE(review): listing line 5259 is absent from this listing.
//   C       The target matrix.
//   A       The left-hand side (diagonal) multiplication operand.
//   B       The right-hand side (diagonal) multiplication operand.
//   scalar  The scaling factor.
5252  template< typename MT3 // Type of the left-hand side target matrix
5253  , typename MT4 // Type of the left-hand side matrix operand
5254  , typename MT5 // Type of the right-hand side matrix operand
5255  , typename ST2 > // Type of the scalar value
5256  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
5257  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5258  {
5260 
// Clear all elements first; only the diagonal is written afterwards.
5261  reset( C );
5262 
5263  for( size_t i=0UL; i<A.rows(); ++i ) {
5264  C(i,i) = A(i,i) * B(i,i) * scalar;
5265  }
5266  }
5267  //**********************************************************************************************
5268 
5269  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix kernel fallback: forwards unchanged to the default assignment kernel.
// NOTE(review): the declaration line with the enabling condition (listing line 5287) is absent
// from this listing.
//   C       The target matrix.
//   A       The left-hand side multiplication operand.
//   B       The right-hand side multiplication operand.
//   scalar  The scaling factor.
5283  template< typename MT3 // Type of the left-hand side target matrix
5284  , typename MT4 // Type of the left-hand side matrix operand
5285  , typename MT5 // Type of the right-hand side matrix operand
5286  , typename ST2 > // Type of the scalar value
5288  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5289  {
5290  selectDefaultAssignKernel( C, A, B, scalar );
5291  }
5292  //**********************************************************************************************
5293 
5294  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Small-matrix kernel for a row-major target (DenseMatrix<MT3,false>): one of the two operands is
// converted into its opposite storage order (OppositeType_) by a serial evaluation, and the
// scaled product of the converted and the untouched operand is assigned through the forwarding
// functor.
// NOTE(review): the declaration line (listing line 5313), listing lines 5316-5319 and the
// conditions of the first two branches (listing lines 5323 and 5327) are absent from this
// listing; only the bodies of those two branches are visible.
//   C       The target matrix.
//   A       The left-hand side multiplication operand.
//   B       The right-hand side multiplication operand.
//   scalar  The scaling factor.
5309  template< typename MT3 // Type of the left-hand side target matrix
5310  , typename MT4 // Type of the left-hand side matrix operand
5311  , typename MT5 // Type of the right-hand side matrix operand
5312  , typename ST2 > // Type of the scalar value
5314  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5315  {
5320 
5321  const ForwardFunctor fwd;
5322 
5324  const OppositeType_<MT5> tmp( serial( B ) );
5325  assign( ~C, fwd( A * tmp ) * scalar );
5326  }
5328  const OppositeType_<MT4> tmp( serial( A ) );
5329  assign( ~C, fwd( tmp * B ) * scalar );
5330  }
// Size-based choice: B is converted when it has no more elements than A, otherwise A is converted.
5331  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5332  const OppositeType_<MT5> tmp( serial( B ) );
5333  assign( ~C, fwd( A * tmp ) * scalar );
5334  }
5335  else {
5336  const OppositeType_<MT4> tmp( serial( A ) );
5337  assign( ~C, fwd( tmp * B ) * scalar );
5338  }
5339  }
5340  //**********************************************************************************************
5341 
5342  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
5357  template< typename MT3 // Type of the left-hand side target matrix
5358  , typename MT4 // Type of the left-hand side matrix operand
5359  , typename MT5 // Type of the right-hand side matrix operand
5360  , typename ST2 > // Type of the scalar value
5362  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5363  {
5364  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
5365 
5366  const size_t M( A.rows() );
5367  const size_t N( B.columns() );
5368  const size_t K( A.columns() );
5369 
5370  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5371 
5372  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
5373  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5374 
5375  const SIMDType factor( set( scalar ) );
5376 
5377  if( LOW && UPP && M > SIMDSIZE*3UL ) {
5378  reset( ~C );
5379  }
5380 
5381  {
5382  size_t i( 0UL );
5383 
5385  {
5386  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5387  for( size_t j=0UL; j<N; ++j )
5388  {
5389  const size_t kbegin( ( IsLower<MT5>::value )
5390  ?( ( IsUpper<MT4>::value )
5391  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5392  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5393  :( IsUpper<MT4>::value ? i : 0UL ) );
5394  const size_t kend( ( IsUpper<MT5>::value )
5395  ?( ( IsLower<MT4>::value )
5396  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
5397  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
5398  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
5399 
5400  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5401 
5402  for( size_t k=kbegin; k<kend; ++k ) {
5403  const SIMDType b1( set( B(k,j) ) );
5404  xmm1 += A.load(i ,k) * b1;
5405  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5406  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5407  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5408  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5409  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
5410  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
5411  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
5412  }
5413 
5414  (~C).store( i , j, xmm1 * factor );
5415  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5416  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5417  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5418  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5419  (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
5420  (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
5421  (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
5422  }
5423  }
5424  }
5425 
5426  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5427  {
5428  size_t j( 0UL );
5429 
5430  for( ; (j+2UL) <= N; j+=2UL )
5431  {
5432  const size_t kbegin( ( IsLower<MT5>::value )
5433  ?( ( IsUpper<MT4>::value )
5434  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5435  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5436  :( IsUpper<MT4>::value ? i : 0UL ) );
5437  const size_t kend( ( IsUpper<MT5>::value )
5438  ?( ( IsLower<MT4>::value )
5439  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5440  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5441  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
5442 
5443  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5444 
5445  for( size_t k=kbegin; k<kend; ++k ) {
5446  const SIMDType a1( A.load(i ,k) );
5447  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5448  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5449  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5450  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5451  const SIMDType b1( set( B(k,j ) ) );
5452  const SIMDType b2( set( B(k,j+1UL) ) );
5453  xmm1 += a1 * b1;
5454  xmm2 += a2 * b1;
5455  xmm3 += a3 * b1;
5456  xmm4 += a4 * b1;
5457  xmm5 += a5 * b1;
5458  xmm6 += a1 * b2;
5459  xmm7 += a2 * b2;
5460  xmm8 += a3 * b2;
5461  xmm9 += a4 * b2;
5462  xmm10 += a5 * b2;
5463  }
5464 
5465  (~C).store( i , j , xmm1 * factor );
5466  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5467  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5468  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5469  (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
5470  (~C).store( i , j+1UL, xmm6 * factor );
5471  (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
5472  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
5473  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
5474  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
5475  }
5476 
5477  if( j < N )
5478  {
5479  const size_t kbegin( ( IsLower<MT5>::value )
5480  ?( ( IsUpper<MT4>::value )
5481  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5482  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5483  :( IsUpper<MT4>::value ? i : 0UL ) );
5484  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
5485 
5486  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5487 
5488  for( size_t k=kbegin; k<kend; ++k ) {
5489  const SIMDType b1( set( B(k,j) ) );
5490  xmm1 += A.load(i ,k) * b1;
5491  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5492  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5493  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5494  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5495  }
5496 
5497  (~C).store( i , j, xmm1 * factor );
5498  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5499  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5500  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5501  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5502  }
5503  }
5504 
5505  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5506  {
5507  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
5508  size_t j( UPP ? i : 0UL );
5509 
5510  for( ; (j+2UL) <= jend; j+=2UL )
5511  {
5512  const size_t kbegin( ( IsLower<MT5>::value )
5513  ?( ( IsUpper<MT4>::value )
5514  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5515  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5516  :( IsUpper<MT4>::value ? i : 0UL ) );
5517  const size_t kend( ( IsUpper<MT5>::value )
5518  ?( ( IsLower<MT4>::value )
5519  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5520  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5521  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
5522 
5523  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5524 
5525  for( size_t k=kbegin; k<kend; ++k ) {
5526  const SIMDType a1( A.load(i ,k) );
5527  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5528  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5529  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5530  const SIMDType b1( set( B(k,j ) ) );
5531  const SIMDType b2( set( B(k,j+1UL) ) );
5532  xmm1 += a1 * b1;
5533  xmm2 += a2 * b1;
5534  xmm3 += a3 * b1;
5535  xmm4 += a4 * b1;
5536  xmm5 += a1 * b2;
5537  xmm6 += a2 * b2;
5538  xmm7 += a3 * b2;
5539  xmm8 += a4 * b2;
5540  }
5541 
5542  (~C).store( i , j , xmm1 * factor );
5543  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5544  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5545  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5546  (~C).store( i , j+1UL, xmm5 * factor );
5547  (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
5548  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
5549  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
5550  }
5551 
5552  if( j < jend )
5553  {
5554  const size_t kbegin( ( IsLower<MT5>::value )
5555  ?( ( IsUpper<MT4>::value )
5556  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5557  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5558  :( IsUpper<MT4>::value ? i : 0UL ) );
5559  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
5560 
5561  SIMDType xmm1, xmm2, xmm3, xmm4;
5562 
5563  for( size_t k=kbegin; k<kend; ++k ) {
5564  const SIMDType b1( set( B(k,j) ) );
5565  xmm1 += A.load(i ,k) * b1;
5566  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5567  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5568  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5569  }
5570 
5571  (~C).store( i , j, xmm1 * factor );
5572  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5573  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5574  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5575  }
5576  }
5577 
5578  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5579  {
5580  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
5581  size_t j( UPP ? i : 0UL );
5582 
5583  for( ; (j+2UL) <= jend; j+=2UL )
5584  {
5585  const size_t kbegin( ( IsLower<MT5>::value )
5586  ?( ( IsUpper<MT4>::value )
5587  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5588  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5589  :( IsUpper<MT4>::value ? i : 0UL ) );
5590  const size_t kend( ( IsUpper<MT5>::value )
5591  ?( ( IsLower<MT4>::value )
5592  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5593  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5594  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
5595 
5596  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5597 
5598  for( size_t k=kbegin; k<kend; ++k ) {
5599  const SIMDType a1( A.load(i ,k) );
5600  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5601  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5602  const SIMDType b1( set( B(k,j ) ) );
5603  const SIMDType b2( set( B(k,j+1UL) ) );
5604  xmm1 += a1 * b1;
5605  xmm2 += a2 * b1;
5606  xmm3 += a3 * b1;
5607  xmm4 += a1 * b2;
5608  xmm5 += a2 * b2;
5609  xmm6 += a3 * b2;
5610  }
5611 
5612  (~C).store( i , j , xmm1 * factor );
5613  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5614  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5615  (~C).store( i , j+1UL, xmm4 * factor );
5616  (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
5617  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
5618  }
5619 
5620  if( j < jend )
5621  {
5622  const size_t kbegin( ( IsLower<MT5>::value )
5623  ?( ( IsUpper<MT4>::value )
5624  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5625  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5626  :( IsUpper<MT4>::value ? i : 0UL ) );
5627  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
5628 
5629  SIMDType xmm1, xmm2, xmm3;
5630 
5631  for( size_t k=kbegin; k<kend; ++k ) {
5632  const SIMDType b1( set( B(k,j) ) );
5633  xmm1 += A.load(i ,k) * b1;
5634  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5635  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5636  }
5637 
5638  (~C).store( i , j, xmm1 * factor );
5639  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5640  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5641  }
5642  }
5643 
5644  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5645  {
5646  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
5647  size_t j( UPP ? i : 0UL );
5648 
5649  for( ; (j+4UL) <= jend; j+=4UL )
5650  {
5651  const size_t kbegin( ( IsLower<MT5>::value )
5652  ?( ( IsUpper<MT4>::value )
5653  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5654  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5655  :( IsUpper<MT4>::value ? i : 0UL ) );
5656  const size_t kend( ( IsUpper<MT5>::value )
5657  ?( ( IsLower<MT4>::value )
5658  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
5659  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
5660  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
5661 
5662  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5663 
5664  for( size_t k=kbegin; k<kend; ++k ) {
5665  const SIMDType a1( A.load(i ,k) );
5666  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5667  const SIMDType b1( set( B(k,j ) ) );
5668  const SIMDType b2( set( B(k,j+1UL) ) );
5669  const SIMDType b3( set( B(k,j+2UL) ) );
5670  const SIMDType b4( set( B(k,j+3UL) ) );
5671  xmm1 += a1 * b1;
5672  xmm2 += a2 * b1;
5673  xmm3 += a1 * b2;
5674  xmm4 += a2 * b2;
5675  xmm5 += a1 * b3;
5676  xmm6 += a2 * b3;
5677  xmm7 += a1 * b4;
5678  xmm8 += a2 * b4;
5679  }
5680 
5681  (~C).store( i , j , xmm1 * factor );
5682  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
5683  (~C).store( i , j+1UL, xmm3 * factor );
5684  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5685  (~C).store( i , j+2UL, xmm5 * factor );
5686  (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5687  (~C).store( i , j+3UL, xmm7 * factor );
5688  (~C).store( i+SIMDSIZE, j+3UL, xmm8 * factor );
5689  }
5690 
5691  for( ; (j+3UL) <= jend; j+=3UL )
5692  {
5693  const size_t kbegin( ( IsLower<MT5>::value )
5694  ?( ( IsUpper<MT4>::value )
5695  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5696  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5697  :( IsUpper<MT4>::value ? i : 0UL ) );
5698  const size_t kend( ( IsUpper<MT5>::value )
5699  ?( ( IsLower<MT4>::value )
5700  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
5701  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
5702  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
5703 
5704  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5705 
5706  for( size_t k=kbegin; k<kend; ++k ) {
5707  const SIMDType a1( A.load(i ,k) );
5708  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5709  const SIMDType b1( set( B(k,j ) ) );
5710  const SIMDType b2( set( B(k,j+1UL) ) );
5711  const SIMDType b3( set( B(k,j+2UL) ) );
5712  xmm1 += a1 * b1;
5713  xmm2 += a2 * b1;
5714  xmm3 += a1 * b2;
5715  xmm4 += a2 * b2;
5716  xmm5 += a1 * b3;
5717  xmm6 += a2 * b3;
5718  }
5719 
5720  (~C).store( i , j , xmm1 * factor );
5721  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
5722  (~C).store( i , j+1UL, xmm3 * factor );
5723  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5724  (~C).store( i , j+2UL, xmm5 * factor );
5725  (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5726  }
5727 
5728  for( ; (j+2UL) <= jend; j+=2UL )
5729  {
5730  const size_t kbegin( ( IsLower<MT5>::value )
5731  ?( ( IsUpper<MT4>::value )
5732  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5733  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5734  :( IsUpper<MT4>::value ? i : 0UL ) );
5735  const size_t kend( ( IsUpper<MT5>::value )
5736  ?( ( IsLower<MT4>::value )
5737  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5738  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5739  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
5740 
5741  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5742  size_t k( kbegin );
5743 
5744  for( ; (k+2UL) <= kend; k+=2UL ) {
5745  const SIMDType a1( A.load(i ,k ) );
5746  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
5747  const SIMDType a3( A.load(i ,k+1UL) );
5748  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
5749  const SIMDType b1( set( B(k ,j ) ) );
5750  const SIMDType b2( set( B(k ,j+1UL) ) );
5751  const SIMDType b3( set( B(k+1UL,j ) ) );
5752  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
5753  xmm1 += a1 * b1;
5754  xmm2 += a2 * b1;
5755  xmm3 += a1 * b2;
5756  xmm4 += a2 * b2;
5757  xmm5 += a3 * b3;
5758  xmm6 += a4 * b3;
5759  xmm7 += a3 * b4;
5760  xmm8 += a4 * b4;
5761  }
5762 
5763  for( ; k<kend; ++k ) {
5764  const SIMDType a1( A.load(i ,k) );
5765  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5766  const SIMDType b1( set( B(k,j ) ) );
5767  const SIMDType b2( set( B(k,j+1UL) ) );
5768  xmm1 += a1 * b1;
5769  xmm2 += a2 * b1;
5770  xmm3 += a1 * b2;
5771  xmm4 += a2 * b2;
5772  }
5773 
5774  (~C).store( i , j , (xmm1+xmm5) * factor );
5775  (~C).store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
5776  (~C).store( i , j+1UL, (xmm3+xmm7) * factor );
5777  (~C).store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
5778  }
5779 
5780  if( j < jend )
5781  {
5782  const size_t kbegin( ( IsLower<MT5>::value )
5783  ?( ( IsUpper<MT4>::value )
5784  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5785  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5786  :( IsUpper<MT4>::value ? i : 0UL ) );
5787  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
5788 
5789  SIMDType xmm1, xmm2, xmm3, xmm4;
5790  size_t k( kbegin );
5791 
5792  for( ; (k+2UL) <= kend; k+=2UL ) {
5793  const SIMDType b1( set( B(k ,j) ) );
5794  const SIMDType b2( set( B(k+1UL,j) ) );
5795  xmm1 += A.load(i ,k ) * b1;
5796  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
5797  xmm3 += A.load(i ,k+1UL) * b2;
5798  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
5799  }
5800 
5801  for( ; k<kend; ++k ) {
5802  const SIMDType b1( set( B(k,j) ) );
5803  xmm1 += A.load(i ,k) * b1;
5804  xmm2 += A.load(i+SIMDSIZE,k) * b1;
5805  }
5806 
5807  (~C).store( i , j, (xmm1+xmm3) * factor );
5808  (~C).store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
5809  }
5810  }
5811 
5812  for( ; i<ipos; i+=SIMDSIZE )
5813  {
5814  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
5815  size_t j( UPP ? i : 0UL );
5816 
5817  for( ; (j+4UL) <= jend; j+=4UL )
5818  {
5819  const size_t kbegin( ( IsLower<MT5>::value )
5820  ?( ( IsUpper<MT4>::value )
5821  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5822  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5823  :( IsUpper<MT4>::value ? i : 0UL ) );
5824  const size_t kend( ( IsUpper<MT5>::value )
5825  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
5826  :( K ) );
5827 
5828  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5829  size_t k( kbegin );
5830 
5831  for( ; (k+2UL) <= kend; k+=2UL ) {
5832  const SIMDType a1( A.load(i,k ) );
5833  const SIMDType a2( A.load(i,k+1UL) );
5834  xmm1 += a1 * set( B(k ,j ) );
5835  xmm2 += a1 * set( B(k ,j+1UL) );
5836  xmm3 += a1 * set( B(k ,j+2UL) );
5837  xmm4 += a1 * set( B(k ,j+3UL) );
5838  xmm5 += a2 * set( B(k+1UL,j ) );
5839  xmm6 += a2 * set( B(k+1UL,j+1UL) );
5840  xmm7 += a2 * set( B(k+1UL,j+2UL) );
5841  xmm8 += a2 * set( B(k+1UL,j+3UL) );
5842  }
5843 
5844  for( ; k<kend; ++k ) {
5845  const SIMDType a1( A.load(i,k) );
5846  xmm1 += a1 * set( B(k,j ) );
5847  xmm2 += a1 * set( B(k,j+1UL) );
5848  xmm3 += a1 * set( B(k,j+2UL) );
5849  xmm4 += a1 * set( B(k,j+3UL) );
5850  }
5851 
5852  (~C).store( i, j , (xmm1+xmm5) * factor );
5853  (~C).store( i, j+1UL, (xmm2+xmm6) * factor );
5854  (~C).store( i, j+2UL, (xmm3+xmm7) * factor );
5855  (~C).store( i, j+3UL, (xmm4+xmm8) * factor );
5856  }
5857 
5858  for( ; (j+3UL) <= jend; j+=3UL )
5859  {
5860  const size_t kbegin( ( IsLower<MT5>::value )
5861  ?( ( IsUpper<MT4>::value )
5862  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5863  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5864  :( IsUpper<MT4>::value ? i : 0UL ) );
5865  const size_t kend( ( IsUpper<MT5>::value )
5866  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
5867  :( K ) );
5868 
5869  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5870  size_t k( kbegin );
5871 
5872  for( ; (k+2UL) <= kend; k+=2UL ) {
5873  const SIMDType a1( A.load(i,k ) );
5874  const SIMDType a2( A.load(i,k+1UL) );
5875  xmm1 += a1 * set( B(k ,j ) );
5876  xmm2 += a1 * set( B(k ,j+1UL) );
5877  xmm3 += a1 * set( B(k ,j+2UL) );
5878  xmm4 += a2 * set( B(k+1UL,j ) );
5879  xmm5 += a2 * set( B(k+1UL,j+1UL) );
5880  xmm6 += a2 * set( B(k+1UL,j+2UL) );
5881  }
5882 
5883  for( ; k<kend; ++k ) {
5884  const SIMDType a1( A.load(i,k) );
5885  xmm1 += a1 * set( B(k,j ) );
5886  xmm2 += a1 * set( B(k,j+1UL) );
5887  xmm3 += a1 * set( B(k,j+2UL) );
5888  }
5889 
5890  (~C).store( i, j , (xmm1+xmm4) * factor );
5891  (~C).store( i, j+1UL, (xmm2+xmm5) * factor );
5892  (~C).store( i, j+2UL, (xmm3+xmm6) * factor );
5893  }
5894 
5895  for( ; (j+2UL) <= jend; j+=2UL )
5896  {
5897  const size_t kbegin( ( IsLower<MT5>::value )
5898  ?( ( IsUpper<MT4>::value )
5899  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5900  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5901  :( IsUpper<MT4>::value ? i : 0UL ) );
5902  const size_t kend( ( IsUpper<MT5>::value )
5903  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
5904  :( K ) );
5905 
5906  SIMDType xmm1, xmm2, xmm3, xmm4;
5907  size_t k( kbegin );
5908 
5909  for( ; k<kend; ++k ) {
5910  const SIMDType a1( A.load(i,k) );
5911  xmm1 += a1 * set( B(k,j ) );
5912  xmm2 += a1 * set( B(k,j+1UL) );
5913  }
5914 
5915  for( ; (k+2UL) <= kend; k+=2UL ) {
5916  const SIMDType a1( A.load(i,k ) );
5917  const SIMDType a2( A.load(i,k+1UL) );
5918  xmm1 += a1 * set( B(k ,j ) );
5919  xmm2 += a1 * set( B(k ,j+1UL) );
5920  xmm3 += a2 * set( B(k+1UL,j ) );
5921  xmm4 += a2 * set( B(k+1UL,j+1UL) );
5922  }
5923 
5924  (~C).store( i, j , (xmm1+xmm3) * factor );
5925  (~C).store( i, j+1UL, (xmm2+xmm4) * factor );
5926  }
5927 
5928  if( j < jend )
5929  {
5930  const size_t kbegin( ( IsLower<MT5>::value )
5931  ?( ( IsUpper<MT4>::value )
5932  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5933  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5934  :( IsUpper<MT4>::value ? i : 0UL ) );
5935 
5936  SIMDType xmm1, xmm2;
5937  size_t k( kbegin );
5938 
5939  for( ; (k+2UL) <= K; k+=2UL ) {
5940  xmm1 += A.load(i,k ) * set( B(k ,j) );
5941  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
5942  }
5943 
5944  for( ; k<K; ++k ) {
5945  xmm1 += A.load(i,k) * set( B(k,j) );
5946  }
5947 
5948  (~C).store( i, j, (xmm1+xmm2) * factor );
5949  }
5950  }
5951 
5952  for( ; remainder && i<M; ++i )
5953  {
5954  size_t j( LOW && UPP ? i : 0UL );
5955 
5956  for( ; (j+2UL) <= N; j+=2UL )
5957  {
5958  const size_t kbegin( ( IsLower<MT5>::value )
5959  ?( ( IsUpper<MT4>::value )
5960  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5961  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5962  :( IsUpper<MT4>::value ? i : 0UL ) );
5963  const size_t kend( ( IsUpper<MT5>::value )
5964  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
5965  :( K ) );
5966 
5967  ElementType value1 = ElementType();
5968  ElementType value2 = ElementType();
5969 
5970  for( size_t k=kbegin; k<kend; ++k ) {
5971  value1 += A(i,k) * B(k,j );
5972  value2 += A(i,k) * B(k,j+1UL);
5973  }
5974 
5975  (~C)(i,j ) = value1 * scalar;
5976  (~C)(i,j+1UL) = value2 * scalar;
5977  }
5978 
5979  if( j < N )
5980  {
5981  const size_t kbegin( ( IsLower<MT5>::value )
5982  ?( ( IsUpper<MT4>::value )
5983  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5984  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5985  :( IsUpper<MT4>::value ? i : 0UL ) );
5986 
5987  ElementType value = ElementType();
5988 
5989  for( size_t k=kbegin; k<K; ++k ) {
5990  value += A(i,k) * B(k,j);
5991  }
5992 
5993  (~C)(i,j) = value * scalar;
5994  }
5995  }
5996  }
5997 
5998  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
5999  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
6000  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6001  for( size_t i=0UL; i<iend; ++i ) {
6002  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
6003  }
6004  }
6005  }
6006  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
6007  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
6008  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6009  for( size_t i=0UL; i<iend; ++i ) {
6010  reset( (~C)(i,j) );
6011  }
6012  }
6013  }
6014  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
6015  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
6016  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
6017  for( size_t j=0UL; j<jend; ++j ) {
6018  reset( (~C)(i,j) );
6019  }
6020  }
6021  }
6022  }
6023  //**********************************************************************************************
6024 
6025  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel, fallback overload: forwards all arguments unchanged to
// selectDefaultAssignKernel().
//
// C       The target left-hand side dense matrix.
// A       The left-hand side multiplication operand.
// B       The right-hand side multiplication operand.
// scalar  The scaling factor.
//
// NOTE(review): listing line 6043 (presumably the return type / enabling condition that
// distinguishes this overload from the vectorized one) is elided; confirm upstream.
6039  template< typename MT3 // Type of the left-hand side target matrix
6040  , typename MT4 // Type of the left-hand side matrix operand
6041  , typename MT5 // Type of the right-hand side matrix operand
6042  , typename ST2 > // Type of the scalar value
6044  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6045  {
6046  selectDefaultAssignKernel( C, A, B, scalar );
6047  }
6048  //**********************************************************************************************
6049 
6050  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Large-matrix assignment kernel, vectorized overload: picks one of the *mmm routines based on
// the SYM/HERM/LOW/UPP flags (defined outside this excerpt), checked in that order.
//
// C       The target left-hand side dense matrix.
// A       The left-hand side multiplication operand.
// B       The right-hand side multiplication operand.
// scalar  The scaling factor.
//
// lmmm/ummm/mmm receive an extra ST2(0) argument that smmm/hmmm do not take; presumably it is
// the scaling applied to the existing content of C - TODO confirm against the MMM header.
//
// NOTE(review): listing line 6069 (return type / enabling condition) is elided.
6065  template< typename MT3 // Type of the left-hand side target matrix
6066  , typename MT4 // Type of the left-hand side matrix operand
6067  , typename MT5 // Type of the right-hand side matrix operand
6068  , typename ST2 > // Type of the scalar value
6070  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6071  {
6072  if( SYM )
6073  smmm( C, A, B, scalar );
6074  else if( HERM )
6075  hmmm( C, A, B, scalar );
6076  else if( LOW )
6077  lmmm( C, A, B, scalar, ST2(0) );
6078  else if( UPP )
6079  ummm( C, A, B, scalar, ST2(0) );
6080  else
6081  mmm( C, A, B, scalar, ST2(0) );
6082  }
6083  //**********************************************************************************************
6084 
6085  //**BLAS-based assignment to dense matrices (default)*******************************************
// BLAS assignment kernel, fallback overload: no BLAS call is made here; the arguments are
// forwarded unchanged to selectLargeAssignKernel().
//
// C       The target left-hand side dense matrix.
// A       The left-hand side multiplication operand.
// B       The right-hand side multiplication operand.
// scalar  The scaling factor.
//
// NOTE(review): listing line 6103 (return type / enabling condition) is elided; confirm
// upstream under which conditions this overload is chosen over the BLAS-backed one.
6099  template< typename MT3 // Type of the left-hand side target matrix
6100  , typename MT4 // Type of the left-hand side matrix operand
6101  , typename MT5 // Type of the right-hand side matrix operand
6102  , typename ST2 > // Type of the scalar value
6104  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6105  {
6106  selectLargeAssignKernel( C, A, B, scalar );
6107  }
6108  //**********************************************************************************************
6109 
6110  //**BLAS-based assignment to dense matrices*****************************************************
6111 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6112 
// BLAS-backed assignment kernel (only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled).
//
// C       The target left-hand side dense matrix.
// A       The left-hand side multiplication operand.
// B       The right-hand side multiplication operand.
// scalar  The scaling factor; converted to the element type of C before the BLAS call.
//
// NOTE(review): listing line 6129 (return type / enabling condition) is elided.
6125  template< typename MT3 // Type of the left-hand side target matrix
6126  , typename MT4 // Type of the left-hand side matrix operand
6127  , typename MT5 // Type of the right-hand side matrix operand
6128  , typename ST2 > // Type of the scalar value
6130  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6131  {
6132  using ET = ElementType_<MT3>;
6133 
// Triangular A: copy B into C, then call trmm with A on the left side.
6134  if( IsTriangular<MT4>::value ) {
6135  assign( C, B );
6136  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6137  }
// Triangular B: copy A into C, then call trmm with B on the right side.
6138  else if( IsTriangular<MT5>::value ) {
6139  assign( C, A );
6140  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6141  }
// General case: gemm with ET(scalar) and ET(0) as the two scalar arguments.
6142  else {
6143  gemm( C, A, B, ET(scalar), ET(0) );
6144  }
6145  }
6146 #endif
6147  //**********************************************************************************************
6148 
6149  //**Assignment to sparse matrices***************************************************************
// Assignment of the scaled product expression to a sparse matrix.
//
// lhs  The target left-hand side sparse matrix.
// rhs  The right-hand side scaled multiplication expression to be assigned.
//
// The expression is first evaluated serially into a temporary of type TmpType, which is then
// passed through the forward functor and assigned to the target.
//
// NOTE(review): listing lines 6163 and 6166-6175 are elided; they presumably hold the return
// type, the definition of TmpType and compile-time constraints - confirm upstream.
6161  template< typename MT // Type of the target sparse matrix
6162  , bool SO > // Storage order of the target sparse matrix
6164  assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6165  {
6167 
6169 
6176 
6177  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6178  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6179 
6180  const ForwardFunctor fwd;
6181 
6182  const TmpType tmp( serial( rhs ) );
6183  assign( ~lhs, fwd( tmp ) );
6184  }
6185  //**********************************************************************************************
6186 
6187  //**Restructuring assignment to row-major matrices**********************************************
// Restructuring assignment of the scaled product expression to a row-major matrix.
//
// lhs  The target left-hand side matrix.
// rhs  The right-hand side scaled multiplication expression to be assigned.
//
// The product is rewritten with one or both operands transposed, depending on which operands
// are symmetric, and the rewritten expression (times the stored scalar) is assigned instead.
//
// NOTE(review): listing lines 6202, 6205, 6207 and 6217 are elided. Line 6217 is the header of
// the first branch of the if/else chain below (the one transposing both operands); its
// condition is not visible here - confirm upstream.
6201  template< typename MT > // Type of the target matrix
6203  assign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
6204  {
6206 
6208 
6209  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6210  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6211 
6212  const ForwardFunctor fwd;
6213 
6214  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6215  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6216 
// First branch (header elided in this listing): transpose both operands.
6218  assign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// Symmetric left operand: transpose only the left operand.
6219  else if( IsSymmetric<MT1>::value )
6220  assign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Otherwise: transpose only the right operand.
6221  else
6222  assign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
6223  }
6224  //**********************************************************************************************
6225 
6226  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the scaled product expression to a dense matrix.
//
// lhs  The target left-hand side dense matrix.
// rhs  The right-hand side scaled multiplication expression to be added.
//
// Returns early when the target or the inner dimension is empty. Otherwise both operands are
// evaluated serially into LT/RT objects and the work is handed to selectAddAssignKernel().
//
// NOTE(review): listing lines 6240 and 6243 are elided (presumably the return type and a
// leading statement of the body); confirm upstream.
6238  template< typename MT // Type of the target dense matrix
6239  , bool SO > // Storage order of the target dense matrix
6241  addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6242  {
6244 
6245  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6246  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6247 
6248  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6249  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6250 
// Nothing to add if the target has no elements or the product has no inner dimension.
6251  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6252  return;
6253  }
6254 
6255  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6256  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6257 
6258  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6259  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6260  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6261  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6262  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6263  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6264 
6265  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
6266  }
6267  //**********************************************************************************************
6268 
6269  //**Addition assignment to dense matrices (kernel selection)************************************
6280  template< typename MT3 // Type of the left-hand side target matrix
6281  , typename MT4 // Type of the left-hand side matrix operand
6282  , typename MT5 // Type of the right-hand side matrix operand
6283  , typename ST2 > // Type of the scalar value
6284  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6285  {
6286  if( ( IsDiagonal<MT4>::value ) ||
6287  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
6288  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6289  selectSmallAddAssignKernel( C, A, B, scalar );
6290  else
6291  selectBlasAddAssignKernel( C, A, B, scalar );
6292  }
6293  //**********************************************************************************************
6294 
6295  //**Default addition assignment to dense matrices (general/general)*****************************
6309  template< typename MT3 // Type of the left-hand side target matrix
6310  , typename MT4 // Type of the left-hand side matrix operand
6311  , typename MT5 // Type of the right-hand side matrix operand
6312  , typename ST2 > // Type of the scalar value
6313  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6314  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6315  {
6316  const ResultType tmp( serial( A * B * scalar ) );
6317  addAssign( C, tmp );
6318  }
6319  //**********************************************************************************************
6320 
6321  //**Default addition assignment to dense matrices (general/diagonal)****************************
// Default addition assignment kernel, enabled when A is not diagonal and B is diagonal.
//
// C       The target left-hand side dense matrix.
// A       The left-hand side multiplication operand.
// B       The right-hand side (diagonal) multiplication operand.
// scalar  The scaling factor.
//
// With a diagonal B only B(j,j) contributes to column j, so each column of C receives
// A(i,j) * B(j,j) * scalar. The row range is narrowed by the IsLower/IsUpper traits of A and the
// rows are processed two at a time with a single-row tail.
//
// NOTE(review): listing line 6342 (first line of the body) is elided; confirm upstream.
6335  template< typename MT3 // Type of the left-hand side target matrix
6336  , typename MT4 // Type of the left-hand side matrix operand
6337  , typename MT5 // Type of the right-hand side matrix operand
6338  , typename ST2 > // Type of the scalar value
6339  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6340  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6341  {
6343 
6344  const size_t M( A.rows() );
6345  const size_t N( B.columns() );
6346 
6347  for( size_t j=0UL; j<N; ++j )
6348  {
6349  const size_t ibegin( ( IsLower<MT4>::value )
6350  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6351  :( 0UL ) );
6352  const size_t iend( ( IsUpper<MT4>::value )
6353  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
6354  :( M ) );
6355  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6356 
// ipos: end of the part of [ibegin,iend) that can be processed in pairs (even count).
6357  const size_t inum( iend - ibegin );
6358  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6359 
6360  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6361  C(i ,j) += A(i ,j) * B(j,j) * scalar;
6362  C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
6363  }
6364  if( ipos < iend ) {
6365  C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
6366  }
6367  }
6368  }
6369  //**********************************************************************************************
6370 
6371  //**Default addition assignment to dense matrices (diagonal/general)****************************
// Default kernel for C += A * B * scalar when the left-hand side operand type
// is diagonal and the right-hand side operand type is not (see EnableIf_ below).
// Only A(i,i) is read from A, so each element C(i,j) is updated with
// A(i,i) * B(i,j) * scalar.
//
// C       target matrix that receives the sum
// A       diagonal left-hand side operand
// B       non-diagonal right-hand side operand
// scalar  factor applied to the product
6385  template< typename MT3 // Type of the left-hand side target matrix
6386  , typename MT4 // Type of the left-hand side matrix operand
6387  , typename MT5 // Type of the right-hand side matrix operand
6388  , typename ST2 > // Type of the scalar value
6389  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6390  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6391  {
6393 
6394  const size_t M( A.rows() );
6395  const size_t N( B.columns() );
6396 
6397  for( size_t j=0UL; j<N; ++j )
6398  {
// Row range [ibegin,iend) for column j, narrowed by the triangular traits of
// B's type (here B, not A, decides which rows of column j are visited).
6399  const size_t ibegin( ( IsLower<MT5>::value )
6400  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6401  :( 0UL ) );
6402  const size_t iend( ( IsUpper<MT5>::value )
6403  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
6404  :( M ) );
6405  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6406 
// ipos is ibegin plus the row count rounded down to an even number, so the
// loop below can handle two rows per iteration.
6407  const size_t inum( iend - ibegin );
6408  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6409 
6410  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6411  C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6412  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6413  }
// Single leftover row when the row count is odd.
6414  if( ipos < iend ) {
6415  C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
6416  }
6417  }
6418  }
6419  //**********************************************************************************************
6420 
6421  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default kernel for C += A * B * scalar when both operand types are diagonal
// (see EnableIf_ below). Only the diagonal elements C(i,i) are updated, each
// with A(i,i) * B(i,i) * scalar.
//
// C       target matrix that receives the sum
// A, B    diagonal left-hand and right-hand side operands
// scalar  factor applied to the product
6435  template< typename MT3 // Type of the left-hand side target matrix
6436  , typename MT4 // Type of the left-hand side matrix operand
6437  , typename MT5 // Type of the right-hand side matrix operand
6438  , typename ST2 > // Type of the scalar value
6439  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6440  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6441  {
6443 
// The loop bound is the row count of A.
6444  for( size_t i=0UL; i<A.rows(); ++i ) {
6445  C(i,i) += A(i,i) * B(i,i) * scalar;
6446  }
6447  }
6448  //**********************************************************************************************
6449 
6450  //**Default addition assignment to dense matrices (small matrices)******************************
// Small-matrix kernel for C += A * B * scalar that simply forwards all
// arguments to selectDefaultAddAssignKernel().
// NOTE(review): the listing jumps from line 6467 to 6469, so the line carrying
// the return type / enabling condition of this overload is not visible here.
6464  template< typename MT3 // Type of the left-hand side target matrix
6465  , typename MT4 // Type of the left-hand side matrix operand
6466  , typename MT5 // Type of the right-hand side matrix operand
6467  , typename ST2 > // Type of the scalar value
6469  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6470  {
6471  selectDefaultAddAssignKernel( C, A, B, scalar );
6472  }
6473  //**********************************************************************************************
6474 
6475  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Small-matrix kernel for C += A * B * scalar with a target of type
// DenseMatrix<MT3,false>. Every visible branch copies one operand into its
// OppositeType_ (evaluated through serial()), rebuilds the product with that
// copy, passes it through the ForwardFunctor, scales it and hands the result
// to addAssign() on ~C.
// NOTE(review): listing lines 6494, 6497-6500, 6504 and 6508 are absent from
// this listing. The return type and the conditions that open the first two
// branches are therefore not visible; confirm them against the full header.
6490  template< typename MT3 // Type of the left-hand side target matrix
6491  , typename MT4 // Type of the left-hand side matrix operand
6492  , typename MT5 // Type of the right-hand side matrix operand
6493  , typename ST2 > // Type of the scalar value
6495  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6496  {
6501 
6502  const ForwardFunctor fwd;
6503 
// First branch (condition not visible): B is the operand that gets converted.
6505  const OppositeType_<MT5> tmp( serial( B ) );
6506  addAssign( ~C, fwd( A * tmp ) * scalar );
6507  }
// Second branch (condition not visible): A is the operand that gets converted.
6509  const OppositeType_<MT4> tmp( serial( A ) );
6510  addAssign( ~C, fwd( tmp * B ) * scalar );
6511  }
// Otherwise the operand with fewer elements (rows * columns) is converted;
// on a tie B is chosen.
6512  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6513  const OppositeType_<MT5> tmp( serial( B ) );
6514  addAssign( ~C, fwd( A * tmp ) * scalar );
6515  }
6516  else {
6517  const OppositeType_<MT4> tmp( serial( A ) );
6518  addAssign( ~C, fwd( tmp * B ) * scalar );
6519  }
6520  }
6521  //**********************************************************************************************
6522 
6523  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Vectorized small-matrix kernel for C += A * B * scalar with a target of type
// DenseMatrix<MT3,true>.
//
// The rows of C are processed in blocks of 8, 5, 4, 3, 2 and 1 SIMD vectors
// (SIMDSIZE elements each), followed by a scalar loop for the rows that do not
// fill a whole SIMD vector. Within a row block several columns of C are
// accumulated at once. For every block the k-range [kbegin,kend) of the inner
// product is narrowed according to the lower/upper traits of the operand types.
//
// NOTE(review): listing line 6542 (return type / enabling condition) is absent
// from this listing. LOW, UPP, SIMDSIZE, SIMDType and ElementType are declared
// outside this chunk; LOW/UPP presumably flag a lower/upper result - confirm.
// NOTE(review): the xmm accumulators below are declared without an explicit
// initializer; this presumably relies on SIMDType default-initializing to
// zero - confirm against the SIMD type definitions.
6538  template< typename MT3 // Type of the left-hand side target matrix
6539  , typename MT4 // Type of the left-hand side matrix operand
6540  , typename MT5 // Type of the right-hand side matrix operand
6541  , typename ST2 > // Type of the scalar value
6543  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6544  {
// A scalar remainder loop is required unless both C and A are padded types.
6545  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
6546 
6547  const size_t M( A.rows() );
6548  const size_t N( B.columns() );
6549  const size_t K( A.columns() );
6550 
6551  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6552 
// ipos: end of the SIMD-processed rows. With a remainder it is M rounded down
// to a multiple of SIMDSIZE (checked by the assertion), otherwise M itself.
6553  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
6554  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
6555 
// The scalar is expanded into a SIMD value once and reused for every store.
6556  const SIMDType factor( set( scalar ) );
6557 
6558  size_t i( 0UL );
6559 
// NOTE(review): listing line 6560, which introduces the braced scope below, is
// absent from this listing, so the condition guarding this scope is not visible.
6561  {
// Row blocks of 8 SIMD vectors, one column of C at a time (skipped for LOW/UPP).
6562  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6563  for( size_t j=0UL; j<N; ++j )
6564  {
6565  const size_t kbegin( ( IsLower<MT5>::value )
6566  ?( ( IsUpper<MT4>::value )
6567  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6568  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6569  :( IsUpper<MT4>::value ? i : 0UL ) );
6570  const size_t kend( ( IsUpper<MT5>::value )
6571  ?( ( IsLower<MT4>::value )
6572  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
6573  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
6574  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
6575 
6576  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6577 
6578  for( size_t k=kbegin; k<kend; ++k ) {
6579  const SIMDType b1( set( B(k,j) ) );
6580  xmm1 += A.load(i ,k) * b1;
6581  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6582  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6583  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6584  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6585  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
6586  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
6587  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
6588  }
6589 
// Add the scaled accumulators to the current contents of C.
6590  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6591  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6592  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6593  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6594  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
6595  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
6596  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
6597  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
6598  }
6599  }
6600  }
6601 
// Row blocks of 5 SIMD vectors, two columns of C at a time (skipped for LOW/UPP).
6602  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
6603  {
6604  size_t j( 0UL );
6605 
6606  for( ; (j+2UL) <= N; j+=2UL )
6607  {
6608  const size_t kbegin( ( IsLower<MT5>::value )
6609  ?( ( IsUpper<MT4>::value )
6610  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6611  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6612  :( IsUpper<MT4>::value ? i : 0UL ) );
6613  const size_t kend( ( IsUpper<MT5>::value )
6614  ?( ( IsLower<MT4>::value )
6615  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6616  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6617  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
6618 
6619  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6620 
6621  for( size_t k=kbegin; k<kend; ++k ) {
6622  const SIMDType a1( A.load(i ,k) );
6623  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6624  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6625  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6626  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6627  const SIMDType b1( set( B(k,j ) ) );
6628  const SIMDType b2( set( B(k,j+1UL) ) );
6629  xmm1 += a1 * b1;
6630  xmm2 += a2 * b1;
6631  xmm3 += a3 * b1;
6632  xmm4 += a4 * b1;
6633  xmm5 += a5 * b1;
6634  xmm6 += a1 * b2;
6635  xmm7 += a2 * b2;
6636  xmm8 += a3 * b2;
6637  xmm9 += a4 * b2;
6638  xmm10 += a5 * b2;
6639  }
6640 
6641  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6642  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6643  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6644  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6645  (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
6646  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
6647  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
6648  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
6649  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
6650  (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
6651  }
6652 
// Leftover single column when N is odd.
6653  if( j < N )
6654  {
6655  const size_t kbegin( ( IsLower<MT5>::value )
6656  ?( ( IsUpper<MT4>::value )
6657  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6658  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6659  :( IsUpper<MT4>::value ? i : 0UL ) );
6660  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
6661 
6662  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6663 
6664  for( size_t k=kbegin; k<kend; ++k ) {
6665  const SIMDType b1( set( B(k,j) ) );
6666  xmm1 += A.load(i ,k) * b1;
6667  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6668  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6669  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6670  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6671  }
6672 
6673  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6674  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6675  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6676  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6677  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
6678  }
6679  }
6680 
// Row blocks of 4 SIMD vectors, two columns of C at a time (skipped for LOW/UPP).
6681  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6682  {
6683  size_t j( 0UL );
6684 
6685  for( ; (j+2UL) <= N; j+=2UL )
6686  {
6687  const size_t kbegin( ( IsLower<MT5>::value )
6688  ?( ( IsUpper<MT4>::value )
6689  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6690  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6691  :( IsUpper<MT4>::value ? i : 0UL ) );
6692  const size_t kend( ( IsUpper<MT5>::value )
6693  ?( ( IsLower<MT4>::value )
6694  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6695  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6696  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
6697 
6698  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6699 
6700  for( size_t k=kbegin; k<kend; ++k ) {
6701  const SIMDType a1( A.load(i ,k) );
6702  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6703  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6704  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6705  const SIMDType b1( set( B(k,j ) ) );
6706  const SIMDType b2( set( B(k,j+1UL) ) );
6707  xmm1 += a1 * b1;
6708  xmm2 += a2 * b1;
6709  xmm3 += a3 * b1;
6710  xmm4 += a4 * b1;
6711  xmm5 += a1 * b2;
6712  xmm6 += a2 * b2;
6713  xmm7 += a3 * b2;
6714  xmm8 += a4 * b2;
6715  }
6716 
6717  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6718  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6719  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6720  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6721  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
6722  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
6723  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
6724  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
6725  }
6726 
// Leftover single column when N is odd.
6727  if( j < N )
6728  {
6729  const size_t kbegin( ( IsLower<MT5>::value )
6730  ?( ( IsUpper<MT4>::value )
6731  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6732  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6733  :( IsUpper<MT4>::value ? i : 0UL ) );
6734  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
6735 
6736  SIMDType xmm1, xmm2, xmm3, xmm4;
6737 
6738  for( size_t k=kbegin; k<kend; ++k ) {
6739  const SIMDType b1( set( B(k,j) ) );
6740  xmm1 += A.load(i ,k) * b1;
6741  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6742  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6743  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6744  }
6745 
6746  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6747  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6748  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6749  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6750  }
6751  }
6752 
// Row blocks of 3 SIMD vectors, two columns of C at a time (skipped for LOW/UPP).
6753  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
6754  {
6755  size_t j( 0UL );
6756 
6757  for( ; (j+2UL) <= N; j+=2UL )
6758  {
6759  const size_t kbegin( ( IsLower<MT5>::value )
6760  ?( ( IsUpper<MT4>::value )
6761  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6762  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6763  :( IsUpper<MT4>::value ? i : 0UL ) );
6764  const size_t kend( ( IsUpper<MT5>::value )
6765  ?( ( IsLower<MT4>::value )
6766  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6767  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6768  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
6769 
6770  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6771 
6772  for( size_t k=kbegin; k<kend; ++k ) {
6773  const SIMDType a1( A.load(i ,k) );
6774  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6775  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6776  const SIMDType b1( set( B(k,j ) ) );
6777  const SIMDType b2( set( B(k,j+1UL) ) );
6778  xmm1 += a1 * b1;
6779  xmm2 += a2 * b1;
6780  xmm3 += a3 * b1;
6781  xmm4 += a1 * b2;
6782  xmm5 += a2 * b2;
6783  xmm6 += a3 * b2;
6784  }
6785 
6786  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6787  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6788  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6789  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
6790  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
6791  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
6792  }
6793 
// Leftover single column when N is odd.
6794  if( j < N )
6795  {
6796  const size_t kbegin( ( IsLower<MT5>::value )
6797  ?( ( IsUpper<MT4>::value )
6798  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6799  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6800  :( IsUpper<MT4>::value ? i : 0UL ) );
6801  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
6802 
6803  SIMDType xmm1, xmm2, xmm3;
6804 
6805  for( size_t k=kbegin; k<kend; ++k ) {
6806  const SIMDType b1( set( B(k,j) ) );
6807  xmm1 += A.load(i ,k) * b1;
6808  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6809  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6810  }
6811 
6812  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6813  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6814  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6815  }
6816  }
6817 
// Row blocks of 2 SIMD vectors with 4, 3, 2 and finally 1 column of C at a
// time. This loop is only skipped when LOW and UPP are both set; for LOW the
// column range ends at min(i+SIMDSIZE*2,N), for UPP it starts at column i.
6818  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6819  {
6820  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
6821  size_t j( UPP ? i : 0UL );
6822 
6823  for( ; (j+4UL) <= jend; j+=4UL )
6824  {
6825  const size_t kbegin( ( IsLower<MT5>::value )
6826  ?( ( IsUpper<MT4>::value )
6827  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6828  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6829  :( IsUpper<MT4>::value ? i : 0UL ) );
6830  const size_t kend( ( IsUpper<MT5>::value )
6831  ?( ( IsLower<MT4>::value )
6832  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
6833  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
6834  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6835 
6836  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6837 
6838  for( size_t k=kbegin; k<kend; ++k ) {
6839  const SIMDType a1( A.load(i ,k) );
6840  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6841  const SIMDType b1( set( B(k,j ) ) );
6842  const SIMDType b2( set( B(k,j+1UL) ) );
6843  const SIMDType b3( set( B(k,j+2UL) ) );
6844  const SIMDType b4( set( B(k,j+3UL) ) );
6845  xmm1 += a1 * b1;
6846  xmm2 += a2 * b1;
6847  xmm3 += a1 * b2;
6848  xmm4 += a2 * b2;
6849  xmm5 += a1 * b3;
6850  xmm6 += a2 * b3;
6851  xmm7 += a1 * b4;
6852  xmm8 += a2 * b4;
6853  }
6854 
6855  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6856  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
6857  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6858  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
6859  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6860  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
6861  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
6862  (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
6863  }
6864 
// Three columns at a time; xmm7 and xmm8 are declared but not used here.
6865  for( ; (j+3UL) <= jend; j+=3UL )
6866  {
6867  const size_t kbegin( ( IsLower<MT5>::value )
6868  ?( ( IsUpper<MT4>::value )
6869  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6870  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6871  :( IsUpper<MT4>::value ? i : 0UL ) );
6872  const size_t kend( ( IsUpper<MT5>::value )
6873  ?( ( IsLower<MT4>::value )
6874  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
6875  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
6876  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6877 
6878  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6879 
6880  for( size_t k=kbegin; k<kend; ++k ) {
6881  const SIMDType a1( A.load(i ,k) );
6882  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6883  const SIMDType b1( set( B(k,j ) ) );
6884  const SIMDType b2( set( B(k,j+1UL) ) );
6885  const SIMDType b3( set( B(k,j+2UL) ) );
6886  xmm1 += a1 * b1;
6887  xmm2 += a2 * b1;
6888  xmm3 += a1 * b2;
6889  xmm4 += a2 * b2;
6890  xmm5 += a1 * b3;
6891  xmm6 += a2 * b3;
6892  }
6893 
6894  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6895  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
6896  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6897  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
6898  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6899  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
6900  }
6901 
// Two columns at a time. The k loop is unrolled by two: even steps accumulate
// into xmm1-xmm4, odd steps into xmm5-xmm8, and the pairs are summed on store.
6902  for( ; (j+2UL) <= jend; j+=2UL )
6903  {
6904  const size_t kbegin( ( IsLower<MT5>::value )
6905  ?( ( IsUpper<MT4>::value )
6906  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6907  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6908  :( IsUpper<MT4>::value ? i : 0UL ) );
6909  const size_t kend( ( IsUpper<MT5>::value )
6910  ?( ( IsLower<MT4>::value )
6911  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6912  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6913  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6914 
6915  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6916  size_t k( kbegin );
6917 
6918  for( ; (k+2UL) <= kend; k+=2UL ) {
6919  const SIMDType a1( A.load(i ,k ) );
6920  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6921  const SIMDType a3( A.load(i ,k+1UL) );
6922  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6923  const SIMDType b1( set( B(k ,j ) ) );
6924  const SIMDType b2( set( B(k ,j+1UL) ) );
6925  const SIMDType b3( set( B(k+1UL,j ) ) );
6926  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
6927  xmm1 += a1 * b1;
6928  xmm2 += a2 * b1;
6929  xmm3 += a1 * b2;
6930  xmm4 += a2 * b2;
6931  xmm5 += a3 * b3;
6932  xmm6 += a4 * b3;
6933  xmm7 += a3 * b4;
6934  xmm8 += a4 * b4;
6935  }
6936 
// Leftover k step when the k-range has odd length.
6937  for( ; k<kend; ++k ) {
6938  const SIMDType a1( A.load(i ,k) );
6939  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6940  const SIMDType b1( set( B(k,j ) ) );
6941  const SIMDType b2( set( B(k,j+1UL) ) );
6942  xmm1 += a1 * b1;
6943  xmm2 += a2 * b1;
6944  xmm3 += a1 * b2;
6945  xmm4 += a2 * b2;
6946  }
6947 
6948  (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
6949  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
6950  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + (xmm3+xmm7) * factor );
6951  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
6952  }
6953 
// Leftover single column, again with a k loop unrolled by two.
6954  if( j < jend )
6955  {
6956  const size_t kbegin( ( IsLower<MT5>::value )
6957  ?( ( IsUpper<MT4>::value )
6958  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6959  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6960  :( IsUpper<MT4>::value ? i : 0UL ) );
6961  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
6962 
6963  SIMDType xmm1, xmm2, xmm3, xmm4;
6964  size_t k( kbegin );
6965 
6966  for( ; (k+2UL) <= kend; k+=2UL ) {
6967  const SIMDType b1( set( B(k ,j) ) );
6968  const SIMDType b2( set( B(k+1UL,j) ) );
6969  xmm1 += A.load(i ,k ) * b1;
6970  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
6971  xmm3 += A.load(i ,k+1UL) * b2;
6972  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
6973  }
6974 
6975  for( ; k<kend; ++k ) {
6976  const SIMDType b1( set( B(k,j) ) );
6977  xmm1 += A.load(i ,k) * b1;
6978  xmm2 += A.load(i+SIMDSIZE,k) * b1;
6979  }
6980 
6981  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
6982  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
6983  }
6984  }
6985 
// Single SIMD vectors of rows with 4, 3, 2 and finally 1 column of C at a
// time. The column range is limited to min(i+SIMDSIZE,N) only when LOW and UPP
// are both set; for UPP it starts at column i. All k loops are unrolled by two.
6986  for( ; i<ipos; i+=SIMDSIZE )
6987  {
6988  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
6989  size_t j( UPP ? i : 0UL );
6990 
6991  for( ; (j+4UL) <= jend; j+=4UL )
6992  {
6993  const size_t kbegin( ( IsLower<MT5>::value )
6994  ?( ( IsUpper<MT4>::value )
6995  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6996  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6997  :( IsUpper<MT4>::value ? i : 0UL ) );
6998  const size_t kend( ( IsUpper<MT5>::value )
6999  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
7000  :( K ) );
7001 
7002  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7003  size_t k( kbegin );
7004 
7005  for( ; (k+2UL) <= kend; k+=2UL ) {
7006  const SIMDType a1( A.load(i,k ) );
7007  const SIMDType a2( A.load(i,k+1UL) );
7008  xmm1 += a1 * set( B(k ,j ) );
7009  xmm2 += a1 * set( B(k ,j+1UL) );
7010  xmm3 += a1 * set( B(k ,j+2UL) );
7011  xmm4 += a1 * set( B(k ,j+3UL) );
7012  xmm5 += a2 * set( B(k+1UL,j ) );
7013  xmm6 += a2 * set( B(k+1UL,j+1UL) );
7014  xmm7 += a2 * set( B(k+1UL,j+2UL) );
7015  xmm8 += a2 * set( B(k+1UL,j+3UL) );
7016  }
7017 
7018  for( ; k<kend; ++k ) {
7019  const SIMDType a1( A.load(i,k) );
7020  xmm1 += a1 * set( B(k,j ) );
7021  xmm2 += a1 * set( B(k,j+1UL) );
7022  xmm3 += a1 * set( B(k,j+2UL) );
7023  xmm4 += a1 * set( B(k,j+3UL) );
7024  }
7025 
7026  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm5) * factor );
7027  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm6) * factor );
7028  (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm7) * factor );
7029  (~C).store( i, j+3UL, (~C).load(i,j+3UL) + (xmm4+xmm8) * factor );
7030  }
7031 
7032  for( ; (j+3UL) <= jend; j+=3UL )
7033  {
7034  const size_t kbegin( ( IsLower<MT5>::value )
7035  ?( ( IsUpper<MT4>::value )
7036  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7037  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7038  :( IsUpper<MT4>::value ? i : 0UL ) );
7039  const size_t kend( ( IsUpper<MT5>::value )
7040  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
7041  :( K ) );
7042 
7043  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7044  size_t k( kbegin );
7045 
7046  for( ; (k+2UL) <= kend; k+=2UL ) {
7047  const SIMDType a1( A.load(i,k ) );
7048  const SIMDType a2( A.load(i,k+1UL) );
7049  xmm1 += a1 * set( B(k ,j ) );
7050  xmm2 += a1 * set( B(k ,j+1UL) );
7051  xmm3 += a1 * set( B(k ,j+2UL) );
7052  xmm4 += a2 * set( B(k+1UL,j ) );
7053  xmm5 += a2 * set( B(k+1UL,j+1UL) );
7054  xmm6 += a2 * set( B(k+1UL,j+2UL) );
7055  }
7056 
7057  for( ; k<kend; ++k ) {
7058  const SIMDType a1( A.load(i,k) );
7059  xmm1 += a1 * set( B(k,j ) );
7060  xmm2 += a1 * set( B(k,j+1UL) );
7061  xmm3 += a1 * set( B(k,j+2UL) );
7062  }
7063 
7064  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm4) * factor );
7065  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm5) * factor );
7066  (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm6) * factor );
7067  }
7068 
7069  for( ; (j+2UL) <= jend; j+=2UL )
7070  {
7071  const size_t kbegin( ( IsLower<MT5>::value )
7072  ?( ( IsUpper<MT4>::value )
7073  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7074  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7075  :( IsUpper<MT4>::value ? i : 0UL ) );
7076  const size_t kend( ( IsUpper<MT5>::value )
7077  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7078  :( K ) );
7079 
7080  SIMDType xmm1, xmm2, xmm3, xmm4;
7081  size_t k( kbegin );
7082 
7083  for( ; (k+2UL) <= kend; k+=2UL ) {
7084  const SIMDType a1( A.load(i,k ) );
7085  const SIMDType a2( A.load(i,k+1UL) );
7086  xmm1 += a1 * set( B(k ,j ) );
7087  xmm2 += a1 * set( B(k ,j+1UL) );
7088  xmm3 += a2 * set( B(k+1UL,j ) );
7089  xmm4 += a2 * set( B(k+1UL,j+1UL) );
7090  }
7091 
7092  for( ; k<kend; ++k ) {
7093  const SIMDType a1( A.load(i,k) );
7094  xmm1 += a1 * set( B(k,j ) );
7095  xmm2 += a1 * set( B(k,j+1UL) );
7096  }
7097 
7098  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
7099  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm4) * factor );
7100  }
7101 
// Leftover single column. No narrowed kend is computed here: the k loops run
// from kbegin up to K.
7102  if( j < jend )
7103  {
7104  const size_t kbegin( ( IsLower<MT5>::value )
7105  ?( ( IsUpper<MT4>::value )
7106  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7107  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7108  :( IsUpper<MT4>::value ? i : 0UL ) );
7109 
7110  SIMDType xmm1, xmm2;
7111  size_t k( kbegin );
7112 
7113  for( ; (k+2UL) <= K; k+=2UL ) {
7114  xmm1 += A.load(i,k ) * set( B(k ,j) );
7115  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
7116  }
7117 
7118  for( ; k<K; ++k ) {
7119  xmm1 += A.load(i,k) * set( B(k,j) );
7120  }
7121 
7122  (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
7123  }
7124  }
7125 
// Scalar loop over the rows from ipos to M that do not fill a whole SIMD
// vector; it only runs when 'remainder' is true. Two columns are handled per
// iteration, followed by a possible leftover column.
7126  for( ; remainder && i<M; ++i )
7127  {
7128  const size_t jend( LOW ? i+1UL : N );
7129  size_t j( UPP ? i : 0UL );
7130 
7131  for( ; (j+2UL) <= jend; j+=2UL )
7132  {
7133  const size_t kbegin( ( IsLower<MT5>::value )
7134  ?( ( IsUpper<MT4>::value )
7135  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7136  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7137  :( IsUpper<MT4>::value ? i : 0UL ) );
7138  const size_t kend( ( IsUpper<MT5>::value )
7139  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7140  :( K ) );
7141 
7142  ElementType value1 = ElementType();
7143  ElementType value2 = ElementType();
7144 
7145  for( size_t k=kbegin; k<kend; ++k ) {
7146  value1 += A(i,k) * B(k,j );
7147  value2 += A(i,k) * B(k,j+1UL);
7148  }
7149 
7150  (~C)(i,j ) += value1 * scalar;
7151  (~C)(i,j+1UL) += value2 * scalar;
7152  }
7153 
// Leftover single column; the k loop runs from kbegin up to K.
7154  if( j < jend )
7155  {
7156  const size_t kbegin( ( IsLower<MT5>::value )
7157  ?( ( IsUpper<MT4>::value )
7158  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7159  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7160  :( IsUpper<MT4>::value ? i : 0UL ) );
7161 
7162  ElementType value = ElementType();
7163 
7164  for( size_t k=kbegin; k<K; ++k ) {
7165  value += A(i,k) * B(k,j);
7166  }
7167 
7168  (~C)(i,j) += value * scalar;
7169  }
7170  }
7171  }
7172  //**********************************************************************************************
7173 
7174  //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix kernel for C += A * B * scalar that simply forwards all
// arguments to selectDefaultAddAssignKernel().
// NOTE(review): the listing jumps from line 7191 to 7193, so the line carrying
// the return type / enabling condition of this overload is not visible here.
7188  template< typename MT3 // Type of the left-hand side target matrix
7189  , typename MT4 // Type of the left-hand side matrix operand
7190  , typename MT5 // Type of the right-hand side matrix operand
7191  , typename ST2 > // Type of the scalar value
7193  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7194  {
7195  selectDefaultAddAssignKernel( C, A, B, scalar );
7196  }
7197  //**********************************************************************************************
7198 
7199  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Vectorized large-matrix kernel for C += A * B * scalar. It delegates to one
// of lmmm(), ummm() or mmm(), chosen by the LOW and UPP flags (LOW takes
// precedence over UPP).
// NOTE(review): the listing jumps from line 7217 to 7219, so the line carrying
// the return type / enabling condition of this overload is not visible here.
// NOTE(review): the two trailing arguments (scalar, ST2(1)) presumably scale
// the product and the existing contents of C respectively - confirm against
// the declarations in blaze/math/dense/MMM.h.
7214  template< typename MT3 // Type of the left-hand side target matrix
7215  , typename MT4 // Type of the left-hand side matrix operand
7216  , typename MT5 // Type of the right-hand side matrix operand
7217  , typename ST2 > // Type of the scalar value
7219  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7220  {
7221  if( LOW )
7222  lmmm( C, A, B, scalar, ST2(1) );
7223  else if( UPP )
7224  ummm( C, A, B, scalar, ST2(1) );
7225  else
7226  mmm( C, A, B, scalar, ST2(1) );
7227  }
7228  //**********************************************************************************************
7229 
7230  //**BLAS-based addition assignment to dense matrices (default)**********************************
// BLAS kernel selection for C += A * B * scalar that simply forwards all
// arguments to selectLargeAddAssignKernel().
// NOTE(review): the listing jumps from line 7248 to 7250, so the line carrying
// the return type / enabling condition of this overload is not visible here.
7245  template< typename MT3 // Type of the left-hand side target matrix
7246  , typename MT4 // Type of the left-hand side matrix operand
7247  , typename MT5 // Type of the right-hand side matrix operand
7248  , typename ST2 > // Type of the scalar value
7250  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7251  {
7252  selectLargeAddAssignKernel( C, A, B, scalar );
7253  }
7254  //**********************************************************************************************
7255 
7256  //**BLAS-based addition assignment to dense matrices********************************************
7257 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7258 
// BLAS-based kernel for C += A * B * scalar. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set (see
// the surrounding #if). The scalar is converted to the element type of the
// target before it is passed to the BLAS wrappers.
// NOTE(review): the listing jumps from line 7274 to 7276, so the line carrying
// the return type / enabling condition of this overload is not visible here.
7271  template< typename MT3 // Type of the left-hand side target matrix
7272  , typename MT4 // Type of the left-hand side matrix operand
7273  , typename MT5 // Type of the right-hand side matrix operand
7274  , typename ST2 > // Type of the scalar value
7276  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7277  {
7278  using ET = ElementType_<MT3>;
7279 
// Triangular left operand: a copy of B is handed to trmm() together with A
// (left side) and then added to C.
// NOTE(review): trmm() presumably overwrites tmp in place with the scaled
// triangular product, as BLAS xTRMM does - confirm against the wrapper.
7280  if( IsTriangular<MT4>::value ) {
7281  ResultType_<MT3> tmp( serial( B ) );
7282  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7283  addAssign( C, tmp );
7284  }
// Triangular right operand: same scheme with a copy of A and B on the right side.
7285  else if( IsTriangular<MT5>::value ) {
7286  ResultType_<MT3> tmp( serial( A ) );
7287  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7288  addAssign( C, tmp );
7289  }
// General case: gemm() is called directly on C with the factors ET(scalar)
// and ET(1); presumably these are the BLAS alpha and beta, so that C is
// accumulated into rather than overwritten - confirm against the wrapper.
7290  else {
7291  gemm( C, A, B, ET(scalar), ET(1) );
7292  }
7293  }
7294 #endif
7295  //**********************************************************************************************
7296 
7297  //**Restructuring addition assignment to row-major matrices*************************************
// Restructuring addition assignment of a scaled matrix product to a target of
// type Matrix<MT,false>. Instead of evaluating the product as given, the
// visible branches transpose one or both operands via trans(), rebuild the
// product, pass it through the ForwardFunctor, scale it by rhs.scalar_ and
// call addAssign() again on ~lhs.
// NOTE(review): listing lines 7313, 7316, 7318 and 7328 are absent from this
// listing. The return type / enabling condition and the condition that opens
// the first branch (the one transposing both operands) are therefore not
// visible; confirm them against the full header.
7312  template< typename MT > // Type of the target matrix
7314  addAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7315  {
7317 
7319 
7320  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7321  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7322 
7323  const ForwardFunctor fwd;
7324 
// The operands are taken from the matrix product wrapped by the scalar expression.
7325  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7326  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7327 
// First branch (condition not visible): both operands are transposed.
7329  addAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// If MT1 is symmetric only the left operand is transposed, otherwise only the
// right operand.
7330  else if( IsSymmetric<MT1>::value )
7331  addAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
7332  else
7333  addAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
7334  }
7335  //**********************************************************************************************
7336 
7337  //**Addition assignment to sparse matrices******************************************************
7338  // No special implementation for the addition assignment to sparse matrices.
7339  //**********************************************************************************************
7340 
7341  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a scaled matrix product to a dense matrix of either
// storage order. The function checks the dimensions, returns early for empty
// problems, evaluates both operands serially into objects of type LT and RT,
// and then dispatches to selectSubAssignKernel().
// NOTE(review): listing lines 7355 (declaration specifiers / return type) and
// 7358 are absent from this excerpt.
7353  template< typename MT // Type of the target dense matrix
7354  , bool SO > // Storage order of the target dense matrix
7356  subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7357  {
7359 
// The target and the expression must agree in both dimensions.
7360  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7361  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7362 
// Operands of the wrapped matrix/matrix multiplication.
7363  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7364  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7365 
// Nothing to subtract if the target has no rows or columns, or if the inner
// dimension of the product is zero.
7366  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7367  return;
7368  }
7369 
7370  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7371  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7372 
// Sanity checks: the evaluated operands keep the operand dimensions and fit
// the target.
7373  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7374  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7375  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7376  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7377  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7378  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7379 
// Hand over to the kernel selection together with the scalar factor.
7380  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
7381  }
7382  //**********************************************************************************************
7383 
7384  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Kernel selection for the scaled subtraction assignment.
// The small-matrix kernel is chosen if any of the following holds:
//   - the left operand type is diagonal,
//   - the build is not in debug mode and A has at most SIMDSIZE*10 rows,
//   - the number of elements of C is below TDMATTDMATMULT_THRESHOLD.
// In all other cases the BLAS-based kernel is called.
7395  template< typename MT3 // Type of the left-hand side target matrix
7396  , typename MT4 // Type of the left-hand side matrix operand
7397  , typename MT5 // Type of the right-hand side matrix operand
7398  , typename ST2 > // Type of the scalar value
7399  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7400  {
7401  if( ( IsDiagonal<MT4>::value ) ||
7402  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
7403  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
7404  selectSmallSubAssignKernel( C, A, B, scalar );
7405  else
7406  selectBlasSubAssignKernel( C, A, B, scalar );
7407  }
7408  //**********************************************************************************************
7409 
7410  //**Default subtraction assignment to dense matrices (general/general)**************************
// Default scaled subtraction assignment for the case that neither operand type
// is diagonal (enforced by the EnableIf_ condition in the return type).
// The scaled product is first evaluated serially into a temporary of type
// ResultType, which is then subtracted from C in a second step.
7424  template< typename MT3 // Type of the left-hand side target matrix
7425  , typename MT4 // Type of the left-hand side matrix operand
7426  , typename MT5 // Type of the right-hand side matrix operand
7427  , typename ST2 > // Type of the scalar value
7428  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
7429  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7430  {
// Materialize A * B * scalar, then subtract the temporary from the target.
7431  const ResultType tmp( serial( A * B * scalar ) );
7432  subAssign( C, tmp );
7433  }
7434  //**********************************************************************************************
7435 
7436  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
// Default scaled subtraction assignment for a non-diagonal left operand and a
// diagonal right operand (enforced by the EnableIf_ condition).
// Since only B(j,j) is read from B, column j of C is updated as
//    C(i,j) -= A(i,j) * B(j,j) * scalar
// The row range of each column is narrowed according to the lower/upper
// (strictly) triangular traits of the left operand type, and the row loop is
// unrolled by two.
// NOTE(review): listing line 7457 is absent from this excerpt.
7450  template< typename MT3 // Type of the left-hand side target matrix
7451  , typename MT4 // Type of the left-hand side matrix operand
7452  , typename MT5 // Type of the right-hand side matrix operand
7453  , typename ST2 > // Type of the scalar value
7454  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7455  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7456  {
7458 
7459  const size_t M( A.rows() );
7460  const size_t N( B.columns() );
7461 
7462  for( size_t j=0UL; j<N; ++j )
7463  {
// First row to process in column j: the diagonal (or the row below it for a
// strictly lower type) if A is lower, otherwise row 0.
7464  const size_t ibegin( ( IsLower<MT4>::value )
7465  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
7466  :( 0UL ) );
// One past the last row to process: just past the diagonal (or the diagonal
// itself for a strictly upper type) if A is upper, otherwise M.
7467  const size_t iend( ( IsUpper<MT4>::value )
7468  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
7469  :( M ) );
7470  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7471 
// ipos = ibegin plus the row count rounded down to an even number, i.e. the end
// of the part that can be processed two rows at a time.
7472  const size_t inum( iend - ibegin );
7473  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7474 
7475  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7476  C(i ,j) -= A(i ,j) * B(j,j) * scalar;
7477  C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
7478  }
// Single left-over row when the row count is odd.
7479  if( ipos < iend ) {
7480  C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
7481  }
7482  }
7483  }
7484  //**********************************************************************************************
7485 
7486  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
// Default scaled subtraction assignment for a diagonal left operand and a
// non-diagonal right operand (enforced by the EnableIf_ condition).
// Since only A(i,i) is read from A, each element is updated as
//    C(i,j) -= A(i,i) * B(i,j) * scalar
// The row range of each column is narrowed according to the lower/upper
// (strictly) triangular traits of the right operand type, and the row loop is
// unrolled by two.
// NOTE(review): listing line 7507 is absent from this excerpt.
7500  template< typename MT3 // Type of the left-hand side target matrix
7501  , typename MT4 // Type of the left-hand side matrix operand
7502  , typename MT5 // Type of the right-hand side matrix operand
7503  , typename ST2 > // Type of the scalar value
7504  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
7505  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7506  {
7508 
7509  const size_t M( A.rows() );
7510  const size_t N( B.columns() );
7511 
7512  for( size_t j=0UL; j<N; ++j )
7513  {
// First row to process in column j: the diagonal (or the row below it for a
// strictly lower type) if B is lower, otherwise row 0.
7514  const size_t ibegin( ( IsLower<MT5>::value )
7515  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
7516  :( 0UL ) );
// One past the last row to process: just past the diagonal (or the diagonal
// itself for a strictly upper type) if B is upper, otherwise M.
7517  const size_t iend( ( IsUpper<MT5>::value )
7518  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
7519  :( M ) );
7520  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7521 
// ipos = ibegin plus the row count rounded down to an even number, i.e. the end
// of the part that can be processed two rows at a time.
7522  const size_t inum( iend - ibegin );
7523  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7524 
7525  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7526  C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7527  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
7528  }
// Single left-over row when the row count is odd.
7529  if( ipos < iend ) {
7530  C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
7531  }
7532  }
7533  }
7534  //**********************************************************************************************
7535 
7536  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default scaled subtraction assignment for two diagonal operands (enforced by
// the EnableIf_ condition). Only the diagonal of C is touched:
//    C(i,i) -= A(i,i) * B(i,i) * scalar   for i in [0, A.rows())
// NOTE(review): listing line 7557 is absent from this excerpt.
7550  template< typename MT3 // Type of the left-hand side target matrix
7551  , typename MT4 // Type of the left-hand side matrix operand
7552  , typename MT5 // Type of the right-hand side matrix operand
7553  , typename ST2 > // Type of the scalar value
7554  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
7555  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7556  {
7558 
7559  for( size_t i=0UL; i<A.rows(); ++i ) {
7560  C(i,i) -= A(i,i) * B(i,i) * scalar;
7561  }
7562  }
7563  //**********************************************************************************************
7564 
7565  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix overload of the scaled subtraction assignment that contains no
// specialised code: it forwards all arguments unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): listing line 7583 (declaration specifiers / return type) is
// absent from this excerpt.
7579  template< typename MT3 // Type of the left-hand side target matrix
7580  , typename MT4 // Type of the left-hand side matrix operand
7581  , typename MT5 // Type of the right-hand side matrix operand
7582  , typename ST2 > // Type of the scalar value
7584  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7585  {
// Delegate to the default kernel overload set.
7586  selectDefaultSubAssignKernel( C, A, B, scalar );
7587  }
7588  //**********************************************************************************************
7589 
7590  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Small-matrix overload of the scaled subtraction assignment for a row-major
// target. No arithmetic is done here: one of the two operands is copied
// (serially) into a matrix of the opposite storage order, and the resulting
// product expression - wrapped by the forward functor and multiplied by
// scalar - is passed to subAssign() again.
// NOTE(review): listing lines 7609 (declaration specifiers / return type) and
// 7612-7615 are absent from this excerpt.
7605  template< typename MT3 // Type of the left-hand side target matrix
7606  , typename MT4 // Type of the left-hand side matrix operand
7607  , typename MT5 // Type of the right-hand side matrix operand
7608  , typename ST2 > // Type of the scalar value
7610  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7611  {
7616 
7617  const ForwardFunctor fwd;
7618 
// NOTE(review): listing line 7619 is absent; presumably it held the opening
// "if( ... ) {" of the branch that converts B - confirm against the real header.
7620  const OppositeType_<MT5> tmp( serial( B ) );
7621  subAssign( ~C, fwd( A * tmp ) * scalar );
7622  }
// NOTE(review): listing line 7623 is absent; presumably it held the
// "else if( ... ) {" of the branch that converts A - confirm.
7624  const OppositeType_<MT4> tmp( serial( A ) );
7625  subAssign( ~C, fwd( tmp * B ) * scalar );
7626  }
// B has no more elements than A: convert B.
7627  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
7628  const OppositeType_<MT5> tmp( serial( B ) );
7629  subAssign( ~C, fwd( A * tmp ) * scalar );
7630  }
// Otherwise convert A.
7631  else {
7632  const OppositeType_<MT4> tmp( serial( A ) );
7633  subAssign( ~C, fwd( tmp * B ) * scalar );
7634  }
7635  }
7636  //**********************************************************************************************
7637 
7638  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Vectorized small-matrix kernel of the scaled subtraction assignment for a
// column-major target. Each processed element of C is reduced by the matching
// element of A*B multiplied by scalar (see the store() and "-=" statements).
//
// Rows of C are processed in panels of 8, 5, 4, 3, 2 and finally 1 SIMD
// vector(s) of SIMDSIZE rows each; rows at or beyond ipos are finished by a
// scalar loop at the end. Within a panel the columns are processed in groups.
//
// kbegin/kend narrow the k range of every dot product using the compile-time
// IsLower / IsUpper / IsStrictlyLower / IsStrictlyUpper traits of the operand
// types MT4 (A) and MT5 (B). LOW and UPP are flags defined outside this excerpt;
// they switch off the wide panels and restrict the column range.
//
// NOTE(review): the xmm accumulators are default-constructed and afterwards only
// updated with "+="; this relies on a default-constructed SIMDType being all
// zero - confirm.
// NOTE(review): listing line 7657 (declaration specifiers / return type) is
// absent from this excerpt.
7653  template< typename MT3 // Type of the left-hand side target matrix
7654  , typename MT4 // Type of the left-hand side matrix operand
7655  , typename MT5 // Type of the right-hand side matrix operand
7656  , typename ST2 > // Type of the scalar value
7658  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7659  {
// True unless both the target and the left operand are padded; it decides
// whether ipos is rounded down and whether the scalar tail loop runs.
7660  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
7661 
7662  const size_t M( A.rows() );
7663  const size_t N( B.columns() );
7664  const size_t K( A.columns() );
7665 
7666  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7667 
// End of the row range handled with SIMD vectors: M rounded down to a multiple
// of SIMDSIZE if a remainder is possible (see the assertion), otherwise M.
7668  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
7669  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
7670 
// SIMD operand created from scalar; every accumulated product is multiplied by
// it before being subtracted from C.
7671  const SIMDType factor( set( scalar ) );
7672 
// Current row index; shared by all panel loops below so that each loop
// continues where the previous one stopped.
7673  size_t i( 0UL );
7674 
// NOTE(review): listing line 7675 is absent; presumably a condition guarded the
// following braced block - confirm. As shown here it is a plain scope.
7676  {
// Panels of 8 SIMD vectors (8*SIMDSIZE rows), one column of C at a time.
// Skipped entirely when LOW or UPP is set.
7677  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
7678  for( size_t j=0UL; j<N; ++j )
7679  {
7680  const size_t kbegin( ( IsLower<MT5>::value )
7681  ?( ( IsUpper<MT4>::value )
7682  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7683  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7684  :( IsUpper<MT4>::value ? i : 0UL ) );
7685  const size_t kend( ( IsUpper<MT5>::value )
7686  ?( ( IsLower<MT4>::value )
7687  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
7688  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
7689  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
7690 
7691  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7692 
// Accumulate the eight row vectors of column j of the product.
7693  for( size_t k=kbegin; k<kend; ++k ) {
7694  const SIMDType b1( set( B(k,j) ) );
7695  xmm1 += A.load(i ,k) * b1;
7696  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7697  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7698  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7699  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7700  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
7701  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
7702  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
7703  }
7704 
// C -= accumulated product * factor.
7705  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7706  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7707  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7708  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7709  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
7710  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
7711  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
7712  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
7713  }
7714  }
7715  }
7716 
// Panels of 5 SIMD vectors; columns are handled in pairs, followed by a single
// left-over column. Skipped when LOW or UPP is set.
7717  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
7718  {
7719  size_t j( 0UL );
7720 
7721  for( ; (j+2UL) <= N; j+=2UL )
7722  {
7723  const size_t kbegin( ( IsLower<MT5>::value )
7724  ?( ( IsUpper<MT4>::value )
7725  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7726  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7727  :( IsUpper<MT4>::value ? i : 0UL ) );
7728  const size_t kend( ( IsUpper<MT5>::value )
7729  ?( ( IsLower<MT4>::value )
7730  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7731  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7732  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
7733 
// xmm1-xmm5 accumulate column j, xmm6-xmm10 accumulate column j+1.
7734  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7735 
7736  for( size_t k=kbegin; k<kend; ++k ) {
7737  const SIMDType a1( A.load(i ,k) );
7738  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7739  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7740  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7741  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
7742  const SIMDType b1( set( B(k,j ) ) );
7743  const SIMDType b2( set( B(k,j+1UL) ) );
7744  xmm1 += a1 * b1;
7745  xmm2 += a2 * b1;
7746  xmm3 += a3 * b1;
7747  xmm4 += a4 * b1;
7748  xmm5 += a5 * b1;
7749  xmm6 += a1 * b2;
7750  xmm7 += a2 * b2;
7751  xmm8 += a3 * b2;
7752  xmm9 += a4 * b2;
7753  xmm10 += a5 * b2;
7754  }
7755 
7756  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7757  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7758  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7759  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7760  (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
7761  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
7762  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
7763  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
7764  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
7765  (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
7766  }
7767 
// Left-over single column (N odd). Here kend depends only on the structure of
// the left operand.
7768  if( j < N )
7769  {
7770  const size_t kbegin( ( IsLower<MT5>::value )
7771  ?( ( IsUpper<MT4>::value )
7772  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7773  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7774  :( IsUpper<MT4>::value ? i : 0UL ) );
7775  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
7776 
7777  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7778 
7779  for( size_t k=kbegin; k<kend; ++k ) {
7780  const SIMDType b1( set( B(k,j) ) );
7781  xmm1 += A.load(i ,k) * b1;
7782  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7783  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7784  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7785  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7786  }
7787 
7788  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7789  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7790  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7791  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7792  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
7793  }
7794  }
7795 
// Panels of 4 SIMD vectors; columns in pairs, then a single left-over column.
// Skipped when LOW or UPP is set.
7796  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7797  {
7798  size_t j( 0UL );
7799 
7800  for( ; (j+2UL) <= N; j+=2UL )
7801  {
7802  const size_t kbegin( ( IsLower<MT5>::value )
7803  ?( ( IsUpper<MT4>::value )
7804  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7805  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7806  :( IsUpper<MT4>::value ? i : 0UL ) );
7807  const size_t kend( ( IsUpper<MT5>::value )
7808  ?( ( IsLower<MT4>::value )
7809  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7810  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7811  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
7812 
// xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1.
7813  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7814 
7815  for( size_t k=kbegin; k<kend; ++k ) {
7816  const SIMDType a1( A.load(i ,k) );
7817  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7818  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7819  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7820  const SIMDType b1( set( B(k,j ) ) );
7821  const SIMDType b2( set( B(k,j+1UL) ) );
7822  xmm1 += a1 * b1;
7823  xmm2 += a2 * b1;
7824  xmm3 += a3 * b1;
7825  xmm4 += a4 * b1;
7826  xmm5 += a1 * b2;
7827  xmm6 += a2 * b2;
7828  xmm7 += a3 * b2;
7829  xmm8 += a4 * b2;
7830  }
7831 
7832  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7833  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7834  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7835  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7836  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
7837  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
7838  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
7839  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
7840  }
7841 
// Left-over single column (N odd).
7842  if( j < N )
7843  {
7844  const size_t kbegin( ( IsLower<MT5>::value )
7845  ?( ( IsUpper<MT4>::value )
7846  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7847  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7848  :( IsUpper<MT4>::value ? i : 0UL ) );
7849  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
7850 
7851  SIMDType xmm1, xmm2, xmm3, xmm4;
7852 
7853  for( size_t k=kbegin; k<kend; ++k ) {
7854  const SIMDType b1( set( B(k,j) ) );
7855  xmm1 += A.load(i ,k) * b1;
7856  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7857  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7858  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7859  }
7860 
7861  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7862  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7863  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7864  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7865  }
7866  }
7867 
// Panels of 3 SIMD vectors; columns in pairs, then a single left-over column.
// Skipped when LOW or UPP is set.
7868  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
7869  {
7870  size_t j( 0UL );
7871 
7872  for( ; (j+2UL) <= N; j+=2UL )
7873  {
7874  const size_t kbegin( ( IsLower<MT5>::value )
7875  ?( ( IsUpper<MT4>::value )
7876  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7877  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7878  :( IsUpper<MT4>::value ? i : 0UL ) );
7879  const size_t kend( ( IsUpper<MT5>::value )
7880  ?( ( IsLower<MT4>::value )
7881  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7882  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7883  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
7884 
// xmm1-xmm3 accumulate column j, xmm4-xmm6 accumulate column j+1.
7885  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7886 
7887  for( size_t k=kbegin; k<kend; ++k ) {
7888  const SIMDType a1( A.load(i ,k) );
7889  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7890  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7891  const SIMDType b1( set( B(k,j ) ) );
7892  const SIMDType b2( set( B(k,j+1UL) ) );
7893  xmm1 += a1 * b1;
7894  xmm2 += a2 * b1;
7895  xmm3 += a3 * b1;
7896  xmm4 += a1 * b2;
7897  xmm5 += a2 * b2;
7898  xmm6 += a3 * b2;
7899  }
7900 
7901  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7902  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7903  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7904  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
7905  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
7906  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
7907  }
7908 
// Left-over single column (N odd).
7909  if( j < N )
7910  {
7911  const size_t kbegin( ( IsLower<MT5>::value )
7912  ?( ( IsUpper<MT4>::value )
7913  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7914  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7915  :( IsUpper<MT4>::value ? i : 0UL ) );
7916  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
7917 
7918  SIMDType xmm1, xmm2, xmm3;
7919 
7920  for( size_t k=kbegin; k<kend; ++k ) {
7921  const SIMDType b1( set( B(k,j) ) );
7922  xmm1 += A.load(i ,k) * b1;
7923  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7924  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7925  }
7926 
7927  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7928  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7929  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7930  }
7931  }
7932 
// Panels of 2 SIMD vectors; not used when both LOW and UPP are set.
// Columns run from (UPP ? i : 0) up to jend and are processed in groups of
// 4, 3, 2 and finally 1.
7933  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7934  {
7935  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
7936  size_t j( UPP ? i : 0UL );
7937 
// Four columns at a time.
7938  for( ; (j+4UL) <= jend; j+=4UL )
7939  {
7940  const size_t kbegin( ( IsLower<MT5>::value )
7941  ?( ( IsUpper<MT4>::value )
7942  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7943  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7944  :( IsUpper<MT4>::value ? i : 0UL ) );
7945  const size_t kend( ( IsUpper<MT5>::value )
7946  ?( ( IsLower<MT4>::value )
7947  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
7948  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
7949  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
7950 
// Two accumulators per column: (xmm1,xmm2) for j, (xmm3,xmm4) for j+1,
// (xmm5,xmm6) for j+2, (xmm7,xmm8) for j+3.
7951  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7952 
7953  for( size_t k=kbegin; k<kend; ++k ) {
7954  const SIMDType a1( A.load(i ,k) );
7955  const SIMDType a2( A.load(i+SIMDSIZE,k) );
7956  const SIMDType b1( set( B(k,j ) ) );
7957  const SIMDType b2( set( B(k,j+1UL) ) );
7958  const SIMDType b3( set( B(k,j+2UL) ) );
7959  const SIMDType b4( set( B(k,j+3UL) ) );
7960  xmm1 += a1 * b1;
7961  xmm2 += a2 * b1;
7962  xmm3 += a1 * b2;
7963  xmm4 += a2 * b2;
7964  xmm5 += a1 * b3;
7965  xmm6 += a2 * b3;
7966  xmm7 += a1 * b4;
7967  xmm8 += a2 * b4;
7968  }
7969 
7970  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7971  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
7972  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7973  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
7974  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
7975  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
7976  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
7977  (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
7978  }
7979 
// Three columns at a time.
7980  for( ; (j+3UL) <= jend; j+=3UL )
7981  {
7982  const size_t kbegin( ( IsLower<MT5>::value )
7983  ?( ( IsUpper<MT4>::value )
7984  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7985  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7986  :( IsUpper<MT4>::value ? i : 0UL ) );
7987  const size_t kend( ( IsUpper<MT5>::value )
7988  ?( ( IsLower<MT4>::value )
7989  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
7990  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
7991  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
7992 
7993  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7994 
7995  for( size_t k=kbegin; k<kend; ++k ) {
7996  const SIMDType a1( A.load(i ,k) );
7997  const SIMDType a2( A.load(i+SIMDSIZE,k) );
7998  const SIMDType b1( set( B(k,j ) ) );
7999  const SIMDType b2( set( B(k,j+1UL) ) );
8000  const SIMDType b3( set( B(k,j+2UL) ) );
8001  xmm1 += a1 * b1;
8002  xmm2 += a2 * b1;
8003  xmm3 += a1 * b2;
8004  xmm4 += a2 * b2;
8005  xmm5 += a1 * b3;
8006  xmm6 += a2 * b3;
8007  }
8008 
8009  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
8010  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
8011  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
8012  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
8013  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
8014  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
8015  }
8016 
// Two columns at a time. The k loop is additionally unrolled by two: xmm1-xmm4
// collect the even step, xmm5-xmm8 the odd step; matching pairs are summed when
// the result is stored.
8017  for( ; (j+2UL) <= jend; j+=2UL )
8018  {
8019  const size_t kbegin( ( IsLower<MT5>::value )
8020  ?( ( IsUpper<MT4>::value )
8021  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8022  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8023  :( IsUpper<MT4>::value ? i : 0UL ) );
8024  const size_t kend( ( IsUpper<MT5>::value )
8025  ?( ( IsLower<MT4>::value )
8026  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8027  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8028  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
8029 
8030  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8031  size_t k( kbegin );
8032 
8033  for( ; (k+2UL) <= kend; k+=2UL ) {
8034  const SIMDType a1( A.load(i ,k ) );
8035  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8036  const SIMDType a3( A.load(i ,k+1UL) );
8037  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8038  const SIMDType b1( set( B(k ,j ) ) );
8039  const SIMDType b2( set( B(k ,j+1UL) ) );
8040  const SIMDType b3( set( B(k+1UL,j ) ) );
8041  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
8042  xmm1 += a1 * b1;
8043  xmm2 += a2 * b1;
8044  xmm3 += a1 * b2;
8045  xmm4 += a2 * b2;
8046  xmm5 += a3 * b3;
8047  xmm6 += a4 * b3;
8048  xmm7 += a3 * b4;
8049  xmm8 += a4 * b4;
8050  }
8051 
// Left-over k step.
8052  for( ; k<kend; ++k ) {
8053  const SIMDType a1( A.load(i ,k) );
8054  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8055  const SIMDType b1( set( B(k,j ) ) );
8056  const SIMDType b2( set( B(k,j+1UL) ) );
8057  xmm1 += a1 * b1;
8058  xmm2 += a2 * b1;
8059  xmm3 += a1 * b2;
8060  xmm4 += a2 * b2;
8061  }
8062 
8063  (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
8064  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
8065  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - (xmm3+xmm7) * factor );
8066  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
8067  }
8068 
// Left-over single column; k unrolled by two (xmm1/xmm2 even step, xmm3/xmm4
// odd step). kend depends only on the structure of the left operand here.
8069  if( j < jend )
8070  {
8071  const size_t kbegin( ( IsLower<MT5>::value )
8072  ?( ( IsUpper<MT4>::value )
8073  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8074  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8075  :( IsUpper<MT4>::value ? i : 0UL ) );
8076  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
8077 
8078  SIMDType xmm1, xmm2, xmm3, xmm4;
8079  size_t k( kbegin );
8080 
8081  for( ; (k+2UL) <= kend; k+=2UL ) {
8082  const SIMDType b1( set( B(k ,j) ) );
8083  const SIMDType b2( set( B(k+1UL,j) ) );
8084  xmm1 += A.load(i ,k ) * b1;
8085  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8086  xmm3 += A.load(i ,k+1UL) * b2;
8087  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8088  }
8089 
8090  for( ; k<kend; ++k ) {
8091  const SIMDType b1( set( B(k,j) ) );
8092  xmm1 += A.load(i ,k) * b1;
8093  xmm2 += A.load(i+SIMDSIZE,k) * b1;
8094  }
8095 
8096  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
8097  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
8098  }
8099  }
8100 
// Panels of a single SIMD vector for all rows that are still left below ipos.
// Columns run from (UPP ? i : 0) up to jend (limited only when both LOW and UPP
// are set) in groups of 4, 3, 2 and 1; every k loop is unrolled by two with the
// odd-step accumulators summed into the result at the store.
8101  for( ; i<ipos; i+=SIMDSIZE )
8102  {
8103  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
8104  size_t j( UPP ? i : 0UL );
8105 
// Four columns at a time.
8106  for( ; (j+4UL) <= jend; j+=4UL )
8107  {
8108  const size_t kbegin( ( IsLower<MT5>::value )
8109  ?( ( IsUpper<MT4>::value )
8110  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8111  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8112  :( IsUpper<MT4>::value ? i : 0UL ) );
8113  const size_t kend( ( IsUpper<MT5>::value )
8114  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
8115  :( K ) );
8116 
8117  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8118  size_t k( kbegin );
8119 
8120  for( ; (k+2UL) <= kend; k+=2UL ) {
8121  const SIMDType a1( A.load(i,k ) );
8122  const SIMDType a2( A.load(i,k+1UL) );
8123  xmm1 += a1 * set( B(k ,j ) );
8124  xmm2 += a1 * set( B(k ,j+1UL) );
8125  xmm3 += a1 * set( B(k ,j+2UL) );
8126  xmm4 += a1 * set( B(k ,j+3UL) );
8127  xmm5 += a2 * set( B(k+1UL,j ) );
8128  xmm6 += a2 * set( B(k+1UL,j+1UL) );
8129  xmm7 += a2 * set( B(k+1UL,j+2UL) );
8130  xmm8 += a2 * set( B(k+1UL,j+3UL) );
8131  }
8132 
8133  for( ; k<kend; ++k ) {
8134  const SIMDType a1( A.load(i,k) );
8135  xmm1 += a1 * set( B(k,j ) );
8136  xmm2 += a1 * set( B(k,j+1UL) );
8137  xmm3 += a1 * set( B(k,j+2UL) );
8138  xmm4 += a1 * set( B(k,j+3UL) );
8139  }
8140 
8141  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm5) * factor );
8142  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm6) * factor );
8143  (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm7) * factor );
8144  (~C).store( i, j+3UL, (~C).load(i,j+3UL) - (xmm4+xmm8) * factor );
8145  }
8146 
// Three columns at a time.
8147  for( ; (j+3UL) <= jend; j+=3UL )
8148  {
8149  const size_t kbegin( ( IsLower<MT5>::value )
8150  ?( ( IsUpper<MT4>::value )
8151  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8152  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8153  :( IsUpper<MT4>::value ? i : 0UL ) );
8154  const size_t kend( ( IsUpper<MT5>::value )
8155  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
8156  :( K ) );
8157 
8158  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8159  size_t k( kbegin );
8160 
8161  for( ; (k+2UL) <= kend; k+=2UL ) {
8162  const SIMDType a1( A.load(i,k ) );
8163  const SIMDType a2( A.load(i,k+1UL) );
8164  xmm1 += a1 * set( B(k ,j ) );
8165  xmm2 += a1 * set( B(k ,j+1UL) );
8166  xmm3 += a1 * set( B(k ,j+2UL) );
8167  xmm4 += a2 * set( B(k+1UL,j ) );
8168  xmm5 += a2 * set( B(k+1UL,j+1UL) );
8169  xmm6 += a2 * set( B(k+1UL,j+2UL) );
8170  }
8171 
8172  for( ; k<kend; ++k ) {
8173  const SIMDType a1( A.load(i,k) );
8174  xmm1 += a1 * set( B(k,j ) );
8175  xmm2 += a1 * set( B(k,j+1UL) );
8176  xmm3 += a1 * set( B(k,j+2UL) );
8177  }
8178 
8179  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm4) * factor );
8180  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm5) * factor );
8181  (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm6) * factor );
8182  }
8183 
// Two columns at a time.
8184  for( ; (j+2UL) <= jend; j+=2UL )
8185  {
8186  const size_t kbegin( ( IsLower<MT5>::value )
8187  ?( ( IsUpper<MT4>::value )
8188  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8189  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8190  :( IsUpper<MT4>::value ? i : 0UL ) );
8191  const size_t kend( ( IsUpper<MT5>::value )
8192  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8193  :( K ) );
8194 
8195  SIMDType xmm1, xmm2, xmm3, xmm4;
8196  size_t k( kbegin );
8197 
8198  for( ; (k+2UL) <= kend; k+=2UL ) {
8199  const SIMDType a1( A.load(i,k ) );
8200  const SIMDType a2( A.load(i,k+1UL) );
8201  xmm1 += a1 * set( B(k ,j ) );
8202  xmm2 += a1 * set( B(k ,j+1UL) );
8203  xmm3 += a2 * set( B(k+1UL,j ) );
8204  xmm4 += a2 * set( B(k+1UL,j+1UL) );
8205  }
8206 
8207  for( ; k<kend; ++k ) {
8208  const SIMDType a1( A.load(i,k) );
8209  xmm1 += a1 * set( B(k,j ) );
8210  xmm2 += a1 * set( B(k,j+1UL) );
8211  }
8212 
8213  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
8214  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm4) * factor );
8215  }
8216 
// Left-over single column. No kend is computed in this branch; the k loops run
// up to K.
8217  if( j < jend )
8218  {
8219  const size_t kbegin( ( IsLower<MT5>::value )
8220  ?( ( IsUpper<MT4>::value )
8221  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8222  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8223  :( IsUpper<MT4>::value ? i : 0UL ) );
8224 
8225  SIMDType xmm1, xmm2;
8226  size_t k( kbegin );
8227 
8228  for( ; (k+2UL) <= K; k+=2UL ) {
8229  xmm1 += A.load(i,k ) * set( B(k ,j) );
8230  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
8231  }
8232 
8233  for( ; k<K; ++k ) {
8234  xmm1 += A.load(i,k) * set( B(k,j) );
8235  }
8236 
8237  (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
8238  }
8239  }
8240 
// Scalar tail: rows from ipos up to M, only executed when remainder is true.
// Columns run from (UPP ? i : 0) to (LOW ? i+1 : N), two at a time plus one
// optional left-over column.
8241  for( ; remainder && i<M; ++i )
8242  {
8243  const size_t jend( LOW ? i+1UL : N );
8244  size_t j( UPP ? i : 0UL );
8245 
8246  for( ; (j+2UL) <= jend; j+=2UL )
8247  {
8248  const size_t kbegin( ( IsLower<MT5>::value )
8249  ?( ( IsUpper<MT4>::value )
8250  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8251  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8252  :( IsUpper<MT4>::value ? i : 0UL ) );
8253  const size_t kend( ( IsUpper<MT5>::value )
8254  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8255  :( K ) );
8256 
// Explicitly value-initialized scalar accumulators.
8257  ElementType value1 = ElementType();
8258  ElementType value2 = ElementType();
8259 
8260  for( size_t k=kbegin; k<kend; ++k ) {
8261  value1 += A(i,k) * B(k,j );
8262  value2 += A(i,k) * B(k,j+1UL);
8263  }
8264 
8265  (~C)(i,j ) -= value1 * scalar;
8266  (~C)(i,j+1UL) -= value2 * scalar;
8267  }
8268 
// Left-over single column; the k loop runs up to K.
8269  if( j < jend )
8270  {
8271  const size_t kbegin( ( IsLower<MT5>::value )
8272  ?( ( IsUpper<MT4>::value )
8273  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8274  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8275  :( IsUpper<MT4>::value ? i : 0UL ) );
8276 
8277  ElementType value = ElementType();
8278 
8279  for( size_t k=kbegin; k<K; ++k ) {
8280  value += A(i,k) * B(k,j);
8281  }
8282 
8283  (~C)(i,j) -= value * scalar;
8284  }
8285  }
8286  }
8287  //**********************************************************************************************
8288 
8289  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix subtraction-assignment kernel (default variant).
// Does no work of its own: C, A, B and scalar are passed through unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): the return-type line (original line 8307) is absent from this listing.
8303  template< typename MT3 // Type of the left-hand side target matrix
8304  , typename MT4 // Type of the left-hand side matrix operand
8305  , typename MT5 // Type of the right-hand side matrix operand
8306  , typename ST2 > // Type of the scalar value
8308  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8309  {
8310  selectDefaultSubAssignKernel( C, A, B, scalar );
8311  }
8312  //**********************************************************************************************
8313 
8314  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix subtraction-assignment kernel (vectorized variant).
// Chooses one of three multiplication routines from the LOW / UPP flags:
//   LOW -> lmmm, otherwise UPP -> ummm, otherwise mmm.
// Every routine receives the negated scalar and ST2(1) as its trailing arguments.
// Presumably these act as alpha = -scalar and beta = 1, i.e. C = C - scalar*A*B;
// TODO confirm against the mmm/lmmm/ummm declarations in <blaze/math/dense/MMM.h>.
// NOTE(review): the return-type line (original line 8333) is absent from this listing.
8329  template< typename MT3 // Type of the left-hand side target matrix
8330  , typename MT4 // Type of the left-hand side matrix operand
8331  , typename MT5 // Type of the right-hand side matrix operand
8332  , typename ST2 > // Type of the scalar value
8334  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8335  {
8336  if( LOW )
8337  lmmm( C, A, B, -scalar, ST2(1) );
8338  else if( UPP )
8339  ummm( C, A, B, -scalar, ST2(1) );
8340  else
8341  mmm( C, A, B, -scalar, ST2(1) );
8342  }
8343  //**********************************************************************************************
8344 
8345  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// BLAS subtraction-assignment kernel (default variant).
// Pure forwarder: hands C, A, B and scalar to selectLargeSubAssignKernel().
// NOTE(review): the return-type line (original line 8364) is absent from this listing.
8360  template< typename MT3 // Type of the left-hand side target matrix
8361  , typename MT4 // Type of the left-hand side matrix operand
8362  , typename MT5 // Type of the right-hand side matrix operand
8363  , typename ST2 > // Type of the scalar value
8365  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8366  {
8367  selectLargeSubAssignKernel( C, A, B, scalar );
8368  }
8369  //**********************************************************************************************
8370 
8371  //**BLAS-based subtraction assignment to dense matrices******************************************
8372 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
8373 
// BLAS subtraction-assignment kernel (only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled).
// Three cases, selected by the triangularity of the operands:
//   * A is triangular: B is copied into a temporary, trmm() is applied with A on the left,
//     and the temporary is subtracted from C.
//   * otherwise, B is triangular: A is copied into a temporary, trmm() is applied with B on
//     the right, and the temporary is subtracted from C.
//   * neither is triangular: gemm() is called directly on C with ET(-scalar) and ET(1).
// NOTE(review): the return-type line (original line 8390) is absent from this listing.
8386  template< typename MT3 // Type of the left-hand side target matrix
8387  , typename MT4 // Type of the left-hand side matrix operand
8388  , typename MT5 // Type of the right-hand side matrix operand
8389  , typename ST2 > // Type of the scalar value
8391  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8392  {
// Element type of the target; the scalar is converted to it before each BLAS-style call.
8393  using ET = ElementType_<MT3>;
8394 
8395  if( IsTriangular<MT4>::value ) {
// The temporary starts as a serially evaluated copy of B and is passed to trmm() together
// with A; the lower/upper flag is taken from IsLower<MT4>.
8396  ResultType_<MT3> tmp( serial( B ) );
8397  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8398  subAssign( C, tmp );
8399  }
8400  else if( IsTriangular<MT5>::value ) {
// Mirror image of the branch above: the temporary starts as A, B is the triangular factor
// on the right-hand side, and the lower/upper flag is taken from IsLower<MT5>.
8401  ResultType_<MT3> tmp( serial( A ) );
8402  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8403  subAssign( C, tmp );
8404  }
8405  else {
// No temporary needed here: the scalar is negated and ET(1) is passed as the last argument
// (presumably alpha/beta of GEMM, giving C = C - scalar*A*B; TODO confirm against gemm.h).
8406  gemm( C, A, B, ET(-scalar), ET(1) );
8407  }
8408  }
8409 #endif
8410  //**********************************************************************************************
8411 
8412  //**Restructuring subtraction assignment to row-major matrices**********************************
// Restructuring subtraction assignment of a scaled matrix product to a row-major target
// (the target is taken as Matrix<MT,false>).
// The two operands of rhs.matrix_ are fetched and the product is re-expressed with trans()
// applied to one or both operands, passed through a ForwardFunctor, multiplied by
// rhs.scalar_ and handed to subAssign() again.
// NOTE(review): this listing is lossy. The return-type line (8427), the lines at 8430/8432
// and the leading `if` condition (8442) that guards the first subAssign() call are missing;
// the `else if` below pairs with that missing `if`.
8426  template< typename MT > // Type of the target matrix
8428  subAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8429  {
8431 
8433 
8434  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8435  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8436 
8437  const ForwardFunctor fwd;
8438 
8439  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8440  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8441 
// First alternative (condition not shown in this listing): both operands are transposed.
8443  subAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// Second alternative: only the left operand is transposed when MT1 is symmetric.
8444  else if( IsSymmetric<MT1>::value )
8445  subAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Remaining alternative: only the right operand is transposed.
8446  else
8447  subAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8448  }
8449  //**********************************************************************************************
8450 
8451  //**Subtraction assignment to sparse matrices***************************************************
8452  // No special implementation for the subtraction assignment to sparse matrices.
8453  //**********************************************************************************************
8454 
8455  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of a scaled matrix product to a dense matrix.
// After checking that the row and column counts of lhs and rhs agree, rhs is evaluated via
// serial() into a const ResultType temporary, which is then passed to schurAssign().
// NOTE(review): the lines at 8471 and 8473-8475 are absent from this listing.
8467  template< typename MT // Type of the target dense matrix
8468  , bool SO > // Storage order of the target dense matrix
8469  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8470  {
8472 
8476 
8477  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8478  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8479 
8480  const ResultType tmp( serial( rhs ) );
8481  schurAssign( ~lhs, tmp );
8482  }
8483  //**********************************************************************************************
8484 
8485  //**Schur product assignment to sparse matrices*************************************************
8486  // No special implementation for the Schur product assignment to sparse matrices.
8487  //**********************************************************************************************
8488 
8489  //**Multiplication assignment to dense matrices*************************************************
8490  // No special implementation for the multiplication assignment to dense matrices.
8491  //**********************************************************************************************
8492 
8493  //**Multiplication assignment to sparse matrices************************************************
8494  // No special implementation for the multiplication assignment to sparse matrices.
8495  //**********************************************************************************************
8496 
8497  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a scaled matrix product to a dense matrix.
// Steps visible here:
//   1. check that lhs and rhs have matching dimensions,
//   2. return early for an empty target, or reset the target and return when the inner
//      dimension (left.columns()) is zero,
//   3. construct the operands as LT / RT objects and forward `A * B * rhs.scalar_`
//      to smpAssign().
// NOTE(review): the return-type line (8514) and line 8517 are absent from this listing.
8512  template< typename MT // Type of the target dense matrix
8513  , bool SO > // Storage order of the target dense matrix
8515  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8516  {
8518 
8519  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8520  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8521 
8522  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8523  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8524 
// Empty target: nothing to write. Zero inner dimension: the target is reset() instead of
// being computed (presumably because the product has no contributing terms).
8525  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
8526  return;
8527  }
8528  else if( left.columns() == 0UL ) {
8529  reset( ~lhs );
8530  return;
8531  }
8532 
8533  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8534  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8535 
8536  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8537  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8538  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8539  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8540  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8541  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8542 
8543  smpAssign( ~lhs, A * B * rhs.scalar_ );
8544  }
8545  //**********************************************************************************************
8546 
8547  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a scaled matrix product to a sparse matrix (per the section banner).
// The expression rhs is first materialized as a const TmpType object; that temporary is
// then passed through a ForwardFunctor and assigned to the target via smpAssign().
// NOTE(review): the function signature (original lines 8564-8565) and the lines that
// introduce TmpType (8567-8576) are absent from this listing.
8562  template< typename MT // Type of the target sparse matrix
8563  , bool SO > // Storage order of the target sparse matrix
8566  {
8568 
8570 
8577 
8578  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8579  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8580 
8581  const ForwardFunctor fwd;
8582 
8583  const TmpType tmp( rhs );
8584  smpAssign( ~lhs, fwd( tmp ) );
8585  }
8586  //**********************************************************************************************
8587 
8588  //**Restructuring SMP assignment to row-major matrices******************************************
// Restructuring SMP assignment of a scaled matrix product to a row-major target
// (the target is taken as Matrix<MT,false>).
// The operands of rhs.matrix_ are fetched and the product is rebuilt with trans() applied
// to one or both operands, passed through a ForwardFunctor, scaled by rhs.scalar_ and
// forwarded to smpAssign().
// NOTE(review): this listing is lossy. The return-type line (8603), the lines at 8606/8608
// and the leading `if` condition (8618) guarding the first smpAssign() call are missing;
// the `else if` below pairs with that missing `if`.
8602  template< typename MT > // Type of the target matrix
8604  smpAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8605  {
8607 
8609 
8610  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8611  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8612 
8613  const ForwardFunctor fwd;
8614 
8615  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8616  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8617 
// First alternative (condition not shown in this listing): both operands are transposed.
8619  smpAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// Second alternative: only the left operand is transposed when MT1 is symmetric.
8620  else if( IsSymmetric<MT1>::value )
8621  smpAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Remaining alternative: only the right operand is transposed.
8622  else
8623  smpAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8624  }
8625  //**********************************************************************************************
8626 
8627  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a scaled matrix product to a dense matrix (per the banner).
// Returns without touching the target when the target is empty or the inner dimension
// (left.columns()) is zero; otherwise constructs the operands as LT / RT objects and
// forwards `A * B * rhs.scalar_` to smpAddAssign().
// NOTE(review): the function signature (original lines 8644-8645) and line 8647 are absent
// from this listing.
8642  template< typename MT // Type of the target dense matrix
8643  , bool SO > // Storage order of the target dense matrix
8646  {
8648 
8649  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8650  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8651 
8652  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8653  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8654 
// Unlike plain assignment, no reset() happens here: all three degenerate cases just return.
8655  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8656  return;
8657  }
8658 
8659  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8660  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8661 
8662  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8663  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8664  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8665  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8666  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8667  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8668 
8669  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
8670  }
8671  //**********************************************************************************************
8672 
8673  //**Restructuring SMP addition assignment to row-major matrices*********************************
// Restructuring SMP addition assignment of a scaled matrix product to a row-major target
// (per the section banner).
// The operands of rhs.matrix_ are fetched and the product is rebuilt with trans() applied
// to one or both operands, passed through a ForwardFunctor, scaled by rhs.scalar_ and
// forwarded to smpAddAssign().
// NOTE(review): this listing is lossy. The function signature (8689-8690), the lines at
// 8692/8694 and the leading `if` condition (8704) guarding the first call are missing;
// the `else if` below pairs with that missing `if`.
8688  template< typename MT > // Type of the target matrix
8691  {
8693 
8695 
8696  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8697  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8698 
8699  const ForwardFunctor fwd;
8700 
8701  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8702  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8703 
// First alternative (condition not shown in this listing): both operands are transposed.
8705  smpAddAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// Second alternative: only the left operand is transposed when MT1 is symmetric.
8706  else if( IsSymmetric<MT1>::value )
8707  smpAddAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Remaining alternative: only the right operand is transposed.
8708  else
8709  smpAddAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8710  }
8711  //**********************************************************************************************
8712 
8713  //**SMP addition assignment to sparse matrices**************************************************
8714  // No special implementation for the SMP addition assignment to sparse matrices.
8715  //**********************************************************************************************
8716 
8717  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a scaled matrix product to a dense matrix (per the banner).
// Returns without touching the target when the target is empty or the inner dimension
// (left.columns()) is zero; otherwise constructs the operands as LT / RT objects and
// forwards `A * B * rhs.scalar_` to smpSubAssign().
// NOTE(review): the function signature (original lines 8734-8735) and line 8737 are absent
// from this listing.
8732  template< typename MT // Type of the target dense matrix
8733  , bool SO > // Storage order of the target dense matrix
8736  {
8738 
8739  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8740  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8741 
8742  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8743  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8744 
// All three degenerate cases simply return; the target is left as it is.
8745  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8746  return;
8747  }
8748 
8749  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8750  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8751 
8752  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8753  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8754  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8755  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8756  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8757  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8758 
8759  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
8760  }
8761  //**********************************************************************************************
8762 
8763  //**Restructuring SMP subtraction assignment to row-major matrices******************************
// Restructuring SMP subtraction assignment of a scaled matrix product to a row-major
// target (per the section banner).
// The operands of rhs.matrix_ are fetched and the product is rebuilt with trans() applied
// to one or both operands, passed through a ForwardFunctor, scaled by rhs.scalar_ and
// forwarded to smpSubAssign().
// NOTE(review): this listing is lossy. The function signature (8779-8780), the lines at
// 8782/8784 and the leading `if` condition (8794) guarding the first call are missing;
// the `else if` below pairs with that missing `if`.
8778  template< typename MT > // Type of the target matrix
8781  {
8783 
8785 
8786  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8787  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8788 
8789  const ForwardFunctor fwd;
8790 
8791  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8792  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8793 
// First alternative (condition not shown in this listing): both operands are transposed.
8795  smpSubAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// Second alternative: only the left operand is transposed when MT1 is symmetric.
8796  else if( IsSymmetric<MT1>::value )
8797  smpSubAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Remaining alternative: only the right operand is transposed.
8798  else
8799  smpSubAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8800  }
8801  //**********************************************************************************************
8802 
8803  //**SMP subtraction assignment to sparse matrices***********************************************
8804  // No special implementation for the SMP subtraction assignment to sparse matrices.
8805  //**********************************************************************************************
8806 
8807  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of a scaled matrix product to a dense matrix.
// After checking that the dimensions of lhs and rhs agree, rhs is evaluated into a const
// ResultType temporary (directly, without the serial() wrapper used by schurAssign), and
// the temporary is passed to smpSchurAssign().
// NOTE(review): the lines at 8823 and 8825-8827 are absent from this listing.
8819  template< typename MT // Type of the target dense matrix
8820  , bool SO > // Storage order of the target dense matrix
8821  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8822  {
8824 
8828 
8829  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8830  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8831 
8832  const ResultType tmp( rhs );
8833  smpSchurAssign( ~lhs, tmp );
8834  }
8835  //**********************************************************************************************
8836 
8837  //**SMP Schur product assignment to sparse matrices*********************************************
8838  // No special implementation for the SMP Schur product assignment to sparse matrices.
8839  //**********************************************************************************************
8840 
8841  //**SMP multiplication assignment to dense matrices*********************************************
8842  // No special implementation for the SMP multiplication assignment to dense matrices.
8843  //**********************************************************************************************
8844 
8845  //**SMP multiplication assignment to sparse matrices********************************************
8846  // No special implementation for the SMP multiplication assignment to sparse matrices.
8847  //**********************************************************************************************
8848 
8849  //**Compile time checks*************************************************************************
8858  //**********************************************************************************************
8859 };
8861 //*************************************************************************************************
8862 
8863 
8864 
8865 
8866 //=================================================================================================
8867 //
8868 // GLOBAL BINARY ARITHMETIC OPERATORS
8869 //
8870 //=================================================================================================
8871 
8872 //*************************************************************************************************
// Multiplication operator for two dense matrices that are both passed as
// DenseMatrix<...,true>.
// Raises an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT when the column count
// of lhs differs from the row count of rhs; otherwise returns a ReturnType object built
// from the two operands.
// NOTE(review): the line defining ReturnType (original line 8910) and line 8904 are absent
// from this listing.
8899 template< typename MT1 // Type of the left-hand side dense matrix
8900  , typename MT2 > // Type of the right-hand side dense matrix
8901 inline decltype(auto)
8902  operator*( const DenseMatrix<MT1,true>& lhs, const DenseMatrix<MT2,true>& rhs )
8903 {
8905 
8906  if( (~lhs).columns() != (~rhs).rows() ) {
8907  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
8908  }
8909 
8911  return ReturnType( ~lhs, ~rhs );
8912 }
8913 //*************************************************************************************************
8914 
8915 
8916 
8917 
8918 //=================================================================================================
8919 //
8920 // GLOBAL FUNCTIONS
8921 //
8922 //=================================================================================================
8923 
8924 //*************************************************************************************************
// declsym() overload for a TDMatTDMatMultExpr.
// Rejects non-square expressions with an invalid-argument error; otherwise returns a
// ReturnType object constructed from the expression's left and right operands.
// NOTE(review): the line defining ReturnType (original line 8961) and line 8955 are absent
// from this listing, so the flag combination of the result cannot be read off here.
8947 template< typename MT1 // Type of the left-hand side dense matrix
8948  , typename MT2 // Type of the right-hand side dense matrix
8949  , bool SF // Symmetry flag
8950  , bool HF // Hermitian flag
8951  , bool LF // Lower flag
8952  , bool UF > // Upper flag
8953 inline decltype(auto) declsym( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8954 {
8956 
8957  if( !isSquare( dm ) ) {
8958  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
8959  }
8960 
8962  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8963 }
8965 //*************************************************************************************************
8966 
8967 
8968 //*************************************************************************************************
// declherm() overload for a TDMatTDMatMultExpr.
// Rejects non-square expressions with an invalid-argument error; otherwise returns a
// ReturnType object constructed from the expression's left and right operands.
// NOTE(review): the line defining ReturnType (original line 9005) and line 8999 are absent
// from this listing, so the flag combination of the result cannot be read off here.
8991 template< typename MT1 // Type of the left-hand side dense matrix
8992  , typename MT2 // Type of the right-hand side dense matrix
8993  , bool SF // Symmetry flag
8994  , bool HF // Hermitian flag
8995  , bool LF // Lower flag
8996  , bool UF > // Upper flag
8997 inline decltype(auto) declherm( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8998 {
9000 
9001  if( !isSquare( dm ) ) {
9002  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
9003  }
9004 
9006  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9007 }
9009 //*************************************************************************************************
9010 
9011 
9012 //*************************************************************************************************
// decllow() overload for a TDMatTDMatMultExpr.
// Rejects non-square expressions with an invalid-argument error; otherwise returns a
// ReturnType object constructed from the expression's left and right operands.
// NOTE(review): the line defining ReturnType (original line 9049) and line 9043 are absent
// from this listing, so the flag combination of the result cannot be read off here.
9035 template< typename MT1 // Type of the left-hand side dense matrix
9036  , typename MT2 // Type of the right-hand side dense matrix
9037  , bool SF // Symmetry flag
9038  , bool HF // Hermitian flag
9039  , bool LF // Lower flag
9040  , bool UF > // Upper flag
9041 inline decltype(auto) decllow( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9042 {
9044 
9045  if( !isSquare( dm ) ) {
9046  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9047  }
9048 
9050  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9051 }
9053 //*************************************************************************************************
9054 
9055 
9056 //*************************************************************************************************
// declupp() overload for a TDMatTDMatMultExpr.
// Rejects non-square expressions with an invalid-argument error; otherwise returns a
// ReturnType object constructed from the expression's left and right operands.
// NOTE(review): the line defining ReturnType (original line 9093) and line 9087 are absent
// from this listing, so the flag combination of the result cannot be read off here.
9079 template< typename MT1 // Type of the left-hand side dense matrix
9080  , typename MT2 // Type of the right-hand side dense matrix
9081  , bool SF // Symmetry flag
9082  , bool HF // Hermitian flag
9083  , bool LF // Lower flag
9084  , bool UF > // Upper flag
9085 inline decltype(auto) declupp( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9086 {
9088 
9089  if( !isSquare( dm ) ) {
9090  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9091  }
9092 
9094  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9095 }
9097 //*************************************************************************************************
9098 
9099 
9100 //*************************************************************************************************
// decldiag() overload for a TDMatTDMatMultExpr.
// Rejects non-square expressions with an invalid-argument error; otherwise returns a
// ReturnType object constructed from the expression's left and right operands.
// NOTE(review): the line defining ReturnType (original line 9137) and line 9131 are absent
// from this listing, so the flag combination of the result cannot be read off here.
9123 template< typename MT1 // Type of the left-hand side dense matrix
9124  , typename MT2 // Type of the right-hand side dense matrix
9125  , bool SF // Symmetry flag
9126  , bool HF // Hermitian flag
9127  , bool LF // Lower flag
9128  , bool UF > // Upper flag
9129 inline decltype(auto) decldiag( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9130 {
9132 
9133  if( !isSquare( dm ) ) {
9134  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
9135  }
9136 
9138  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9139 }
9141 //*************************************************************************************************
9142 
9143 
9144 
9145 
9146 //=================================================================================================
9147 //
9148 // ROWS SPECIALIZATIONS
9149 //
9150 //=================================================================================================
9151 
9152 //*************************************************************************************************
// Rows trait for the product expression: inherits from Rows<MT1>, i.e. it reports whatever
// the left-hand side operand type reports.
9154 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9155 struct Rows< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9156  : public Rows<MT1>
9157 {};
9159 //*************************************************************************************************
9160 
9161 
9162 
9163 
9164 //=================================================================================================
9165 //
9166 // COLUMNS SPECIALIZATIONS
9167 //
9168 //=================================================================================================
9169 
9170 //*************************************************************************************************
// Columns trait for the product expression: inherits from Columns<MT2>, i.e. it reports
// whatever the right-hand side operand type reports.
9172 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9173 struct Columns< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9174  : public Columns<MT2>
9175 {};
9177 //*************************************************************************************************
9178 
9179 
9180 
9181 
9182 //=================================================================================================
9183 //
9184 // ISALIGNED SPECIALIZATIONS
9185 //
9186 //=================================================================================================
9187 
9188 //*************************************************************************************************
// IsAligned trait for the product expression: true only when IsAligned holds for both
// operand types MT1 and MT2.
9190 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9191 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9192  : public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
9193 {};
9195 //*************************************************************************************************
9196 
9197 
9198 
9199 
9200 //=================================================================================================
9201 //
9202 // ISSYMMETRIC SPECIALIZATIONS
9203 //
9204 //=================================================================================================
9205 
9206 //*************************************************************************************************
// IsSymmetric trait for the product expression. True when any of the following holds:
//   * the symmetry flag SF is set,
//   * the Hermitian flag HF is set and the element type passes IsBuiltin,
//   * both the lower flag LF and the upper flag UF are set.
// The element type is queried on the expression instantiated with fixed flags
// <false,true,false,false> rather than with this specialization's own flags.
9208 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9209 struct IsSymmetric< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9210  : public BoolConstant< Or< Bool<SF>
9211  , And< Bool<HF>
9212  , IsBuiltin< ElementType_< TDMatTDMatMultExpr<MT1,MT2,false,true,false,false> > > >
9213  , And< Bool<LF>, Bool<UF> > >::value >
9214 {};
9216 //*************************************************************************************************
9217 
9218 
9219 
9220 
9221 //=================================================================================================
9222 //
9223 // ISHERMITIAN SPECIALIZATIONS
9224 //
9225 //=================================================================================================
9226 
9227 //*************************************************************************************************
// IsHermitian trait for the product expression: partial specialization that matches only
// when the Hermitian flag (fourth flag argument) is true, and then yields TrueType.
9229 template< typename MT1, typename MT2, bool SF, bool LF, bool UF >
9230 struct IsHermitian< TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
9231  : public TrueType
9232 {};
9234 //*************************************************************************************************
9235 
9236 
9237 
9238 
9239 //=================================================================================================
9240 //
9241 // ISLOWER SPECIALIZATIONS
9242 //
9243 //=================================================================================================
9244 
9245 //*************************************************************************************************
// IsLower trait for the product expression. True when any of the following holds:
//   * the lower flag LF is set,
//   * both operands are lower,
//   * SF or HF is set and both operands are upper.
9247 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9248 struct IsLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9249  : public BoolConstant< Or< Bool<LF>
9250  , And< IsLower<MT1>, IsLower<MT2> >
9251  , And< Or< Bool<SF>, Bool<HF> >
9252  , IsUpper<MT1>, IsUpper<MT2> > >::value >
9253 {};
9255 //*************************************************************************************************
9256 
9257 
9258 
9259 
9260 //=================================================================================================
9261 //
9262 // ISUNILOWER SPECIALIZATIONS
9263 //
9264 //=================================================================================================
9265 
9266 //*************************************************************************************************
// IsUniLower trait for the product expression. True when either
//   * both operands are unilower, or
//   * SF or HF is set and both operands are uniupper.
9268 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9269 struct IsUniLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9270  : public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
9271  , And< Or< Bool<SF>, Bool<HF> >
9272  , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
9273 {};
9275 //*************************************************************************************************
9276 
9277 
9278 
9279 
9280 //=================================================================================================
9281 //
9282 // ISSTRICTLYLOWER SPECIALIZATIONS
9283 //
9284 //=================================================================================================
9285 
9286 //*************************************************************************************************
// IsStrictlyLower trait for the product expression. True when any of the following holds:
//   * MT1 is strictly lower and MT2 is lower,
//   * MT2 is strictly lower and MT1 is lower,
//   * SF or HF is set and one operand is strictly upper while the other is upper.
9288 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9289 struct IsStrictlyLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9290  : public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9291  , And< IsStrictlyLower<MT2>, IsLower<MT1> >
9292  , And< Or< Bool<SF>, Bool<HF> >
9293  , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9294  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
9295 {};
9297 //*************************************************************************************************
9298 
9299 
9300 
9301 
9302 //=================================================================================================
9303 //
9304 // ISUPPER SPECIALIZATIONS
9305 //
9306 //=================================================================================================
9307 
9308 //*************************************************************************************************
// IsUpper trait for the product expression. True when any of the following holds:
//   * the upper flag UF is set,
//   * both operands are upper,
//   * SF or HF is set and both operands are lower.
9310 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9311 struct IsUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9312  : public BoolConstant< Or< Bool<UF>
9313  , And< IsUpper<MT1>, IsUpper<MT2> >
9314  , And< Or< Bool<SF>, Bool<HF> >
9315  , IsLower<MT1>, IsLower<MT2> > >::value >
9316 {};
9318 //*************************************************************************************************
9319 
9320 
9321 
9322 
9323 //=================================================================================================
9324 //
9325 // ISUNIUPPER SPECIALIZATIONS
9326 //
9327 //=================================================================================================
9328 
9329 //*************************************************************************************************
// IsUniUpper trait for the product expression. True when either
//   * both operands are uniupper, or
//   * SF or HF is set and both operands are unilower.
9331 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9332 struct IsUniUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9333  : public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
9334  , And< Or< Bool<SF>, Bool<HF> >
9335  , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
9336 {};
9338 //*************************************************************************************************
9339 
9340 
9341 
9342 
9343 //=================================================================================================
9344 //
9345 // ISSTRICTLYUPPER SPECIALIZATIONS
9346 //
9347 //=================================================================================================
9348 
9349 //*************************************************************************************************
// IsStrictlyUpper trait for the product expression. True when any of the following holds:
//   * MT1 is strictly upper and MT2 is upper,
//   * MT2 is strictly upper and MT1 is upper,
//   * SF or HF is set and one operand is strictly lower while the other is lower.
9351 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9352 struct IsStrictlyUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9353  : public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9354  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
9355  , And< Or< Bool<SF>, Bool<HF> >
9356  , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9357  , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
9358 {};
9360 //*************************************************************************************************
9361 
9362 } // namespace blaze
9363 
9364 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Data type constraint.
Header file for the generic min algorithm.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:196
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:488
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:476
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:297
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:547
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:155
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:288
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Flag for Hermitian matrices.
Definition: TDMatTDMatMultExpr.h:177
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:620
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:537
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
Header file for the IsIntegral type trait.
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1762
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:323
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1027
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Column< MT > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:124
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Expression object for transpose dense matrix-transpose dense matrix multiplications. The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:154
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:280
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1809
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:422
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
Flag for symmetric matrices.
Definition: TDMatTDMatMultExpr.h:176
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:279
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:283
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Row< MT > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:124
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Header file for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:432
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:110
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:110
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:281
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:456
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Flag for upper matrices.
Definition: TDMatTDMatMultExpr.h:179
Header file for the Columns type trait.
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1027
Header file for the IsLower type trait.
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:160
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:87
Flag for lower matrices.
Definition: TDMatTDMatMultExpr.h:178
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsStrictlyTriangular type trait.
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:282
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:619
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:264
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:489
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:108
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
Header file for the conjugate shim.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:159
Compile time check for resizable data types.This type trait tests whether the given data type is a re...
Definition: IsResizable.h:75
System settings for the BLAS mode.
Header file for the IsSIMDCombinable type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:412
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Utility type for generic codes.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:294
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:158
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:386
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time type negation.The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1029
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Base class for matrices.The Matrix class is a base class for all dense and sparse matrix classes with...
Definition: Forward.h:101
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:156
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:790
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1029
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:402
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:444
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:75
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:338
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:75
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:157
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:466
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:742
Header file for the IsResizable type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:285
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:284
Header file for the IsExpression type trait class.
Header file for the function trace functionality.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:291