TDMatTDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/dense/MMM.h>
54 #include <blaze/math/Exception.h>
67 #include <blaze/math/shims/Reset.h>
69 #include <blaze/math/SIMD.h>
94 #include <blaze/math/views/Check.h>
95 #include <blaze/system/BLAS.h>
96 #include <blaze/system/Blocking.h>
97 #include <blaze/system/Debugging.h>
102 #include <blaze/util/Assert.h>
103 #include <blaze/util/Complex.h>
106 #include <blaze/util/DisableIf.h>
107 #include <blaze/util/EnableIf.h>
109 #include <blaze/util/mpl/And.h>
110 #include <blaze/util/mpl/Bool.h>
111 #include <blaze/util/mpl/If.h>
112 #include <blaze/util/mpl/Not.h>
113 #include <blaze/util/mpl/Or.h>
114 #include <blaze/util/TrueType.h>
115 #include <blaze/util/Types.h>
124 
125 
126 namespace blaze {
127 
128 //=================================================================================================
129 //
130 // CLASS TDMATTDMATMULTEXPR
131 //
132 //=================================================================================================
133 
134 //*************************************************************************************************
141 template< typename MT1 // Type of the left-hand side dense matrix
142  , typename MT2 // Type of the right-hand side dense matrix
143  , bool SF // Symmetry flag
144  , bool HF // Hermitian flag
145  , bool LF // Lower flag
146  , bool UF > // Upper flag
147 class TDMatTDMatMultExpr
148  : public MatMatMultExpr< DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
149  , private Computation
150 {
151  private:
152  //**Type definitions****************************************************************************
159  //**********************************************************************************************
160 
161  //**********************************************************************************************
// Compile-time flag for the left-hand side operand type MT1: true when either
// IsComputation<MT1> or RequiresEvaluation<MT1> reports true. It is consumed by
// IsEvaluationRequired and by the smpAssignable flag of this class.
163  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
164  //**********************************************************************************************
165 
166  //**********************************************************************************************
// Compile-time flag for the right-hand side operand type MT2: true when either
// IsComputation<MT2> or RequiresEvaluation<MT2> reports true. It is consumed by
// IsEvaluationRequired and by the smpAssignable flag of this class.
168  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
169  //**********************************************************************************************
170 
171  //**********************************************************************************************
173  enum : bool {
174  SYM = ( SF && !( HF || LF || UF ) ),
175  HERM = ( HF && !( LF || UF ) ),
176  LOW = ( LF || ( ( SF || HF ) && UF ) ),
177  UPP = ( UF || ( ( SF || HF ) && LF ) )
178  };
179  //**********************************************************************************************
180 
181  //**********************************************************************************************
183 
// Helper trait taking three types (T1, T2, T3). It is used by IsEvaluationRequired below.
// NOTE(review): the value expression is truncated in this listing (listing line 192 is
// absent); only the leading IsRowMajorMatrix<T1> condition is visible here. Consult the
// complete header for the remaining conditions.
189  template< typename T1, typename T2, typename T3 >
190  struct CanExploitSymmetry {
191  enum : bool { value = IsRowMajorMatrix<T1>::value &&
193  };
195  //**********************************************************************************************
196 
197  //**********************************************************************************************
199 
203  template< typename T1, typename T2, typename T3 >
204  struct IsEvaluationRequired {
205  enum : bool { value = ( evaluateLeft || evaluateRight ) &&
206  CanExploitSymmetry<T1,T2,T3>::value };
207  };
209  //**********************************************************************************************
210 
211  //**********************************************************************************************
213 
// Helper trait deciding whether a BLAS kernel may be used for the given type triple.
// Conditions visible in this listing: BLAS mode and BLAS matrix/matrix multiplication
// are enabled, none of the structure flags (SYM, HERM, LOW, UPP) is set, all three types
// report simdEnabled, and T1 and T3 share the same element type.
// NOTE(review): listing lines 220-223 and 225-228 are absent, so further conditions of
// the original expression are not shown here.
216  template< typename T1, typename T2, typename T3 >
217  struct UseBlasKernel {
218  enum : bool { value = BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
219  !SYM && !HERM && !LOW && !UPP &&
224  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
229  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
230  };
232  //**********************************************************************************************
233 
234  //**********************************************************************************************
236 
// Helper trait deciding whether the vectorized default kernel may be used for the given
// type triple. Visible conditions: useOptimizedKernels is set and all three types report
// simdEnabled.
// NOTE(review): the expression is truncated in this listing (listing lines 242, 244-245
// and 247-248 are absent); the fragment ", ElementType_<T3> >::value" is the tail of a
// condition whose beginning is not shown.
239  template< typename T1, typename T2, typename T3 >
240  struct UseVectorizedDefaultKernel {
241  enum : bool { value = useOptimizedKernels &&
243  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
246  , ElementType_<T3> >::value &&
249  };
251  //**********************************************************************************************
252 
253  //**********************************************************************************************
255 
// Functor type used to wrap restructured products (see its use as 'fwd' in the row-major
// small-matrix kernel). The selection is evaluated in this priority order:
//   HERM          -> DeclHerm
//   SYM           -> DeclSym
//   LOW and UPP   -> DeclDiag
//   LOW only      -> DeclLow
//   UPP only      -> DeclUpp
//   none          -> Noop
258  using ForwardFunctor = IfTrue_< HERM
259  , DeclHerm
260  , IfTrue_< SYM
261  , DeclSym
262  , IfTrue_< LOW
263  , IfTrue_< UPP
264  , DeclDiag
265  , DeclLow >
266  , IfTrue_< UPP
267  , DeclUpp
268  , Noop > > > >;
270  //**********************************************************************************************
271 
272  public:
273  //**Type definitions****************************************************************************
276 
// Public type aliases of the expression.
// NOTE(review): ElementType and ResultType are declared on listing lines that are absent
// from this view.
// ReturnType: type returned by operator() and at() (a const element value).
282  using ReturnType = const ElementType;
// CompositeType: const ResultType.
283  using CompositeType = const ResultType;
284 
// LeftOperand: stored by value when MT1 is an expression, by const reference otherwise.
286  using LeftOperand = If_< IsExpression<MT1>, const MT1, const MT1& >;
287 
// RightOperand: stored by value when MT2 is an expression, by const reference otherwise.
289  using RightOperand = If_< IsExpression<MT2>, const MT2, const MT2& >;
290 
293 
296  //**********************************************************************************************
297 
298  //**Compilation flags***************************************************************************
// simdEnabled: compilation flag for SIMD optimization. Visible conditions: MT1 is not
// diagonal and both operand types report simdEnabled.
// NOTE(review): the expression is truncated in this listing (listing lines 302-303 are
// absent), so the trailing conditions and the closing of the enum are not shown.
300  enum : bool { simdEnabled = !IsDiagonal<MT1>::value &&
301  MT1::simdEnabled && MT2::simdEnabled &&
304 
// smpAssignable: true only if neither operand needs an intermediate evaluation and both
// operand types are themselves smpAssignable.
306  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
307  !evaluateRight && MT2::smpAssignable };
308  //**********************************************************************************************
309 
310  //**SIMD properties*****************************************************************************
// SIMDSIZE: taken from SIMDTrait<ElementType>::size. The kernels below use it as the step
// width (in rows) of one SIMD load/store.
312  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
313  //**********************************************************************************************
314 
315  //**Constructor*********************************************************************************
// Constructs the multiplication expression from its two operands.
//  lhs: left-hand side operand of the product.
//  rhs: right-hand side operand of the product.
// The operands are stored in lhs_/rhs_. The inner dimensions (lhs.columns() and
// rhs.rows()) are only checked through BLAZE_INTERNAL_ASSERT, so callers are expected to
// have validated the sizes beforehand.
321  explicit inline TDMatTDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
322  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
323  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
324  {
325  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
326  }
327  //**********************************************************************************************
328 
329  //**Access operator*****************************************************************************
// 2D element access: computes element (i,j) of the product on demand.
//  i: row index, asserted to be < lhs_.rows().
//  j: column index, asserted to be < rhs_.columns().
// Returns the inner product of row i of lhs_ and column j of rhs_, restricted to the
// index range that can be non-zero given the structure traits of MT1/MT2.
// NOTE(review): listing line 346 is absent. The closing brace at listing line 370 and
// the 'else' at 371 belong to a branch whose opening condition is not shown (presumably
// an 'else if' selecting the structure-aware path below - confirm against the full header).
336  inline ReturnType operator()( size_t i, size_t j ) const {
337  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
338  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
339 
     // Diagonal operands: a single product term contributes to the result.
340  if( IsDiagonal<MT1>::value ) {
341  return lhs_(i,i) * rhs_(i,j);
342  }
343  else if( IsDiagonal<MT2>::value ) {
344  return lhs_(i,j) * rhs_(j,j);
345  }
     // First summation index: an upper MT1 starts at i (i+1 if strictly upper), a lower
     // MT2 starts at j (j+1 if strictly lower); with both, the larger bound applies.
347  const size_t begin( ( IsUpper<MT1>::value )
348  ?( ( IsLower<MT2>::value )
349  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
350  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
351  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
352  :( ( IsLower<MT2>::value )
353  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
354  :( 0UL ) ) );
     // One-past-last summation index: a lower MT1 ends at i+1 (i if strictly lower), an
     // upper MT2 ends at j+1 (j if strictly upper); with both, the smaller bound applies.
355  const size_t end( ( IsLower<MT1>::value )
356  ?( ( IsUpper<MT2>::value )
357  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
358  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
359  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
360  :( ( IsUpper<MT2>::value )
361  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
362  :( lhs_.columns() ) ) );
363 
     // Empty range: return a default-constructed element.
364  if( begin >= end ) return ElementType();
365 
366  const size_t n( end - begin );
367 
     // Inner product over the restricted range [begin,end).
368  return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
369  subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
370  }
371  else {
     // Fallback: full inner product of row i and column j.
372  return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
373  }
374  }
375  //**********************************************************************************************
376 
377  //**At function*********************************************************************************
385  inline ReturnType at( size_t i, size_t j ) const {
386  if( i >= lhs_.rows() ) {
387  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
388  }
389  if( j >= rhs_.columns() ) {
390  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
391  }
392  return (*this)(i,j);
393  }
394  //**********************************************************************************************
395 
396  //**Rows function*******************************************************************************
// Returns the number of rows of the product, i.e. the row count of the left operand.
401  inline size_t rows() const noexcept {
402  return lhs_.rows();
403  }
404  //**********************************************************************************************
405 
406  //**Columns function****************************************************************************
// Returns the number of columns of the product, i.e. the column count of the right operand.
411  inline size_t columns() const noexcept {
412  return rhs_.columns();
413  }
414  //**********************************************************************************************
415 
416  //**Left operand access*************************************************************************
// Returns the stored left-hand side operand (lhs_) as LeftOperand.
421  inline LeftOperand leftOperand() const noexcept {
422  return lhs_;
423  }
424  //**********************************************************************************************
425 
426  //**Right operand access************************************************************************
// Returns the stored right-hand side operand (rhs_) as RightOperand.
431  inline RightOperand rightOperand() const noexcept {
432  return rhs_;
433  }
434  //**********************************************************************************************
435 
436  //**********************************************************************************************
442  template< typename T >
443  inline bool canAlias( const T* alias ) const noexcept {
444  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
445  }
446  //**********************************************************************************************
447 
448  //**********************************************************************************************
454  template< typename T >
455  inline bool isAliased( const T* alias ) const noexcept {
456  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
457  }
458  //**********************************************************************************************
459 
460  //**********************************************************************************************
465  inline bool isAligned() const noexcept {
466  return lhs_.isAligned() && rhs_.isAligned();
467  }
468  //**********************************************************************************************
469 
470  //**********************************************************************************************
// Reports whether the expression can be used in SMP assignments.
// Visible conditions: either BLAS mode / BLAS matrix-matrix multiplication is disabled or
// the result (rows()*columns()) is below TDMATTDMATMULT_THRESHOLD, AND the result reaches
// at least SMP_TDMATTDMATMULT_THRESHOLD elements.
// NOTE(review): listing lines 478 and 481 are absent; the expression has one more
// alternative inside the first group and a trailing condition that are not shown here.
475  inline bool canSMPAssign() const noexcept {
476  return ( !BLAZE_BLAS_MODE ||
477  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
479  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
480  ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
482  }
483  //**********************************************************************************************
484 
485  private:
486  //**Member variables****************************************************************************
489  //**********************************************************************************************
490 
491  //**Assignment to dense matrices****************************************************************
// Assignment of the multiplication expression to a dense matrix.
//  lhs: target dense matrix; its dimensions are asserted to match the expression.
//  rhs: the multiplication expression to be assigned.
// Empty targets return immediately; an empty inner dimension resets the target. Otherwise
// both operands are evaluated serially into A and B and selectAssignKernel performs the
// actual computation.
// NOTE(review): listing lines 506 (return type / enabling condition) and 509 are absent,
// and the LT/RT types are declared on lines not shown in this view.
504  template< typename MT // Type of the target dense matrix
505  , bool SO > // Storage order of the target dense matrix
507  assign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
508  {
510 
511  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
512  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
513 
     // Nothing to do for an empty target; a zero inner dimension yields an all-reset result.
514  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
515  return;
516  }
517  else if( rhs.lhs_.columns() == 0UL ) {
518  reset( ~lhs );
519  return;
520  }
521 
522  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
523  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
524 
525  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
526  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
527  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
528  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
529  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
530  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
531 
532  TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
533  }
535  //**********************************************************************************************
536 
537  //**Assignment to dense matrices (kernel selection)*********************************************
548  template< typename MT3 // Type of the left-hand side target matrix
549  , typename MT4 // Type of the left-hand side matrix operand
550  , typename MT5 > // Type of the right-hand side matrix operand
551  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
552  {
553  if( ( IsDiagonal<MT4>::value ) ||
554  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
555  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
556  selectSmallAssignKernel( C, A, B );
557  else
558  selectBlasAssignKernel( C, A, B );
559  }
561  //**********************************************************************************************
562 
563  //**Default assignment to dense matrices (general/general)**************************************
// Default (non-vectorized) assignment kernel C = A * B, general/general case.
//  C: target matrix, written element-wise through C(i,j).
//  A: left-hand side operand (M x K).
//  B: right-hand side operand (K x N).
// The kernel walks the target column by column. For each column j the k-range is
// narrowed by the structure of MT5, the first k initializes the column by assignment and
// the remaining k accumulate with +=. Row ranges are narrowed by the structure of MT4 and
// by the SYM/HERM/LOW/UPP flags. For SYM/HERM a final pass fills the elements above the
// diagonal from their mirrored counterparts.
// NOTE(review): listing lines 580 (return type), 608, 613, 642 and 647 (the nested
// strict-triangular tests of the ibegin/iend expressions) are absent from this view.
577  template< typename MT3 // Type of the left-hand side target matrix
578  , typename MT4 // Type of the left-hand side matrix operand
579  , typename MT5 > // Type of the right-hand side matrix operand
581  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
582  {
583  const size_t M( A.rows() );
584  const size_t N( B.columns() );
585  const size_t K( A.columns() );
586 
587  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
588 
589  for( size_t j=0UL; j<N; ++j )
590  {
     // k-range of column j of B that can hold non-zero elements.
591  const size_t kbegin( ( IsLower<MT5>::value )
592  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
593  :( 0UL ) );
594  const size_t kend( ( IsUpper<MT5>::value )
595  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
596  :( K ) );
597  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
598 
     // Strictly triangular B with an empty k-range: the whole target column is reset.
599  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
600  for( size_t i=0UL; i<M; ++i ) {
601  reset( C(i,j) );
602  }
603  continue;
604  }
605 
     // First k (k == kbegin): initializes column j by assignment and resets the rows
     // outside [ibegin,iend) where the structure requires it.
606  {
607  const size_t ibegin( ( IsLower<MT4>::value )
609  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
610  :( LOW ? max(j,kbegin) : kbegin ) )
611  :( LOW ? j : 0UL ) );
612  const size_t iend( ( IsUpper<MT4>::value )
614  ?( UPP ? min(j+1UL,kbegin) : kbegin )
615  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
616  :( UPP ? j+1UL : M ) );
617 
618  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
619  for( size_t i=0UL; i<ibegin; ++i ) {
620  reset( C(i,j) );
621  }
622  }
623  else if( IsStrictlyLower<MT4>::value ) {
624  reset( C(0UL,j) );
625  }
626  for( size_t i=ibegin; i<iend; ++i ) {
627  C(i,j) = A(i,kbegin) * B(kbegin,j);
628  }
629  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
630  for( size_t i=iend; i<M; ++i ) {
631  reset( C(i,j) );
632  }
633  }
634  else if( IsStrictlyUpper<MT4>::value ) {
635  reset( C(M-1UL,j) );
636  }
637  }
638 
     // Remaining k: accumulate into the already initialized column.
639  for( size_t k=kbegin+1UL; k<kend; ++k )
640  {
641  const size_t ibegin( ( IsLower<MT4>::value )
643  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
644  :( SYM || HERM || LOW ? max( j, k ) : k ) )
645  :( SYM || HERM || LOW ? j : 0UL ) );
646  const size_t iend( ( IsUpper<MT4>::value )
648  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
649  :( UPP ? min(j+1UL,k) : k ) )
650  :( UPP ? j+1UL : M ) );
651 
     // With structure flags set the row range may be empty; skip this k in that case.
652  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
653  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
654 
655  for( size_t i=ibegin; i<iend; ++i ) {
656  C(i,j) += A(i,k) * B(k,j);
657  }
     // For an upper A, row iend receives its first contribution at this k, hence '='.
658  if( IsUpper<MT4>::value ) {
659  C(iend,j) = A(iend,k) * B(k,j);
660  }
661  }
662  }
663 
     // SYM/HERM: fill the elements above the diagonal from the mirrored elements
     // (conjugated for HERM).
664  if( SYM || HERM ) {
665  for( size_t j=1UL; j<N; ++j ) {
666  for( size_t i=0UL; i<j; ++i ) {
667  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
668  }
669  }
670  }
671  }
673  //**********************************************************************************************
674 
675  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default assignment kernel C = A * B for a non-diagonal A and a diagonal B (selected via
// the EnableIf_ condition in the return type).
//  C: target matrix.
//  A: left-hand side operand (not diagonal).
//  B: right-hand side operand (diagonal); only B(j,j) is read.
// Each target column j is column j of A scaled by B(j,j). Rows outside the structural
// range of A are reset.
// NOTE(review): listing line 695 (first statement of the body) is absent from this view.
689  template< typename MT3 // Type of the left-hand side target matrix
690  , typename MT4 // Type of the left-hand side matrix operand
691  , typename MT5 > // Type of the right-hand side matrix operand
692  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
693  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
694  {
696 
697  const size_t M( A.rows() );
698  const size_t N( B.columns() );
699 
700  for( size_t j=0UL; j<N; ++j )
701  {
     // Row range of column j of A that can hold non-zero elements.
702  const size_t ibegin( ( IsLower<MT4>::value )
703  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
704  :( 0UL ) );
705  const size_t iend( ( IsUpper<MT4>::value )
706  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
707  :( M ) );
708  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
709 
     // Rows above the range (lower A) are reset.
710  if( IsLower<MT4>::value ) {
711  for( size_t i=0UL; i<ibegin; ++i ) {
712  reset( C(i,j) );
713  }
714  }
715  for( size_t i=ibegin; i<iend; ++i ) {
716  C(i,j) = A(i,j) * B(j,j);
717  }
     // Rows below the range (upper A) are reset.
718  if( IsUpper<MT4>::value ) {
719  for( size_t i=iend; i<M; ++i ) {
720  reset( C(i,j) );
721  }
722  }
723  }
724  }
726  //**********************************************************************************************
727 
728  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default assignment kernel C = A * B, diagonal/general case (per the section title).
//  C: target matrix.
//  A: left-hand side operand; only A(i,i) is read.
//  B: right-hand side operand.
// Each target element is C(i,j) = A(i,i) * B(i,j); the row range per column follows the
// structure of MT5.
// NOTE(review): listing lines 745 (return type / enabling condition) and 748 are absent
// from this view.
// NOTE(review): the reset guards below test IsLower/IsUpper on MT4 while ibegin/iend are
// derived from MT5. Presumably harmless when MT4 is diagonal (both guards are then true
// and the reset loops are empty whenever MT5 does not narrow the range) - confirm.
742  template< typename MT3 // Type of the left-hand side target matrix
743  , typename MT4 // Type of the left-hand side matrix operand
744  , typename MT5 > // Type of the right-hand side matrix operand
746  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
747  {
749 
750  const size_t M( A.rows() );
751  const size_t N( B.columns() );
752 
753  for( size_t j=0UL; j<N; ++j )
754  {
     // Row range of column j of B that can hold non-zero elements.
755  const size_t ibegin( ( IsLower<MT5>::value )
756  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
757  :( 0UL ) );
758  const size_t iend( ( IsUpper<MT5>::value )
759  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
760  :( M ) );
761  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
762 
763  if( IsLower<MT4>::value ) {
764  for( size_t i=0UL; i<ibegin; ++i ) {
765  reset( C(i,j) );
766  }
767  }
768  for( size_t i=ibegin; i<iend; ++i ) {
769  C(i,j) = A(i,i) * B(i,j);
770  }
771  if( IsUpper<MT4>::value ) {
772  for( size_t i=iend; i<M; ++i ) {
773  reset( C(i,j) );
774  }
775  }
776  }
777  }
779  //**********************************************************************************************
780 
781  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default assignment kernel C = A * B for two diagonal operands (selected via the
// EnableIf_ condition in the return type).
//  C: target matrix; completely reset before the diagonal is written.
//  A: left-hand side operand; only A(i,i) is read.
//  B: right-hand side operand; only B(i,i) is read.
// NOTE(review): listing line 801 (first statement of the body) is absent from this view.
795  template< typename MT3 // Type of the left-hand side target matrix
796  , typename MT4 // Type of the left-hand side matrix operand
797  , typename MT5 > // Type of the right-hand side matrix operand
798  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
799  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
800  {
802 
803  reset( C );
804 
     // Only the diagonal of the product can be non-zero.
805  for( size_t i=0UL; i<A.rows(); ++i ) {
806  C(i,i) = A(i,i) * B(i,i);
807  }
808  }
810  //**********************************************************************************************
811 
812  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel, default variant: forwards unchanged to
// selectDefaultAssignKernel( C, A, B ).
// NOTE(review): listing line 829 (return type / enabling condition) is absent from this
// view, so the condition under which this overload is selected is not shown.
826  template< typename MT3 // Type of the left-hand side target matrix
827  , typename MT4 // Type of the left-hand side matrix operand
828  , typename MT5 > // Type of the right-hand side matrix operand
830  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
831  {
832  selectDefaultAssignKernel( C, A, B );
833  }
835  //**********************************************************************************************
836 
837  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Small-matrix assignment kernel for row-major targets (DenseMatrix<MT3,false>).
//  C: row-major target matrix.
//  A: left-hand side operand.
//  B: right-hand side operand.
// Instead of computing the product directly, one operand is converted to its opposite
// storage order (OppositeType_) via a serial evaluation, and the restructured product is
// wrapped in the ForwardFunctor and assigned to the target.
// NOTE(review): listing lines 855 (return type), 858-861, 865 and 869 are absent from
// this view. The conditions of the first two branches are therefore not shown, and the
// braces appear unbalanced here.
852  template< typename MT3 // Type of the left-hand side target matrix
853  , typename MT4 // Type of the left-hand side matrix operand
854  , typename MT5 > // Type of the right-hand side matrix operand
856  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
857  {
862 
863  const ForwardFunctor fwd;
864 
     // Branch with hidden condition: convert B.
866  const OppositeType_<MT5> tmp( serial( B ) );
867  assign( ~C, fwd( A * tmp ) );
868  }
     // Branch with hidden condition: convert A.
870  const OppositeType_<MT4> tmp( serial( A ) );
871  assign( ~C, fwd( tmp * B ) );
872  }
     // Otherwise convert the operand with fewer elements (B on a tie).
873  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
874  const OppositeType_<MT5> tmp( serial( B ) );
875  assign( ~C, fwd( A * tmp ) );
876  }
877  else {
878  const OppositeType_<MT4> tmp( serial( A ) );
879  assign( ~C, fwd( tmp * B ) );
880  }
881  }
883  //**********************************************************************************************
884 
885  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
900  template< typename MT3 // Type of the left-hand side target matrix
901  , typename MT4 // Type of the left-hand side matrix operand
902  , typename MT5 > // Type of the right-hand side matrix operand
904  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
905  {
906  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
907 
908  const size_t M( A.rows() );
909  const size_t N( B.columns() );
910  const size_t K( A.columns() );
911 
912  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
913 
914  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
915  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
916 
917  if( LOW && UPP && M > SIMDSIZE*3UL ) {
918  reset( ~C );
919  }
920 
921  {
922  size_t i( 0UL );
923 
925  {
926  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
927  for( size_t j=0UL; j<N; ++j )
928  {
929  const size_t kbegin( ( IsLower<MT5>::value )
930  ?( ( IsUpper<MT4>::value )
931  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
932  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
933  :( IsUpper<MT4>::value ? i : 0UL ) );
934  const size_t kend( ( IsUpper<MT5>::value )
935  ?( ( IsLower<MT4>::value )
936  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
937  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
938  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
939 
940  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
941 
942  for( size_t k=kbegin; k<kend; ++k ) {
943  const SIMDType b1( set( B(k,j) ) );
944  xmm1 += A.load(i ,k) * b1;
945  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
946  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
947  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
948  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
949  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
950  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
951  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
952  }
953 
954  (~C).store( i , j, xmm1 );
955  (~C).store( i+SIMDSIZE , j, xmm2 );
956  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
957  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
958  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
959  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
960  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
961  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
962  }
963  }
964  }
965 
966  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
967  {
968  size_t j( 0UL );
969 
970  for( ; (j+2UL) <= N; j+=2UL )
971  {
972  const size_t kbegin( ( IsLower<MT5>::value )
973  ?( ( IsUpper<MT4>::value )
974  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
975  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
976  :( IsUpper<MT4>::value ? i : 0UL ) );
977  const size_t kend( ( IsUpper<MT5>::value )
978  ?( ( IsLower<MT4>::value )
979  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
980  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
981  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
982 
983  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
984 
985  for( size_t k=kbegin; k<kend; ++k ) {
986  const SIMDType a1( A.load(i ,k) );
987  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
988  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
989  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
990  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
991  const SIMDType b1( set( B(k,j ) ) );
992  const SIMDType b2( set( B(k,j+1UL) ) );
993  xmm1 += a1 * b1;
994  xmm2 += a2 * b1;
995  xmm3 += a3 * b1;
996  xmm4 += a4 * b1;
997  xmm5 += a5 * b1;
998  xmm6 += a1 * b2;
999  xmm7 += a2 * b2;
1000  xmm8 += a3 * b2;
1001  xmm9 += a4 * b2;
1002  xmm10 += a5 * b2;
1003  }
1004 
1005  (~C).store( i , j , xmm1 );
1006  (~C).store( i+SIMDSIZE , j , xmm2 );
1007  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1008  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1009  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1010  (~C).store( i , j+1UL, xmm6 );
1011  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1012  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1013  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1014  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1015  }
1016 
1017  if( j < N )
1018  {
1019  const size_t kbegin( ( IsLower<MT5>::value )
1020  ?( ( IsUpper<MT4>::value )
1021  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1022  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1023  :( IsUpper<MT4>::value ? i : 0UL ) );
1024  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
1025 
1026  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1027 
1028  for( size_t k=kbegin; k<kend; ++k ) {
1029  const SIMDType b1( set( B(k,j) ) );
1030  xmm1 += A.load(i ,k) * b1;
1031  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1032  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1033  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1034  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1035  }
1036 
1037  (~C).store( i , j, xmm1 );
1038  (~C).store( i+SIMDSIZE , j, xmm2 );
1039  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1040  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1041  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1042  }
1043  }
1044 
1045  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1046  {
1047  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
1048  size_t j( UPP ? i : 0UL );
1049 
1050  for( ; (j+2UL) <= jend; j+=2UL )
1051  {
1052  const size_t kbegin( ( IsLower<MT5>::value )
1053  ?( ( IsUpper<MT4>::value )
1054  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1055  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1056  :( IsUpper<MT4>::value ? i : 0UL ) );
1057  const size_t kend( ( IsUpper<MT5>::value )
1058  ?( ( IsLower<MT4>::value )
1059  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1060  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1061  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
1062 
1063  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1064 
1065  for( size_t k=kbegin; k<kend; ++k ) {
1066  const SIMDType a1( A.load(i ,k) );
1067  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1068  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1069  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1070  const SIMDType b1( set( B(k,j ) ) );
1071  const SIMDType b2( set( B(k,j+1UL) ) );
1072  xmm1 += a1 * b1;
1073  xmm2 += a2 * b1;
1074  xmm3 += a3 * b1;
1075  xmm4 += a4 * b1;
1076  xmm5 += a1 * b2;
1077  xmm6 += a2 * b2;
1078  xmm7 += a3 * b2;
1079  xmm8 += a4 * b2;
1080  }
1081 
1082  (~C).store( i , j , xmm1 );
1083  (~C).store( i+SIMDSIZE , j , xmm2 );
1084  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1085  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1086  (~C).store( i , j+1UL, xmm5 );
1087  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1088  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1089  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1090  }
1091 
1092  if( j < jend )
1093  {
1094  const size_t kbegin( ( IsLower<MT5>::value )
1095  ?( ( IsUpper<MT4>::value )
1096  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1097  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1098  :( IsUpper<MT4>::value ? i : 0UL ) );
1099  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
1100 
1101  SIMDType xmm1, xmm2, xmm3, xmm4;
1102 
1103  for( size_t k=kbegin; k<kend; ++k ) {
1104  const SIMDType b1( set( B(k,j) ) );
1105  xmm1 += A.load(i ,k) * b1;
1106  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1107  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1108  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1109  }
1110 
1111  (~C).store( i , j, xmm1 );
1112  (~C).store( i+SIMDSIZE , j, xmm2 );
1113  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1114  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1115  }
1116  }
1117 
1118  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1119  {
1120  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
1121  size_t j( UPP ? i : 0UL );
1122 
1123  for( ; (j+2UL) <= jend; j+=2UL )
1124  {
1125  const size_t kbegin( ( IsLower<MT5>::value )
1126  ?( ( IsUpper<MT4>::value )
1127  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1128  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1129  :( IsUpper<MT4>::value ? i : 0UL ) );
1130  const size_t kend( ( IsUpper<MT5>::value )
1131  ?( ( IsLower<MT4>::value )
1132  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1133  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1134  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
1135 
1136  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1137 
1138  for( size_t k=kbegin; k<kend; ++k ) {
1139  const SIMDType a1( A.load(i ,k) );
1140  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1141  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1142  const SIMDType b1( set( B(k,j ) ) );
1143  const SIMDType b2( set( B(k,j+1UL) ) );
1144  xmm1 += a1 * b1;
1145  xmm2 += a2 * b1;
1146  xmm3 += a3 * b1;
1147  xmm4 += a1 * b2;
1148  xmm5 += a2 * b2;
1149  xmm6 += a3 * b2;
1150  }
1151 
1152  (~C).store( i , j , xmm1 );
1153  (~C).store( i+SIMDSIZE , j , xmm2 );
1154  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1155  (~C).store( i , j+1UL, xmm4 );
1156  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1157  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1158  }
1159 
1160  if( j < jend )
1161  {
1162  const size_t kbegin( ( IsLower<MT5>::value )
1163  ?( ( IsUpper<MT4>::value )
1164  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1165  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1166  :( IsUpper<MT4>::value ? i : 0UL ) );
1167  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
1168 
1169  SIMDType xmm1, xmm2, xmm3;
1170 
1171  for( size_t k=kbegin; k<kend; ++k ) {
1172  const SIMDType b1( set( B(k,j) ) );
1173  xmm1 += A.load(i ,k) * b1;
1174  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1175  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1176  }
1177 
1178  (~C).store( i , j, xmm1 );
1179  (~C).store( i+SIMDSIZE , j, xmm2 );
1180  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1181  }
1182  }
1183 
1184  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1185  {
1186  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
1187  size_t j( UPP ? i : 0UL );
1188 
1189  for( ; (j+4UL) <= jend; j+=4UL )
1190  {
1191  const size_t kbegin( ( IsLower<MT5>::value )
1192  ?( ( IsUpper<MT4>::value )
1193  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1194  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1195  :( IsUpper<MT4>::value ? i : 0UL ) );
1196  const size_t kend( ( IsUpper<MT5>::value )
1197  ?( ( IsLower<MT4>::value )
1198  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
1199  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
1200  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
1201 
1202  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1203 
1204  for( size_t k=kbegin; k<kend; ++k ) {
1205  const SIMDType a1( A.load(i ,k) );
1206  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1207  const SIMDType b1( set( B(k,j ) ) );
1208  const SIMDType b2( set( B(k,j+1UL) ) );
1209  const SIMDType b3( set( B(k,j+2UL) ) );
1210  const SIMDType b4( set( B(k,j+3UL) ) );
1211  xmm1 += a1 * b1;
1212  xmm2 += a2 * b1;
1213  xmm3 += a1 * b2;
1214  xmm4 += a2 * b2;
1215  xmm5 += a1 * b3;
1216  xmm6 += a2 * b3;
1217  xmm7 += a1 * b4;
1218  xmm8 += a2 * b4;
1219  }
1220 
1221  (~C).store( i , j , xmm1 );
1222  (~C).store( i+SIMDSIZE, j , xmm2 );
1223  (~C).store( i , j+1UL, xmm3 );
1224  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1225  (~C).store( i , j+2UL, xmm5 );
1226  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
1227  (~C).store( i , j+3UL, xmm7 );
1228  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
1229  }
1230 
1231  for( ; (j+3UL) <= jend; j+=3UL )
1232  {
1233  const size_t kbegin( ( IsLower<MT5>::value )
1234  ?( ( IsUpper<MT4>::value )
1235  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1236  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1237  :( IsUpper<MT4>::value ? i : 0UL ) );
1238  const size_t kend( ( IsUpper<MT5>::value )
1239  ?( ( IsLower<MT4>::value )
1240  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
1241  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
1242  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
1243 
1244  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1245 
1246  for( size_t k=kbegin; k<kend; ++k ) {
1247  const SIMDType a1( A.load(i ,k) );
1248  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1249  const SIMDType b1( set( B(k,j ) ) );
1250  const SIMDType b2( set( B(k,j+1UL) ) );
1251  const SIMDType b3( set( B(k,j+2UL) ) );
1252  xmm1 += a1 * b1;
1253  xmm2 += a2 * b1;
1254  xmm3 += a1 * b2;
1255  xmm4 += a2 * b2;
1256  xmm5 += a1 * b3;
1257  xmm6 += a2 * b3;
1258  }
1259 
1260  (~C).store( i , j , xmm1 );
1261  (~C).store( i+SIMDSIZE, j , xmm2 );
1262  (~C).store( i , j+1UL, xmm3 );
1263  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
1264  (~C).store( i , j+2UL, xmm5 );
1265  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
1266  }
1267 
1268  for( ; (j+2UL) <= jend; j+=2UL )
1269  {
1270  const size_t kbegin( ( IsLower<MT5>::value )
1271  ?( ( IsUpper<MT4>::value )
1272  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1273  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1274  :( IsUpper<MT4>::value ? i : 0UL ) );
1275  const size_t kend( ( IsUpper<MT5>::value )
1276  ?( ( IsLower<MT4>::value )
1277  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1278  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1279  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
1280 
1281  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1282  size_t k( kbegin );
1283 
1284  for( ; (k+2UL) <= kend; k+=2UL ) {
1285  const SIMDType a1( A.load(i ,k ) );
1286  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
1287  const SIMDType a3( A.load(i ,k+1UL) );
1288  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
1289  const SIMDType b1( set( B(k ,j ) ) );
1290  const SIMDType b2( set( B(k ,j+1UL) ) );
1291  const SIMDType b3( set( B(k+1UL,j ) ) );
1292  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
1293  xmm1 += a1 * b1;
1294  xmm2 += a2 * b1;
1295  xmm3 += a1 * b2;
1296  xmm4 += a2 * b2;
1297  xmm5 += a3 * b3;
1298  xmm6 += a4 * b3;
1299  xmm7 += a3 * b4;
1300  xmm8 += a4 * b4;
1301  }
1302 
1303  for( ; k<kend; ++k ) {
1304  const SIMDType a1( A.load(i ,k) );
1305  const SIMDType a2( A.load(i+SIMDSIZE,k) );
1306  const SIMDType b1( set( B(k,j ) ) );
1307  const SIMDType b2( set( B(k,j+1UL) ) );
1308  xmm1 += a1 * b1;
1309  xmm2 += a2 * b1;
1310  xmm3 += a1 * b2;
1311  xmm4 += a2 * b2;
1312  }
1313 
1314  (~C).store( i , j , xmm1+xmm5 );
1315  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
1316  (~C).store( i , j+1UL, xmm3+xmm7 );
1317  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
1318  }
1319 
1320  if( j < jend )
1321  {
1322  const size_t kbegin( ( IsLower<MT5>::value )
1323  ?( ( IsUpper<MT4>::value )
1324  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1325  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1326  :( IsUpper<MT4>::value ? i : 0UL ) );
1327  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
1328 
1329  SIMDType xmm1, xmm2, xmm3, xmm4;
1330  size_t k( kbegin );
1331 
1332  for( ; (k+2UL) <= kend; k+=2UL ) {
1333  const SIMDType b1( set( B(k ,j) ) );
1334  const SIMDType b2( set( B(k+1UL,j) ) );
1335  xmm1 += A.load(i ,k ) * b1;
1336  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
1337  xmm3 += A.load(i ,k+1UL) * b2;
1338  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
1339  }
1340 
1341  for( ; k<kend; ++k ) {
1342  const SIMDType b1( set( B(k,j) ) );
1343  xmm1 += A.load(i ,k) * b1;
1344  xmm2 += A.load(i+SIMDSIZE,k) * b1;
1345  }
1346 
1347  (~C).store( i , j, xmm1+xmm3 );
1348  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
1349  }
1350  }
1351 
1352  for( ; i<ipos; i+=SIMDSIZE )
1353  {
1354  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
1355  size_t j( UPP ? i : 0UL );
1356 
1357  for( ; (j+4UL) <= jend; j+=4UL )
1358  {
1359  const size_t kbegin( ( IsLower<MT5>::value )
1360  ?( ( IsUpper<MT4>::value )
1361  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1362  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1363  :( IsUpper<MT4>::value ? i : 0UL ) );
1364  const size_t kend( ( IsUpper<MT5>::value )
1365  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
1366  :( K ) );
1367 
1368  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1369  size_t k( kbegin );
1370 
1371  for( ; (k+2UL) <= kend; k+=2UL ) {
1372  const SIMDType a1( A.load(i,k ) );
1373  const SIMDType a2( A.load(i,k+1UL) );
1374  xmm1 += a1 * set( B(k ,j ) );
1375  xmm2 += a1 * set( B(k ,j+1UL) );
1376  xmm3 += a1 * set( B(k ,j+2UL) );
1377  xmm4 += a1 * set( B(k ,j+3UL) );
1378  xmm5 += a2 * set( B(k+1UL,j ) );
1379  xmm6 += a2 * set( B(k+1UL,j+1UL) );
1380  xmm7 += a2 * set( B(k+1UL,j+2UL) );
1381  xmm8 += a2 * set( B(k+1UL,j+3UL) );
1382  }
1383 
1384  for( ; k<kend; ++k ) {
1385  const SIMDType a1( A.load(i,k) );
1386  xmm1 += a1 * set( B(k,j ) );
1387  xmm2 += a1 * set( B(k,j+1UL) );
1388  xmm3 += a1 * set( B(k,j+2UL) );
1389  xmm4 += a1 * set( B(k,j+3UL) );
1390  }
1391 
1392  (~C).store( i, j , xmm1+xmm5 );
1393  (~C).store( i, j+1UL, xmm2+xmm6 );
1394  (~C).store( i, j+2UL, xmm3+xmm7 );
1395  (~C).store( i, j+3UL, xmm4+xmm8 );
1396  }
1397 
1398  for( ; (j+3UL) <= jend; j+=3UL )
1399  {
1400  const size_t kbegin( ( IsLower<MT5>::value )
1401  ?( ( IsUpper<MT4>::value )
1402  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1403  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1404  :( IsUpper<MT4>::value ? i : 0UL ) );
1405  const size_t kend( ( IsUpper<MT5>::value )
1406  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
1407  :( K ) );
1408 
1409  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1410  size_t k( kbegin );
1411 
1412  for( ; (k+2UL) <= kend; k+=2UL ) {
1413  const SIMDType a1( A.load(i,k ) );
1414  const SIMDType a2( A.load(i,k+1UL) );
1415  xmm1 += a1 * set( B(k ,j ) );
1416  xmm2 += a1 * set( B(k ,j+1UL) );
1417  xmm3 += a1 * set( B(k ,j+2UL) );
1418  xmm4 += a2 * set( B(k+1UL,j ) );
1419  xmm5 += a2 * set( B(k+1UL,j+1UL) );
1420  xmm6 += a2 * set( B(k+1UL,j+2UL) );
1421  }
1422 
1423  for( ; k<kend; ++k ) {
1424  const SIMDType a1( A.load(i,k) );
1425  xmm1 += a1 * set( B(k,j ) );
1426  xmm2 += a1 * set( B(k,j+1UL) );
1427  xmm3 += a1 * set( B(k,j+2UL) );
1428  }
1429 
1430  (~C).store( i, j , xmm1+xmm4 );
1431  (~C).store( i, j+1UL, xmm2+xmm5 );
1432  (~C).store( i, j+2UL, xmm3+xmm6 );
1433  }
1434 
1435  for( ; (j+2UL) <= jend; j+=2UL )
1436  {
1437  const size_t kbegin( ( IsLower<MT5>::value )
1438  ?( ( IsUpper<MT4>::value )
1439  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1440  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1441  :( IsUpper<MT4>::value ? i : 0UL ) );
1442  const size_t kend( ( IsUpper<MT5>::value )
1443  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1444  :( K ) );
1445 
1446  SIMDType xmm1, xmm2, xmm3, xmm4;
1447  size_t k( kbegin );
1448 
1449  for( ; (k+2UL) <= kend; k+=2UL ) {
1450  const SIMDType a1( A.load(i,k ) );
1451  const SIMDType a2( A.load(i,k+1UL) );
1452  xmm1 += a1 * set( B(k ,j ) );
1453  xmm2 += a1 * set( B(k ,j+1UL) );
1454  xmm3 += a2 * set( B(k+1UL,j ) );
1455  xmm4 += a2 * set( B(k+1UL,j+1UL) );
1456  }
1457 
1458  for( ; k<kend; ++k ) {
1459  const SIMDType a1( A.load(i,k) );
1460  xmm1 += a1 * set( B(k,j ) );
1461  xmm2 += a1 * set( B(k,j+1UL) );
1462  }
1463 
1464  (~C).store( i, j , xmm1+xmm3 );
1465  (~C).store( i, j+1UL, xmm2+xmm4 );
1466  }
1467 
1468  if( j < jend )
1469  {
1470  const size_t kbegin( ( IsLower<MT5>::value )
1471  ?( ( IsUpper<MT4>::value )
1472  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1473  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1474  :( IsUpper<MT4>::value ? i : 0UL ) );
1475 
1476  SIMDType xmm1, xmm2;
1477  size_t k( kbegin );
1478 
1479  for( ; (k+2UL) <= K; k+=2UL ) {
1480  xmm1 += A.load(i,k ) * set( B(k ,j) );
1481  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
1482  }
1483 
1484  for( ; k<K; ++k ) {
1485  xmm1 += A.load(i,k) * set( B(k,j) );
1486  }
1487 
1488  (~C).store( i, j, xmm1+xmm2 );
1489  }
1490  }
1491 
1492  for( ; remainder && i<M; ++i )
1493  {
1494  size_t j( LOW && UPP ? i : 0UL );
1495 
1496  for( ; (j+2UL) <= N; j+=2UL )
1497  {
1498  const size_t kbegin( ( IsLower<MT5>::value )
1499  ?( ( IsUpper<MT4>::value )
1500  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1501  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1502  :( IsUpper<MT4>::value ? i : 0UL ) );
1503  const size_t kend( ( IsUpper<MT5>::value )
1504  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1505  :( K ) );
1506 
1507  ElementType value1{};
1508  ElementType value2{};
1509 
1510  for( size_t k=kbegin; k<kend; ++k ) {
1511  value1 += A(i,k) * B(k,j );
1512  value2 += A(i,k) * B(k,j+1UL);
1513  }
1514 
1515  (~C)(i,j ) = value1;
1516  (~C)(i,j+1UL) = value2;
1517  }
1518 
1519  if( j < N )
1520  {
1521  const size_t kbegin( ( IsLower<MT5>::value )
1522  ?( ( IsUpper<MT4>::value )
1523  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1524  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1525  :( IsUpper<MT4>::value ? i : 0UL ) );
1526 
1527  ElementType value{};
1528 
1529  for( size_t k=kbegin; k<K; ++k ) {
1530  value += A(i,k) * B(k,j);
1531  }
1532 
1533  (~C)(i,j) = value;
1534  }
1535  }
1536  }
1537 
1538  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
1539  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1540  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1541  for( size_t i=0UL; i<iend; ++i ) {
1542  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
1543  }
1544  }
1545  }
1546  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
1547  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1548  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1549  for( size_t i=0UL; i<iend; ++i ) {
1550  reset( (~C)(i,j) );
1551  }
1552  }
1553  }
1554  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
1555  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1556  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1557  for( size_t j=0UL; j<jend; ++j ) {
1558  reset( (~C)(i,j) );
1559  }
1560  }
1561  }
1562  }
1564  //**********************************************************************************************
1565 
1566  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel, default variant (see the section banner):
// performs no arithmetic of its own and passes C, A and B unchanged to
// selectDefaultAssignKernel().
1580  template< typename MT3 // Type of the left-hand side target matrix
1581  , typename MT4 // Type of the left-hand side matrix operand
1582  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): the embedded numbering jumps from 1582 to 1584, so the line
// that carries the return type / overload-selection condition is absent from
// this listing -- restore it from the original header before compiling.
1584  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1585  {
1586  selectDefaultAssignKernel( C, A, B );
1587  }
1589  //**********************************************************************************************
1590 
1591  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Large-matrix assignment kernel, vectorized variant (see the section banner):
// picks one of the smmm/hmmm/lmmm/ummm/mmm routines based on the class-level
// SYM, HERM, LOW and UPP flags. The flags are tested in exactly that order, so
// SYM wins over HERM, HERM over LOW, and LOW over UPP when several are set.
1606  template< typename MT3 // Type of the left-hand side target matrix
1607  , typename MT4 // Type of the left-hand side matrix operand
1608  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): line 1609 (return type / overload-selection condition) is
// missing from this listing -- confirm against the original header.
1610  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1611  {
// smmm/hmmm receive a single scalar ElementType(1); the remaining routines
// receive ElementType(1) and ElementType(0). Presumably these are the
// product scaling factor and the factor applied to the previous content of
// C -- TODO confirm against blaze/math/dense/MMM.h.
1612  if( SYM )
1613  smmm( C, A, B, ElementType(1) );
1614  else if( HERM )
1615  hmmm( C, A, B, ElementType(1) );
1616  else if( LOW )
1617  lmmm( C, A, B, ElementType(1), ElementType(0) );
1618  else if( UPP )
1619  ummm( C, A, B, ElementType(1), ElementType(0) );
1620  else
1621  mmm( C, A, B, ElementType(1), ElementType(0) );
1622  }
1624  //**********************************************************************************************
1625 
1626  //**BLAS-based assignment to dense matrices (default)*******************************************
// BLAS assignment kernel, default variant (see the section banner): does not
// call into BLAS at all and simply forwards C, A and B to
// selectLargeAssignKernel().
1640  template< typename MT3 // Type of the left-hand side target matrix
1641  , typename MT4 // Type of the left-hand side matrix operand
1642  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): line 1643 (return type / overload-selection condition) is
// missing from this listing -- confirm against the original header.
1644  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1645  {
1646  selectLargeAssignKernel( C, A, B );
1647  }
1649  //**********************************************************************************************
1650 
1651  //**BLAS-based assignment to dense matrices*****************************************************
// BLAS-backed assignment kernel. Only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to true.
1652 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1653 
1666  template< typename MT3 // Type of the left-hand side target matrix
1667  , typename MT4 // Type of the left-hand side matrix operand
1668  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): line 1669 (return type / overload-selection condition) is
// missing from this listing -- confirm against the original header.
1670  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1671  {
1672  using ET = ElementType_<MT3>;
1673 
// Triangular left operand: copy B into C, then call trmm on C with A as
// the left-side (CblasLeft) factor. The lower/upper flag follows
// IsLower<MT4>. This branch is checked first, so it also handles the
// case where both operands are triangular.
1674  if( IsTriangular<MT4>::value ) {
1675  assign( C, B );
1676  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1677  }
// Triangular right operand only: copy A into C, then call trmm on C with
// B as the right-side (CblasRight) factor.
1678  else if( IsTriangular<MT5>::value ) {
1679  assign( C, A );
1680  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1681  }
// Neither operand is triangular: general gemm call with the scalars
// ET(1) and ET(0).
1682  else {
1683  gemm( C, A, B, ET(1), ET(0) );
1684  }
1685  }
1687 #endif
1688  //**********************************************************************************************
1689 
1690  //**Assignment to sparse matrices***************************************************************
// Assignment of the multiplication expression to a sparse target: the
// expression is first materialized into a temporary, which is then assigned.
1703  template< typename MT // Type of the target sparse matrix
1704  , bool SO > // Storage order of the target sparse matrix
// NOTE(review): lines 1705, 1708, 1710 and 1712-1717 are missing from this
// listing (the numbering skips them). The return type and the definition of
// TmpType are therefore not visible here -- confirm against the original.
1706  assign( SparseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1707  {
1709 
1711 
1718 
// The target must already have the dimensions of the expression.
1719  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1720  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1721 
1722  const ForwardFunctor fwd;
1723 
// Build the temporary from serial( rhs ), then assign the forwarded
// temporary to the target.
1724  const TmpType tmp( serial( rhs ) );
1725  assign( ~lhs, fwd( tmp ) );
1726  }
1728  //**********************************************************************************************
1729 
1730  //**Restructuring assignment to row-major matrices**********************************************
// Restructuring assignment to a row-major target (Matrix<MT,false>): the
// product is re-expressed with trans() applied to one or both operands and the
// forwarded result is assigned to the target.
1745  template< typename MT > // Type of the target matrix
// NOTE(review): lines 1746, 1749 and 1751 are missing from this listing; the
// return type / enabling condition is not visible here.
1747  assign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
1748  {
1750 
1752 
1753  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1754  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1755 
1756  const ForwardFunctor fwd;
1757 
// NOTE(review): line 1758, the `if( ... )` head that guards the first
// branch, is missing from this listing. Judging from the two remaining
// branches it presumably tests both operand types for symmetry -- verify
// against the original header.
1759  assign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
// Only the left operand type is symmetric: transpose the left operand only.
1760  else if( IsSymmetric<MT1>::value )
1761  assign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Otherwise transpose the right operand only.
1762  else
1763  assign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
1764  }
1766  //**********************************************************************************************
1767 
1768  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the multiplication expression to a dense target:
// evaluates both operands, checks their dimensions and hands the actual
// computation to selectAddAssignKernel().
1781  template< typename MT // Type of the target dense matrix
1782  , bool SO > // Storage order of the target dense matrix
// NOTE(review): lines 1783 and 1786 are missing from this listing; the return
// type / enabling condition is not visible here.
1784  addAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
1785  {
1787 
1788  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1789  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1790 
// Nothing to add when the target is empty or the inner dimension is zero.
1791  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1792  return;
1793  }
1794 
1795  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1796  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1797 
// The evaluated operands must keep the dimensions of the originals and
// match the target.
1798  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1799  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1800  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1801  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1802  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1803  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1804 
1805  TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1806  }
1808  //**********************************************************************************************
1809 
1810  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment C += A * B.
// The small-matrix kernel is chosen when any of the following holds:
//   - MT4 is a diagonal matrix type,
//   - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//   - the number of elements of C is below TDMATTDMATMULT_THRESHOLD.
// Otherwise the BLAS add-assign kernel is selected.
1821  template< typename MT3 // Type of the left-hand side target matrix
1822  , typename MT4 // Type of the left-hand side matrix operand
1823  , typename MT5 > // Type of the right-hand side matrix operand
1824  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1825  {
1826  if( ( IsDiagonal<MT4>::value ) ||
1827  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
1828  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1829  selectSmallAddAssignKernel( C, A, B );
1830  else
1831  selectBlasAddAssignKernel( C, A, B );
1832  }
1834  //**********************************************************************************************
1835 
1836  //**Default addition assignment to dense matrices (general/general)*****************************
// Default (scalar) addition-assignment kernel for the case that neither
// operand type is diagonal. Computes C(i,j) += A(i,k) * B(k,j) with j as the
// outer loop, k as the middle loop and i as the innermost loop. The k and i
// ranges are narrowed at compile time from the lower/upper traits of the
// operand types and from the class-level LOW/UPP flags.
1850  template< typename MT3 // Type of the left-hand side target matrix
1851  , typename MT4 // Type of the left-hand side matrix operand
1852  , typename MT5 > // Type of the right-hand side matrix operand
1853  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
1854  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1855  {
1856  const size_t M( A.rows() );
1857  const size_t N( B.columns() );
1858  const size_t K( A.columns() );
1859 
// A LOW or UPP result requires a square result (M == N).
1860  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1861 
1862  for( size_t j=0UL; j<N; ++j )
1863  {
// k range for column j of B: starts at j (j+1 if strictly lower) when MT5
// is lower, ends at j+1 (j if strictly upper) when MT5 is upper; otherwise
// the full range [0,K) is used.
1864  const size_t kbegin( ( IsLower<MT5>::value )
1865  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1866  :( 0UL ) );
1867  const size_t kend( ( IsUpper<MT5>::value )
1868  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1869  :( K ) );
1870  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1871 
1872  for( size_t k=kbegin; k<kend; ++k )
1873  {
// NOTE(review): lines 1875 and 1880 (the inner condition of each nested
// conditional) are missing from this listing. Judging from the k+1UL / k
// alternatives they presumably test the strictly-lower / strictly-upper
// trait of MT4 -- verify against the original header.
1874  const size_t ibegin( ( IsLower<MT4>::value )
1876  ?( LOW ? max(j,k+1UL) : k+1UL )
1877  :( LOW ? max(j,k) : k ) )
1878  :( LOW ? j : 0UL ) );
1879  const size_t iend( ( IsUpper<MT4>::value )
1881  ?( UPP ? min(j+1UL,k) : k )
1882  :( UPP ? min(j,k)+1UL : k+1UL ) )
1883  :( UPP ? j+1UL : M ) );
1884 
// With LOW/UPP clipping the row range can be empty; skip this k then.
1885  if( ( LOW || UPP ) && ibegin >= iend ) continue;
1886  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1887 
// inum & size_t(-2) clears the lowest bit, so ipos marks the end of the
// part of [ibegin,iend) that can be processed two rows at a time.
1888  const size_t inum( iend - ibegin );
1889  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1890 
1891  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1892  C(i ,j) += A(i ,k) * B(k,j);
1893  C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1894  }
// Leftover single row when the range length is odd.
1895  if( ipos < iend ) {
1896  C(ipos,j) += A(ipos,k) * B(k,j);
1897  }
1898  }
1899  }
1900  }
1902  //**********************************************************************************************
1903 
1904  //**Default addition assignment to dense matrices (general/diagonal)****************************
// Default (scalar) addition-assignment kernel for a non-diagonal left operand
// and a diagonal right operand. Only the diagonal element B(j,j) is read, so
// each column j reduces to C(i,j) += A(i,j) * B(j,j).
1918  template< typename MT3 // Type of the left-hand side target matrix
1919  , typename MT4 // Type of the left-hand side matrix operand
1920  , typename MT5 > // Type of the right-hand side matrix operand
1921  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
1922  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1923  {
// NOTE(review): line 1924 is missing from this listing.
1925 
1926  const size_t M( A.rows() );
1927  const size_t N( B.columns() );
1928 
1929  for( size_t j=0UL; j<N; ++j )
1930  {
// Row range for column j of A: starts at j (j+1 if strictly lower) when
// MT4 is lower, ends at j+1 (j if strictly upper) when MT4 is upper;
// otherwise the full range [0,M) is used.
1931  const size_t ibegin( ( IsLower<MT4>::value )
1932  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
1933  :( 0UL ) );
1934  const size_t iend( ( IsUpper<MT4>::value )
1935  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
1936  :( M ) );
1937  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1938 
// ipos is the end of the even-length prefix of [ibegin,iend); the loop
// below handles two rows per iteration.
1939  const size_t inum( iend - ibegin );
1940  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1941 
1942  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1943  C(i ,j) += A(i ,j) * B(j,j);
1944  C(i+1UL,j) += A(i+1UL,j) * B(j,j);
1945  }
// Leftover single row when the range length is odd.
1946  if( ipos < iend ) {
1947  C(ipos,j) += A(ipos,j) * B(j,j);
1948  }
1949  }
1950  }
1952  //**********************************************************************************************
1953 
1954  //**Default addition assignment to dense matrices (diagonal/general)****************************
// Default (scalar) addition-assignment kernel for a diagonal left operand and
// a non-diagonal right operand. Only the diagonal element A(i,i) is read, so
// each entry reduces to C(i,j) += A(i,i) * B(i,j).
1968  template< typename MT3 // Type of the left-hand side target matrix
1969  , typename MT4 // Type of the left-hand side matrix operand
1970  , typename MT5 > // Type of the right-hand side matrix operand
1971  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
1972  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1973  {
// NOTE(review): line 1974 is missing from this listing.
1975 
1976  const size_t M( A.rows() );
1977  const size_t N( B.columns() );
1978 
1979  for( size_t j=0UL; j<N; ++j )
1980  {
// Row range for column j, derived from the traits of the right operand
// type MT5: starts at j (j+1 if strictly lower) when MT5 is lower, ends at
// j+1 (j if strictly upper) when MT5 is upper; otherwise [0,M) is used,
// where M is the row count of A.
1981  const size_t ibegin( ( IsLower<MT5>::value )
1982  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1983  :( 0UL ) );
1984  const size_t iend( ( IsUpper<MT5>::value )
1985  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1986  :( M ) );
1987  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
1988 
// ipos is the end of the even-length prefix of [ibegin,iend); the loop
// below handles two rows per iteration.
1989  const size_t inum( iend - ibegin );
1990  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
1991 
1992  for( size_t i=ibegin; i<ipos; i+=2UL ) {
1993  C(i ,j) += A(i ,i ) * B(i ,j);
1994  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
1995  }
// Leftover single row when the range length is odd.
1996  if( ipos < iend ) {
1997  C(ipos,j) += A(ipos,ipos) * B(ipos,j);
1998  }
1999  }
2000  }
2002  //**********************************************************************************************
2003 
2004  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default (scalar) addition-assignment kernel for two diagonal operands: only
// the diagonal of C is touched, with C(i,i) += A(i,i) * B(i,i) for every row
// index of A.
2018  template< typename MT3 // Type of the left-hand side target matrix
2019  , typename MT4 // Type of the left-hand side matrix operand
2020  , typename MT5 > // Type of the right-hand side matrix operand
2021  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2022  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2023  {
// NOTE(review): line 2024 is missing from this listing.
2025 
2026  for( size_t i=0UL; i<A.rows(); ++i ) {
2027  C(i,i) += A(i,i) * B(i,i);
2028  }
2029  }
2031  //**********************************************************************************************
2032 
2033  //**Default addition assignment to dense matrices (small matrices)******************************
// Small-matrix addition-assignment kernel, default variant (see the section
// banner): forwards C, A and B unchanged to selectDefaultAddAssignKernel().
2047  template< typename MT3 // Type of the left-hand side target matrix
2048  , typename MT4 // Type of the left-hand side matrix operand
2049  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): line 2050 (return type / overload-selection condition) is
// missing from this listing -- confirm against the original header.
2051  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2052  {
2053  selectDefaultAddAssignKernel( C, A, B );
2054  }
2056  //**********************************************************************************************
2057 
2058  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Small-matrix addition-assignment kernel for a row-major dense target
// (DenseMatrix<MT3,false>). Every visible branch converts exactly one operand
// into its OppositeType_ through serial(), and then add-assigns the forwarded
// product of the converted operand and the untouched one to the target.
2073  template< typename MT3 // Type of the left-hand side target matrix
2074  , typename MT4 // Type of the left-hand side matrix operand
2075  , typename MT5 > // Type of the right-hand side matrix operand
// NOTE(review): lines 2076 and 2079-2082 are missing from this listing; the
// return type / overload-selection condition is not visible here.
2077  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2078  {
2083 
2084  const ForwardFunctor fwd;
2085 
// NOTE(review): line 2086, the `if( ... ) {` head of the first branch, is
// missing from this listing; its condition cannot be read here.
// First branch: convert B, keep A.
2087  const OppositeType_<MT5> tmp( serial( B ) );
2088  addAssign( ~C, fwd( A * tmp ) );
2089  }
// NOTE(review): line 2090, the `else if( ... ) {` head of the second branch,
// is missing from this listing as well.
// Second branch: convert A, keep B.
2091  const OppositeType_<MT4> tmp( serial( A ) );
2092  addAssign( ~C, fwd( tmp * B ) );
2093  }
// Size-based choice: convert B when it has no more elements than A ...
2094  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2095  const OppositeType_<MT5> tmp( serial( B ) );
2096  addAssign( ~C, fwd( A * tmp ) );
2097  }
// ... otherwise convert A.
2098  else {
2099  const OppositeType_<MT4> tmp( serial( A ) );
2100  addAssign( ~C, fwd( tmp * B ) );
2101  }
2102  }
2104  //**********************************************************************************************
2105 
2106  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
2121  template< typename MT3 // Type of the left-hand side target matrix
2122  , typename MT4 // Type of the left-hand side matrix operand
2123  , typename MT5 > // Type of the right-hand side matrix operand
2125  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2126  {
2127  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
2128 
2129  const size_t M( A.rows() );
2130  const size_t N( B.columns() );
2131  const size_t K( A.columns() );
2132 
2133  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2134 
2135  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
2136  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
2137 
2138  size_t i( 0UL );
2139 
2141  {
2142  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
2143  for( size_t j=0UL; j<N; ++j )
2144  {
2145  const size_t kbegin( ( IsLower<MT5>::value )
2146  ?( ( IsUpper<MT4>::value )
2147  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2148  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2149  :( IsUpper<MT4>::value ? i : 0UL ) );
2150  const size_t kend( ( IsUpper<MT5>::value )
2151  ?( ( IsLower<MT4>::value )
2152  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2153  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
2154  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
2155 
2156  SIMDType xmm1( (~C).load(i ,j) );
2157  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2158  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2159  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2160  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
2161  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
2162  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
2163  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
2164 
2165  for( size_t k=kbegin; k<kend; ++k ) {
2166  const SIMDType b1( set( B(k,j) ) );
2167  xmm1 += A.load(i ,k) * b1;
2168  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2169  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2170  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2171  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2172  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
2173  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
2174  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
2175  }
2176 
2177  (~C).store( i , j, xmm1 );
2178  (~C).store( i+SIMDSIZE , j, xmm2 );
2179  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2180  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2181  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
2182  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
2183  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
2184  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
2185  }
2186  }
2187  }
2188 
2189  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
2190  {
2191  size_t j( 0UL );
2192 
2193  for( ; (j+2UL) <= N; j+=2UL )
2194  {
2195  const size_t kbegin( ( IsLower<MT5>::value )
2196  ?( ( IsUpper<MT4>::value )
2197  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2198  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2199  :( IsUpper<MT4>::value ? i : 0UL ) );
2200  const size_t kend( ( IsUpper<MT5>::value )
2201  ?( ( IsLower<MT4>::value )
2202  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2203  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2204  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
2205 
2206  SIMDType xmm1 ( (~C).load(i ,j ) );
2207  SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
2208  SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
2209  SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
2210  SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
2211  SIMDType xmm6 ( (~C).load(i ,j+1UL) );
2212  SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
2213  SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2214  SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2215  SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
2216 
2217  for( size_t k=kbegin; k<kend; ++k ) {
2218  const SIMDType a1( A.load(i ,k) );
2219  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2220  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2221  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2222  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
2223  const SIMDType b1( set( B(k,j ) ) );
2224  const SIMDType b2( set( B(k,j+1UL) ) );
2225  xmm1 += a1 * b1;
2226  xmm2 += a2 * b1;
2227  xmm3 += a3 * b1;
2228  xmm4 += a4 * b1;
2229  xmm5 += a5 * b1;
2230  xmm6 += a1 * b2;
2231  xmm7 += a2 * b2;
2232  xmm8 += a3 * b2;
2233  xmm9 += a4 * b2;
2234  xmm10 += a5 * b2;
2235  }
2236 
2237  (~C).store( i , j , xmm1 );
2238  (~C).store( i+SIMDSIZE , j , xmm2 );
2239  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2240  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2241  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
2242  (~C).store( i , j+1UL, xmm6 );
2243  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
2244  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
2245  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
2246  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
2247  }
2248 
2249  if( j < N )
2250  {
2251  const size_t kbegin( ( IsLower<MT5>::value )
2252  ?( ( IsUpper<MT4>::value )
2253  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2254  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2255  :( IsUpper<MT4>::value ? i : 0UL ) );
2256  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
2257 
2258  SIMDType xmm1( (~C).load(i ,j) );
2259  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2260  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2261  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2262  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
2263 
2264  for( size_t k=kbegin; k<kend; ++k ) {
2265  const SIMDType b1( set( B(k,j) ) );
2266  xmm1 += A.load(i ,k) * b1;
2267  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2268  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2269  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2270  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
2271  }
2272 
2273  (~C).store( i , j, xmm1 );
2274  (~C).store( i+SIMDSIZE , j, xmm2 );
2275  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2276  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2277  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
2278  }
2279  }
2280 
2281  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2282  {
2283  size_t j( 0UL );
2284 
2285  for( ; (j+2UL) <= N; j+=2UL )
2286  {
2287  const size_t kbegin( ( IsLower<MT5>::value )
2288  ?( ( IsUpper<MT4>::value )
2289  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2290  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2291  :( IsUpper<MT4>::value ? i : 0UL ) );
2292  const size_t kend( ( IsUpper<MT5>::value )
2293  ?( ( IsLower<MT4>::value )
2294  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2295  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2296  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
2297 
2298  SIMDType xmm1( (~C).load(i ,j ) );
2299  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2300  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2301  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
2302  SIMDType xmm5( (~C).load(i ,j+1UL) );
2303  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
2304  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2305  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2306 
2307  for( size_t k=kbegin; k<kend; ++k ) {
2308  const SIMDType a1( A.load(i ,k) );
2309  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2310  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2311  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2312  const SIMDType b1( set( B(k,j ) ) );
2313  const SIMDType b2( set( B(k,j+1UL) ) );
2314  xmm1 += a1 * b1;
2315  xmm2 += a2 * b1;
2316  xmm3 += a3 * b1;
2317  xmm4 += a4 * b1;
2318  xmm5 += a1 * b2;
2319  xmm6 += a2 * b2;
2320  xmm7 += a3 * b2;
2321  xmm8 += a4 * b2;
2322  }
2323 
2324  (~C).store( i , j , xmm1 );
2325  (~C).store( i+SIMDSIZE , j , xmm2 );
2326  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2327  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2328  (~C).store( i , j+1UL, xmm5 );
2329  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
2330  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2331  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
2332  }
2333 
2334  if( j < N )
2335  {
2336  const size_t kbegin( ( IsLower<MT5>::value )
2337  ?( ( IsUpper<MT4>::value )
2338  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2339  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2340  :( IsUpper<MT4>::value ? i : 0UL ) );
2341  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
2342 
2343  SIMDType xmm1( (~C).load(i ,j) );
2344  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2345  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2346  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2347 
2348  for( size_t k=kbegin; k<kend; ++k ) {
2349  const SIMDType b1( set( B(k,j) ) );
2350  xmm1 += A.load(i ,k) * b1;
2351  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2352  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2353  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
2354  }
2355 
2356  (~C).store( i , j, xmm1 );
2357  (~C).store( i+SIMDSIZE , j, xmm2 );
2358  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2359  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
2360  }
2361  }
2362 
2363  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2364  {
2365  size_t j( 0UL );
2366 
2367  for( ; (j+2UL) <= N; j+=2UL )
2368  {
2369  const size_t kbegin( ( IsLower<MT5>::value )
2370  ?( ( IsUpper<MT4>::value )
2371  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2372  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2373  :( IsUpper<MT4>::value ? i : 0UL ) );
2374  const size_t kend( ( IsUpper<MT5>::value )
2375  ?( ( IsLower<MT4>::value )
2376  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2377  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2378  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
2379 
2380  SIMDType xmm1( (~C).load(i ,j ) );
2381  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2382  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2383  SIMDType xmm4( (~C).load(i ,j+1UL) );
2384  SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
2385  SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2386 
2387  for( size_t k=kbegin; k<kend; ++k ) {
2388  const SIMDType a1( A.load(i ,k) );
2389  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2390  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2391  const SIMDType b1( set( B(k,j ) ) );
2392  const SIMDType b2( set( B(k,j+1UL) ) );
2393  xmm1 += a1 * b1;
2394  xmm2 += a2 * b1;
2395  xmm3 += a3 * b1;
2396  xmm4 += a1 * b2;
2397  xmm5 += a2 * b2;
2398  xmm6 += a3 * b2;
2399  }
2400 
2401  (~C).store( i , j , xmm1 );
2402  (~C).store( i+SIMDSIZE , j , xmm2 );
2403  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2404  (~C).store( i , j+1UL, xmm4 );
2405  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
2406  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
2407  }
2408 
2409  if( j < N )
2410  {
2411  const size_t kbegin( ( IsLower<MT5>::value )
2412  ?( ( IsUpper<MT4>::value )
2413  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2414  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2415  :( IsUpper<MT4>::value ? i : 0UL ) );
2416  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
2417 
2418  SIMDType xmm1( (~C).load(i ,j) );
2419  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2420  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2421 
2422  for( size_t k=kbegin; k<kend; ++k ) {
2423  const SIMDType b1( set( B(k,j) ) );
2424  xmm1 += A.load(i ,k) * b1;
2425  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2426  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2427  }
2428 
2429  (~C).store( i , j, xmm1 );
2430  (~C).store( i+SIMDSIZE , j, xmm2 );
2431  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2432  }
2433  }
2434 
2435  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2436  {
2437  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
2438  size_t j( UPP ? i : 0UL );
2439 
2440  for( ; (j+4UL) <= jend; j+=4UL )
2441  {
2442  const size_t kbegin( ( IsLower<MT5>::value )
2443  ?( ( IsUpper<MT4>::value )
2444  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2445  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2446  :( IsUpper<MT4>::value ? i : 0UL ) );
2447  const size_t kend( ( IsUpper<MT5>::value )
2448  ?( ( IsLower<MT4>::value )
2449  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
2450  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
2451  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2452 
2453  SIMDType xmm1( (~C).load(i ,j ) );
2454  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2455  SIMDType xmm3( (~C).load(i ,j+1UL) );
2456  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2457  SIMDType xmm5( (~C).load(i ,j+2UL) );
2458  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
2459  SIMDType xmm7( (~C).load(i ,j+3UL) );
2460  SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
2461 
2462  for( size_t k=kbegin; k<kend; ++k ) {
2463  const SIMDType a1( A.load(i ,k) );
2464  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2465  const SIMDType b1( set( B(k,j ) ) );
2466  const SIMDType b2( set( B(k,j+1UL) ) );
2467  const SIMDType b3( set( B(k,j+2UL) ) );
2468  const SIMDType b4( set( B(k,j+3UL) ) );
2469  xmm1 += a1 * b1;
2470  xmm2 += a2 * b1;
2471  xmm3 += a1 * b2;
2472  xmm4 += a2 * b2;
2473  xmm5 += a1 * b3;
2474  xmm6 += a2 * b3;
2475  xmm7 += a1 * b4;
2476  xmm8 += a2 * b4;
2477  }
2478 
2479  (~C).store( i , j , xmm1 );
2480  (~C).store( i+SIMDSIZE, j , xmm2 );
2481  (~C).store( i , j+1UL, xmm3 );
2482  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2483  (~C).store( i , j+2UL, xmm5 );
2484  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2485  (~C).store( i , j+3UL, xmm7 );
2486  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
2487  }
2488 
2489  for( ; (j+3UL) <= jend; j+=3UL )
2490  {
2491  const size_t kbegin( ( IsLower<MT5>::value )
2492  ?( ( IsUpper<MT4>::value )
2493  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2494  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2495  :( IsUpper<MT4>::value ? i : 0UL ) );
2496  const size_t kend( ( IsUpper<MT5>::value )
2497  ?( ( IsLower<MT4>::value )
2498  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
2499  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
2500  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2501 
2502  SIMDType xmm1( (~C).load(i ,j ) );
2503  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2504  SIMDType xmm3( (~C).load(i ,j+1UL) );
2505  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2506  SIMDType xmm5( (~C).load(i ,j+2UL) );
2507  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
2508 
2509  for( size_t k=kbegin; k<kend; ++k ) {
2510  const SIMDType a1( A.load(i ,k) );
2511  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2512  const SIMDType b1( set( B(k,j ) ) );
2513  const SIMDType b2( set( B(k,j+1UL) ) );
2514  const SIMDType b3( set( B(k,j+2UL) ) );
2515  xmm1 += a1 * b1;
2516  xmm2 += a2 * b1;
2517  xmm3 += a1 * b2;
2518  xmm4 += a2 * b2;
2519  xmm5 += a1 * b3;
2520  xmm6 += a2 * b3;
2521  }
2522 
2523  (~C).store( i , j , xmm1 );
2524  (~C).store( i+SIMDSIZE, j , xmm2 );
2525  (~C).store( i , j+1UL, xmm3 );
2526  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2527  (~C).store( i , j+2UL, xmm5 );
2528  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2529  }
2530 
2531  for( ; (j+2UL) <= jend; j+=2UL )
2532  {
2533  const size_t kbegin( ( IsLower<MT5>::value )
2534  ?( ( IsUpper<MT4>::value )
2535  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2536  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2537  :( IsUpper<MT4>::value ? i : 0UL ) );
2538  const size_t kend( ( IsUpper<MT5>::value )
2539  ?( ( IsLower<MT4>::value )
2540  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2541  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2542  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2543 
2544  SIMDType xmm1( (~C).load(i ,j ) );
2545  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2546  SIMDType xmm3( (~C).load(i ,j+1UL) );
2547  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2548  SIMDType xmm5, xmm6, xmm7, xmm8;
2549  size_t k( kbegin );
2550 
2551  for( ; (k+2UL) < kend; k+=2UL ) {
2552  const SIMDType a1( A.load(i ,k ) );
2553  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2554  const SIMDType a3( A.load(i ,k+1UL) );
2555  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2556  const SIMDType b1( set( B(k ,j ) ) );
2557  const SIMDType b2( set( B(k ,j+1UL) ) );
2558  const SIMDType b3( set( B(k+1UL,j ) ) );
2559  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
2560  xmm1 += a1 * b1;
2561  xmm2 += a2 * b1;
2562  xmm3 += a1 * b2;
2563  xmm4 += a2 * b2;
2564  xmm5 += a3 * b3;
2565  xmm6 += a4 * b3;
2566  xmm7 += a3 * b4;
2567  xmm8 += a4 * b4;
2568  }
2569 
2570  for( ; k<kend; ++k ) {
2571  const SIMDType a1( A.load(i ,k) );
2572  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2573  const SIMDType b1( set( B(k,j ) ) );
2574  const SIMDType b2( set( B(k,j+1UL) ) );
2575  xmm1 += a1 * b1;
2576  xmm2 += a2 * b1;
2577  xmm3 += a1 * b2;
2578  xmm4 += a2 * b2;
2579  }
2580 
2581  (~C).store( i , j , xmm1+xmm5 );
2582  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
2583  (~C).store( i , j+1UL, xmm3+xmm7 );
2584  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2585  }
2586 
2587  if( j < jend )
2588  {
2589  const size_t kbegin( ( IsLower<MT5>::value )
2590  ?( ( IsUpper<MT4>::value )
2591  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2592  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2593  :( IsUpper<MT4>::value ? i : 0UL ) );
2594  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2595 
2596  SIMDType xmm1( (~C).load(i ,j) );
2597  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
2598  SIMDType xmm3, xmm4;
2599  size_t k( kbegin );
2600 
2601  for( ; (k+2UL) <= kend; k+=2UL ) {
2602  const SIMDType b1( set( B(k ,j) ) );
2603  const SIMDType b2( set( B(k+1UL,j) ) );
2604  xmm1 += A.load(i ,k ) * b1;
2605  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2606  xmm3 += A.load(i ,k+1UL) * b2;
2607  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2608  }
2609 
2610  for( ; k<kend; ++k ) {
2611  const SIMDType b1( set( B(k,j) ) );
2612  xmm1 += A.load(i ,k) * b1;
2613  xmm2 += A.load(i+SIMDSIZE,k) * b1;
2614  }
2615 
2616  (~C).store( i , j, xmm1+xmm3 );
2617  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
2618  }
2619  }
2620 
2621  for( ; i<ipos; i+=SIMDSIZE )
2622  {
2623  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
2624  size_t j( UPP ? i : 0UL );
2625 
2626  for( ; (j+4UL) <= jend; j+=4UL )
2627  {
2628  const size_t kbegin( ( IsLower<MT5>::value )
2629  ?( ( IsUpper<MT4>::value )
2630  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2631  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2632  :( IsUpper<MT4>::value ? i : 0UL ) );
2633  const size_t kend( ( IsUpper<MT5>::value )
2634  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
2635  :( K ) );
2636 
2637  SIMDType xmm1( (~C).load(i,j ) );
2638  SIMDType xmm2( (~C).load(i,j+1UL) );
2639  SIMDType xmm3( (~C).load(i,j+2UL) );
2640  SIMDType xmm4( (~C).load(i,j+3UL) );
2641  SIMDType xmm5, xmm6, xmm7, xmm8;
2642  size_t k( kbegin );
2643 
2644  for( ; (k+2UL) <= kend; k+=2UL ) {
2645  const SIMDType a1( A.load(i,k ) );
2646  const SIMDType a2( A.load(i,k+1UL) );
2647  xmm1 += a1 * set( B(k ,j ) );
2648  xmm2 += a1 * set( B(k ,j+1UL) );
2649  xmm3 += a1 * set( B(k ,j+2UL) );
2650  xmm4 += a1 * set( B(k ,j+3UL) );
2651  xmm5 += a2 * set( B(k+1UL,j ) );
2652  xmm6 += a2 * set( B(k+1UL,j+1UL) );
2653  xmm7 += a2 * set( B(k+1UL,j+2UL) );
2654  xmm8 += a2 * set( B(k+1UL,j+3UL) );
2655  }
2656 
2657  for( ; k<kend; ++k ) {
2658  const SIMDType a1( A.load(i,k) );
2659  xmm1 += a1 * set( B(k,j ) );
2660  xmm2 += a1 * set( B(k,j+1UL) );
2661  xmm3 += a1 * set( B(k,j+2UL) );
2662  xmm4 += a1 * set( B(k,j+3UL) );
2663  }
2664 
2665  (~C).store( i, j , xmm1+xmm5 );
2666  (~C).store( i, j+1UL, xmm2+xmm6 );
2667  (~C).store( i, j+2UL, xmm3+xmm7 );
2668  (~C).store( i, j+3UL, xmm4+xmm8 );
2669  }
2670 
2671  for( ; (j+3UL) <= jend; j+=3UL )
2672  {
2673  const size_t kbegin( ( IsLower<MT5>::value )
2674  ?( ( IsUpper<MT4>::value )
2675  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2676  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2677  :( IsUpper<MT4>::value ? i : 0UL ) );
2678  const size_t kend( ( IsUpper<MT5>::value )
2679  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
2680  :( K ) );
2681 
2682  SIMDType xmm1( (~C).load(i,j ) );
2683  SIMDType xmm2( (~C).load(i,j+1UL) );
2684  SIMDType xmm3( (~C).load(i,j+2UL) );
2685  SIMDType xmm4, xmm5, xmm6;
2686  size_t k( kbegin );
2687 
2688  for( ; (k+2UL) <= kend; k+=2UL ) {
2689  const SIMDType a1( A.load(i,k ) );
2690  const SIMDType a2( A.load(i,k+1UL) );
2691  xmm1 += a1 * set( B(k ,j ) );
2692  xmm2 += a1 * set( B(k ,j+1UL) );
2693  xmm3 += a1 * set( B(k ,j+2UL) );
2694  xmm4 += a2 * set( B(k+1UL,j ) );
2695  xmm5 += a2 * set( B(k+1UL,j+1UL) );
2696  xmm6 += a2 * set( B(k+1UL,j+2UL) );
2697  }
2698 
2699  for( ; k<kend; ++k ) {
2700  const SIMDType a1( A.load(i,k) );
2701  xmm1 += a1 * set( B(k,j ) );
2702  xmm2 += a1 * set( B(k,j+1UL) );
2703  xmm3 += a1 * set( B(k,j+2UL) );
2704  }
2705 
2706  (~C).store( i, j , xmm1+xmm4 );
2707  (~C).store( i, j+1UL, xmm2+xmm5 );
2708  (~C).store( i, j+2UL, xmm3+xmm6 );
2709  }
2710 
2711  for( ; (j+2UL) <= jend; j+=2UL )
2712  {
2713  const size_t kbegin( ( IsLower<MT5>::value )
2714  ?( ( IsUpper<MT4>::value )
2715  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2716  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2717  :( IsUpper<MT4>::value ? i : 0UL ) );
2718  const size_t kend( ( IsUpper<MT5>::value )
2719  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2720  :( K ) );
2721 
2722  SIMDType xmm1( (~C).load(i,j ) );
2723  SIMDType xmm2( (~C).load(i,j+1UL) );
2724  SIMDType xmm3, xmm4;
2725  size_t k( kbegin );
2726 
2727  for( ; (k+2UL) <= kend; k+=2UL ) {
2728  const SIMDType a1( A.load(i,k ) );
2729  const SIMDType a2( A.load(i,k+1UL) );
2730  xmm1 += a1 * set( B(k ,j ) );
2731  xmm2 += a1 * set( B(k ,j+1UL) );
2732  xmm3 += a2 * set( B(k+1UL,j ) );
2733  xmm4 += a2 * set( B(k+1UL,j+1UL) );
2734  }
2735 
2736  for( ; k<kend; ++k ) {
2737  const SIMDType a1( A.load(i,k) );
2738  xmm1 += a1 * set( B(k,j ) );
2739  xmm2 += a1 * set( B(k,j+1UL) );
2740  }
2741 
2742  (~C).store( i, j , xmm1+xmm3 );
2743  (~C).store( i, j+1UL, xmm2+xmm4 );
2744  }
2745 
2746  if( j < jend )
2747  {
2748  const size_t kbegin( ( IsLower<MT5>::value )
2749  ?( ( IsUpper<MT4>::value )
2750  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2751  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2752  :( IsUpper<MT4>::value ? i : 0UL ) );
2753 
2754  SIMDType xmm1( (~C).load(i,j) );
2755  SIMDType xmm2;
2756  size_t k( kbegin );
2757 
2758  for( ; (k+2UL) <= K; k+=2UL ) {
2759  xmm1 += A.load(i,k ) * set( B(k ,j) );
2760  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
2761  }
2762 
2763  for( ; k<K; ++k ) {
2764  xmm1 += A.load(i,k) * set( B(k,j) );
2765  }
2766 
2767  (~C).store( i, j, xmm1+xmm2 );
2768  }
2769  }
2770 
2771  for( ; remainder && i<M; ++i )
2772  {
2773  const size_t jend( LOW ? i+1UL : N );
2774  size_t j( UPP ? i : 0UL );
2775 
2776  for( ; (j+2UL) <= jend; j+=2UL )
2777  {
2778  const size_t kbegin( ( IsLower<MT5>::value )
2779  ?( ( IsUpper<MT4>::value )
2780  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2781  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2782  :( IsUpper<MT4>::value ? i : 0UL ) );
2783  const size_t kend( ( IsUpper<MT5>::value )
2784  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2785  :( K ) );
2786 
2787  ElementType value1( (~C)(i,j ) );
2788  ElementType value2( (~C)(i,j+1UL) );
2789 
2790  for( size_t k=kbegin; k<kend; ++k ) {
2791  value1 += A(i,k) * B(k,j );
2792  value2 += A(i,k) * B(k,j+1UL);
2793  }
2794 
2795  (~C)(i,j ) = value1;
2796  (~C)(i,j+1UL) = value2;
2797  }
2798 
2799  if( j < jend )
2800  {
2801  const size_t kbegin( ( IsLower<MT5>::value )
2802  ?( ( IsUpper<MT4>::value )
2803  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2804  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2805  :( IsUpper<MT4>::value ? i : 0UL ) );
2806 
2807  ElementType value( (~C)(i,j) );
2808 
2809  for( size_t k=kbegin; k<K; ++k ) {
2810  value += A(i,k) * B(k,j);
2811  }
2812 
2813  (~C)(i,j) = value;
2814  }
2815  }
2816  }
2818  //**********************************************************************************************
2819 
2820  //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix addition assignment kernel, default (fallback) overload.
// Simply forwards target C and operands A, B unchanged to the default
// addition assignment kernel.
// NOTE(review): the embedded numbering jumps from 2836 to 2838, so the line
// carrying the return type / overload constraint is not visible in this
// listing; presumably it restricts this overload to the non-vectorizable
// case - confirm against the real header.
2834  template< typename MT3 // Type of the left-hand side target matrix
2835  , typename MT4 // Type of the left-hand side matrix operand
2836  , typename MT5 > // Type of the right-hand side matrix operand
2838  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2839  {
   // No dedicated large kernel here: delegate to the default kernel.
2840  selectDefaultAddAssignKernel( C, A, B );
2841  }
2843  //**********************************************************************************************
2844 
2845  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Large-matrix addition assignment kernel, vectorized overload.
// Dispatches on the LOW / UPP flags (declared outside this excerpt) to one of
// the lmmm / ummm / mmm routines, passing ElementType(1) for both scalars.
// NOTE(review): presumably the two scalars are the factors applied to the
// product and to the existing content of C (i.e. C = 1*A*B + 1*C) - confirm
// against the declaration of mmm(). The return type / constraint line (2863)
// is not visible in this listing.
2860  template< typename MT3 // Type of the left-hand side target matrix
2861  , typename MT4 // Type of the left-hand side matrix operand
2862  , typename MT5 > // Type of the right-hand side matrix operand
2864  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2865  {
   // LOW is tested first, so it wins if both flags are set.
2866  if( LOW )
2867  lmmm( C, A, B, ElementType(1), ElementType(1) );
2868  else if( UPP )
2869  ummm( C, A, B, ElementType(1), ElementType(1) );
2870  else
2871  mmm( C, A, B, ElementType(1), ElementType(1) );
2872  }
2874  //**********************************************************************************************
2875 
2876  //**BLAS-based addition assignment to dense matrices (default)**********************************
// BLAS-based addition assignment kernel, default (fallback) overload.
// Does not call BLAS at all: it forwards C, A, B to the large-matrix
// addition assignment kernel.
// NOTE(review): the return type / constraint line (2893) is not visible in
// this listing; presumably it selects this overload when the BLAS kernel is
// not applicable - confirm against the real header.
2890  template< typename MT3 // Type of the left-hand side target matrix
2891  , typename MT4 // Type of the left-hand side matrix operand
2892  , typename MT5 > // Type of the right-hand side matrix operand
2894  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2895  {
2896  selectLargeAddAssignKernel( C, A, B );
2897  }
2899  //**********************************************************************************************
2900 
2901  //**BLAS-based addition assignment to dense matrices********************************************
2902 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2903 
// BLAS-based addition assignment kernel (only compiled when the surrounding
// #if on BLAZE_BLAS_MODE / BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION holds).
//  - A triangular: copy B into a temporary, run trmm() with A from the left,
//    then add the temporary to C.
//  - otherwise B triangular: copy A into a temporary, run trmm() with B from
//    the right, then add the temporary to C.
//  - otherwise: a single gemm() call on C, A, B with both scalars ET(1).
// NOTE(review): presumably trmm() overwrites the temporary with the product
// and gemm() computes C = 1*A*B + 1*C - confirm against the BLAS wrappers.
// The return type / constraint line (2919) is not visible in this listing.
2916  template< typename MT3 // Type of the left-hand side target matrix
2917  , typename MT4 // Type of the left-hand side matrix operand
2918  , typename MT5 > // Type of the right-hand side matrix operand
2920  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2921  {
2922  using ET = ElementType_<MT3>;
2923 
   // A is checked first, so it takes precedence when both operands are triangular.
2924  if( IsTriangular<MT4>::value ) {
2925  ResultType_<MT3> tmp( serial( B ) );
2926  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2927  addAssign( C, tmp );
2928  }
2929  else if( IsTriangular<MT5>::value ) {
2930  ResultType_<MT3> tmp( serial( A ) );
2931  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2932  addAssign( C, tmp );
2933  }
2934  else {
   // General case: accumulate directly into C, no temporary needed.
2935  gemm( C, A, B, ET(1), ET(1) );
2936  }
2937  }
2939 #endif
2940  //**********************************************************************************************
2941 
2942  //**Restructuring addition assignment to row-major matrices*************************************
// Restructuring addition assignment of the multiplication expression to a
// row-major matrix (Matrix<MT,false>).
// After checking that the target dimensions match the expression, the product
// is re-expressed with trans() applied to one or both operands and forwarded
// (through ForwardFunctor) to another addAssign() call.
// NOTE(review): the embedded numbering has gaps at 2958, 2961, 2963 and 2970.
// The return type / constraint line and the condition guarding the first
// branch are therefore not visible here; given the "else if" that follows,
// the first branch presumably covers the case where both operands are
// symmetric - confirm against the real header.
2957  template< typename MT > // Type of the target matrix
2959  addAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
2960  {
2962 
2964 
2965  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2966  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2967 
2968  const ForwardFunctor fwd;
2969 
   // (guarding condition not visible in this listing) transpose both operands
2971  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
   // Left operand type is symmetric: transpose only the left operand
2972  else if( IsSymmetric<MT1>::value )
2973  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
   // Otherwise: transpose only the right operand
2974  else
2975  addAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
2976  }
2978  //**********************************************************************************************
2979 
2980  //**Addition assignment to sparse matrices******************************************************
2981  // No special implementation for the addition assignment to sparse matrices.
2982  //**********************************************************************************************
2983 
2984  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of the multiplication expression to a dense matrix
// of either storage order.
// Steps: check dimensions, return early for empty problems, evaluate both
// operands into the local types LT / RT via serial(), re-check dimensions,
// then hand off to selectSubAssignKernel().
// NOTE(review): the return type / constraint line (2999) and line 3002 are
// not visible in this listing.
2997  template< typename MT // Type of the target dense matrix
2998  , bool SO > // Storage order of the target dense matrix
3000  subAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
3001  {
3003 
3004  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3005  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3006 
   // Nothing to subtract if the target is empty or the inner dimension is zero.
3007  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3008  return;
3009  }
3010 
3011  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3012  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3013 
   // The evaluated operands must keep the shape of the original operands
   // and be conformant with the target.
3014  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3015  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3016  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3017  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3018  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3019  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3020 
3021  TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3022  }
3024  //**********************************************************************************************
3025 
3026  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Kernel selection for the subtraction assignment C -= A * B.
// The small-matrix kernel is chosen if any of the following holds:
//  - the left operand type MT4 is diagonal,
//  - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//  - the number of elements of C is below TDMATTDMATMULT_THRESHOLD.
// Otherwise the BLAS kernel selector is called.
3037  template< typename MT3 // Type of the left-hand side target matrix
3038  , typename MT4 // Type of the left-hand side matrix operand
3039  , typename MT5 > // Type of the right-hand side matrix operand
3040  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3041  {
3042  if( ( IsDiagonal<MT4>::value ) ||
3043  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
3044  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
3045  selectSmallSubAssignKernel( C, A, B );
3046  else
3047  selectBlasSubAssignKernel( C, A, B );
3048  }
3050  //**********************************************************************************************
3051 
3052  //**Default subtraction assignment to dense matrices (general/general)**************************
// Default (scalar) subtraction assignment kernel for the case that neither
// operand type is diagonal: C(i,j) -= A(i,k) * B(k,j).
// Loop order is j (columns of B), then k (inner dimension), then i (rows of
// A). The k-range is narrowed by the triangular properties of MT5, the
// i-range by those of MT4 and by the LOW / UPP flags (declared outside this
// excerpt). The innermost loop is unrolled by two.
// NOTE(review): the embedded numbering skips 3091 and 3096, so one level of
// the nested conditional in both the ibegin and the iend initializer is
// missing from this listing (the parentheses below do not balance as shown);
// presumably the dropped lines test the strictly-lower / strictly-upper
// property of MT4 - confirm against the real header.
3066  template< typename MT3 // Type of the left-hand side target matrix
3067  , typename MT4 // Type of the left-hand side matrix operand
3068  , typename MT5 > // Type of the right-hand side matrix operand
3069  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
3070  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3071  {
3072  const size_t M( A.rows() );
3073  const size_t N( B.columns() );
3074  const size_t K( A.columns() );
3075 
   // With LOW or UPP set, the kernel requires A.rows() == B.columns().
3076  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3077 
3078  for( size_t j=0UL; j<N; ++j )
3079  {
   // Rows k of column j of B that are visited: starts at j (j+1 if strictly
   // lower) for lower B, ends at j+1 (j if strictly upper) for upper B.
3080  const size_t kbegin( ( IsLower<MT5>::value )
3081  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
3082  :( 0UL ) );
3083  const size_t kend( ( IsUpper<MT5>::value )
3084  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
3085  :( K ) );
3086  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3087 
3088  for( size_t k=kbegin; k<kend; ++k )
3089  {
   // Rows i of column k of A that are visited; LOW additionally clamps the
   // start to j, UPP clamps the end to j+1.
3090  const size_t ibegin( ( IsLower<MT4>::value )
3092  ?( LOW ? max(j,k+1UL) : k+1UL )
3093  :( LOW ? max(j,k) : k ) )
3094  :( LOW ? j : 0UL ) );
3095  const size_t iend( ( IsUpper<MT4>::value )
3097  ?( UPP ? min(j+1UL,k) : k )
3098  :( UPP ? min(j,k)+1UL : k+1UL ) )
3099  :( UPP ? j+1UL : M ) );
3100 
   // The LOW/UPP clamping can produce an empty (or inverted) range: skip it.
3101  if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
3102  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3103 
   // ipos = ibegin + (row count rounded down to an even number): end of the
   // two-way unrolled part.
3104  const size_t inum( iend - ibegin );
3105  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3106 
3107  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3108  C(i ,j) -= A(i ,k) * B(k,j);
3109  C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
3110  }
   // Leftover single row when the row count is odd.
3111  if( ipos < iend ) {
3112  C(ipos,j) -= A(ipos,k) * B(k,j);
3113  }
3114  }
3115  }
3116  }
3118  //**********************************************************************************************
3119 
3120  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
// Default (scalar) subtraction assignment kernel for a non-diagonal left
// operand and a diagonal right operand.
// Only the diagonal element B(j,j) is read for column j, so the update is
// C(i,j) -= A(i,j) * B(j,j). The i-range is narrowed by the triangular
// properties of MT4; the row loop is unrolled by two.
// NOTE(review): source line 3140 is not visible in this listing.
3134  template< typename MT3 // Type of the left-hand side target matrix
3135  , typename MT4 // Type of the left-hand side matrix operand
3136  , typename MT5 > // Type of the right-hand side matrix operand
3137  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
3138  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3139  {
3141 
3142  const size_t M( A.rows() );
3143  const size_t N( B.columns() );
3144 
3145  for( size_t j=0UL; j<N; ++j )
3146  {
   // Rows of column j of A that are visited, based on A's triangular type.
3147  const size_t ibegin( ( IsLower<MT4>::value )
3148  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
3149  :( 0UL ) );
3150  const size_t iend( ( IsUpper<MT4>::value )
3151  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
3152  :( M ) );
3153  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3154 
   // ipos marks the end of the two-way unrolled part (even number of rows).
3155  const size_t inum( iend - ibegin );
3156  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3157 
3158  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3159  C(i ,j) -= A(i ,j) * B(j,j);
3160  C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
3161  }
   // Leftover single row when the row count is odd.
3162  if( ipos < iend ) {
3163  C(ipos,j) -= A(ipos,j) * B(j,j);
3164  }
3165  }
3166  }
3168  //**********************************************************************************************
3169 
3170  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
// Default (scalar) subtraction assignment kernel for a diagonal left operand
// and a non-diagonal right operand.
// Only the diagonal element A(i,i) is read for row i, so the update is
// C(i,j) -= A(i,i) * B(i,j). The i-range is narrowed by the triangular
// properties of MT5; the row loop is unrolled by two.
// NOTE(review): source line 3190 is not visible in this listing.
3184  template< typename MT3 // Type of the left-hand side target matrix
3185  , typename MT4 // Type of the left-hand side matrix operand
3186  , typename MT5 > // Type of the right-hand side matrix operand
3187  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
3188  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3189  {
3191 
3192  const size_t M( A.rows() );
3193  const size_t N( B.columns() );
3194 
3195  for( size_t j=0UL; j<N; ++j )
3196  {
   // Rows of column j of B that are visited, based on B's triangular type.
3197  const size_t ibegin( ( IsLower<MT5>::value )
3198  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
3199  :( 0UL ) );
3200  const size_t iend( ( IsUpper<MT5>::value )
3201  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
3202  :( M ) );
3203  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
3204 
   // ipos marks the end of the two-way unrolled part (even number of rows).
3205  const size_t inum( iend - ibegin );
3206  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
3207 
3208  for( size_t i=ibegin; i<ipos; i+=2UL ) {
3209  C(i ,j) -= A(i ,i ) * B(i ,j);
3210  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
3211  }
   // Leftover single row when the row count is odd.
3212  if( ipos < iend ) {
3213  C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
3214  }
3215  }
3216  }
3218  //**********************************************************************************************
3219 
3220  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default (scalar) subtraction assignment kernel for two diagonal operands.
// Only diagonal elements are touched: C(i,i) -= A(i,i) * B(i,i) for every
// row index i of A.
// NOTE(review): source line 3240 is not visible in this listing.
3234  template< typename MT3 // Type of the left-hand side target matrix
3235  , typename MT4 // Type of the left-hand side matrix operand
3236  , typename MT5 > // Type of the right-hand side matrix operand
3237  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
3238  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3239  {
3241 
3242  for( size_t i=0UL; i<A.rows(); ++i ) {
3243  C(i,i) -= A(i,i) * B(i,i);
3244  }
3245  }
3247  //**********************************************************************************************
3248 
3249  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix subtraction assignment kernel, default (fallback) overload.
// Forwards C, A, B unchanged to the default subtraction assignment kernel.
// NOTE(review): the return type / constraint line (3266) is not visible in
// this listing; presumably it restricts this overload to the
// non-vectorizable case - confirm against the real header.
3263  template< typename MT3 // Type of the left-hand side target matrix
3264  , typename MT4 // Type of the left-hand side matrix operand
3265  , typename MT5 > // Type of the right-hand side matrix operand
3267  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3268  {
3269  selectDefaultSubAssignKernel( C, A, B );
3270  }
3272  //**********************************************************************************************
3273 
3274  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Small-matrix subtraction-assignment kernel for a row-major target (DenseMatrix<MT3,false>).
// Every branch copies one operand into a temporary of its OppositeType_ (evaluated through
// serial()) and re-dispatches to subAssign() with the product of that temporary and the
// other operand; the product expression is passed through the ForwardFunctor first.
// NOTE(review): the return-type line (listing line 3292) and the conditions of the first two
// branches (listing lines 3302 and 3306) are not visible in this listing.
3289  template< typename MT3 // Type of the left-hand side target matrix
3290  , typename MT4 // Type of the left-hand side matrix operand
3291  , typename MT5 > // Type of the right-hand side matrix operand
3293  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3294  {
3299 
3300  const ForwardFunctor fwd;
3301 
// First branch (condition not visible): B is converted to its opposite type.
3303  const OppositeType_<MT5> tmp( serial( B ) );
3304  subAssign( ~C, fwd( A * tmp ) );
3305  }
// Second branch (condition not visible): A is converted to its opposite type.
3307  const OppositeType_<MT4> tmp( serial( A ) );
3308  subAssign( ~C, fwd( tmp * B ) );
3309  }
// Otherwise the operand with fewer elements is converted: B if it has at most as many
// elements as A, else A.
3310  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3311  const OppositeType_<MT5> tmp( serial( B ) );
3312  subAssign( ~C, fwd( A * tmp ) );
3313  }
3314  else {
3315  const OppositeType_<MT4> tmp( serial( A ) );
3316  subAssign( ~C, fwd( tmp * B ) );
3317  }
3318  }
3320  //**********************************************************************************************
3321 
3322  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Vectorized small-matrix subtraction-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>): computes C -= A * B with SIMD loads and stores on C and A.
//   C  target matrix, accessed through ~C and updated in place
//   A  left-hand side operand, read with A.load(i,k) in the SIMD tiles
//   B  right-hand side operand, read element-wise and broadcast with set()
// The row index i is advanced in tiles of 8, 5, 4, 3, 2 and 1 SIMD vectors (SIMDSIZE rows
// per vector), followed by a scalar loop for rows that do not fill a whole SIMD vector.
// In every tile the summation range [kbegin,kend) over k is narrowed by the
// IsLower/IsUpper/IsStrictlyLower/IsStrictlyUpper traits of A (MT4) and B (MT5); for
// operands without such structure the range is [0,K).
// NOTE(review): the return-type / enabling condition of this overload (listing line 3340)
// is not visible in this listing.
3337  template< typename MT3 // Type of the left-hand side target matrix
3338  , typename MT4 // Type of the left-hand side matrix operand
3339  , typename MT5 > // Type of the right-hand side matrix operand
3341  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3342  {
// A scalar remainder loop is needed unless both C (MT3) and A (MT4) are padded.
3343  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
3344 
3345  const size_t M( A.rows() );
3346  const size_t N( B.columns() );
3347  const size_t K( A.columns() );
3348 
// The LOW/UPP code paths are only valid for square results.
3349  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3350 
// ipos: end of the rows handled by SIMD tiles. With a remainder loop it is M rounded
// down to a multiple of SIMDSIZE (checked by the assertion below), otherwise M itself.
3351  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3352  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3353 
// i is shared by all tile loops below; each loop continues where the previous one stopped.
3354  size_t i( 0UL );
3355 
// Tile: 8 SIMD vectors x 1 column, only when neither LOW nor UPP is set.
// NOTE(review): the statement guarding this braced block (listing line 3356) is not
// visible in this listing.
3357  {
3358  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3359  for( size_t j=0UL; j<N; ++j )
3360  {
// kbegin: first k of the summation. For a lower B it starts at the diagonal of column j
// (one past it if B is strictly lower); for an upper A it starts no earlier than i.
3361  const size_t kbegin( ( IsLower<MT5>::value )
3362  ?( ( IsUpper<MT4>::value )
3363  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3364  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3365  :( IsUpper<MT4>::value ? i : 0UL ) );
// kend: one past the last k. For an upper B it ends at the diagonal of column j (before
// it if B is strictly upper); for a lower A it ends no later than the end of the row tile.
3366  const size_t kend( ( IsUpper<MT5>::value )
3367  ?( ( IsLower<MT4>::value )
3368  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3369  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3370  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
3371 
// Load the current values of C so the products can be subtracted in registers.
3372  SIMDType xmm1( (~C).load(i ,j) );
3373  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3374  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3375  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3376  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3377  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3378  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3379  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3380 
3381  for( size_t k=kbegin; k<kend; ++k ) {
3382  const SIMDType b1( set( B(k,j) ) );
3383  xmm1 -= A.load(i ,k) * b1;
3384  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3385  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3386  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3387  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3388  xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
3389  xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
3390  xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
3391  }
3392 
3393  (~C).store( i , j, xmm1 );
3394  (~C).store( i+SIMDSIZE , j, xmm2 );
3395  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3396  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3397  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3398  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3399  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3400  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
3401  }
3402  }
3403  }
3404 
// Tile: 5 SIMD vectors x 2 columns, only when neither LOW nor UPP is set.
3405  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3406  {
3407  size_t j( 0UL );
3408 
3409  for( ; (j+2UL) <= N; j+=2UL )
3410  {
// Same k-range logic as above; kend now covers both columns j and j+1.
3411  const size_t kbegin( ( IsLower<MT5>::value )
3412  ?( ( IsUpper<MT4>::value )
3413  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3414  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3415  :( IsUpper<MT4>::value ? i : 0UL ) );
3416  const size_t kend( ( IsUpper<MT5>::value )
3417  ?( ( IsLower<MT4>::value )
3418  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3419  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3420  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
3421 
// xmm1..xmm5 hold column j, xmm6..xmm10 hold column j+1.
3422  SIMDType xmm1 ( (~C).load(i ,j ) );
3423  SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3424  SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3425  SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3426  SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3427  SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3428  SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3429  SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3430  SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3431  SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
3432 
3433  for( size_t k=kbegin; k<kend; ++k ) {
3434  const SIMDType a1( A.load(i ,k) );
3435  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3436  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3437  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3438  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3439  const SIMDType b1( set( B(k,j ) ) );
3440  const SIMDType b2( set( B(k,j+1UL) ) );
3441  xmm1 -= a1 * b1;
3442  xmm2 -= a2 * b1;
3443  xmm3 -= a3 * b1;
3444  xmm4 -= a4 * b1;
3445  xmm5 -= a5 * b1;
3446  xmm6 -= a1 * b2;
3447  xmm7 -= a2 * b2;
3448  xmm8 -= a3 * b2;
3449  xmm9 -= a4 * b2;
3450  xmm10 -= a5 * b2;
3451  }
3452 
3453  (~C).store( i , j , xmm1 );
3454  (~C).store( i+SIMDSIZE , j , xmm2 );
3455  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3456  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3457  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3458  (~C).store( i , j+1UL, xmm6 );
3459  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3460  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3461  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3462  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
3463  }
3464 
// Single trailing column when N is odd; here kend depends only on the structure of A.
3465  if( j < N )
3466  {
3467  const size_t kbegin( ( IsLower<MT5>::value )
3468  ?( ( IsUpper<MT4>::value )
3469  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3470  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3471  :( IsUpper<MT4>::value ? i : 0UL ) );
3472  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
3473 
3474  SIMDType xmm1( (~C).load(i ,j) );
3475  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3476  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3477  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3478  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3479 
3480  for( size_t k=kbegin; k<kend; ++k ) {
3481  const SIMDType b1( set( B(k,j) ) );
3482  xmm1 -= A.load(i ,k) * b1;
3483  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3484  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3485  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3486  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
3487  }
3488 
3489  (~C).store( i , j, xmm1 );
3490  (~C).store( i+SIMDSIZE , j, xmm2 );
3491  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3492  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3493  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3494  }
3495  }
3496 
// Tile: 4 SIMD vectors x 2 columns, only when neither LOW nor UPP is set.
3497  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3498  {
3499  size_t j( 0UL );
3500 
3501  for( ; (j+2UL) <= N; j+=2UL )
3502  {
3503  const size_t kbegin( ( IsLower<MT5>::value )
3504  ?( ( IsUpper<MT4>::value )
3505  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3506  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3507  :( IsUpper<MT4>::value ? i : 0UL ) );
3508  const size_t kend( ( IsUpper<MT5>::value )
3509  ?( ( IsLower<MT4>::value )
3510  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3511  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3512  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
3513 
// xmm1..xmm4 hold column j, xmm5..xmm8 hold column j+1.
3514  SIMDType xmm1( (~C).load(i ,j ) );
3515  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3516  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3517  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3518  SIMDType xmm5( (~C).load(i ,j+1UL) );
3519  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3520  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3521  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3522 
3523  for( size_t k=kbegin; k<kend; ++k ) {
3524  const SIMDType a1( A.load(i ,k) );
3525  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3526  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3527  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3528  const SIMDType b1( set( B(k,j ) ) );
3529  const SIMDType b2( set( B(k,j+1UL) ) );
3530  xmm1 -= a1 * b1;
3531  xmm2 -= a2 * b1;
3532  xmm3 -= a3 * b1;
3533  xmm4 -= a4 * b1;
3534  xmm5 -= a1 * b2;
3535  xmm6 -= a2 * b2;
3536  xmm7 -= a3 * b2;
3537  xmm8 -= a4 * b2;
3538  }
3539 
3540  (~C).store( i , j , xmm1 );
3541  (~C).store( i+SIMDSIZE , j , xmm2 );
3542  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3543  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3544  (~C).store( i , j+1UL, xmm5 );
3545  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3546  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3547  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3548  }
3549 
// Single trailing column when N is odd.
3550  if( j < N )
3551  {
3552  const size_t kbegin( ( IsLower<MT5>::value )
3553  ?( ( IsUpper<MT4>::value )
3554  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3555  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3556  :( IsUpper<MT4>::value ? i : 0UL ) );
3557  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3558 
3559  SIMDType xmm1( (~C).load(i ,j) );
3560  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3561  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3562  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3563 
3564  for( size_t k=kbegin; k<kend; ++k ) {
3565  const SIMDType b1( set( B(k,j) ) );
3566  xmm1 -= A.load(i ,k) * b1;
3567  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3568  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3569  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
3570  }
3571 
3572  (~C).store( i , j, xmm1 );
3573  (~C).store( i+SIMDSIZE , j, xmm2 );
3574  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3575  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3576  }
3577  }
3578 
// Tile: 3 SIMD vectors x 2 columns, only when neither LOW nor UPP is set.
3579  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3580  {
3581  size_t j( 0UL );
3582 
3583  for( ; (j+2UL) <= N; j+=2UL )
3584  {
3585  const size_t kbegin( ( IsLower<MT5>::value )
3586  ?( ( IsUpper<MT4>::value )
3587  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3588  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3589  :( IsUpper<MT4>::value ? i : 0UL ) );
3590  const size_t kend( ( IsUpper<MT5>::value )
3591  ?( ( IsLower<MT4>::value )
3592  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3593  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3594  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
3595 
// xmm1..xmm3 hold column j, xmm4..xmm6 hold column j+1.
3596  SIMDType xmm1( (~C).load(i ,j ) );
3597  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3598  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3599  SIMDType xmm4( (~C).load(i ,j+1UL) );
3600  SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
3601  SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3602 
3603  for( size_t k=kbegin; k<kend; ++k ) {
3604  const SIMDType a1( A.load(i ,k) );
3605  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3606  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3607  const SIMDType b1( set( B(k,j ) ) );
3608  const SIMDType b2( set( B(k,j+1UL) ) );
3609  xmm1 -= a1 * b1;
3610  xmm2 -= a2 * b1;
3611  xmm3 -= a3 * b1;
3612  xmm4 -= a1 * b2;
3613  xmm5 -= a2 * b2;
3614  xmm6 -= a3 * b2;
3615  }
3616 
3617  (~C).store( i , j , xmm1 );
3618  (~C).store( i+SIMDSIZE , j , xmm2 );
3619  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3620  (~C).store( i , j+1UL, xmm4 );
3621  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
3622  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
3623  }
3624 
// Single trailing column when N is odd.
3625  if( j < N )
3626  {
3627  const size_t kbegin( ( IsLower<MT5>::value )
3628  ?( ( IsUpper<MT4>::value )
3629  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3630  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3631  :( IsUpper<MT4>::value ? i : 0UL ) );
3632  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
3633 
3634  SIMDType xmm1( (~C).load(i ,j) );
3635  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3636  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3637 
3638  for( size_t k=kbegin; k<kend; ++k ) {
3639  const SIMDType b1( set( B(k,j) ) );
3640  xmm1 -= A.load(i ,k) * b1;
3641  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
3642  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
3643  }
3644 
3645  (~C).store( i , j, xmm1 );
3646  (~C).store( i+SIMDSIZE , j, xmm2 );
3647  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3648  }
3649  }
3650 
// Tile: 2 SIMD vectors; skipped only when LOW and UPP are both set. The column range is
// restricted to start at i for UPP and to end at min(i+2*SIMDSIZE,N) for LOW. Columns are
// consumed in groups of 4, 3, 2 and finally a single column.
3651  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3652  {
3653  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
3654  size_t j( UPP ? i : 0UL );
3655 
3656  for( ; (j+4UL) <= jend; j+=4UL )
3657  {
3658  const size_t kbegin( ( IsLower<MT5>::value )
3659  ?( ( IsUpper<MT4>::value )
3660  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3661  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3662  :( IsUpper<MT4>::value ? i : 0UL ) );
3663  const size_t kend( ( IsUpper<MT5>::value )
3664  ?( ( IsLower<MT4>::value )
3665  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
3666  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
3667  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
3668 
3669  SIMDType xmm1( (~C).load(i ,j ) );
3670  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3671  SIMDType xmm3( (~C).load(i ,j+1UL) );
3672  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3673  SIMDType xmm5( (~C).load(i ,j+2UL) );
3674  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
3675  SIMDType xmm7( (~C).load(i ,j+3UL) );
3676  SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
3677 
3678  for( size_t k=kbegin; k<kend; ++k ) {
3679  const SIMDType a1( A.load(i ,k) );
3680  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3681  const SIMDType b1( set( B(k,j ) ) );
3682  const SIMDType b2( set( B(k,j+1UL) ) );
3683  const SIMDType b3( set( B(k,j+2UL) ) );
3684  const SIMDType b4( set( B(k,j+3UL) ) );
3685  xmm1 -= a1 * b1;
3686  xmm2 -= a2 * b1;
3687  xmm3 -= a1 * b2;
3688  xmm4 -= a2 * b2;
3689  xmm5 -= a1 * b3;
3690  xmm6 -= a2 * b3;
3691  xmm7 -= a1 * b4;
3692  xmm8 -= a2 * b4;
3693  }
3694 
3695  (~C).store( i , j , xmm1 );
3696  (~C).store( i+SIMDSIZE, j , xmm2 );
3697  (~C).store( i , j+1UL, xmm3 );
3698  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3699  (~C).store( i , j+2UL, xmm5 );
3700  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
3701  (~C).store( i , j+3UL, xmm7 );
3702  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
3703  }
3704 
3705  for( ; (j+3UL) <= jend; j+=3UL )
3706  {
3707  const size_t kbegin( ( IsLower<MT5>::value )
3708  ?( ( IsUpper<MT4>::value )
3709  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3710  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3711  :( IsUpper<MT4>::value ? i : 0UL ) );
3712  const size_t kend( ( IsUpper<MT5>::value )
3713  ?( ( IsLower<MT4>::value )
3714  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
3715  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
3716  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
3717 
3718  SIMDType xmm1( (~C).load(i ,j ) );
3719  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3720  SIMDType xmm3( (~C).load(i ,j+1UL) );
3721  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3722  SIMDType xmm5( (~C).load(i ,j+2UL) );
3723  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
3724 
3725  for( size_t k=kbegin; k<kend; ++k ) {
3726  const SIMDType a1( A.load(i ,k) );
3727  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3728  const SIMDType b1( set( B(k,j ) ) );
3729  const SIMDType b2( set( B(k,j+1UL) ) );
3730  const SIMDType b3( set( B(k,j+2UL) ) );
3731  xmm1 -= a1 * b1;
3732  xmm2 -= a2 * b1;
3733  xmm3 -= a1 * b2;
3734  xmm4 -= a2 * b2;
3735  xmm5 -= a1 * b3;
3736  xmm6 -= a2 * b3;
3737  }
3738 
3739  (~C).store( i , j , xmm1 );
3740  (~C).store( i+SIMDSIZE, j , xmm2 );
3741  (~C).store( i , j+1UL, xmm3 );
3742  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
3743  (~C).store( i , j+2UL, xmm5 );
3744  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
3745  }
3746 
3747  for( ; (j+2UL) <= jend; j+=2UL )
3748  {
3749  const size_t kbegin( ( IsLower<MT5>::value )
3750  ?( ( IsUpper<MT4>::value )
3751  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3752  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3753  :( IsUpper<MT4>::value ? i : 0UL ) );
3754  const size_t kend( ( IsUpper<MT5>::value )
3755  ?( ( IsLower<MT4>::value )
3756  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3757  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3758  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
3759 
3760  SIMDType xmm1( (~C).load(i ,j ) );
3761  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3762  SIMDType xmm3( (~C).load(i ,j+1UL) );
3763  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
// xmm5..xmm8 are extra accumulators for the second k of the 2-way unrolled loop below;
// they are added to xmm1..xmm4 when storing.
// NOTE(review): this relies on a default-constructed SIMDType being zero — confirm.
3764  SIMDType xmm5, xmm6, xmm7, xmm8;
3765  size_t k( kbegin );
3766 
3767  for( ; (k+2UL) <= kend; k+=2UL ) {
3768  const SIMDType a1( A.load(i ,k ) );
3769  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
3770  const SIMDType a3( A.load(i ,k+1UL) );
3771  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
3772  const SIMDType b1( set( B(k ,j ) ) );
3773  const SIMDType b2( set( B(k ,j+1UL) ) );
3774  const SIMDType b3( set( B(k+1UL,j ) ) );
3775  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
3776  xmm1 -= a1 * b1;
3777  xmm2 -= a2 * b1;
3778  xmm3 -= a1 * b2;
3779  xmm4 -= a2 * b2;
3780  xmm5 -= a3 * b3;
3781  xmm6 -= a4 * b3;
3782  xmm7 -= a3 * b4;
3783  xmm8 -= a4 * b4;
3784  }
3785 
// Leftover k when the k-range has odd length.
3786  for( ; k<kend; ++k ) {
3787  const SIMDType a1( A.load(i ,k) );
3788  const SIMDType a2( A.load(i+SIMDSIZE,k) );
3789  const SIMDType b1( set( B(k,j ) ) );
3790  const SIMDType b2( set( B(k,j+1UL) ) );
3791  xmm1 -= a1 * b1;
3792  xmm2 -= a2 * b1;
3793  xmm3 -= a1 * b2;
3794  xmm4 -= a2 * b2;
3795  }
3796 
3797  (~C).store( i , j , xmm1+xmm5 );
3798  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
3799  (~C).store( i , j+1UL, xmm3+xmm7 );
3800  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
3801  }
3802 
// Single remaining column of this tile.
3803  if( j < jend )
3804  {
3805  const size_t kbegin( ( IsLower<MT5>::value )
3806  ?( ( IsUpper<MT4>::value )
3807  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3808  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3809  :( IsUpper<MT4>::value ? i : 0UL ) );
3810  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
3811 
3812  SIMDType xmm1( (~C).load(i ,j) );
3813  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3814  SIMDType xmm3, xmm4;
3815  size_t k( kbegin );
3816 
3817  for( ; (k+2UL) <= kend; k+=2UL ) {
3818  const SIMDType b1( set( B(k ,j) ) );
3819  const SIMDType b2( set( B(k+1UL,j) ) );
3820  xmm1 -= A.load(i ,k ) * b1;
3821  xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
3822  xmm3 -= A.load(i ,k+1UL) * b2;
3823  xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
3824  }
3825 
3826  for( ; k<kend; ++k ) {
3827  const SIMDType b1( set( B(k,j) ) );
3828  xmm1 -= A.load(i ,k) * b1;
3829  xmm2 -= A.load(i+SIMDSIZE,k) * b1;
3830  }
3831 
3832  (~C).store( i , j, xmm1+xmm3 );
3833  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
3834  }
3835  }
3836 
// Tile: 1 SIMD vector; processes every remaining full vector up to ipos. jend is limited
// to min(i+SIMDSIZE,N) only when LOW and UPP are both set. In this tile kend is derived
// from the structure of B only.
3837  for( ; i<ipos; i+=SIMDSIZE )
3838  {
3839  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
3840  size_t j( UPP ? i : 0UL );
3841 
3842  for( ; (j+4UL) <= jend; j+=4UL )
3843  {
3844  const size_t kbegin( ( IsLower<MT5>::value )
3845  ?( ( IsUpper<MT4>::value )
3846  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3847  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3848  :( IsUpper<MT4>::value ? i : 0UL ) );
3849  const size_t kend( ( IsUpper<MT5>::value )
3850  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
3851  :( K ) );
3852 
3853  SIMDType xmm1( (~C).load(i,j ) );
3854  SIMDType xmm2( (~C).load(i,j+1UL) );
3855  SIMDType xmm3( (~C).load(i,j+2UL) );
3856  SIMDType xmm4( (~C).load(i,j+3UL) );
// Second accumulator set for the 2-way unrolled k loop (see the note in the 2-vector tile).
3857  SIMDType xmm5, xmm6, xmm7, xmm8;
3858  size_t k( kbegin );
3859 
3860  for( ; (k+2UL) <= kend; k+=2UL ) {
3861  const SIMDType a1( A.load(i,k ) );
3862  const SIMDType a2( A.load(i,k+1UL) );
3863  xmm1 -= a1 * set( B(k ,j ) );
3864  xmm2 -= a1 * set( B(k ,j+1UL) );
3865  xmm3 -= a1 * set( B(k ,j+2UL) );
3866  xmm4 -= a1 * set( B(k ,j+3UL) );
3867  xmm5 -= a2 * set( B(k+1UL,j ) );
3868  xmm6 -= a2 * set( B(k+1UL,j+1UL) );
3869  xmm7 -= a2 * set( B(k+1UL,j+2UL) );
3870  xmm8 -= a2 * set( B(k+1UL,j+3UL) );
3871  }
3872 
3873  for( ; k<kend; ++k ) {
3874  const SIMDType a1( A.load(i,k) );
3875  xmm1 -= a1 * set( B(k,j ) );
3876  xmm2 -= a1 * set( B(k,j+1UL) );
3877  xmm3 -= a1 * set( B(k,j+2UL) );
3878  xmm4 -= a1 * set( B(k,j+3UL) );
3879  }
3880 
3881  (~C).store( i, j , xmm1+xmm5 );
3882  (~C).store( i, j+1UL, xmm2+xmm6 );
3883  (~C).store( i, j+2UL, xmm3+xmm7 );
3884  (~C).store( i, j+3UL, xmm4+xmm8 );
3885  }
3886 
3887  for( ; (j+3UL) <= jend; j+=3UL )
3888  {
3889  const size_t kbegin( ( IsLower<MT5>::value )
3890  ?( ( IsUpper<MT4>::value )
3891  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3892  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3893  :( IsUpper<MT4>::value ? i : 0UL ) );
3894  const size_t kend( ( IsUpper<MT5>::value )
3895  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
3896  :( K ) );
3897 
3898  SIMDType xmm1( (~C).load(i,j ) );
3899  SIMDType xmm2( (~C).load(i,j+1UL) );
3900  SIMDType xmm3( (~C).load(i,j+2UL) );
3901  SIMDType xmm4, xmm5, xmm6;
3902  size_t k( kbegin );
3903 
3904  for( ; (k+2UL) <= kend; k+=2UL ) {
3905  const SIMDType a1( A.load(i,k ) );
3906  const SIMDType a2( A.load(i,k+1UL) );
3907  xmm1 -= a1 * set( B(k ,j ) );
3908  xmm2 -= a1 * set( B(k ,j+1UL) );
3909  xmm3 -= a1 * set( B(k ,j+2UL) );
3910  xmm4 -= a2 * set( B(k+1UL,j ) );
3911  xmm5 -= a2 * set( B(k+1UL,j+1UL) );
3912  xmm6 -= a2 * set( B(k+1UL,j+2UL) );
3913  }
3914 
3915  for( ; k<kend; ++k ) {
3916  const SIMDType a1( A.load(i,k) );
3917  xmm1 -= a1 * set( B(k,j ) );
3918  xmm2 -= a1 * set( B(k,j+1UL) );
3919  xmm3 -= a1 * set( B(k,j+2UL) );
3920  }
3921 
3922  (~C).store( i, j , xmm1+xmm4 );
3923  (~C).store( i, j+1UL, xmm2+xmm5 );
3924  (~C).store( i, j+2UL, xmm3+xmm6 );
3925  }
3926 
3927  for( ; (j+2UL) <= jend; j+=2UL )
3928  {
3929  const size_t kbegin( ( IsLower<MT5>::value )
3930  ?( ( IsUpper<MT4>::value )
3931  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3932  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3933  :( IsUpper<MT4>::value ? i : 0UL ) );
3934  const size_t kend( ( IsUpper<MT5>::value )
3935  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3936  :( K ) );
3937 
3938  SIMDType xmm1( (~C).load(i,j ) );
3939  SIMDType xmm2( (~C).load(i,j+1UL) );
3940  SIMDType xmm3, xmm4;
3941  size_t k( kbegin );
3942 
3943  for( ; (k+2UL) <= kend; k+=2UL ) {
3944  const SIMDType a1( A.load(i,k ) );
3945  const SIMDType a2( A.load(i,k+1UL) );
3946  xmm1 -= a1 * set( B(k ,j ) );
3947  xmm2 -= a1 * set( B(k ,j+1UL) );
3948  xmm3 -= a2 * set( B(k+1UL,j ) );
3949  xmm4 -= a2 * set( B(k+1UL,j+1UL) );
3950  }
3951 
3952  for( ; k<kend; ++k ) {
3953  const SIMDType a1( A.load(i,k) );
3954  xmm1 -= a1 * set( B(k,j ) );
3955  xmm2 -= a1 * set( B(k,j+1UL) );
3956  }
3957 
3958  (~C).store( i, j , xmm1+xmm3 );
3959  (~C).store( i, j+1UL, xmm2+xmm4 );
3960  }
3961 
// Single remaining column: no kend is computed here, the k loops run up to K.
3962  if( j < jend )
3963  {
3964  const size_t kbegin( ( IsLower<MT5>::value )
3965  ?( ( IsUpper<MT4>::value )
3966  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3967  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3968  :( IsUpper<MT4>::value ? i : 0UL ) );
3969 
3970  SIMDType xmm1( (~C).load(i,j) );
3971  SIMDType xmm2;
3972  size_t k( kbegin );
3973 
3974  for( ; (k+2UL) <= K; k+=2UL ) {
3975  xmm1 -= A.load(i,k ) * set( B(k ,j) );
3976  xmm2 -= A.load(i,k+1UL) * set( B(k+1UL,j) );
3977  }
3978 
3979  for( ; k<K; ++k ) {
3980  xmm1 -= A.load(i,k) * set( B(k,j) );
3981  }
3982 
3983  (~C).store( i, j, xmm1+xmm2 );
3984  }
3985  }
3986 
// Scalar remainder: rows from ipos up to M, executed only if `remainder` is true. Columns
// are handled two at a time plus one optional trailing column; for LOW the column range
// ends after the diagonal (i+1), for UPP it starts at the diagonal (i).
3987  for( ; remainder && i<M; ++i )
3988  {
3989  const size_t jend( LOW ? i+1UL : N );
3990  size_t j( UPP ? i : 0UL );
3991 
3992  for( ; (j+2UL) <= jend; j+=2UL )
3993  {
3994  const size_t kbegin( ( IsLower<MT5>::value )
3995  ?( ( IsUpper<MT4>::value )
3996  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3997  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3998  :( IsUpper<MT4>::value ? i : 0UL ) );
3999  const size_t kend( ( IsUpper<MT5>::value )
4000  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4001  :( K ) );
4002 
4003  ElementType value1( (~C)(i,j ) );
4004  ElementType value2( (~C)(i,j+1UL) );
4005 
4006  for( size_t k=kbegin; k<kend; ++k ) {
4007  value1 -= A(i,k) * B(k,j );
4008  value2 -= A(i,k) * B(k,j+1UL);
4009  }
4010 
4011  (~C)(i,j ) = value1;
4012  (~C)(i,j+1UL) = value2;
4013  }
4014 
// Trailing single column; the k loop runs up to K.
4015  if( j < jend )
4016  {
4017  const size_t kbegin( ( IsLower<MT5>::value )
4018  ?( ( IsUpper<MT4>::value )
4019  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4020  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4021  :( IsUpper<MT4>::value ? i : 0UL ) );
4022 
4023  ElementType value( (~C)(i,j) );
4024 
4025  for( size_t k=kbegin; k<K; ++k ) {
4026  value -= A(i,k) * B(k,j);
4027  }
4028 
4029  (~C)(i,j) = value;
4030  }
4031  }
4032  }
4034  //**********************************************************************************************
4035 
4036  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix subtraction-assignment kernel, fallback overload: forwards all three
// arguments to selectDefaultSubAssignKernel().
// NOTE(review): the return-type / enabling condition of this overload (listing line 4053)
// is not visible in this listing.
4050  template< typename MT3 // Type of the left-hand side target matrix
4051  , typename MT4 // Type of the left-hand side matrix operand
4052  , typename MT5 > // Type of the right-hand side matrix operand
4054  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4055  {
4056  selectDefaultSubAssignKernel( C, A, B );
4057  }
4059  //**********************************************************************************************
4060 
4061  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix subtraction-assignment kernel, vectorized overload. It dispatches on the
// LOW/UPP flags to lmmm() (LOW), ummm() (UPP, checked only if LOW is not set) or mmm(),
// always passing the scalars ElementType(-1) and ElementType(1).
// NOTE(review): presumably these are the factors alpha and beta of C = alpha*A*B + beta*C,
// which yields C -= A*B — confirm against the declarations in blaze/math/dense/MMM.h.
// NOTE(review): the return-type / enabling condition (listing line 4079) is not visible in
// this listing.
4076  template< typename MT3 // Type of the left-hand side target matrix
4077  , typename MT4 // Type of the left-hand side matrix operand
4078  , typename MT5 > // Type of the right-hand side matrix operand
4080  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4081  {
4082  if( LOW )
4083  lmmm( C, A, B, ElementType(-1), ElementType(1) );
4084  else if( UPP )
4085  ummm( C, A, B, ElementType(-1), ElementType(1) );
4086  else
4087  mmm( C, A, B, ElementType(-1), ElementType(1) );
4088  }
4090  //**********************************************************************************************
4091 
4092  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// BLAS subtraction-assignment kernel, fallback overload: no BLAS routine is called here,
// the arguments are forwarded to selectLargeSubAssignKernel().
// NOTE(review): the return-type / enabling condition of this overload (listing line 4109)
// is not visible in this listing.
4106  template< typename MT3 // Type of the left-hand side target matrix
4107  , typename MT4 // Type of the left-hand side matrix operand
4108  , typename MT5 > // Type of the right-hand side matrix operand
4110  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4111  {
4112  selectLargeSubAssignKernel( C, A, B );
4113  }
4115  //**********************************************************************************************
4116 
4117  //**BLAS-based subtraction assignment to dense matrices*****************************************
4118 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4119 
// BLAS-based subtraction-assignment kernel; it is only compiled when the enclosing
// BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION condition holds.
//  - A triangular: B is copied into a temporary, trmm() is applied with A from the left
//    (CblasLeft), and the temporary is subtracted from C.
//  - otherwise B triangular: A is copied into a temporary, trmm() is applied with B from
//    the right (CblasRight), and the temporary is subtracted from C.
//  - otherwise: gemm() is called directly on C with the scalars ET(-1) and ET(1).
// NOTE(review): presumably trmm() multiplies the temporary in place by the triangular
// matrix scaled by ET(1), and gemm() computes C = -1*A*B + 1*C — confirm against the
// wrappers in blaze/math/blas/trmm.h and blaze/math/blas/gemm.h.
// NOTE(review): the return-type / enabling condition (listing line 4135) is not visible in
// this listing.
4132  template< typename MT3 // Type of the left-hand side target matrix
4133  , typename MT4 // Type of the left-hand side matrix operand
4134  , typename MT5 > // Type of the right-hand side matrix operand
4136  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4137  {
4138  using ET = ElementType_<MT3>;
4139 
4140  if( IsTriangular<MT4>::value ) {
// The temporary has the result type of the target and is evaluated serially.
4141  ResultType_<MT3> tmp( serial( B ) );
4142  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
4143  subAssign( C, tmp );
4144  }
4145  else if( IsTriangular<MT5>::value ) {
4146  ResultType_<MT3> tmp( serial( A ) );
4147  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
4148  subAssign( C, tmp );
4149  }
4150  else {
4151  gemm( C, A, B, ET(-1), ET(1) );
4152  }
4153  }
4155 #endif
4156  //**********************************************************************************************
4157 
4158  //**Restructuring subtraction assignment to row-major matrices**********************************
// Restructuring subtraction assignment of the multiplication expression to a row-major
// matrix (Matrix<MT,false>). The product is rebuilt with one or both operands wrapped in
// trans() and passed through the ForwardFunctor before being handed to subAssign() again.
//   lhs  target matrix, accessed through ~lhs
//   rhs  the multiplication expression; its operands are rhs.lhs_ and rhs.rhs_
// NOTE(review): the return-type / enabling condition (listing line 4175) and the condition
// of the first branch (listing line 4187) are not visible in this listing.
4174  template< typename MT > // Type of the target matrix
4176  subAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4177  {
4179 
4181 
4182  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4183  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4184 
4185  const ForwardFunctor fwd;
4186 
// First branch (condition not visible): both operands are transposed.
4188  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
// Left operand type is symmetric: only the left operand is transposed.
4189  else if( IsSymmetric<MT1>::value )
4190  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Otherwise only the right operand is transposed.
4191  else
4192  subAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4193  }
4195  //**********************************************************************************************
4196 
4197  //**Subtraction assignment to sparse matrices***************************************************
4198  // No special implementation for the subtraction assignment to sparse matrices.
4199  //**********************************************************************************************
4200 
4201  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of the multiplication expression to a dense matrix of either
// storage order. The expression is first evaluated serially into a temporary of type
// ResultType; the temporary is then Schur-assigned to the target.
//   lhs  target dense matrix, accessed through ~lhs
//   rhs  the multiplication expression to be evaluated
// The assertions require the dimensions of lhs and rhs to match.
4214  template< typename MT // Type of the target dense matrix
4215  , bool SO > // Storage order of the target dense matrix
4216  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4217  {
4219 
4223 
4224  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4225  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4226 
4227  const ResultType tmp( serial( rhs ) );
4228  schurAssign( ~lhs, tmp );
4229  }
4231  //**********************************************************************************************
4232 
4233  //**Multiplication assignment to dense matrices*************************************************
4234  // No special implementation for the multiplication assignment to dense matrices.
4235  //**********************************************************************************************
4236 
4237  //**Multiplication assignment to sparse matrices************************************************
4238  // No special implementation for the multiplication assignment to sparse matrices.
4239  //**********************************************************************************************
4240 
4241  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the multiplication expression to a dense matrix of either storage order.
//   lhs  target dense matrix, accessed through ~lhs
//   rhs  the multiplication expression; its operands are rhs.lhs_ and rhs.rhs_
// Degenerate sizes are handled first: an empty target returns immediately, and an inner
// dimension of zero resets the target. Otherwise both operands are bound to objects of
// type LT and RT and the product of those objects is assigned through smpAssign().
// NOTE(review): the return-type / enabling condition (listing line 4259) is not visible in
// this listing.
4257  template< typename MT // Type of the target dense matrix
4258  , bool SO > // Storage order of the target dense matrix
4260  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4261  {
4263 
4264  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4265  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4266 
// Nothing to do for an empty target; an empty inner dimension resets the target.
4267  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4268  return;
4269  }
4270  else if( rhs.lhs_.columns() == 0UL ) {
4271  reset( ~lhs );
4272  return;
4273  }
4274 
4275  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4276  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4277 
// Sanity checks: the evaluated operands keep their dimensions and fit the target.
4278  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4279  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4280  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4281  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4282  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4283  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4284 
4285  smpAssign( ~lhs, A * B );
4286  }
4288  //**********************************************************************************************
4289 
4290  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment to a sparse target (see the section banner above this block).
//
// The visible body builds a temporary 'tmp' of type TmpType from 'rhs' and then calls
// smpAssign( ~lhs, fwd( tmp ) ), where 'fwd' is a default-constructed ForwardFunctor.
//
// NOTE(review): the function signature (listing lines 4308-4309) and the lines 4311-4321
// (which presumably include the definition of TmpType) are not visible in this listing.
// Confirm against the original header.
4306  template< typename MT // Type of the target sparse matrix
4307  , bool SO > // Storage order of the target sparse matrix
4310  {
4312 
4314 
4321 
4322  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4323  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4324 
4325  const ForwardFunctor fwd;
4326 
 // Evaluate the expression into a temporary first, then assign the forwarded temporary
4327  const TmpType tmp( rhs );
4328  smpAssign( ~lhs, fwd( tmp ) );
4329  }
4331  //**********************************************************************************************
4332 
4333  //**Restructuring SMP assignment to row-major matrices******************************************
// Restructuring SMP assignment of the multiplication expression 'rhs' to a row-major matrix.
//
// The product is rewritten with one or both operands transposed and passed through the
// ForwardFunctor 'fwd' before being handed to smpAssign():
//   - first branch:                  trans( lhs_ ) * trans( rhs_ )
//   - IsSymmetric<MT1>::value:       trans( lhs_ ) * rhs_
//   - otherwise:                     lhs_ * trans( rhs_ )
//
// NOTE(review): listing line 4361 is missing, so the 'if' condition guarding the first branch
// is not visible (the 'else if' below implies one). The return type line (4349) is missing as
// well. Confirm both against the original header.
4348  template< typename MT > // Type of the target matrix
4350  smpAssign( Matrix<MT,false>& lhs, const TDMatTDMatMultExpr& rhs )
4351  {
4353 
4355 
4356  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4357  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4358 
4359  const ForwardFunctor fwd;
4360 
4362  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4363  else if( IsSymmetric<MT1>::value )
4364  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4365  else
4366  smpAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4367  }
4369  //**********************************************************************************************
4370 
4371  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment to a dense target (see the section banner above this block).
//
// Returns without touching the target when the target is empty or the inner dimension
// (rhs.lhs_.columns()) is zero. Otherwise both operands are bound to A (type LT) and
// B (type RT) and the work is forwarded to smpAddAssign( ~lhs, A * B ).
//
// NOTE(review): the function signature (listing lines 4389-4390) is not visible in this
// listing. Confirm against the original header.
4387  template< typename MT // Type of the target dense matrix
4388  , bool SO > // Storage order of the target dense matrix
4391  {
4393 
4394  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4395  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4396 
 // Unlike plain assignment, an empty inner dimension needs no reset: there is nothing to add
4397  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4398  return;
4399  }
4400 
4401  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4402  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4403 
4404  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4405  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4406  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4407  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4408  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4409  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4410 
4411  smpAddAssign( ~lhs, A * B );
4412  }
4414  //**********************************************************************************************
4415 
4416  //**Restructuring SMP addition assignment to row-major matrices*********************************
// Restructuring SMP addition assignment to a row-major matrix (see the section banner above).
//
// The product is rewritten with one or both operands transposed, passed through the
// ForwardFunctor 'fwd', and handed to smpAddAssign():
//   - first branch:                  trans( lhs_ ) * trans( rhs_ )
//   - IsSymmetric<MT1>::value:       trans( lhs_ ) * rhs_
//   - otherwise:                     lhs_ * trans( rhs_ )
//
// NOTE(review): the function signature (listing lines 4433-4434) and the 'if' condition of
// the first branch (listing line 4445) are not visible in this listing. Confirm against the
// original header.
4432  template< typename MT > // Type of the target matrix
4435  {
4437 
4439 
4440  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4441  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4442 
4443  const ForwardFunctor fwd;
4444 
4446  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4447  else if( IsSymmetric<MT1>::value )
4448  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4449  else
4450  smpAddAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4451  }
4453  //**********************************************************************************************
4454 
4455  //**SMP addition assignment to sparse matrices**************************************************
4456  // No special implementation for the SMP addition assignment to sparse matrices.
4457  //**********************************************************************************************
4458 
4459  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment to a dense target (see the section banner above this block).
//
// Returns without touching the target when the target is empty or the inner dimension
// (rhs.lhs_.columns()) is zero. Otherwise both operands are bound to A (type LT) and
// B (type RT) and the work is forwarded to smpSubAssign( ~lhs, A * B ).
//
// NOTE(review): the function signature (listing lines 4477-4478) is not visible in this
// listing. Confirm against the original header.
4475  template< typename MT // Type of the target dense matrix
4476  , bool SO > // Storage order of the target dense matrix
4479  {
4481 
4482  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4483  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4484 
 // An empty inner dimension needs no reset here: there is nothing to subtract
4485  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4486  return;
4487  }
4488 
4489  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4490  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4491 
4492  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4493  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4494  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4495  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4496  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4497  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4498 
4499  smpSubAssign( ~lhs, A * B );
4500  }
4502  //**********************************************************************************************
4503 
4504  //**Restructuring SMP subtraction assignment to row-major matrices******************************
// Restructuring SMP subtraction assignment to a row-major matrix (see the section banner above).
//
// The product is rewritten with one or both operands transposed, passed through the
// ForwardFunctor 'fwd', and handed to smpSubAssign():
//   - first branch:                  trans( lhs_ ) * trans( rhs_ )
//   - IsSymmetric<MT1>::value:       trans( lhs_ ) * rhs_
//   - otherwise:                     lhs_ * trans( rhs_ )
//
// NOTE(review): the function signature (listing lines 4521-4522) and the 'if' condition of
// the first branch (listing line 4533) are not visible in this listing. Confirm against the
// original header.
4520  template< typename MT > // Type of the target matrix
4523  {
4525 
4527 
4528  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4529  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4530 
4531  const ForwardFunctor fwd;
4532 
4534  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4535  else if( IsSymmetric<MT1>::value )
4536  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4537  else
4538  smpSubAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4539  }
4541  //**********************************************************************************************
4542 
4543  //**SMP subtraction assignment to sparse matrices***********************************************
4544  // No special implementation for the SMP subtraction assignment to sparse matrices.
4545  //**********************************************************************************************
4546 
4547  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the multiplication expression 'rhs' to the dense matrix 'lhs'.
//
// The expression is first evaluated into a temporary of type ResultType; the temporary is then
// passed to smpSchurAssign() together with the target.
//
// NOTE(review): the listing numbering skips 4565 and 4567-4569 inside the body; the content
// of those lines is not visible here.
4561  template< typename MT // Type of the target dense matrix
4562  , bool SO > // Storage order of the target dense matrix
4563  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const TDMatTDMatMultExpr& rhs )
4564  {
4566 
4570 
4571  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4572  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4573 
 // Evaluate the product once, then combine the result with the target
4574  const ResultType tmp( rhs );
4575  smpSchurAssign( ~lhs, tmp );
4576  }
4578  //**********************************************************************************************
4579 
4580  //**SMP Schur product assignment to sparse matrices*********************************************
4581  // No special implementation for the SMP Schur product assignment to sparse matrices.
4582  //**********************************************************************************************
4583 
4584  //**SMP multiplication assignment to dense matrices*********************************************
4585  // No special implementation for the SMP multiplication assignment to dense matrices.
4586  //**********************************************************************************************
4587 
4588  //**SMP multiplication assignment to sparse matrices********************************************
4589  // No special implementation for the SMP multiplication assignment to sparse matrices.
4590  //**********************************************************************************************
4591 
4592  //**Compile time checks*************************************************************************
4600  //**********************************************************************************************
4601 };
4602 //*************************************************************************************************
4603 
4604 
4605 
4606 
4607 //=================================================================================================
4608 //
4609 // DMATSCALARMULTEXPR SPECIALIZATION
4610 //
4611 //=================================================================================================
4612 
4613 //*************************************************************************************************
4621 template< typename MT1 // Type of the left-hand side dense matrix
4622  , typename MT2 // Type of the right-hand side dense matrix
4623  , bool SF // Symmetry flag
4624  , bool HF // Hermitian flag
4625  , bool LF // Lower flag
4626  , bool UF // Upper flag
4627  , typename ST > // Type of the right-hand side scalar value
4628 class DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
4629  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
4630  , private Computation
4631 {
4632  private:
4633  //**Type definitions****************************************************************************
4636 
// Private shorthand aliases: result type of the product expression (RES), result types of the
// two operands (RT1, RT2), their element types (ET1, ET2) and the operands' composite types
// (CT1, CT2).
// NOTE(review): 'MMM' is not declared in the visible lines (the listing numbering skips
// 4634-4636); presumably it aliases the wrapped TDMatTDMatMultExpr type. Confirm.
4637  using RES = ResultType_<MMM>;
4638  using RT1 = ResultType_<MT1>;
4639  using RT2 = ResultType_<MT2>;
4640  using ET1 = ElementType_<RT1>;
4641  using ET2 = ElementType_<RT2>;
4642  using CT1 = CompositeType_<MT1>;
4643  using CT2 = CompositeType_<MT2>;
4644  //**********************************************************************************************
4645 
4646  //**********************************************************************************************
// evaluateLeft: true if the left-hand side operand type MT1 is a computation or requires
// an evaluation (IsComputation / RequiresEvaluation).
4648  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
4649  //**********************************************************************************************
4650 
4651  //**********************************************************************************************
// evaluateRight: the same test applied to the right-hand side operand type MT2.
4653  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
4654  //**********************************************************************************************
4655 
4656  //**********************************************************************************************
// Effective structure flags derived from the template flags SF/HF/LF/UF:
//   SYM  - symmetric flag set and none of the Hermitian/lower/upper flags set
//   HERM - Hermitian flag set and neither the lower nor the upper flag set
//   LOW  - lower flag set, or a symmetric/Hermitian flag combined with the upper flag
//   UPP  - upper flag set, or a symmetric/Hermitian flag combined with the lower flag
// Both LOW and UPP can be true at the same time (ForwardFunctor maps that case to DeclDiag).
4658  enum : bool {
4659  SYM = ( SF && !( HF || LF || UF ) ),
4660  HERM = ( HF && !( LF || UF ) ),
4661  LOW = ( LF || ( ( SF || HF ) && UF ) ),
4662  UPP = ( UF || ( ( SF || HF ) && LF ) )
4663  };
4664  //**********************************************************************************************
4665 
4666  //**********************************************************************************************
4668 
// Compile-time predicate; the visible part of its value requires T1 to be a row-major matrix.
// NOTE(review): listing line 4676, which carries the remainder of the condition and closes
// the enum, is not visible here. Confirm the full condition against the original header.
4673  template< typename T1, typename T2, typename T3 >
4674  struct CanExploitSymmetry {
4675  enum : bool { value = IsRowMajorMatrix<T1>::value &&
4677  };
4678  //**********************************************************************************************
4679 
4680  //**********************************************************************************************
4682 
// Compile-time predicate: true if at least one operand has to be evaluated
// (evaluateLeft || evaluateRight) and CanExploitSymmetry<T1,T2,T3> does not hold.
4685  template< typename T1, typename T2, typename T3 >
4686  struct IsEvaluationRequired {
4687  enum : bool { value = ( evaluateLeft || evaluateRight ) &&
4688  !CanExploitSymmetry<T1,T2,T3>::value };
4689  };
4690  //**********************************************************************************************
4691 
4692  //**********************************************************************************************
4694 
// Compile-time predicate for selecting a BLAS kernel. The visible conditions require BLAS mode
// and BLAS matrix/matrix multiplication to be enabled, none of SYM/HERM/LOW/UPP to be set,
// all three matrix types to be SIMD-enabled, and T1 and T3 to share the same element type.
// NOTE(review): listing lines 4700-4703, 4705-4708 and 4710 are not visible, so further
// conditions (and the end of the enum) are missing from this view. Confirm against the
// original header.
4696  template< typename T1, typename T2, typename T3, typename T4 >
4697  struct UseBlasKernel {
4698  enum : bool { value = BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4699  !SYM && !HERM && !LOW && !UPP &&
4704  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4709  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
4711  };
4712  //**********************************************************************************************
4713 
4714  //**********************************************************************************************
4716 
// Compile-time predicate for selecting the vectorized default kernel. The visible conditions
// require useOptimizedKernels, SIMD-enabled matrix types T1/T2/T3, SIMD addition for the
// element type of T2 and SIMD multiplication for the element type of T3.
// NOTE(review): listing lines 4721 and 4723-4725 are not visible; the expression ending in
// ", T4 >::value" below is the tail of a condition whose beginning is missing. Confirm
// against the original header.
4718  template< typename T1, typename T2, typename T3, typename T4 >
4719  struct UseVectorizedDefaultKernel {
4720  enum : bool { value = useOptimizedKernels &&
4722  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4726  , T4 >::value &&
4727  HasSIMDAdd< ElementType_<T2>, ElementType_<T2> >::value &&
4728  HasSIMDMult< ElementType_<T3>, ElementType_<T3> >::value };
4729  };
4730  //**********************************************************************************************
4731 
4732  //**********************************************************************************************
4734 
// Functor type selected from the structure flags, checked in this order:
//   HERM          -> DeclHerm
//   SYM           -> DeclSym
//   LOW && UPP    -> DeclDiag
//   LOW only      -> DeclLow
//   UPP only      -> DeclUpp
//   none          -> Noop
4736  using ForwardFunctor = IfTrue_< HERM
4737  , DeclHerm
4738  , IfTrue_< SYM
4739  , DeclSym
4740  , IfTrue_< LOW
4741  , IfTrue_< UPP
4742  , DeclDiag
4743  , DeclLow >
4744  , IfTrue_< UPP
4745  , DeclUpp
4746  , Noop > > > >;
4747  //**********************************************************************************************
4748 
4749  public:
4750  //**Type definitions****************************************************************************
// Public type aliases of the expression: the result type is the multiplication trait of the
// product's result type (RES) and the scalar type (ST); elements are returned by const value;
// the composite type is the (const) result type; the right operand is the scalar type.
// NOTE(review): the listing numbering skips 4753-4756 and 4760-4763, so further aliases
// (e.g. ElementType and LeftOperand, which are used elsewhere in this class) are not visible.
4752  using ResultType = MultTrait_<RES,ST>;
4757  using ReturnType = const ElementType;
4758  using CompositeType = const ResultType;
4759 
4762 
4764  using RightOperand = ST;
4765 
4768 
4771  //**********************************************************************************************
4772 
4773  //**Compilation flags***************************************************************************
// simdEnabled: the visible conditions require MT1 not to be diagonal and both operand types
// to be SIMD-enabled.
// NOTE(review): listing lines 4777-4779, which carry the rest of the condition and close the
// enum, are not visible here. Confirm against the original header.
4775  enum : bool { simdEnabled = !IsDiagonal<MT1>::value &&
4776  MT1::simdEnabled && MT2::simdEnabled &&
4780 
// smpAssignable: true only if neither operand requires evaluation and both operand types are
// themselves SMP-assignable.
4782  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4783  !evaluateRight && MT2::smpAssignable };
4784  //**********************************************************************************************
4785 
4786  //**SIMD properties*****************************************************************************
// SIMDSIZE: the 'size' constant of SIMDTrait for the expression's element type.
4788  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
4789  //**********************************************************************************************
4790 
4791  //**Constructor*********************************************************************************
// Constructor: stores the matrix multiplication expression 'matrix' and the scalar 'scalar'
// in the members matrix_ and scalar_.
4797  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4798  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4799  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4800  {}
4801  //**********************************************************************************************
4802 
4803  //**Access operator*****************************************************************************
// Element access: returns matrix_(i,j) * scalar_. The indices are only checked by internal
// assertions (i < rows, j < columns); use at() for access that throws on invalid indices.
4810  inline ReturnType operator()( size_t i, size_t j ) const {
4811  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4812  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4813  return matrix_(i,j) * scalar_;
4814  }
4815  //**********************************************************************************************
4816 
4817  //**At function*********************************************************************************
// Checked element access: raises an out-of-range error via BLAZE_THROW_OUT_OF_RANGE if
// i >= rows or j >= columns of the wrapped expression, otherwise delegates to operator().
4825  inline ReturnType at( size_t i, size_t j ) const {
4826  if( i >= matrix_.rows() ) {
4827  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4828  }
4829  if( j >= matrix_.columns() ) {
4830  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4831  }
4832  return (*this)(i,j);
4833  }
4834  //**********************************************************************************************
4835 
4836  //**Rows function*******************************************************************************
// Returns the number of rows of the wrapped matrix multiplication expression.
4841  inline size_t rows() const {
4842  return matrix_.rows();
4843  }
4844  //**********************************************************************************************
4845 
4846  //**Columns function****************************************************************************
// Returns the number of columns of the wrapped matrix multiplication expression.
4851  inline size_t columns() const {
4852  return matrix_.columns();
4853  }
4854  //**********************************************************************************************
4855 
4856  //**Left operand access*************************************************************************
// Returns the left-hand side operand, i.e. the stored matrix multiplication expression.
4861  inline LeftOperand leftOperand() const {
4862  return matrix_;
4863  }
4864  //**********************************************************************************************
4865 
4866  //**Right operand access************************************************************************
// Returns the right-hand side operand, i.e. the stored scalar value.
4871  inline RightOperand rightOperand() const {
4872  return scalar_;
4873  }
4874  //**********************************************************************************************
4875 
4876  //**********************************************************************************************
// Reports whether the expression can alias with the address 'alias'; the query is forwarded
// unchanged to the wrapped matrix multiplication expression.
4882  template< typename T >
4883  inline bool canAlias( const T* alias ) const {
4884  return matrix_.canAlias( alias );
4885  }
4886  //**********************************************************************************************
4887 
4888  //**********************************************************************************************
// Reports whether the expression is aliased with the address 'alias'; the query is forwarded
// unchanged to the wrapped matrix multiplication expression.
4894  template< typename T >
4895  inline bool isAliased( const T* alias ) const {
4896  return matrix_.isAliased( alias );
4897  }
4898  //**********************************************************************************************
4899 
4900  //**********************************************************************************************
// Reports the alignment state of the wrapped matrix multiplication expression.
4905  inline bool isAligned() const {
4906  return matrix_.isAligned();
4907  }
4908  //**********************************************************************************************
4909 
4910  //**********************************************************************************************
// Reports whether the expression can be used in SMP assignments. As visible here, the result
// is true if rows()*columns() reaches SMP_TDMATTDMATMULT_THRESHOLD and, in addition, BLAS mode
// is off, or BLAS matrix/matrix multiplication is off, or rows()*columns() is below
// TDMATTDMATMULT_THRESHOLD.
// NOTE(review): listing line 4918 is not visible; it presumably adds one more alternative to
// the first group of conditions. Confirm against the original header.
4915  inline bool canSMPAssign() const noexcept {
4916  return ( !BLAZE_BLAS_MODE ||
4917  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4919  ( rows() * columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4920  ( rows() * columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
4921  }
4922  //**********************************************************************************************
4923 
4924  private:
4925  //**Member variables****************************************************************************
// matrix_: the matrix multiplication expression being scaled (set by the constructor).
4926  LeftOperand matrix_;
// scalar_: the scalar factor applied to the product (set by the constructor).
4927  RightOperand scalar_;
4928  //**********************************************************************************************
4929 
4930  //**Assignment to dense matrices****************************************************************
// Assignment of the scaled matrix multiplication expression 'rhs' to the dense matrix 'lhs'.
//
// The two operands of the wrapped product are fetched via leftOperand()/rightOperand(). An
// empty target returns immediately; if the inner dimension (left.columns()) is zero the target
// is reset. Otherwise the operands are bound serially to A (type LT) and B (type RT) and
// selectAssignKernel() is called with the target, both operands and the scalar.
//
// NOTE(review): the listing numbering skips 4944 (presumably the return type line) and 4947.
// Confirm against the original header.
4942  template< typename MT // Type of the target dense matrix
4943  , bool SO > // Storage order of the target dense matrix
4945  assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4946  {
4948 
4949  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4950  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4951 
4952  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
4953  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
4954 
 // Early exits: nothing to write for an empty target; reset the target for an empty inner dimension
4955  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4956  return;
4957  }
4958  else if( left.columns() == 0UL ) {
4959  reset( ~lhs );
4960  return;
4961  }
4962 
4963  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4964  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4965 
4966  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4967  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4968  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4969  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4970  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4971  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4972 
4973  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4974  }
4975  //**********************************************************************************************
4976 
4977  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = ( A * B ) * scalar.
//
// The small-matrix kernel is chosen if the left operand type is diagonal, or if (outside of
// debug mode) A has at most SIMDSIZE*10 rows, or if the number of elements of C is below
// TDMATTDMATMULT_THRESHOLD. In all other cases the BLAS kernel selection is used.
4988  template< typename MT3 // Type of the left-hand side target matrix
4989  , typename MT4 // Type of the left-hand side matrix operand
4990  , typename MT5 // Type of the right-hand side matrix operand
4991  , typename ST2 > // Type of the scalar value
4992  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4993  {
4994  if( ( IsDiagonal<MT4>::value ) ||
4995  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
4996  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
4997  selectSmallAssignKernel( C, A, B, scalar );
4998  else
4999  selectBlasAssignKernel( C, A, B, scalar );
5000  }
5001  //**********************************************************************************************
5002 
5003  //**Default assignment to dense matrices (general/general)**************************************
// Default (scalar, non-vectorized) kernel for C = ( A * B ) * scalar, general/general case
// (see the section banner above this block).
//
// C is computed column by column. For each column j:
//   1. [kbegin,kend) is the range of the inner index k, narrowed according to the lower/upper
//      structure of B's type (MT5).
//   2. If MT5 is strictly triangular and the range is empty, the whole column is reset.
//   3. The first inner index (kbegin) initializes C(i,j) by assignment; rows outside the
//      computed range [ibegin,iend) are reset where the structure flags require it.
//   4. The remaining inner indices accumulate into C(i,j) with '+='.
//   5. The computed rows of the column are multiplied by 'scalar'.
// After all columns are done, for SYM or HERM the elements above the diagonal are filled from
// the elements below it (conjugated for HERM).
//
// NOTE(review): several listing lines are not visible (5021, presumably the return type /
// enabling condition; 5049, 5054, 5083 and 5088, each the inner condition of a nested ternary;
// 5106 and 5109, the first alternatives of the final ibegin/iend). The comments here describe
// only what the visible lines show. Confirm against the original header.
5017  template< typename MT3 // Type of the left-hand side target matrix
5018  , typename MT4 // Type of the left-hand side matrix operand
5019  , typename MT5 // Type of the right-hand side matrix operand
5020  , typename ST2 > // Type of the scalar value
5022  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5023  {
5024  const size_t M( A.rows() );
5025  const size_t N( B.columns() );
5026  const size_t K( A.columns() );
5027 
5028  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5029 
5030  for( size_t j=0UL; j<N; ++j )
5031  {
 // Inner index range for column j, restricted by the structure of B
5032  const size_t kbegin( ( IsLower<MT5>::value )
5033  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5034  :( 0UL ) );
5035  const size_t kend( ( IsUpper<MT5>::value )
5036  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
5037  :( K ) );
5038  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5039 
 // Empty inner range for a strictly triangular B: the whole column is reset
5040  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
5041  for( size_t i=0UL; i<M; ++i ) {
5042  reset( C(i,j) );
5043  }
5044  continue;
5045  }
5046 
 // First inner index: initialize the column by assignment and reset rows outside the range
5047  {
5048  const size_t ibegin( ( IsLower<MT4>::value )
5050  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
5051  :( LOW ? max(j,kbegin) : kbegin ) )
5052  :( LOW ? j : 0UL ) );
5053  const size_t iend( ( IsUpper<MT4>::value )
5055  ?( UPP ? min(j+1UL,kbegin) : kbegin )
5056  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
5057  :( UPP ? j+1UL : M ) );
5058 
5059  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
5060  for( size_t i=0UL; i<ibegin; ++i ) {
5061  reset( C(i,j) );
5062  }
5063  }
5064  else if( IsStrictlyLower<MT4>::value ) {
5065  reset( C(0UL,j) );
5066  }
5067  for( size_t i=ibegin; i<iend; ++i ) {
5068  C(i,j) = A(i,kbegin) * B(kbegin,j);
5069  }
5070  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
5071  for( size_t i=iend; i<M; ++i ) {
5072  reset( C(i,j) );
5073  }
5074  }
5075  else if( IsStrictlyUpper<MT4>::value ) {
5076  reset( C(M-1UL,j) );
5077  }
5078  }
5079 
 // Remaining inner indices: accumulate into the column
5080  for( size_t k=kbegin+1UL; k<kend; ++k )
5081  {
5082  const size_t ibegin( ( IsLower<MT4>::value )
5084  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
5085  :( SYM || HERM || LOW ? max( j, k ) : k ) )
5086  :( SYM || HERM || LOW ? j : 0UL ) );
5087  const size_t iend( ( IsUpper<MT4>::value )
5089  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
5090  :( UPP ? min(j+1UL,k) : k ) )
5091  :( UPP ? j+1UL : M ) );
5092 
5093  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
5094  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5095 
5096  for( size_t i=ibegin; i<iend; ++i ) {
5097  C(i,j) += A(i,k) * B(k,j);
5098  }
 // Row iend is assigned rather than accumulated for an upper A (presumably because it has
 // not been written for any earlier k - confirm)
5099  if( IsUpper<MT4>::value ) {
5100  C(iend,j) = A(iend,k) * B(k,j);
5101  }
5102  }
5103 
 // Scale the computed rows of the column
5104  {
5105  const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
5107  :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
5108  const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
5110  :( UPP ? j+1UL : M ) );
5111 
5112  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
5113  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5114 
5115  for( size_t i=ibegin; i<iend; ++i ) {
5116  C(i,j) *= scalar;
5117  }
5118  }
5119  }
5120 
 // Symmetric/Hermitian result: fill the elements above the diagonal from those below it
5121  if( SYM || HERM ) {
5122  for( size_t j=1UL; j<N; ++j ) {
5123  for( size_t i=0UL; i<j; ++i ) {
5124  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5125  }
5126  }
5127  }
5128  }
5129  //**********************************************************************************************
5130 
5131  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default kernel for C = ( A * B ) * scalar, enabled when A's type (MT4) is not diagonal and
// B's type (MT5) is diagonal.
//
// Only the diagonal element B(j,j) contributes to column j, so each element is computed as
// C(i,j) = A(i,j) * B(j,j) * scalar. The row range [ibegin,iend) is narrowed according to the
// lower/upper structure of MT4; rows outside that range are reset.
//
// NOTE(review): listing line 5152 at the start of the body is not visible here.
5145  template< typename MT3 // Type of the left-hand side target matrix
5146  , typename MT4 // Type of the left-hand side matrix operand
5147  , typename MT5 // Type of the right-hand side matrix operand
5148  , typename ST2 > // Type of the scalar value
5149  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
5150  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5151  {
5153 
5154  const size_t M( A.rows() );
5155  const size_t N( B.columns() );
5156 
5157  for( size_t j=0UL; j<N; ++j )
5158  {
 // Row range of column j that is computed, derived from the structure of A
5159  const size_t ibegin( ( IsLower<MT4>::value )
5160  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
5161  :( 0UL ) );
5162  const size_t iend( ( IsUpper<MT4>::value )
5163  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
5164  :( M ) );
5165  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5166 
5167  if( IsLower<MT4>::value ) {
5168  for( size_t i=0UL; i<ibegin; ++i ) {
5169  reset( C(i,j) );
5170  }
5171  }
5172  for( size_t i=ibegin; i<iend; ++i ) {
5173  C(i,j) = A(i,j) * B(j,j) * scalar;
5174  }
5175  if( IsUpper<MT4>::value ) {
5176  for( size_t i=iend; i<M; ++i ) {
5177  reset( C(i,j) );
5178  }
5179  }
5180  }
5181  }
5182  //**********************************************************************************************
5183 
5184  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default kernel for C = ( A * B ) * scalar, diagonal/general case (see the section banner
// above this block).
//
// Only the diagonal element A(i,i) contributes to row i, so each element is computed as
// C(i,j) = A(i,i) * B(i,j) * scalar. The row range [ibegin,iend) of each column is narrowed
// according to the lower/upper structure of B's type (MT5); rows outside that range are reset.
//
// NOTE(review): the bounds are derived from MT5 while the guards around the two reset loops
// test IsLower<MT4>/IsUpper<MT4>. When a guard is true but MT5 has no matching structure the
// corresponding loop is empty (ibegin == 0 or iend == M), so the visible code does not write
// outside the intended range. Confirm that testing MT4 here is intended.
// NOTE(review): listing lines 5202 (presumably the return type / enabling condition) and 5205
// are not visible here.
5198  template< typename MT3 // Type of the left-hand side target matrix
5199  , typename MT4 // Type of the left-hand side matrix operand
5200  , typename MT5 // Type of the right-hand side matrix operand
5201  , typename ST2 > // Type of the scalar value
5203  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5204  {
5206 
5207  const size_t M( A.rows() );
5208  const size_t N( B.columns() );
5209 
5210  for( size_t j=0UL; j<N; ++j )
5211  {
 // Row range of column j that is computed, derived from the structure of B
5212  const size_t ibegin( ( IsLower<MT5>::value )
5213  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5214  :( 0UL ) );
5215  const size_t iend( ( IsUpper<MT5>::value )
5216  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
5217  :( M ) );
5218  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
5219 
5220  if( IsLower<MT4>::value ) {
5221  for( size_t i=0UL; i<ibegin; ++i ) {
5222  reset( C(i,j) );
5223  }
5224  }
5225  for( size_t i=ibegin; i<iend; ++i ) {
5226  C(i,j) = A(i,i) * B(i,j) * scalar;
5227  }
5228  if( IsUpper<MT4>::value ) {
5229  for( size_t i=iend; i<M; ++i ) {
5230  reset( C(i,j) );
5231  }
5232  }
5233  }
5234  }
5235  //**********************************************************************************************
5236 
5237  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default kernel for C = ( A * B ) * scalar, enabled when both operand types are diagonal.
//
// The whole target is reset first; afterwards only the diagonal is written, with
// C(i,i) = A(i,i) * B(i,i) * scalar for every row i of A.
//
// NOTE(review): listing line 5258 at the start of the body is not visible here.
5251  template< typename MT3 // Type of the left-hand side target matrix
5252  , typename MT4 // Type of the left-hand side matrix operand
5253  , typename MT5 // Type of the right-hand side matrix operand
5254  , typename ST2 > // Type of the scalar value
5255  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
5256  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5257  {
5259 
5260  reset( C );
5261 
5262  for( size_t i=0UL; i<A.rows(); ++i ) {
5263  C(i,i) = A(i,i) * B(i,i) * scalar;
5264  }
5265  }
5266  //**********************************************************************************************
5267 
5268  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix kernel, default variant: simply forwards to selectDefaultAssignKernel().
// NOTE(review): listing line 5286 (presumably the return type / enabling condition that
// distinguishes this overload from the vectorized ones) is not visible here.
5282  template< typename MT3 // Type of the left-hand side target matrix
5283  , typename MT4 // Type of the left-hand side matrix operand
5284  , typename MT5 // Type of the right-hand side matrix operand
5285  , typename ST2 > // Type of the scalar value
5287  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5288  {
5289  selectDefaultAssignKernel( C, A, B, scalar );
5290  }
5291  //**********************************************************************************************
5292 
5293  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
// Small-matrix kernel for a row-major dense target (see the section banner above this block).
//
// Every visible branch serially evaluates one operand into a temporary of its OppositeType_
// and re-dispatches to assign() with the forwarded product times 'scalar': either
// fwd( A * tmp ) with tmp built from B, or fwd( tmp * B ) with tmp built from A. In the last
// two branches B is converted when it has no more elements than A, otherwise A is converted.
//
// NOTE(review): listing lines 5312 (presumably the return type / enabling condition),
// 5315-5318, 5322 and 5326 are not visible. The last two are the conditions of the first two
// branches, which is why the braces below look unbalanced in this view. Confirm against the
// original header.
5308  template< typename MT3 // Type of the left-hand side target matrix
5309  , typename MT4 // Type of the left-hand side matrix operand
5310  , typename MT5 // Type of the right-hand side matrix operand
5311  , typename ST2 > // Type of the scalar value
5313  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5314  {
5319 
5320  const ForwardFunctor fwd;
5321 
5323  const OppositeType_<MT5> tmp( serial( B ) );
5324  assign( ~C, fwd( A * tmp ) * scalar );
5325  }
5327  const OppositeType_<MT4> tmp( serial( A ) );
5328  assign( ~C, fwd( tmp * B ) * scalar );
5329  }
 // Neither earlier branch applies: convert whichever operand has fewer elements (B on a tie)
5330  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5331  const OppositeType_<MT5> tmp( serial( B ) );
5332  assign( ~C, fwd( A * tmp ) * scalar );
5333  }
5334  else {
5335  const OppositeType_<MT4> tmp( serial( A ) );
5336  assign( ~C, fwd( tmp * B ) * scalar );
5337  }
5338  }
5339  //**********************************************************************************************
5340 
5341  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
5356  template< typename MT3 // Type of the left-hand side target matrix
5357  , typename MT4 // Type of the left-hand side matrix operand
5358  , typename MT5 // Type of the right-hand side matrix operand
5359  , typename ST2 > // Type of the scalar value
5361  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5362  {
5363  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
5364 
5365  const size_t M( A.rows() );
5366  const size_t N( B.columns() );
5367  const size_t K( A.columns() );
5368 
5369  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5370 
5371  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
5372  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5373 
5374  const SIMDType factor( set( scalar ) );
5375 
5376  if( LOW && UPP && M > SIMDSIZE*3UL ) {
5377  reset( ~C );
5378  }
5379 
5380  {
5381  size_t i( 0UL );
5382 
5384  {
5385  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5386  for( size_t j=0UL; j<N; ++j )
5387  {
5388  const size_t kbegin( ( IsLower<MT5>::value )
5389  ?( ( IsUpper<MT4>::value )
5390  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5391  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5392  :( IsUpper<MT4>::value ? i : 0UL ) );
5393  const size_t kend( ( IsUpper<MT5>::value )
5394  ?( ( IsLower<MT4>::value )
5395  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
5396  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
5397  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
5398 
5399  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5400 
5401  for( size_t k=kbegin; k<kend; ++k ) {
5402  const SIMDType b1( set( B(k,j) ) );
5403  xmm1 += A.load(i ,k) * b1;
5404  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5405  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5406  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5407  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5408  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
5409  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
5410  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
5411  }
5412 
5413  (~C).store( i , j, xmm1 * factor );
5414  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5415  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5416  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5417  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5418  (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
5419  (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
5420  (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
5421  }
5422  }
5423  }
5424 
5425  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5426  {
5427  size_t j( 0UL );
5428 
5429  for( ; (j+2UL) <= N; j+=2UL )
5430  {
5431  const size_t kbegin( ( IsLower<MT5>::value )
5432  ?( ( IsUpper<MT4>::value )
5433  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5434  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5435  :( IsUpper<MT4>::value ? i : 0UL ) );
5436  const size_t kend( ( IsUpper<MT5>::value )
5437  ?( ( IsLower<MT4>::value )
5438  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5439  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5440  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
5441 
5442  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5443 
5444  for( size_t k=kbegin; k<kend; ++k ) {
5445  const SIMDType a1( A.load(i ,k) );
5446  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5447  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5448  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5449  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5450  const SIMDType b1( set( B(k,j ) ) );
5451  const SIMDType b2( set( B(k,j+1UL) ) );
5452  xmm1 += a1 * b1;
5453  xmm2 += a2 * b1;
5454  xmm3 += a3 * b1;
5455  xmm4 += a4 * b1;
5456  xmm5 += a5 * b1;
5457  xmm6 += a1 * b2;
5458  xmm7 += a2 * b2;
5459  xmm8 += a3 * b2;
5460  xmm9 += a4 * b2;
5461  xmm10 += a5 * b2;
5462  }
5463 
5464  (~C).store( i , j , xmm1 * factor );
5465  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5466  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5467  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5468  (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
5469  (~C).store( i , j+1UL, xmm6 * factor );
5470  (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
5471  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
5472  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
5473  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
5474  }
5475 
5476  if( j < N )
5477  {
5478  const size_t kbegin( ( IsLower<MT5>::value )
5479  ?( ( IsUpper<MT4>::value )
5480  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5481  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5482  :( IsUpper<MT4>::value ? i : 0UL ) );
5483  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
5484 
5485  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5486 
5487  for( size_t k=kbegin; k<kend; ++k ) {
5488  const SIMDType b1( set( B(k,j) ) );
5489  xmm1 += A.load(i ,k) * b1;
5490  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5491  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5492  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5493  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
5494  }
5495 
5496  (~C).store( i , j, xmm1 * factor );
5497  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5498  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5499  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5500  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
5501  }
5502  }
5503 
5504  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5505  {
5506  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
5507  size_t j( UPP ? i : 0UL );
5508 
5509  for( ; (j+2UL) <= jend; j+=2UL )
5510  {
5511  const size_t kbegin( ( IsLower<MT5>::value )
5512  ?( ( IsUpper<MT4>::value )
5513  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5514  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5515  :( IsUpper<MT4>::value ? i : 0UL ) );
5516  const size_t kend( ( IsUpper<MT5>::value )
5517  ?( ( IsLower<MT4>::value )
5518  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5519  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5520  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
5521 
5522  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5523 
5524  for( size_t k=kbegin; k<kend; ++k ) {
5525  const SIMDType a1( A.load(i ,k) );
5526  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5527  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5528  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5529  const SIMDType b1( set( B(k,j ) ) );
5530  const SIMDType b2( set( B(k,j+1UL) ) );
5531  xmm1 += a1 * b1;
5532  xmm2 += a2 * b1;
5533  xmm3 += a3 * b1;
5534  xmm4 += a4 * b1;
5535  xmm5 += a1 * b2;
5536  xmm6 += a2 * b2;
5537  xmm7 += a3 * b2;
5538  xmm8 += a4 * b2;
5539  }
5540 
5541  (~C).store( i , j , xmm1 * factor );
5542  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5543  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5544  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
5545  (~C).store( i , j+1UL, xmm5 * factor );
5546  (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
5547  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
5548  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
5549  }
5550 
5551  if( j < jend )
5552  {
5553  const size_t kbegin( ( IsLower<MT5>::value )
5554  ?( ( IsUpper<MT4>::value )
5555  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5556  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5557  :( IsUpper<MT4>::value ? i : 0UL ) );
5558  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
5559 
5560  SIMDType xmm1, xmm2, xmm3, xmm4;
5561 
5562  for( size_t k=kbegin; k<kend; ++k ) {
5563  const SIMDType b1( set( B(k,j) ) );
5564  xmm1 += A.load(i ,k) * b1;
5565  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5566  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5567  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
5568  }
5569 
5570  (~C).store( i , j, xmm1 * factor );
5571  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5572  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5573  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
5574  }
5575  }
5576 
5577  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5578  {
5579  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
5580  size_t j( UPP ? i : 0UL );
5581 
5582  for( ; (j+2UL) <= jend; j+=2UL )
5583  {
5584  const size_t kbegin( ( IsLower<MT5>::value )
5585  ?( ( IsUpper<MT4>::value )
5586  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5587  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5588  :( IsUpper<MT4>::value ? i : 0UL ) );
5589  const size_t kend( ( IsUpper<MT5>::value )
5590  ?( ( IsLower<MT4>::value )
5591  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5592  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5593  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
5594 
5595  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5596 
5597  for( size_t k=kbegin; k<kend; ++k ) {
5598  const SIMDType a1( A.load(i ,k) );
5599  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5600  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5601  const SIMDType b1( set( B(k,j ) ) );
5602  const SIMDType b2( set( B(k,j+1UL) ) );
5603  xmm1 += a1 * b1;
5604  xmm2 += a2 * b1;
5605  xmm3 += a3 * b1;
5606  xmm4 += a1 * b2;
5607  xmm5 += a2 * b2;
5608  xmm6 += a3 * b2;
5609  }
5610 
5611  (~C).store( i , j , xmm1 * factor );
5612  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
5613  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
5614  (~C).store( i , j+1UL, xmm4 * factor );
5615  (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
5616  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
5617  }
5618 
5619  if( j < jend )
5620  {
5621  const size_t kbegin( ( IsLower<MT5>::value )
5622  ?( ( IsUpper<MT4>::value )
5623  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5624  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5625  :( IsUpper<MT4>::value ? i : 0UL ) );
5626  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
5627 
5628  SIMDType xmm1, xmm2, xmm3;
5629 
5630  for( size_t k=kbegin; k<kend; ++k ) {
5631  const SIMDType b1( set( B(k,j) ) );
5632  xmm1 += A.load(i ,k) * b1;
5633  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
5634  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
5635  }
5636 
5637  (~C).store( i , j, xmm1 * factor );
5638  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
5639  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
5640  }
5641  }
5642 
5643  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5644  {
5645  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
5646  size_t j( UPP ? i : 0UL );
5647 
5648  for( ; (j+4UL) <= jend; j+=4UL )
5649  {
5650  const size_t kbegin( ( IsLower<MT5>::value )
5651  ?( ( IsUpper<MT4>::value )
5652  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5653  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5654  :( IsUpper<MT4>::value ? i : 0UL ) );
5655  const size_t kend( ( IsUpper<MT5>::value )
5656  ?( ( IsLower<MT4>::value )
5657  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
5658  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
5659  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
5660 
5661  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5662 
5663  for( size_t k=kbegin; k<kend; ++k ) {
5664  const SIMDType a1( A.load(i ,k) );
5665  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5666  const SIMDType b1( set( B(k,j ) ) );
5667  const SIMDType b2( set( B(k,j+1UL) ) );
5668  const SIMDType b3( set( B(k,j+2UL) ) );
5669  const SIMDType b4( set( B(k,j+3UL) ) );
5670  xmm1 += a1 * b1;
5671  xmm2 += a2 * b1;
5672  xmm3 += a1 * b2;
5673  xmm4 += a2 * b2;
5674  xmm5 += a1 * b3;
5675  xmm6 += a2 * b3;
5676  xmm7 += a1 * b4;
5677  xmm8 += a2 * b4;
5678  }
5679 
5680  (~C).store( i , j , xmm1 * factor );
5681  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
5682  (~C).store( i , j+1UL, xmm3 * factor );
5683  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5684  (~C).store( i , j+2UL, xmm5 * factor );
5685  (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5686  (~C).store( i , j+3UL, xmm7 * factor );
5687  (~C).store( i+SIMDSIZE, j+3UL, xmm8 * factor );
5688  }
5689 
5690  for( ; (j+3UL) <= jend; j+=3UL )
5691  {
5692  const size_t kbegin( ( IsLower<MT5>::value )
5693  ?( ( IsUpper<MT4>::value )
5694  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5695  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5696  :( IsUpper<MT4>::value ? i : 0UL ) );
5697  const size_t kend( ( IsUpper<MT5>::value )
5698  ?( ( IsLower<MT4>::value )
5699  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
5700  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
5701  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
5702 
5703  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5704 
5705  for( size_t k=kbegin; k<kend; ++k ) {
5706  const SIMDType a1( A.load(i ,k) );
5707  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5708  const SIMDType b1( set( B(k,j ) ) );
5709  const SIMDType b2( set( B(k,j+1UL) ) );
5710  const SIMDType b3( set( B(k,j+2UL) ) );
5711  xmm1 += a1 * b1;
5712  xmm2 += a2 * b1;
5713  xmm3 += a1 * b2;
5714  xmm4 += a2 * b2;
5715  xmm5 += a1 * b3;
5716  xmm6 += a2 * b3;
5717  }
5718 
5719  (~C).store( i , j , xmm1 * factor );
5720  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
5721  (~C).store( i , j+1UL, xmm3 * factor );
5722  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
5723  (~C).store( i , j+2UL, xmm5 * factor );
5724  (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
5725  }
5726 
5727  for( ; (j+2UL) <= jend; j+=2UL )
5728  {
5729  const size_t kbegin( ( IsLower<MT5>::value )
5730  ?( ( IsUpper<MT4>::value )
5731  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5732  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5733  :( IsUpper<MT4>::value ? i : 0UL ) );
5734  const size_t kend( ( IsUpper<MT5>::value )
5735  ?( ( IsLower<MT4>::value )
5736  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5737  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5738  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
5739 
5740  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5741  size_t k( kbegin );
5742 
5743  for( ; (k+2UL) <= kend; k+=2UL ) {
5744  const SIMDType a1( A.load(i ,k ) );
5745  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
5746  const SIMDType a3( A.load(i ,k+1UL) );
5747  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
5748  const SIMDType b1( set( B(k ,j ) ) );
5749  const SIMDType b2( set( B(k ,j+1UL) ) );
5750  const SIMDType b3( set( B(k+1UL,j ) ) );
5751  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
5752  xmm1 += a1 * b1;
5753  xmm2 += a2 * b1;
5754  xmm3 += a1 * b2;
5755  xmm4 += a2 * b2;
5756  xmm5 += a3 * b3;
5757  xmm6 += a4 * b3;
5758  xmm7 += a3 * b4;
5759  xmm8 += a4 * b4;
5760  }
5761 
5762  for( ; k<kend; ++k ) {
5763  const SIMDType a1( A.load(i ,k) );
5764  const SIMDType a2( A.load(i+SIMDSIZE,k) );
5765  const SIMDType b1( set( B(k,j ) ) );
5766  const SIMDType b2( set( B(k,j+1UL) ) );
5767  xmm1 += a1 * b1;
5768  xmm2 += a2 * b1;
5769  xmm3 += a1 * b2;
5770  xmm4 += a2 * b2;
5771  }
5772 
5773  (~C).store( i , j , (xmm1+xmm5) * factor );
5774  (~C).store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
5775  (~C).store( i , j+1UL, (xmm3+xmm7) * factor );
5776  (~C).store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
5777  }
5778 
5779  if( j < jend )
5780  {
5781  const size_t kbegin( ( IsLower<MT5>::value )
5782  ?( ( IsUpper<MT4>::value )
5783  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5784  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5785  :( IsUpper<MT4>::value ? i : 0UL ) );
5786  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
5787 
5788  SIMDType xmm1, xmm2, xmm3, xmm4;
5789  size_t k( kbegin );
5790 
5791  for( ; (k+2UL) <= kend; k+=2UL ) {
5792  const SIMDType b1( set( B(k ,j) ) );
5793  const SIMDType b2( set( B(k+1UL,j) ) );
5794  xmm1 += A.load(i ,k ) * b1;
5795  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
5796  xmm3 += A.load(i ,k+1UL) * b2;
5797  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
5798  }
5799 
5800  for( ; k<kend; ++k ) {
5801  const SIMDType b1( set( B(k,j) ) );
5802  xmm1 += A.load(i ,k) * b1;
5803  xmm2 += A.load(i+SIMDSIZE,k) * b1;
5804  }
5805 
5806  (~C).store( i , j, (xmm1+xmm3) * factor );
5807  (~C).store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
5808  }
5809  }
5810 
5811  for( ; i<ipos; i+=SIMDSIZE )
5812  {
5813  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
5814  size_t j( UPP ? i : 0UL );
5815 
5816  for( ; (j+4UL) <= jend; j+=4UL )
5817  {
5818  const size_t kbegin( ( IsLower<MT5>::value )
5819  ?( ( IsUpper<MT4>::value )
5820  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5821  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5822  :( IsUpper<MT4>::value ? i : 0UL ) );
5823  const size_t kend( ( IsUpper<MT5>::value )
5824  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
5825  :( K ) );
5826 
5827  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5828  size_t k( kbegin );
5829 
5830  for( ; (k+2UL) <= kend; k+=2UL ) {
5831  const SIMDType a1( A.load(i,k ) );
5832  const SIMDType a2( A.load(i,k+1UL) );
5833  xmm1 += a1 * set( B(k ,j ) );
5834  xmm2 += a1 * set( B(k ,j+1UL) );
5835  xmm3 += a1 * set( B(k ,j+2UL) );
5836  xmm4 += a1 * set( B(k ,j+3UL) );
5837  xmm5 += a2 * set( B(k+1UL,j ) );
5838  xmm6 += a2 * set( B(k+1UL,j+1UL) );
5839  xmm7 += a2 * set( B(k+1UL,j+2UL) );
5840  xmm8 += a2 * set( B(k+1UL,j+3UL) );
5841  }
5842 
5843  for( ; k<kend; ++k ) {
5844  const SIMDType a1( A.load(i,k) );
5845  xmm1 += a1 * set( B(k,j ) );
5846  xmm2 += a1 * set( B(k,j+1UL) );
5847  xmm3 += a1 * set( B(k,j+2UL) );
5848  xmm4 += a1 * set( B(k,j+3UL) );
5849  }
5850 
5851  (~C).store( i, j , (xmm1+xmm5) * factor );
5852  (~C).store( i, j+1UL, (xmm2+xmm6) * factor );
5853  (~C).store( i, j+2UL, (xmm3+xmm7) * factor );
5854  (~C).store( i, j+3UL, (xmm4+xmm8) * factor );
5855  }
5856 
5857  for( ; (j+3UL) <= jend; j+=3UL )
5858  {
5859  const size_t kbegin( ( IsLower<MT5>::value )
5860  ?( ( IsUpper<MT4>::value )
5861  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5862  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5863  :( IsUpper<MT4>::value ? i : 0UL ) );
5864  const size_t kend( ( IsUpper<MT5>::value )
5865  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
5866  :( K ) );
5867 
5868  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5869  size_t k( kbegin );
5870 
5871  for( ; (k+2UL) <= kend; k+=2UL ) {
5872  const SIMDType a1( A.load(i,k ) );
5873  const SIMDType a2( A.load(i,k+1UL) );
5874  xmm1 += a1 * set( B(k ,j ) );
5875  xmm2 += a1 * set( B(k ,j+1UL) );
5876  xmm3 += a1 * set( B(k ,j+2UL) );
5877  xmm4 += a2 * set( B(k+1UL,j ) );
5878  xmm5 += a2 * set( B(k+1UL,j+1UL) );
5879  xmm6 += a2 * set( B(k+1UL,j+2UL) );
5880  }
5881 
5882  for( ; k<kend; ++k ) {
5883  const SIMDType a1( A.load(i,k) );
5884  xmm1 += a1 * set( B(k,j ) );
5885  xmm2 += a1 * set( B(k,j+1UL) );
5886  xmm3 += a1 * set( B(k,j+2UL) );
5887  }
5888 
5889  (~C).store( i, j , (xmm1+xmm4) * factor );
5890  (~C).store( i, j+1UL, (xmm2+xmm5) * factor );
5891  (~C).store( i, j+2UL, (xmm3+xmm6) * factor );
5892  }
5893 
5894  for( ; (j+2UL) <= jend; j+=2UL )
5895  {
5896  const size_t kbegin( ( IsLower<MT5>::value )
5897  ?( ( IsUpper<MT4>::value )
5898  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5899  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5900  :( IsUpper<MT4>::value ? i : 0UL ) );
5901  const size_t kend( ( IsUpper<MT5>::value )
5902  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
5903  :( K ) );
5904 
5905  SIMDType xmm1, xmm2, xmm3, xmm4;
5906  size_t k( kbegin );
5907 
5908  for( ; k<kend; ++k ) {
5909  const SIMDType a1( A.load(i,k) );
5910  xmm1 += a1 * set( B(k,j ) );
5911  xmm2 += a1 * set( B(k,j+1UL) );
5912  }
5913 
5914  for( ; (k+2UL) <= kend; k+=2UL ) {
5915  const SIMDType a1( A.load(i,k ) );
5916  const SIMDType a2( A.load(i,k+1UL) );
5917  xmm1 += a1 * set( B(k ,j ) );
5918  xmm2 += a1 * set( B(k ,j+1UL) );
5919  xmm3 += a2 * set( B(k+1UL,j ) );
5920  xmm4 += a2 * set( B(k+1UL,j+1UL) );
5921  }
5922 
5923  (~C).store( i, j , (xmm1+xmm3) * factor );
5924  (~C).store( i, j+1UL, (xmm2+xmm4) * factor );
5925  }
5926 
5927  if( j < jend )
5928  {
5929  const size_t kbegin( ( IsLower<MT5>::value )
5930  ?( ( IsUpper<MT4>::value )
5931  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5932  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5933  :( IsUpper<MT4>::value ? i : 0UL ) );
5934 
5935  SIMDType xmm1, xmm2;
5936  size_t k( kbegin );
5937 
5938  for( ; (k+2UL) <= K; k+=2UL ) {
5939  xmm1 += A.load(i,k ) * set( B(k ,j) );
5940  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
5941  }
5942 
5943  for( ; k<K; ++k ) {
5944  xmm1 += A.load(i,k) * set( B(k,j) );
5945  }
5946 
5947  (~C).store( i, j, (xmm1+xmm2) * factor );
5948  }
5949  }
5950 
5951  for( ; remainder && i<M; ++i )
5952  {
5953  size_t j( LOW && UPP ? i : 0UL );
5954 
5955  for( ; (j+2UL) <= N; j+=2UL )
5956  {
5957  const size_t kbegin( ( IsLower<MT5>::value )
5958  ?( ( IsUpper<MT4>::value )
5959  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5960  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5961  :( IsUpper<MT4>::value ? i : 0UL ) );
5962  const size_t kend( ( IsUpper<MT5>::value )
5963  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
5964  :( K ) );
5965 
5966  ElementType value1{};
5967  ElementType value2{};
5968 
5969  for( size_t k=kbegin; k<kend; ++k ) {
5970  value1 += A(i,k) * B(k,j );
5971  value2 += A(i,k) * B(k,j+1UL);
5972  }
5973 
5974  (~C)(i,j ) = value1 * scalar;
5975  (~C)(i,j+1UL) = value2 * scalar;
5976  }
5977 
5978  if( j < N )
5979  {
5980  const size_t kbegin( ( IsLower<MT5>::value )
5981  ?( ( IsUpper<MT4>::value )
5982  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5983  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5984  :( IsUpper<MT4>::value ? i : 0UL ) );
5985 
5986  ElementType value{};
5987 
5988  for( size_t k=kbegin; k<K; ++k ) {
5989  value += A(i,k) * B(k,j);
5990  }
5991 
5992  (~C)(i,j) = value * scalar;
5993  }
5994  }
5995  }
5996 
5997  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
5998  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
5999  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6000  for( size_t i=0UL; i<iend; ++i ) {
6001  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
6002  }
6003  }
6004  }
6005  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
6006  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
6007  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
6008  for( size_t i=0UL; i<iend; ++i ) {
6009  reset( (~C)(i,j) );
6010  }
6011  }
6012  }
6013  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
6014  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
6015  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
6016  for( size_t j=0UL; j<jend; ++j ) {
6017  reset( (~C)(i,j) );
6018  }
6019  }
6020  }
6021  }
6022  //**********************************************************************************************
6023 
6024  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel, fallback overload: simply forwards all arguments to
// selectDefaultAssignKernel().
// NOTE(review): listing line 6042 (return type, presumably the enabling condition that
// separates this overload from the vectorized one) is not present in this view.
6038  template< typename MT3 // Type of the left-hand side target matrix
6039  , typename MT4 // Type of the left-hand side matrix operand
6040  , typename MT5 // Type of the right-hand side matrix operand
6041  , typename ST2 > // Type of the scalar value
6043  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6044  {
6045  selectDefaultAssignKernel( C, A, B, scalar );
6046  }
6047  //**********************************************************************************************
6048 
6049  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Large-matrix assignment kernel, vectorized overload: dispatches on the SYM, HERM, LOW
// and UPP flags (in that priority order) to smmm(), hmmm(), lmmm(), ummm() or mmm().
// NOTE(review): the trailing ST2(0) passed to lmmm/ummm/mmm is presumably the factor
// applied to the existing content of C — confirm against the MMM header. Listing line
// 6068 (return type) is not present in this view.
6064  template< typename MT3 // Type of the left-hand side target matrix
6065  , typename MT4 // Type of the left-hand side matrix operand
6066  , typename MT5 // Type of the right-hand side matrix operand
6067  , typename ST2 > // Type of the scalar value
6069  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6070  {
6071  if( SYM )
6072  smmm( C, A, B, scalar );
6073  else if( HERM )
6074  hmmm( C, A, B, scalar );
6075  else if( LOW )
6076  lmmm( C, A, B, scalar, ST2(0) );
6077  else if( UPP )
6078  ummm( C, A, B, scalar, ST2(0) );
6079  else
6080  mmm( C, A, B, scalar, ST2(0) );
6081  }
6082  //**********************************************************************************************
6083 
6084  //**BLAS-based assignment to dense matrices (default)*******************************************
// BLAS assignment kernel, fallback overload: forwards all arguments to
// selectLargeAssignKernel().
// NOTE(review): listing line 6102 (return type / enabling condition) is not present in
// this view.
6098  template< typename MT3 // Type of the left-hand side target matrix
6099  , typename MT4 // Type of the left-hand side matrix operand
6100  , typename MT5 // Type of the right-hand side matrix operand
6101  , typename ST2 > // Type of the scalar value
6103  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6104  {
6105  selectLargeAssignKernel( C, A, B, scalar );
6106  }
6107  //**********************************************************************************************
6108 
6109  //**BLAS-based assignment to dense matrices*****************************************************
6110 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6111 
// BLAS assignment kernel, BLAS-backed overload (only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled).
//  - A triangular: C is first assigned B, then trmm() is called with A from the left.
//  - B triangular: C is first assigned A, then trmm() is called with B from the right.
//  - otherwise:    gemm() is called with the scalar and ET(0) as the two factors.
// The scalar is converted to the element type of the target before each call.
// NOTE(review): listing line 6128 (return type / enabling condition) is not present in
// this view.
6124  template< typename MT3 // Type of the left-hand side target matrix
6125  , typename MT4 // Type of the left-hand side matrix operand
6126  , typename MT5 // Type of the right-hand side matrix operand
6127  , typename ST2 > // Type of the scalar value
6129  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6130  {
6131  using ET = ElementType_<MT3>;
6132 
6133  if( IsTriangular<MT4>::value ) {
6134  assign( C, B );
6135  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6136  }
6137  else if( IsTriangular<MT5>::value ) {
6138  assign( C, A );
6139  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6140  }
6141  else {
6142  gemm( C, A, B, ET(scalar), ET(0) );
6143  }
6144  }
6145 #endif
6146  //**********************************************************************************************
6147 
6148  //**Assignment to sparse matrices***************************************************************
// Assignment of the scaled matrix product to a sparse target matrix: the expression is
// evaluated serially into a temporary of type TmpType, which is then assigned to the
// target through the forward functor.
// NOTE(review): listing lines 6162 and 6165-6174 are not present in this view; they
// presumably hold the return type and the definition of TmpType — confirm.
6160  template< typename MT // Type of the target sparse matrix
6161  , bool SO > // Storage order of the target sparse matrix
6163  assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6164  {
6166 
6168 
6175 
6176  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6177  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6178 
6179  const ForwardFunctor fwd;
6180 
6181  const TmpType tmp( serial( rhs ) );
6182  assign( ~lhs, fwd( tmp ) );
6183  }
6184  //**********************************************************************************************
6185 
6186  //**Restructuring assignment to row-major matrices**********************************************
// Restructuring assignment of the scaled matrix product to a row-major target: the
// product is re-expressed with one or both operands transposed and assigned again,
// scaled by rhs.scalar_.
// NOTE(review): listing lines 6201 (return type / enabling condition) and 6216 (the
// condition of the first branch) are not present in this view — confirm.
6200  template< typename MT > // Type of the target matrix
6202  assign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
6203  {
6205 
6207 
6208  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6209  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6210 
6211  const ForwardFunctor fwd;
6212 
6213  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6214  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6215 
// Branch 1 (condition not visible): both operands are transposed.
6217  assign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// Left operand type symmetric: only the left operand is transposed.
6218  else if( IsSymmetric<MT1>::value )
6219  assign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Otherwise only the right operand is transposed.
6220  else
6221  assign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
6222  }
6223  //**********************************************************************************************
6224 
6225  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the scaled matrix product to a dense target (lhs += s*A*B).
// Returns immediately when the target has no rows or columns or the inner dimension
// (left.columns()) is zero. Otherwise both operands are evaluated serially into LT/RT
// objects and the work is delegated to selectAddAssignKernel().
// NOTE(review): listing lines 6239 (return type) and 6242 are not present in this view;
// LT and RT are declared outside this view.
6237  template< typename MT // Type of the target dense matrix
6238  , bool SO > // Storage order of the target dense matrix
6240  addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6241  {
6243 
6244  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6245  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6246 
6247  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6248  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6249 
// Nothing to add for an empty target or an empty inner dimension.
6250  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6251  return;
6252  }
6253 
6254  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6255  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6256 
6257  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6258  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6259  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6260  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6261  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6262  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6263 
6264  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
6265  }
6266  //**********************************************************************************************
6267 
6268  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment of a scaled matrix product.
// The small kernel is taken when the left operand type is diagonal, when (outside debug
// mode) A has at most SIMDSIZE*10 rows, or when the target holds fewer than
// TDMATTDMATMULT_THRESHOLD elements; every other case goes to the BLAS-based kernel.
6279  template< typename MT3 // Target matrix type
6280  , typename MT4 // Left-hand side operand type
6281  , typename MT5 // Right-hand side operand type
6282  , typename ST2 > // Scalar type
6283  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6284  {
6285  const bool preferSmallKernel( ( IsDiagonal<MT4>::value ) ||
6286  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
6287  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) );
6288  if( preferSmallKernel )
6289  selectSmallAddAssignKernel( C, A, B, scalar );
6290  else selectBlasAddAssignKernel( C, A, B, scalar );
6291  }
6292  //**********************************************************************************************
6293 
6294  //**Default addition assignment to dense matrices (general/general)*****************************
// Default addition assignment kernel for two non-diagonal operands: the scaled product
// is evaluated serially into a temporary of the expression's result type, which is then
// added to the target.
6308  template< typename MT3 // Target matrix type
6309  , typename MT4 // Left-hand side operand type
6310  , typename MT5 // Right-hand side operand type
6311  , typename ST2 > // Scalar type
6312  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6313  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6314  {
6315  const ResultType scaledProduct( serial( A * B * scalar ) );
6316  addAssign( C, scaledProduct );
6317  }
6318  //**********************************************************************************************
6319 
6320  //**Default addition assignment to dense matrices (general/diagonal)****************************
// Default addition assignment kernel for a non-diagonal left operand and a diagonal
// right operand: only the diagonal element B(j,j) contributes to column j, so
// C(i,j) += A(i,j) * B(j,j) * scalar for every row i in [ibegin, iend).
// The row range is narrowed by the lower/upper traits of the left operand type MT4.
// NOTE(review): listing line 6341 is not present in this view.
6334  template< typename MT3 // Type of the left-hand side target matrix
6335  , typename MT4 // Type of the left-hand side matrix operand
6336  , typename MT5 // Type of the right-hand side matrix operand
6337  , typename ST2 > // Type of the scalar value
6338  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6339  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6340  {
6342 
6343  const size_t M( A.rows() );
6344  const size_t N( B.columns() );
6345 
6346  for( size_t j=0UL; j<N; ++j )
6347  {
6348  const size_t ibegin( ( IsLower<MT4>::value )
6349  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6350  :( 0UL ) );
6351  const size_t iend( ( IsUpper<MT4>::value )
6352  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
6353  :( M ) );
6354  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6355 
// ipos marks the end of the part of the range that can be handled two rows at a time
// (inum rounded down to an even count).
6356  const size_t inum( iend - ibegin );
6357  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6358 
6359  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6360  C(i ,j) += A(i ,j) * B(j,j) * scalar;
6361  C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
6362  }
// Single leftover row when the range has odd length.
6363  if( ipos < iend ) {
6364  C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
6365  }
6366  }
6367  }
6368  //**********************************************************************************************
6369 
6370  //**Default addition assignment to dense matrices (diagonal/general)****************************
// Default scaled addition assignment kernel for the diagonal/general case.
// Selected (via EnableIf_) when MT4 is diagonal and MT5 is not diagonal.
// Only the diagonal element A(i,i) is read for row i, so the update is
//   C(i,j) += A(i,i) * B(i,j) * scalar.
// NOTE(review): listing line 6391 is not shown in this rendering of the file.
6384  template< typename MT3 // Type of the left-hand side target matrix
6385  , typename MT4 // Type of the left-hand side matrix operand
6386  , typename MT5 // Type of the right-hand side matrix operand
6387  , typename ST2 > // Type of the scalar value
6388  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6389  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6390  {
6392 
6393  const size_t M( A.rows() );
6394  const size_t N( B.columns() );
6395 
6396  for( size_t j=0UL; j<N; ++j )
6397  {
 // Row range [ibegin,iend) for column j, narrowed by the lower/upper
 // (and strictly lower/upper) traits of the right operand type MT5.
6398  const size_t ibegin( ( IsLower<MT5>::value )
6399  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6400  :( 0UL ) );
6401  const size_t iend( ( IsUpper<MT5>::value )
6402  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
6403  :( M ) );
6404  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
6405 
 // ipos is ibegin plus the row count rounded down to an even number
 // (masking with size_t(-2) clears the lowest bit).
6406  const size_t inum( iend - ibegin );
6407  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
6408 
 // Two rows per iteration.
6409  for( size_t i=ibegin; i<ipos; i+=2UL ) {
6410  C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6411  C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
6412  }
 // Single leftover row when the row count is odd.
6413  if( ipos < iend ) {
6414  C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
6415  }
6416  }
6417  }
6418  //**********************************************************************************************
6419 
6420  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default scaled addition assignment kernel for the diagonal/diagonal case.
// Selected (via EnableIf_) when both MT4 and MT5 are diagonal matrix types.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar.
// NOTE(review): listing line 6441 is not shown in this rendering of the file.
6434  template< typename MT3 // Type of the left-hand side target matrix
6435  , typename MT4 // Type of the left-hand side matrix operand
6436  , typename MT5 // Type of the right-hand side matrix operand
6437  , typename ST2 > // Type of the scalar value
6438  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6439  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6440  {
6442 
 // One update per diagonal element; the loop bound is the row count of A.
6443  for( size_t i=0UL; i<A.rows(); ++i ) {
6444  C(i,i) += A(i,i) * B(i,i) * scalar;
6445  }
6446  }
6447  //**********************************************************************************************
6448 
6449  //**Default addition assignment to dense matrices (small matrices)******************************
// Small-matrix scaled addition assignment kernel, fallback overload.
// It does no work of its own and forwards all arguments to
// selectDefaultAddAssignKernel().
// NOTE(review): the return type / enabling condition (listing line 6467) is not
// shown in this rendering, so the exact selection criterion cannot be read here.
6463  template< typename MT3 // Type of the left-hand side target matrix
6464  , typename MT4 // Type of the left-hand side matrix operand
6465  , typename MT5 // Type of the right-hand side matrix operand
6466  , typename ST2 > // Type of the scalar value
6468  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6469  {
6470  selectDefaultAddAssignKernel( C, A, B, scalar );
6471  }
6472  //**********************************************************************************************
6473 
6474  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Small-matrix scaled addition assignment kernel for a row-major dense target
// (C is taken as DenseMatrix<MT3,false>).
// Every visible branch copies exactly one operand into its OppositeType_ (wrapped
// in serial()) and then re-issues the addition assignment with the mixed product.
// NOTE(review): the return type (listing line 6493), lines 6496-6499 and the
// conditions of the first two branches (listing lines 6503 and 6507) are not shown
// in this rendering; only the third condition and the final else are visible.
6489  template< typename MT3 // Type of the left-hand side target matrix
6490  , typename MT4 // Type of the left-hand side matrix operand
6491  , typename MT5 // Type of the right-hand side matrix operand
6492  , typename ST2 > // Type of the scalar value
6494  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6495  {
6500 
 // Functor applied to every re-issued product expression below.
6501  const ForwardFunctor fwd;
6502 
 // Branch 1 (condition elided): convert B.
6504  const OppositeType_<MT5> tmp( serial( B ) );
6505  addAssign( ~C, fwd( A * tmp ) * scalar );
6506  }
 // Branch 2 (condition elided): convert A.
6508  const OppositeType_<MT4> tmp( serial( A ) );
6509  addAssign( ~C, fwd( tmp * B ) * scalar );
6510  }
 // Branch 3: B has no more elements than A, so convert B.
6511  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6512  const OppositeType_<MT5> tmp( serial( B ) );
6513  addAssign( ~C, fwd( A * tmp ) * scalar );
6514  }
 // Otherwise A has fewer elements than B, so convert A.
6515  else {
6516  const OppositeType_<MT4> tmp( serial( A ) );
6517  addAssign( ~C, fwd( tmp * B ) * scalar );
6518  }
6519  }
6520  //**********************************************************************************************
6521 
6522  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Vectorized small-matrix scaled addition assignment kernel for a column-major
// dense target (C is taken as DenseMatrix<MT3,true>): adds (A * B) * scalar to C.
//
// Rows are processed in blocks of SIMD vectors. The outer loops below run in
// sequence over the same row index i, handling 8, 5, 4, 3, 2 and finally 1 SIMD
// vector of rows per step; a scalar loop at the end handles rows beyond ipos.
// Within each block the columns j are tiled (by 4, 3, 2 or 1 depending on the loop)
// and the inner k range [kbegin,kend) is narrowed using the IsLower/IsUpper (and
// strict) traits of MT4 and MT5.
//
// LOW and UPP are flags defined outside this block; here they gate which row-block
// loops may run and bound the column range (jend / initial j).
// NOTE(review): the return type / enabling condition (listing line 6541) is not
// shown in this rendering.
// NOTE(review): the xmm accumulators are declared without an initializer and are
// then only updated with `+=`; this presumes a default-constructed SIMDType is
// zero - confirm against the SIMD type definitions.
6537  template< typename MT3 // Type of the left-hand side target matrix
6538  , typename MT4 // Type of the left-hand side matrix operand
6539  , typename MT5 // Type of the right-hand side matrix operand
6540  , typename ST2 > // Type of the scalar value
6542  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
6543  {
 // A scalar tail loop is required unless both MT3 and MT4 are padded.
6544  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
6545 
6546  const size_t M( A.rows() );
6547  const size_t N( B.columns() );
6548  const size_t K( A.columns() );
6549 
6550  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6551 
 // End of the vectorized row range: M rounded down to a multiple of SIMDSIZE when
 // a remainder loop is needed, otherwise M itself.
6552  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
6553  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
6554 
 // The scalar broadcast into a SIMD vector.
6555  const SIMDType factor( set( scalar ) );
6556 
 // Row index shared by all the row-block loops below.
6557  size_t i( 0UL );
6558 
 // NOTE(review): the statement introducing the following brace (listing line 6559)
 // is not shown in this rendering.
6560  {
 // 8 SIMD vectors of rows per step, one column at a time (only if !LOW && !UPP).
6561  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6562  for( size_t j=0UL; j<N; ++j )
6563  {
6564  const size_t kbegin( ( IsLower<MT5>::value )
6565  ?( ( IsUpper<MT4>::value )
6566  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6567  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6568  :( IsUpper<MT4>::value ? i : 0UL ) );
6569  const size_t kend( ( IsUpper<MT5>::value )
6570  ?( ( IsLower<MT4>::value )
6571  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
6572  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
6573  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
6574 
6575  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6576 
6577  for( size_t k=kbegin; k<kend; ++k ) {
6578  const SIMDType b1( set( B(k,j) ) );
6579  xmm1 += A.load(i ,k) * b1;
6580  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6581  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6582  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6583  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6584  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
6585  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
6586  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
6587  }
6588 
 // Read-modify-write: C += accumulator * factor.
6589  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6590  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6591  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6592  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6593  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
6594  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
6595  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
6596  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
6597  }
6598  }
6599  }
6600 
 // 5 SIMD vectors of rows per step, two columns at a time plus a single-column
 // tail (only if !LOW && !UPP).
6601  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
6602  {
6603  size_t j( 0UL );
6604 
6605  for( ; (j+2UL) <= N; j+=2UL )
6606  {
6607  const size_t kbegin( ( IsLower<MT5>::value )
6608  ?( ( IsUpper<MT4>::value )
6609  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6610  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6611  :( IsUpper<MT4>::value ? i : 0UL ) );
6612  const size_t kend( ( IsUpper<MT5>::value )
6613  ?( ( IsLower<MT4>::value )
6614  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6615  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6616  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
6617 
 // xmm1..xmm5 accumulate column j, xmm6..xmm10 accumulate column j+1.
6618  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6619 
6620  for( size_t k=kbegin; k<kend; ++k ) {
6621  const SIMDType a1( A.load(i ,k) );
6622  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6623  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6624  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6625  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
6626  const SIMDType b1( set( B(k,j ) ) );
6627  const SIMDType b2( set( B(k,j+1UL) ) );
6628  xmm1 += a1 * b1;
6629  xmm2 += a2 * b1;
6630  xmm3 += a3 * b1;
6631  xmm4 += a4 * b1;
6632  xmm5 += a5 * b1;
6633  xmm6 += a1 * b2;
6634  xmm7 += a2 * b2;
6635  xmm8 += a3 * b2;
6636  xmm9 += a4 * b2;
6637  xmm10 += a5 * b2;
6638  }
6639 
6640  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6641  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6642  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6643  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6644  (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
6645  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
6646  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
6647  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
6648  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
6649  (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
6650  }
6651 
 // Last column when N is odd.
6652  if( j < N )
6653  {
6654  const size_t kbegin( ( IsLower<MT5>::value )
6655  ?( ( IsUpper<MT4>::value )
6656  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6657  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6658  :( IsUpper<MT4>::value ? i : 0UL ) );
6659  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
6660 
6661  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6662 
6663  for( size_t k=kbegin; k<kend; ++k ) {
6664  const SIMDType b1( set( B(k,j) ) );
6665  xmm1 += A.load(i ,k) * b1;
6666  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6667  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6668  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6669  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
6670  }
6671 
6672  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6673  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6674  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6675  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6676  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
6677  }
6678  }
6679 
 // 4 SIMD vectors of rows per step, two columns at a time plus a single-column
 // tail (only if !LOW && !UPP).
6680  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6681  {
6682  size_t j( 0UL );
6683 
6684  for( ; (j+2UL) <= N; j+=2UL )
6685  {
6686  const size_t kbegin( ( IsLower<MT5>::value )
6687  ?( ( IsUpper<MT4>::value )
6688  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6689  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6690  :( IsUpper<MT4>::value ? i : 0UL ) );
6691  const size_t kend( ( IsUpper<MT5>::value )
6692  ?( ( IsLower<MT4>::value )
6693  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6694  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6695  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
6696 
 // xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
6697  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6698 
6699  for( size_t k=kbegin; k<kend; ++k ) {
6700  const SIMDType a1( A.load(i ,k) );
6701  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6702  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6703  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6704  const SIMDType b1( set( B(k,j ) ) );
6705  const SIMDType b2( set( B(k,j+1UL) ) );
6706  xmm1 += a1 * b1;
6707  xmm2 += a2 * b1;
6708  xmm3 += a3 * b1;
6709  xmm4 += a4 * b1;
6710  xmm5 += a1 * b2;
6711  xmm6 += a2 * b2;
6712  xmm7 += a3 * b2;
6713  xmm8 += a4 * b2;
6714  }
6715 
6716  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6717  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6718  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6719  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
6720  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
6721  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
6722  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
6723  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
6724  }
6725 
 // Last column when N is odd.
6726  if( j < N )
6727  {
6728  const size_t kbegin( ( IsLower<MT5>::value )
6729  ?( ( IsUpper<MT4>::value )
6730  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6731  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6732  :( IsUpper<MT4>::value ? i : 0UL ) );
6733  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
6734 
6735  SIMDType xmm1, xmm2, xmm3, xmm4;
6736 
6737  for( size_t k=kbegin; k<kend; ++k ) {
6738  const SIMDType b1( set( B(k,j) ) );
6739  xmm1 += A.load(i ,k) * b1;
6740  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6741  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6742  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
6743  }
6744 
6745  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6746  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6747  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6748  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
6749  }
6750  }
6751 
 // 3 SIMD vectors of rows per step, two columns at a time plus a single-column
 // tail (only if !LOW && !UPP).
6752  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
6753  {
6754  size_t j( 0UL );
6755 
6756  for( ; (j+2UL) <= N; j+=2UL )
6757  {
6758  const size_t kbegin( ( IsLower<MT5>::value )
6759  ?( ( IsUpper<MT4>::value )
6760  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6761  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6762  :( IsUpper<MT4>::value ? i : 0UL ) );
6763  const size_t kend( ( IsUpper<MT5>::value )
6764  ?( ( IsLower<MT4>::value )
6765  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6766  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6767  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
6768 
 // xmm1..xmm3 accumulate column j, xmm4..xmm6 accumulate column j+1.
6769  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6770 
6771  for( size_t k=kbegin; k<kend; ++k ) {
6772  const SIMDType a1( A.load(i ,k) );
6773  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6774  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6775  const SIMDType b1( set( B(k,j ) ) );
6776  const SIMDType b2( set( B(k,j+1UL) ) );
6777  xmm1 += a1 * b1;
6778  xmm2 += a2 * b1;
6779  xmm3 += a3 * b1;
6780  xmm4 += a1 * b2;
6781  xmm5 += a2 * b2;
6782  xmm6 += a3 * b2;
6783  }
6784 
6785  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6786  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
6787  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
6788  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
6789  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
6790  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
6791  }
6792 
 // Last column when N is odd.
6793  if( j < N )
6794  {
6795  const size_t kbegin( ( IsLower<MT5>::value )
6796  ?( ( IsUpper<MT4>::value )
6797  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6798  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6799  :( IsUpper<MT4>::value ? i : 0UL ) );
6800  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
6801 
6802  SIMDType xmm1, xmm2, xmm3;
6803 
6804  for( size_t k=kbegin; k<kend; ++k ) {
6805  const SIMDType b1( set( B(k,j) ) );
6806  xmm1 += A.load(i ,k) * b1;
6807  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
6808  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
6809  }
6810 
6811  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6812  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
6813  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
6814  }
6815  }
6816 
 // 2 SIMD vectors of rows per step (skipped only if LOW and UPP are both set).
 // The column range is limited by LOW (upper bound jend) and UPP (start at i);
 // columns are tiled by 4, then 3, then 2, then a single-column tail.
6817  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6818  {
6819  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
6820  size_t j( UPP ? i : 0UL );
6821 
6822  for( ; (j+4UL) <= jend; j+=4UL )
6823  {
6824  const size_t kbegin( ( IsLower<MT5>::value )
6825  ?( ( IsUpper<MT4>::value )
6826  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6827  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6828  :( IsUpper<MT4>::value ? i : 0UL ) );
6829  const size_t kend( ( IsUpper<MT5>::value )
6830  ?( ( IsLower<MT4>::value )
6831  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
6832  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
6833  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6834 
6835  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6836 
6837  for( size_t k=kbegin; k<kend; ++k ) {
6838  const SIMDType a1( A.load(i ,k) );
6839  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6840  const SIMDType b1( set( B(k,j ) ) );
6841  const SIMDType b2( set( B(k,j+1UL) ) );
6842  const SIMDType b3( set( B(k,j+2UL) ) );
6843  const SIMDType b4( set( B(k,j+3UL) ) );
6844  xmm1 += a1 * b1;
6845  xmm2 += a2 * b1;
6846  xmm3 += a1 * b2;
6847  xmm4 += a2 * b2;
6848  xmm5 += a1 * b3;
6849  xmm6 += a2 * b3;
6850  xmm7 += a1 * b4;
6851  xmm8 += a2 * b4;
6852  }
6853 
6854  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6855  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
6856  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6857  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
6858  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6859  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
6860  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
6861  (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
6862  }
6863 
6864  for( ; (j+3UL) <= jend; j+=3UL )
6865  {
6866  const size_t kbegin( ( IsLower<MT5>::value )
6867  ?( ( IsUpper<MT4>::value )
6868  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6869  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6870  :( IsUpper<MT4>::value ? i : 0UL ) );
6871  const size_t kend( ( IsUpper<MT5>::value )
6872  ?( ( IsLower<MT4>::value )
6873  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
6874  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
6875  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6876 
 // Only xmm1..xmm6 are used in this three-column tile.
6877  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6878 
6879  for( size_t k=kbegin; k<kend; ++k ) {
6880  const SIMDType a1( A.load(i ,k) );
6881  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6882  const SIMDType b1( set( B(k,j ) ) );
6883  const SIMDType b2( set( B(k,j+1UL) ) );
6884  const SIMDType b3( set( B(k,j+2UL) ) );
6885  xmm1 += a1 * b1;
6886  xmm2 += a2 * b1;
6887  xmm3 += a1 * b2;
6888  xmm4 += a2 * b2;
6889  xmm5 += a1 * b3;
6890  xmm6 += a2 * b3;
6891  }
6892 
6893  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6894  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
6895  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6896  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
6897  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6898  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
6899  }
6900 
6901  for( ; (j+2UL) <= jend; j+=2UL )
6902  {
6903  const size_t kbegin( ( IsLower<MT5>::value )
6904  ?( ( IsUpper<MT4>::value )
6905  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6906  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6907  :( IsUpper<MT4>::value ? i : 0UL ) );
6908  const size_t kend( ( IsUpper<MT5>::value )
6909  ?( ( IsLower<MT4>::value )
6910  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6911  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6912  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6913 
 // The k loop is unrolled by two: xmm1..xmm4 take step k, xmm5..xmm8 take step
 // k+1; each pair is summed when the result is stored.
6914  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6915  size_t k( kbegin );
6916 
6917  for( ; (k+2UL) <= kend; k+=2UL ) {
6918  const SIMDType a1( A.load(i ,k ) );
6919  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6920  const SIMDType a3( A.load(i ,k+1UL) );
6921  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6922  const SIMDType b1( set( B(k ,j ) ) );
6923  const SIMDType b2( set( B(k ,j+1UL) ) );
6924  const SIMDType b3( set( B(k+1UL,j ) ) );
6925  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
6926  xmm1 += a1 * b1;
6927  xmm2 += a2 * b1;
6928  xmm3 += a1 * b2;
6929  xmm4 += a2 * b2;
6930  xmm5 += a3 * b3;
6931  xmm6 += a4 * b3;
6932  xmm7 += a3 * b4;
6933  xmm8 += a4 * b4;
6934  }
6935 
 // Leftover k step when the k range has odd length.
6936  for( ; k<kend; ++k ) {
6937  const SIMDType a1( A.load(i ,k) );
6938  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6939  const SIMDType b1( set( B(k,j ) ) );
6940  const SIMDType b2( set( B(k,j+1UL) ) );
6941  xmm1 += a1 * b1;
6942  xmm2 += a2 * b1;
6943  xmm3 += a1 * b2;
6944  xmm4 += a2 * b2;
6945  }
6946 
6947  (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
6948  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
6949  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + (xmm3+xmm7) * factor );
6950  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
6951  }
6952 
 // Single remaining column.
6953  if( j < jend )
6954  {
6955  const size_t kbegin( ( IsLower<MT5>::value )
6956  ?( ( IsUpper<MT4>::value )
6957  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6958  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6959  :( IsUpper<MT4>::value ? i : 0UL ) );
6960  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
6961 
6962  SIMDType xmm1, xmm2, xmm3, xmm4;
6963  size_t k( kbegin );
6964 
6965  for( ; (k+2UL) <= kend; k+=2UL ) {
6966  const SIMDType b1( set( B(k ,j) ) );
6967  const SIMDType b2( set( B(k+1UL,j) ) );
6968  xmm1 += A.load(i ,k ) * b1;
6969  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
6970  xmm3 += A.load(i ,k+1UL) * b2;
6971  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
6972  }
6973 
6974  for( ; k<kend; ++k ) {
6975  const SIMDType b1( set( B(k,j) ) );
6976  xmm1 += A.load(i ,k) * b1;
6977  xmm2 += A.load(i+SIMDSIZE,k) * b1;
6978  }
6979 
6980  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
6981  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
6982  }
6983  }
6984 
 // 1 SIMD vector of rows per step, covering whatever is left below ipos.
 // Columns are tiled by 4, then 3, then 2, then a single-column tail; the k loop
 // is unrolled by two in every tile.
6985  for( ; i<ipos; i+=SIMDSIZE )
6986  {
6987  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
6988  size_t j( UPP ? i : 0UL );
6989 
6990  for( ; (j+4UL) <= jend; j+=4UL )
6991  {
6992  const size_t kbegin( ( IsLower<MT5>::value )
6993  ?( ( IsUpper<MT4>::value )
6994  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6995  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6996  :( IsUpper<MT4>::value ? i : 0UL ) );
6997  const size_t kend( ( IsUpper<MT5>::value )
6998  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
6999  :( K ) );
7000 
 // xmm1..xmm4 take step k, xmm5..xmm8 take step k+1 for the same four columns.
7001  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7002  size_t k( kbegin );
7003 
7004  for( ; (k+2UL) <= kend; k+=2UL ) {
7005  const SIMDType a1( A.load(i,k ) );
7006  const SIMDType a2( A.load(i,k+1UL) );
7007  xmm1 += a1 * set( B(k ,j ) );
7008  xmm2 += a1 * set( B(k ,j+1UL) );
7009  xmm3 += a1 * set( B(k ,j+2UL) );
7010  xmm4 += a1 * set( B(k ,j+3UL) );
7011  xmm5 += a2 * set( B(k+1UL,j ) );
7012  xmm6 += a2 * set( B(k+1UL,j+1UL) );
7013  xmm7 += a2 * set( B(k+1UL,j+2UL) );
7014  xmm8 += a2 * set( B(k+1UL,j+3UL) );
7015  }
7016 
7017  for( ; k<kend; ++k ) {
7018  const SIMDType a1( A.load(i,k) );
7019  xmm1 += a1 * set( B(k,j ) );
7020  xmm2 += a1 * set( B(k,j+1UL) );
7021  xmm3 += a1 * set( B(k,j+2UL) );
7022  xmm4 += a1 * set( B(k,j+3UL) );
7023  }
7024 
7025  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm5) * factor );
7026  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm6) * factor );
7027  (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm7) * factor );
7028  (~C).store( i, j+3UL, (~C).load(i,j+3UL) + (xmm4+xmm8) * factor );
7029  }
7030 
7031  for( ; (j+3UL) <= jend; j+=3UL )
7032  {
7033  const size_t kbegin( ( IsLower<MT5>::value )
7034  ?( ( IsUpper<MT4>::value )
7035  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7036  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7037  :( IsUpper<MT4>::value ? i : 0UL ) );
7038  const size_t kend( ( IsUpper<MT5>::value )
7039  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
7040  :( K ) );
7041 
7042  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7043  size_t k( kbegin );
7044 
7045  for( ; (k+2UL) <= kend; k+=2UL ) {
7046  const SIMDType a1( A.load(i,k ) );
7047  const SIMDType a2( A.load(i,k+1UL) );
7048  xmm1 += a1 * set( B(k ,j ) );
7049  xmm2 += a1 * set( B(k ,j+1UL) );
7050  xmm3 += a1 * set( B(k ,j+2UL) );
7051  xmm4 += a2 * set( B(k+1UL,j ) );
7052  xmm5 += a2 * set( B(k+1UL,j+1UL) );
7053  xmm6 += a2 * set( B(k+1UL,j+2UL) );
7054  }
7055 
7056  for( ; k<kend; ++k ) {
7057  const SIMDType a1( A.load(i,k) );
7058  xmm1 += a1 * set( B(k,j ) );
7059  xmm2 += a1 * set( B(k,j+1UL) );
7060  xmm3 += a1 * set( B(k,j+2UL) );
7061  }
7062 
7063  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm4) * factor );
7064  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm5) * factor );
7065  (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm6) * factor );
7066  }
7067 
7068  for( ; (j+2UL) <= jend; j+=2UL )
7069  {
7070  const size_t kbegin( ( IsLower<MT5>::value )
7071  ?( ( IsUpper<MT4>::value )
7072  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7073  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7074  :( IsUpper<MT4>::value ? i : 0UL ) );
7075  const size_t kend( ( IsUpper<MT5>::value )
7076  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7077  :( K ) );
7078 
7079  SIMDType xmm1, xmm2, xmm3, xmm4;
7080  size_t k( kbegin );
7081 
7082  for( ; (k+2UL) <= kend; k+=2UL ) {
7083  const SIMDType a1( A.load(i,k ) );
7084  const SIMDType a2( A.load(i,k+1UL) );
7085  xmm1 += a1 * set( B(k ,j ) );
7086  xmm2 += a1 * set( B(k ,j+1UL) );
7087  xmm3 += a2 * set( B(k+1UL,j ) );
7088  xmm4 += a2 * set( B(k+1UL,j+1UL) );
7089  }
7090 
7091  for( ; k<kend; ++k ) {
7092  const SIMDType a1( A.load(i,k) );
7093  xmm1 += a1 * set( B(k,j ) );
7094  xmm2 += a1 * set( B(k,j+1UL) );
7095  }
7096 
7097  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
7098  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm4) * factor );
7099  }
7100 
 // Single remaining column; the k loop here runs up to K.
7101  if( j < jend )
7102  {
7103  const size_t kbegin( ( IsLower<MT5>::value )
7104  ?( ( IsUpper<MT4>::value )
7105  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7106  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7107  :( IsUpper<MT4>::value ? i : 0UL ) );
7108 
7109  SIMDType xmm1, xmm2;
7110  size_t k( kbegin );
7111 
7112  for( ; (k+2UL) <= K; k+=2UL ) {
7113  xmm1 += A.load(i,k ) * set( B(k ,j) );
7114  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
7115  }
7116 
7117  for( ; k<K; ++k ) {
7118  xmm1 += A.load(i,k) * set( B(k,j) );
7119  }
7120 
7121  (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
7122  }
7123  }
7124 
 // Scalar tail: rows from ipos up to M, one row at a time, using element access
 // instead of SIMD loads. Runs only when `remainder` is true.
7125  for( ; remainder && i<M; ++i )
7126  {
7127  const size_t jend( LOW ? i+1UL : N );
7128  size_t j( UPP ? i : 0UL );
7129 
 // Two columns per iteration.
7130  for( ; (j+2UL) <= jend; j+=2UL )
7131  {
7132  const size_t kbegin( ( IsLower<MT5>::value )
7133  ?( ( IsUpper<MT4>::value )
7134  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7135  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7136  :( IsUpper<MT4>::value ? i : 0UL ) );
7137  const size_t kend( ( IsUpper<MT5>::value )
7138  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7139  :( K ) );
7140 
 // Value-initialized scalar accumulators.
7141  ElementType value1{};
7142  ElementType value2{};
7143 
7144  for( size_t k=kbegin; k<kend; ++k ) {
7145  value1 += A(i,k) * B(k,j );
7146  value2 += A(i,k) * B(k,j+1UL);
7147  }
7148 
7149  (~C)(i,j ) += value1 * scalar;
7150  (~C)(i,j+1UL) += value2 * scalar;
7151  }
7152 
 // Single remaining column.
7153  if( j < jend )
7154  {
7155  const size_t kbegin( ( IsLower<MT5>::value )
7156  ?( ( IsUpper<MT4>::value )
7157  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7158  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7159  :( IsUpper<MT4>::value ? i : 0UL ) );
7160 
7161  ElementType value{};
7162 
7163  for( size_t k=kbegin; k<K; ++k ) {
7164  value += A(i,k) * B(k,j);
7165  }
7166 
7167  (~C)(i,j) += value * scalar;
7168  }
7169  }
7170  }
7171  //**********************************************************************************************
7172 
7173  //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix scaled addition assignment kernel, fallback overload.
// It does no work of its own and forwards all arguments to
// selectDefaultAddAssignKernel().
// NOTE(review): the return type / enabling condition (listing line 7191) is not
// shown in this rendering, so the exact selection criterion cannot be read here.
7187  template< typename MT3 // Type of the left-hand side target matrix
7188  , typename MT4 // Type of the left-hand side matrix operand
7189  , typename MT5 // Type of the right-hand side matrix operand
7190  , typename ST2 > // Type of the scalar value
7192  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7193  {
7194  selectDefaultAddAssignKernel( C, A, B, scalar );
7195  }
7196  //**********************************************************************************************
7197 
7198  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Vectorized large-matrix scaled addition assignment kernel.
// Dispatches to one of three external routines depending on the LOW and UPP flags
// (both defined outside this block): lmmm() if LOW, else ummm() if UPP, else mmm().
// NOTE(review): the last two arguments are presumably the scaling factors of the
// product and of the existing C (i.e. C = scalar*A*B + 1*C) - confirm against the
// lmmm/ummm/mmm declarations.
// NOTE(review): the return type / enabling condition (listing line 7217) is not
// shown in this rendering.
7213  template< typename MT3 // Type of the left-hand side target matrix
7214  , typename MT4 // Type of the left-hand side matrix operand
7215  , typename MT5 // Type of the right-hand side matrix operand
7216  , typename ST2 > // Type of the scalar value
7218  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7219  {
7220  if( LOW )
7221  lmmm( C, A, B, scalar, ST2(1) );
7222  else if( UPP )
7223  ummm( C, A, B, scalar, ST2(1) );
7224  else
7225  mmm( C, A, B, scalar, ST2(1) );
7226  }
7227  //**********************************************************************************************
7228 
7229  //**BLAS-based addition assignment to dense matrices (default)**********************************
// BLAS-based scaled addition assignment kernel, fallback overload.
// It does not call BLAS; it forwards all arguments to selectLargeAddAssignKernel().
// NOTE(review): the return type / enabling condition (listing line 7248) is not
// shown in this rendering, so the exact selection criterion cannot be read here.
7244  template< typename MT3 // Type of the left-hand side target matrix
7245  , typename MT4 // Type of the left-hand side matrix operand
7246  , typename MT5 // Type of the right-hand side matrix operand
7247  , typename ST2 > // Type of the scalar value
7249  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7250  {
7251  selectLargeAddAssignKernel( C, A, B, scalar );
7252  }
7253  //**********************************************************************************************
7254 
7255  //**BLAS-based addition assignment to dense matrices********************************************
7256 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7257 
// BLAS-based scaled addition assignment kernel. Compiled only inside the
// surrounding BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION guard.
// Triangular operands go through trmm() on a temporary that is then added to C;
// all other cases call gemm() directly on C.
// NOTE(review): the return type / enabling condition (listing line 7274) is not
// shown in this rendering.
7270  template< typename MT3 // Type of the left-hand side target matrix
7271  , typename MT4 // Type of the left-hand side matrix operand
7272  , typename MT5 // Type of the right-hand side matrix operand
7273  , typename ST2 > // Type of the scalar value
7275  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7276  {
 // The scalar is converted to the element type of the target before each call.
7277  using ET = ElementType_<MT3>;
7278 
 // A triangular: copy B into tmp, apply trmm() with A on the left side, add to C.
7279  if( IsTriangular<MT4>::value ) {
7280  ResultType_<MT3> tmp( serial( B ) );
7281  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7282  addAssign( C, tmp );
7283  }
 // B triangular: copy A into tmp, apply trmm() with B on the right side, add to C.
7284  else if( IsTriangular<MT5>::value ) {
7285  ResultType_<MT3> tmp( serial( A ) );
7286  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7287  addAssign( C, tmp );
7288  }
 // Neither operand triangular: gemm() on C with factors ET(scalar) and ET(1).
 // NOTE(review): presumably alpha = scalar and beta = 1, so C is accumulated
 // into rather than overwritten - confirm against the gemm() wrapper.
7289  else {
7290  gemm( C, A, B, ET(scalar), ET(1) );
7291  }
7292  }
7293 #endif
7294  //**********************************************************************************************
7295 
7296  //**Restructuring addition assignment to row-major matrices*************************************
// Restructuring addition assignment of the scaled product to a row-major matrix
// (lhs is taken as Matrix<MT,false>).
// Rather than evaluating the product itself, it re-issues addAssign() with one or
// both operands replaced by their transpose, chosen by the symmetry of the operand
// types, and with the scalar re-applied.
// NOTE(review): the return type (listing line 7312), lines 7315 and 7317, and the
// condition of the first branch (listing line 7327) are not shown in this rendering.
7311  template< typename MT > // Type of the target matrix
7313  addAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
7314  {
7316 
7318 
7319  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7320  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7321 
 // Functor applied to every re-issued product expression below.
7322  const ForwardFunctor fwd;
7323 
 // Operands of the wrapped matrix product.
7324  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7325  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7326 
 // First branch (condition elided): both operands are transposed.
7328  addAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
 // MT1 symmetric: only the left operand is transposed.
7329  else if( IsSymmetric<MT1>::value )
7330  addAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
 // Otherwise: only the right operand is transposed.
7331  else
7332  addAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
7333  }
7334  //**********************************************************************************************
7335 
7336  //**Addition assignment to sparse matrices******************************************************
7337  // No special implementation for the addition assignment to sparse matrices.
7338  //**********************************************************************************************
7339 
7340  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of the scaled matrix product to a dense matrix of either storage order.
// Evaluates both multiplication operands and forwards to selectSubAssignKernel().
//   lhs - target dense matrix
//   rhs - scaled multiplication expression; its matrix_ and scalar_ members are read
// NOTE(review): listing lines 7354 and 7357 (presumably the return type / enable condition and
// a function-trace or constraint line) are not present in this listing.
7352  template< typename MT // Type of the target dense matrix
7353  , bool SO > // Storage order of the target dense matrix
7355  subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7356  {
7358 
7359  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7360  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7361 
7362  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7363  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7364 
      // Nothing to do when the target has no rows or columns, or when the inner dimension is zero.
7365  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7366  return;
7367  }
7368 
7369  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7370  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7371 
      // The evaluated operands must keep the operand dimensions and fit the target.
7372  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7373  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7374  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7375  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7376  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7377  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7378 
7379  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
7380  }
7381  //**********************************************************************************************
7382 
7383  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Kernel selection for the subtraction assignment of the scaled product (C -= scalar * A * B).
// The "small" kernel is chosen when any of the following holds:
//   - the left operand type MT4 is diagonal,
//   - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//   - the number of elements of C is below TDMATTDMATMULT_THRESHOLD.
// Otherwise the BLAS kernel selector is called.
//   C      - target matrix
//   A      - left-hand side multiplication operand
//   B      - right-hand side multiplication operand
//   scalar - scaling factor
7394  template< typename MT3 // Type of the left-hand side target matrix
7395  , typename MT4 // Type of the left-hand side matrix operand
7396  , typename MT5 // Type of the right-hand side matrix operand
7397  , typename ST2 > // Type of the scalar value
7398  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7399  {
7400  if( ( IsDiagonal<MT4>::value ) ||
7401  ( !BLAZE_DEBUG_MODE && A.rows() <= SIMDSIZE*10UL ) ||
7402  ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
7403  selectSmallSubAssignKernel( C, A, B, scalar );
7404  else
7405  selectBlasSubAssignKernel( C, A, B, scalar );
7406  }
7407  //**********************************************************************************************
7408 
7409  //**Default subtraction assignment to dense matrices (general/general)**************************
// Default subtraction-assignment kernel for the general/general case; enabled only when neither
// MT4 nor MT5 is a diagonal matrix type. The scaled product A * B * scalar is evaluated into a
// temporary of type ResultType, which is then subtracted from C via subAssign().
//   C      - target matrix
//   A      - left-hand side multiplication operand
//   B      - right-hand side multiplication operand
//   scalar - scaling factor
7423  template< typename MT3 // Type of the left-hand side target matrix
7424  , typename MT4 // Type of the left-hand side matrix operand
7425  , typename MT5 // Type of the right-hand side matrix operand
7426  , typename ST2 > // Type of the scalar value
7427  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
7428  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7429  {
7430  const ResultType tmp( serial( A * B * scalar ) );
7431  subAssign( C, tmp );
7432  }
7433  //**********************************************************************************************
7434 
7435  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
// Default subtraction-assignment kernel for the general/diagonal case; enabled only when MT4 is
// not a diagonal type and MT5 is. Only the diagonal element B(j,j) is read from B, so each
// element update is C(i,j) -= A(i,j) * B(j,j) * scalar.
//   C      - target matrix
//   A      - left-hand side (non-diagonal) operand
//   B      - right-hand side diagonal operand
//   scalar - scaling factor
// NOTE(review): listing line 7456 is not present in this listing.
7449  template< typename MT3 // Type of the left-hand side target matrix
7450  , typename MT4 // Type of the left-hand side matrix operand
7451  , typename MT5 // Type of the right-hand side matrix operand
7452  , typename ST2 > // Type of the scalar value
7453  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7454  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7455  {
7457 
7458  const size_t M( A.rows() );
7459  const size_t N( B.columns() );
7460 
7461  for( size_t j=0UL; j<N; ++j )
7462  {
      // Row range [ibegin,iend) for column j, narrowed by the lower/upper (strict or not)
      // type traits of the left operand MT4.
7463  const size_t ibegin( ( IsLower<MT4>::value )
7464  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
7465  :( 0UL ) );
7466  const size_t iend( ( IsUpper<MT4>::value )
7467  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
7468  :( M ) );
7469  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7470 
      // ipos marks the end of the part of the range that is processed two rows at a time
      // (inum with its lowest bit cleared).
7471  const size_t inum( iend - ibegin );
7472  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7473 
7474  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7475  C(i ,j) -= A(i ,j) * B(j,j) * scalar;
7476  C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
7477  }
      // Single leftover row when the range length is odd.
7478  if( ipos < iend ) {
7479  C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
7480  }
7481  }
7482  }
7483  //**********************************************************************************************
7484 
7485  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
// Default subtraction-assignment kernel for the diagonal/general case; enabled only when MT4 is
// a diagonal type and MT5 is not. Only the diagonal element A(i,i) is read from A, so each
// element update is C(i,j) -= A(i,i) * B(i,j) * scalar.
//   C      - target matrix
//   A      - left-hand side diagonal operand
//   B      - right-hand side (non-diagonal) operand
//   scalar - scaling factor
// NOTE(review): listing line 7506 is not present in this listing.
7499  template< typename MT3 // Type of the left-hand side target matrix
7500  , typename MT4 // Type of the left-hand side matrix operand
7501  , typename MT5 // Type of the right-hand side matrix operand
7502  , typename ST2 > // Type of the scalar value
7503  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
7504  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7505  {
7507 
7508  const size_t M( A.rows() );
7509  const size_t N( B.columns() );
7510 
7511  for( size_t j=0UL; j<N; ++j )
7512  {
      // Row range [ibegin,iend) for column j, narrowed by the lower/upper (strict or not)
      // type traits of the right operand MT5.
7513  const size_t ibegin( ( IsLower<MT5>::value )
7514  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
7515  :( 0UL ) );
7516  const size_t iend( ( IsUpper<MT5>::value )
7517  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
7518  :( M ) );
7519  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7520 
      // ipos marks the end of the part of the range that is processed two rows at a time
      // (inum with its lowest bit cleared).
7521  const size_t inum( iend - ibegin );
7522  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
7523 
7524  for( size_t i=ibegin; i<ipos; i+=2UL ) {
7525  C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7526  C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
7527  }
      // Single leftover row when the range length is odd.
7528  if( ipos < iend ) {
7529  C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
7530  }
7531  }
7532  }
7533  //**********************************************************************************************
7534 
7535  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default subtraction-assignment kernel for the diagonal/diagonal case; enabled only when both
// MT4 and MT5 are diagonal types. Only the diagonal of C is touched:
// C(i,i) -= A(i,i) * B(i,i) * scalar for every row index i of A.
//   C      - target matrix
//   A      - left-hand side diagonal operand
//   B      - right-hand side diagonal operand
//   scalar - scaling factor
// NOTE(review): listing line 7556 is not present in this listing.
7549  template< typename MT3 // Type of the left-hand side target matrix
7550  , typename MT4 // Type of the left-hand side matrix operand
7551  , typename MT5 // Type of the right-hand side matrix operand
7552  , typename ST2 > // Type of the scalar value
7553  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
7554  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7555  {
7557 
7558  for( size_t i=0UL; i<A.rows(); ++i ) {
7559  C(i,i) -= A(i,i) * B(i,i) * scalar;
7560  }
7561  }
7562  //**********************************************************************************************
7563 
7564  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix subtraction-assignment kernel, fallback overload: simply forwards all arguments
// to selectDefaultSubAssignKernel().
//   C      - target matrix
//   A      - left-hand side multiplication operand
//   B      - right-hand side multiplication operand
//   scalar - scaling factor
// NOTE(review): listing line 7582 (presumably the return type / condition that selects this
// overload over the vectorized ones) is not present in this listing.
7578  template< typename MT3 // Type of the left-hand side target matrix
7579  , typename MT4 // Type of the left-hand side matrix operand
7580  , typename MT5 // Type of the right-hand side matrix operand
7581  , typename ST2 > // Type of the scalar value
7583  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7584  {
7585  selectDefaultSubAssignKernel( C, A, B, scalar );
7586  }
7587  //**********************************************************************************************
7588 
7589  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Small-matrix subtraction-assignment kernel for a row-major target (DenseMatrix<MT3,false>).
// No multiplication loop is written here: one operand is copied into a matrix of its
// OppositeType_ and the resulting product expression, scaled by scalar, is passed back to
// subAssign().
//   C      - row-major target matrix
//   A      - left-hand side multiplication operand
//   B      - right-hand side multiplication operand
//   scalar - scaling factor
// NOTE(review): listing lines 7608, 7611-7614, 7618 and 7622 are not present in this listing.
// Lines 7618 and 7622 are the `if` / `else if` headers of the first two branches below, so
// their conditions cannot be seen here.
7604  template< typename MT3 // Type of the left-hand side target matrix
7605  , typename MT4 // Type of the left-hand side matrix operand
7606  , typename MT5 // Type of the right-hand side matrix operand
7607  , typename ST2 > // Type of the scalar value
7609  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7610  {
7615 
7616  const ForwardFunctor fwd;
7617 
      // First branch (guard not visible): convert B.
7619  const OppositeType_<MT5> tmp( serial( B ) );
7620  subAssign( ~C, fwd( A * tmp ) * scalar );
7621  }
      // Second branch (guard not visible): convert A.
7623  const OppositeType_<MT4> tmp( serial( A ) );
7624  subAssign( ~C, fwd( tmp * B ) * scalar );
7625  }
      // Otherwise convert whichever operand has fewer elements (B on a tie).
7626  else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
7627  const OppositeType_<MT5> tmp( serial( B ) );
7628  subAssign( ~C, fwd( A * tmp ) * scalar );
7629  }
7630  else {
7631  const OppositeType_<MT4> tmp( serial( A ) );
7632  subAssign( ~C, fwd( tmp * B ) * scalar );
7633  }
7634  }
7635  //**********************************************************************************************
7636 
7637  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Vectorized small-matrix subtraction-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>): C -= scalar * A * B.
//   C      - column-major target matrix
//   A      - left-hand side multiplication operand (read with A.load(i,k))
//   B      - right-hand side multiplication operand (read element-wise, B(k,j))
//   scalar - scaling factor
// Structure: the rows of C are processed in panels of 8, 5, 4, 3, 2 and 1 SIMD vectors
// (SIMDSIZE rows each), followed by a scalar loop for the rows that do not fill a whole vector.
// For every panel the products are accumulated over k in SIMD registers, multiplied by the
// scalar factor and subtracted from the loaded values of C.
// The kbegin/kend expressions that recur in every section narrow the k range according to the
// lower/upper (strict or not) type traits of MT4 and MT5.
// LOW and UPP are flags defined outside this block.
// NOTE(review): the xmm accumulators are only default-constructed before "+=" is applied -
// presumably SIMDType default-initializes to zero; confirm.
// NOTE(review): listing lines 7656 (presumably return type / enable condition) and 7674 (the
// `if` that guards the 8-vector section) are not present in this listing.
7652  template< typename MT3 // Type of the left-hand side target matrix
7653  , typename MT4 // Type of the left-hand side matrix operand
7654  , typename MT5 // Type of the right-hand side matrix operand
7655  , typename ST2 > // Type of the scalar value
7657  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7658  {
      // A scalar remainder loop is needed unless both the target and the left operand are padded.
7659  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
7660 
7661  const size_t M( A.rows() );
7662  const size_t N( B.columns() );
7663  const size_t K( A.columns() );
7664 
7665  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7666 
      // End of the vectorized row range: M rounded down to a multiple of SIMDSIZE when a remainder
      // loop exists (checked by the assertion below), otherwise M itself.
7667  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
7668  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
7669 
      // SIMD value built from the scalar; multiplied into every accumulated result before storing.
7670  const SIMDType factor( set( scalar ) );
7671 
7672  size_t i( 0UL );
7673 
      // Section 1 (guard on listing line 7674 not visible): panels of 8 SIMD vectors, one column
      // of C per inner iteration. Runs only when neither LOW nor UPP is set.
7675  {
7676  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
7677  for( size_t j=0UL; j<N; ++j )
7678  {
7679  const size_t kbegin( ( IsLower<MT5>::value )
7680  ?( ( IsUpper<MT4>::value )
7681  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7682  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7683  :( IsUpper<MT4>::value ? i : 0UL ) );
7684  const size_t kend( ( IsUpper<MT5>::value )
7685  ?( ( IsLower<MT4>::value )
7686  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
7687  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
7688  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
7689 
7690  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7691 
7692  for( size_t k=kbegin; k<kend; ++k ) {
7693  const SIMDType b1( set( B(k,j) ) );
7694  xmm1 += A.load(i ,k) * b1;
7695  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7696  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7697  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7698  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7699  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
7700  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
7701  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
7702  }
7703 
7704  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7705  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7706  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7707  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7708  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
7709  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
7710  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
7711  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
7712  }
7713  }
7714  }
7715 
      // Section 2: panels of 5 SIMD vectors, two columns of C at a time plus one trailing column.
      // Runs only when neither LOW nor UPP is set.
7716  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
7717  {
7718  size_t j( 0UL );
7719 
7720  for( ; (j+2UL) <= N; j+=2UL )
7721  {
7722  const size_t kbegin( ( IsLower<MT5>::value )
7723  ?( ( IsUpper<MT4>::value )
7724  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7725  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7726  :( IsUpper<MT4>::value ? i : 0UL ) );
7727  const size_t kend( ( IsUpper<MT5>::value )
7728  ?( ( IsLower<MT4>::value )
7729  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7730  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7731  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
7732 
7733  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7734 
7735  for( size_t k=kbegin; k<kend; ++k ) {
7736  const SIMDType a1( A.load(i ,k) );
7737  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7738  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7739  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7740  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
7741  const SIMDType b1( set( B(k,j ) ) );
7742  const SIMDType b2( set( B(k,j+1UL) ) );
7743  xmm1 += a1 * b1;
7744  xmm2 += a2 * b1;
7745  xmm3 += a3 * b1;
7746  xmm4 += a4 * b1;
7747  xmm5 += a5 * b1;
7748  xmm6 += a1 * b2;
7749  xmm7 += a2 * b2;
7750  xmm8 += a3 * b2;
7751  xmm9 += a4 * b2;
7752  xmm10 += a5 * b2;
7753  }
7754 
7755  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7756  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7757  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7758  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7759  (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
7760  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
7761  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
7762  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
7763  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
7764  (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
7765  }
7766 
      // Trailing single column when N is odd.
7767  if( j < N )
7768  {
7769  const size_t kbegin( ( IsLower<MT5>::value )
7770  ?( ( IsUpper<MT4>::value )
7771  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7772  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7773  :( IsUpper<MT4>::value ? i : 0UL ) );
7774  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
7775 
7776  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7777 
7778  for( size_t k=kbegin; k<kend; ++k ) {
7779  const SIMDType b1( set( B(k,j) ) );
7780  xmm1 += A.load(i ,k) * b1;
7781  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7782  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7783  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7784  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
7785  }
7786 
7787  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7788  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7789  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7790  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7791  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
7792  }
7793  }
7794 
      // Section 3: panels of 4 SIMD vectors, two columns of C at a time plus one trailing column.
      // Runs only when neither LOW nor UPP is set.
7795  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7796  {
7797  size_t j( 0UL );
7798 
7799  for( ; (j+2UL) <= N; j+=2UL )
7800  {
7801  const size_t kbegin( ( IsLower<MT5>::value )
7802  ?( ( IsUpper<MT4>::value )
7803  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7804  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7805  :( IsUpper<MT4>::value ? i : 0UL ) );
7806  const size_t kend( ( IsUpper<MT5>::value )
7807  ?( ( IsLower<MT4>::value )
7808  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7809  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7810  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
7811 
7812  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7813 
7814  for( size_t k=kbegin; k<kend; ++k ) {
7815  const SIMDType a1( A.load(i ,k) );
7816  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7817  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7818  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7819  const SIMDType b1( set( B(k,j ) ) );
7820  const SIMDType b2( set( B(k,j+1UL) ) );
7821  xmm1 += a1 * b1;
7822  xmm2 += a2 * b1;
7823  xmm3 += a3 * b1;
7824  xmm4 += a4 * b1;
7825  xmm5 += a1 * b2;
7826  xmm6 += a2 * b2;
7827  xmm7 += a3 * b2;
7828  xmm8 += a4 * b2;
7829  }
7830 
7831  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7832  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7833  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7834  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
7835  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
7836  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
7837  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
7838  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
7839  }
7840 
      // Trailing single column when N is odd.
7841  if( j < N )
7842  {
7843  const size_t kbegin( ( IsLower<MT5>::value )
7844  ?( ( IsUpper<MT4>::value )
7845  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7846  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7847  :( IsUpper<MT4>::value ? i : 0UL ) );
7848  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
7849 
7850  SIMDType xmm1, xmm2, xmm3, xmm4;
7851 
7852  for( size_t k=kbegin; k<kend; ++k ) {
7853  const SIMDType b1( set( B(k,j) ) );
7854  xmm1 += A.load(i ,k) * b1;
7855  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7856  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7857  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
7858  }
7859 
7860  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7861  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7862  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7863  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
7864  }
7865  }
7866 
      // Section 4: panels of 3 SIMD vectors, two columns of C at a time plus one trailing column.
      // Runs only when neither LOW nor UPP is set.
7867  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
7868  {
7869  size_t j( 0UL );
7870 
7871  for( ; (j+2UL) <= N; j+=2UL )
7872  {
7873  const size_t kbegin( ( IsLower<MT5>::value )
7874  ?( ( IsUpper<MT4>::value )
7875  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7876  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7877  :( IsUpper<MT4>::value ? i : 0UL ) );
7878  const size_t kend( ( IsUpper<MT5>::value )
7879  ?( ( IsLower<MT4>::value )
7880  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7881  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7882  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
7883 
7884  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7885 
7886  for( size_t k=kbegin; k<kend; ++k ) {
7887  const SIMDType a1( A.load(i ,k) );
7888  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7889  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7890  const SIMDType b1( set( B(k,j ) ) );
7891  const SIMDType b2( set( B(k,j+1UL) ) );
7892  xmm1 += a1 * b1;
7893  xmm2 += a2 * b1;
7894  xmm3 += a3 * b1;
7895  xmm4 += a1 * b2;
7896  xmm5 += a2 * b2;
7897  xmm6 += a3 * b2;
7898  }
7899 
7900  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7901  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
7902  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
7903  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
7904  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
7905  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
7906  }
7907 
      // Trailing single column when N is odd.
7908  if( j < N )
7909  {
7910  const size_t kbegin( ( IsLower<MT5>::value )
7911  ?( ( IsUpper<MT4>::value )
7912  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7913  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7914  :( IsUpper<MT4>::value ? i : 0UL ) );
7915  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
7916 
7917  SIMDType xmm1, xmm2, xmm3;
7918 
7919  for( size_t k=kbegin; k<kend; ++k ) {
7920  const SIMDType b1( set( B(k,j) ) );
7921  xmm1 += A.load(i ,k) * b1;
7922  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
7923  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
7924  }
7925 
7926  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7927  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
7928  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
7929  }
7930  }
7931 
      // Section 5: panels of 2 SIMD vectors; skipped only when both LOW and UPP are set.
      // The column range is cut to [i, ...) when UPP is set and to [..., min(i+SIMDSIZE*2,N))
      // when LOW is set. Columns are handled in groups of 4, 3, 2 and finally 1.
7932  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7933  {
7934  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
7935  size_t j( UPP ? i : 0UL );
7936 
7937  for( ; (j+4UL) <= jend; j+=4UL )
7938  {
7939  const size_t kbegin( ( IsLower<MT5>::value )
7940  ?( ( IsUpper<MT4>::value )
7941  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7942  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7943  :( IsUpper<MT4>::value ? i : 0UL ) );
7944  const size_t kend( ( IsUpper<MT5>::value )
7945  ?( ( IsLower<MT4>::value )
7946  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
7947  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
7948  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
7949 
7950  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7951 
7952  for( size_t k=kbegin; k<kend; ++k ) {
7953  const SIMDType a1( A.load(i ,k) );
7954  const SIMDType a2( A.load(i+SIMDSIZE,k) );
7955  const SIMDType b1( set( B(k,j ) ) );
7956  const SIMDType b2( set( B(k,j+1UL) ) );
7957  const SIMDType b3( set( B(k,j+2UL) ) );
7958  const SIMDType b4( set( B(k,j+3UL) ) );
7959  xmm1 += a1 * b1;
7960  xmm2 += a2 * b1;
7961  xmm3 += a1 * b2;
7962  xmm4 += a2 * b2;
7963  xmm5 += a1 * b3;
7964  xmm6 += a2 * b3;
7965  xmm7 += a1 * b4;
7966  xmm8 += a2 * b4;
7967  }
7968 
7969  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7970  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
7971  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7972  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
7973  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
7974  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
7975  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
7976  (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
7977  }
7978 
7979  for( ; (j+3UL) <= jend; j+=3UL )
7980  {
7981  const size_t kbegin( ( IsLower<MT5>::value )
7982  ?( ( IsUpper<MT4>::value )
7983  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7984  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7985  :( IsUpper<MT4>::value ? i : 0UL ) );
7986  const size_t kend( ( IsUpper<MT5>::value )
7987  ?( ( IsLower<MT4>::value )
7988  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
7989  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
7990  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
7991 
7992  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7993 
7994  for( size_t k=kbegin; k<kend; ++k ) {
7995  const SIMDType a1( A.load(i ,k) );
7996  const SIMDType a2( A.load(i+SIMDSIZE,k) );
7997  const SIMDType b1( set( B(k,j ) ) );
7998  const SIMDType b2( set( B(k,j+1UL) ) );
7999  const SIMDType b3( set( B(k,j+2UL) ) );
8000  xmm1 += a1 * b1;
8001  xmm2 += a2 * b1;
8002  xmm3 += a1 * b2;
8003  xmm4 += a2 * b2;
8004  xmm5 += a1 * b3;
8005  xmm6 += a2 * b3;
8006  }
8007 
8008  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
8009  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
8010  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
8011  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
8012  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
8013  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
8014  }
8015 
      // Two columns: the k loop is additionally unrolled by two with separate accumulators
      // (xmm5..xmm8 for k+1) that are summed with xmm1..xmm4 at store time.
8016  for( ; (j+2UL) <= jend; j+=2UL )
8017  {
8018  const size_t kbegin( ( IsLower<MT5>::value )
8019  ?( ( IsUpper<MT4>::value )
8020  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8021  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8022  :( IsUpper<MT4>::value ? i : 0UL ) );
8023  const size_t kend( ( IsUpper<MT5>::value )
8024  ?( ( IsLower<MT4>::value )
8025  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8026  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8027  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
8028 
8029  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8030  size_t k( kbegin );
8031 
8032  for( ; (k+2UL) <= kend; k+=2UL ) {
8033  const SIMDType a1( A.load(i ,k ) );
8034  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8035  const SIMDType a3( A.load(i ,k+1UL) );
8036  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8037  const SIMDType b1( set( B(k ,j ) ) );
8038  const SIMDType b2( set( B(k ,j+1UL) ) );
8039  const SIMDType b3( set( B(k+1UL,j ) ) );
8040  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
8041  xmm1 += a1 * b1;
8042  xmm2 += a2 * b1;
8043  xmm3 += a1 * b2;
8044  xmm4 += a2 * b2;
8045  xmm5 += a3 * b3;
8046  xmm6 += a4 * b3;
8047  xmm7 += a3 * b4;
8048  xmm8 += a4 * b4;
8049  }
8050 
8051  for( ; k<kend; ++k ) {
8052  const SIMDType a1( A.load(i ,k) );
8053  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8054  const SIMDType b1( set( B(k,j ) ) );
8055  const SIMDType b2( set( B(k,j+1UL) ) );
8056  xmm1 += a1 * b1;
8057  xmm2 += a2 * b1;
8058  xmm3 += a1 * b2;
8059  xmm4 += a2 * b2;
8060  }
8061 
8062  (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
8063  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
8064  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - (xmm3+xmm7) * factor );
8065  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
8066  }
8067 
      // Last single column of the panel, again with the k loop unrolled by two.
8068  if( j < jend )
8069  {
8070  const size_t kbegin( ( IsLower<MT5>::value )
8071  ?( ( IsUpper<MT4>::value )
8072  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8073  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8074  :( IsUpper<MT4>::value ? i : 0UL ) );
8075  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
8076 
8077  SIMDType xmm1, xmm2, xmm3, xmm4;
8078  size_t k( kbegin );
8079 
8080  for( ; (k+2UL) <= kend; k+=2UL ) {
8081  const SIMDType b1( set( B(k ,j) ) );
8082  const SIMDType b2( set( B(k+1UL,j) ) );
8083  xmm1 += A.load(i ,k ) * b1;
8084  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8085  xmm3 += A.load(i ,k+1UL) * b2;
8086  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8087  }
8088 
8089  for( ; k<kend; ++k ) {
8090  const SIMDType b1( set( B(k,j) ) );
8091  xmm1 += A.load(i ,k) * b1;
8092  xmm2 += A.load(i+SIMDSIZE,k) * b1;
8093  }
8094 
8095  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
8096  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
8097  }
8098  }
8099 
      // Section 6: single SIMD vector panels for all remaining vectorizable rows.
      // The column range ends at min(i+SIMDSIZE,N) only when both LOW and UPP are set; it starts
      // at i when UPP is set. Every k loop here is unrolled by two with a cleanup iteration.
8100  for( ; i<ipos; i+=SIMDSIZE )
8101  {
8102  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
8103  size_t j( UPP ? i : 0UL );
8104 
8105  for( ; (j+4UL) <= jend; j+=4UL )
8106  {
8107  const size_t kbegin( ( IsLower<MT5>::value )
8108  ?( ( IsUpper<MT4>::value )
8109  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8110  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8111  :( IsUpper<MT4>::value ? i : 0UL ) );
8112  const size_t kend( ( IsUpper<MT5>::value )
8113  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
8114  :( K ) );
8115 
8116  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8117  size_t k( kbegin );
8118 
8119  for( ; (k+2UL) <= kend; k+=2UL ) {
8120  const SIMDType a1( A.load(i,k ) );
8121  const SIMDType a2( A.load(i,k+1UL) );
8122  xmm1 += a1 * set( B(k ,j ) );
8123  xmm2 += a1 * set( B(k ,j+1UL) );
8124  xmm3 += a1 * set( B(k ,j+2UL) );
8125  xmm4 += a1 * set( B(k ,j+3UL) );
8126  xmm5 += a2 * set( B(k+1UL,j ) );
8127  xmm6 += a2 * set( B(k+1UL,j+1UL) );
8128  xmm7 += a2 * set( B(k+1UL,j+2UL) );
8129  xmm8 += a2 * set( B(k+1UL,j+3UL) );
8130  }
8131 
8132  for( ; k<kend; ++k ) {
8133  const SIMDType a1( A.load(i,k) );
8134  xmm1 += a1 * set( B(k,j ) );
8135  xmm2 += a1 * set( B(k,j+1UL) );
8136  xmm3 += a1 * set( B(k,j+2UL) );
8137  xmm4 += a1 * set( B(k,j+3UL) );
8138  }
8139 
8140  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm5) * factor );
8141  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm6) * factor );
8142  (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm7) * factor );
8143  (~C).store( i, j+3UL, (~C).load(i,j+3UL) - (xmm4+xmm8) * factor );
8144  }
8145 
8146  for( ; (j+3UL) <= jend; j+=3UL )
8147  {
8148  const size_t kbegin( ( IsLower<MT5>::value )
8149  ?( ( IsUpper<MT4>::value )
8150  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8151  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8152  :( IsUpper<MT4>::value ? i : 0UL ) );
8153  const size_t kend( ( IsUpper<MT5>::value )
8154  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
8155  :( K ) );
8156 
8157  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8158  size_t k( kbegin );
8159 
8160  for( ; (k+2UL) <= kend; k+=2UL ) {
8161  const SIMDType a1( A.load(i,k ) );
8162  const SIMDType a2( A.load(i,k+1UL) );
8163  xmm1 += a1 * set( B(k ,j ) );
8164  xmm2 += a1 * set( B(k ,j+1UL) );
8165  xmm3 += a1 * set( B(k ,j+2UL) );
8166  xmm4 += a2 * set( B(k+1UL,j ) );
8167  xmm5 += a2 * set( B(k+1UL,j+1UL) );
8168  xmm6 += a2 * set( B(k+1UL,j+2UL) );
8169  }
8170 
8171  for( ; k<kend; ++k ) {
8172  const SIMDType a1( A.load(i,k) );
8173  xmm1 += a1 * set( B(k,j ) );
8174  xmm2 += a1 * set( B(k,j+1UL) );
8175  xmm3 += a1 * set( B(k,j+2UL) );
8176  }
8177 
8178  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm4) * factor );
8179  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm5) * factor );
8180  (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm6) * factor );
8181  }
8182 
8183  for( ; (j+2UL) <= jend; j+=2UL )
8184  {
8185  const size_t kbegin( ( IsLower<MT5>::value )
8186  ?( ( IsUpper<MT4>::value )
8187  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8188  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8189  :( IsUpper<MT4>::value ? i : 0UL ) );
8190  const size_t kend( ( IsUpper<MT5>::value )
8191  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8192  :( K ) );
8193 
8194  SIMDType xmm1, xmm2, xmm3, xmm4;
8195  size_t k( kbegin );
8196 
8197  for( ; (k+2UL) <= kend; k+=2UL ) {
8198  const SIMDType a1( A.load(i,k ) );
8199  const SIMDType a2( A.load(i,k+1UL) );
8200  xmm1 += a1 * set( B(k ,j ) );
8201  xmm2 += a1 * set( B(k ,j+1UL) );
8202  xmm3 += a2 * set( B(k+1UL,j ) );
8203  xmm4 += a2 * set( B(k+1UL,j+1UL) );
8204  }
8205 
8206  for( ; k<kend; ++k ) {
8207  const SIMDType a1( A.load(i,k) );
8208  xmm1 += a1 * set( B(k,j ) );
8209  xmm2 += a1 * set( B(k,j+1UL) );
8210  }
8211 
8212  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
8213  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm4) * factor );
8214  }
8215 
      // Last single column: no kend is computed here, the k loops run up to K.
8216  if( j < jend )
8217  {
8218  const size_t kbegin( ( IsLower<MT5>::value )
8219  ?( ( IsUpper<MT4>::value )
8220  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8221  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8222  :( IsUpper<MT4>::value ? i : 0UL ) );
8223 
8224  SIMDType xmm1, xmm2;
8225  size_t k( kbegin );
8226 
8227  for( ; (k+2UL) <= K; k+=2UL ) {
8228  xmm1 += A.load(i,k ) * set( B(k ,j) );
8229  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
8230  }
8231 
8232  for( ; k<K; ++k ) {
8233  xmm1 += A.load(i,k) * set( B(k,j) );
8234  }
8235 
8236  (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
8237  }
8238  }
8239 
      // Section 7: scalar loop over the rows from ipos to M, executed only when `remainder` is
      // true. Uses element access and ElementType accumulators instead of SIMD loads/stores.
      // The column range ends at i+1 when LOW is set and starts at i when UPP is set.
8240  for( ; remainder && i<M; ++i )
8241  {
8242  const size_t jend( LOW ? i+1UL : N );
8243  size_t j( UPP ? i : 0UL );
8244 
8245  for( ; (j+2UL) <= jend; j+=2UL )
8246  {
8247  const size_t kbegin( ( IsLower<MT5>::value )
8248  ?( ( IsUpper<MT4>::value )
8249  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8250  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8251  :( IsUpper<MT4>::value ? i : 0UL ) );
8252  const size_t kend( ( IsUpper<MT5>::value )
8253  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8254  :( K ) );
8255 
8256  ElementType value1{};
8257  ElementType value2{};
8258 
8259  for( size_t k=kbegin; k<kend; ++k ) {
8260  value1 += A(i,k) * B(k,j );
8261  value2 += A(i,k) * B(k,j+1UL);
8262  }
8263 
8264  (~C)(i,j ) -= value1 * scalar;
8265  (~C)(i,j+1UL) -= value2 * scalar;
8266  }
8267 
      // Last single column of the row.
8268  if( j < jend )
8269  {
8270  const size_t kbegin( ( IsLower<MT5>::value )
8271  ?( ( IsUpper<MT4>::value )
8272  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8273  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8274  :( IsUpper<MT4>::value ? i : 0UL ) );
8275 
8276  ElementType value{};
8277 
8278  for( size_t k=kbegin; k<K; ++k ) {
8279  value += A(i,k) * B(k,j);
8280  }
8281 
8282  (~C)(i,j) -= value * scalar;
8283  }
8284  }
8285  }
8286  //**********************************************************************************************
8287 
8288  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix kernel of the subtraction assignment, default (non-vectorized) variant.
// It performs no computation of its own: C, A, B and scalar are forwarded unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): the return-type line (8306) is absent from this listing; presumably it
// carries the condition that separates this overload from the vectorized overload with
// the same parameter list - confirm against the original header.
8302  template< typename MT3 // Type of the left-hand side target matrix
8303  , typename MT4 // Type of the left-hand side matrix operand
8304  , typename MT5 // Type of the right-hand side matrix operand
8305  , typename ST2 > // Type of the scalar value
8307  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8308  {
8309  selectDefaultSubAssignKernel( C, A, B, scalar );
8310  }
8311  //**********************************************************************************************
8312 
8313  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix kernel of the subtraction assignment, vectorized variant.
// Selects one of three multiplication routines from the flags LOW and UPP: lmmm() when
// LOW is set, ummm() when LOW is not set but UPP is, and mmm() in every other case.
// All three calls pass -scalar and ST2(1) as the last two arguments.
// NOTE(review): presumably these routines evaluate C = alpha*A*B + beta*C, so that
// alpha = -scalar with beta = 1 yields C -= scalar*A*B - confirm in blaze/math/dense/MMM.h.
// NOTE(review): the return-type line (8332) is absent from this listing.
8328  template< typename MT3 // Type of the left-hand side target matrix
8329  , typename MT4 // Type of the left-hand side matrix operand
8330  , typename MT5 // Type of the right-hand side matrix operand
8331  , typename ST2 > // Type of the scalar value
8333  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8334  {
8335  if( LOW )
8336  lmmm( C, A, B, -scalar, ST2(1) );
8337  else if( UPP )
8338  ummm( C, A, B, -scalar, ST2(1) );
8339  else
8340  mmm( C, A, B, -scalar, ST2(1) );
8341  }
8342  //**********************************************************************************************
8343 
8344  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// BLAS kernel of the subtraction assignment, default variant.
// Contains no BLAS call: C, A, B and scalar are forwarded unchanged to
// selectLargeSubAssignKernel().
// NOTE(review): the return-type line (8363) is absent from this listing; presumably it
// selects this overload whenever the BLAS-backed overload is not applicable - confirm.
8359  template< typename MT3 // Type of the left-hand side target matrix
8360  , typename MT4 // Type of the left-hand side matrix operand
8361  , typename MT5 // Type of the right-hand side matrix operand
8362  , typename ST2 > // Type of the scalar value
8364  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8365  {
8366  selectLargeSubAssignKernel( C, A, B, scalar );
8367  }
8368  //**********************************************************************************************
8369 
8370  //**BLAS-based subtraction assignment to dense matrices*****************************************
8371 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
8372 
// BLAS kernel of the subtraction assignment (only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled, see the surrounding #if).
// Three cases, decided by the operand types:
//   - A triangular: a temporary is built from B, multiplied by A via trmm(), then
//     subtracted from C with subAssign().
//   - B triangular: a temporary is built from A, multiplied by B via trmm(), then
//     subtracted from C with subAssign().
//   - otherwise:    a single gemm() call with the factors ET(-scalar) and ET(1).
// NOTE(review): the return-type line (8389) is absent from this listing.
8385  template< typename MT3 // Type of the left-hand side target matrix
8386  , typename MT4 // Type of the left-hand side matrix operand
8387  , typename MT5 // Type of the right-hand side matrix operand
8388  , typename ST2 > // Type of the scalar value
8390  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8391  {
      // Element type of the target; the scalar is converted to it before each BLAS call.
8392  using ET = ElementType_<MT3>;
8393 
8394  if( IsTriangular<MT4>::value ) {
      // trmm() is handed the temporary, so the positive scalar is used here and the
      // subtraction happens afterwards in subAssign().
      // NOTE(review): presumably trmm() overwrites tmp with scalar*A*tmp - confirm in blas/trmm.h.
8395  ResultType_<MT3> tmp( serial( B ) );
8396  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8397  subAssign( C, tmp );
8398  }
8399  else if( IsTriangular<MT5>::value ) {
      // Same scheme with the triangular operand applied from the right-hand side.
8400  ResultType_<MT3> tmp( serial( A ) );
8401  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8402  subAssign( C, tmp );
8403  }
8404  else {
      // NOTE(review): presumably gemm() computes C = alpha*A*B + beta*C, so the negated
      // scalar together with beta = 1 performs the subtraction directly - confirm in blas/gemm.h.
8405  gemm( C, A, B, ET(-scalar), ET(1) );
8406  }
8407  }
8408 #endif
8409  //**********************************************************************************************
8410 
8411  //**Restructuring subtraction assignment to row-major matrices**********************************
// Restructuring subtraction assignment of the scaled product to a row-major target
// (the target is taken as Matrix<MT,false>).
// The product is rewritten with one or both operands transposed, wrapped by the
// ForwardFunctor, scaled by rhs.scalar_ and passed on to subAssign() again.
// NOTE(review): the return-type line (8426), the lines 8429/8431 and the leading `if`
// on line 8441 are absent from this listing. The first subAssign() call below belongs to
// that missing `if`; presumably it tests that both operands are symmetric - confirm.
8425  template< typename MT > // Type of the target matrix
8427  subAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8428  {
8430 
8432 
8433  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8434  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8435 
8436  const ForwardFunctor fwd;
8437 
      // Operands of the wrapped matrix/matrix multiplication expression.
8438  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8439  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8440 
      // Branch of the missing `if`: both operands are transposed.
8442  subAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
      // Left operand type MT1 is symmetric: only the left operand is transposed.
8443  else if( IsSymmetric<MT1>::value )
8444  subAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
      // Remaining case: only the right operand is transposed.
8445  else
8446  subAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8447  }
8448  //**********************************************************************************************
8449 
8450  //**Subtraction assignment to sparse matrices***************************************************
8451  // No special implementation for the subtraction assignment to sparse matrices.
8452  //**********************************************************************************************
8453 
8454  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of the scaled product to a dense matrix.
// The expression rhs is first evaluated into a temporary of type ResultType (wrapped in
// serial()), and that temporary is then passed to schurAssign() together with the target.
// lhs: target dense matrix; rhs: scaled multiplication expression.
// In debug builds the two assertions check that target and expression have equal
// numbers of rows and columns.
// NOTE(review): lines 8470 and 8472-8474 are absent from this listing.
8466  template< typename MT // Type of the target dense matrix
8467  , bool SO > // Storage order of the target dense matrix
8468  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8469  {
8471 
8475 
8476  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8477  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8478 
8479  const ResultType tmp( serial( rhs ) );
8480  schurAssign( ~lhs, tmp );
8481  }
8482  //**********************************************************************************************
8483 
8484  //**Schur product assignment to sparse matrices*************************************************
8485  // No special implementation for the Schur product assignment to sparse matrices.
8486  //**********************************************************************************************
8487 
8488  //**Multiplication assignment to dense matrices*************************************************
8489  // No special implementation for the multiplication assignment to dense matrices.
8490  //**********************************************************************************************
8491 
8492  //**Multiplication assignment to sparse matrices************************************************
8493  // No special implementation for the multiplication assignment to sparse matrices.
8494  //**********************************************************************************************
8495 
8496  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the scaled product to a dense matrix.
// Steps: fetch both operands of the wrapped multiplication, handle the empty cases,
// evaluate the operands into A and B, then delegate to smpAssign() with the expression
// A * B * rhs.scalar_.
// NOTE(review): the return-type line (8513) and line 8516 are absent from this listing.
8511  template< typename MT // Type of the target dense matrix
8512  , bool SO > // Storage order of the target dense matrix
8514  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8515  {
8517 
8518  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8519  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8520 
8521  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8522  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8523 
      // A target without rows or columns has no elements to assign.
8524  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
8525  return;
8526  }
      // The left operand has no columns, i.e. the inner dimension of the product is zero:
      // the target is reset instead of running a multiplication.
8527  else if( left.columns() == 0UL ) {
8528  reset( ~lhs );
8529  return;
8530  }
8531 
8532  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8533  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8534 
      // Debug-only checks that the evaluated operands kept their sizes and fit the target.
8535  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8536  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8537  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8538  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8539  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8540  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8541 
8542  smpAssign( ~lhs, A * B * rhs.scalar_ );
8543  }
8544  //**********************************************************************************************
8545 
8546  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the scaled product to a sparse matrix (per the section banner).
// The expression rhs is evaluated into a temporary of type TmpType; the temporary is
// wrapped by the ForwardFunctor and passed to smpAssign() together with the target.
// NOTE(review): the declaration lines 8563-8564 (return type, function name, parameter
// list) as well as lines 8566, 8568 and 8570-8575 are absent from this listing; the
// definition of TmpType is presumably among them - confirm against the original header.
8561  template< typename MT // Type of the target sparse matrix
8562  , bool SO > // Storage order of the target sparse matrix
8565  {
8567 
8569 
8576 
8577  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8578  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8579 
8580  const ForwardFunctor fwd;
8581 
8582  const TmpType tmp( rhs );
8583  smpAssign( ~lhs, fwd( tmp ) );
8584  }
8585  //**********************************************************************************************
8586 
8587  //**Restructuring SMP assignment to row-major matrices******************************************
// Restructuring SMP assignment of the scaled product to a row-major target
// (the target is taken as Matrix<MT,false>).
// The product is rewritten with one or both operands transposed, wrapped by the
// ForwardFunctor, scaled by rhs.scalar_ and passed on to smpAssign() again.
// NOTE(review): the return-type line (8602), the lines 8605/8607 and the leading `if`
// on line 8617 are absent from this listing. The first smpAssign() call below belongs to
// that missing `if`; presumably it tests that both operands are symmetric - confirm.
8601  template< typename MT > // Type of the target matrix
8603  smpAssign( Matrix<MT,false>& lhs, const DMatScalarMultExpr& rhs )
8604  {
8606 
8608 
8609  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8610  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8611 
8612  const ForwardFunctor fwd;
8613 
      // Operands of the wrapped matrix/matrix multiplication expression.
8614  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8615  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8616 
      // Branch of the missing `if`: both operands are transposed.
8618  smpAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
      // Left operand type MT1 is symmetric: only the left operand is transposed.
8619  else if( IsSymmetric<MT1>::value )
8620  smpAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
      // Remaining case: only the right operand is transposed.
8621  else
8622  smpAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8623  }
8624  //**********************************************************************************************
8625 
8626  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the scaled product to a dense matrix (per the section banner).
// Steps: fetch both operands of the wrapped multiplication, return early for empty
// sizes, evaluate the operands into A and B, then delegate to smpAddAssign() with the
// expression A * B * rhs.scalar_.
// NOTE(review): the declaration lines 8643-8644 (return type, function name, parameter
// list) and line 8646 are absent from this listing.
8641  template< typename MT // Type of the target dense matrix
8642  , bool SO > // Storage order of the target dense matrix
8645  {
8647 
8648  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8649  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8650 
8651  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8652  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8653 
      // Nothing is added when the target is empty or the inner dimension is zero, so the
      // target is left untouched in all three cases.
8654  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8655  return;
8656  }
8657 
8658  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8659  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8660 
      // Debug-only checks that the evaluated operands kept their sizes and fit the target.
8661  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8662  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8663  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8664  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8665  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8666  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8667 
8668  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
8669  }
8670  //**********************************************************************************************
8671 
8672  //**Restructuring SMP addition assignment to row-major matrices*********************************
// Restructuring SMP addition assignment of the scaled product to a row-major target
// (per the section banner).
// The product is rewritten with one or both operands transposed, wrapped by the
// ForwardFunctor, scaled by rhs.scalar_ and passed on to smpAddAssign() again.
// NOTE(review): the declaration lines 8688-8689, the lines 8691/8693 and the leading
// `if` on line 8703 are absent from this listing. The first smpAddAssign() call below
// belongs to that missing `if`; presumably it tests that both operands are symmetric.
8687  template< typename MT > // Type of the target matrix
8690  {
8692 
8694 
8695  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8696  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8697 
8698  const ForwardFunctor fwd;
8699 
      // Operands of the wrapped matrix/matrix multiplication expression.
8700  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8701  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8702 
      // Branch of the missing `if`: both operands are transposed.
8704  smpAddAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
      // Left operand type MT1 is symmetric: only the left operand is transposed.
8705  else if( IsSymmetric<MT1>::value )
8706  smpAddAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
      // Remaining case: only the right operand is transposed.
8707  else
8708  smpAddAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8709  }
8710  //**********************************************************************************************
8711 
8712  //**SMP addition assignment to sparse matrices**************************************************
8713  // No special implementation for the SMP addition assignment to sparse matrices.
8714  //**********************************************************************************************
8715 
8716  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the scaled product to a dense matrix (per the section
// banner).
// Steps: fetch both operands of the wrapped multiplication, return early for empty
// sizes, evaluate the operands into A and B, then delegate to smpSubAssign() with the
// expression A * B * rhs.scalar_.
// NOTE(review): the declaration lines 8733-8734 (return type, function name, parameter
// list) and line 8736 are absent from this listing.
8731  template< typename MT // Type of the target dense matrix
8732  , bool SO > // Storage order of the target dense matrix
8735  {
8737 
8738  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8739  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8740 
8741  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8742  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8743 
      // Nothing is subtracted when the target is empty or the inner dimension is zero, so
      // the target is left untouched in all three cases.
8744  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8745  return;
8746  }
8747 
8748  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8749  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8750 
      // Debug-only checks that the evaluated operands kept their sizes and fit the target.
8751  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8752  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8753  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8754  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8755  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8756  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8757 
8758  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
8759  }
8760  //**********************************************************************************************
8761 
8762  //**Restructuring SMP subtraction assignment to row-major matrices******************************
// Restructuring SMP subtraction assignment of the scaled product to a row-major target
// (per the section banner).
// The product is rewritten with one or both operands transposed, wrapped by the
// ForwardFunctor, scaled by rhs.scalar_ and passed on to smpSubAssign() again.
// NOTE(review): the declaration lines 8778-8779, the lines 8781/8783 and the leading
// `if` on line 8793 are absent from this listing. The first smpSubAssign() call below
// belongs to that missing `if`; presumably it tests that both operands are symmetric.
8777  template< typename MT > // Type of the target matrix
8780  {
8782 
8784 
8785  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8786  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8787 
8788  const ForwardFunctor fwd;
8789 
      // Operands of the wrapped matrix/matrix multiplication expression.
8790  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8791  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8792 
      // Branch of the missing `if`: both operands are transposed.
8794  smpSubAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
      // Left operand type MT1 is symmetric: only the left operand is transposed.
8795  else if( IsSymmetric<MT1>::value )
8796  smpSubAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
      // Remaining case: only the right operand is transposed.
8797  else
8798  smpSubAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8799  }
8800  //**********************************************************************************************
8801 
8802  //**SMP subtraction assignment to sparse matrices***********************************************
8803  // No special implementation for the SMP subtraction assignment to sparse matrices.
8804  //**********************************************************************************************
8805 
8806  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the scaled product to a dense matrix.
// The expression rhs is first evaluated into a temporary of type ResultType, and that
// temporary is then passed to smpSchurAssign() together with the target.
// Unlike schurAssign() in this class, the temporary is built from rhs directly, without
// the serial() wrapper.
// lhs: target dense matrix; rhs: scaled multiplication expression.
// NOTE(review): lines 8822 and 8824-8826 are absent from this listing.
8818  template< typename MT // Type of the target dense matrix
8819  , bool SO > // Storage order of the target dense matrix
8820  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8821  {
8823 
8827 
8828  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8829  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8830 
8831  const ResultType tmp( rhs );
8832  smpSchurAssign( ~lhs, tmp );
8833  }
8834  //**********************************************************************************************
8835 
8836  //**SMP Schur product assignment to sparse matrices*********************************************
8837  // No special implementation for the SMP Schur product assignment to sparse matrices.
8838  //**********************************************************************************************
8839 
8840  //**SMP multiplication assignment to dense matrices*********************************************
8841  // No special implementation for the SMP multiplication assignment to dense matrices.
8842  //**********************************************************************************************
8843 
8844  //**SMP multiplication assignment to sparse matrices********************************************
8845  // No special implementation for the SMP multiplication assignment to sparse matrices.
8846  //**********************************************************************************************
8847 
8848  //**Compile time checks*************************************************************************
8857  //**********************************************************************************************
8858 };
8860 //*************************************************************************************************
8861 
8862 
8863 
8864 
8865 //=================================================================================================
8866 //
8867 // GLOBAL BINARY ARITHMETIC OPERATORS
8868 //
8869 //=================================================================================================
8870 
8871 //*************************************************************************************************
// Multiplication operator for two dense matrices that are both passed as
// DenseMatrix<...,true>.
// lhs: left-hand side operand; rhs: right-hand side operand.
// Returns an object of type ReturnType constructed from both operands.
// Raises an exception through BLAZE_THROW_INVALID_ARGUMENT ("Matrix sizes do not match")
// when the number of columns of lhs differs from the number of rows of rhs.
// NOTE(review): line 8903 and the declaration of ReturnType (line 8909) are absent from
// this listing; presumably ReturnType is a TDMatTDMatMultExpr instantiation - confirm.
8898 template< typename MT1 // Type of the left-hand side dense matrix
8899  , typename MT2 > // Type of the right-hand side dense matrix
8900 inline decltype(auto)
8901  operator*( const DenseMatrix<MT1,true>& lhs, const DenseMatrix<MT2,true>& rhs )
8902 {
8904 
      // The inner dimensions of the product have to agree.
8905  if( (~lhs).columns() != (~rhs).rows() ) {
8906  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
8907  }
8908 
8910  return ReturnType( ~lhs, ~rhs );
8911 }
8912 //*************************************************************************************************
8913 
8914 
8915 
8916 
8917 //=================================================================================================
8918 //
8919 // GLOBAL FUNCTIONS
8920 //
8921 //=================================================================================================
8922 
8923 //*************************************************************************************************
// declsym() overload for a dense matrix multiplication expression.
// dm: the multiplication expression to be declared symmetric.
// Returns a new object of type ReturnType built from the left and right operand of dm.
// Raises an exception through BLAZE_THROW_INVALID_ARGUMENT ("Invalid symmetric matrix
// specification") when dm is not square.
// NOTE(review): line 8954 and the declaration of ReturnType (line 8960) are absent from
// this listing; presumably ReturnType is the same expression type with the symmetry
// flag SF switched on - confirm against the original header.
8946 template< typename MT1 // Type of the left-hand side dense matrix
8947  , typename MT2 // Type of the right-hand side dense matrix
8948  , bool SF // Symmetry flag
8949  , bool HF // Hermitian flag
8950  , bool LF // Lower flag
8951  , bool UF > // Upper flag
8952 inline decltype(auto) declsym( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8953 {
8955 
8956  if( !isSquare( dm ) ) {
8957  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
8958  }
8959 
8961  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8962 }
8964 //*************************************************************************************************
8965 
8966 
8967 //*************************************************************************************************
// declherm() overload for a dense matrix multiplication expression.
// dm: the multiplication expression to be declared Hermitian.
// Returns a new object of type ReturnType built from the left and right operand of dm.
// Raises an exception through BLAZE_THROW_INVALID_ARGUMENT ("Invalid Hermitian matrix
// specification") when dm is not square.
// NOTE(review): line 8998 and the declaration of ReturnType (line 9004) are absent from
// this listing; presumably ReturnType is the same expression type with the Hermitian
// flag HF switched on - confirm against the original header.
8990 template< typename MT1 // Type of the left-hand side dense matrix
8991  , typename MT2 // Type of the right-hand side dense matrix
8992  , bool SF // Symmetry flag
8993  , bool HF // Hermitian flag
8994  , bool LF // Lower flag
8995  , bool UF > // Upper flag
8996 inline decltype(auto) declherm( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8997 {
8999 
9000  if( !isSquare( dm ) ) {
9001  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
9002  }
9003 
9005  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9006 }
9008 //*************************************************************************************************
9009 
9010 
9011 //*************************************************************************************************
// decllow() overload for a dense matrix multiplication expression.
// dm: the multiplication expression to be declared lower.
// Returns a new object of type ReturnType built from the left and right operand of dm.
// Raises an exception through BLAZE_THROW_INVALID_ARGUMENT ("Invalid lower matrix
// specification") when dm is not square.
// NOTE(review): line 9042 and the declaration of ReturnType (line 9048) are absent from
// this listing; presumably ReturnType is the same expression type with the lower flag
// LF switched on - confirm against the original header.
9034 template< typename MT1 // Type of the left-hand side dense matrix
9035  , typename MT2 // Type of the right-hand side dense matrix
9036  , bool SF // Symmetry flag
9037  , bool HF // Hermitian flag
9038  , bool LF // Lower flag
9039  , bool UF > // Upper flag
9040 inline decltype(auto) decllow( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9041 {
9043 
9044  if( !isSquare( dm ) ) {
9045  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9046  }
9047 
9049  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9050 }
9052 //*************************************************************************************************
9053 
9054 
9055 //*************************************************************************************************
// declupp() overload for a dense matrix multiplication expression.
// dm: the multiplication expression to be declared upper.
// Returns a new object of type ReturnType built from the left and right operand of dm.
// Raises an exception through BLAZE_THROW_INVALID_ARGUMENT ("Invalid upper matrix
// specification") when dm is not square.
// NOTE(review): line 9086 and the declaration of ReturnType (line 9092) are absent from
// this listing; presumably ReturnType is the same expression type with the upper flag
// UF switched on - confirm against the original header.
9078 template< typename MT1 // Type of the left-hand side dense matrix
9079  , typename MT2 // Type of the right-hand side dense matrix
9080  , bool SF // Symmetry flag
9081  , bool HF // Hermitian flag
9082  , bool LF // Lower flag
9083  , bool UF > // Upper flag
9084 inline decltype(auto) declupp( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9085 {
9087 
9088  if( !isSquare( dm ) ) {
9089  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9090  }
9091 
9093  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9094 }
9096 //*************************************************************************************************
9097 
9098 
9099 //*************************************************************************************************
// decldiag() overload for a dense matrix multiplication expression.
// dm: the multiplication expression to be declared diagonal.
// Returns a new object of type ReturnType built from the left and right operand of dm.
// Raises an exception through BLAZE_THROW_INVALID_ARGUMENT ("Invalid diagonal matrix
// specification") when dm is not square.
// NOTE(review): line 9130 and the declaration of ReturnType (line 9136) are absent from
// this listing; presumably ReturnType is the same expression type with both the lower
// and the upper flag switched on - confirm against the original header.
9122 template< typename MT1 // Type of the left-hand side dense matrix
9123  , typename MT2 // Type of the right-hand side dense matrix
9124  , bool SF // Symmetry flag
9125  , bool HF // Hermitian flag
9126  , bool LF // Lower flag
9127  , bool UF > // Upper flag
9128 inline decltype(auto) decldiag( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9129 {
9131 
9132  if( !isSquare( dm ) ) {
9133  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
9134  }
9135 
9137  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9138 }
9140 //*************************************************************************************************
9141 
9142 
9143 
9144 
9145 //=================================================================================================
9146 //
9147 // SIZE SPECIALIZATIONS
9148 //
9149 //=================================================================================================
9150 
9151 //*************************************************************************************************
// Size specialization for index 0UL of a multiplication expression: the value is taken
// over from Size<MT1,0UL>, i.e. from the left-hand side operand.
// NOTE(review): presumably index 0 denotes the number of rows - confirm in the Size trait.
9153 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9154 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
9155  : public Size<MT1,0UL>
9156 {};
9157 
// Size specialization for index 1UL of a multiplication expression: the value is taken
// over from Size<MT2,1UL>, i.e. from the right-hand side operand.
// NOTE(review): presumably index 1 denotes the number of columns - confirm in the Size trait.
9158 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9159 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
9160  : public Size<MT2,1UL>
9161 {};
9163 //*************************************************************************************************
9164 
9165 
9166 
9167 
9168 //=================================================================================================
9169 //
9170 // ISALIGNED SPECIALIZATIONS
9171 //
9172 //=================================================================================================
9173 
9174 //*************************************************************************************************
// IsAligned specialization: a multiplication expression counts as aligned only when
// IsAligned holds for both operand types MT1 and MT2.
9176 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9177 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9178  : public And< IsAligned<MT1>, IsAligned<MT2> >
9179 {};
9181 //*************************************************************************************************
9182 
9183 
9184 
9185 
9186 //=================================================================================================
9187 //
9188 // ISSYMMETRIC SPECIALIZATIONS
9189 //
9190 //=================================================================================================
9191 
9192 //*************************************************************************************************
// IsSymmetric specialization: the expression is symmetric when any of these holds:
//   - the symmetry flag SF is set,
//   - the Hermitian flag HF is set and the element type of the expression is a builtin
//     type (NOTE(review): presumably builtin excludes complex types, for which Hermitian
//     and symmetric differ - confirm in the IsBuiltin trait),
//   - both the lower flag LF and the upper flag UF are set (a matrix that is lower and
//     upper at the same time is diagonal).
9194 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9195 struct IsSymmetric< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9196  : public Or< Bool<SF>
9197  , And< Bool<HF>
9198  , IsBuiltin< ElementType_< TDMatTDMatMultExpr<MT1,MT2,false,true,false,false> > > >
9199  , And< Bool<LF>, Bool<UF> > >
9200 {};
9202 //*************************************************************************************************
9203 
9204 
9205 
9206 
9207 //=================================================================================================
9208 //
9209 // ISHERMITIAN SPECIALIZATIONS
9210 //
9211 //=================================================================================================
9212 
9213 //*************************************************************************************************
// IsHermitian partial specialization for expressions whose Hermitian flag is fixed to
// true: such expressions derive from TrueType regardless of the other flags.
9215 template< typename MT1, typename MT2, bool SF, bool LF, bool UF >
9216 struct IsHermitian< TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
9217  : public TrueType
9218 {};
9220 //*************************************************************************************************
9221 
9222 
9223 
9224 
9225 //=================================================================================================
9226 //
9227 // ISLOWER SPECIALIZATIONS
9228 //
9229 //=================================================================================================
9230 
9231 //*************************************************************************************************
// IsLower specialization: the expression is lower when any of these holds:
//   - the lower flag LF is set,
//   - both operand types are lower,
//   - the expression is flagged symmetric (SF) or Hermitian (HF) and both operand types
//     are upper (an upper result that is also symmetric/Hermitian is diagonal).
9233 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9234 struct IsLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9235  : public Or< Bool<LF>
9236  , And< IsLower<MT1>, IsLower<MT2> >
9237  , And< Or< Bool<SF>, Bool<HF> >
9238  , IsUpper<MT1>, IsUpper<MT2> > >
9239 {};
9241 //*************************************************************************************************
9242 
9243 
9244 
9245 
9246 //=================================================================================================
9247 //
9248 // ISUNILOWER SPECIALIZATIONS
9249 //
9250 //=================================================================================================
9251 
9252 //*************************************************************************************************
// IsUniLower specialization: the expression is unilower when either
//   - both operand types are unilower, or
//   - the expression is flagged symmetric (SF) or Hermitian (HF) and both operand types
//     are uniupper.
// The lower flag LF alone does not make the expression unilower.
9254 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9255 struct IsUniLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9256  : public Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
9257  , And< Or< Bool<SF>, Bool<HF> >
9258  , IsUniUpper<MT1>, IsUniUpper<MT2> > >
9259 {};
9261 //*************************************************************************************************
9262 
9263 
9264 
9265 
9266 //=================================================================================================
9267 //
9268 // ISSTRICTLYLOWER SPECIALIZATIONS
9269 //
9270 //=================================================================================================
9271 
9272 //*************************************************************************************************
// IsStrictlyLower specialization: the expression is strictly lower when any of these holds:
//   - MT1 is strictly lower and MT2 is lower,
//   - MT2 is strictly lower and MT1 is lower,
//   - the expression is flagged symmetric (SF) or Hermitian (HF) and one operand type is
//     strictly upper while the other one is upper.
9274 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9275 struct IsStrictlyLower< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9276  : public Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9277  , And< IsStrictlyLower<MT2>, IsLower<MT1> >
9278  , And< Or< Bool<SF>, Bool<HF> >
9279  , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9280  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >
9281 {};
9283 //*************************************************************************************************
9284 
9285 
9286 
9287 
9288 //=================================================================================================
9289 //
9290 // ISUPPER SPECIALIZATIONS
9291 //
9292 //=================================================================================================
9293 
9294 //*************************************************************************************************
// IsUpper specialization: the expression is upper when any of these holds:
//   - the upper flag UF is set,
//   - both operand types are upper,
//   - the expression is flagged symmetric (SF) or Hermitian (HF) and both operand types
//     are lower (a lower result that is also symmetric/Hermitian is diagonal).
9296 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9297 struct IsUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9298  : public Or< Bool<UF>
9299  , And< IsUpper<MT1>, IsUpper<MT2> >
9300  , And< Or< Bool<SF>, Bool<HF> >
9301  , IsLower<MT1>, IsLower<MT2> > >
9302 {};
9304 //*************************************************************************************************
9305 
9306 
9307 
9308 
9309 //=================================================================================================
9310 //
9311 // ISUNIUPPER SPECIALIZATIONS
9312 //
9313 //=================================================================================================
9314 
9315 //*************************************************************************************************
// IsUniUpper specialization: the expression is uniupper when either
//   - both operand types are uniupper, or
//   - the expression is flagged symmetric (SF) or Hermitian (HF) and both operand types
//     are unilower.
// The upper flag UF alone does not make the expression uniupper.
9317 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9318 struct IsUniUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9319  : public Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
9320  , And< Or< Bool<SF>, Bool<HF> >
9321  , IsUniLower<MT1>, IsUniLower<MT2> > >
9322 {};
9324 //*************************************************************************************************
9325 
9326 
9327 
9328 
9329 //=================================================================================================
9330 //
9331 // ISSTRICTLYUPPER SPECIALIZATIONS
9332 //
9333 //=================================================================================================
9334 
9335 //*************************************************************************************************
// IsStrictlyUpper specialization: the expression is strictly upper when any of these holds:
//   - MT1 is strictly upper and MT2 is upper,
//   - MT2 is strictly upper and MT1 is upper,
//   - the expression is flagged symmetric (SF) or Hermitian (HF) and one operand type is
//     strictly lower while the other one is lower.
9337 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9338 struct IsStrictlyUpper< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9339  : public Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9340  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
9341  , And< Or< Bool<SF>, Bool<HF> >
9342  , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9343  , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >
9344 {};
9346 //*************************************************************************************************
9347 
9348 } // namespace blaze
9349 
9350 #endif
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:131
Data type constraint.
Headerfile for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:71
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:196
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:487
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:475
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:295
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:544
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:153
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:286
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:617
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:534
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
Header file for the IsIntegral type trait.
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1903
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:321
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:87
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1026
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Expression object for transpose dense matrix-transpose dense matrix multiplications.The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:154
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:278
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1950
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:421
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Flag for upper matrices.
Definition: TDMatTDMatMultExpr.h:177
Header file for the IsComplexDouble type trait.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:277
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:281
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Compile time check for the memory layout of data types.This type trait tests whether the given data t...
Definition: IsContiguous.h:86
Headerfile for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:431
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:110
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:107
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:279
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:455
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1026
Header file for the IsLower type trait.
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:158
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
Flag for lower matrices.
Definition: TDMatTDMatMultExpr.h:176
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:86
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsStrictlyTriangular type trait.
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:280
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:616
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:430
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:488
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:107
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
Header file for the conjugate shim.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:157
Compile time check for resizable data types.This type trait tests whether the given data type is a re...
Definition: IsResizable.h:75
System settings for the BLAS mode.
Header file for the IsSIMDCombinable type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:411
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Flag for symmetric matrices.
Definition: TDMatTDMatMultExpr.h:174
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:292
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:156
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:131
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:385
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time type negation.The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1028
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Base class for matrices.The Matrix class is a base class for all dense and sparse matrix classes with...
Definition: Forward.h:101
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:154
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:789
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1028
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time logical 'or' evaluation. The Or alias declaration performs at compile time a logical 'or'...
Definition: Or.h:76
Compile time evaluation of the size of vectors and matrices.The Size type trait evaluates the size of...
Definition: Size.h:80
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:401
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:443
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:336
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation. The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:155
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:465
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
Flag for Hermitian matrices.
Definition: TDMatTDMatMultExpr.h:175
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:908
Header file for the IsResizable type trait.
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:283
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:282
Header file for the IsExpression type trait class.
Header file for the function trace functionality.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:289