DMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
53 #include <blaze/math/dense/MMM.h>
54 #include <blaze/math/Exception.h>
69 #include <blaze/math/shims/Reset.h>
71 #include <blaze/math/SIMD.h>
97 #include <blaze/system/BLAS.h>
98 #include <blaze/system/Blocking.h>
99 #include <blaze/system/Debugging.h>
101 #include <blaze/system/Thresholds.h>
104 #include <blaze/util/Assert.h>
105 #include <blaze/util/Complex.h>
108 #include <blaze/util/DisableIf.h>
109 #include <blaze/util/EnableIf.h>
112 #include <blaze/util/InvalidType.h>
113 #include <blaze/util/mpl/And.h>
114 #include <blaze/util/mpl/Bool.h>
115 #include <blaze/util/mpl/If.h>
116 #include <blaze/util/mpl/Not.h>
117 #include <blaze/util/mpl/Or.h>
118 #include <blaze/util/TrueType.h>
119 #include <blaze/util/Types.h>
128 
129 
130 namespace blaze {
131 
132 //=================================================================================================
133 //
134 // CLASS DMATDMATMULTEXPR
135 //
136 //=================================================================================================
137 
138 //*************************************************************************************************
145 template< typename MT1 // Type of the left-hand side dense matrix
146  , typename MT2 // Type of the right-hand side dense matrix
147  , bool SF // Symmetry flag
148  , bool HF // Hermitian flag
149  , bool LF // Lower flag
150  , bool UF > // Upper flag
152  : public MatMatMultExpr< DenseMatrix< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
153  , private Computation
154 {
155  private:
156  //**Type definitions****************************************************************************
163  //**********************************************************************************************
164 
165  //**********************************************************************************************
// Compile-time flag for the left-hand side operand: true when MT1 is a computation
// (IsComputation) or requires an intermediate evaluation (RequiresEvaluation).
167  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
168  //**********************************************************************************************
169 
170  //**********************************************************************************************
// Compile-time flag for the right-hand side operand: true when MT2 is a computation
// (IsComputation) or requires an intermediate evaluation (RequiresEvaluation).
172  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
173  //**********************************************************************************************
174 
175  //**********************************************************************************************
// Effective structure flags derived from the SF/HF/LF/UF template arguments:
//  - SYM  is set only if SF is set and none of HF, LF, UF is set.
//  - HERM is set only if HF is set and neither LF nor UF is set.
//  - LOW  is set if LF is set, or if SF/HF is combined with UF.
//  - UPP  is set if UF is set, or if SF/HF is combined with LF.
// LOW and UPP can therefore both be true at the same time.
177  enum : bool {
178  SYM = ( SF && !( HF || LF || UF ) ),
179  HERM = ( HF && !( LF || UF ) ),
180  LOW = ( LF || ( ( SF || HF ) && UF ) ),
181  UPP = ( UF || ( ( SF || HF ) && LF ) )
182  };
183  //**********************************************************************************************
184 
185  //**********************************************************************************************
187 
// Helper trait over three types exposing a boolean 'value'.
// NOTE(review): only the first term of the condition (IsColumnMajorMatrix<T1>) is visible;
// listing line 196 with the remaining terms is missing from this extract, so the full
// condition must be restored from the original header before this struct is usable.
193  template< typename T1, typename T2, typename T3 >
194  struct CanExploitSymmetry {
195  enum : bool { value = IsColumnMajorMatrix<T1>::value &&
197  };
199  //**********************************************************************************************
200 
201  //**********************************************************************************************
203 
// Helper trait over three types: 'value' is true when at least one operand needs an
// evaluation (evaluateLeft or evaluateRight) and CanExploitSymmetry<T1,T2,T3> does not hold.
207  template< typename T1, typename T2, typename T3 >
208  struct IsEvaluationRequired {
209  enum : bool { value = ( evaluateLeft || evaluateRight ) &&
210  !CanExploitSymmetry<T1,T2,T3>::value };
211  };
213  //**********************************************************************************************
214 
215  //**********************************************************************************************
217 
// Helper trait deciding whether a BLAS kernel is selected for the three given types.
// Visible requirements: BLAS mode and BLAS matrix/matrix multiplication are enabled, none of
// the SYM/HERM/LOW/UPP flags is set, all three types are SIMD enabled, and the element types
// of T1 and T3 are the same.
// NOTE(review): listing lines 224-227 and 229-232 are missing from this extract, so further
// conditions of the original expression are not shown here.
220  template< typename T1, typename T2, typename T3 >
221  struct UseBlasKernel {
222  enum : bool { value = BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
223  !SYM && !HERM && !LOW && !UPP &&
228  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
233  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
234  };
236  //**********************************************************************************************
237 
238  //**********************************************************************************************
240 
// Helper trait deciding whether the vectorized default kernel is selected for the three types.
// Visible requirements: useOptimizedKernels is set and all three types are SIMD enabled.
// NOTE(review): listing lines 246, 248-249 and 251-252 are missing from this extract; the
// expression below is therefore incomplete and must be restored from the original header.
243  template< typename T1, typename T2, typename T3 >
244  struct UseVectorizedDefaultKernel {
245  enum : bool { value = useOptimizedKernels &&
247  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
250  , ElementType_<T3> >::value &&
253  };
255  //**********************************************************************************************
256 
257  //**********************************************************************************************
259 
// Functor type chosen from the effective structure flags, checked in this order:
//   HERM            -> DeclHerm
//   SYM             -> DeclSym
//   LOW and UPP     -> DeclDiag
//   LOW only        -> DeclLow
//   UPP only        -> DeclUpp
//   none of these   -> Noop
262  using ForwardFunctor = IfTrue_< HERM
263  , DeclHerm
264  , IfTrue_< SYM
265  , DeclSym
266  , IfTrue_< LOW
267  , IfTrue_< UPP
268  , DeclDiag
269  , DeclLow >
270  , IfTrue_< UPP
271  , DeclUpp
272  , Noop > > > >;
274  //**********************************************************************************************
275 
276  public:
277  //**Type definitions****************************************************************************
280 
// Return type of the element access functions: a constant ElementType.
// NOTE(review): ElementType and ResultType are declared on listing lines that are not part of
// this extract.
286  using ReturnType = const ElementType;
// Type used when this expression is itself an operand of another expression.
287  using CompositeType = const ResultType;
288 
// Left operand storage: held by value if MT1 is an expression, by const reference otherwise.
290  using LeftOperand = If_< IsExpression<MT1>, const MT1, const MT1& >;
291 
// Right operand storage: held by value if MT2 is an expression, by const reference otherwise.
293  using RightOperand = If_< IsExpression<MT2>, const MT2, const MT2& >;
294 
297 
300  //**********************************************************************************************
301 
302  //**Compilation flags***************************************************************************
// SIMD flag of the expression. Visible requirements: MT2 is not diagonal and both operand
// types are SIMD enabled.
// NOTE(review): listing lines 306-307 are missing, so this enum is truncated in this extract.
304  enum : bool { simdEnabled = !IsDiagonal<MT2>::value &&
305  MT1::simdEnabled && MT2::simdEnabled &&
308 
// SMP flag: set only if neither operand needs evaluation and both operand types are
// SMP assignable.
310  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
311  !evaluateRight && MT2::smpAssignable };
312  //**********************************************************************************************
313 
314  //**SIMD properties*****************************************************************************
// Number of elements per SIMD vector for ElementType, as reported by SIMDTrait.
316  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
317  //**********************************************************************************************
318 
319  //**Constructor*********************************************************************************
325  explicit inline DMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
326  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
327  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
328  {
329  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
330  }
331  //**********************************************************************************************
332 
333  //**Access operator*****************************************************************************
// Element access: computes the (i,j) element of the product from the two operands.
// The indices are only verified through BLAZE_INTERNAL_ASSERT.
// NOTE(review): the listing jumps from line 349 to line 351, so the branch header that opens
// the begin/end block (closed at listing line 373) is missing from this extract. As shown the
// braces do not balance; restore that line from the original header.
340  inline ReturnType operator()( size_t i, size_t j ) const {
341  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
342  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
343 
344  if( IsDiagonal<MT1>::value ) {
// Diagonal left operand: only lhs_(i,i) contributes to row i.
345  return lhs_(i,i) * rhs_(i,j);
346  }
347  else if( IsDiagonal<MT2>::value ) {
// Diagonal right operand: only rhs_(j,j) contributes to column j.
348  return lhs_(i,j) * rhs_(j,j);
349  }
// First summation index, narrowed by the upper structure of MT1 and the lower structure of
// MT2 (0 if neither applies).
351  const size_t begin( ( IsUpper<MT1>::value )
352  ?( ( IsLower<MT2>::value )
353  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
354  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
355  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
356  :( ( IsLower<MT2>::value )
357  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
358  :( 0UL ) ) );
// One-past-last summation index, narrowed by the lower structure of MT1 and the upper
// structure of MT2 (lhs_.columns() if neither applies).
359  const size_t end( ( IsLower<MT1>::value )
360  ?( ( IsUpper<MT2>::value )
361  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
362  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
363  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
364  :( ( IsUpper<MT2>::value )
365  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
366  :( lhs_.columns() ) ) );
367 
// Empty summation range: the element is a default-constructed ElementType.
368  if( begin >= end ) return ElementType();
369 
370  const size_t n( end - begin );
371 
// Product of the matching sub-ranges of row i of lhs_ and column j of rhs_.
372  return subvector( row( lhs_, i ), begin, n ) * subvector( column( rhs_, j ), begin, n );
373  }
374  else {
// General case: product of the full row i of lhs_ and the full column j of rhs_.
375  return row( lhs_, i ) * column( rhs_, j );
376  }
377  }
378  //**********************************************************************************************
379 
380  //**At function*********************************************************************************
388  inline ReturnType at( size_t i, size_t j ) const {
389  if( i >= lhs_.rows() ) {
390  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
391  }
392  if( j >= rhs_.columns() ) {
393  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
394  }
395  return (*this)(i,j);
396  }
397  //**********************************************************************************************
398 
399  //**Rows function*******************************************************************************
404  inline size_t rows() const noexcept {
405  return lhs_.rows();
406  }
407  //**********************************************************************************************
408 
409  //**Columns function****************************************************************************
414  inline size_t columns() const noexcept {
415  return rhs_.columns();
416  }
417  //**********************************************************************************************
418 
419  //**Left operand access*************************************************************************
424  inline LeftOperand leftOperand() const noexcept {
425  return lhs_;
426  }
427  //**********************************************************************************************
428 
429  //**Right operand access************************************************************************
434  inline RightOperand rightOperand() const noexcept {
435  return rhs_;
436  }
437  //**********************************************************************************************
438 
439  //**********************************************************************************************
445  template< typename T >
446  inline bool canAlias( const T* alias ) const noexcept {
447  return ( lhs_.canAlias( alias ) || rhs_.canAlias( alias ) );
448  }
449  //**********************************************************************************************
450 
451  //**********************************************************************************************
457  template< typename T >
458  inline bool isAliased( const T* alias ) const noexcept {
459  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
460  }
461  //**********************************************************************************************
462 
463  //**********************************************************************************************
468  inline bool isAligned() const noexcept {
469  return lhs_.isAligned() && rhs_.isAligned();
470  }
471  //**********************************************************************************************
472 
473  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments.
// Visible conditions: either BLAS is not in use (BLAZE_BLAS_MODE or
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION disabled) or rows()*columns() is below
// DMATDMATMULT_THRESHOLD, and rows()*columns() is at least SMP_DMATDMATMULT_THRESHOLD.
// NOTE(review): listing lines 481 and 484 are missing from this extract, so the expression
// below is incomplete (it ends in a dangling '&&'); restore it from the original header.
478  inline bool canSMPAssign() const noexcept {
479  return ( !BLAZE_BLAS_MODE ||
480  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
482  ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
483  ( rows() * columns() >= SMP_DMATDMATMULT_THRESHOLD ) &&
485  }
486  //**********************************************************************************************
487 
488  private:
489  //**Member variables****************************************************************************
492  //**********************************************************************************************
493 
494  //**Assignment to dense matrices****************************************************************
// Assignment of the multiplication expression 'rhs' to the dense matrix 'lhs'.
// NOTE(review): listing lines 509 (declaration specifiers / return type) and 512 are missing
// from this extract, and the LT/RT types used below are declared outside the visible lines.
507  template< typename MT // Type of the target dense matrix
508  , bool SO > // Storage order of the target dense matrix
510  assign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
511  {
513 
514  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
515  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
516 
// An empty target needs no work; a zero inner dimension means every element of the
// target is reset.
517  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
518  return;
519  }
520  else if( rhs.lhs_.columns() == 0UL ) {
521  reset( ~lhs );
522  return;
523  }
524 
// Both operands are passed through serial() before being bound to A and B.
525  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
526  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
527 
// The evaluated operands must keep the dimensions of the original operands and match the target.
528  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
529  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
530  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
531  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
532  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
533  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
534 
// Hand over to the kernel selection with the target and the two evaluated operands.
535  DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
536  }
538  //**********************************************************************************************
539 
540  //**Assignment to dense matrices (kernel selection)*********************************************
551  template< typename MT3 // Type of the left-hand side target matrix
552  , typename MT4 // Type of the left-hand side matrix operand
553  , typename MT5 > // Type of the right-hand side matrix operand
554  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
555  {
556  if( ( IsDiagonal<MT5>::value ) ||
557  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
558  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
559  selectSmallAssignKernel( C, A, B );
560  else
561  selectBlasAssignKernel( C, A, B );
562  }
564  //**********************************************************************************************
565 
566  //**Default assignment to dense matrices (general/general)**************************************
// Default (non-vectorized) assignment kernel C = A * B, general/general variant.
// NOTE(review): listing line 583 (declaration specifiers / return type) and the inner condition
// lines 611, 616, 645 and 650 of the jbegin/jend expressions are missing from this extract;
// restore them from the original header before compiling.
580  template< typename MT3 // Type of the left-hand side target matrix
581  , typename MT4 // Type of the left-hand side matrix operand
582  , typename MT5 > // Type of the right-hand side matrix operand
584  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
585  {
586  const size_t M( A.rows() );
587  const size_t N( B.columns() );
588  const size_t K( A.columns() );
589 
// Any of the structure flags requires a square result.
590  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
591 
592  for( size_t i=0UL; i<M; ++i )
593  {
// Range [kbegin,kend) of the summation index for row i, narrowed by the triangular
// structure of A.
594  const size_t kbegin( ( IsUpper<MT4>::value )
595  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
596  :( 0UL ) );
597  const size_t kend( ( IsLower<MT4>::value )
598  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
599  :( K ) );
600  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
601 
// Strictly triangular A with an empty summation range: the whole row i of C is reset.
602  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
603  for( size_t j=0UL; j<N; ++j ) {
604  reset( C(i,j) );
605  }
606  continue;
607  }
608 
// First term (k == kbegin): initializes row i of C by plain assignment and resets the
// elements outside [jbegin,jend) where the structure demands it.
609  {
610  const size_t jbegin( ( IsUpper<MT5>::value )
612  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
613  :( UPP ? max(i,kbegin) : kbegin ) )
614  :( UPP ? i : 0UL ) );
615  const size_t jend( ( IsLower<MT5>::value )
617  ?( LOW ? min(i+1UL,kbegin) : kbegin )
618  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
619  :( LOW ? i+1UL : N ) );
620 
621  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
622  for( size_t j=0UL; j<jbegin; ++j ) {
623  reset( C(i,j) );
624  }
625  }
626  else if( IsStrictlyUpper<MT5>::value ) {
627  reset( C(i,0UL) );
628  }
629  for( size_t j=jbegin; j<jend; ++j ) {
630  C(i,j) = A(i,kbegin) * B(kbegin,j);
631  }
632  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
633  for( size_t j=jend; j<N; ++j ) {
634  reset( C(i,j) );
635  }
636  }
637  else if( IsStrictlyLower<MT5>::value ) {
638  reset( C(i,N-1UL) );
639  }
640  }
641 
// Remaining terms (k > kbegin): accumulated into row i of C.
642  for( size_t k=kbegin+1UL; k<kend; ++k )
643  {
644  const size_t jbegin( ( IsUpper<MT5>::value )
646  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
647  :( SYM || HERM || UPP ? max( i, k ) : k ) )
648  :( SYM || HERM || UPP ? i : 0UL ) );
649  const size_t jend( ( IsLower<MT5>::value )
651  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
652  :( LOW ? min(i+1UL,k) : k ) )
653  :( LOW ? i+1UL : N ) );
654 
// With structure flags set the column range can be empty; such a k is skipped.
655  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
656  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
657 
658  for( size_t j=jbegin; j<jend; ++j ) {
659  C(i,j) += A(i,k) * B(k,j);
660  }
// For a lower B the element at column jend is assigned (not accumulated) here.
661  if( IsLower<MT5>::value ) {
662  C(i,jend) = A(i,k) * B(k,jend);
663  }
664  }
665  }
666 
// Symmetric/Hermitian result: the elements below the diagonal are copied from their
// mirrored counterparts C(j,i), conjugated in the Hermitian case.
667  if( SYM || HERM ) {
668  for( size_t i=1UL; i<M; ++i ) {
669  for( size_t j=0UL; j<i; ++j ) {
670  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
671  }
672  }
673  }
674  }
676  //**********************************************************************************************
677 
678  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default assignment kernel C = A * B for a non-diagonal A and a diagonal B
// (selected via the EnableIf_ condition on IsDiagonal<MT4>/IsDiagonal<MT5>).
// NOTE(review): listing line 698 is missing from this extract.
692  template< typename MT3 // Type of the left-hand side target matrix
693  , typename MT4 // Type of the left-hand side matrix operand
694  , typename MT5 > // Type of the right-hand side matrix operand
695  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
696  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
697  {
699 
700  const size_t M( A.rows() );
701  const size_t N( B.columns() );
702 
703  for( size_t i=0UL; i<M; ++i )
704  {
// Column range [jbegin,jend) of row i, narrowed by the triangular structure of A.
705  const size_t jbegin( ( IsUpper<MT4>::value )
706  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
707  :( 0UL ) );
708  const size_t jend( ( IsLower<MT4>::value )
709  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
710  :( N ) );
711  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
712 
// Upper A: the columns before jbegin are reset.
713  if( IsUpper<MT4>::value ) {
714  for( size_t j=0UL; j<jbegin; ++j ) {
715  reset( C(i,j) );
716  }
717  }
// Each element is A(i,j) times the j-th diagonal element of B.
718  for( size_t j=jbegin; j<jend; ++j ) {
719  C(i,j) = A(i,j) * B(j,j);
720  }
// Lower A: the columns from jend on are reset.
721  if( IsLower<MT4>::value ) {
722  for( size_t j=jend; j<N; ++j ) {
723  reset( C(i,j) );
724  }
725  }
726  }
727  }
729  //**********************************************************************************************
730 
731  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default assignment kernel C = A * B in which every element is computed as
// A(i,i) * B(i,j), i.e. only the diagonal of A is read.
// NOTE(review): listing lines 748 (declaration specifiers / return type, presumably the
// enabling condition for this overload) and 751 are missing from this extract.
745  template< typename MT3 // Type of the left-hand side target matrix
746  , typename MT4 // Type of the left-hand side matrix operand
747  , typename MT5 > // Type of the right-hand side matrix operand
749  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
750  {
752 
753  const size_t M( A.rows() );
754  const size_t N( B.columns() );
755 
756  for( size_t i=0UL; i<M; ++i )
757  {
// Column range [jbegin,jend) of row i, narrowed by the triangular structure of B.
758  const size_t jbegin( ( IsUpper<MT5>::value )
759  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
760  :( 0UL ) );
761  const size_t jend( ( IsLower<MT5>::value )
762  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
763  :( N ) );
764  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
765 
// Upper B: the columns before jbegin are reset.
766  if( IsUpper<MT5>::value ) {
767  for( size_t j=0UL; j<jbegin; ++j ) {
768  reset( C(i,j) );
769  }
770  }
// Each element is the i-th diagonal element of A times B(i,j).
771  for( size_t j=jbegin; j<jend; ++j ) {
772  C(i,j) = A(i,i) * B(i,j);
773  }
// Lower B: the columns from jend on are reset.
774  if( IsLower<MT5>::value ) {
775  for( size_t j=jend; j<N; ++j ) {
776  reset( C(i,j) );
777  }
778  }
779  }
780  }
782  //**********************************************************************************************
783 
784  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default assignment kernel C = A * B for two diagonal operands
// (selected via the EnableIf_ condition on IsDiagonal<MT4> and IsDiagonal<MT5>).
// NOTE(review): listing line 804 is missing from this extract.
798  template< typename MT3 // Type of the left-hand side target matrix
799  , typename MT4 // Type of the left-hand side matrix operand
800  , typename MT5 > // Type of the right-hand side matrix operand
801  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
802  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
803  {
805 
// The whole target is reset first; afterwards only the diagonal elements are assigned.
806  reset( C );
807 
808  for( size_t i=0UL; i<A.rows(); ++i ) {
809  C(i,i) = A(i,i) * B(i,i);
810  }
811  }
813  //**********************************************************************************************
814 
815  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel that simply forwards to selectDefaultAssignKernel().
// NOTE(review): listing line 831 (declaration specifiers / return type, presumably the
// condition that selects this overload) is missing from this extract.
828  template< typename MT3 // Type of the left-hand side target matrix
829  , typename MT4 // Type of the left-hand side matrix operand
830  , typename MT5 > // Type of the right-hand side matrix operand
832  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
833  {
834  selectDefaultAssignKernel( C, A, B );
835  }
837  //**********************************************************************************************
838 
839  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
854  template< typename MT3 // Type of the left-hand side target matrix
855  , typename MT4 // Type of the left-hand side matrix operand
856  , typename MT5 > // Type of the right-hand side matrix operand
858  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
859  {
860  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
861 
862  const size_t M( A.rows() );
863  const size_t N( B.columns() );
864  const size_t K( A.columns() );
865 
866  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
867 
868  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
869  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
870 
871  if( LOW && UPP && N > SIMDSIZE*3UL ) {
872  reset( ~C );
873  }
874 
875  {
876  size_t j( 0UL );
877 
879  {
880  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
881  for( size_t i=0UL; i<M; ++i )
882  {
883  const size_t kbegin( ( IsUpper<MT4>::value )
884  ?( ( IsLower<MT5>::value )
885  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
886  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
887  :( IsLower<MT5>::value ? j : 0UL ) );
888  const size_t kend( ( IsLower<MT4>::value )
889  ?( ( IsUpper<MT5>::value )
890  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
891  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
892  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
893 
894  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
895 
896  for( size_t k=kbegin; k<kend; ++k ) {
897  const SIMDType a1( set( A(i,k) ) );
898  xmm1 += a1 * B.load(k,j );
899  xmm2 += a1 * B.load(k,j+SIMDSIZE );
900  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
901  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
902  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
903  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
904  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
905  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
906  }
907 
908  (~C).store( i, j , xmm1 );
909  (~C).store( i, j+SIMDSIZE , xmm2 );
910  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
911  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
912  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
913  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
914  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
915  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
916  }
917  }
918  }
919 
920  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
921  {
922  size_t i( 0UL );
923 
924  for( ; (i+2UL) <= M; i+=2UL )
925  {
926  const size_t kbegin( ( IsUpper<MT4>::value )
927  ?( ( IsLower<MT5>::value )
928  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
929  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
930  :( IsLower<MT5>::value ? j : 0UL ) );
931  const size_t kend( ( IsLower<MT4>::value )
932  ?( ( IsUpper<MT5>::value )
933  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
934  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
935  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
936 
937  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
938 
939  for( size_t k=kbegin; k<kend; ++k ) {
940  const SIMDType a1( set( A(i ,k) ) );
941  const SIMDType a2( set( A(i+1UL,k) ) );
942  const SIMDType b1( B.load(k,j ) );
943  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
944  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
945  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
946  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
947  xmm1 += a1 * b1;
948  xmm2 += a1 * b2;
949  xmm3 += a1 * b3;
950  xmm4 += a1 * b4;
951  xmm5 += a1 * b5;
952  xmm6 += a2 * b1;
953  xmm7 += a2 * b2;
954  xmm8 += a2 * b3;
955  xmm9 += a2 * b4;
956  xmm10 += a2 * b5;
957  }
958 
959  (~C).store( i , j , xmm1 );
960  (~C).store( i , j+SIMDSIZE , xmm2 );
961  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
962  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
963  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
964  (~C).store( i+1UL, j , xmm6 );
965  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
966  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
967  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
968  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
969  }
970 
971  if( i < M )
972  {
973  const size_t kbegin( ( IsUpper<MT4>::value )
974  ?( ( IsLower<MT5>::value )
975  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
976  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
977  :( IsLower<MT5>::value ? j : 0UL ) );
978  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
979 
980  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
981 
982  for( size_t k=kbegin; k<kend; ++k ) {
983  const SIMDType a1( set( A(i,k) ) );
984  xmm1 += a1 * B.load(k,j );
985  xmm2 += a1 * B.load(k,j+SIMDSIZE );
986  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
987  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
988  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
989  }
990 
991  (~C).store( i, j , xmm1 );
992  (~C).store( i, j+SIMDSIZE , xmm2 );
993  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
994  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
995  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
996  }
997  }
998 
999  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1000  {
1001  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
1002  size_t i( LOW ? j : 0UL );
1003 
1004  for( ; (i+2UL) <= iend; i+=2UL )
1005  {
1006  const size_t kbegin( ( IsUpper<MT4>::value )
1007  ?( ( IsLower<MT5>::value )
1008  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1009  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1010  :( IsLower<MT5>::value ? j : 0UL ) );
1011  const size_t kend( ( IsLower<MT4>::value )
1012  ?( ( IsUpper<MT5>::value )
1013  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1014  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1015  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
1016 
1017  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1018 
1019  for( size_t k=kbegin; k<kend; ++k ) {
1020  const SIMDType a1( set( A(i ,k) ) );
1021  const SIMDType a2( set( A(i+1UL,k) ) );
1022  const SIMDType b1( B.load(k,j ) );
1023  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1024  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1025  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1026  xmm1 += a1 * b1;
1027  xmm2 += a1 * b2;
1028  xmm3 += a1 * b3;
1029  xmm4 += a1 * b4;
1030  xmm5 += a2 * b1;
1031  xmm6 += a2 * b2;
1032  xmm7 += a2 * b3;
1033  xmm8 += a2 * b4;
1034  }
1035 
1036  (~C).store( i , j , xmm1 );
1037  (~C).store( i , j+SIMDSIZE , xmm2 );
1038  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1039  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1040  (~C).store( i+1UL, j , xmm5 );
1041  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1042  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1043  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1044  }
1045 
1046  if( i < iend )
1047  {
1048  const size_t kbegin( ( IsUpper<MT4>::value )
1049  ?( ( IsLower<MT5>::value )
1050  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1051  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1052  :( IsLower<MT5>::value ? j : 0UL ) );
1053  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1054 
1055  SIMDType xmm1, xmm2, xmm3, xmm4;
1056 
1057  for( size_t k=kbegin; k<kend; ++k ) {
1058  const SIMDType a1( set( A(i,k) ) );
1059  xmm1 += a1 * B.load(k,j );
1060  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1061  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1062  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1063  }
1064 
1065  (~C).store( i, j , xmm1 );
1066  (~C).store( i, j+SIMDSIZE , xmm2 );
1067  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1068  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1069  }
1070  }
1071 
1072  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1073  {
1074  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
1075  size_t i( LOW ? j : 0UL );
1076 
1077  for( ; (i+2UL) <= iend; i+=2UL )
1078  {
1079  const size_t kbegin( ( IsUpper<MT4>::value )
1080  ?( ( IsLower<MT5>::value )
1081  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1082  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1083  :( IsLower<MT5>::value ? j : 0UL ) );
1084  const size_t kend( ( IsLower<MT4>::value )
1085  ?( ( IsUpper<MT5>::value )
1086  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
1087  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1088  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
1089 
1090  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1091 
1092  for( size_t k=kbegin; k<kend; ++k ) {
1093  const SIMDType a1( set( A(i ,k) ) );
1094  const SIMDType a2( set( A(i+1UL,k) ) );
1095  const SIMDType b1( B.load(k,j ) );
1096  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1097  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1098  xmm1 += a1 * b1;
1099  xmm2 += a1 * b2;
1100  xmm3 += a1 * b3;
1101  xmm4 += a2 * b1;
1102  xmm5 += a2 * b2;
1103  xmm6 += a2 * b3;
1104  }
1105 
1106  (~C).store( i , j , xmm1 );
1107  (~C).store( i , j+SIMDSIZE , xmm2 );
1108  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1109  (~C).store( i+1UL, j , xmm4 );
1110  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
1111  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1112  }
1113 
1114  if( i < iend )
1115  {
1116  const size_t kbegin( ( IsUpper<MT4>::value )
1117  ?( ( IsLower<MT5>::value )
1118  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1119  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1120  :( IsLower<MT5>::value ? j : 0UL ) );
1121  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
1122 
1123  SIMDType xmm1, xmm2, xmm3;
1124 
1125  for( size_t k=kbegin; k<kend; ++k ) {
1126  const SIMDType a1( set( A(i,k) ) );
1127  xmm1 += a1 * B.load(k,j );
1128  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1129  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1130  }
1131 
1132  (~C).store( i, j , xmm1 );
1133  (~C).store( i, j+SIMDSIZE , xmm2 );
1134  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1135  }
1136  }
1137 
1138  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1139  {
1140  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
1141  size_t i( LOW ? j : 0UL );
1142 
1143  for( ; (i+4UL) <= iend; i+=4UL )
1144  {
1145  const size_t kbegin( ( IsUpper<MT4>::value )
1146  ?( ( IsLower<MT5>::value )
1147  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1148  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1149  :( IsLower<MT5>::value ? j : 0UL ) );
1150  const size_t kend( ( IsLower<MT4>::value )
1151  ?( ( IsUpper<MT5>::value )
1152  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
1153  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
1154  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1155 
1156  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1157 
1158  for( size_t k=kbegin; k<kend; ++k ) {
1159  const SIMDType a1( set( A(i ,k) ) );
1160  const SIMDType a2( set( A(i+1UL,k) ) );
1161  const SIMDType a3( set( A(i+2UL,k) ) );
1162  const SIMDType a4( set( A(i+3UL,k) ) );
1163  const SIMDType b1( B.load(k,j ) );
1164  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1165  xmm1 += a1 * b1;
1166  xmm2 += a1 * b2;
1167  xmm3 += a2 * b1;
1168  xmm4 += a2 * b2;
1169  xmm5 += a3 * b1;
1170  xmm6 += a3 * b2;
1171  xmm7 += a4 * b1;
1172  xmm8 += a4 * b2;
1173  }
1174 
1175  (~C).store( i , j , xmm1 );
1176  (~C).store( i , j+SIMDSIZE, xmm2 );
1177  (~C).store( i+1UL, j , xmm3 );
1178  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1179  (~C).store( i+2UL, j , xmm5 );
1180  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1181  (~C).store( i+3UL, j , xmm7 );
1182  (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
1183  }
1184 
1185  for( ; (i+3UL) <= iend; i+=3UL )
1186  {
1187  const size_t kbegin( ( IsUpper<MT4>::value )
1188  ?( ( IsLower<MT5>::value )
1189  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1190  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1191  :( IsLower<MT5>::value ? j : 0UL ) );
1192  const size_t kend( ( IsLower<MT4>::value )
1193  ?( ( IsUpper<MT5>::value )
1194  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
1195  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
1196  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1197 
1198  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1199 
1200  for( size_t k=kbegin; k<kend; ++k ) {
1201  const SIMDType a1( set( A(i ,k) ) );
1202  const SIMDType a2( set( A(i+1UL,k) ) );
1203  const SIMDType a3( set( A(i+2UL,k) ) );
1204  const SIMDType b1( B.load(k,j ) );
1205  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1206  xmm1 += a1 * b1;
1207  xmm2 += a1 * b2;
1208  xmm3 += a2 * b1;
1209  xmm4 += a2 * b2;
1210  xmm5 += a3 * b1;
1211  xmm6 += a3 * b2;
1212  }
1213 
1214  (~C).store( i , j , xmm1 );
1215  (~C).store( i , j+SIMDSIZE, xmm2 );
1216  (~C).store( i+1UL, j , xmm3 );
1217  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1218  (~C).store( i+2UL, j , xmm5 );
1219  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1220  }
1221 
1222  for( ; (i+2UL) <= iend; i+=2UL )
1223  {
1224  const size_t kbegin( ( IsUpper<MT4>::value )
1225  ?( ( IsLower<MT5>::value )
1226  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1227  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1228  :( IsLower<MT5>::value ? j : 0UL ) );
1229  const size_t kend( ( IsLower<MT4>::value )
1230  ?( ( IsUpper<MT5>::value )
1231  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1232  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1233  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1234 
1235  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1236  size_t k( kbegin );
1237 
1238  for( ; (k+2UL) <= kend; k+=2UL ) {
1239  const SIMDType a1( set( A(i ,k ) ) );
1240  const SIMDType a2( set( A(i+1UL,k ) ) );
1241  const SIMDType a3( set( A(i ,k+1UL) ) );
1242  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
1243  const SIMDType b1( B.load(k ,j ) );
1244  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
1245  const SIMDType b3( B.load(k+1UL,j ) );
1246  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
1247  xmm1 += a1 * b1;
1248  xmm2 += a1 * b2;
1249  xmm3 += a2 * b1;
1250  xmm4 += a2 * b2;
1251  xmm5 += a3 * b3;
1252  xmm6 += a3 * b4;
1253  xmm7 += a4 * b3;
1254  xmm8 += a4 * b4;
1255  }
1256 
1257  for( ; k<kend; ++k ) {
1258  const SIMDType a1( set( A(i ,k) ) );
1259  const SIMDType a2( set( A(i+1UL,k) ) );
1260  const SIMDType b1( B.load(k,j ) );
1261  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1262  xmm1 += a1 * b1;
1263  xmm2 += a1 * b2;
1264  xmm3 += a2 * b1;
1265  xmm4 += a2 * b2;
1266  }
1267 
1268  (~C).store( i , j , xmm1+xmm5 );
1269  (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
1270  (~C).store( i+1UL, j , xmm3+xmm7 );
1271  (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
1272  }
1273 
1274  if( i < iend )
1275  {
1276  const size_t kbegin( ( IsUpper<MT4>::value )
1277  ?( ( IsLower<MT5>::value )
1278  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1279  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1280  :( IsLower<MT5>::value ? j : 0UL ) );
1281  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1282 
1283  SIMDType xmm1, xmm2, xmm3, xmm4;
1284  size_t k( kbegin );
1285 
1286  for( ; (k+2UL) <= kend; k+=2UL ) {
1287  const SIMDType a1( set( A(i,k ) ) );
1288  const SIMDType a2( set( A(i,k+1UL) ) );
1289  xmm1 += a1 * B.load(k ,j );
1290  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
1291  xmm3 += a2 * B.load(k+1UL,j );
1292  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
1293  }
1294 
1295  for( ; k<kend; ++k ) {
1296  const SIMDType a1( set( A(i,k) ) );
1297  xmm1 += a1 * B.load(k,j );
1298  xmm2 += a1 * B.load(k,j+SIMDSIZE);
1299  }
1300 
1301  (~C).store( i, j , xmm1+xmm3 );
1302  (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
1303  }
1304  }
1305 
1306  for( ; j<jpos; j+=SIMDSIZE )
1307  {
1308  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
1309  size_t i( LOW ? j : 0UL );
1310 
1311  for( ; (i+4UL) <= iend; i+=4UL )
1312  {
1313  const size_t kbegin( ( IsUpper<MT4>::value )
1314  ?( ( IsLower<MT5>::value )
1315  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1316  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1317  :( IsLower<MT5>::value ? j : 0UL ) );
1318  const size_t kend( ( IsLower<MT4>::value )
1319  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
1320  :( K ) );
1321 
1322  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1323  size_t k( kbegin );
1324 
1325  for( ; (k+2UL) <= kend; k+=2UL ) {
1326  const SIMDType b1( B.load(k ,j) );
1327  const SIMDType b2( B.load(k+1UL,j) );
1328  xmm1 += set( A(i ,k ) ) * b1;
1329  xmm2 += set( A(i+1UL,k ) ) * b1;
1330  xmm3 += set( A(i+2UL,k ) ) * b1;
1331  xmm4 += set( A(i+3UL,k ) ) * b1;
1332  xmm5 += set( A(i ,k+1UL) ) * b2;
1333  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
1334  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
1335  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
1336  }
1337 
1338  for( ; k<kend; ++k ) {
1339  const SIMDType b1( B.load(k,j) );
1340  xmm1 += set( A(i ,k) ) * b1;
1341  xmm2 += set( A(i+1UL,k) ) * b1;
1342  xmm3 += set( A(i+2UL,k) ) * b1;
1343  xmm4 += set( A(i+3UL,k) ) * b1;
1344  }
1345 
1346  (~C).store( i , j, xmm1+xmm5 );
1347  (~C).store( i+1UL, j, xmm2+xmm6 );
1348  (~C).store( i+2UL, j, xmm3+xmm7 );
1349  (~C).store( i+3UL, j, xmm4+xmm8 );
1350  }
1351 
1352  for( ; (i+3UL) <= iend; i+=3UL )
1353  {
1354  const size_t kbegin( ( IsUpper<MT4>::value )
1355  ?( ( IsLower<MT5>::value )
1356  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1357  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1358  :( IsLower<MT5>::value ? j : 0UL ) );
1359  const size_t kend( ( IsLower<MT4>::value )
1360  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
1361  :( K ) );
1362 
1363  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1364  size_t k( kbegin );
1365 
1366  for( ; (k+2UL) <= kend; k+=2UL ) {
1367  const SIMDType b1( B.load(k ,j) );
1368  const SIMDType b2( B.load(k+1UL,j) );
1369  xmm1 += set( A(i ,k ) ) * b1;
1370  xmm2 += set( A(i+1UL,k ) ) * b1;
1371  xmm3 += set( A(i+2UL,k ) ) * b1;
1372  xmm4 += set( A(i ,k+1UL) ) * b2;
1373  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
1374  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
1375  }
1376 
1377  for( ; k<kend; ++k ) {
1378  const SIMDType b1( B.load(k,j) );
1379  xmm1 += set( A(i ,k) ) * b1;
1380  xmm2 += set( A(i+1UL,k) ) * b1;
1381  xmm3 += set( A(i+2UL,k) ) * b1;
1382  }
1383 
1384  (~C).store( i , j, xmm1+xmm4 );
1385  (~C).store( i+1UL, j, xmm2+xmm5 );
1386  (~C).store( i+2UL, j, xmm3+xmm6 );
1387  }
1388 
1389  for( ; (i+2UL) <= iend; i+=2UL )
1390  {
1391  const size_t kbegin( ( IsUpper<MT4>::value )
1392  ?( ( IsLower<MT5>::value )
1393  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1394  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1395  :( IsLower<MT5>::value ? j : 0UL ) );
1396  const size_t kend( ( IsLower<MT4>::value )
1397  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1398  :( K ) );
1399 
1400  SIMDType xmm1, xmm2, xmm3, xmm4;
1401  size_t k( kbegin );
1402 
1403  for( ; (k+2UL) <= kend; k+=2UL ) {
1404  const SIMDType b1( B.load(k ,j) );
1405  const SIMDType b2( B.load(k+1UL,j) );
1406  xmm1 += set( A(i ,k ) ) * b1;
1407  xmm2 += set( A(i+1UL,k ) ) * b1;
1408  xmm3 += set( A(i ,k+1UL) ) * b2;
1409  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
1410  }
1411 
1412  for( ; k<kend; ++k ) {
1413  const SIMDType b1( B.load(k,j) );
1414  xmm1 += set( A(i ,k) ) * b1;
1415  xmm2 += set( A(i+1UL,k) ) * b1;
1416  }
1417 
1418  (~C).store( i , j, xmm1+xmm3 );
1419  (~C).store( i+1UL, j, xmm2+xmm4 );
1420  }
1421 
1422  if( i < iend )
1423  {
1424  const size_t kbegin( ( IsUpper<MT4>::value )
1425  ?( ( IsLower<MT5>::value )
1426  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1427  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1428  :( IsLower<MT5>::value ? j : 0UL ) );
1429 
1430  SIMDType xmm1, xmm2;
1431  size_t k( kbegin );
1432 
1433  for( ; (k+2UL) <= K; k+=2UL ) {
1434  xmm1 += set( A(i,k ) ) * B.load(k ,j);
1435  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
1436  }
1437 
1438  for( ; k<K; ++k ) {
1439  xmm1 += set( A(i,k) ) * B.load(k,j);
1440  }
1441 
1442  (~C).store( i, j, xmm1+xmm2 );
1443  }
1444  }
1445 
1446  for( ; remainder && j<N; ++j )
1447  {
1448  size_t i( LOW && UPP ? j : 0UL );
1449 
1450  for( ; (i+2UL) <= M; i+=2UL )
1451  {
1452  const size_t kbegin( ( IsUpper<MT4>::value )
1453  ?( ( IsLower<MT5>::value )
1454  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1455  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1456  :( IsLower<MT5>::value ? j : 0UL ) );
1457  const size_t kend( ( IsLower<MT4>::value )
1458  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1459  :( K ) );
1460 
1461  ElementType value1 = ElementType();
1462  ElementType value2 = ElementType();
1463 
1464  for( size_t k=kbegin; k<kend; ++k ) {
1465  value1 += A(i ,k) * B(k,j);
1466  value2 += A(i+1UL,k) * B(k,j);
1467  }
1468 
1469  (~C)(i ,j) = value1;
1470  (~C)(i+1UL,j) = value2;
1471  }
1472 
1473  if( i < M )
1474  {
1475  const size_t kbegin( ( IsUpper<MT4>::value )
1476  ?( ( IsLower<MT5>::value )
1477  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1478  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1479  :( IsLower<MT5>::value ? j : 0UL ) );
1480 
1481  ElementType value = ElementType();
1482 
1483  for( size_t k=kbegin; k<K; ++k ) {
1484  value += A(i,k) * B(k,j);
1485  }
1486 
1487  (~C)(i,j) = value;
1488  }
1489  }
1490  }
1491 
1492  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1493  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1494  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1495  for( size_t j=0UL; j<jend; ++j ) {
1496  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
1497  }
1498  }
1499  }
1500  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1501  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1502  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1503  for( size_t i=0UL; i<iend; ++i ) {
1504  reset( (~C)(i,j) );
1505  }
1506  }
1507  }
1508  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1509  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1510  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1511  for( size_t j=0UL; j<jend; ++j ) {
1512  reset( (~C)(i,j) );
1513  }
1514  }
1515  }
1516  }
1518  //**********************************************************************************************
1519 
1520  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Small-matrix assignment kernel for a column-major target (DenseMatrix<MT3,true>).
// Instead of computing the product directly, one operand is serially evaluated into a
// temporary of its opposite storage order (OppositeType_) and the resulting product
// expression is passed through fwd and handed back to assign().
// NOTE(review): the return type and the conditions guarding the first two branches are on
// listing lines that are not reproduced in this excerpt; check the original header.
1535  template< typename MT3 // Type of the left-hand side target matrix
1536  , typename MT4 // Type of the left-hand side matrix operand
1537  , typename MT5 > // Type of the right-hand side matrix operand
1539  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1540  {
1545 
1546  const ForwardFunctor fwd;
1547 
// First branch (condition elided): convert A to the opposite storage order.
1549  const OppositeType_<MT4> tmp( serial( A ) );
1550  assign( ~C, fwd( tmp * B ) );
1551  }
// Second branch (condition elided): convert B to the opposite storage order.
1553  const OppositeType_<MT5> tmp( serial( B ) );
1554  assign( ~C, fwd( A * tmp ) );
1555  }
// Otherwise convert whichever operand has fewer elements; on a tie A is converted.
1556  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1557  const OppositeType_<MT4> tmp( serial( A ) );
1558  assign( ~C, fwd( tmp * B ) );
1559  }
1560  else {
1561  const OppositeType_<MT5> tmp( serial( B ) );
1562  assign( ~C, fwd( A * tmp ) );
1563  }
1564  }
1566  //**********************************************************************************************
1567 
1568  //**Default assignment to dense matrices (large matrices)***************************************
// Fallback large-matrix assignment kernel: performs no work of its own and simply forwards
// the target and both operands to selectDefaultAssignKernel().
// NOTE(review): the return type (and with it whatever condition selects this overload) is on
// a listing line that is not reproduced in this excerpt.
1581  template< typename MT3 // Type of the left-hand side target matrix
1582  , typename MT4 // Type of the left-hand side matrix operand
1583  , typename MT5 > // Type of the right-hand side matrix operand
1585  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1586  {
1587  selectDefaultAssignKernel( C, A, B );
1588  }
1590  //**********************************************************************************************
1591 
1592  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Large-matrix assignment kernel that dispatches to one of the mmm-family routines based on
// the flags SYM, HERM, LOW and UPP. The flags are tested in exactly that order and the first
// one that is set wins; with none set the general mmm() routine is called.
// NOTE(review): the scalar arguments ElementType(1) / ElementType(0) are presumably the
// scaling factors applied to A*B and to the previous content of C; confirm against the
// declarations in blaze/math/dense/MMM.h. The return type line is elided from this listing.
1606  template< typename MT3 // Type of the left-hand side target matrix
1607  , typename MT4 // Type of the left-hand side matrix operand
1608  , typename MT5 > // Type of the right-hand side matrix operand
1610  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
1611  {
1612  if( SYM )
1613  smmm( C, A, B, ElementType(1) );
1614  else if( HERM )
1615  hmmm( C, A, B, ElementType(1) );
1616  else if( LOW )
1617  lmmm( C, A, B, ElementType(1), ElementType(0) );
1618  else if( UPP )
1619  ummm( C, A, B, ElementType(1), ElementType(0) );
1620  else
1621  mmm( C, A, B, ElementType(1), ElementType(0) );
1622  }
1624  //**********************************************************************************************
1625 
1626  //**BLAS-based assignment to dense matrices (default)*******************************************
// Default "BLAS" assignment kernel: no BLAS routine is called here, the work is forwarded
// unchanged to selectLargeAssignKernel(). This overload is defined outside the
// BLAZE_BLAS_MODE preprocessor guard that encloses the BLAS-backed overload.
// NOTE(review): the return type (and any overload-selection condition it carries) is on a
// listing line that is not reproduced in this excerpt.
1639  template< typename MT3 // Type of the left-hand side target matrix
1640  , typename MT4 // Type of the left-hand side matrix operand
1641  , typename MT5 > // Type of the right-hand side matrix operand
1643  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1644  {
1645  selectLargeAssignKernel( C, A, B );
1646  }
1648  //**********************************************************************************************
1649 
1650  //**BLAS-based assignment to dense matrices*****************************************************
1651 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1652 
// BLAS-backed assignment kernel; only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled (see the enclosing #if).
// A triangular operand is handled with trmm(), everything else with gemm().
// NOTE(review): the return type line is elided from this listing.
1664  template< typename MT3 // Type of the left-hand side target matrix
1665  , typename MT4 // Type of the left-hand side matrix operand
1666  , typename MT5 > // Type of the right-hand side matrix operand
1668  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
1669  {
1670  using ET = ElementType_<MT3>;
1671 
// A triangular: copy B into C first, then call trmm() with A as the left-side operand.
// The lower/upper flag follows IsLower<MT4>.
// NOTE(review): presumably trmm() multiplies C in place by the triangular matrix - confirm
// against blaze/math/blas/trmm.h.
1672  if( IsTriangular<MT4>::value ) {
1673  assign( C, B );
1674  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1675  }
// B triangular (and A not): copy A into C first, then call trmm() with B on the right side.
1676  else if( IsTriangular<MT5>::value ) {
1677  assign( C, A );
1678  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1679  }
// Neither operand is triangular: general matrix-matrix product via gemm().
// NOTE(review): ET(1) / ET(0) look like the usual alpha / beta factors - confirm.
1680  else {
1681  gemm( C, A, B, ET(1), ET(0) );
1682  }
1683  }
1685 #endif
1686  //**********************************************************************************************
1687 
1688  //**Assignment to sparse matrices***************************************************************
// Assignment of the multiplication expression to a sparse matrix target.
// The expression is first evaluated serially into a temporary of type TmpType; that
// temporary is then passed through fwd and assigned to the sparse target.
// NOTE(review): the return type and the definition of TmpType are on listing lines that are
// not reproduced in this excerpt.
1701  template< typename MT // Type of the target sparse matrix
1702  , bool SO > // Storage order of the target sparse matrix
1704  assign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
1705  {
1707 
1709 
1716 
// The target must already have the dimensions of the product.
1717  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1718  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1719 
1720  const ForwardFunctor fwd;
1721 
// Evaluate the product once into the temporary, then assign the temporary.
1722  const TmpType tmp( serial( rhs ) );
1723  assign( ~lhs, fwd( tmp ) );
1724  }
1726  //**********************************************************************************************
1727 
1728  //**Restructuring assignment to column-major matrices*******************************************
// Restructuring assignment to a column-major target (Matrix<MT,true>).
// The product is rewritten with trans() applied to one or both operands and the rewritten
// expression is assigned instead. Three variants exist: both operands transposed, only the
// left operand transposed (taken when MT1 is symmetric), or only the right one transposed.
// NOTE(review): the return type and the condition of the first branch are on listing lines
// that are not reproduced in this excerpt.
1743  template< typename MT > // Type of the target matrix
1745  assign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
1746  {
1748 
1750 
// The target must already have the dimensions of the product.
1751  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1752  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1753 
1754  const ForwardFunctor fwd;
1755 
// First branch (condition elided): transpose both operands.
1757  assign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
// Left operand type is symmetric: transpose only the left operand.
1758  else if( IsSymmetric<MT1>::value )
1759  assign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case: transpose only the right operand.
1760  else
1761  assign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
1762  }
1764  //**********************************************************************************************
1765 
1766  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the multiplication expression to a dense matrix target.
// Both operands are evaluated serially into locals of type LT / RT, their dimensions are
// sanity-checked, and the actual work is delegated to selectAddAssignKernel().
// NOTE(review): the return type is on a listing line that is not reproduced in this excerpt.
1779  template< typename MT // Type of the target dense matrix
1780  , bool SO > // Storage order of the target dense matrix
1782  addAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
1783  {
1785 
1786  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
1787  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
1788 
// Nothing to add if the target is empty or the inner dimension of the product is zero.
1789  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1790  return;
1791  }
1792 
1793  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
1794  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
1795 
// The evaluated operands must keep the operand dimensions and match the target.
1796  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
1797  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
1798  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
1799  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
1800  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
1801  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
1802 
1803  DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1804  }
1806  //**********************************************************************************************
1807 
1808  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment C += A * B.
// The small-matrix kernel is chosen when the right operand type is diagonal, when (outside
// of debug mode) B has at most SIMDSIZE*10 columns, or when the number of elements of C is
// below DMATDMATMULT_THRESHOLD; in every other case the BLAS kernel is chosen.
1819  template< typename MT3 // Target matrix type
1820  , typename MT4 // Left multiplication operand type
1821  , typename MT5 > // Right multiplication operand type
1822  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1823  {
1824  const bool useSmallKernel( ( IsDiagonal<MT5>::value ) || ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
1825  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) );
1826  if( !useSmallKernel )
1827  selectBlasAddAssignKernel( C, A, B );
1828  else
1829  selectSmallAddAssignKernel( C, A, B );
1830  }
1832  //**********************************************************************************************
1833 
1834  //**Default addition assignment to dense matrices (general/general)*****************************
// Default (scalar) addition assignment kernel, enabled when neither operand type is
// diagonal. Computes C(i,j) += A(i,k) * B(k,j) with loop order i, k, j, where the innermost
// j loop is unrolled by two. The k and j ranges are narrowed according to the triangular
// structure of A and B and to the LOW / UPP flags.
1848  template< typename MT3 // Type of the left-hand side target matrix
1849  , typename MT4 // Type of the left-hand side matrix operand
1850  , typename MT5 > // Type of the right-hand side matrix operand
1851  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
1852  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1853  {
1854  const size_t M( A.rows() );
1855  const size_t N( B.columns() );
1856  const size_t K( A.columns() );
1857 
// A LOW or UPP result requires a square product.
1858  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1859 
1860  for( size_t i=0UL; i<M; ++i )
1861  {
// k range for row i of A: an upper A starts at the diagonal (one past it if strictly
// upper), a lower A ends after the diagonal (at it if strictly lower).
1862  const size_t kbegin( ( IsUpper<MT4>::value )
1863  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1864  :( 0UL ) );
1865  const size_t kend( ( IsLower<MT4>::value )
1866  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
1867  :( K ) );
1868  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
1869 
1870  for( size_t k=kbegin; k<kend; ++k )
1871  {
// Column range [jbegin,jend) for this (i,k): restricted by the structure of row k of B
// and, with UPP / LOW set, to the columns from i onwards resp. up to and including i.
// NOTE(review): the inner selector line of each of the two ternaries below is elided
// from this listing.
1872  const size_t jbegin( ( IsUpper<MT5>::value )
1874  ?( UPP ? max(i,k+1UL) : k+1UL )
1875  :( UPP ? max(i,k) : k ) )
1876  :( UPP ? i : 0UL ) );
1877  const size_t jend( ( IsLower<MT5>::value )
1879  ?( LOW ? min(i+1UL,k) : k )
1880  :( LOW ? min(i,k)+1UL : k+1UL ) )
1881  :( LOW ? i+1UL : N ) );
1882 
// With LOW / UPP the clipped range may be empty; skip it before the subtraction below.
1883  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
1884  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1885 
// jpos marks the end of the part of the range that is processed two columns at a time
// (jnum rounded down to an even count).
1886  const size_t jnum( jend - jbegin );
1887  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1888 
1889  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1890  C(i,j ) += A(i,k) * B(k,j );
1891  C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1892  }
// Odd column count: handle the single remaining column.
1893  if( jpos < jend ) {
1894  C(i,jpos) += A(i,k) * B(k,jpos);
1895  }
1896  }
1897  }
1898  }
1900  //**********************************************************************************************
1901 
1902  //**Default addition assignment to dense matrices (general/diagonal)****************************
// Default addition assignment kernel, enabled when A is not diagonal and B is diagonal.
// Only the diagonal of B is read, so each element reduces to C(i,j) += A(i,j) * B(j,j).
// The j loop is unrolled by two and restricted to the structurally relevant part of row i
// of A.
// NOTE(review): one line at the top of the body (listing line 1922) is elided here.
1916  template< typename MT3 // Type of the left-hand side target matrix
1917  , typename MT4 // Type of the left-hand side matrix operand
1918  , typename MT5 > // Type of the right-hand side matrix operand
1919  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
1920  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1921  {
1923 
1924  const size_t M( A.rows() );
1925  const size_t N( B.columns() );
1926 
1927  for( size_t i=0UL; i<M; ++i )
1928  {
// Column range of row i of A: an upper A starts at the diagonal (one past it if strictly
// upper), a lower A ends after the diagonal (at it if strictly lower).
1929  const size_t jbegin( ( IsUpper<MT4>::value )
1930  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1931  :( 0UL ) );
1932  const size_t jend( ( IsLower<MT4>::value )
1933  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
1934  :( N ) );
1935  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1936 
// jpos marks the end of the part of the range that is processed two columns at a time.
1937  const size_t jnum( jend - jbegin );
1938  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1939 
1940  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1941  C(i,j ) += A(i,j ) * B(j ,j );
1942  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
1943  }
// Odd column count: handle the single remaining column.
1944  if( jpos < jend ) {
1945  C(i,jpos) += A(i,jpos) * B(jpos,jpos);
1946  }
1947  }
1948  }
1950  //**********************************************************************************************
1951 
1952  //**Default addition assignment to dense matrices (diagonal/general)****************************
// Default addition assignment kernel, enabled when A is diagonal and B is not diagonal.
// Only the diagonal of A is read, so each element reduces to C(i,j) += A(i,i) * B(i,j).
// The j loop is unrolled by two and restricted to the structurally relevant part of row i
// of B.
// NOTE(review): one line at the top of the body (listing line 1972) is elided here.
1966  template< typename MT3 // Type of the left-hand side target matrix
1967  , typename MT4 // Type of the left-hand side matrix operand
1968  , typename MT5 > // Type of the right-hand side matrix operand
1969  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
1970  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
1971  {
1973 
1974  const size_t M( A.rows() );
1975  const size_t N( B.columns() );
1976 
1977  for( size_t i=0UL; i<M; ++i )
1978  {
// Column range of row i of B: an upper B starts at the diagonal (one past it if strictly
// upper), a lower B ends after the diagonal (at it if strictly lower).
1979  const size_t jbegin( ( IsUpper<MT5>::value )
1980  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
1981  :( 0UL ) );
1982  const size_t jend( ( IsLower<MT5>::value )
1983  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
1984  :( N ) );
1985  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
1986 
// jpos marks the end of the part of the range that is processed two columns at a time.
1987  const size_t jnum( jend - jbegin );
1988  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
1989 
1990  for( size_t j=jbegin; j<jpos; j+=2UL ) {
1991  C(i,j ) += A(i,i) * B(i,j );
1992  C(i,j+1UL) += A(i,i) * B(i,j+1UL);
1993  }
// Odd column count: handle the single remaining column.
1994  if( jpos < jend ) {
1995  C(i,jpos) += A(i,i) * B(i,jpos);
1996  }
1997  }
1998  }
2000  //**********************************************************************************************
2001 
2002  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default addition assignment kernel, enabled when both operand types are diagonal.
// Only diagonal elements are touched: C(i,i) += A(i,i) * B(i,i) for every row i of A.
// NOTE(review): one line at the top of the body (listing line 2022) is elided here.
2016  template< typename MT3 // Type of the left-hand side target matrix
2017  , typename MT4 // Type of the left-hand side matrix operand
2018  , typename MT5 > // Type of the right-hand side matrix operand
2019  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2020  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2021  {
2023 
2024  for( size_t i=0UL; i<A.rows(); ++i ) {
2025  C(i,i) += A(i,i) * B(i,i);
2026  }
2027  }
2029  //**********************************************************************************************
2030 
2031  //**Default addition assignment to dense matrices (small matrices)******************************
// Fallback small-matrix addition assignment kernel: performs no work of its own and simply
// forwards the target and both operands to selectDefaultAddAssignKernel().
// NOTE(review): the return type (and with it whatever condition selects this overload) is on
// a listing line that is not reproduced in this excerpt.
2045  template< typename MT3 // Type of the left-hand side target matrix
2046  , typename MT4 // Type of the left-hand side matrix operand
2047  , typename MT5 > // Type of the right-hand side matrix operand
2049  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2050  {
2051  selectDefaultAddAssignKernel( C, A, B );
2052  }
2054  //**********************************************************************************************
2055 
2056  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
2071  template< typename MT3 // Type of the left-hand side target matrix
2072  , typename MT4 // Type of the left-hand side matrix operand
2073  , typename MT5 > // Type of the right-hand side matrix operand
2075  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2076  {
2077  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
2078 
2079  const size_t M( A.rows() );
2080  const size_t N( B.columns() );
2081  const size_t K( A.columns() );
2082 
2083  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2084 
2085  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
2086  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
2087 
2088  size_t j( 0UL );
2089 
2091  {
2092  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2093  for( size_t i=0UL; i<M; ++i )
2094  {
2095  const size_t kbegin( ( IsUpper<MT4>::value )
2096  ?( ( IsLower<MT5>::value )
2097  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2098  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2099  :( IsLower<MT5>::value ? j : 0UL ) );
2100  const size_t kend( ( IsLower<MT4>::value )
2101  ?( ( IsUpper<MT5>::value )
2102  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
2103  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2104  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
2105 
2106  SIMDType xmm1( (~C).load(i,j ) );
2107  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2108  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2109  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2110  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2111  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
2112  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
2113  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
2114 
2115  for( size_t k=kbegin; k<kend; ++k ) {
2116  const SIMDType a1( set( A(i,k) ) );
2117  xmm1 += a1 * B.load(k,j );
2118  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2119  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2120  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2121  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2122  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
2123  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
2124  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
2125  }
2126 
2127  (~C).store( i, j , xmm1 );
2128  (~C).store( i, j+SIMDSIZE , xmm2 );
2129  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2130  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2131  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2132  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
2133  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
2134  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
2135  }
2136  }
2137  }
2138 
2139  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
2140  {
2141  size_t i( 0UL );
2142 
2143  for( ; (i+2UL) <= M; i+=2UL )
2144  {
2145  const size_t kbegin( ( IsUpper<MT4>::value )
2146  ?( ( IsLower<MT5>::value )
2147  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2148  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2149  :( IsLower<MT5>::value ? j : 0UL ) );
2150  const size_t kend( ( IsLower<MT4>::value )
2151  ?( ( IsUpper<MT5>::value )
2152  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
2153  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2154  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
2155 
2156  SIMDType xmm1 ( (~C).load(i ,j ) );
2157  SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
2158  SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
2159  SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
2160  SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
2161  SIMDType xmm6 ( (~C).load(i+1UL,j ) );
2162  SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
2163  SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2164  SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2165  SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
2166 
2167  for( size_t k=kbegin; k<kend; ++k ) {
2168  const SIMDType a1( set( A(i ,k) ) );
2169  const SIMDType a2( set( A(i+1UL,k) ) );
2170  const SIMDType b1( B.load(k,j ) );
2171  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2172  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2173  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2174  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
2175  xmm1 += a1 * b1;
2176  xmm2 += a1 * b2;
2177  xmm3 += a1 * b3;
2178  xmm4 += a1 * b4;
2179  xmm5 += a1 * b5;
2180  xmm6 += a2 * b1;
2181  xmm7 += a2 * b2;
2182  xmm8 += a2 * b3;
2183  xmm9 += a2 * b4;
2184  xmm10 += a2 * b5;
2185  }
2186 
2187  (~C).store( i , j , xmm1 );
2188  (~C).store( i , j+SIMDSIZE , xmm2 );
2189  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2190  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2191  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
2192  (~C).store( i+1UL, j , xmm6 );
2193  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
2194  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
2195  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
2196  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
2197  }
2198 
2199  if( i < M )
2200  {
2201  const size_t kbegin( ( IsUpper<MT4>::value )
2202  ?( ( IsLower<MT5>::value )
2203  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2204  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2205  :( IsLower<MT5>::value ? j : 0UL ) );
2206  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
2207 
2208  SIMDType xmm1( (~C).load(i,j ) );
2209  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2210  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2211  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2212  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2213 
2214  for( size_t k=kbegin; k<kend; ++k ) {
2215  const SIMDType a1( set( A(i,k) ) );
2216  xmm1 += a1 * B.load(k,j );
2217  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2218  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2219  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2220  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
2221  }
2222 
2223  (~C).store( i, j , xmm1 );
2224  (~C).store( i, j+SIMDSIZE , xmm2 );
2225  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2226  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2227  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2228  }
2229  }
2230 
2231  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2232  {
2233  size_t i( 0UL );
2234 
2235  for( ; (i+2UL) <= M; i+=2UL )
2236  {
2237  const size_t kbegin( ( IsUpper<MT4>::value )
2238  ?( ( IsLower<MT5>::value )
2239  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2240  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2241  :( IsLower<MT5>::value ? j : 0UL ) );
2242  const size_t kend( ( IsLower<MT4>::value )
2243  ?( ( IsUpper<MT5>::value )
2244  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
2245  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2246  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
2247 
2248  SIMDType xmm1( (~C).load(i ,j ) );
2249  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2250  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2251  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
2252  SIMDType xmm5( (~C).load(i+1UL,j ) );
2253  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
2254  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2255  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2256 
2257  for( size_t k=kbegin; k<kend; ++k ) {
2258  const SIMDType a1( set( A(i ,k) ) );
2259  const SIMDType a2( set( A(i+1UL,k) ) );
2260  const SIMDType b1( B.load(k,j ) );
2261  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2262  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2263  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2264  xmm1 += a1 * b1;
2265  xmm2 += a1 * b2;
2266  xmm3 += a1 * b3;
2267  xmm4 += a1 * b4;
2268  xmm5 += a2 * b1;
2269  xmm6 += a2 * b2;
2270  xmm7 += a2 * b3;
2271  xmm8 += a2 * b4;
2272  }
2273 
2274  (~C).store( i , j , xmm1 );
2275  (~C).store( i , j+SIMDSIZE , xmm2 );
2276  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2277  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2278  (~C).store( i+1UL, j , xmm5 );
2279  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
2280  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
2281  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
2282  }
2283 
2284  if( i < M )
2285  {
2286  const size_t kbegin( ( IsUpper<MT4>::value )
2287  ?( ( IsLower<MT5>::value )
2288  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2289  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2290  :( IsLower<MT5>::value ? j : 0UL ) );
2291  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
2292 
2293  SIMDType xmm1( (~C).load(i,j ) );
2294  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2295  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2296  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2297 
2298  for( size_t k=kbegin; k<kend; ++k ) {
2299  const SIMDType a1( set( A(i,k) ) );
2300  xmm1 += a1 * B.load(k,j );
2301  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2302  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2303  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
2304  }
2305 
2306  (~C).store( i, j , xmm1 );
2307  (~C).store( i, j+SIMDSIZE , xmm2 );
2308  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2309  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2310  }
2311  }
2312 
2313  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2314  {
2315  size_t i( 0UL );
2316 
2317  for( ; (i+2UL) <= M; i+=2UL )
2318  {
2319  const size_t kbegin( ( IsUpper<MT4>::value )
2320  ?( ( IsLower<MT5>::value )
2321  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2322  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2323  :( IsLower<MT5>::value ? j : 0UL ) );
2324  const size_t kend( ( IsLower<MT4>::value )
2325  ?( ( IsUpper<MT5>::value )
2326  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
2327  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2328  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
2329 
2330  SIMDType xmm1( (~C).load(i ,j ) );
2331  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2332  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2333  SIMDType xmm4( (~C).load(i+1UL,j ) );
2334  SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
2335  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2336 
2337  for( size_t k=kbegin; k<kend; ++k ) {
2338  const SIMDType a1( set( A(i ,k) ) );
2339  const SIMDType a2( set( A(i+1UL,k) ) );
2340  const SIMDType b1( B.load(k,j ) );
2341  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2342  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2343  xmm1 += a1 * b1;
2344  xmm2 += a1 * b2;
2345  xmm3 += a1 * b3;
2346  xmm4 += a2 * b1;
2347  xmm5 += a2 * b2;
2348  xmm6 += a2 * b3;
2349  }
2350 
2351  (~C).store( i , j , xmm1 );
2352  (~C).store( i , j+SIMDSIZE , xmm2 );
2353  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2354  (~C).store( i+1UL, j , xmm4 );
2355  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
2356  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
2357  }
2358 
2359  if( i < M )
2360  {
2361  const size_t kbegin( ( IsUpper<MT4>::value )
2362  ?( ( IsLower<MT5>::value )
2363  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2364  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2365  :( IsLower<MT5>::value ? j : 0UL ) );
2366  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
2367 
2368  SIMDType xmm1( (~C).load(i,j ) );
2369  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2370  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2371 
2372  for( size_t k=kbegin; k<kend; ++k ) {
2373  const SIMDType a1( set( A(i,k) ) );
2374  xmm1 += a1 * B.load(k,j );
2375  xmm2 += a1 * B.load(k,j+SIMDSIZE );
2376  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
2377  }
2378 
2379  (~C).store( i, j , xmm1 );
2380  (~C).store( i, j+SIMDSIZE , xmm2 );
2381  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2382  }
2383  }
2384 
2385  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2386  {
2387  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
2388  size_t i( LOW ? j : 0UL );
2389 
2390  for( ; (i+4UL) <= iend; i+=4UL )
2391  {
2392  const size_t kbegin( ( IsUpper<MT4>::value )
2393  ?( ( IsLower<MT5>::value )
2394  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2395  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2396  :( IsLower<MT5>::value ? j : 0UL ) );
2397  const size_t kend( ( IsLower<MT4>::value )
2398  ?( ( IsUpper<MT5>::value )
2399  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
2400  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
2401  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
2402 
2403  SIMDType xmm1( (~C).load(i ,j ) );
2404  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2405  SIMDType xmm3( (~C).load(i+1UL,j ) );
2406  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2407  SIMDType xmm5( (~C).load(i+2UL,j ) );
2408  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
2409  SIMDType xmm7( (~C).load(i+3UL,j ) );
2410  SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
2411 
2412  for( size_t k=kbegin; k<kend; ++k ) {
2413  const SIMDType a1( set( A(i ,k) ) );
2414  const SIMDType a2( set( A(i+1UL,k) ) );
2415  const SIMDType a3( set( A(i+2UL,k) ) );
2416  const SIMDType a4( set( A(i+3UL,k) ) );
2417  const SIMDType b1( B.load(k,j ) );
2418  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2419  xmm1 += a1 * b1;
2420  xmm2 += a1 * b2;
2421  xmm3 += a2 * b1;
2422  xmm4 += a2 * b2;
2423  xmm5 += a3 * b1;
2424  xmm6 += a3 * b2;
2425  xmm7 += a4 * b1;
2426  xmm8 += a4 * b2;
2427  }
2428 
2429  (~C).store( i , j , xmm1 );
2430  (~C).store( i , j+SIMDSIZE, xmm2 );
2431  (~C).store( i+1UL, j , xmm3 );
2432  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
2433  (~C).store( i+2UL, j , xmm5 );
2434  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
2435  (~C).store( i+3UL, j , xmm7 );
2436  (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
2437  }
2438 
2439  for( ; (i+3UL) <= iend; i+=3UL )
2440  {
2441  const size_t kbegin( ( IsUpper<MT4>::value )
2442  ?( ( IsLower<MT5>::value )
2443  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2444  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2445  :( IsLower<MT5>::value ? j : 0UL ) );
2446  const size_t kend( ( IsLower<MT4>::value )
2447  ?( ( IsUpper<MT5>::value )
2448  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
2449  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
2450  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
2451 
2452  SIMDType xmm1( (~C).load(i ,j ) );
2453  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2454  SIMDType xmm3( (~C).load(i+1UL,j ) );
2455  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2456  SIMDType xmm5( (~C).load(i+2UL,j ) );
2457  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
2458 
2459  for( size_t k=kbegin; k<kend; ++k ) {
2460  const SIMDType a1( set( A(i ,k) ) );
2461  const SIMDType a2( set( A(i+1UL,k) ) );
2462  const SIMDType a3( set( A(i+2UL,k) ) );
2463  const SIMDType b1( B.load(k,j ) );
2464  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2465  xmm1 += a1 * b1;
2466  xmm2 += a1 * b2;
2467  xmm3 += a2 * b1;
2468  xmm4 += a2 * b2;
2469  xmm5 += a3 * b1;
2470  xmm6 += a3 * b2;
2471  }
2472 
2473  (~C).store( i , j , xmm1 );
2474  (~C).store( i , j+SIMDSIZE, xmm2 );
2475  (~C).store( i+1UL, j , xmm3 );
2476  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
2477  (~C).store( i+2UL, j , xmm5 );
2478  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
2479  }
2480 
2481  for( ; (i+2UL) <= iend; i+=2UL )
2482  {
2483  const size_t kbegin( ( IsUpper<MT4>::value )
2484  ?( ( IsLower<MT5>::value )
2485  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2486  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2487  :( IsLower<MT5>::value ? j : 0UL ) );
2488  const size_t kend( ( IsLower<MT4>::value )
2489  ?( ( IsUpper<MT5>::value )
2490  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
2491  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2492  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
2493 
2494  SIMDType xmm1( (~C).load(i ,j ) );
2495  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2496  SIMDType xmm3( (~C).load(i+1UL,j ) );
2497  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2498  SIMDType xmm5, xmm6, xmm7, xmm8;
2499  size_t k( kbegin );
2500 
2501  for( ; (k+2UL) <= kend; k+=2UL ) {
2502  const SIMDType a1( set( A(i ,k ) ) );
2503  const SIMDType a2( set( A(i+1UL,k ) ) );
2504  const SIMDType a3( set( A(i ,k+1UL) ) );
2505  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
2506  const SIMDType b1( B.load(k ,j ) );
2507  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
2508  const SIMDType b3( B.load(k+1UL,j ) );
2509  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
2510  xmm1 += a1 * b1;
2511  xmm2 += a1 * b2;
2512  xmm3 += a2 * b1;
2513  xmm4 += a2 * b2;
2514  xmm5 += a3 * b3;
2515  xmm6 += a3 * b4;
2516  xmm7 += a4 * b3;
2517  xmm8 += a4 * b4;
2518  }
2519 
2520  for( ; k<kend; ++k ) {
2521  const SIMDType a1( set( A(i ,k) ) );
2522  const SIMDType a2( set( A(i+1UL,k) ) );
2523  const SIMDType b1( B.load(k,j ) );
2524  const SIMDType b2( B.load(k,j+SIMDSIZE) );
2525  xmm1 += a1 * b1;
2526  xmm2 += a1 * b2;
2527  xmm3 += a2 * b1;
2528  xmm4 += a2 * b2;
2529  }
2530 
2531  (~C).store( i , j , xmm1+xmm5 );
2532  (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
2533  (~C).store( i+1UL, j , xmm3+xmm7 );
2534  (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
2535  }
2536 
2537  if( i < iend )
2538  {
2539  const size_t kbegin( ( IsUpper<MT4>::value )
2540  ?( ( IsLower<MT5>::value )
2541  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2542  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2543  :( IsLower<MT5>::value ? j : 0UL ) );
2544  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
2545 
2546  SIMDType xmm1( (~C).load(i,j ) );
2547  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
2548  SIMDType xmm3, xmm4;
2549  size_t k( kbegin );
2550 
2551  for( ; (k+2UL) <= kend; k+=2UL ) {
2552  const SIMDType a1( set( A(i,k ) ) );
2553  const SIMDType a2( set( A(i,k+1UL) ) );
2554  xmm1 += a1 * B.load(k ,j );
2555  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
2556  xmm3 += a2 * B.load(k+1UL,j );
2557  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
2558  }
2559 
2560  for( ; k<kend; ++k ) {
2561  const SIMDType a1( set( A(i,k) ) );
2562  xmm1 += a1 * B.load(k,j );
2563  xmm2 += a1 * B.load(k,j+SIMDSIZE);
2564  }
2565 
2566  (~C).store( i, j , xmm1+xmm3 );
2567  (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
2568  }
2569  }
2570 
2571  for( ; j<jpos; j+=SIMDSIZE )
2572  {
2573  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
2574  size_t i( LOW ? j : 0UL );
2575 
2576  for( ; (i+4UL) <= iend; i+=4UL )
2577  {
2578  const size_t kbegin( ( IsUpper<MT4>::value )
2579  ?( ( IsLower<MT5>::value )
2580  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2581  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2582  :( IsLower<MT5>::value ? j : 0UL ) );
2583  const size_t kend( ( IsLower<MT4>::value )
2584  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
2585  :( K ) );
2586 
2587  SIMDType xmm1( (~C).load(i ,j) );
2588  SIMDType xmm2( (~C).load(i+1UL,j) );
2589  SIMDType xmm3( (~C).load(i+2UL,j) );
2590  SIMDType xmm4( (~C).load(i+3UL,j) );
2591  SIMDType xmm5, xmm6, xmm7, xmm8;
2592  size_t k( kbegin );
2593 
2594  for( ; (k+2UL) <= kend; k+=2UL ) {
2595  const SIMDType b1( B.load(k ,j) );
2596  const SIMDType b2( B.load(k+1UL,j) );
2597  xmm1 += set( A(i ,k ) ) * b1;
2598  xmm2 += set( A(i+1UL,k ) ) * b1;
2599  xmm3 += set( A(i+2UL,k ) ) * b1;
2600  xmm4 += set( A(i+3UL,k ) ) * b1;
2601  xmm5 += set( A(i ,k+1UL) ) * b2;
2602  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
2603  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
2604  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
2605  }
2606 
2607  for( ; k<kend; ++k ) {
2608  const SIMDType b1( B.load(k,j) );
2609  xmm1 += set( A(i ,k) ) * b1;
2610  xmm2 += set( A(i+1UL,k) ) * b1;
2611  xmm3 += set( A(i+2UL,k) ) * b1;
2612  xmm4 += set( A(i+3UL,k) ) * b1;
2613  }
2614 
2615  (~C).store( i , j, xmm1+xmm5 );
2616  (~C).store( i+1UL, j, xmm2+xmm6 );
2617  (~C).store( i+2UL, j, xmm3+xmm7 );
2618  (~C).store( i+3UL, j, xmm4+xmm8 );
2619  }
2620 
2621  for( ; (i+3UL) <= iend; i+=3UL )
2622  {
2623  const size_t kbegin( ( IsUpper<MT4>::value )
2624  ?( ( IsLower<MT5>::value )
2625  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2626  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2627  :( IsLower<MT5>::value ? j : 0UL ) );
2628  const size_t kend( ( IsLower<MT4>::value )
2629  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
2630  :( K ) );
2631 
2632  SIMDType xmm1( (~C).load(i ,j) );
2633  SIMDType xmm2( (~C).load(i+1UL,j) );
2634  SIMDType xmm3( (~C).load(i+2UL,j) );
2635  SIMDType xmm4, xmm5, xmm6;
2636  size_t k( kbegin );
2637 
2638  for( ; (k+2UL) <= kend; k+=2UL ) {
2639  const SIMDType b1( B.load(k ,j) );
2640  const SIMDType b2( B.load(k+1UL,j) );
2641  xmm1 += set( A(i ,k ) ) * b1;
2642  xmm2 += set( A(i+1UL,k ) ) * b1;
2643  xmm3 += set( A(i+2UL,k ) ) * b1;
2644  xmm4 += set( A(i ,k+1UL) ) * b2;
2645  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
2646  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
2647  }
2648 
2649  for( ; k<kend; ++k ) {
2650  const SIMDType b1( B.load(k,j) );
2651  xmm1 += set( A(i ,k) ) * b1;
2652  xmm2 += set( A(i+1UL,k) ) * b1;
2653  xmm3 += set( A(i+2UL,k) ) * b1;
2654  }
2655 
2656  (~C).store( i , j, xmm1+xmm4 );
2657  (~C).store( i+1UL, j, xmm2+xmm5 );
2658  (~C).store( i+2UL, j, xmm3+xmm6 );
2659  }
2660 
2661  for( ; (i+2UL) <= iend; i+=2UL )
2662  {
2663  const size_t kbegin( ( IsUpper<MT4>::value )
2664  ?( ( IsLower<MT5>::value )
2665  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2666  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2667  :( IsLower<MT5>::value ? j : 0UL ) );
2668  const size_t kend( ( IsLower<MT4>::value )
2669  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2670  :( K ) );
2671 
2672  SIMDType xmm1( (~C).load(i ,j) );
2673  SIMDType xmm2( (~C).load(i+1UL,j) );
2674  SIMDType xmm3, xmm4;
2675  size_t k( kbegin );
2676 
2677  for( ; (k+2UL) <= kend; k+=2UL ) {
2678  const SIMDType b1( B.load(k ,j) );
2679  const SIMDType b2( B.load(k+1UL,j) );
2680  xmm1 += set( A(i ,k ) ) * b1;
2681  xmm2 += set( A(i+1UL,k ) ) * b1;
2682  xmm3 += set( A(i ,k+1UL) ) * b2;
2683  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
2684  }
2685 
2686  for( ; k<kend; ++k ) {
2687  const SIMDType b1( B.load(k,j) );
2688  xmm1 += set( A(i ,k) ) * b1;
2689  xmm2 += set( A(i+1UL,k) ) * b1;
2690  }
2691 
2692  (~C).store( i , j, xmm1+xmm3 );
2693  (~C).store( i+1UL, j, xmm2+xmm4 );
2694  }
2695 
2696  if( i < iend )
2697  {
2698  const size_t kbegin( ( IsUpper<MT4>::value )
2699  ?( ( IsLower<MT5>::value )
2700  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2701  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2702  :( IsLower<MT5>::value ? j : 0UL ) );
2703 
2704  SIMDType xmm1( (~C).load(i,j) );
2705  SIMDType xmm2;
2706  size_t k( kbegin );
2707 
2708  for( ; (k+2UL) <= K; k+=2UL ) {
2709  xmm1 += set( A(i,k ) ) * B.load(k ,j);
2710  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
2711  }
2712 
2713  for( ; k<K; ++k ) {
2714  xmm1 += set( A(i,k) ) * B.load(k,j);
2715  }
2716 
2717  (~C).store( i, j, xmm1+xmm2 );
2718  }
2719  }
2720 
2721  for( ; remainder && j<N; ++j )
2722  {
2723  const size_t iend( UPP ? j+1UL : M );
2724  size_t i( LOW ? j : 0UL );
2725 
2726  for( ; (i+2UL) <= iend; i+=2UL )
2727  {
2728  const size_t kbegin( ( IsUpper<MT4>::value )
2729  ?( ( IsLower<MT5>::value )
2730  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2731  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2732  :( IsLower<MT5>::value ? j : 0UL ) );
2733  const size_t kend( ( IsLower<MT4>::value )
2734  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2735  :( K ) );
2736 
2737  ElementType value1( (~C)(i ,j) );
2738  ElementType value2( (~C)(i+1UL,j) );;
2739 
2740  for( size_t k=kbegin; k<kend; ++k ) {
2741  value1 += A(i ,k) * B(k,j);
2742  value2 += A(i+1UL,k) * B(k,j);
2743  }
2744 
2745  (~C)(i ,j) = value1;
2746  (~C)(i+1UL,j) = value2;
2747  }
2748 
2749  if( i < iend )
2750  {
2751  const size_t kbegin( ( IsUpper<MT4>::value )
2752  ?( ( IsLower<MT5>::value )
2753  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2754  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2755  :( IsLower<MT5>::value ? j : 0UL ) );
2756 
2757  ElementType value( (~C)(i,j) );
2758 
2759  for( size_t k=kbegin; k<K; ++k ) {
2760  value += A(i,k) * B(k,j);
2761  }
2762 
2763  (~C)(i,j) = value;
2764  }
2765  }
2766  }
2768  //**********************************************************************************************
2769 
2770  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Addition-assignment kernel for small matrices whose target is a column-major
// dense matrix (the target parameter is DenseMatrix<MT3,true>).
//
// Instead of multiplying A and B directly, every visible branch copies one of
// the two operands into a temporary of its OppositeType_ and then calls
// addAssign() again with the product of that temporary and the untouched
// operand, wrapped by a ForwardFunctor.
//
// C: target matrix that the product is added to.
// A: left-hand side operand of the multiplication.
// B: right-hand side operand of the multiplication.
//
// NOTE(review): this listing omits several lines of the function (the line
// before the function name, and the conditions that open the first two
// branches). Confirm against the original header before relying on the
// description of those branches.
2785  template< typename MT3 // Type of the left-hand side target matrix
2786  , typename MT4 // Type of the left-hand side matrix operand
2787  , typename MT5 > // Type of the right-hand side matrix operand
2789  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2790  {
2795 
2796  const ForwardFunctor fwd;
2797 
      // First branch (condition not shown in this listing): convert A, keep B.
2799  const OppositeType_<MT4> tmp( serial( A ) );
2800  addAssign( ~C, fwd( tmp * B ) );
2801  }
      // Second branch (condition not shown in this listing): convert B, keep A.
2803  const OppositeType_<MT5> tmp( serial( B ) );
2804  addAssign( ~C, fwd( A * tmp ) );
2805  }
      // Otherwise convert whichever operand has fewer elements; on a tie A is converted.
2806  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2807  const OppositeType_<MT4> tmp( serial( A ) );
2808  addAssign( ~C, fwd( tmp * B ) );
2809  }
2810  else {
2811  const OppositeType_<MT5> tmp( serial( B ) );
2812  addAssign( ~C, fwd( A * tmp ) );
2813  }
2814  }
2816  //**********************************************************************************************
2817 
2818  //**Default addition assignment to dense matrices (large matrices)******************************
// Default addition-assignment kernel for large matrices.
//
// This overload has no implementation of its own: it forwards C, A and B
// unchanged to selectDefaultAddAssignKernel().
//
// NOTE(review): the line before the function name (presumably the return type
// and the condition selecting this overload) is not present in this listing.
2832  template< typename MT3 // Type of the left-hand side target matrix
2833  , typename MT4 // Type of the left-hand side matrix operand
2834  , typename MT5 > // Type of the right-hand side matrix operand
2836  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2837  {
2838  selectDefaultAddAssignKernel( C, A, B );
2839  }
2841  //**********************************************************************************************
2842 
2843  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Vectorized addition-assignment kernel for large matrices.
//
// Dispatches on the class-level flags LOW and UPP to one of three routines:
//   LOW set           -> lmmm()
//   UPP set (not LOW) -> ummm()
//   neither           -> mmm()
// All three receive C, A, B and two scalar arguments that are both
// ElementType(1).
// NOTE(review): the two scalars are presumably the factors applied to the
// product and to the existing content of C; confirm against the mmm()
// documentation.
//
// NOTE(review): the line before the function name (presumably the return type
// and the condition selecting this overload) is not present in this listing.
2858  template< typename MT3 // Type of the left-hand side target matrix
2859  , typename MT4 // Type of the left-hand side matrix operand
2860  , typename MT5 > // Type of the right-hand side matrix operand
2862  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2863  {
2864  if( LOW )
2865  lmmm( C, A, B, ElementType(1), ElementType(1) );
2866  else if( UPP )
2867  ummm( C, A, B, ElementType(1), ElementType(1) );
2868  else
2869  mmm( C, A, B, ElementType(1), ElementType(1) );
2870  }
2872  //**********************************************************************************************
2873 
2874  //**BLAS-based addition assignment to dense matrices (default)**********************************
// Default overload of the BLAS-based addition-assignment kernel.
//
// This overload does not call BLAS itself: it forwards C, A and B unchanged to
// selectLargeAddAssignKernel().
//
// NOTE(review): the line before the function name (presumably the return type
// and the condition selecting this overload) is not present in this listing.
2888  template< typename MT3 // Type of the left-hand side target matrix
2889  , typename MT4 // Type of the left-hand side matrix operand
2890  , typename MT5 > // Type of the right-hand side matrix operand
2892  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2893  {
2894  selectLargeAddAssignKernel( C, A, B );
2895  }
2897  //**********************************************************************************************
2898 
2899  //**BLAS-based addition assignment to dense matrices********************************************
2900 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2901 
// BLAS-based addition-assignment kernel.
//
// Three cases, decided at compile time from the operand types:
//   * A is triangular: B is copied into a temporary of the target's result
//     type, trmm() is applied to the temporary with A on the left
//     (CblasLeft), and the temporary is then added to C.
//   * B is triangular (and A is not): the same scheme with A copied into the
//     temporary and B applied from the right (CblasRight).
//   * otherwise: a single gemm() call on C, A and B with both scalar
//     arguments equal to ET(1).
// In the trmm() cases the lower/upper argument follows IsLower<> of the
// triangular operand.
// NOTE(review): ET(1) is presumably the scaling factor of the product (and,
// for gemm(), also of the existing content of C); confirm against the
// trmm()/gemm() wrappers.
//
// NOTE(review): the line before the function name (presumably the return type
// and the condition selecting this overload) is not present in this listing.
2914  template< typename MT3 // Type of the left-hand side target matrix
2915  , typename MT4 // Type of the left-hand side matrix operand
2916  , typename MT5 > // Type of the right-hand side matrix operand
2918  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2919  {
2920  using ET = ElementType_<MT3>;
2921 
2922  if( IsTriangular<MT4>::value ) {
      // Triangular left operand: multiply into a copy of B, then accumulate.
2923  ResultType_<MT3> tmp( serial( B ) );
2924  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2925  addAssign( C, tmp );
2926  }
2927  else if( IsTriangular<MT5>::value ) {
      // Triangular right operand: multiply into a copy of A, then accumulate.
2928  ResultType_<MT3> tmp( serial( A ) );
2929  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2930  addAssign( C, tmp );
2931  }
2932  else {
      // General operands: let gemm() work on C directly.
2933  gemm( C, A, B, ET(1), ET(1) );
2934  }
2935  }
2937 #endif
2938  //**********************************************************************************************
2939 
2940  //**Restructuring addition assignment to column-major matrices**********************************
// Restructuring addition assignment of a dense matrix/dense matrix
// multiplication to a column-major matrix (the target is Matrix<MT,true>).
//
// The product is not evaluated here. Depending on the operand types, it is
// rewritten with trans() applied to one or both operands and handed back to
// addAssign() through a ForwardFunctor.
//
// lhs: target matrix; its dimensions must match those of rhs (asserted below).
// rhs: the multiplication expression to be added.
//
// NOTE(review): this listing omits several lines of the function, including
// the line before the function name and the condition of the first branch.
// Confirm against the original header.
2955  template< typename MT > // Type of the target matrix
2957  addAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
2958  {
2960 
2962 
2963  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2964  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2965 
2966  const ForwardFunctor fwd;
2967 
      // First branch (condition not shown in this listing): both operands go through trans().
2969  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
      // Left operand type is symmetric: only the left operand goes through trans().
2970  else if( IsSymmetric<MT1>::value )
2971  addAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
      // Remaining case: only the right operand goes through trans().
2972  else
2973  addAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
2974  }
2976  //**********************************************************************************************
2977 
2978  //**Addition assignment to sparse matrices******************************************************
2979  // No special implementation for the addition assignment to sparse matrices.
2980  //**********************************************************************************************
2981 
2982  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a dense matrix/dense matrix multiplication to a
// dense matrix of either storage order.
//
// Steps:
//   1. Assert that the target has the same dimensions as the expression.
//   2. Return without doing anything if the target has no rows, no columns,
//      or the left operand has no columns (empty inner dimension).
//   3. Evaluate both operands into the local objects A (type LT) and B
//      (type RT).
//   4. Hand target and operands to selectSubAssignKernel().
//
// lhs: target matrix the product is subtracted from.
// rhs: the multiplication expression to be subtracted.
//
// NOTE(review): the line before the function name and one line at the top of
// the body are not present in this listing.
2995  template< typename MT // Type of the target dense matrix
2996  , bool SO > // Storage order of the target dense matrix
2998  subAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
2999  {
3001 
3002  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
3003  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
3004 
      // Nothing to subtract when any of the three dimensions is zero.
3005  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3006  return;
3007  }
3008 
3009  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
3010  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
3011 
      // The evaluated operands must keep the sizes of the original operands and fit the target.
3012  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
3013  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
3014  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
3015  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
3016  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
3017  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
3018 
3019  DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
3020  }
3022  //**********************************************************************************************
3023 
3024  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Chooses the kernel used for the subtraction assignment C -= A * B.
//
// selectSmallSubAssignKernel() is used when at least one of these holds:
//   * the type of B is diagonal (IsDiagonal<MT5>),
//   * BLAZE_DEBUG_MODE is off and B has at most SIMDSIZE*10 columns,
//   * the number of elements of C (rows * columns) is below
//     DMATDMATMULT_THRESHOLD.
// In every other case selectBlasSubAssignKernel() is used.
//
// C: target matrix.
// A: left-hand side operand of the multiplication.
// B: right-hand side operand of the multiplication.
3035  template< typename MT3 // Type of the left-hand side target matrix
3036  , typename MT4 // Type of the left-hand side matrix operand
3037  , typename MT5 > // Type of the right-hand side matrix operand
3038  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3039  {
3040  if( ( IsDiagonal<MT5>::value ) ||
3041  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
3042  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
3043  selectSmallSubAssignKernel( C, A, B );
3044  else
3045  selectBlasSubAssignKernel( C, A, B );
3046  }
3048  //**********************************************************************************************
3049 
3050  //**Default subtraction assignment to dense matrices (general/general)**************************
// Default (scalar) subtraction-assignment kernel for the case that neither
// operand type is diagonal: C(i,j) -= A(i,k) * B(k,j).
//
// Loop order is i (rows of A), then k (columns of A), then j (columns of B).
// The k range is narrowed according to the upper/lower properties of the type
// of A. The j range is narrowed according to the same properties of the type
// of B and according to the class-level flags LOW and UPP. The j loop handles
// two columns per iteration, followed by one extra column if the range has
// odd length.
//
// C: target matrix.
// A: left-hand side operand (M x K).
// B: right-hand side operand (K x N).
//
// NOTE(review): two lines inside the jbegin/jend initializers are not present
// in this listing, so the nested conditions below are shown incompletely.
3064  template< typename MT3 // Type of the left-hand side target matrix
3065  , typename MT4 // Type of the left-hand side matrix operand
3066  , typename MT5 > // Type of the right-hand side matrix operand
3067  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
3068  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3069  {
3070  const size_t M( A.rows() );
3071  const size_t N( B.columns() );
3072  const size_t K( A.columns() );
3073 
      // Whenever LOW or UPP is set, the result is required to be square.
3074  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3075 
3076  for( size_t i=0UL; i<M; ++i )
3077  {
      // Upper A: start at the diagonal of row i (one past it if strictly upper); otherwise at 0.
3078  const size_t kbegin( ( IsUpper<MT4>::value )
3079  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
3080  :( 0UL ) );
      // Lower A: stop after the diagonal of row i (before it if strictly lower); otherwise at K.
3081  const size_t kend( ( IsLower<MT4>::value )
3082  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
3083  :( K ) );
3084  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
3085 
3086  for( size_t k=kbegin; k<kend; ++k )
3087  {
      // First column to update for this (i,k); with UPP it is never left of column i.
3088  const size_t jbegin( ( IsUpper<MT5>::value )
3090  ?( UPP ? max(i,k+1UL) : k+1UL )
3091  :( UPP ? max(i,k) : k ) )
3092  :( UPP ? i : 0UL ) );
      // One past the last column to update; with LOW it is never right of column i.
3093  const size_t jend( ( IsLower<MT5>::value )
3095  ?( LOW ? min(i+1UL,k) : k )
3096  :( LOW ? min(i,k)+1UL : k+1UL ) )
3097  :( LOW ? i+1UL : N ) );
3098 
      // With LOW/UPP the restricted column range may be empty; skip it.
3099  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
3100  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3101 
      // jpos marks the end of the part of [jbegin,jend) that has even length.
3102  const size_t jnum( jend - jbegin );
3103  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3104 
3105  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3106  C(i,j ) -= A(i,k) * B(k,j );
3107  C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3108  }
      // Remaining single column when the range length is odd.
3109  if( jpos < jend ) {
3110  C(i,jpos) -= A(i,k) * B(k,jpos);
3111  }
3112  }
3113  }
3114  }
3116  //**********************************************************************************************
3117 
3118  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
// Default (scalar) subtraction-assignment kernel for a non-diagonal left
// operand and a diagonal right operand.
//
// Only the diagonal of B is read, so each target element is updated by a
// single product: C(i,j) -= A(i,j) * B(j,j).
// The j range of each row is narrowed according to the upper/lower properties
// of the type of A. The j loop handles two columns per iteration, followed by
// one extra column if the range has odd length.
//
// C: target matrix.
// A: left-hand side operand.
// B: right-hand side (diagonal) operand.
//
// NOTE(review): one line at the top of the body is not present in this
// listing.
3132  template< typename MT3 // Type of the left-hand side target matrix
3133  , typename MT4 // Type of the left-hand side matrix operand
3134  , typename MT5 > // Type of the right-hand side matrix operand
3135  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
3136  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3137  {
3139 
3140  const size_t M( A.rows() );
3141  const size_t N( B.columns() );
3142 
3143  for( size_t i=0UL; i<M; ++i )
3144  {
      // Upper A: start at the diagonal of row i (one past it if strictly upper); otherwise at 0.
3145  const size_t jbegin( ( IsUpper<MT4>::value )
3146  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
3147  :( 0UL ) );
      // Lower A: stop after the diagonal of row i (before it if strictly lower); otherwise at N.
3148  const size_t jend( ( IsLower<MT4>::value )
3149  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
3150  :( N ) );
3151  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3152 
      // jpos marks the end of the part of [jbegin,jend) that has even length.
3153  const size_t jnum( jend - jbegin );
3154  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3155 
3156  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3157  C(i,j ) -= A(i,j ) * B(j ,j );
3158  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
3159  }
      // Remaining single column when the range length is odd.
3160  if( jpos < jend ) {
3161  C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
3162  }
3163  }
3164  }
3166  //**********************************************************************************************
3167 
3168  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
// Default (scalar) subtraction-assignment kernel for a diagonal left operand
// and a non-diagonal right operand.
//
// Only the diagonal of A is read, so each target element is updated by a
// single product: C(i,j) -= A(i,i) * B(i,j).
// The j range of each row is narrowed according to the upper/lower properties
// of the type of B. The j loop handles two columns per iteration, followed by
// one extra column if the range has odd length.
//
// C: target matrix.
// A: left-hand side (diagonal) operand.
// B: right-hand side operand.
//
// NOTE(review): one line at the top of the body is not present in this
// listing.
3182  template< typename MT3 // Type of the left-hand side target matrix
3183  , typename MT4 // Type of the left-hand side matrix operand
3184  , typename MT5 > // Type of the right-hand side matrix operand
3185  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
3186  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3187  {
3189 
3190  const size_t M( A.rows() );
3191  const size_t N( B.columns() );
3192 
3193  for( size_t i=0UL; i<M; ++i )
3194  {
      // Upper B: start at the diagonal of row i (one past it if strictly upper); otherwise at 0.
3195  const size_t jbegin( ( IsUpper<MT5>::value )
3196  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
3197  :( 0UL ) );
      // Lower B: stop after the diagonal of row i (before it if strictly lower); otherwise at N.
3198  const size_t jend( ( IsLower<MT5>::value )
3199  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
3200  :( N ) );
3201  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
3202 
      // jpos marks the end of the part of [jbegin,jend) that has even length.
3203  const size_t jnum( jend - jbegin );
3204  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
3205 
3206  for( size_t j=jbegin; j<jpos; j+=2UL ) {
3207  C(i,j ) -= A(i,i) * B(i,j );
3208  C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
3209  }
      // Remaining single column when the range length is odd.
3210  if( jpos < jend ) {
3211  C(i,jpos) -= A(i,i) * B(i,jpos);
3212  }
3213  }
3214  }
3216  //**********************************************************************************************
3217 
3218  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default (scalar) subtraction-assignment kernel for two diagonal operands.
//
// Only diagonal elements are touched: for every row index i of A,
// C(i,i) -= A(i,i) * B(i,i).
//
// C: target matrix.
// A: left-hand side (diagonal) operand.
// B: right-hand side (diagonal) operand.
//
// NOTE(review): one line at the top of the body is not present in this
// listing.
3232  template< typename MT3 // Type of the left-hand side target matrix
3233  , typename MT4 // Type of the left-hand side matrix operand
3234  , typename MT5 > // Type of the right-hand side matrix operand
3235  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
3236  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3237  {
3239 
3240  for( size_t i=0UL; i<A.rows(); ++i ) {
3241  C(i,i) -= A(i,i) * B(i,i);
3242  }
3243  }
3245  //**********************************************************************************************
3246 
3247  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Default subtraction-assignment kernel for small matrices.
//
// This overload has no implementation of its own: it forwards C, A and B
// unchanged to selectDefaultSubAssignKernel().
//
// NOTE(review): the line before the function name (presumably the return type
// and the condition selecting this overload) is not present in this listing.
3261  template< typename MT3 // Type of the left-hand side target matrix
3262  , typename MT4 // Type of the left-hand side matrix operand
3263  , typename MT5 > // Type of the right-hand side matrix operand
3265  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
3266  {
3267  selectDefaultSubAssignKernel( C, A, B );
3268  }
3270  //**********************************************************************************************
3271 
3272  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
// Vectorized kernel for the subtraction assignment C -= A * B to a row-major target
// (DenseMatrix<MT3,false>).
//
// The columns of C are processed in panels whose width is a multiple of SIMDSIZE, from the widest
// panel (8*SIMDSIZE) down to a single SIMD vector; the columns in [jpos,N) are handled by a scalar
// loop. For every tile the current values of C are loaded, the products A(i,k) * B(k,j...) are
// subtracted for k in [kbegin,kend), and the result is stored back to C.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// NOTE(review): LOW, UPP, SIMDSIZE, SIMDType and ElementType are declared outside this listing.
// NOTE(review): source lines 3290 and 3306 are not visible in this listing; the latter presumably
// holds the statement controlling the brace block that opens on line 3307 - confirm against the
// original header.
3287  template< typename MT3 // Type of the left-hand side target matrix
3288  , typename MT4 // Type of the left-hand side matrix operand
3289  , typename MT5 > // Type of the right-hand side matrix operand
3291  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3292  {
// A scalar remainder loop is required unless both MT3 and MT5 are padded.
3293  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
3294 
3295  const size_t M( A.rows() );
3296  const size_t N( B.columns() );
3297  const size_t K( A.columns() );
3298 
// Whenever LOW or UPP is set, M == N is expected.
3299  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3300 
// End of the vectorized column range: N rounded down to a multiple of SIMDSIZE if a remainder loop
// is required (verified by the assertion below), N otherwise.
3301  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3302  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3303 
3304  size_t j( 0UL );
3305 
3307  {
// Panels of 8 SIMD vectors, one row of C at a time (only if neither LOW nor UPP is set).
3308  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3309  for( size_t i=0UL; i<M; ++i )
3310  {
// kbegin/kend narrow the k range by means of the IsUpper/IsLower/IsStrictly* traits of A (MT4)
// and B (MT5) instead of always iterating over [0,K). The same scheme is used for all tiles below.
3311  const size_t kbegin( ( IsUpper<MT4>::value )
3312  ?( ( IsLower<MT5>::value )
3313  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3314  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3315  :( IsLower<MT5>::value ? j : 0UL ) );
3316  const size_t kend( ( IsLower<MT4>::value )
3317  ?( ( IsUpper<MT5>::value )
3318  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
3319  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
3320  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
3321 
// Load the current values of C, subtract the products, store the result back.
3322  SIMDType xmm1( (~C).load(i,j ) );
3323  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3324  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3325  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3326  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3327  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
3328  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
3329  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
3330 
3331  for( size_t k=kbegin; k<kend; ++k ) {
3332  const SIMDType a1( set( A(i,k) ) );
3333  xmm1 -= a1 * B.load(k,j );
3334  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3335  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3336  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3337  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3338  xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
3339  xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
3340  xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
3341  }
3342 
3343  (~C).store( i, j , xmm1 );
3344  (~C).store( i, j+SIMDSIZE , xmm2 );
3345  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3346  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3347  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
3348  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
3349  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
3350  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
3351  }
3352  }
3353  }
3354 
// Panels of 5 SIMD vectors, two rows of C at a time (only if neither LOW nor UPP is set).
3355  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3356  {
3357  size_t i( 0UL );
3358 
3359  for( ; (i+2UL) <= M; i+=2UL )
3360  {
3361  const size_t kbegin( ( IsUpper<MT4>::value )
3362  ?( ( IsLower<MT5>::value )
3363  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3364  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3365  :( IsLower<MT5>::value ? j : 0UL ) );
3366  const size_t kend( ( IsLower<MT4>::value )
3367  ?( ( IsUpper<MT5>::value )
3368  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
3369  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3370  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
3371 
3372  SIMDType xmm1 ( (~C).load(i ,j ) );
3373  SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
3374  SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
3375  SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
3376  SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
3377  SIMDType xmm6 ( (~C).load(i+1UL,j ) );
3378  SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
3379  SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3380  SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3381  SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
3382 
3383  for( size_t k=kbegin; k<kend; ++k ) {
3384  const SIMDType a1( set( A(i ,k) ) );
3385  const SIMDType a2( set( A(i+1UL,k) ) );
3386  const SIMDType b1( B.load(k,j ) );
3387  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3388  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3389  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3390  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3391  xmm1 -= a1 * b1;
3392  xmm2 -= a1 * b2;
3393  xmm3 -= a1 * b3;
3394  xmm4 -= a1 * b4;
3395  xmm5 -= a1 * b5;
3396  xmm6 -= a2 * b1;
3397  xmm7 -= a2 * b2;
3398  xmm8 -= a2 * b3;
3399  xmm9 -= a2 * b4;
3400  xmm10 -= a2 * b5;
3401  }
3402 
3403  (~C).store( i , j , xmm1 );
3404  (~C).store( i , j+SIMDSIZE , xmm2 );
3405  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3406  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3407  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
3408  (~C).store( i+1UL, j , xmm6 );
3409  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
3410  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3411  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3412  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
3413  }
3414 
// Single leftover row (M is odd); the k range ends at K unless B is upper.
3415  if( i < M )
3416  {
3417  const size_t kbegin( ( IsUpper<MT4>::value )
3418  ?( ( IsLower<MT5>::value )
3419  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3420  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3421  :( IsLower<MT5>::value ? j : 0UL ) );
3422  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
3423 
3424  SIMDType xmm1( (~C).load(i,j ) );
3425  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3426  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3427  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3428  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3429 
3430  for( size_t k=kbegin; k<kend; ++k ) {
3431  const SIMDType a1( set( A(i,k) ) );
3432  xmm1 -= a1 * B.load(k,j );
3433  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3434  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3435  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3436  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
3437  }
3438 
3439  (~C).store( i, j , xmm1 );
3440  (~C).store( i, j+SIMDSIZE , xmm2 );
3441  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3442  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3443  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
3444  }
3445  }
3446 
// Panels of 4 SIMD vectors, two rows of C at a time (only if neither LOW nor UPP is set).
3447  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3448  {
3449  size_t i( 0UL );
3450 
3451  for( ; (i+2UL) <= M; i+=2UL )
3452  {
3453  const size_t kbegin( ( IsUpper<MT4>::value )
3454  ?( ( IsLower<MT5>::value )
3455  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3456  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3457  :( IsLower<MT5>::value ? j : 0UL ) );
3458  const size_t kend( ( IsLower<MT4>::value )
3459  ?( ( IsUpper<MT5>::value )
3460  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3461  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3462  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
3463 
3464  SIMDType xmm1( (~C).load(i ,j ) );
3465  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3466  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3467  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
3468  SIMDType xmm5( (~C).load(i+1UL,j ) );
3469  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
3470  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3471  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3472 
3473  for( size_t k=kbegin; k<kend; ++k ) {
3474  const SIMDType a1( set( A(i ,k) ) );
3475  const SIMDType a2( set( A(i+1UL,k) ) );
3476  const SIMDType b1( B.load(k,j ) );
3477  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3478  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3479  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3480  xmm1 -= a1 * b1;
3481  xmm2 -= a1 * b2;
3482  xmm3 -= a1 * b3;
3483  xmm4 -= a1 * b4;
3484  xmm5 -= a2 * b1;
3485  xmm6 -= a2 * b2;
3486  xmm7 -= a2 * b3;
3487  xmm8 -= a2 * b4;
3488  }
3489 
3490  (~C).store( i , j , xmm1 );
3491  (~C).store( i , j+SIMDSIZE , xmm2 );
3492  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3493  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3494  (~C).store( i+1UL, j , xmm5 );
3495  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
3496  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3497  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3498  }
3499 
// Single leftover row (M is odd).
3500  if( i < M )
3501  {
3502  const size_t kbegin( ( IsUpper<MT4>::value )
3503  ?( ( IsLower<MT5>::value )
3504  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3505  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3506  :( IsLower<MT5>::value ? j : 0UL ) );
3507  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
3508 
3509  SIMDType xmm1( (~C).load(i,j ) );
3510  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3511  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3512  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3513 
3514  for( size_t k=kbegin; k<kend; ++k ) {
3515  const SIMDType a1( set( A(i,k) ) );
3516  xmm1 -= a1 * B.load(k,j );
3517  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3518  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3519  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
3520  }
3521 
3522  (~C).store( i, j , xmm1 );
3523  (~C).store( i, j+SIMDSIZE , xmm2 );
3524  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3525  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3526  }
3527  }
3528 
// Panels of 3 SIMD vectors, two rows of C at a time (only if neither LOW nor UPP is set).
3529  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3530  {
3531  size_t i( 0UL );
3532 
3533  for( ; (i+2UL) <= M; i+=2UL )
3534  {
3535  const size_t kbegin( ( IsUpper<MT4>::value )
3536  ?( ( IsLower<MT5>::value )
3537  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3538  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3539  :( IsLower<MT5>::value ? j : 0UL ) );
3540  const size_t kend( ( IsLower<MT4>::value )
3541  ?( ( IsUpper<MT5>::value )
3542  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
3543  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3544  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
3545 
3546  SIMDType xmm1( (~C).load(i ,j ) );
3547  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3548  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3549  SIMDType xmm4( (~C).load(i+1UL,j ) );
3550  SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
3551  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3552 
3553  for( size_t k=kbegin; k<kend; ++k ) {
3554  const SIMDType a1( set( A(i ,k) ) );
3555  const SIMDType a2( set( A(i+1UL,k) ) );
3556  const SIMDType b1( B.load(k,j ) );
3557  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3558  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3559  xmm1 -= a1 * b1;
3560  xmm2 -= a1 * b2;
3561  xmm3 -= a1 * b3;
3562  xmm4 -= a2 * b1;
3563  xmm5 -= a2 * b2;
3564  xmm6 -= a2 * b3;
3565  }
3566 
3567  (~C).store( i , j , xmm1 );
3568  (~C).store( i , j+SIMDSIZE , xmm2 );
3569  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3570  (~C).store( i+1UL, j , xmm4 );
3571  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
3572  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
3573  }
3574 
// Single leftover row (M is odd).
3575  if( i < M )
3576  {
3577  const size_t kbegin( ( IsUpper<MT4>::value )
3578  ?( ( IsLower<MT5>::value )
3579  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3580  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3581  :( IsLower<MT5>::value ? j : 0UL ) );
3582  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
3583 
3584  SIMDType xmm1( (~C).load(i,j ) );
3585  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3586  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3587 
3588  for( size_t k=kbegin; k<kend; ++k ) {
3589  const SIMDType a1( set( A(i,k) ) );
3590  xmm1 -= a1 * B.load(k,j );
3591  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
3592  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
3593  }
3594 
3595  (~C).store( i, j , xmm1 );
3596  (~C).store( i, j+SIMDSIZE , xmm2 );
3597  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3598  }
3599  }
3600 
// Panels of 2 SIMD vectors (skipped only if both LOW and UPP are set). The rows start at j if LOW
// is set and end at min(j+2*SIMDSIZE,M) if UPP is set; they are processed in groups of 4, 3, 2
// and finally a single row.
3601  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3602  {
3603  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
3604  size_t i( LOW ? j : 0UL );
3605 
3606  for( ; (i+4UL) <= iend; i+=4UL )
3607  {
3608  const size_t kbegin( ( IsUpper<MT4>::value )
3609  ?( ( IsLower<MT5>::value )
3610  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3611  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3612  :( IsLower<MT5>::value ? j : 0UL ) );
3613  const size_t kend( ( IsLower<MT4>::value )
3614  ?( ( IsUpper<MT5>::value )
3615  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
3616  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
3617  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
3618 
3619  SIMDType xmm1( (~C).load(i ,j ) );
3620  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3621  SIMDType xmm3( (~C).load(i+1UL,j ) );
3622  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3623  SIMDType xmm5( (~C).load(i+2UL,j ) );
3624  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3625  SIMDType xmm7( (~C).load(i+3UL,j ) );
3626  SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
3627 
3628  for( size_t k=kbegin; k<kend; ++k ) {
3629  const SIMDType a1( set( A(i ,k) ) );
3630  const SIMDType a2( set( A(i+1UL,k) ) );
3631  const SIMDType a3( set( A(i+2UL,k) ) );
3632  const SIMDType a4( set( A(i+3UL,k) ) );
3633  const SIMDType b1( B.load(k,j ) );
3634  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3635  xmm1 -= a1 * b1;
3636  xmm2 -= a1 * b2;
3637  xmm3 -= a2 * b1;
3638  xmm4 -= a2 * b2;
3639  xmm5 -= a3 * b1;
3640  xmm6 -= a3 * b2;
3641  xmm7 -= a4 * b1;
3642  xmm8 -= a4 * b2;
3643  }
3644 
3645  (~C).store( i , j , xmm1 );
3646  (~C).store( i , j+SIMDSIZE, xmm2 );
3647  (~C).store( i+1UL, j , xmm3 );
3648  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3649  (~C).store( i+2UL, j , xmm5 );
3650  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3651  (~C).store( i+3UL, j , xmm7 );
3652  (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
3653  }
3654 
3655  for( ; (i+3UL) <= iend; i+=3UL )
3656  {
3657  const size_t kbegin( ( IsUpper<MT4>::value )
3658  ?( ( IsLower<MT5>::value )
3659  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3660  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3661  :( IsLower<MT5>::value ? j : 0UL ) );
3662  const size_t kend( ( IsLower<MT4>::value )
3663  ?( ( IsUpper<MT5>::value )
3664  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
3665  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
3666  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
3667 
3668  SIMDType xmm1( (~C).load(i ,j ) );
3669  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3670  SIMDType xmm3( (~C).load(i+1UL,j ) );
3671  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3672  SIMDType xmm5( (~C).load(i+2UL,j ) );
3673  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3674 
3675  for( size_t k=kbegin; k<kend; ++k ) {
3676  const SIMDType a1( set( A(i ,k) ) );
3677  const SIMDType a2( set( A(i+1UL,k) ) );
3678  const SIMDType a3( set( A(i+2UL,k) ) );
3679  const SIMDType b1( B.load(k,j ) );
3680  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3681  xmm1 -= a1 * b1;
3682  xmm2 -= a1 * b2;
3683  xmm3 -= a2 * b1;
3684  xmm4 -= a2 * b2;
3685  xmm5 -= a3 * b1;
3686  xmm6 -= a3 * b2;
3687  }
3688 
3689  (~C).store( i , j , xmm1 );
3690  (~C).store( i , j+SIMDSIZE, xmm2 );
3691  (~C).store( i+1UL, j , xmm3 );
3692  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3693  (~C).store( i+2UL, j , xmm5 );
3694  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3695  }
3696 
3697  for( ; (i+2UL) <= iend; i+=2UL )
3698  {
3699  const size_t kbegin( ( IsUpper<MT4>::value )
3700  ?( ( IsLower<MT5>::value )
3701  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3702  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3703  :( IsLower<MT5>::value ? j : 0UL ) );
3704  const size_t kend( ( IsLower<MT4>::value )
3705  ?( ( IsUpper<MT5>::value )
3706  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
3707  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3708  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
3709 
3710  SIMDType xmm1( (~C).load(i ,j ) );
3711  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3712  SIMDType xmm3( (~C).load(i+1UL,j ) );
3713  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
// xmm5..xmm8 are second accumulators for the k+1 terms of the two-fold unrolled k loop; they are
// added to xmm1..xmm4 on store. Assumes a default-constructed SIMDType is zero - TODO confirm.
3714  SIMDType xmm5, xmm6, xmm7, xmm8;
3715  size_t k( kbegin );
3716 
3717  for( ; (k+2UL) <= kend; k+=2UL ) {
3718  const SIMDType a1( set( A(i ,k ) ) );
3719  const SIMDType a2( set( A(i+1UL,k ) ) );
3720  const SIMDType a3( set( A(i ,k+1UL) ) );
3721  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
3722  const SIMDType b1( B.load(k ,j ) );
3723  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
3724  const SIMDType b3( B.load(k+1UL,j ) );
3725  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
3726  xmm1 -= a1 * b1;
3727  xmm2 -= a1 * b2;
3728  xmm3 -= a2 * b1;
3729  xmm4 -= a2 * b2;
3730  xmm5 -= a3 * b3;
3731  xmm6 -= a3 * b4;
3732  xmm7 -= a4 * b3;
3733  xmm8 -= a4 * b4;
3734  }
3735 
// Remaining single k iteration, if any.
3736  for( ; k<kend; ++k ) {
3737  const SIMDType a1( set( A(i ,k) ) );
3738  const SIMDType a2( set( A(i+1UL,k) ) );
3739  const SIMDType b1( B.load(k,j ) );
3740  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3741  xmm1 -= a1 * b1;
3742  xmm2 -= a1 * b2;
3743  xmm3 -= a2 * b1;
3744  xmm4 -= a2 * b2;
3745  }
3746 
3747  (~C).store( i , j , xmm1+xmm5 );
3748  (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
3749  (~C).store( i+1UL, j , xmm3+xmm7 );
3750  (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
3751  }
3752 
// Single leftover row of the current 2-vector panel.
3753  if( i < iend )
3754  {
3755  const size_t kbegin( ( IsUpper<MT4>::value )
3756  ?( ( IsLower<MT5>::value )
3757  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3758  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3759  :( IsLower<MT5>::value ? j : 0UL ) );
3760  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
3761 
3762  SIMDType xmm1( (~C).load(i,j ) );
3763  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3764  SIMDType xmm3, xmm4;
3765  size_t k( kbegin );
3766 
3767  for( ; (k+2UL) <= kend; k+=2UL ) {
3768  const SIMDType a1( set( A(i,k ) ) );
3769  const SIMDType a2( set( A(i,k+1UL) ) );
3770  xmm1 -= a1 * B.load(k ,j );
3771  xmm2 -= a1 * B.load(k ,j+SIMDSIZE);
3772  xmm3 -= a2 * B.load(k+1UL,j );
3773  xmm4 -= a2 * B.load(k+1UL,j+SIMDSIZE);
3774  }
3775 
3776  for( ; k<kend; ++k ) {
3777  const SIMDType a1( set( A(i,k) ) );
3778  xmm1 -= a1 * B.load(k,j );
3779  xmm2 -= a1 * B.load(k,j+SIMDSIZE);
3780  }
3781 
3782  (~C).store( i, j , xmm1+xmm3 );
3783  (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
3784  }
3785  }
3786 
// Panels of a single SIMD vector. The rows start at j if LOW is set; the row end is limited to
// min(j+SIMDSIZE,M) only if both LOW and UPP are set. All row groups use a two-fold unrolled k
// loop with separate accumulators that are summed on store.
3787  for( ; j<jpos; j+=SIMDSIZE )
3788  {
3789  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
3790  size_t i( LOW ? j : 0UL );
3791 
3792  for( ; (i+4UL) <= iend; i+=4UL )
3793  {
3794  const size_t kbegin( ( IsUpper<MT4>::value )
3795  ?( ( IsLower<MT5>::value )
3796  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3797  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3798  :( IsLower<MT5>::value ? j : 0UL ) );
3799  const size_t kend( ( IsLower<MT4>::value )
3800  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
3801  :( K ) );
3802 
3803  SIMDType xmm1( (~C).load(i ,j) );
3804  SIMDType xmm2( (~C).load(i+1UL,j) );
3805  SIMDType xmm3( (~C).load(i+2UL,j) );
3806  SIMDType xmm4( (~C).load(i+3UL,j) );
3807  SIMDType xmm5, xmm6, xmm7, xmm8;
3808  size_t k( kbegin );
3809 
3810  for( ; (k+2UL) <= kend; k+=2UL ) {
3811  const SIMDType b1( B.load(k ,j) );
3812  const SIMDType b2( B.load(k+1UL,j) );
3813  xmm1 -= set( A(i ,k ) ) * b1;
3814  xmm2 -= set( A(i+1UL,k ) ) * b1;
3815  xmm3 -= set( A(i+2UL,k ) ) * b1;
3816  xmm4 -= set( A(i+3UL,k ) ) * b1;
3817  xmm5 -= set( A(i ,k+1UL) ) * b2;
3818  xmm6 -= set( A(i+1UL,k+1UL) ) * b2;
3819  xmm7 -= set( A(i+2UL,k+1UL) ) * b2;
3820  xmm8 -= set( A(i+3UL,k+1UL) ) * b2;
3821  }
3822 
3823  for( ; k<kend; ++k ) {
3824  const SIMDType b1( B.load(k,j) );
3825  xmm1 -= set( A(i ,k) ) * b1;
3826  xmm2 -= set( A(i+1UL,k) ) * b1;
3827  xmm3 -= set( A(i+2UL,k) ) * b1;
3828  xmm4 -= set( A(i+3UL,k) ) * b1;
3829  }
3830 
3831  (~C).store( i , j, xmm1+xmm5 );
3832  (~C).store( i+1UL, j, xmm2+xmm6 );
3833  (~C).store( i+2UL, j, xmm3+xmm7 );
3834  (~C).store( i+3UL, j, xmm4+xmm8 );
3835  }
3836 
3837  for( ; (i+3UL) <= iend; i+=3UL )
3838  {
3839  const size_t kbegin( ( IsUpper<MT4>::value )
3840  ?( ( IsLower<MT5>::value )
3841  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3842  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3843  :( IsLower<MT5>::value ? j : 0UL ) );
3844  const size_t kend( ( IsLower<MT4>::value )
3845  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
3846  :( K ) );
3847 
3848  SIMDType xmm1( (~C).load(i ,j) );
3849  SIMDType xmm2( (~C).load(i+1UL,j) );
3850  SIMDType xmm3( (~C).load(i+2UL,j) );
3851  SIMDType xmm4, xmm5, xmm6;
3852  size_t k( kbegin );
3853 
3854  for( ; (k+2UL) <= kend; k+=2UL ) {
3855  const SIMDType b1( B.load(k ,j) );
3856  const SIMDType b2( B.load(k+1UL,j) );
3857  xmm1 -= set( A(i ,k ) ) * b1;
3858  xmm2 -= set( A(i+1UL,k ) ) * b1;
3859  xmm3 -= set( A(i+2UL,k ) ) * b1;
3860  xmm4 -= set( A(i ,k+1UL) ) * b2;
3861  xmm5 -= set( A(i+1UL,k+1UL) ) * b2;
3862  xmm6 -= set( A(i+2UL,k+1UL) ) * b2;
3863  }
3864 
3865  for( ; k<kend; ++k ) {
3866  const SIMDType b1( B.load(k,j) );
3867  xmm1 -= set( A(i ,k) ) * b1;
3868  xmm2 -= set( A(i+1UL,k) ) * b1;
3869  xmm3 -= set( A(i+2UL,k) ) * b1;
3870  }
3871 
3872  (~C).store( i , j, xmm1+xmm4 );
3873  (~C).store( i+1UL, j, xmm2+xmm5 );
3874  (~C).store( i+2UL, j, xmm3+xmm6 );
3875  }
3876 
3877  for( ; (i+2UL) <= iend; i+=2UL )
3878  {
3879  const size_t kbegin( ( IsUpper<MT4>::value )
3880  ?( ( IsLower<MT5>::value )
3881  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3882  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3883  :( IsLower<MT5>::value ? j : 0UL ) );
3884  const size_t kend( ( IsLower<MT4>::value )
3885  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3886  :( K ) );
3887 
3888  SIMDType xmm1( (~C).load(i ,j) );
3889  SIMDType xmm2( (~C).load(i+1UL,j) );
3890  SIMDType xmm3, xmm4;
3891  size_t k( kbegin );
3892 
3893  for( ; (k+2UL) <= kend; k+=2UL ) {
3894  const SIMDType b1( B.load(k ,j) );
3895  const SIMDType b2( B.load(k+1UL,j) );
3896  xmm1 -= set( A(i ,k ) ) * b1;
3897  xmm2 -= set( A(i+1UL,k ) ) * b1;
3898  xmm3 -= set( A(i ,k+1UL) ) * b2;
3899  xmm4 -= set( A(i+1UL,k+1UL) ) * b2;
3900  }
3901 
3902  for( ; k<kend; ++k ) {
3903  const SIMDType b1( B.load(k,j) );
3904  xmm1 -= set( A(i ,k) ) * b1;
3905  xmm2 -= set( A(i+1UL,k) ) * b1;
3906  }
3907 
3908  (~C).store( i , j, xmm1+xmm3 );
3909  (~C).store( i+1UL, j, xmm2+xmm4 );
3910  }
3911 
// Single leftover row; no kend is computed here, the k loops run up to K.
3912  if( i < iend )
3913  {
3914  const size_t kbegin( ( IsUpper<MT4>::value )
3915  ?( ( IsLower<MT5>::value )
3916  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3917  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3918  :( IsLower<MT5>::value ? j : 0UL ) );
3919 
3920  SIMDType xmm1( (~C).load(i,j) );
3921  SIMDType xmm2;
3922  size_t k( kbegin );
3923 
3924  for( ; (k+2UL) <= K; k+=2UL ) {
3925  xmm1 -= set( A(i,k ) ) * B.load(k ,j);
3926  xmm2 -= set( A(i,k+1UL) ) * B.load(k+1UL,j);
3927  }
3928 
3929  for( ; k<K; ++k ) {
3930  xmm1 -= set( A(i,k) ) * B.load(k,j);
3931  }
3932 
3933  (~C).store( i, j, xmm1+xmm2 );
3934  }
3935  }
3936 
// Scalar loop over the columns [jpos,N) that do not fill a complete SIMD vector (executed only if
// 'remainder' is set). The rows start at j if LOW is set and end at j+1 if UPP is set; they are
// processed in pairs, followed by a single leftover row.
3937  for( ; remainder && j<N; ++j )
3938  {
3939  const size_t iend( UPP ? j+1UL : M );
3940  size_t i( LOW ? j : 0UL );
3941 
3942  for( ; (i+2UL) <= iend; i+=2UL )
3943  {
3944  const size_t kbegin( ( IsUpper<MT4>::value )
3945  ?( ( IsLower<MT5>::value )
3946  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3947  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3948  :( IsLower<MT5>::value ? j : 0UL ) );
3949  const size_t kend( ( IsLower<MT4>::value )
3950  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3951  :( K ) );
3952 
3953  ElementType value1( (~C)(i ,j) );
3954  ElementType value2( (~C)(i+1UL,j) );
3955 
3956  for( size_t k=kbegin; k<kend; ++k ) {
3957  value1 -= A(i ,k) * B(k,j);
3958  value2 -= A(i+1UL,k) * B(k,j);
3959  }
3960 
3961  (~C)(i ,j) = value1;
3962  (~C)(i+1UL,j) = value2;
3963  }
3964 
3965  if( i < iend )
3966  {
3967  const size_t kbegin( ( IsUpper<MT4>::value )
3968  ?( ( IsLower<MT5>::value )
3969  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3970  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3971  :( IsLower<MT5>::value ? j : 0UL ) );
3972 
3973  ElementType value( (~C)(i,j) );
3974 
3975  for( size_t k=kbegin; k<K; ++k ) {
3976  value -= A(i,k) * B(k,j);
3977  }
3978 
3979  (~C)(i,j) = value;
3980  }
3981  }
3982  }
3984  //**********************************************************************************************
3985 
3986  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Small-matrix subtraction assignment kernel for a column-major target (DenseMatrix<MT3,true>).
//
// Instead of computing the product directly, one of the two operands is copied (via serial())
// into a temporary of its OppositeType_, and the resulting product expression is passed through
// the ForwardFunctor to subAssign().
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// NOTE(review): several source lines are not visible in this listing: 4004 (return type / overload
// condition), 4007-4010, and 4014 / 4018, which hold the conditions of the first two branches.
// Confirm against the original header before changing the branch structure.
4001  template< typename MT3 // Type of the left-hand side target matrix
4002  , typename MT4 // Type of the left-hand side matrix operand
4003  , typename MT5 > // Type of the right-hand side matrix operand
4005  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4006  {
4011 
4012  const ForwardFunctor fwd;
4013 
// First branch (condition not visible): convert A.
4015  const OppositeType_<MT4> tmp( serial( A ) );
4016  subAssign( ~C, fwd( tmp * B ) );
4017  }
// Second branch (condition not visible): convert B.
4019  const OppositeType_<MT5> tmp( serial( B ) );
4020  subAssign( ~C, fwd( A * tmp ) );
4021  }
// Otherwise convert the operand with fewer elements (A in case of a tie).
4022  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
4023  const OppositeType_<MT4> tmp( serial( A ) );
4024  subAssign( ~C, fwd( tmp * B ) );
4025  }
4026  else {
4027  const OppositeType_<MT5> tmp( serial( B ) );
4028  subAssign( ~C, fwd( A * tmp ) );
4029  }
4030  }
4032  //**********************************************************************************************
4033 
4034  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Subtraction assignment kernel for large matrices (fallback overload): forwards C, A and B
// unchanged to selectDefaultSubAssignKernel().
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// NOTE(review): source line 4051 (return type / overload condition) is not visible in this
// listing - confirm against the original header.
4048  template< typename MT3 // Type of the left-hand side target matrix
4049  , typename MT4 // Type of the left-hand side matrix operand
4050  , typename MT5 > // Type of the right-hand side matrix operand
4052  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4053  {
4054  selectDefaultSubAssignKernel( C, A, B );
4055  }
4057  //**********************************************************************************************
4058 
4059  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Subtraction assignment kernel for large matrices (vectorized overload): dispatches to lmmm()
// if LOW is set, to ummm() if UPP is set (and LOW is not), and to mmm() otherwise. All three are
// called with the scalar arguments ElementType(-1) and ElementType(1).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// NOTE(review): the two scalars presumably are the factors for A*B and for C, respectively
// (C = -1*A*B + 1*C) - confirm against the declarations in <blaze/math/dense/MMM.h>.
// NOTE(review): source line 4077 (return type / overload condition) is not visible in this
// listing; LOW and UPP are declared outside this listing.
4074  template< typename MT3 // Type of the left-hand side target matrix
4075  , typename MT4 // Type of the left-hand side matrix operand
4076  , typename MT5 > // Type of the right-hand side matrix operand
4078  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4079  {
4080  if( LOW )
4081  lmmm( C, A, B, ElementType(-1), ElementType(1) );
4082  else if( UPP )
4083  ummm( C, A, B, ElementType(-1), ElementType(1) );
4084  else
4085  mmm( C, A, B, ElementType(-1), ElementType(1) );
4086  }
4088  //**********************************************************************************************
4089 
4090  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// BLAS-based subtraction assignment kernel (fallback overload): no BLAS routine is called here,
// the arguments are forwarded unchanged to selectLargeSubAssignKernel().
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// NOTE(review): source line 4107 (return type / overload condition) is not visible in this
// listing - confirm against the original header.
4104  template< typename MT3 // Type of the left-hand side target matrix
4105  , typename MT4 // Type of the left-hand side matrix operand
4106  , typename MT5 > // Type of the right-hand side matrix operand
4108  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4109  {
4110  selectLargeSubAssignKernel( C, A, B );
4111  }
4113  //**********************************************************************************************
4114 
4115  //**BLAS-based subtraction assignment to dense matrices*****************************************
4116 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4117 
// BLAS-based subtraction assignment kernel C -= A * B. This overload is only compiled if
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled (see the
// enclosing #if).
//
// If one of the operands is triangular, the other operand is copied into a temporary of the
// result type of C, multiplied in place by the triangular operand via trmm(), and the temporary
// is then subtracted from C. Otherwise gemm() is called with the scalars ET(-1) and ET(1).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
//
// NOTE(review): source line 4133 (return type / overload condition) is not visible in this
// listing - confirm against the original header.
4130  template< typename MT3 // Type of the left-hand side target matrix
4131  , typename MT4 // Type of the left-hand side matrix operand
4132  , typename MT5 > // Type of the right-hand side matrix operand
4134  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4135  {
4136  using ET = ElementType_<MT3>;
4137 
// A is triangular: tmp starts as a copy of B and is multiplied by A from the left.
4138  if( IsTriangular<MT4>::value ) {
4139  ResultType_<MT3> tmp( serial( B ) );
4140  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
4141  subAssign( C, tmp );
4142  }
// B is triangular: tmp starts as a copy of A and is multiplied by B from the right.
4143  else if( IsTriangular<MT5>::value ) {
4144  ResultType_<MT3> tmp( serial( A ) );
4145  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
4146  subAssign( C, tmp );
4147  }
// General case; the scalars presumably are alpha = -1 and beta = 1 (C = -A*B + C) - TODO confirm
// against the gemm() wrapper in <blaze/math/blas/gemm.h>.
4148  else {
4149  gemm( C, A, B, ET(-1), ET(1) );
4150  }
4151  }
4153 #endif
4154  //**********************************************************************************************
4155 
4156  //**Restructuring subtraction assignment to column-major matrices*******************************
// Restructuring subtraction assignment of the multiplication expression to a column-major matrix
// (Matrix<MT,true>).
//
// The product is rewritten with one or both operands wrapped in trans(), passed through the
// ForwardFunctor, and handed to subAssign() again.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side multiplication expression to be subtracted.
//
// NOTE(review): source lines 4172 (return type / overload condition), 4175, 4177 and 4184 (the
// condition of the first branch, presumably covering the case that both operands are symmetric)
// are not visible in this listing - confirm against the original header.
4171  template< typename MT > // Type of the target matrix
4173  subAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4174  {
4176 
4178 
4179  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4180  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4181 
4182  const ForwardFunctor fwd;
4183 
// First branch (condition not visible): both operands are transposed.
4185  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
// MT1 is symmetric: only the left-hand side operand is transposed.
4186  else if( IsSymmetric<MT1>::value )
4187  subAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Otherwise only the right-hand side operand is transposed.
4188  else
4189  subAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4190  }
4192  //**********************************************************************************************
4193 
4194  //**Subtraction assignment to sparse matrices***************************************************
4195  // No special implementation for the subtraction assignment to sparse matrices.
4196  //**********************************************************************************************
4197 
4198  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of the multiplication expression to a dense matrix.
//
// The expression is first evaluated into a temporary of type ResultType (wrapped in serial()),
// and the temporary is then passed to schurAssign() together with the target.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression.
//
// NOTE(review): source lines 4215 and 4217-4219 are not visible in this listing - confirm
// against the original header.
4211  template< typename MT // Type of the target dense matrix
4212  , bool SO > // Storage order of the target dense matrix
4213  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4214  {
4216 
4220 
4221  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4222  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4223 
4224  const ResultType tmp( serial( rhs ) );
4225  schurAssign( ~lhs, tmp );
4226  }
4228  //**********************************************************************************************
4229 
4230  //**Schur product assignment to sparse matrices*************************************************
4231  // No special implementation for the Schur product assignment to sparse matrices.
4232  //**********************************************************************************************
4233 
4234  //**Multiplication assignment to dense matrices*************************************************
4235  // No special implementation for the multiplication assignment to dense matrices.
4236  //**********************************************************************************************
4237 
4238  //**Multiplication assignment to sparse matrices************************************************
4239  // No special implementation for the multiplication assignment to sparse matrices.
4240  //**********************************************************************************************
4241 
4242  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the multiplication expression to a dense matrix.
//
// Returns immediately if the target has no rows or no columns. If the left-hand side operand has
// no columns, the target is reset instead. Otherwise both operands are bound to objects of type
// LT and RT, and the product A * B is passed on to smpAssign().
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side multiplication expression to be assigned.
//
// NOTE(review): source lines 4259 (return type / overload condition) and 4262 are not visible in
// this listing; LT and RT are declared outside this listing.
4257  template< typename MT // Type of the target dense matrix
4258  , bool SO > // Storage order of the target dense matrix
4260  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4261  {
4263 
4264  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4265  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4266 
// Empty target: nothing to do. Inner dimension of zero: the target is reset.
4267  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4268  return;
4269  }
4270  else if( rhs.lhs_.columns() == 0UL ) {
4271  reset( ~lhs );
4272  return;
4273  }
4274 
4275  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4276  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4277 
// The evaluated operands must have the same dimensions as the original operands and must match
// the target.
4278  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4279  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4280  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4281  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4282  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4283  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4284 
4285  smpAssign( ~lhs, A * B );
4286  }
4288  //**********************************************************************************************
4289 
4290  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a dense matrix-dense matrix multiplication expression to a sparse matrix.
//
// lhs: the target sparse matrix.
// rhs: the multiplication expression to be assigned.
//
// The expression is first evaluated into a temporary of type TmpType; the temporary is then
// passed through the ForwardFunctor and assigned to the target via smpAssign.
//
// NOTE(review): source lines 4307, 4310, 4312 and 4314-4319 are absent from this listing
// (presumably the return type, the TmpType definition and compile-time checks); confirm
// against the original header.
4305  template< typename MT // Type of the target sparse matrix
4306  , bool SO > // Storage order of the target sparse matrix
4308  smpAssign( SparseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4309  {
4311 
4313 
4320 
4321  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4322  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4323 
4324  const ForwardFunctor fwd;
4325 
4326  const TmpType tmp( rhs ); // Evaluate the complete product into a temporary first
4327  smpAssign( ~lhs, fwd( tmp ) );
4328  }
4330  //**********************************************************************************************
4331 
4332  //**Restructuring SMP assignment to column-major matrices***************************************
// Restructuring SMP assignment of a dense matrix-dense matrix multiplication expression to a
// column-major matrix.
//
// lhs: the target column-major matrix.
// rhs: the multiplication expression to be assigned.
//
// The product is rewritten with one or both operands transposed, wrapped by the ForwardFunctor
// and re-dispatched to smpAssign:
//   - first branch:               trans(lhs_) * trans(rhs_)
//   - IsSymmetric<MT1> is true:   trans(lhs_) * rhs_
//   - otherwise:                  lhs_ * trans(rhs_)
//
// NOTE(review): source line 4360, which holds the condition of the first branch, is missing
// from this listing (as are 4348, 4351 and 4353). Presumably the first branch requires both
// operands to be symmetric; confirm against the original header.
4347  template< typename MT > // Type of the target matrix
4349  smpAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4350  {
4352 
4354 
4355  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4356  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4357 
4358  const ForwardFunctor fwd;
4359 
4361  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4362  else if( IsSymmetric<MT1>::value )
4363  smpAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4364  else
4365  smpAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4366  }
4368  //**********************************************************************************************
4369 
4370  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a dense matrix-dense matrix multiplication expression to a dense
// matrix.
//
// If the target is empty or the inner dimension (rhs.lhs_.columns()) is zero there is nothing
// to add and the function returns. Otherwise both operands are evaluated into LT/RT objects
// and the product of the evaluated operands is handed to smpAddAssign.
//
// NOTE(review): source lines 4388-4389 (return type and function signature) and 4391 are
// missing from this listing. Judging by the section banner and the body, the signature is
// presumably smpAddAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs ); confirm
// against the original header.
4386  template< typename MT // Type of the target dense matrix
4387  , bool SO > // Storage order of the target dense matrix
4390  {
4392 
4393  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4394  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4395 
      // Empty target or zero inner dimension: nothing is added to the target.
4396  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4397  return;
4398  }
4399 
4400  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4401  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4402 
      // Sanity checks: the evaluated operands must match the original operands and the target.
4403  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4404  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4405  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4406  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4407  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4408  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4409 
4410  smpAddAssign( ~lhs, A * B );
4411  }
4413  //**********************************************************************************************
4414 
4415  //**Restructuring SMP addition assignment to column-major matrices******************************
// Restructuring SMP addition assignment of a dense matrix-dense matrix multiplication
// expression to a column-major matrix.
//
// lhs: the target column-major matrix.
// rhs: the multiplication expression to be added.
//
// The product is rewritten with one or both operands transposed, wrapped by the ForwardFunctor
// and re-dispatched to smpAddAssign:
//   - first branch:               trans(lhs_) * trans(rhs_)
//   - IsSymmetric<MT1> is true:   trans(lhs_) * rhs_
//   - otherwise:                  lhs_ * trans(rhs_)
//
// NOTE(review): source line 4443, which holds the condition of the first branch, is missing
// from this listing (as are 4431, 4434 and 4436). Presumably the first branch requires both
// operands to be symmetric; confirm against the original header.
4430  template< typename MT > // Type of the target matrix
4432  smpAddAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4433  {
4435 
4437 
4438  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4439  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4440 
4441  const ForwardFunctor fwd;
4442 
4444  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4445  else if( IsSymmetric<MT1>::value )
4446  smpAddAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4447  else
4448  smpAddAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4449  }
4451  //**********************************************************************************************
4452 
4453  //**SMP addition assignment to sparse matrices**************************************************
4454  // No special implementation for the SMP addition assignment to sparse matrices.
4455  //**********************************************************************************************
4456 
4457  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a dense matrix-dense matrix multiplication expression to a
// dense matrix.
//
// If the target is empty or the inner dimension (rhs.lhs_.columns()) is zero there is nothing
// to subtract and the function returns. Otherwise both operands are evaluated into LT/RT
// objects and the product of the evaluated operands is handed to smpSubAssign.
//
// NOTE(review): source lines 4475-4476 (return type and function signature) and 4478 are
// missing from this listing. Judging by the section banner and the body, the signature is
// presumably smpSubAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs ); confirm
// against the original header.
4473  template< typename MT // Type of the target dense matrix
4474  , bool SO > // Storage order of the target dense matrix
4477  {
4479 
4480  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4481  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4482 
      // Empty target or zero inner dimension: nothing is subtracted from the target.
4483  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4484  return;
4485  }
4486 
4487  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
4488  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
4489 
      // Sanity checks: the evaluated operands must match the original operands and the target.
4490  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4491  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4492  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4493  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4494  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4495  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4496 
4497  smpSubAssign( ~lhs, A * B );
4498  }
4500  //**********************************************************************************************
4501 
4502  //**Restructuring SMP subtraction assignment to column-major matrices***************************
// Restructuring SMP subtraction assignment of a dense matrix-dense matrix multiplication
// expression to a column-major matrix.
//
// lhs: the target column-major matrix.
// rhs: the multiplication expression to be subtracted.
//
// The product is rewritten with one or both operands transposed, wrapped by the ForwardFunctor
// and re-dispatched to smpSubAssign:
//   - first branch:               trans(lhs_) * trans(rhs_)
//   - IsSymmetric<MT1> is true:   trans(lhs_) * rhs_
//   - otherwise:                  lhs_ * trans(rhs_)
//
// NOTE(review): source line 4530, which holds the condition of the first branch, is missing
// from this listing (as are 4518, 4521 and 4523). Presumably the first branch requires both
// operands to be symmetric; confirm against the original header.
4517  template< typename MT > // Type of the target matrix
4519  smpSubAssign( Matrix<MT,true>& lhs, const DMatDMatMultExpr& rhs )
4520  {
4522 
4524 
4525  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4526  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4527 
4528  const ForwardFunctor fwd;
4529 
4531  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * trans( rhs.rhs_ ) ) );
4532  else if( IsSymmetric<MT1>::value )
4533  smpSubAssign( ~lhs, fwd( trans( rhs.lhs_ ) * rhs.rhs_ ) );
4534  else
4535  smpSubAssign( ~lhs, fwd( rhs.lhs_ * trans( rhs.rhs_ ) ) );
4536  }
4538  //**********************************************************************************************
4539 
4540  //**SMP subtraction assignment to sparse matrices***********************************************
4541  // No special implementation for the SMP subtraction assignment to sparse matrices.
4542  //**********************************************************************************************
4543 
4544  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of a dense matrix-dense matrix multiplication expression to a
// dense matrix.
//
// lhs: the target dense matrix.
// rhs: the multiplication expression for the Schur product.
//
// The multiplication expression is evaluated into a temporary of type ResultType, which is
// then passed to smpSchurAssign together with the target.
//
// NOTE(review): source lines 4561 and 4563-4565 are missing from this listing (presumably a
// function trace and compile-time checks); confirm against the original header.
4557  template< typename MT // Type of the target dense matrix
4558  , bool SO > // Storage order of the target dense matrix
4559  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatDMatMultExpr& rhs )
4560  {
4562 
4566 
4567  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4568  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4569 
4570  const ResultType tmp( rhs ); // Evaluate the product before combining it with the target
4571  smpSchurAssign( ~lhs, tmp );
4572  }
4574  //**********************************************************************************************
4575 
4576  //**SMP Schur product assignment to sparse matrices*********************************************
4577  // No special implementation for the SMP Schur product assignment to sparse matrices.
4578  //**********************************************************************************************
4579 
4580  //**SMP multiplication assignment to dense matrices*********************************************
4581  // No special implementation for the SMP multiplication assignment to dense matrices.
4582  //**********************************************************************************************
4583 
4584  //**SMP multiplication assignment to sparse matrices********************************************
4585  // No special implementation for the SMP multiplication assignment to sparse matrices.
4586  //**********************************************************************************************
4587 
4588  //**Compile time checks*************************************************************************
4596  //**********************************************************************************************
4597 };
4598 //*************************************************************************************************
4599 
4600 
4601 
4602 
4603 //=================================================================================================
4604 //
4605 // DMATSCALARMULTEXPR SPECIALIZATION
4606 //
4607 //=================================================================================================
4608 
4609 //*************************************************************************************************
4617 template< typename MT1 // Type of the left-hand side dense matrix
4618  , typename MT2 // Type of the right-hand side dense matrix
4619  , bool SF // Symmetry flag
4620  , bool HF // Hermitian flag
4621  , bool LF // Lower flag
4622  , bool UF // Upper flag
4623  , typename ST > // Type of the right-hand side scalar value
4624 class DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4625  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4626  , private Computation
4627 {
4628  private:
4629  //**Type definitions****************************************************************************
4632 
// Private shorthand types derived from the nested multiplication expression and its operands.
// NOTE(review): MMM is not defined in the visible lines (source lines 4630-4631 are missing
// from this listing); presumably it names the DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> operand.
4633  using RES = ResultType_<MMM>; // Result type of MMM
4634  using RT1 = ResultType_<MT1>; // Result type of the left-hand side dense matrix type MT1
4635  using RT2 = ResultType_<MT2>; // Result type of the right-hand side dense matrix type MT2
4636  using ET1 = ElementType_<RT1>; // Element type of RT1
4637  using ET2 = ElementType_<RT2>; // Element type of RT2
4638  using CT1 = CompositeType_<MT1>; // Composite type of MT1
4639  using CT2 = CompositeType_<MT2>; // Composite type of MT2
4640  //**********************************************************************************************
4641 
4642  //**********************************************************************************************
// Compile-time switch: true if the left-hand side operand type MT1 is a computation or
// requires an intermediate evaluation.
4644  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
4645  //**********************************************************************************************
4646 
4647  //**********************************************************************************************
// Compile-time switch: true if the right-hand side operand type MT2 is a computation or
// requires an intermediate evaluation.
4649  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
4650  //**********************************************************************************************
4651 
4652  //**********************************************************************************************
// Normalised structure flags derived from the SF/HF/LF/UF template flags:
//   SYM  - symmetry flag set and none of the Hermitian/lower/upper flags set
//   HERM - Hermitian flag set and neither the lower nor the upper flag set
//   LOW  - lower flag set, or upper flag combined with the symmetry or Hermitian flag
//   UPP  - upper flag set, or lower flag combined with the symmetry or Hermitian flag
4654  enum : bool {
4655  SYM = ( SF && !( HF || LF || UF ) ),
4656  HERM = ( HF && !( LF || UF ) ),
4657  LOW = ( LF || ( ( SF || HF ) && UF ) ),
4658  UPP = ( UF || ( ( SF || HF ) && LF ) )
4659  };
4660  //**********************************************************************************************
4661 
4662  //**********************************************************************************************
4664 
// Helper trait: its nested value starts with the requirement that T1 is a column-major matrix.
// NOTE(review): source line 4672, which carries the remainder of the condition and the closing
// of the enum, is missing from this listing. Presumably it tests T2/T3 for symmetry; confirm
// against the original header.
4669  template< typename T1, typename T2, typename T3 >
4670  struct CanExploitSymmetry {
4671  enum : bool { value = IsColumnMajorMatrix<T1>::value &&
4673  };
4674  //**********************************************************************************************
4675 
4676  //**********************************************************************************************
4678 
// Helper trait: its nested value is true when at least one operand must be evaluated
// (evaluateLeft or evaluateRight) and CanExploitSymmetry<T1,T2,T3> does not apply.
4681  template< typename T1, typename T2, typename T3 >
4682  struct IsEvaluationRequired {
4683  enum : bool { value = ( evaluateLeft || evaluateRight ) &&
4684  !CanExploitSymmetry<T1,T2,T3>::value };
4685  };
4686  //**********************************************************************************************
4687 
4688  //**********************************************************************************************
4690 
// Helper trait selecting the BLAS kernel. The visible part of the condition requires: BLAS mode
// and BLAS matrix/matrix multiplication enabled, none of the SYM/HERM/LOW/UPP flags set, SIMD
// enabled for T1, T2 and T3, and identical element types for T1 and T3.
// NOTE(review): source lines 4696-4699, 4701-4704 and 4706 hold further conditions and are
// missing from this listing; confirm against the original header.
4692  template< typename T1, typename T2, typename T3, typename T4 >
4693  struct UseBlasKernel {
4694  enum : bool { value = BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
4695  !SYM && !HERM && !LOW && !UPP &&
4700  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4705  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
4707  };
4708  //**********************************************************************************************
4709 
4710  //**********************************************************************************************
4712 
// Helper trait selecting the vectorized default kernel. The visible part of the condition
// requires: optimized kernels enabled, SIMD enabled for T1, T2 and T3, and SIMD addition and
// multiplication available for the element types of T2 and T3.
// NOTE(review): source lines 4717 and 4719-4721 hold further conditions (the beginning of the
// expression that ends in ", T4 >::value") and are missing from this listing; confirm against
// the original header.
4714  template< typename T1, typename T2, typename T3, typename T4 >
4715  struct UseVectorizedDefaultKernel {
4716  enum : bool { value = useOptimizedKernels &&
4718  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4722  , T4 >::value &&
4723  HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
4724  HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
4725  };
4726  //**********************************************************************************************
4727 
4728  //**********************************************************************************************
4730 
// Functor type used to forward an expression with the structure implied by the flags:
//   HERM            -> DeclHerm
//   SYM             -> DeclSym
//   LOW and UPP     -> DeclDiag
//   LOW only        -> DeclLow
//   UPP only        -> DeclUpp
//   none of these   -> Noop
4732  using ForwardFunctor = IfTrue_< HERM
4733  , DeclHerm
4734  , IfTrue_< SYM
4735  , DeclSym
4736  , IfTrue_< LOW
4737  , IfTrue_< UPP
4738  , DeclDiag
4739  , DeclLow >
4740  , IfTrue_< UPP
4741  , DeclUpp
4742  , Noop > > > >;
4743  //**********************************************************************************************
4744 
4745  public:
4746  //**Type definitions****************************************************************************
4749 
// Public type definitions of the scaled multiplication expression.
// NOTE(review): several source lines are missing from this listing (4747-4748, 4751-4754,
// 4758-4759, 4761), so further aliases such as ElementType are not visible here.
4750  using ResultType = MultTrait_<RES,ST>; // Result of multiplying RES with the scalar type ST
4755  using ReturnType = const ElementType; // Return type of the element access functions
4756  using CompositeType = const ResultType; // The expression is used via a const ResultType
4757 
4760 
4762  using RightOperand = ST; // The right-hand side operand is the scalar type itself
4763 
4766 
4769  //**********************************************************************************************
4770 
4771  //**Compilation flags***************************************************************************
// Compilation flags.
// simdEnabled: the visible part of the condition requires that MT2 is not diagonal and that
// both operand types have SIMD enabled.
// NOTE(review): source lines 4775-4777 (rest of the condition and the closing of the enum)
// are missing from this listing; confirm against the original header.
4773  enum : bool { simdEnabled = !IsDiagonal<MT2>::value &&
4774  MT1::simdEnabled && MT2::simdEnabled &&
4778 
// smpAssignable: true only if neither operand needs evaluation and both operand types are
// themselves SMP-assignable.
4780  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4781  !evaluateRight && MT2::smpAssignable };
4782  //**********************************************************************************************
4783 
4784  //**SIMD properties*****************************************************************************
// SIMD size taken from SIMDTrait for the expression's ElementType.
4786  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
4787  //**********************************************************************************************
4788 
4789  //**Constructor*********************************************************************************
// Constructor: stores the matrix multiplication expression and the scalar.
//
// matrix: the left-hand side dense matrix multiplication expression.
// scalar: the right-hand side scalar of the multiplication expression.
4795  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
4796  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
4797  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
4798  {}
4799  //**********************************************************************************************
4800 
4801  //**Access operator*****************************************************************************
// 2D element access: returns element (i,j) of the matrix expression multiplied by the scalar.
//
// i: row index; asserted (internal assertion) to be smaller than the number of rows.
// j: column index; asserted (internal assertion) to be smaller than the number of columns.
4808  inline ReturnType operator()( size_t i, size_t j ) const {
4809  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
4810  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
4811  return matrix_(i,j) * scalar_;
4812  }
4813  //**********************************************************************************************
4814 
4815  //**At function*********************************************************************************
// Checked element access: validates both indices before delegating to operator().
//
// i: row index; BLAZE_THROW_OUT_OF_RANGE is invoked if it is not smaller than rows().
// j: column index; BLAZE_THROW_OUT_OF_RANGE is invoked if it is not smaller than columns().
// Returns the element (i,j) of the scaled matrix expression.
4823  inline ReturnType at( size_t i, size_t j ) const {
4824  if( i >= matrix_.rows() ) {
4825  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
4826  }
4827  if( j >= matrix_.columns() ) {
4828  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
4829  }
4830  return (*this)(i,j);
4831  }
4832  //**********************************************************************************************
4833 
4834  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, i.e. the rows of the wrapped matrix expression.
4839  inline size_t rows() const {
4840  return matrix_.rows();
4841  }
4842  //**********************************************************************************************
4843 
4844  //**Columns function****************************************************************************
// Returns the number of columns of the expression, i.e. the columns of the wrapped matrix
// expression.
4849  inline size_t columns() const {
4850  return matrix_.columns();
4851  }
4852  //**********************************************************************************************
4853 
4854  //**Left operand access*************************************************************************
// Returns the left-hand side operand: the wrapped dense matrix multiplication expression.
4859  inline LeftOperand leftOperand() const {
4860  return matrix_;
4861  }
4862  //**********************************************************************************************
4863 
4864  //**Right operand access************************************************************************
// Returns the right-hand side operand: the scalar of the multiplication expression.
4869  inline RightOperand rightOperand() const {
4870  return scalar_;
4871  }
4872  //**********************************************************************************************
4873 
4874  //**********************************************************************************************
// Returns whether the expression can alias with the given address.
// The query is forwarded to the wrapped matrix expression; the scalar is not consulted.
//
// alias: the address to be checked.
4880  template< typename T >
4881  inline bool canAlias( const T* alias ) const {
4882  return matrix_.canAlias( alias );
4883  }
4884  //**********************************************************************************************
4885 
4886  //**********************************************************************************************
// Returns whether the expression is aliased with the given address.
// The query is forwarded to the wrapped matrix expression; the scalar is not consulted.
//
// alias: the address to be checked.
4892  template< typename T >
4893  inline bool isAliased( const T* alias ) const {
4894  return matrix_.isAliased( alias );
4895  }
4896  //**********************************************************************************************
4897 
4898  //**********************************************************************************************
// Returns whether the operands are properly aligned, as reported by the wrapped matrix
// expression.
4903  inline bool isAligned() const {
4904  return matrix_.isAligned();
4905  }
4906  //**********************************************************************************************
4907 
4908  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments.
//
// The result is true when the element count rows()*columns() reaches
// SMP_DMATDMATMULT_THRESHOLD and at least one alternative of the first group holds: BLAS mode
// is disabled, BLAS matrix/matrix multiplication is disabled, or the element count is below
// DMATDMATMULT_THRESHOLD.
// NOTE(review): source line 4916 is missing from this listing and holds one more alternative
// of the first group; confirm against the original header.
4913  inline bool canSMPAssign() const noexcept {
4914  return ( !BLAZE_BLAS_MODE ||
4915  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4917  ( rows() * columns() < DMATDMATMULT_THRESHOLD ) ) &&
4918  ( rows() * columns() >= SMP_DMATDMATMULT_THRESHOLD );
4919  }
4920  //**********************************************************************************************
4921 
4922  private:
4923  //**Member variables****************************************************************************
4924  LeftOperand matrix_; // Left-hand side dense matrix of the multiplication expression
4925  RightOperand scalar_; // Right-hand side scalar of the multiplication expression
4926  //**********************************************************************************************
4927 
4928  //**Assignment to dense matrices****************************************************************
// Assignment of a scaled dense matrix-dense matrix multiplication expression to a dense matrix.
//
// lhs: the target dense matrix.
// rhs: the scaled multiplication expression to be assigned.
//
// The two operands of the nested multiplication are extracted from rhs.matrix_. An empty target
// returns immediately and a zero inner dimension resets the target. Otherwise both operands are
// evaluated serially into LT/RT objects and the computation is dispatched to
// selectAssignKernel together with the scalar.
//
// NOTE(review): source lines 4942 and 4945 are missing from this listing (presumably the
// declaration specifiers/return type and a function trace); confirm against the original.
4940  template< typename MT // Type of the target dense matrix
4941  , bool SO > // Storage order of the target dense matrix
4943  assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
4944  {
4946 
4947  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4948  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4949 
      // Operands of the nested matrix multiplication expression.
4950  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
4951  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
4952 
      // Nothing to compute for an empty target; a zero inner dimension yields a reset target.
4953  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
4954  return;
4955  }
4956  else if( left.columns() == 0UL ) {
4957  reset( ~lhs );
4958  return;
4959  }
4960 
4961  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
4962  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
4963 
      // Sanity checks: the evaluated operands must match the original operands and the target.
4964  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
4965  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
4966  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
4967  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
4968  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4969  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
4970 
4971  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
4972  }
4973  //**********************************************************************************************
4974 
4975  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = scalar * A * B.
//
// C: the target matrix; A, B: the evaluated operands; scalar: the scaling factor.
//
// The small-matrix kernel is chosen when B's type is diagonal, when debug mode is off and B has
// at most SIMDSIZE*10 columns, or when the number of elements of C is below
// DMATDMATMULT_THRESHOLD. In all other cases the BLAS kernel selection is invoked.
4986  template< typename MT3 // Type of the left-hand side target matrix
4987  , typename MT4 // Type of the left-hand side matrix operand
4988  , typename MT5 // Type of the right-hand side matrix operand
4989  , typename ST2 > // Type of the scalar value
4990  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
4991  {
4992  if( ( IsDiagonal<MT5>::value ) ||
4993  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
4994  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
4995  selectSmallAssignKernel( C, A, B, scalar );
4996  else
4997  selectBlasAssignKernel( C, A, B, scalar );
4998  }
4999  //**********************************************************************************************
5000 
5001  //**Default assignment to dense matrices (general/general)**************************************
// Default (non-vectorized) kernel for C = scalar * A * B, general/general case.
//
// C: the target matrix; A, B: the operands; scalar: the scaling factor.
//
// For every row i of C the kernel
//   1. restricts the k range according to the triangular structure of A,
//   2. initialises the row with the contribution of the first k (kbegin), resetting the
//      elements outside the computed column range,
//   3. accumulates the contributions of the remaining k values,
//   4. scales the computed column range by the scalar.
// If SYM or HERM is set, the part below the diagonal is afterwards filled from the part above
// it (conjugated for HERM).
//
// NOTE(review): source lines 5019, 5047, 5052, 5081, 5086, 5104 and 5107 are missing from this
// listing: the declaration's return type/constraint and one line of each nested conditional
// that computes jbegin/jend. The visible expressions are therefore incomplete; confirm against
// the original header before relying on the exact bounds.
5015  template< typename MT3 // Type of the left-hand side target matrix
5016  , typename MT4 // Type of the left-hand side matrix operand
5017  , typename MT5 // Type of the right-hand side matrix operand
5018  , typename ST2 > // Type of the scalar value
5020  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5021  {
5022  const size_t M( A.rows() );
5023  const size_t N( B.columns() );
5024  const size_t K( A.columns() );
5025 
      // Any structure flag implies a square result.
5026  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5027 
5028  for( size_t i=0UL; i<M; ++i )
5029  {
      // Range of k for row i, narrowed if A is (strictly) upper or lower.
5030  const size_t kbegin( ( IsUpper<MT4>::value )
5031  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
5032  :( 0UL ) );
5033  const size_t kend( ( IsLower<MT4>::value )
5034  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
5035  :( K ) );
5036  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
5037 
      // Strictly triangular A with an empty k range: the whole row of C is reset.
5038  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
5039  for( size_t j=0UL; j<N; ++j ) {
5040  reset( C(i,j) );
5041  }
5042  continue;
5043  }
5044 
      // First contribution (k == kbegin): elements are assigned, not accumulated.
5045  {
5046  const size_t jbegin( ( IsUpper<MT5>::value )
5048  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
5049  :( UPP ? max(i,kbegin) : kbegin ) )
5050  :( UPP ? i : 0UL ) );
5051  const size_t jend( ( IsLower<MT5>::value )
5053  ?( LOW ? min(i+1UL,kbegin) : kbegin )
5054  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
5055  :( LOW ? i+1UL : N ) );
5056 
      // Reset the elements in front of the computed column range.
5057  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
5058  for( size_t j=0UL; j<jbegin; ++j ) {
5059  reset( C(i,j) );
5060  }
5061  }
5062  else if( IsStrictlyUpper<MT5>::value ) {
5063  reset( C(i,0UL) );
5064  }
5065  for( size_t j=jbegin; j<jend; ++j ) {
5066  C(i,j) = A(i,kbegin) * B(kbegin,j);
5067  }
      // Reset the elements behind the computed column range.
5068  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
5069  for( size_t j=jend; j<N; ++j ) {
5070  reset( C(i,j) );
5071  }
5072  }
5073  else if( IsStrictlyLower<MT5>::value ) {
5074  reset( C(i,N-1UL) );
5075  }
5076  }
5077 
      // Remaining contributions (k > kbegin) are accumulated.
5078  for( size_t k=kbegin+1UL; k<kend; ++k )
5079  {
5080  const size_t jbegin( ( IsUpper<MT5>::value )
5082  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
5083  :( SYM || HERM || UPP ? max( i, k ) : k ) )
5084  :( SYM || HERM || UPP ? i : 0UL ) );
5085  const size_t jend( ( IsLower<MT5>::value )
5087  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
5088  :( LOW ? min(i+1UL,k) : k ) )
5089  :( LOW ? i+1UL : N ) );
5090 
5091  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
5092  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5093 
5094  for( size_t j=jbegin; j<jend; ++j ) {
5095  C(i,j) += A(i,k) * B(k,j);
5096  }
      // For a lower B the element in column jend is assigned rather than accumulated;
      // presumably no earlier k has written to that column yet (confirm).
5097  if( IsLower<MT5>::value ) {
5098  C(i,jend) = A(i,k) * B(k,jend);
5099  }
5100  }
5101 
      // Scale the computed part of row i by the scalar.
5102  {
5103  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
5105  :( SYM || HERM || UPP ? i : 0UL ) );
5106  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
5108  :( LOW ? i+1UL : N ) );
5109 
5110  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
5111  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5112 
5113  for( size_t j=jbegin; j<jend; ++j ) {
5114  C(i,j) *= scalar;
5115  }
5116  }
5117  }
5118 
      // Symmetric/Hermitian result: fill the elements below the diagonal from their
      // counterparts above it (conjugated in the Hermitian case).
5119  if( SYM || HERM ) {
5120  for( size_t i=1UL; i<M; ++i ) {
5121  for( size_t j=0UL; j<i; ++j ) {
5122  C(i,j) = HERM ? conj( C(j,i) ) : C(j,i);
5123  }
5124  }
5125  }
5126  }
5127  //**********************************************************************************************
5128 
5129  //**Default assignment to dense matrices (general/diagonal)*************************************
// Default kernel for C = scalar * A * B where A is not diagonal and B is diagonal.
//
// C: the target matrix; A: the general left operand; B: the diagonal right operand;
// scalar: the scaling factor.
//
// Since only the diagonal of B is read, each element is computed as
// C(i,j) = A(i,j) * B(j,j) * scalar. The column range of each row is narrowed according to the
// triangular structure of A and the elements outside of it are reset.
//
// NOTE(review): source line 5150 is missing from this listing (presumably a compile-time
// check); confirm against the original header.
5143  template< typename MT3 // Type of the left-hand side target matrix
5144  , typename MT4 // Type of the left-hand side matrix operand
5145  , typename MT5 // Type of the right-hand side matrix operand
5146  , typename ST2 > // Type of the scalar value
5147  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
5148  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5149  {
5151 
5152  const size_t M( A.rows() );
5153  const size_t N( B.columns() );
5154 
5155  for( size_t i=0UL; i<M; ++i )
5156  {
      // Column range of row i, narrowed if A is (strictly) upper or lower.
5157  const size_t jbegin( ( IsUpper<MT4>::value )
5158  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
5159  :( 0UL ) );
5160  const size_t jend( ( IsLower<MT4>::value )
5161  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
5162  :( N ) );
5163  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5164 
      // Elements in front of the range are reset for an upper A.
5165  if( IsUpper<MT4>::value ) {
5166  for( size_t j=0UL; j<jbegin; ++j ) {
5167  reset( C(i,j) );
5168  }
5169  }
5170  for( size_t j=jbegin; j<jend; ++j ) {
5171  C(i,j) = A(i,j) * B(j,j) * scalar;
5172  }
      // Elements behind the range are reset for a lower A.
5173  if( IsLower<MT4>::value ) {
5174  for( size_t j=jend; j<N; ++j ) {
5175  reset( C(i,j) );
5176  }
5177  }
5178  }
5179  }
5180  //**********************************************************************************************
5181 
5182  //**Default assignment to dense matrices (diagonal/general)*************************************
// Default kernel for C = scalar * A * B, diagonal/general case (per the section banner).
//
// C: the target matrix; A: the left operand; B: the right operand; scalar: the scaling factor.
//
// Only the diagonal of A is read, so each element is computed as
// C(i,j) = A(i,i) * B(i,j) * scalar. The column range of each row is narrowed according to the
// triangular structure of B and the elements outside of it are reset.
//
// NOTE(review): source lines 5200 and 5203 are missing from this listing. Line 5200 presumably
// holds the return type with the constraint "A diagonal, B not diagonal"; confirm against the
// original header.
5196  template< typename MT3 // Type of the left-hand side target matrix
5197  , typename MT4 // Type of the left-hand side matrix operand
5198  , typename MT5 // Type of the right-hand side matrix operand
5199  , typename ST2 > // Type of the scalar value
5201  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5202  {
5204 
5205  const size_t M( A.rows() );
5206  const size_t N( B.columns() );
5207 
5208  for( size_t i=0UL; i<M; ++i )
5209  {
      // Column range of row i, narrowed if B is (strictly) upper or lower.
5210  const size_t jbegin( ( IsUpper<MT5>::value )
5211  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
5212  :( 0UL ) );
5213  const size_t jend( ( IsLower<MT5>::value )
5214  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
5215  :( N ) );
5216  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
5217 
      // Elements in front of the range are reset for an upper B.
5218  if( IsUpper<MT5>::value ) {
5219  for( size_t j=0UL; j<jbegin; ++j ) {
5220  reset( C(i,j) );
5221  }
5222  }
5223  for( size_t j=jbegin; j<jend; ++j ) {
5224  C(i,j) = A(i,i) * B(i,j) * scalar;
5225  }
      // Elements behind the range are reset for a lower B.
5226  if( IsLower<MT5>::value ) {
5227  for( size_t j=jend; j<N; ++j ) {
5228  reset( C(i,j) );
5229  }
5230  }
5231  }
5232  }
5233  //**********************************************************************************************
5234 
5235  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default kernel for C = scalar * A * B where both A and B are diagonal.
//
// C: the target matrix; A, B: the diagonal operands; scalar: the scaling factor.
//
// The target is reset as a whole and only its diagonal is written:
// C(i,i) = A(i,i) * B(i,i) * scalar.
//
// NOTE(review): source line 5256 is missing from this listing (presumably a compile-time
// check); confirm against the original header.
5249  template< typename MT3 // Type of the left-hand side target matrix
5250  , typename MT4 // Type of the left-hand side matrix operand
5251  , typename MT5 // Type of the right-hand side matrix operand
5252  , typename ST2 > // Type of the scalar value
5253  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
5254  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5255  {
5257 
5258  reset( C );
5259 
5260  for( size_t i=0UL; i<A.rows(); ++i ) {
5261  C(i,i) = A(i,i) * B(i,i) * scalar;
5262  }
5263  }
5264  //**********************************************************************************************
5265 
5266  //**Default assignment to dense matrices (small matrices)***************************************
// Default small-matrix kernel for C = scalar * A * B: forwards all arguments unchanged to
// selectDefaultAssignKernel.
//
// NOTE(review): source line 5284 (the return type, presumably with the constraint that
// disables this overload when the vectorized kernel applies) is missing from this listing;
// confirm against the original header.
5280  template< typename MT3 // Type of the left-hand side target matrix
5281  , typename MT4 // Type of the left-hand side matrix operand
5282  , typename MT5 // Type of the right-hand side matrix operand
5283  , typename ST2 > // Type of the scalar value
5285  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
5286  {
5287  selectDefaultAssignKernel( C, A, B, scalar );
5288  }
5289  //**********************************************************************************************
5290 
5291  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
5306  template< typename MT3 // Type of the left-hand side target matrix
5307  , typename MT4 // Type of the left-hand side matrix operand
5308  , typename MT5 // Type of the right-hand side matrix operand
5309  , typename ST2 > // Type of the scalar value
5311  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
5312  {
5313  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5314 
5315  const size_t M( A.rows() );
5316  const size_t N( B.columns() );
5317  const size_t K( A.columns() );
5318 
5319  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5320 
5321  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
5322  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
5323 
5324  const SIMDType factor( set( scalar ) );
5325 
5326  if( LOW && UPP && N > SIMDSIZE*3UL ) {
5327  reset( ~C );
5328  }
5329 
5330  {
5331  size_t j( 0UL );
5332 
5334  {
5335  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5336  for( size_t i=0UL; i<M; ++i )
5337  {
5338  const size_t kbegin( ( IsUpper<MT4>::value )
5339  ?( ( IsLower<MT5>::value )
5340  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5341  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5342  :( IsLower<MT5>::value ? j : 0UL ) );
5343  const size_t kend( ( IsLower<MT4>::value )
5344  ?( ( IsUpper<MT5>::value )
5345  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5346  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
5347  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
5348 
5349  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5350 
5351  for( size_t k=kbegin; k<kend; ++k ) {
5352  const SIMDType a1( set( A(i,k) ) );
5353  xmm1 += a1 * B.load(k,j );
5354  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5355  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5356  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5357  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5358  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
5359  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
5360  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
5361  }
5362 
5363  (~C).store( i, j , xmm1 * factor );
5364  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
5365  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5366  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5367  (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
5368  (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
5369  (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
5370  (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
5371  }
5372  }
5373  }
5374 
5375  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5376  {
5377  size_t i( 0UL );
5378 
5379  for( ; (i+2UL) <= M; i+=2UL )
5380  {
5381  const size_t kbegin( ( IsUpper<MT4>::value )
5382  ?( ( IsLower<MT5>::value )
5383  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5384  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5385  :( IsLower<MT5>::value ? j : 0UL ) );
5386  const size_t kend( ( IsLower<MT4>::value )
5387  ?( ( IsUpper<MT5>::value )
5388  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
5389  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5390  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
5391 
5392  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5393 
5394  for( size_t k=kbegin; k<kend; ++k ) {
5395  const SIMDType a1( set( A(i ,k) ) );
5396  const SIMDType a2( set( A(i+1UL,k) ) );
5397  const SIMDType b1( B.load(k,j ) );
5398  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5399  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5400  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5401  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5402  xmm1 += a1 * b1;
5403  xmm2 += a1 * b2;
5404  xmm3 += a1 * b3;
5405  xmm4 += a1 * b4;
5406  xmm5 += a1 * b5;
5407  xmm6 += a2 * b1;
5408  xmm7 += a2 * b2;
5409  xmm8 += a2 * b3;
5410  xmm9 += a2 * b4;
5411  xmm10 += a2 * b5;
5412  }
5413 
5414  (~C).store( i , j , xmm1 * factor );
5415  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
5416  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5417  (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
5418  (~C).store( i , j+SIMDSIZE*4UL, xmm5 * factor );
5419  (~C).store( i+1UL, j , xmm6 * factor );
5420  (~C).store( i+1UL, j+SIMDSIZE , xmm7 * factor );
5421  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
5422  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
5423  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
5424  }
5425 
5426  if( i < M )
5427  {
5428  const size_t kbegin( ( IsUpper<MT4>::value )
5429  ?( ( IsLower<MT5>::value )
5430  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5431  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5432  :( IsLower<MT5>::value ? j : 0UL ) );
5433  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
5434 
5435  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5436 
5437  for( size_t k=kbegin; k<kend; ++k ) {
5438  const SIMDType a1( set( A(i,k) ) );
5439  xmm1 += a1 * B.load(k,j );
5440  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5441  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5442  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5443  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
5444  }
5445 
5446  (~C).store( i, j , xmm1 * factor );
5447  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
5448  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5449  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5450  (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
5451  }
5452  }
5453 
5454  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5455  {
5456  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
5457  size_t i( LOW ? j : 0UL );
5458 
5459  for( ; (i+2UL) <= iend; i+=2UL )
5460  {
5461  const size_t kbegin( ( IsUpper<MT4>::value )
5462  ?( ( IsLower<MT5>::value )
5463  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5464  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5465  :( IsLower<MT5>::value ? j : 0UL ) );
5466  const size_t kend( ( IsLower<MT4>::value )
5467  ?( ( IsUpper<MT5>::value )
5468  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5469  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5470  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
5471 
5472  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5473 
5474  for( size_t k=kbegin; k<kend; ++k ) {
5475  const SIMDType a1( set( A(i ,k) ) );
5476  const SIMDType a2( set( A(i+1UL,k) ) );
5477  const SIMDType b1( B.load(k,j ) );
5478  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5479  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5480  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5481  xmm1 += a1 * b1;
5482  xmm2 += a1 * b2;
5483  xmm3 += a1 * b3;
5484  xmm4 += a1 * b4;
5485  xmm5 += a2 * b1;
5486  xmm6 += a2 * b2;
5487  xmm7 += a2 * b3;
5488  xmm8 += a2 * b4;
5489  }
5490 
5491  (~C).store( i , j , xmm1 * factor );
5492  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
5493  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5494  (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
5495  (~C).store( i+1UL, j , xmm5 * factor );
5496  (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
5497  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
5498  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
5499  }
5500 
5501  if( i < iend )
5502  {
5503  const size_t kbegin( ( IsUpper<MT4>::value )
5504  ?( ( IsLower<MT5>::value )
5505  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5506  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5507  :( IsLower<MT5>::value ? j : 0UL ) );
5508  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
5509 
5510  SIMDType xmm1, xmm2, xmm3, xmm4;
5511 
5512  for( size_t k=kbegin; k<kend; ++k ) {
5513  const SIMDType a1( set( A(i,k) ) );
5514  xmm1 += a1 * B.load(k,j );
5515  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5516  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5517  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
5518  }
5519 
5520  (~C).store( i, j , xmm1 * factor );
5521  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
5522  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5523  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
5524  }
5525  }
5526 
5527  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5528  {
5529  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
5530  size_t i( LOW ? j : 0UL );
5531 
5532  for( ; (i+2UL) <= iend; i+=2UL )
5533  {
5534  const size_t kbegin( ( IsUpper<MT4>::value )
5535  ?( ( IsLower<MT5>::value )
5536  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5537  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5538  :( IsLower<MT5>::value ? j : 0UL ) );
5539  const size_t kend( ( IsLower<MT4>::value )
5540  ?( ( IsUpper<MT5>::value )
5541  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
5542  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5543  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
5544 
5545  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5546 
5547  for( size_t k=kbegin; k<kend; ++k ) {
5548  const SIMDType a1( set( A(i ,k) ) );
5549  const SIMDType a2( set( A(i+1UL,k) ) );
5550  const SIMDType b1( B.load(k,j ) );
5551  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5552  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5553  xmm1 += a1 * b1;
5554  xmm2 += a1 * b2;
5555  xmm3 += a1 * b3;
5556  xmm4 += a2 * b1;
5557  xmm5 += a2 * b2;
5558  xmm6 += a2 * b3;
5559  }
5560 
5561  (~C).store( i , j , xmm1 * factor );
5562  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
5563  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
5564  (~C).store( i+1UL, j , xmm4 * factor );
5565  (~C).store( i+1UL, j+SIMDSIZE , xmm5 * factor );
5566  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
5567  }
5568 
5569  if( i < iend )
5570  {
5571  const size_t kbegin( ( IsUpper<MT4>::value )
5572  ?( ( IsLower<MT5>::value )
5573  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5574  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5575  :( IsLower<MT5>::value ? j : 0UL ) );
5576  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
5577 
5578  SIMDType xmm1, xmm2, xmm3;
5579 
5580  for( size_t k=kbegin; k<kend; ++k ) {
5581  const SIMDType a1( set( A(i,k) ) );
5582  xmm1 += a1 * B.load(k,j );
5583  xmm2 += a1 * B.load(k,j+SIMDSIZE );
5584  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
5585  }
5586 
5587  (~C).store( i, j , xmm1 * factor );
5588  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
5589  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
5590  }
5591  }
5592 
5593  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5594  {
5595  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
5596  size_t i( LOW ? j : 0UL );
5597 
5598  for( ; (i+4UL) <= iend; i+=4UL )
5599  {
5600  const size_t kbegin( ( IsUpper<MT4>::value )
5601  ?( ( IsLower<MT5>::value )
5602  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5603  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5604  :( IsLower<MT5>::value ? j : 0UL ) );
5605  const size_t kend( ( IsLower<MT4>::value )
5606  ?( ( IsUpper<MT5>::value )
5607  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
5608  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
5609  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
5610 
5611  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5612 
5613  for( size_t k=kbegin; k<kend; ++k ) {
5614  const SIMDType a1( set( A(i ,k) ) );
5615  const SIMDType a2( set( A(i+1UL,k) ) );
5616  const SIMDType a3( set( A(i+2UL,k) ) );
5617  const SIMDType a4( set( A(i+3UL,k) ) );
5618  const SIMDType b1( B.load(k,j ) );
5619  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5620  xmm1 += a1 * b1;
5621  xmm2 += a1 * b2;
5622  xmm3 += a2 * b1;
5623  xmm4 += a2 * b2;
5624  xmm5 += a3 * b1;
5625  xmm6 += a3 * b2;
5626  xmm7 += a4 * b1;
5627  xmm8 += a4 * b2;
5628  }
5629 
5630  (~C).store( i , j , xmm1 * factor );
5631  (~C).store( i , j+SIMDSIZE, xmm2 * factor );
5632  (~C).store( i+1UL, j , xmm3 * factor );
5633  (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
5634  (~C).store( i+2UL, j , xmm5 * factor );
5635  (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
5636  (~C).store( i+3UL, j , xmm7 * factor );
5637  (~C).store( i+3UL, j+SIMDSIZE, xmm8 * factor );
5638  }
5639 
5640  for( ; (i+3UL) <= iend; i+=3UL )
5641  {
5642  const size_t kbegin( ( IsUpper<MT4>::value )
5643  ?( ( IsLower<MT5>::value )
5644  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5645  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5646  :( IsLower<MT5>::value ? j : 0UL ) );
5647  const size_t kend( ( IsLower<MT4>::value )
5648  ?( ( IsUpper<MT5>::value )
5649  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
5650  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
5651  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
5652 
5653  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5654 
5655  for( size_t k=kbegin; k<kend; ++k ) {
5656  const SIMDType a1( set( A(i ,k) ) );
5657  const SIMDType a2( set( A(i+1UL,k) ) );
5658  const SIMDType a3( set( A(i+2UL,k) ) );
5659  const SIMDType b1( B.load(k,j ) );
5660  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5661  xmm1 += a1 * b1;
5662  xmm2 += a1 * b2;
5663  xmm3 += a2 * b1;
5664  xmm4 += a2 * b2;
5665  xmm5 += a3 * b1;
5666  xmm6 += a3 * b2;
5667  }
5668 
5669  (~C).store( i , j , xmm1 * factor );
5670  (~C).store( i , j+SIMDSIZE, xmm2 * factor );
5671  (~C).store( i+1UL, j , xmm3 * factor );
5672  (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
5673  (~C).store( i+2UL, j , xmm5 * factor );
5674  (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
5675  }
5676 
5677  for( ; (i+2UL) <= iend; i+=2UL )
5678  {
5679  const size_t kbegin( ( IsUpper<MT4>::value )
5680  ?( ( IsLower<MT5>::value )
5681  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5682  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5683  :( IsLower<MT5>::value ? j : 0UL ) );
5684  const size_t kend( ( IsLower<MT4>::value )
5685  ?( ( IsUpper<MT5>::value )
5686  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
5687  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5688  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
5689 
5690  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5691  size_t k( kbegin );
5692 
5693  for( ; (k+2UL) <= kend; k+=2UL ) {
5694  const SIMDType a1( set( A(i ,k ) ) );
5695  const SIMDType a2( set( A(i+1UL,k ) ) );
5696  const SIMDType a3( set( A(i ,k+1UL) ) );
5697  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
5698  const SIMDType b1( B.load(k ,j ) );
5699  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
5700  const SIMDType b3( B.load(k+1UL,j ) );
5701  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
5702  xmm1 += a1 * b1;
5703  xmm2 += a1 * b2;
5704  xmm3 += a2 * b1;
5705  xmm4 += a2 * b2;
5706  xmm5 += a3 * b3;
5707  xmm6 += a3 * b4;
5708  xmm7 += a4 * b3;
5709  xmm8 += a4 * b4;
5710  }
5711 
5712  for( ; k<kend; ++k ) {
5713  const SIMDType a1( set( A(i ,k) ) );
5714  const SIMDType a2( set( A(i+1UL,k) ) );
5715  const SIMDType b1( B.load(k,j ) );
5716  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5717  xmm1 += a1 * b1;
5718  xmm2 += a1 * b2;
5719  xmm3 += a2 * b1;
5720  xmm4 += a2 * b2;
5721  }
5722 
5723  (~C).store( i , j , (xmm1+xmm5) * factor );
5724  (~C).store( i , j+SIMDSIZE, (xmm2+xmm6) * factor );
5725  (~C).store( i+1UL, j , (xmm3+xmm7) * factor );
5726  (~C).store( i+1UL, j+SIMDSIZE, (xmm4+xmm8) * factor );
5727  }
5728 
5729  if( i < iend )
5730  {
5731  const size_t kbegin( ( IsUpper<MT4>::value )
5732  ?( ( IsLower<MT5>::value )
5733  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5734  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5735  :( IsLower<MT5>::value ? j : 0UL ) );
5736  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
5737 
5738  SIMDType xmm1, xmm2, xmm3, xmm4;
5739  size_t k( kbegin );
5740 
5741  for( ; (k+2UL) <= kend; k+=2UL ) {
5742  const SIMDType a1( set( A(i,k ) ) );
5743  const SIMDType a2( set( A(i,k+1UL) ) );
5744  xmm1 += a1 * B.load(k ,j );
5745  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
5746  xmm3 += a2 * B.load(k+1UL,j );
5747  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
5748  }
5749 
5750  for( ; k<kend; ++k ) {
5751  const SIMDType a1( set( A(i,k) ) );
5752  xmm1 += a1 * B.load(k,j );
5753  xmm2 += a1 * B.load(k,j+SIMDSIZE);
5754  }
5755 
5756  (~C).store( i, j , (xmm1+xmm3) * factor );
5757  (~C).store( i, j+SIMDSIZE, (xmm2+xmm4) * factor );
5758  }
5759  }
5760 
5761  for( ; j<jpos; j+=SIMDSIZE )
5762  {
5763  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
5764  size_t i( LOW ? j : 0UL );
5765 
5766  for( ; (i+4UL) <= iend; i+=4UL )
5767  {
5768  const size_t kbegin( ( IsUpper<MT4>::value )
5769  ?( ( IsLower<MT5>::value )
5770  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5771  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5772  :( IsLower<MT5>::value ? j : 0UL ) );
5773  const size_t kend( ( IsLower<MT4>::value )
5774  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
5775  :( K ) );
5776 
5777  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5778  size_t k( kbegin );
5779 
5780  for( ; (k+2UL) <= kend; k+=2UL ) {
5781  const SIMDType b1( B.load(k ,j) );
5782  const SIMDType b2( B.load(k+1UL,j) );
5783  xmm1 += set( A(i ,k ) ) * b1;
5784  xmm2 += set( A(i+1UL,k ) ) * b1;
5785  xmm3 += set( A(i+2UL,k ) ) * b1;
5786  xmm4 += set( A(i+3UL,k ) ) * b1;
5787  xmm5 += set( A(i ,k+1UL) ) * b2;
5788  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
5789  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
5790  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
5791  }
5792 
5793  for( ; k<kend; ++k ) {
5794  const SIMDType b1( B.load(k,j) );
5795  xmm1 += set( A(i ,k) ) * b1;
5796  xmm2 += set( A(i+1UL,k) ) * b1;
5797  xmm3 += set( A(i+2UL,k) ) * b1;
5798  xmm4 += set( A(i+3UL,k) ) * b1;
5799  }
5800 
5801  (~C).store( i , j, (xmm1+xmm5) * factor );
5802  (~C).store( i+1UL, j, (xmm2+xmm6) * factor );
5803  (~C).store( i+2UL, j, (xmm3+xmm7) * factor );
5804  (~C).store( i+3UL, j, (xmm4+xmm8) * factor );
5805  }
5806 
5807  for( ; (i+3UL) <= iend; i+=3UL )
5808  {
5809  const size_t kbegin( ( IsUpper<MT4>::value )
5810  ?( ( IsLower<MT5>::value )
5811  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5812  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5813  :( IsLower<MT5>::value ? j : 0UL ) );
5814  const size_t kend( ( IsLower<MT4>::value )
5815  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
5816  :( K ) );
5817 
5818  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5819  size_t k( kbegin );
5820 
5821  for( ; (k+2UL) <= kend; k+=2UL ) {
5822  const SIMDType b1( B.load(k ,j) );
5823  const SIMDType b2( B.load(k+1UL,j) );
5824  xmm1 += set( A(i ,k ) ) * b1;
5825  xmm2 += set( A(i+1UL,k ) ) * b1;
5826  xmm3 += set( A(i+2UL,k ) ) * b1;
5827  xmm4 += set( A(i ,k+1UL) ) * b2;
5828  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
5829  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
5830  }
5831 
5832  for( ; k<kend; ++k ) {
5833  const SIMDType b1( B.load(k,j) );
5834  xmm1 += set( A(i ,k) ) * b1;
5835  xmm2 += set( A(i+1UL,k) ) * b1;
5836  xmm3 += set( A(i+2UL,k) ) * b1;
5837  }
5838 
5839  (~C).store( i , j, (xmm1+xmm4) * factor );
5840  (~C).store( i+1UL, j, (xmm2+xmm5) * factor );
5841  (~C).store( i+2UL, j, (xmm3+xmm6) * factor );
5842  }
5843 
5844  for( ; (i+2UL) <= iend; i+=2UL )
5845  {
5846  const size_t kbegin( ( IsUpper<MT4>::value )
5847  ?( ( IsLower<MT5>::value )
5848  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5849  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5850  :( IsLower<MT5>::value ? j : 0UL ) );
5851  const size_t kend( ( IsLower<MT4>::value )
5852  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5853  :( K ) );
5854 
5855  SIMDType xmm1, xmm2, xmm3, xmm4;
5856  size_t k( kbegin );
5857 
5858  for( ; (k+2UL) <= kend; k+=2UL ) {
5859  const SIMDType b1( B.load(k ,j) );
5860  const SIMDType b2( B.load(k+1UL,j) );
5861  xmm1 += set( A(i ,k ) ) * b1;
5862  xmm2 += set( A(i+1UL,k ) ) * b1;
5863  xmm3 += set( A(i ,k+1UL) ) * b2;
5864  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
5865  }
5866 
5867  for( ; k<kend; ++k ) {
5868  const SIMDType b1( B.load(k,j) );
5869  xmm1 += set( A(i ,k) ) * b1;
5870  xmm2 += set( A(i+1UL,k) ) * b1;
5871  }
5872 
5873  (~C).store( i , j, (xmm1+xmm3) * factor );
5874  (~C).store( i+1UL, j, (xmm2+xmm4) * factor );
5875  }
5876 
5877  if( i < iend )
5878  {
5879  const size_t kbegin( ( IsUpper<MT4>::value )
5880  ?( ( IsLower<MT5>::value )
5881  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5882  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5883  :( IsLower<MT5>::value ? j : 0UL ) );
5884 
5885  SIMDType xmm1, xmm2;
5886  size_t k( kbegin );
5887 
5888  for( ; (k+2UL) <= K; k+=2UL ) {
5889  xmm1 += set( A(i,k ) ) * B.load(k ,j);
5890  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
5891  }
5892 
5893  for( ; k<K; ++k ) {
5894  xmm1 += set( A(i,k) ) * B.load(k,j);
5895  }
5896 
5897  (~C).store( i, j, (xmm1+xmm2) * factor );
5898  }
5899  }
5900 
5901  for( ; remainder && j<N; ++j )
5902  {
5903  size_t i( LOW && UPP ? j : 0UL );
5904 
5905  for( ; (i+2UL) <= M; i+=2UL )
5906  {
5907  const size_t kbegin( ( IsUpper<MT4>::value )
5908  ?( ( IsLower<MT5>::value )
5909  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5910  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5911  :( IsLower<MT5>::value ? j : 0UL ) );
5912  const size_t kend( ( IsLower<MT4>::value )
5913  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5914  :( K ) );
5915 
5916  ElementType value1 = ElementType();
5917  ElementType value2 = ElementType();
5918 
5919  for( size_t k=kbegin; k<kend; ++k ) {
5920  value1 += A(i ,k) * B(k,j);
5921  value2 += A(i+1UL,k) * B(k,j);
5922  }
5923 
5924  (~C)(i ,j) = value1 * scalar;
5925  (~C)(i+1UL,j) = value2 * scalar;
5926  }
5927 
5928  if( i < M )
5929  {
5930  const size_t kbegin( ( IsUpper<MT4>::value )
5931  ?( ( IsLower<MT5>::value )
5932  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5933  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5934  :( IsLower<MT5>::value ? j : 0UL ) );
5935 
5936  ElementType value = ElementType();
5937 
5938  for( size_t k=kbegin; k<K; ++k ) {
5939  value += A(i,k) * B(k,j);
5940  }
5941 
5942  (~C)(i,j) = value * scalar;
5943  }
5944  }
5945  }
5946 
5947  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
5948  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
5949  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
5950  for( size_t j=0UL; j<jend; ++j ) {
5951  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
5952  }
5953  }
5954  }
5955  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
5956  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
5957  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
5958  for( size_t i=0UL; i<iend; ++i ) {
5959  reset( (~C)(i,j) );
5960  }
5961  }
5962  }
5963  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
5964  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
5965  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
5966  for( size_t j=0UL; j<jend; ++j ) {
5967  reset( (~C)(i,j) );
5968  }
5969  }
5970  }
5971  }
5972  //**********************************************************************************************
5973 
5974  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
// Vectorized small-matrix kernel for a column-major target (DenseMatrix<MT3,true>).
// No product is computed here directly: one operand is copied into a temporary of its
// OppositeType_ (presumably the opposite storage order -- confirm), evaluated through
// serial(), and the scaled product is handed back to assign() wrapped in 'fwd'.
//
// C      - target matrix, accessed through ~C
// A, B   - left and right operand of the product
// scalar - factor applied to the product
//
// NOTE(review): listing lines 5993, 5996-5999, 6003 and 6007 are missing from this
// listing. 6003 and 6007 open the first two branches, so their conditions are not
// visible here; only the last two alternatives can be read in full.
5989  template< typename MT3 // Type of the left-hand side target matrix
5990  , typename MT4 // Type of the left-hand side matrix operand
5991  , typename MT5 // Type of the right-hand side matrix operand
5992  , typename ST2 > // Type of the scalar value
5994  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
5995  {
6000 
6001  const ForwardFunctor fwd;
6002 
// First branch (condition elided from the listing): convert the left operand.
6004  const OppositeType_<MT4> tmp( serial( A ) );
6005  assign( ~C, fwd( tmp * B ) * scalar );
6006  }
// Second branch (condition elided from the listing): convert the right operand.
6008  const OppositeType_<MT5> tmp( serial( B ) );
6009  assign( ~C, fwd( A * tmp ) * scalar );
6010  }
// Otherwise convert whichever operand has fewer elements; A is chosen on a tie.
6011  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6012  const OppositeType_<MT4> tmp( serial( A ) );
6013  assign( ~C, fwd( tmp * B ) * scalar );
6014  }
6015  else {
6016  const OppositeType_<MT5> tmp( serial( B ) );
6017  assign( ~C, fwd( A * tmp ) * scalar );
6018  }
6019  }
6020  //**********************************************************************************************
6021 
6022  //**Default assignment to dense matrices (large matrices)***************************************
// Default large-matrix assignment kernel: simply forwards all arguments to
// selectDefaultAssignKernel().
// NOTE(review): listing line 6040 (between the template header and the function name) is
// not part of this listing, so the return type and any enabling condition that selects
// this overload over the vectorized one are not visible here.
6036  template< typename MT3 // Type of the left-hand side target matrix
6037  , typename MT4 // Type of the left-hand side matrix operand
6038  , typename MT5 // Type of the right-hand side matrix operand
6039  , typename ST2 > // Type of the scalar value
6041  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6042  {
6043  selectDefaultAssignKernel( C, A, B, scalar );
6044  }
6045  //**********************************************************************************************
6046 
6047  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Vectorized large-matrix assignment kernel: dispatches to one of the *mmm() routines
// depending on the SYM, HERM, LOW and UPP flags, tested in that order.
//   SYM  -> smmm( C, A, B, scalar )
//   HERM -> hmmm( C, A, B, scalar )
//   LOW  -> lmmm( C, A, B, scalar, ST2(0) )
//   UPP  -> ummm( C, A, B, scalar, ST2(0) )
//   else -> mmm ( C, A, B, scalar, ST2(0) )
// The trailing ST2(0) is presumably the factor applied to the previous content of C
// (i.e. plain assignment) -- confirm against the declarations of these routines.
// NOTE(review): listing line 6066 (return type / enabling condition) is not part of this
// listing.
6062  template< typename MT3 // Type of the left-hand side target matrix
6063  , typename MT4 // Type of the left-hand side matrix operand
6064  , typename MT5 // Type of the right-hand side matrix operand
6065  , typename ST2 > // Type of the scalar value
6067  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6068  {
6069  if( SYM )
6070  smmm( C, A, B, scalar );
6071  else if( HERM )
6072  hmmm( C, A, B, scalar );
6073  else if( LOW )
6074  lmmm( C, A, B, scalar, ST2(0) );
6075  else if( UPP )
6076  ummm( C, A, B, scalar, ST2(0) );
6077  else
6078  mmm( C, A, B, scalar, ST2(0) );
6079  }
6080  //**********************************************************************************************
6081 
6082  //**BLAS-based assignment to dense matrices (default)*******************************************
// Default "BLAS" assignment kernel: no BLAS routine is called here, all arguments are
// forwarded to selectLargeAssignKernel().
// NOTE(review): listing line 6100 (return type / enabling condition) is not part of this
// listing, so the rule that picks this overload over the BLAS-based one is not visible.
6096  template< typename MT3 // Type of the left-hand side target matrix
6097  , typename MT4 // Type of the left-hand side matrix operand
6098  , typename MT5 // Type of the right-hand side matrix operand
6099  , typename ST2 > // Type of the scalar value
6101  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6102  {
6103  selectLargeAssignKernel( C, A, B, scalar );
6104  }
6105  //**********************************************************************************************
6106 
6107  //**BLAS-based assignment to dense matrices*****************************************************
6108 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6109 
// BLAS-based assignment kernel; only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled (enclosing #if).
//
// - A triangular (IsTriangular<MT4>): C is first assigned B, then trmm() is applied with
//   A from the left (CblasLeft), selecting CblasLower/CblasUpper via IsLower<MT4>.
// - B triangular (IsTriangular<MT5>): C is first assigned A, then trmm() is applied with
//   B from the right (CblasRight), selecting CblasLower/CblasUpper via IsLower<MT5>.
// - otherwise: gemm( C, A, B, ET(scalar), ET(0) ).
// The scalar is converted to the element type of the target (ET) before it is passed on.
// NOTE(review): listing line 6126 (return type / enabling condition) is not part of this
// listing.
6122  template< typename MT3 // Type of the left-hand side target matrix
6123  , typename MT4 // Type of the left-hand side matrix operand
6124  , typename MT5 // Type of the right-hand side matrix operand
6125  , typename ST2 > // Type of the scalar value
6127  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6128  {
6129  using ET = ElementType_<MT3>;
6130 
6131  if( IsTriangular<MT4>::value ) {
6132  assign( C, B );
6133  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6134  }
6135  else if( IsTriangular<MT5>::value ) {
6136  assign( C, A );
6137  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6138  }
6139  else {
6140  gemm( C, A, B, ET(scalar), ET(0) );
6141  }
6142  }
6143 #endif
6144  //**********************************************************************************************
6145 
6146  //**Assignment to sparse matrices***************************************************************
// Assignment of the scaled matrix product to a sparse matrix.
// The expression is first evaluated (through serial()) into a temporary of type TmpType,
// which is then assigned to the target wrapped in 'fwd'.
//
// lhs - target sparse matrix; must already have the dimensions of rhs (asserted)
// rhs - scaled matrix multiplication expression to be assigned
//
// NOTE(review): listing lines 6160, 6163-6173 are largely missing from this listing; the
// return type and the definition of TmpType are therefore not visible here.
6158  template< typename MT // Type of the target sparse matrix
6159  , bool SO > // Storage order of the target sparse matrix
6161  assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6162  {
6164 
6166 
6173 
6174  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6175  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6176 
6177  const ForwardFunctor fwd;
6178 
6179  const TmpType tmp( serial( rhs ) );
6180  assign( ~lhs, fwd( tmp ) );
6181  }
6182  //**********************************************************************************************
6183 
6184  //**Restructuring assignment to column-major matrices*******************************************
// Restructuring assignment of the scaled matrix product to a column-major matrix.
// The two operands of the wrapped product are fetched and the product is rebuilt with one
// or both operands transposed before it is scaled with rhs.scalar_ and assigned again.
//
// lhs - target matrix; must already have the dimensions of rhs (asserted)
// rhs - scaled matrix multiplication expression to be assigned
//
// NOTE(review): listing lines 6199, 6202, 6204 and 6214 are missing from this listing.
// Line 6214 holds the condition of the first alternative (both operands transposed), so
// only the second condition (IsSymmetric<MT1>) is visible here.
6198  template< typename MT > // Type of the target matrix
6200  assign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
6201  {
6203 
6205 
6206  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6207  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6208 
6209  const ForwardFunctor fwd;
6210 
6211  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6212  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6213 
// First alternative (condition elided from the listing): transpose both operands.
6215  assign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// Left operand type is symmetric: transpose only the left operand.
6216  else if( IsSymmetric<MT1>::value )
6217  assign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Otherwise: transpose only the right operand.
6218  else
6219  assign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
6220  }
6221  //**********************************************************************************************
6222 
6223  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the scaled matrix product to a dense matrix.
//
// lhs - target dense matrix; must already have the dimensions of rhs (asserted)
// rhs - scaled matrix multiplication expression to be added
//
// Returns early without touching lhs if the target has no rows or columns or if the
// inner dimension (left.columns()) is zero. Otherwise both operands are evaluated
// through serial() into objects of type LT and RT, and the work is delegated to
// selectAddAssignKernel() together with rhs.scalar_.
// NOTE(review): listing lines 6237 and 6240 are missing from this listing; the return
// type is therefore not visible, and LT/RT are defined outside the visible lines.
6235  template< typename MT // Type of the target dense matrix
6236  , bool SO > // Storage order of the target dense matrix
6238  addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
6239  {
6241 
6242  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6243  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6244 
6245  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6246  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6247 
// Nothing to add for an empty target or an empty inner dimension.
6248  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
6249  return;
6250  }
6251 
6252  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
6253  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
6254 
6255  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
6256  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
6257  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
6258  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
6259  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6260  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
6261 
6262  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
6263  }
6264  //**********************************************************************************************
6265 
6266  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment C += scalar * ( A * B ).
// The small-matrix kernel is used if any of the following holds:
//   - the right operand type is diagonal (IsDiagonal<MT5>),
//   - BLAZE_DEBUG_MODE is off and B has at most SIMDSIZE*10 columns,
//   - the target has fewer than DMATDMATMULT_THRESHOLD elements.
// In every other case the BLAS addition assignment kernel is selected.
//
// C      - target matrix
// A, B   - left and right operand of the product
// scalar - factor applied to the product
6277  template< typename MT3 // Type of the left-hand side target matrix
6278  , typename MT4 // Type of the left-hand side matrix operand
6279  , typename MT5 // Type of the right-hand side matrix operand
6280  , typename ST2 > // Type of the scalar value
6281  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6282  {
6283  if( ( IsDiagonal<MT5>::value ) ||
6284  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
6285  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6286  selectSmallAddAssignKernel( C, A, B, scalar );
6287  else
6288  selectBlasAddAssignKernel( C, A, B, scalar );
6289  }
6290  //**********************************************************************************************
6291 
6292  //**Default addition assignment to dense matrices (general/general)*****************************
// Default addition assignment kernel for the case that neither operand type is diagonal
// (enabled via And< Not<IsDiagonal<MT4>>, Not<IsDiagonal<MT5>> >).
// The scaled product A * B * scalar is evaluated through serial() into a temporary of
// type ResultType, which is then added to C with addAssign().
//
// C      - target matrix
// A, B   - left and right operand of the product
// scalar - factor applied to the product
6306  template< typename MT3 // Type of the left-hand side target matrix
6307  , typename MT4 // Type of the left-hand side matrix operand
6308  , typename MT5 // Type of the right-hand side matrix operand
6309  , typename ST2 > // Type of the scalar value
6310  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6311  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6312  {
6313  const ResultType tmp( serial( A * B * scalar ) );
6314  addAssign( C, tmp );
6315  }
6316  //**********************************************************************************************
6317 
6318  //**Default addition assignment to dense matrices (general/diagonal)****************************
// Default addition assignment kernel for a non-diagonal left operand and a diagonal right
// operand (enabled via And< Not<IsDiagonal<MT4>>, IsDiagonal<MT5> >).
// Only the diagonal of B is read: C(i,j) += A(i,j) * B(j,j) * scalar.
//
// C      - target matrix
// A      - left operand; its IsUpper/IsLower/IsStrictly* traits narrow the column range
// B      - right operand, accessed on its diagonal only
// scalar - factor applied to every product
//
// NOTE(review): listing line 6339 (first line of the body) is not part of this listing.
6332  template< typename MT3 // Type of the left-hand side target matrix
6333  , typename MT4 // Type of the left-hand side matrix operand
6334  , typename MT5 // Type of the right-hand side matrix operand
6335  , typename ST2 > // Type of the scalar value
6336  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6337  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6338  {
6340 
6341  const size_t M( A.rows() );
6342  const size_t N( B.columns() );
6343 
6344  for( size_t i=0UL; i<M; ++i )
6345  {
// Column range [jbegin,jend) of row i, restricted by the structure traits of A.
6346  const size_t jbegin( ( IsUpper<MT4>::value )
6347  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6348  :( 0UL ) );
6349  const size_t jend( ( IsLower<MT4>::value )
6350  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
6351  :( N ) );
6352  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6353 
// jpos: end of the part handled two columns per iteration (jnum rounded down to even).
6354  const size_t jnum( jend - jbegin );
6355  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6356 
6357  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6358  C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6359  C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6360  }
// Single leftover column when the range has odd length.
6361  if( jpos < jend ) {
6362  C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
6363  }
6364  }
6365  }
6366  //**********************************************************************************************
6367 
6368  //**Default addition assignment to dense matrices (diagonal/general)****************************
// Default addition-assignment kernel for C += (A * B) * scalar, enabled (via EnableIf_)
// when MT4 is diagonal and MT5 is not diagonal. Only the diagonal of A is read, so each
// visited element is updated as C(i,j) += A(i,i) * B(i,j) * scalar.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
6382  template< typename MT3 // Type of the left-hand side target matrix
6383  , typename MT4 // Type of the left-hand side matrix operand
6384  , typename MT5 // Type of the right-hand side matrix operand
6385  , typename ST2 > // Type of the scalar value
6386  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6387  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6388  {
      // NOTE(review): listing line 6389 is not present in this listing — confirm against the
      // upstream header what statement belongs here.
6390 
6391  const size_t M( A.rows() );
6392  const size_t N( B.columns() );
6393 
6394  for( size_t i=0UL; i<M; ++i )
6395  {
      // Column range of row i, narrowed by the compile-time triangular traits of MT5
      // (the non-diagonal operand): an upper MT5 starts at the diagonal (or one past it if
      // strictly upper), a lower MT5 stops at the diagonal (inclusive unless strictly lower).
6396  const size_t jbegin( ( IsUpper<MT5>::value )
6397  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6398  :( 0UL ) );
6399  const size_t jend( ( IsLower<MT5>::value )
6400  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
6401  :( N ) );
6402  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
6403 
      // jnum & size_t(-2) rounds the element count down to an even number; jpos marks the
      // end of the part that is processed two columns per iteration.
6404  const size_t jnum( jend - jbegin );
6405  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
6406 
6407  for( size_t j=jbegin; j<jpos; j+=2UL ) {
6408  C(i,j ) += A(i,i) * B(i,j ) * scalar;
6409  C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
6410  }
      // Handle the single leftover column when the count is odd.
6411  if( jpos < jend ) {
6412  C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
6413  }
6414  }
6415  }
6416  //**********************************************************************************************
6417 
6418  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default addition-assignment kernel for C += (A * B) * scalar, enabled (via EnableIf_)
// when both MT4 and MT5 are diagonal. Only the diagonal elements of C are updated.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
6432  template< typename MT3 // Type of the left-hand side target matrix
6433  , typename MT4 // Type of the left-hand side matrix operand
6434  , typename MT5 // Type of the right-hand side matrix operand
6435  , typename ST2 > // Type of the scalar value
6436  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6437  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6438  {
      // NOTE(review): listing line 6439 is not present in this listing — confirm against the
      // upstream header what statement belongs here.
6440 
      // One multiply-add per diagonal position; off-diagonal elements of C are not touched.
6441  for( size_t i=0UL; i<A.rows(); ++i ) {
6442  C(i,i) += A(i,i) * B(i,i) * scalar;
6443  }
6444  }
6445  //**********************************************************************************************
6446 
6447  //**Default addition assignment to dense matrices (small matrices)******************************
// Small-matrix addition-assignment kernel (non-vectorized variant): simply forwards all
// arguments to selectDefaultAddAssignKernel().
//
// NOTE(review): the return-type line (listing line 6465) is not present in this listing;
// presumably it holds the SFINAE condition that distinguishes this overload from the
// vectorized ones — confirm against the upstream header.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
6461  template< typename MT3 // Type of the left-hand side target matrix
6462  , typename MT4 // Type of the left-hand side matrix operand
6463  , typename MT5 // Type of the right-hand side matrix operand
6464  , typename ST2 > // Type of the scalar value
6466  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
6467  {
6468  selectDefaultAddAssignKernel( C, A, B, scalar );
6469  }
6470  //**********************************************************************************************
6471 
6472  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
// Vectorized small-matrix addition-assignment kernel for a row-major target:
// C += (A * B) * scalar. The columns of C are processed in panels of 8, 5, 4, 3, 2 and 1
// SIMD vectors (in that order), followed by a scalar loop for columns that do not fill a
// whole SIMD vector. Within every panel, the k-range of the inner product is narrowed by
// the compile-time triangular traits (IsUpper/IsLower/IsStrictly*) of MT4 and MT5.
// LOW and UPP are flags declared outside this view; they gate which panel loops run and
// restrict the visited row range.
//
// NOTE(review): the return-type line (listing line 6491) is not present in this listing;
// presumably it holds the SFINAE condition selecting this vectorized overload — confirm
// against the upstream header.
//
// \param C The target left-hand side (row-major) dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
6487  template< typename MT3 // Type of the left-hand side target matrix
6488  , typename MT4 // Type of the left-hand side matrix operand
6489  , typename MT5 // Type of the right-hand side matrix operand
6490  , typename ST2 > // Type of the scalar value
6492  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
6493  {
      // A scalar remainder loop is needed unless both C and B are padded.
6494  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
6495 
6496  const size_t M( A.rows() );
6497  const size_t N( B.columns() );
6498  const size_t K( A.columns() );
6499 
6500  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
6501 
      // With a remainder loop, jpos is N rounded down to a multiple of SIMDSIZE (see the
      // assertion below); otherwise the SIMD loops cover all N columns.
6502  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
6503  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
6504 
      // The scalar broadcast into a SIMD vector; applied once per store, after accumulation.
6505  const SIMDType factor( set( scalar ) );
6506 
6507  size_t j( 0UL );
6508 
      // NOTE(review): listing line 6509 is not present in this listing; the braces below
      // presumably belong to a guarding statement on that line — confirm against upstream.
6510  {
      // Panels of 8 SIMD vectors, one row at a time (only if neither LOW nor UPP).
      // NOTE(review): the xmm accumulators throughout this function are default-constructed
      // and then only accumulated into; this presumably relies on SIMDType default
      // construction yielding zero — confirm.
6511  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6512  for( size_t i=0UL; i<M; ++i )
6513  {
6514  const size_t kbegin( ( IsUpper<MT4>::value )
6515  ?( ( IsLower<MT5>::value )
6516  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6517  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6518  :( IsLower<MT5>::value ? j : 0UL ) );
6519  const size_t kend( ( IsLower<MT4>::value )
6520  ?( ( IsUpper<MT5>::value )
6521  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
6522  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6523  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
6524 
6525  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6526 
6527  for( size_t k=kbegin; k<kend; ++k ) {
6528  const SIMDType a1( set( A(i,k) ) );
6529  xmm1 += a1 * B.load(k,j );
6530  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6531  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6532  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6533  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6534  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
6535  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
6536  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
6537  }
6538 
6539  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6540  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
6541  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6542  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6543  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
6544  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
6545  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
6546  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
6547  }
6548  }
6549  }
6550 
      // Panels of 5 SIMD vectors: two rows per iteration, then a single-row tail
      // (only if neither LOW nor UPP).
6551  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
6552  {
6553  size_t i( 0UL );
6554 
6555  for( ; (i+2UL) <= M; i+=2UL )
6556  {
6557  const size_t kbegin( ( IsUpper<MT4>::value )
6558  ?( ( IsLower<MT5>::value )
6559  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6560  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6561  :( IsLower<MT5>::value ? j : 0UL ) );
6562  const size_t kend( ( IsLower<MT4>::value )
6563  ?( ( IsUpper<MT5>::value )
6564  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
6565  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6566  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
6567 
6568  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6569 
6570  for( size_t k=kbegin; k<kend; ++k ) {
6571  const SIMDType a1( set( A(i ,k) ) );
6572  const SIMDType a2( set( A(i+1UL,k) ) );
6573  const SIMDType b1( B.load(k,j ) );
6574  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6575  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6576  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6577  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
6578  xmm1 += a1 * b1;
6579  xmm2 += a1 * b2;
6580  xmm3 += a1 * b3;
6581  xmm4 += a1 * b4;
6582  xmm5 += a1 * b5;
6583  xmm6 += a2 * b1;
6584  xmm7 += a2 * b2;
6585  xmm8 += a2 * b3;
6586  xmm9 += a2 * b4;
6587  xmm10 += a2 * b5;
6588  }
6589 
6590  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6591  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
6592  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6593  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
6594  (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
6595  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm6 * factor );
6596  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
6597  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
6598  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
6599  (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
6600  }
6601 
6602  if( i < M )
6603  {
6604  const size_t kbegin( ( IsUpper<MT4>::value )
6605  ?( ( IsLower<MT5>::value )
6606  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6607  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6608  :( IsLower<MT5>::value ? j : 0UL ) );
6609  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
6610 
6611  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6612 
6613  for( size_t k=kbegin; k<kend; ++k ) {
6614  const SIMDType a1( set( A(i,k) ) );
6615  xmm1 += a1 * B.load(k,j );
6616  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6617  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6618  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6619  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
6620  }
6621 
6622  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6623  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
6624  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6625  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6626  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
6627  }
6628  }
6629 
      // Panels of 4 SIMD vectors: two rows per iteration, then a single-row tail
      // (only if neither LOW nor UPP).
6630  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6631  {
6632  size_t i( 0UL );
6633 
6634  for( ; (i+2UL) <= M; i+=2UL )
6635  {
6636  const size_t kbegin( ( IsUpper<MT4>::value )
6637  ?( ( IsLower<MT5>::value )
6638  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6639  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6640  :( IsLower<MT5>::value ? j : 0UL ) );
6641  const size_t kend( ( IsLower<MT4>::value )
6642  ?( ( IsUpper<MT5>::value )
6643  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
6644  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6645  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
6646 
6647  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6648 
6649  for( size_t k=kbegin; k<kend; ++k ) {
6650  const SIMDType a1( set( A(i ,k) ) );
6651  const SIMDType a2( set( A(i+1UL,k) ) );
6652  const SIMDType b1( B.load(k,j ) );
6653  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6654  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6655  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6656  xmm1 += a1 * b1;
6657  xmm2 += a1 * b2;
6658  xmm3 += a1 * b3;
6659  xmm4 += a1 * b4;
6660  xmm5 += a2 * b1;
6661  xmm6 += a2 * b2;
6662  xmm7 += a2 * b3;
6663  xmm8 += a2 * b4;
6664  }
6665 
6666  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6667  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
6668  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6669  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
6670  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
6671  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
6672  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
6673  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
6674  }
6675 
6676  if( i < M )
6677  {
6678  const size_t kbegin( ( IsUpper<MT4>::value )
6679  ?( ( IsLower<MT5>::value )
6680  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6681  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6682  :( IsLower<MT5>::value ? j : 0UL ) );
6683  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
6684 
6685  SIMDType xmm1, xmm2, xmm3, xmm4;
6686 
6687  for( size_t k=kbegin; k<kend; ++k ) {
6688  const SIMDType a1( set( A(i,k) ) );
6689  xmm1 += a1 * B.load(k,j );
6690  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6691  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6692  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
6693  }
6694 
6695  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6696  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
6697  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6698  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
6699  }
6700  }
6701 
      // Panels of 3 SIMD vectors: two rows per iteration, then a single-row tail
      // (only if neither LOW nor UPP).
6702  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
6703  {
6704  size_t i( 0UL );
6705 
6706  for( ; (i+2UL) <= M; i+=2UL )
6707  {
6708  const size_t kbegin( ( IsUpper<MT4>::value )
6709  ?( ( IsLower<MT5>::value )
6710  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6711  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6712  :( IsLower<MT5>::value ? j : 0UL ) );
6713  const size_t kend( ( IsLower<MT4>::value )
6714  ?( ( IsUpper<MT5>::value )
6715  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
6716  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6717  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
6718 
6719  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6720 
6721  for( size_t k=kbegin; k<kend; ++k ) {
6722  const SIMDType a1( set( A(i ,k) ) );
6723  const SIMDType a2( set( A(i+1UL,k) ) );
6724  const SIMDType b1( B.load(k,j ) );
6725  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6726  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6727  xmm1 += a1 * b1;
6728  xmm2 += a1 * b2;
6729  xmm3 += a1 * b3;
6730  xmm4 += a2 * b1;
6731  xmm5 += a2 * b2;
6732  xmm6 += a2 * b3;
6733  }
6734 
6735  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6736  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
6737  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
6738  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm4 * factor );
6739  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
6740  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
6741  }
6742 
6743  if( i < M )
6744  {
6745  const size_t kbegin( ( IsUpper<MT4>::value )
6746  ?( ( IsLower<MT5>::value )
6747  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6748  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6749  :( IsLower<MT5>::value ? j : 0UL ) );
6750  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
6751 
6752  SIMDType xmm1, xmm2, xmm3;
6753 
6754  for( size_t k=kbegin; k<kend; ++k ) {
6755  const SIMDType a1( set( A(i,k) ) );
6756  xmm1 += a1 * B.load(k,j );
6757  xmm2 += a1 * B.load(k,j+SIMDSIZE );
6758  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
6759  }
6760 
6761  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6762  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
6763  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
6764  }
6765  }
6766 
      // Panels of 2 SIMD vectors (skipped only if both LOW and UPP are set): rows are
      // processed 4, 3, 2 and finally 1 at a time. With LOW the row loop starts at row j;
      // with UPP it ends at min(j+SIMDSIZE*2, M).
6767  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6768  {
6769  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
6770  size_t i( LOW ? j : 0UL );
6771 
6772  for( ; (i+4UL) <= iend; i+=4UL )
6773  {
6774  const size_t kbegin( ( IsUpper<MT4>::value )
6775  ?( ( IsLower<MT5>::value )
6776  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6777  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6778  :( IsLower<MT5>::value ? j : 0UL ) );
6779  const size_t kend( ( IsLower<MT4>::value )
6780  ?( ( IsUpper<MT5>::value )
6781  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
6782  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
6783  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
6784 
6785  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6786 
6787  for( size_t k=kbegin; k<kend; ++k ) {
6788  const SIMDType a1( set( A(i ,k) ) );
6789  const SIMDType a2( set( A(i+1UL,k) ) );
6790  const SIMDType a3( set( A(i+2UL,k) ) );
6791  const SIMDType a4( set( A(i+3UL,k) ) );
6792  const SIMDType b1( B.load(k,j ) );
6793  const SIMDType b2( B.load(k,j+SIMDSIZE) );
6794  xmm1 += a1 * b1;
6795  xmm2 += a1 * b2;
6796  xmm3 += a2 * b1;
6797  xmm4 += a2 * b2;
6798  xmm5 += a3 * b1;
6799  xmm6 += a3 * b2;
6800  xmm7 += a4 * b1;
6801  xmm8 += a4 * b2;
6802  }
6803 
6804  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6805  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
6806  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6807  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
6808  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
6809  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
6810  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
6811  (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
6812  }
6813 
6814  for( ; (i+3UL) <= iend; i+=3UL )
6815  {
6816  const size_t kbegin( ( IsUpper<MT4>::value )
6817  ?( ( IsLower<MT5>::value )
6818  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6819  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6820  :( IsLower<MT5>::value ? j : 0UL ) );
6821  const size_t kend( ( IsLower<MT4>::value )
6822  ?( ( IsUpper<MT5>::value )
6823  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
6824  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
6825  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
6826 
6827  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6828 
6829  for( size_t k=kbegin; k<kend; ++k ) {
6830  const SIMDType a1( set( A(i ,k) ) );
6831  const SIMDType a2( set( A(i+1UL,k) ) );
6832  const SIMDType a3( set( A(i+2UL,k) ) );
6833  const SIMDType b1( B.load(k,j ) );
6834  const SIMDType b2( B.load(k,j+SIMDSIZE) );
6835  xmm1 += a1 * b1;
6836  xmm2 += a1 * b2;
6837  xmm3 += a2 * b1;
6838  xmm4 += a2 * b2;
6839  xmm5 += a3 * b1;
6840  xmm6 += a3 * b2;
6841  }
6842 
6843  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6844  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
6845  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6846  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
6847  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
6848  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
6849  }
6850 
6851  for( ; (i+2UL) <= iend; i+=2UL )
6852  {
6853  const size_t kbegin( ( IsUpper<MT4>::value )
6854  ?( ( IsLower<MT5>::value )
6855  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6856  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6857  :( IsLower<MT5>::value ? j : 0UL ) );
6858  const size_t kend( ( IsLower<MT4>::value )
6859  ?( ( IsUpper<MT5>::value )
6860  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
6861  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6862  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
6863 
6864  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6865  size_t k( kbegin );
6866 
      // The k-loop is unrolled by two into separate accumulator sets (xmm1-4 for k,
      // xmm5-8 for k+1); the sets are summed when the result is stored.
6867  for( ; (k+2UL) <= kend; k+=2UL ) {
6868  const SIMDType a1( set( A(i ,k ) ) );
6869  const SIMDType a2( set( A(i+1UL,k ) ) );
6870  const SIMDType a3( set( A(i ,k+1UL) ) );
6871  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
6872  const SIMDType b1( B.load(k ,j ) );
6873  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
6874  const SIMDType b3( B.load(k+1UL,j ) );
6875  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
6876  xmm1 += a1 * b1;
6877  xmm2 += a1 * b2;
6878  xmm3 += a2 * b1;
6879  xmm4 += a2 * b2;
6880  xmm5 += a3 * b3;
6881  xmm6 += a3 * b4;
6882  xmm7 += a4 * b3;
6883  xmm8 += a4 * b4;
6884  }
6885 
6886  for( ; k<kend; ++k ) {
6887  const SIMDType a1( set( A(i ,k) ) );
6888  const SIMDType a2( set( A(i+1UL,k) ) );
6889  const SIMDType b1( B.load(k,j ) );
6890  const SIMDType b2( B.load(k,j+SIMDSIZE) );
6891  xmm1 += a1 * b1;
6892  xmm2 += a1 * b2;
6893  xmm3 += a2 * b1;
6894  xmm4 += a2 * b2;
6895  }
6896 
6897  (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
6898  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + (xmm2+xmm6) * factor );
6899  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + (xmm3+xmm7) * factor );
6900  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + (xmm4+xmm8) * factor );
6901  }
6902 
6903  if( i < iend )
6904  {
6905  const size_t kbegin( ( IsUpper<MT4>::value )
6906  ?( ( IsLower<MT5>::value )
6907  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6908  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6909  :( IsLower<MT5>::value ? j : 0UL ) );
6910  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
6911 
6912  SIMDType xmm1, xmm2, xmm3, xmm4;
6913  size_t k( kbegin );
6914 
6915  for( ; (k+2UL) <= kend; k+=2UL ) {
6916  const SIMDType a1( set( A(i,k ) ) );
6917  const SIMDType a2( set( A(i,k+1UL) ) );
6918  xmm1 += a1 * B.load(k ,j );
6919  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
6920  xmm3 += a2 * B.load(k+1UL,j );
6921  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
6922  }
6923 
6924  for( ; k<kend; ++k ) {
6925  const SIMDType a1( set( A(i,k) ) );
6926  xmm1 += a1 * B.load(k,j );
6927  xmm2 += a1 * B.load(k,j+SIMDSIZE);
6928  }
6929 
6930  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
6931  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + (xmm2+xmm4) * factor );
6932  }
6933  }
6934 
      // Remaining single-SIMD-vector panels: rows are processed 4, 3, 2 and finally 1 at a
      // time, with the k-loop unrolled by two. With LOW the row loop starts at row j; the
      // row end is limited to min(j+SIMDSIZE, M) only if both LOW and UPP are set.
6935  for( ; j<jpos; j+=SIMDSIZE )
6936  {
6937  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
6938  size_t i( LOW ? j : 0UL );
6939 
6940  for( ; (i+4UL) <= iend; i+=4UL )
6941  {
6942  const size_t kbegin( ( IsUpper<MT4>::value )
6943  ?( ( IsLower<MT5>::value )
6944  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6945  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6946  :( IsLower<MT5>::value ? j : 0UL ) );
6947  const size_t kend( ( IsLower<MT4>::value )
6948  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
6949  :( K ) );
6950 
6951  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6952  size_t k( kbegin );
6953 
6954  for( ; (k+2UL) <= kend; k+=2UL ) {
6955  const SIMDType b1( B.load(k ,j) );
6956  const SIMDType b2( B.load(k+1UL,j) );
6957  xmm1 += set( A(i ,k ) ) * b1;
6958  xmm2 += set( A(i+1UL,k ) ) * b1;
6959  xmm3 += set( A(i+2UL,k ) ) * b1;
6960  xmm4 += set( A(i+3UL,k ) ) * b1;
6961  xmm5 += set( A(i ,k+1UL) ) * b2;
6962  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
6963  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
6964  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
6965  }
6966 
6967  for( ; k<kend; ++k ) {
6968  const SIMDType b1( B.load(k,j) );
6969  xmm1 += set( A(i ,k) ) * b1;
6970  xmm2 += set( A(i+1UL,k) ) * b1;
6971  xmm3 += set( A(i+2UL,k) ) * b1;
6972  xmm4 += set( A(i+3UL,k) ) * b1;
6973  }
6974 
6975  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm5) * factor );
6976  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm6) * factor );
6977  (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm7) * factor );
6978  (~C).store( i+3UL, j, (~C).load(i+3UL,j) + (xmm4+xmm8) * factor );
6979  }
6980 
6981  for( ; (i+3UL) <= iend; i+=3UL )
6982  {
6983  const size_t kbegin( ( IsUpper<MT4>::value )
6984  ?( ( IsLower<MT5>::value )
6985  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6986  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6987  :( IsLower<MT5>::value ? j : 0UL ) );
6988  const size_t kend( ( IsLower<MT4>::value )
6989  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
6990  :( K ) );
6991 
6992  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6993  size_t k( kbegin );
6994 
6995  for( ; (k+2UL) <= kend; k+=2UL ) {
6996  const SIMDType b1( B.load(k ,j) );
6997  const SIMDType b2( B.load(k+1UL,j) );
6998  xmm1 += set( A(i ,k ) ) * b1;
6999  xmm2 += set( A(i+1UL,k ) ) * b1;
7000  xmm3 += set( A(i+2UL,k ) ) * b1;
7001  xmm4 += set( A(i ,k+1UL) ) * b2;
7002  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
7003  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
7004  }
7005 
7006  for( ; k<kend; ++k ) {
7007  const SIMDType b1( B.load(k,j) );
7008  xmm1 += set( A(i ,k) ) * b1;
7009  xmm2 += set( A(i+1UL,k) ) * b1;
7010  xmm3 += set( A(i+2UL,k) ) * b1;
7011  }
7012 
7013  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm4) * factor );
7014  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm5) * factor );
7015  (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm6) * factor );
7016  }
7017 
7018  for( ; (i+2UL) <= iend; i+=2UL )
7019  {
7020  const size_t kbegin( ( IsUpper<MT4>::value )
7021  ?( ( IsLower<MT5>::value )
7022  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7023  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7024  :( IsLower<MT5>::value ? j : 0UL ) );
7025  const size_t kend( ( IsLower<MT4>::value )
7026  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
7027  :( K ) );
7028 
7029  SIMDType xmm1, xmm2, xmm3, xmm4;
7030  size_t k( kbegin );
7031 
7032  for( ; (k+2UL) <= kend; k+=2UL ) {
7033  const SIMDType b1( B.load(k ,j) );
7034  const SIMDType b2( B.load(k+1UL,j) );
7035  xmm1 += set( A(i ,k ) ) * b1;
7036  xmm2 += set( A(i+1UL,k ) ) * b1;
7037  xmm3 += set( A(i ,k+1UL) ) * b2;
7038  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
7039  }
7040 
7041  for( ; k<kend; ++k ) {
7042  const SIMDType b1( B.load(k,j) );
7043  xmm1 += set( A(i ,k) ) * b1;
7044  xmm2 += set( A(i+1UL,k) ) * b1;
7045  }
7046 
7047  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
7048  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm4) * factor );
7049  }
7050 
7051  if( i < iend )
7052  {
7053  const size_t kbegin( ( IsUpper<MT4>::value )
7054  ?( ( IsLower<MT5>::value )
7055  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7056  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7057  :( IsLower<MT5>::value ? j : 0UL ) );
7058 
7059  SIMDType xmm1, xmm2;
7060  size_t k( kbegin );
7061 
      // The last single row runs k up to K (no kend narrowing is computed here).
7062  for( ; (k+2UL) <= K; k+=2UL ) {
7063  xmm1 += set( A(i,k ) ) * B.load(k ,j);
7064  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
7065  }
7066 
7067  for( ; k<K; ++k ) {
7068  xmm1 += set( A(i,k) ) * B.load(k,j);
7069  }
7070 
7071  (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
7072  }
7073  }
7074 
      // Scalar remainder loop for the columns beyond jpos (runs only if 'remainder' is set):
      // two rows per iteration plus a single-row tail, accumulating into value-initialized
      // ElementType variables. With UPP the rows end at j+1; with LOW they start at j.
7075  for( ; remainder && j<N; ++j )
7076  {
7077  const size_t iend( UPP ? j+1UL : M );
7078  size_t i( LOW ? j : 0UL );
7079 
7080  for( ; (i+2UL) <= iend; i+=2UL )
7081  {
7082  const size_t kbegin( ( IsUpper<MT4>::value )
7083  ?( ( IsLower<MT5>::value )
7084  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7085  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7086  :( IsLower<MT5>::value ? j : 0UL ) );
7087  const size_t kend( ( IsLower<MT4>::value )
7088  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
7089  :( K ) );
7090 
7091  ElementType value1 = ElementType();
7092  ElementType value2 = ElementType();
7093 
7094  for( size_t k=kbegin; k<kend; ++k ) {
7095  value1 += A(i ,k) * B(k,j);
7096  value2 += A(i+1UL,k) * B(k,j);
7097  }
7098 
7099  (~C)(i ,j) += value1 * scalar;
7100  (~C)(i+1UL,j) += value2 * scalar;
7101  }
7102 
7103  if( i < iend )
7104  {
7105  const size_t kbegin( ( IsUpper<MT4>::value )
7106  ?( ( IsLower<MT5>::value )
7107  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7108  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7109  :( IsLower<MT5>::value ? j : 0UL ) );
7110 
7111  ElementType value = ElementType();
7112 
7113  for( size_t k=kbegin; k<K; ++k ) {
7114  value += A(i,k) * B(k,j);
7115  }
7116 
7117  (~C)(i,j) += value * scalar;
7118  }
7119  }
7120  }
7121  //**********************************************************************************************
7122 
7123  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
// Small-matrix addition-assignment kernel for a column-major target. Instead of running
// its own loops, it copies one operand into its opposite storage order (OppositeType_)
// and add-assigns the resulting product, wrapped in the ForwardFunctor and scaled by
// 'scalar', to the target.
//
// NOTE(review): several lines are not present in this listing: the return-type line
// (7142), lines 7145-7148, and the conditions that open the first two branches (7152 and
// 7156). As shown, the brace structure is therefore incomplete — confirm the exact
// conditions against the upstream header.
//
// \param C The target left-hand side (column-major) dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
7138  template< typename MT3 // Type of the left-hand side target matrix
7139  , typename MT4 // Type of the left-hand side matrix operand
7140  , typename MT5 // Type of the right-hand side matrix operand
7141  , typename ST2 > // Type of the scalar value
7143  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7144  {
7149 
7150  const ForwardFunctor fwd;
7151 
      // First branch (condition elided from the listing): convert A.
7153  const OppositeType_<MT4> tmp( serial( A ) );
7154  addAssign( ~C, fwd( tmp * B ) * scalar );
7155  }
      // Second branch (condition elided from the listing): convert B.
7157  const OppositeType_<MT5> tmp( serial( B ) );
7158  addAssign( ~C, fwd( A * tmp ) * scalar );
7159  }
      // Otherwise convert whichever operand has fewer elements (A on a tie).
7160  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7161  const OppositeType_<MT4> tmp( serial( A ) );
7162  addAssign( ~C, fwd( tmp * B ) * scalar );
7163  }
7164  else {
7165  const OppositeType_<MT5> tmp( serial( B ) );
7166  addAssign( ~C, fwd( A * tmp ) * scalar );
7167  }
7168  }
7169  //**********************************************************************************************
7170 
7171  //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix addition-assignment kernel (non-vectorized variant): simply forwards all
// arguments to selectDefaultAddAssignKernel().
//
// NOTE(review): the return-type line (listing line 7189) is not present in this listing;
// presumably it holds the SFINAE condition that distinguishes this overload from the
// vectorized one — confirm against the upstream header.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
7185  template< typename MT3 // Type of the left-hand side target matrix
7186  , typename MT4 // Type of the left-hand side matrix operand
7187  , typename MT5 // Type of the right-hand side matrix operand
7188  , typename ST2 > // Type of the scalar value
7190  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7191  {
7192  selectDefaultAddAssignKernel( C, A, B, scalar );
7193  }
7194  //**********************************************************************************************
7195 
7196  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Vectorized large-matrix addition-assignment kernel: dispatches to one of the lmmm(),
// ummm() or mmm() routines depending on the LOW and UPP flags (declared outside this
// view). LOW takes precedence over UPP.
//
// NOTE(review): the return-type line (listing line 7215) is not present in this listing;
// presumably it holds the SFINAE condition selecting this vectorized overload — confirm
// against the upstream header.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
7211  template< typename MT3 // Type of the left-hand side target matrix
7212  , typename MT4 // Type of the left-hand side matrix operand
7213  , typename MT5 // Type of the right-hand side matrix operand
7214  , typename ST2 > // Type of the scalar value
7216  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7217  {
      // NOTE(review): the trailing arguments (scalar, ST2(1)) presumably are the factors
      // applied to the product and to the existing content of C, respectively, so that C is
      // accumulated into — confirm against the signatures in <blaze/math/dense/MMM.h>.
7218  if( LOW )
7219  lmmm( C, A, B, scalar, ST2(1) );
7220  else if( UPP )
7221  ummm( C, A, B, scalar, ST2(1) );
7222  else
7223  mmm( C, A, B, scalar, ST2(1) );
7224  }
7225  //**********************************************************************************************
7226 
7227  //**BLAS-based addition assignment to dense matrices (default)**********************************
// Fallback for the BLAS-based addition assignment: simply forwards all arguments to
// selectLargeAddAssignKernel().
//
// NOTE(review): the return-type line (listing line 7245) is not present in this listing;
// presumably it holds the SFINAE condition that selects this overload when the BLAS
// kernel is not applicable — confirm against the upstream header.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
7241  template< typename MT3 // Type of the left-hand side target matrix
7242  , typename MT4 // Type of the left-hand side matrix operand
7243  , typename MT5 // Type of the right-hand side matrix operand
7244  , typename ST2 > // Type of the scalar value
7246  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7247  {
7248  selectLargeAddAssignKernel( C, A, B, scalar );
7249  }
7250  //**********************************************************************************************
7251 
7252  //**BLAS-based addition assignment to dense matrices********************************************
7253 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
7254 
// BLAS-based addition-assignment kernel for C += (A * B) * scalar. It is only compiled
// when both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set (see
// the enclosing #if). A triangular operand is handled with trmm() on a temporary copy of
// the other operand, which is then added to C; otherwise gemm() is called on C directly.
//
// NOTE(review): the return-type line (listing line 7271) is not present in this listing;
// presumably it holds the SFINAE condition selecting this BLAS overload — confirm against
// the upstream header.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
7267  template< typename MT3 // Type of the left-hand side target matrix
7268  , typename MT4 // Type of the left-hand side matrix operand
7269  , typename MT5 // Type of the right-hand side matrix operand
7270  , typename ST2 > // Type of the scalar value
7272  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7273  {
      // The scalar is converted to the element type of the target before it is passed on.
7274  using ET = ElementType_<MT3>;
7275 
      // Triangular A: copy B into a temporary, apply trmm() with A from the left, then add.
7276  if( IsTriangular<MT4>::value ) {
7277  ResultType_<MT3> tmp( serial( B ) );
7278  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7279  addAssign( C, tmp );
7280  }
      // Triangular B: copy A into a temporary, apply trmm() with B from the right, then add.
7281  else if( IsTriangular<MT5>::value ) {
7282  ResultType_<MT3> tmp( serial( A ) );
7283  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7284  addAssign( C, tmp );
7285  }
      // General case. NOTE(review): ET(scalar) and ET(1) presumably are the gemm alpha and
      // beta factors, so that the existing content of C is kept and added to — confirm
      // against <blaze/math/blas/gemm.h>.
7286  else {
7287  gemm( C, A, B, ET(scalar), ET(1) );
7288  }
7289  }
7290 #endif
7291  //**********************************************************************************************
7292 
7293  //**Restructuring addition assignment to column-major matrices**********************************
/*!\brief Restructuring addition assignment of a scaled dense matrix-dense matrix multiplication
//        to a column-major matrix.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
//
// The function fetches both operands of the wrapped multiplication (rhs.matrix_) and re-issues
// the addition assignment with a re-structured product in which one or both operands are
// transposed. The re-structured product is passed through the forward functor \c fwd and is
// multiplied by rhs.scalar_ before it is handed to addAssign().
//
// NOTE(review): listing lines 7308, 7311, 7313 and 7323 are not part of this listing. In
// particular the return type line and the leading \c if of the three-way branch below are not
// visible, so the condition guarding the first addAssign() call cannot be read from here.
*/
7307  template< typename MT > // Type of the target matrix
7309  addAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
7310  {
7312 
7314 
7315  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7316  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7317 
7318  const ForwardFunctor fwd;
7319 
7320  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7321  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7322 
      // First branch (condition not visible, see note above): both operands are transposed.
7324  addAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
      // Left operand type is symmetric: only the left operand is transposed.
7325  else if( IsSymmetric<MT1>::value )
7326  addAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
      // Otherwise: only the right operand is transposed.
7327  else
7328  addAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
7329  }
7330  //**********************************************************************************************
7331 
7332  //**Addition assignment to sparse matrices******************************************************
7333  // No special implementation for the addition assignment to sparse matrices.
7334  //**********************************************************************************************
7335 
7336  //**Subtraction assignment to dense matrices****************************************************
/*!\brief Subtraction assignment of a scaled dense matrix-dense matrix multiplication to a
//        dense matrix.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
//
// The function checks the dimensions, returns early for empty problems, evaluates both operands
// of the wrapped multiplication and then delegates to selectSubAssignKernel().
//
// NOTE(review): listing lines 7350 and 7353 are not part of this listing, i.e. the return type
// line and the first statement of the body are not visible here.
*/
7348  template< typename MT // Type of the target dense matrix
7349  , bool SO > // Storage order of the target dense matrix
7351  subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7352  {
7354 
7355  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7356  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7357 
7358  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7359  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7360 
      // Nothing to subtract if the target has no rows or columns, or if the inner dimension of
      // the product (columns of the left operand) is zero.
7361  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
7362  return;
7363  }
7364 
      // LT and RT are declared outside of this view; both operands are wrapped in serial()
      // before being bound to A and B.
7365  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7366  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7367 
7368  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7369  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7370  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7371  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7372  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7373  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7374 
      // The kernel receives the evaluated operands and the scalar of the expression.
7375  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
7376  }
7377  //**********************************************************************************************
7378 
7379  //**Subtraction assignment to dense matrices (kernel selection)*********************************
7390  template< typename MT3 // Type of the left-hand side target matrix
7391  , typename MT4 // Type of the left-hand side matrix operand
7392  , typename MT5 // Type of the right-hand side matrix operand
7393  , typename ST2 > // Type of the scalar value
7394  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7395  {
7396  if( ( IsDiagonal<MT5>::value ) ||
7397  ( !BLAZE_DEBUG_MODE && B.columns() <= SIMDSIZE*10UL ) ||
7398  ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
7399  selectSmallSubAssignKernel( C, A, B, scalar );
7400  else
7401  selectBlasSubAssignKernel( C, A, B, scalar );
7402  }
7403  //**********************************************************************************************
7404 
7405  //**Default subtraction assignment to dense matrices (general/general)**************************
7419  template< typename MT3 // Type of the left-hand side target matrix
7420  , typename MT4 // Type of the left-hand side matrix operand
7421  , typename MT5 // Type of the right-hand side matrix operand
7422  , typename ST2 > // Type of the scalar value
7423  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
7424  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7425  {
7426  const ResultType tmp( serial( A * B * scalar ) );
7427  subAssign( C, tmp );
7428  }
7429  //**********************************************************************************************
7430 
7431  //**Default subtraction assignment to dense matrices (general/diagonal)*************************
/*!\brief Default subtraction assignment kernel for a non-diagonal left operand type and a
//        diagonal right operand type.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side (diagonal) matrix operand.
// \param scalar The scalar value.
//
// Only the diagonal elements B(j,j) are read: every processed element is updated as
// C(i,j) -= A(i,j) * B(j,j) * scalar.
//
// NOTE(review): listing line 7452 (first statement of the body) is not part of this listing.
*/
7445  template< typename MT3 // Type of the left-hand side target matrix
7446  , typename MT4 // Type of the left-hand side matrix operand
7447  , typename MT5 // Type of the right-hand side matrix operand
7448  , typename ST2 > // Type of the scalar value
7449  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7450  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7451  {
7453 
7454  const size_t M( A.rows() );
7455  const size_t N( B.columns() );
7456 
7457  for( size_t i=0UL; i<M; ++i )
7458  {
      // Column range [jbegin,jend) of row i, narrowed according to the upper/lower traits of
      // the left operand type; without such traits the full range [0,N) is processed.
7459  const size_t jbegin( ( IsUpper<MT4>::value )
7460  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
7461  :( 0UL ) );
7462  const size_t jend( ( IsLower<MT4>::value )
7463  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
7464  :( N ) );
7465  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7466 
      // jnum & size_t(-2) clears the lowest bit, so [jbegin,jpos) holds an even number of
      // columns for the two-way unrolled loop; a single leftover column is handled afterwards.
7467  const size_t jnum( jend - jbegin );
7468  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
7469 
7470  for( size_t j=jbegin; j<jpos; j+=2UL ) {
7471  C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7472  C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
7473  }
7474  if( jpos < jend ) {
7475  C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
7476  }
7477  }
7478  }
7479  //**********************************************************************************************
7480 
7481  //**Default subtraction assignment to dense matrices (diagonal/general)*************************
/*!\brief Default subtraction assignment kernel for a diagonal left operand type and a
//        non-diagonal right operand type.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side (diagonal) matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scalar value.
//
// Only the diagonal elements A(i,i) are read: every processed element is updated as
// C(i,j) -= A(i,i) * B(i,j) * scalar.
//
// NOTE(review): listing line 7502 (first statement of the body) is not part of this listing.
*/
7495  template< typename MT3 // Type of the left-hand side target matrix
7496  , typename MT4 // Type of the left-hand side matrix operand
7497  , typename MT5 // Type of the right-hand side matrix operand
7498  , typename ST2 > // Type of the scalar value
7499  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
7500  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7501  {
7503 
7504  const size_t M( A.rows() );
7505  const size_t N( B.columns() );
7506 
7507  for( size_t i=0UL; i<M; ++i )
7508  {
      // Column range [jbegin,jend) of row i, narrowed according to the upper/lower traits of
      // the right operand type; without such traits the full range [0,N) is processed.
7509  const size_t jbegin( ( IsUpper<MT5>::value )
7510  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
7511  :( 0UL ) );
7512  const size_t jend( ( IsLower<MT5>::value )
7513  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
7514  :( N ) );
7515  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7516 
      // jnum & size_t(-2) clears the lowest bit, so [jbegin,jpos) holds an even number of
      // columns for the two-way unrolled loop; a single leftover column is handled afterwards.
7517  const size_t jnum( jend - jbegin );
7518  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
7519 
7520  for( size_t j=jbegin; j<jpos; j+=2UL ) {
7521  C(i,j ) -= A(i,i) * B(i,j ) * scalar;
7522  C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
7523  }
7524  if( jpos < jend ) {
7525  C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
7526  }
7527  }
7528  }
7529  //**********************************************************************************************
7530 
7531  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
/*!\brief Default subtraction assignment kernel for two diagonal operand types.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side (diagonal) matrix operand.
// \param B The right-hand side (diagonal) matrix operand.
// \param scalar The scalar value.
//
// Only the diagonal of \a C is touched: C(i,i) -= A(i,i) * B(i,i) * scalar for every row index
// of \a A.
//
// NOTE(review): listing line 7552 (first statement of the body) is not part of this listing.
*/
7545  template< typename MT3 // Type of the left-hand side target matrix
7546  , typename MT4 // Type of the left-hand side matrix operand
7547  , typename MT5 // Type of the right-hand side matrix operand
7548  , typename ST2 > // Type of the scalar value
7549  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
7550  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7551  {
7553 
7554  for( size_t i=0UL; i<A.rows(); ++i ) {
7555  C(i,i) -= A(i,i) * B(i,i) * scalar;
7556  }
7557  }
7558  //**********************************************************************************************
7559 
7560  //**Default subtraction assignment to dense matrices (small matrices)***************************
/*!\brief Fallback overload of the small-matrix subtraction assignment kernel.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scalar value.
//
// The visible body does nothing but forward all four arguments, unchanged, to
// selectDefaultSubAssignKernel().
//
// NOTE(review): the listing jumps from line 7577 to line 7579, i.e. the line that would carry
// the return type (and any condition selecting this overload) is not visible here.
*/
7574  template< typename MT3 // Type of the left-hand side target matrix
7575  , typename MT4 // Type of the left-hand side matrix operand
7576  , typename MT5 // Type of the right-hand side matrix operand
7577  , typename ST2 > // Type of the scalar value
7579  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7580  {
7581  selectDefaultSubAssignKernel( C, A, B, scalar );
7582  }
7583  //**********************************************************************************************
7584 
7585  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
/*!\brief Vectorized small-matrix kernel for the subtraction assignment to a row-major dense
//        matrix.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side matrix operand.
// \param B The right-hand side matrix operand.
// \param scalar The scalar value.
//
// For every processed element the kernel accumulates the sum over k of A(i,k) * B(k,j) and
// subtracts that sum, multiplied by \a scalar, from C(i,j). The columns of \a C are walked in
// panels of 8, 5, 4, 3, 2 and 1 SIMD vectors (in that order, each loop continuing at the column
// index \c j where the previous one stopped); columns beyond \c jpos are handled element-wise.
// Within a panel several rows are processed together where possible.
//
// In every section the k-range [kbegin,kend) is narrowed according to the IsUpper/IsLower (and
// strict) traits of the operand types, presumably to skip elements that are known to be zero.
// LOW and UPP are not declared in this view; presumably they are the lower/upper flags of the
// enclosing expression -- confirm against the class declaration.
//
// NOTE(review): the SIMD accumulators (xmm1, xmm2, ...) are declared without an initializer and
// are immediately used with +=; this relies on SIMDType default-construction yielding zero --
// confirm against the SIMD type definitions.
//
// NOTE(review): listing lines 7604 (return type line) and 7622 (the statement guarding the first
// braced block of the body) are not part of this listing.
*/
7600  template< typename MT3 // Type of the left-hand side target matrix
7601  , typename MT4 // Type of the left-hand side matrix operand
7602  , typename MT5 // Type of the right-hand side matrix operand
7603  , typename ST2 > // Type of the scalar value
7605  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7606  {
      // A scalar remainder loop is required as soon as the target or the right operand type is
      // not padded.
7607  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
7608 
7609  const size_t M( A.rows() );
7610  const size_t N( B.columns() );
7611  const size_t K( A.columns() );
7612 
7613  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7614 
      // With a remainder, jpos is N rounded down to a multiple of SIMDSIZE (checked by the
      // assertion below); otherwise all N columns are processed with SIMD operations.
7615  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
7616  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
7617 
      // The scalar broadcast into a SIMD vector; applied to the accumulated sums before the
      // subtraction from C.
7618  const SIMDType factor( set( scalar ) );
7619 
7620  size_t j( 0UL );
7621 
      // Panels of 8 SIMD vectors, one row at a time. The statement guarding this braced block
      // (listing line 7622) is not visible. Only entered if neither LOW nor UPP is set.
7623  {
7624  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7625  for( size_t i=0UL; i<M; ++i )
7626  {
7627  const size_t kbegin( ( IsUpper<MT4>::value )
7628  ?( ( IsLower<MT5>::value )
7629  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7630  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7631  :( IsLower<MT5>::value ? j : 0UL ) );
7632  const size_t kend( ( IsLower<MT4>::value )
7633  ?( ( IsUpper<MT5>::value )
7634  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
7635  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
7636  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
7637 
7638  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7639 
7640  for( size_t k=kbegin; k<kend; ++k ) {
7641  const SIMDType a1( set( A(i,k) ) );
7642  xmm1 += a1 * B.load(k,j );
7643  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7644  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7645  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7646  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7647  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7648  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7649  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
7650  }
7651 
7652  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7653  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
7654  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7655  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7656  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
7657  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
7658  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
7659  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
7660  }
7661  }
7662  }
7663 
      // Panels of 5 SIMD vectors: rows are processed in pairs (10 accumulators), a single
      // leftover row is handled after the pair loop. Only entered if neither LOW nor UPP is set.
7664  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7665  {
7666  size_t i( 0UL );
7667 
7668  for( ; (i+2UL) <= M; i+=2UL )
7669  {
7670  const size_t kbegin( ( IsUpper<MT4>::value )
7671  ?( ( IsLower<MT5>::value )
7672  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7673  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7674  :( IsLower<MT5>::value ? j : 0UL ) );
7675  const size_t kend( ( IsLower<MT4>::value )
7676  ?( ( IsUpper<MT5>::value )
7677  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
7678  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
7679  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
7680 
7681  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7682 
7683  for( size_t k=kbegin; k<kend; ++k ) {
7684  const SIMDType a1( set( A(i ,k) ) );
7685  const SIMDType a2( set( A(i+1UL,k) ) );
7686  const SIMDType b1( B.load(k,j ) );
7687  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7688  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7689  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7690  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7691  xmm1 += a1 * b1;
7692  xmm2 += a1 * b2;
7693  xmm3 += a1 * b3;
7694  xmm4 += a1 * b4;
7695  xmm5 += a1 * b5;
7696  xmm6 += a2 * b1;
7697  xmm7 += a2 * b2;
7698  xmm8 += a2 * b3;
7699  xmm9 += a2 * b4;
7700  xmm10 += a2 * b5;
7701  }
7702 
7703  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7704  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
7705  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7706  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
7707  (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
7708  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm6 * factor );
7709  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
7710  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
7711  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
7712  (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
7713  }
7714 
7715  if( i < M )
7716  {
7717  const size_t kbegin( ( IsUpper<MT4>::value )
7718  ?( ( IsLower<MT5>::value )
7719  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7720  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7721  :( IsLower<MT5>::value ? j : 0UL ) );
7722  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
7723 
7724  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7725 
7726  for( size_t k=kbegin; k<kend; ++k ) {
7727  const SIMDType a1( set( A(i,k) ) );
7728  xmm1 += a1 * B.load(k,j );
7729  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7730  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7731  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7732  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7733  }
7734 
7735  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7736  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
7737  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7738  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7739  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
7740  }
7741  }
7742 
      // Panels of 4 SIMD vectors: rows in pairs (8 accumulators), then a single leftover row.
      // Only entered if neither LOW nor UPP is set.
7743  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7744  {
7745  size_t i( 0UL );
7746 
7747  for( ; (i+2UL) <= M; i+=2UL )
7748  {
7749  const size_t kbegin( ( IsUpper<MT4>::value )
7750  ?( ( IsLower<MT5>::value )
7751  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7752  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7753  :( IsLower<MT5>::value ? j : 0UL ) );
7754  const size_t kend( ( IsLower<MT4>::value )
7755  ?( ( IsUpper<MT5>::value )
7756  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
7757  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
7758  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
7759 
7760  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7761 
7762  for( size_t k=kbegin; k<kend; ++k ) {
7763  const SIMDType a1( set( A(i ,k) ) );
7764  const SIMDType a2( set( A(i+1UL,k) ) );
7765  const SIMDType b1( B.load(k,j ) );
7766  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7767  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7768  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7769  xmm1 += a1 * b1;
7770  xmm2 += a1 * b2;
7771  xmm3 += a1 * b3;
7772  xmm4 += a1 * b4;
7773  xmm5 += a2 * b1;
7774  xmm6 += a2 * b2;
7775  xmm7 += a2 * b3;
7776  xmm8 += a2 * b4;
7777  }
7778 
7779  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7780  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
7781  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7782  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
7783  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
7784  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
7785  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
7786  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
7787  }
7788 
7789  if( i < M )
7790  {
7791  const size_t kbegin( ( IsUpper<MT4>::value )
7792  ?( ( IsLower<MT5>::value )
7793  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7794  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7795  :( IsLower<MT5>::value ? j : 0UL ) );
7796  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
7797 
7798  SIMDType xmm1, xmm2, xmm3, xmm4;
7799 
7800  for( size_t k=kbegin; k<kend; ++k ) {
7801  const SIMDType a1( set( A(i,k) ) );
7802  xmm1 += a1 * B.load(k,j );
7803  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7804  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7805  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7806  }
7807 
7808  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7809  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
7810  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7811  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
7812  }
7813  }
7814 
      // Panels of 3 SIMD vectors: rows in pairs (6 accumulators), then a single leftover row.
      // Only entered if neither LOW nor UPP is set.
7815  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
7816  {
7817  size_t i( 0UL );
7818 
7819  for( ; (i+2UL) <= M; i+=2UL )
7820  {
7821  const size_t kbegin( ( IsUpper<MT4>::value )
7822  ?( ( IsLower<MT5>::value )
7823  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7824  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7825  :( IsLower<MT5>::value ? j : 0UL ) );
7826  const size_t kend( ( IsLower<MT4>::value )
7827  ?( ( IsUpper<MT5>::value )
7828  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
7829  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
7830  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
7831 
7832  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7833 
7834  for( size_t k=kbegin; k<kend; ++k ) {
7835  const SIMDType a1( set( A(i ,k) ) );
7836  const SIMDType a2( set( A(i+1UL,k) ) );
7837  const SIMDType b1( B.load(k,j ) );
7838  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7839  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7840  xmm1 += a1 * b1;
7841  xmm2 += a1 * b2;
7842  xmm3 += a1 * b3;
7843  xmm4 += a2 * b1;
7844  xmm5 += a2 * b2;
7845  xmm6 += a2 * b3;
7846  }
7847 
7848  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7849  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
7850  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
7851  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm4 * factor );
7852  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
7853  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
7854  }
7855 
7856  if( i < M )
7857  {
7858  const size_t kbegin( ( IsUpper<MT4>::value )
7859  ?( ( IsLower<MT5>::value )
7860  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7861  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7862  :( IsLower<MT5>::value ? j : 0UL ) );
7863  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
7864 
7865  SIMDType xmm1, xmm2, xmm3;
7866 
7867  for( size_t k=kbegin; k<kend; ++k ) {
7868  const SIMDType a1( set( A(i,k) ) );
7869  xmm1 += a1 * B.load(k,j );
7870  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7871  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7872  }
7873 
7874  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7875  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
7876  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
7877  }
7878  }
7879 
      // Panels of 2 SIMD vectors; skipped only if LOW and UPP are both set. The row range is
      // restricted here: it starts at j for LOW and ends at min(j+SIMDSIZE*2,M) for UPP. Rows
      // are processed in groups of 4, then 3, then 2, then a single leftover row.
7880  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
7881  {
7882  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
7883  size_t i( LOW ? j : 0UL );
7884 
7885  for( ; (i+4UL) <= iend; i+=4UL )
7886  {
7887  const size_t kbegin( ( IsUpper<MT4>::value )
7888  ?( ( IsLower<MT5>::value )
7889  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7890  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7891  :( IsLower<MT5>::value ? j : 0UL ) );
7892  const size_t kend( ( IsLower<MT4>::value )
7893  ?( ( IsUpper<MT5>::value )
7894  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
7895  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
7896  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
7897 
7898  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7899 
7900  for( size_t k=kbegin; k<kend; ++k ) {
7901  const SIMDType a1( set( A(i ,k) ) );
7902  const SIMDType a2( set( A(i+1UL,k) ) );
7903  const SIMDType a3( set( A(i+2UL,k) ) );
7904  const SIMDType a4( set( A(i+3UL,k) ) );
7905  const SIMDType b1( B.load(k,j ) );
7906  const SIMDType b2( B.load(k,j+SIMDSIZE) );
7907  xmm1 += a1 * b1;
7908  xmm2 += a1 * b2;
7909  xmm3 += a2 * b1;
7910  xmm4 += a2 * b2;
7911  xmm5 += a3 * b1;
7912  xmm6 += a3 * b2;
7913  xmm7 += a4 * b1;
7914  xmm8 += a4 * b2;
7915  }
7916 
7917  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7918  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
7919  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7920  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
7921  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
7922  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
7923  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
7924  (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
7925  }
7926 
7927  for( ; (i+3UL) <= iend; i+=3UL )
7928  {
7929  const size_t kbegin( ( IsUpper<MT4>::value )
7930  ?( ( IsLower<MT5>::value )
7931  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7932  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7933  :( IsLower<MT5>::value ? j : 0UL ) );
7934  const size_t kend( ( IsLower<MT4>::value )
7935  ?( ( IsUpper<MT5>::value )
7936  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
7937  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
7938  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
7939 
7940  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7941 
7942  for( size_t k=kbegin; k<kend; ++k ) {
7943  const SIMDType a1( set( A(i ,k) ) );
7944  const SIMDType a2( set( A(i+1UL,k) ) );
7945  const SIMDType a3( set( A(i+2UL,k) ) );
7946  const SIMDType b1( B.load(k,j ) );
7947  const SIMDType b2( B.load(k,j+SIMDSIZE) );
7948  xmm1 += a1 * b1;
7949  xmm2 += a1 * b2;
7950  xmm3 += a2 * b1;
7951  xmm4 += a2 * b2;
7952  xmm5 += a3 * b1;
7953  xmm6 += a3 * b2;
7954  }
7955 
7956  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7957  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
7958  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7959  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
7960  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
7961  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
7962  }
7963 
      // Two rows at a time with the k-loop additionally unrolled by two: even and odd k steps
      // use separate accumulators (xmm1-4 and xmm5-8), which are summed before the store.
7964  for( ; (i+2UL) <= iend; i+=2UL )
7965  {
7966  const size_t kbegin( ( IsUpper<MT4>::value )
7967  ?( ( IsLower<MT5>::value )
7968  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7969  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7970  :( IsLower<MT5>::value ? j : 0UL ) );
7971  const size_t kend( ( IsLower<MT4>::value )
7972  ?( ( IsUpper<MT5>::value )
7973  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
7974  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
7975  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
7976 
7977  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7978  size_t k( kbegin );
7979 
7980  for( ; (k+2UL) <= kend; k+=2UL ) {
7981  const SIMDType a1( set( A(i ,k ) ) );
7982  const SIMDType a2( set( A(i+1UL,k ) ) );
7983  const SIMDType a3( set( A(i ,k+1UL) ) );
7984  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
7985  const SIMDType b1( B.load(k ,j ) );
7986  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
7987  const SIMDType b3( B.load(k+1UL,j ) );
7988  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
7989  xmm1 += a1 * b1;
7990  xmm2 += a1 * b2;
7991  xmm3 += a2 * b1;
7992  xmm4 += a2 * b2;
7993  xmm5 += a3 * b3;
7994  xmm6 += a3 * b4;
7995  xmm7 += a4 * b3;
7996  xmm8 += a4 * b4;
7997  }
7998 
7999  for( ; k<kend; ++k ) {
8000  const SIMDType a1( set( A(i ,k) ) );
8001  const SIMDType a2( set( A(i+1UL,k) ) );
8002  const SIMDType b1( B.load(k,j ) );
8003  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8004  xmm1 += a1 * b1;
8005  xmm2 += a1 * b2;
8006  xmm3 += a2 * b1;
8007  xmm4 += a2 * b2;
8008  }
8009 
8010  (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
8011  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - (xmm2+xmm6) * factor );
8012  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - (xmm3+xmm7) * factor );
8013  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - (xmm4+xmm8) * factor );
8014  }
8015 
8016  if( i < iend )
8017  {
8018  const size_t kbegin( ( IsUpper<MT4>::value )
8019  ?( ( IsLower<MT5>::value )
8020  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8021  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8022  :( IsLower<MT5>::value ? j : 0UL ) );
8023  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8024 
8025  SIMDType xmm1, xmm2, xmm3, xmm4;
8026  size_t k( kbegin );
8027 
8028  for( ; (k+2UL) <= kend; k+=2UL ) {
8029  const SIMDType a1( set( A(i,k ) ) );
8030  const SIMDType a2( set( A(i,k+1UL) ) );
8031  xmm1 += a1 * B.load(k ,j );
8032  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
8033  xmm3 += a2 * B.load(k+1UL,j );
8034  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
8035  }
8036 
8037  for( ; k<kend; ++k ) {
8038  const SIMDType a1( set( A(i,k) ) );
8039  xmm1 += a1 * B.load(k,j );
8040  xmm2 += a1 * B.load(k,j+SIMDSIZE);
8041  }
8042 
8043  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
8044  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - (xmm2+xmm4) * factor );
8045  }
8046  }
8047 
      // Single SIMD vector columns up to jpos. The row range starts at j for LOW and is capped
      // at min(j+SIMDSIZE,M) only if LOW and UPP are both set. Rows are processed in groups of
      // 4, 3, 2 and 1, each with the k-loop unrolled by two.
8048  for( ; j<jpos; j+=SIMDSIZE )
8049  {
8050  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
8051  size_t i( LOW ? j : 0UL );
8052 
8053  for( ; (i+4UL) <= iend; i+=4UL )
8054  {
8055  const size_t kbegin( ( IsUpper<MT4>::value )
8056  ?( ( IsLower<MT5>::value )
8057  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8058  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8059  :( IsLower<MT5>::value ? j : 0UL ) );
8060  const size_t kend( ( IsLower<MT4>::value )
8061  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
8062  :( K ) );
8063 
8064  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8065  size_t k( kbegin );
8066 
8067  for( ; (k+2UL) <= kend; k+=2UL ) {
8068  const SIMDType b1( B.load(k ,j) );
8069  const SIMDType b2( B.load(k+1UL,j) );
8070  xmm1 += set( A(i ,k ) ) * b1;
8071  xmm2 += set( A(i+1UL,k ) ) * b1;
8072  xmm3 += set( A(i+2UL,k ) ) * b1;
8073  xmm4 += set( A(i+3UL,k ) ) * b1;
8074  xmm5 += set( A(i ,k+1UL) ) * b2;
8075  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
8076  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
8077  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
8078  }
8079 
8080  for( ; k<kend; ++k ) {
8081  const SIMDType b1( B.load(k,j) );
8082  xmm1 += set( A(i ,k) ) * b1;
8083  xmm2 += set( A(i+1UL,k) ) * b1;
8084  xmm3 += set( A(i+2UL,k) ) * b1;
8085  xmm4 += set( A(i+3UL,k) ) * b1;
8086  }
8087 
8088  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm5) * factor );
8089  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm6) * factor );
8090  (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm7) * factor );
8091  (~C).store( i+3UL, j, (~C).load(i+3UL,j) - (xmm4+xmm8) * factor );
8092  }
8093 
8094  for( ; (i+3UL) <= iend; i+=3UL )
8095  {
8096  const size_t kbegin( ( IsUpper<MT4>::value )
8097  ?( ( IsLower<MT5>::value )
8098  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8099  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8100  :( IsLower<MT5>::value ? j : 0UL ) );
8101  const size_t kend( ( IsLower<MT4>::value )
8102  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
8103  :( K ) );
8104 
8105  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8106  size_t k( kbegin );
8107 
8108  for( ; (k+2UL) <= kend; k+=2UL ) {
8109  const SIMDType b1( B.load(k ,j) );
8110  const SIMDType b2( B.load(k+1UL,j) );
8111  xmm1 += set( A(i ,k ) ) * b1;
8112  xmm2 += set( A(i+1UL,k ) ) * b1;
8113  xmm3 += set( A(i+2UL,k ) ) * b1;
8114  xmm4 += set( A(i ,k+1UL) ) * b2;
8115  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
8116  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
8117  }
8118 
8119  for( ; k<kend; ++k ) {
8120  const SIMDType b1( B.load(k,j) );
8121  xmm1 += set( A(i ,k) ) * b1;
8122  xmm2 += set( A(i+1UL,k) ) * b1;
8123  xmm3 += set( A(i+2UL,k) ) * b1;
8124  }
8125 
8126  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm4) * factor );
8127  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm5) * factor );
8128  (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm6) * factor );
8129  }
8130 
8131  for( ; (i+2UL) <= iend; i+=2UL )
8132  {
8133  const size_t kbegin( ( IsUpper<MT4>::value )
8134  ?( ( IsLower<MT5>::value )
8135  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8136  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8137  :( IsLower<MT5>::value ? j : 0UL ) );
8138  const size_t kend( ( IsLower<MT4>::value )
8139  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8140  :( K ) );
8141 
8142  SIMDType xmm1, xmm2, xmm3, xmm4;
8143  size_t k( kbegin );
8144 
8145  for( ; (k+2UL) <= kend; k+=2UL ) {
8146  const SIMDType b1( B.load(k ,j) );
8147  const SIMDType b2( B.load(k+1UL,j) );
8148  xmm1 += set( A(i ,k ) ) * b1;
8149  xmm2 += set( A(i+1UL,k ) ) * b1;
8150  xmm3 += set( A(i ,k+1UL) ) * b2;
8151  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
8152  }
8153 
8154  for( ; k<kend; ++k ) {
8155  const SIMDType b1( B.load(k,j) );
8156  xmm1 += set( A(i ,k) ) * b1;
8157  xmm2 += set( A(i+1UL,k) ) * b1;
8158  }
8159 
8160  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
8161  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm4) * factor );
8162  }
8163 
      // Single leftover row: no kend is computed here, the k-loops run up to K.
8164  if( i < iend )
8165  {
8166  const size_t kbegin( ( IsUpper<MT4>::value )
8167  ?( ( IsLower<MT5>::value )
8168  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8169  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8170  :( IsLower<MT5>::value ? j : 0UL ) );
8171 
8172  SIMDType xmm1, xmm2;
8173  size_t k( kbegin );
8174 
8175  for( ; (k+2UL) <= K; k+=2UL ) {
8176  xmm1 += set( A(i,k ) ) * B.load(k ,j);
8177  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
8178  }
8179 
8180  for( ; k<K; ++k ) {
8181  xmm1 += set( A(i,k) ) * B.load(k,j);
8182  }
8183 
8184  (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
8185  }
8186  }
8187 
      // Element-wise handling of the columns in [jpos,N); only active if a remainder is
      // required. Rows are processed in pairs, followed by a single leftover row, using plain
      // ElementType accumulators and the unconverted scalar.
8188  for( ; remainder && j<N; ++j )
8189  {
8190  const size_t iend( UPP ? j+1UL : M );
8191  size_t i( LOW ? j : 0UL );
8192 
8193  for( ; (i+2UL) <= iend; i+=2UL )
8194  {
8195  const size_t kbegin( ( IsUpper<MT4>::value )
8196  ?( ( IsLower<MT5>::value )
8197  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8198  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8199  :( IsLower<MT5>::value ? j : 0UL ) );
8200  const size_t kend( ( IsLower<MT4>::value )
8201  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8202  :( K ) );
8203 
8204  ElementType value1 = ElementType();
8205  ElementType value2 = ElementType();
8206 
8207  for( size_t k=kbegin; k<kend; ++k ) {
8208  value1 += A(i ,k) * B(k,j);
8209  value2 += A(i+1UL,k) * B(k,j);
8210  }
8211 
8212  (~C)(i ,j) -= value1 * scalar;
8213  (~C)(i+1UL,j) -= value2 * scalar;
8214  }
8215 
8216  if( i < iend )
8217  {
8218  const size_t kbegin( ( IsUpper<MT4>::value )
8219  ?( ( IsLower<MT5>::value )
8220  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8221  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8222  :( IsLower<MT5>::value ? j : 0UL ) );
8223 
8224  ElementType value = ElementType();
8225 
8226  for( size_t k=kbegin; k<K; ++k ) {
8227  value += A(i,k) * B(k,j);
8228  }
8229 
8230  (~C)(i,j) -= value * scalar;
8231  }
8232  }
8233  }
8234  //**********************************************************************************************
8235 
8236  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
// Small-matrix subtraction-assignment kernel for a column-major target (DenseMatrix<MT3,true>).
// Every visible branch copies one operand into a temporary of its OppositeType_ (evaluated
// through serial()) and subtracts the scaled product from C via subAssign().
// NOTE(review): the listing numbers skip 8254, 8257-8260, 8264 and 8268, so the return type,
// the opening statements and the first two branch conditions are not visible in this excerpt.
8250  template< typename MT3 // Type of the left-hand side target matrix
8251  , typename MT4 // Type of the left-hand side matrix operand
8252  , typename MT5 // Type of the right-hand side matrix operand
8253  , typename ST2 > // Type of the scalar value
8255  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8256  {
8261 
8262  const ForwardFunctor fwd;
8263 
// First branch (condition elided): the left operand A is converted.
8265  const OppositeType_<MT4> tmp( serial( A ) );
8266  subAssign( ~C, fwd( tmp * B ) * scalar );
8267  }
// Second branch (condition elided): the right operand B is converted.
8269  const OppositeType_<MT5> tmp( serial( B ) );
8270  subAssign( ~C, fwd( A * tmp ) * scalar );
8271  }
// Otherwise convert the operand with fewer elements; A is chosen on a tie.
8272  else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
8273  const OppositeType_<MT4> tmp( serial( A ) );
8274  subAssign( ~C, fwd( tmp * B ) * scalar );
8275  }
8276  else {
8277  const OppositeType_<MT5> tmp( serial( B ) );
8278  subAssign( ~C, fwd( A * tmp ) * scalar );
8279  }
8280  }
8281  //**********************************************************************************************
8282 
8283  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Fallback large-matrix subtraction-assignment kernel: forwards all arguments unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): listing line 8301 (return type / enabling condition) is not visible here.
8297  template< typename MT3 // Type of the left-hand side target matrix
8298  , typename MT4 // Type of the left-hand side matrix operand
8299  , typename MT5 // Type of the right-hand side matrix operand
8300  , typename ST2 > // Type of the scalar value
8302  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8303  {
8304  selectDefaultSubAssignKernel( C, A, B, scalar );
8305  }
8306  //**********************************************************************************************
8307 
8308  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix subtraction-assignment kernel that dispatches on the LOW and UPP flags to
// lmmm(), ummm() or mmm(). The scalar is negated and ST2(1) is passed as the last argument.
// NOTE(review): presumably these helpers compute C = (-scalar)*A*B + 1*C, restricted to the
// lower/upper part for lmmm/ummm -- confirm against blaze/math/dense/MMM.h.
// NOTE(review): listing line 8327 (return type / enabling condition) is not visible here.
8323  template< typename MT3 // Type of the left-hand side target matrix
8324  , typename MT4 // Type of the left-hand side matrix operand
8325  , typename MT5 // Type of the right-hand side matrix operand
8326  , typename ST2 > // Type of the scalar value
8328  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8329  {
8330  if( LOW )
8331  lmmm( C, A, B, -scalar, ST2(1) );
8332  else if( UPP )
8333  ummm( C, A, B, -scalar, ST2(1) );
8334  else
8335  mmm( C, A, B, -scalar, ST2(1) );
8336  }
8337  //**********************************************************************************************
8338 
8339  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Default "BLAS" subtraction-assignment kernel: no BLAS call is made here, the work is
// forwarded unchanged to selectLargeSubAssignKernel().
// NOTE(review): listing line 8357 (return type / enabling condition) is not visible here.
8353  template< typename MT3 // Type of the left-hand side target matrix
8354  , typename MT4 // Type of the left-hand side matrix operand
8355  , typename MT5 // Type of the right-hand side matrix operand
8356  , typename ST2 > // Type of the scalar value
8358  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8359  {
8360  selectLargeSubAssignKernel( C, A, B, scalar );
8361  }
8362  //**********************************************************************************************
8363 
8364  //**BLAS-based subtraction assignment to dense matrices*****************************************
8365 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
8366 
// BLAS-backed subtraction-assignment kernel (only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both set). Triangular operands go through
// trmm() on a temporary that is then subtracted from C; everything else uses gemm() directly.
// NOTE(review): listing line 8383 (return type / enabling condition) is not visible here.
8379  template< typename MT3 // Type of the left-hand side target matrix
8380  , typename MT4 // Type of the left-hand side matrix operand
8381  , typename MT5 // Type of the right-hand side matrix operand
8382  , typename ST2 > // Type of the scalar value
8384  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
8385  {
8386  using ET = ElementType_<MT3>;
8387 
8388  if( IsTriangular<MT4>::value ) {
// A is triangular: tmp starts as a copy of B and is handed to trmm() with A on the left.
// NOTE(review): presumably trmm() overwrites tmp with scalar*A*tmp -- confirm in blas/trmm.h.
8389  ResultType_<MT3> tmp( serial( B ) );
8390  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8391  subAssign( C, tmp );
8392  }
8393  else if( IsTriangular<MT5>::value ) {
// B is triangular: tmp starts as a copy of A and is handed to trmm() with B on the right.
8394  ResultType_<MT3> tmp( serial( A ) );
8395  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
8396  subAssign( C, tmp );
8397  }
8398  else {
// General case: the negated scalar and ET(1) are passed as the two trailing factors.
// NOTE(review): presumably C = (-scalar)*A*B + 1*C -- confirm in blas/gemm.h.
8399  gemm( C, A, B, ET(-scalar), ET(1) );
8400  }
8401  }
8402 #endif
8403  //**********************************************************************************************
8404 
8405  //**Restructuring subtraction assignment to column-major matrices*******************************
// Restructuring subtraction assignment of a scaled matrix product to a column-major target.
// The product is rebuilt with one or both operands wrapped in trans(), wrapped by the
// ForwardFunctor, multiplied by rhs.scalar_ again and passed to subAssign().
// NOTE(review): listing lines 8420, 8423, 8425 and 8435 are elided, so the return type, two
// statements at the top of the body and the condition of the first branch are not visible.
8419  template< typename MT > // Type of the target matrix
8421  subAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8422  {
8424 
8426 
8427  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8428  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8429 
8430  const ForwardFunctor fwd;
8431 
8432  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8433  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8434 
// First branch (condition elided): both operands are transposed.
8436  subAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// MT1 symmetric: only the left operand is transposed.
8437  else if( IsSymmetric<MT1>::value )
8438  subAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Otherwise only the right operand is transposed.
8439  else
8440  subAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8441  }
8442  //**********************************************************************************************
8443 
8444  //**Subtraction assignment to sparse matrices***************************************************
8445  // No special implementation for the subtraction assignment to sparse matrices.
8446  //**********************************************************************************************
8447 
8448  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of a scaled matrix product to a dense matrix. The expression is
// first evaluated into a ResultType temporary (through serial()), and that temporary is then
// passed to schurAssign() together with the target.
// NOTE(review): listing lines 8464 and 8466-8468 are elided from this excerpt.
8460  template< typename MT // Type of the target dense matrix
8461  , bool SO > // Storage order of the target dense matrix
8462  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8463  {
8465 
8469 
8470  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8471  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8472 
8473  const ResultType tmp( serial( rhs ) );
8474  schurAssign( ~lhs, tmp );
8475  }
8476  //**********************************************************************************************
8477 
8478  //**Schur product assignment to sparse matrices*************************************************
8479  // No special implementation for the Schur product assignment to sparse matrices.
8480  //**********************************************************************************************
8481 
8482  //**Multiplication assignment to dense matrices*************************************************
8483  // No special implementation for the multiplication assignment to dense matrices.
8484  //**********************************************************************************************
8485 
8486  //**Multiplication assignment to sparse matrices************************************************
8487  // No special implementation for the multiplication assignment to sparse matrices.
8488  //**********************************************************************************************
8489 
8490  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a scaled matrix product to a dense matrix. Both operands are evaluated
// into LT/RT objects and the product A * B * rhs.scalar_ is handed to smpAssign().
// NOTE(review): listing lines 8507 and 8510 (return type, first body statement) are elided.
8505  template< typename MT // Type of the target dense matrix
8506  , bool SO > // Storage order of the target dense matrix
8508  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8509  {
8511 
8512  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8513  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8514 
8515  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8516  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8517 
// An empty target needs no work; an empty inner dimension means the target is only reset.
8518  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
8519  return;
8520  }
8521  else if( left.columns() == 0UL ) {
8522  reset( ~lhs );
8523  return;
8524  }
8525 
8526  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8527  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8528 
8529  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8530  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8531  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8532  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8533  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8534  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8535 
8536  smpAssign( ~lhs, A * B * rhs.scalar_ );
8537  }
8538  //**********************************************************************************************
8539 
8540  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a scaled matrix product to a sparse matrix (per the section banner and
// the template-parameter comments). The expression is evaluated into a TmpType temporary,
// which is then passed through the ForwardFunctor to smpAssign().
// NOTE(review): listing lines 8557-8558, 8560, 8562 and 8564-8569 are elided, so the function
// signature and the definition of TmpType are not visible in this excerpt.
8555  template< typename MT // Type of the target sparse matrix
8556  , bool SO > // Storage order of the target sparse matrix
8559  {
8561 
8563 
8570 
8571  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8572  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8573 
8574  const ForwardFunctor fwd;
8575 
8576  const TmpType tmp( rhs );
8577  smpAssign( ~lhs, fwd( tmp ) );
8578  }
8579  //**********************************************************************************************
8580 
8581  //**Restructuring SMP assignment to column-major matrices***************************************
// Restructuring SMP assignment of a scaled matrix product to a column-major target.
// The product is rebuilt with one or both operands wrapped in trans(), wrapped by the
// ForwardFunctor, multiplied by rhs.scalar_ again and passed to smpAssign().
// NOTE(review): listing lines 8596, 8599, 8601 and 8611 are elided, so the return type, two
// statements at the top of the body and the condition of the first branch are not visible.
8595  template< typename MT > // Type of the target matrix
8597  smpAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8598  {
8600 
8602 
8603  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8604  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8605 
8606  const ForwardFunctor fwd;
8607 
8608  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8609  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8610 
// First branch (condition elided): both operands are transposed.
8612  smpAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// MT1 symmetric: only the left operand is transposed.
8613  else if( IsSymmetric<MT1>::value )
8614  smpAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Otherwise only the right operand is transposed.
8615  else
8616  smpAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8617  }
8618  //**********************************************************************************************
8619 
8620  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a scaled matrix product to a dense matrix (per the section
// banner). Both operands are evaluated into LT/RT objects and A * B * rhs.scalar_ is handed
// to smpAddAssign().
// NOTE(review): listing lines 8637-8638 and 8640 are elided, so the function signature is
// not visible in this excerpt.
8635  template< typename MT // Type of the target dense matrix
8636  , bool SO > // Storage order of the target dense matrix
8639  {
8641 
8642  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8643  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8644 
8645  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8646  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8647 
// Nothing is added when the target is empty or the inner dimension is zero.
8648  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8649  return;
8650  }
8651 
8652  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8653  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8654 
8655  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8656  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8657  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8658  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8659  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8660  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8661 
8662  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
8663  }
8664  //**********************************************************************************************
8665 
8666  //**Restructuring SMP addition assignment to column-major matrices******************************
// Restructuring SMP addition assignment of a scaled matrix product to a column-major target.
// The product is rebuilt with one or both operands wrapped in trans(), wrapped by the
// ForwardFunctor, multiplied by rhs.scalar_ again and passed to smpAddAssign().
// NOTE(review): listing lines 8681, 8684, 8686 and 8696 are elided, so the return type, two
// statements at the top of the body and the condition of the first branch are not visible.
8680  template< typename MT > // Type of the target matrix
8682  smpAddAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8683  {
8685 
8687 
8688  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8689  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8690 
8691  const ForwardFunctor fwd;
8692 
8693  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8694  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8695 
// First branch (condition elided): both operands are transposed.
8697  smpAddAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// MT1 symmetric: only the left operand is transposed.
8698  else if( IsSymmetric<MT1>::value )
8699  smpAddAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Otherwise only the right operand is transposed.
8700  else
8701  smpAddAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8702  }
8703  //**********************************************************************************************
8704 
8705  //**SMP addition assignment to sparse matrices**************************************************
8706  // No special implementation for the SMP addition assignment to sparse matrices.
8707  //**********************************************************************************************
8708 
8709  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a scaled matrix product to a dense matrix (per the section
// banner). Both operands are evaluated into LT/RT objects and A * B * rhs.scalar_ is handed
// to smpSubAssign().
// NOTE(review): listing lines 8726-8727 and 8729 are elided, so the function signature is
// not visible in this excerpt.
8724  template< typename MT // Type of the target dense matrix
8725  , bool SO > // Storage order of the target dense matrix
8728  {
8730 
8731  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8732  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8733 
8734  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8735  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8736 
// Nothing is subtracted when the target is empty or the inner dimension is zero.
8737  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
8738  return;
8739  }
8740 
8741  LT A( left ); // Evaluation of the left-hand side dense matrix operand
8742  RT B( right ); // Evaluation of the right-hand side dense matrix operand
8743 
8744  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
8745  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
8746  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
8747  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
8748  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
8749  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
8750 
8751  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
8752  }
8753  //**********************************************************************************************
8754 
8755  //**Restructuring SMP subtraction assignment to column-major matrices***************************
// Restructuring SMP subtraction assignment of a scaled matrix product to a column-major
// target. The product is rebuilt with one or both operands wrapped in trans(), wrapped by
// the ForwardFunctor, multiplied by rhs.scalar_ again and passed to smpSubAssign().
// NOTE(review): listing lines 8770, 8773, 8775 and 8785 are elided, so the return type, two
// statements at the top of the body and the condition of the first branch are not visible.
8769  template< typename MT > // Type of the target matrix
8771  smpSubAssign( Matrix<MT,true>& lhs, const DMatScalarMultExpr& rhs )
8772  {
8774 
8776 
8777  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8778  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8779 
8780  const ForwardFunctor fwd;
8781 
8782  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
8783  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
8784 
// First branch (condition elided): both operands are transposed.
8786  smpSubAssign( ~lhs, fwd( trans( left ) * trans( right ) ) * rhs.scalar_ );
// MT1 symmetric: only the left operand is transposed.
8787  else if( IsSymmetric<MT1>::value )
8788  smpSubAssign( ~lhs, fwd( trans( left ) * right ) * rhs.scalar_ );
// Otherwise only the right operand is transposed.
8789  else
8790  smpSubAssign( ~lhs, fwd( left * trans( right ) ) * rhs.scalar_ );
8791  }
8792  //**********************************************************************************************
8793 
8794  //**SMP subtraction assignment to sparse matrices***********************************************
8795  // No special implementation for the SMP subtraction assignment to sparse matrices.
8796  //**********************************************************************************************
8797 
8798  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of a scaled matrix product to a dense matrix. The expression
// is first evaluated into a ResultType temporary, which is then passed to smpSchurAssign().
// Unlike the non-SMP schurAssign() in this class, the temporary is built from rhs directly
// rather than from serial( rhs ).
// NOTE(review): listing lines 8814 and 8816-8818 are elided from this excerpt.
8810  template< typename MT // Type of the target dense matrix
8811  , bool SO > // Storage order of the target dense matrix
8812  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
8813  {
8815 
8819 
8820  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
8821  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
8822 
8823  const ResultType tmp( rhs );
8824  smpSchurAssign( ~lhs, tmp );
8825  }
8826  //**********************************************************************************************
8827 
8828  //**SMP Schur product assignment to sparse matrices*********************************************
8829  // No special implementation for the SMP Schur product assignment to sparse matrices.
8830  //**********************************************************************************************
8831 
8832  //**SMP multiplication assignment to dense matrices*********************************************
8833  // No special implementation for the SMP multiplication assignment to dense matrices.
8834  //**********************************************************************************************
8835 
8836  //**SMP multiplication assignment to sparse matrices********************************************
8837  // No special implementation for the SMP multiplication assignment to sparse matrices.
8838  //**********************************************************************************************
8839 
8840  //**Compile time checks*************************************************************************
8849  //**********************************************************************************************
8850 };
8852 //*************************************************************************************************
8853 
8854 
8855 
8856 
8857 //=================================================================================================
8858 //
8859 // GLOBAL BINARY ARITHMETIC OPERATORS
8860 //
8861 //=================================================================================================
8862 
8863 //*************************************************************************************************
// Multiplication operator for two row-major dense matrices (DenseMatrix<...,false>).
// Raises an error through BLAZE_THROW_INVALID_ARGUMENT when the number of columns of lhs
// differs from the number of rows of rhs; otherwise returns a ReturnType object built from
// both operands.
// NOTE(review): listing lines 8895 and 8901 are elided; ReturnType is presumably declared on
// line 8901 -- confirm against the full header.
8890 template< typename MT1 // Type of the left-hand side dense matrix
8891  , typename MT2 > // Type of the right-hand side dense matrix
8892 inline decltype(auto)
8893  operator*( const DenseMatrix<MT1,false>& lhs, const DenseMatrix<MT2,false>& rhs )
8894 {
8896 
8897  if( (~lhs).columns() != (~rhs).rows() ) {
8898  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
8899  }
8900 
8902  return ReturnType( ~lhs, ~rhs );
8903 }
8904 //*************************************************************************************************
8905 
8906 
8907 
8908 
8909 //=================================================================================================
8910 //
8911 // GLOBAL FUNCTIONS
8912 //
8913 //=================================================================================================
8914 
8915 //*************************************************************************************************
// declsym() overload for dense matrix products. Raises an error through
// BLAZE_THROW_INVALID_ARGUMENT when the product is not square; otherwise returns a
// ReturnType object built from the product's two operands.
// NOTE(review): listing lines 8946 and 8952 are elided; ReturnType is presumably declared on
// line 8952 (with the symmetry flag set) -- confirm against the full header.
8938 template< typename MT1 // Type of the left-hand side dense matrix
8939  , typename MT2 // Type of the right-hand side dense matrix
8940  , bool SF // Symmetry flag
8941  , bool HF // Hermitian flag
8942  , bool LF // Lower flag
8943  , bool UF > // Upper flag
8944 inline decltype(auto) declsym( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8945 {
8947 
8948  if( !isSquare( dm ) ) {
8949  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
8950  }
8951 
8953  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8954 }
8956 //*************************************************************************************************
8957 
8958 
8959 //*************************************************************************************************
// declherm() overload for dense matrix products. Raises an error through
// BLAZE_THROW_INVALID_ARGUMENT when the product is not square; otherwise returns a
// ReturnType object built from the product's two operands.
// NOTE(review): listing lines 8990 and 8996 are elided; ReturnType is presumably declared on
// line 8996 (with the Hermitian flag set) -- confirm against the full header.
8982 template< typename MT1 // Type of the left-hand side dense matrix
8983  , typename MT2 // Type of the right-hand side dense matrix
8984  , bool SF // Symmetry flag
8985  , bool HF // Hermitian flag
8986  , bool LF // Lower flag
8987  , bool UF > // Upper flag
8988 inline decltype(auto) declherm( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8989 {
8991 
8992  if( !isSquare( dm ) ) {
8993  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
8994  }
8995 
8997  return ReturnType( dm.leftOperand(), dm.rightOperand() );
8998 }
9000 //*************************************************************************************************
9001 
9002 
9003 //*************************************************************************************************
// decllow() overload for dense matrix products. Raises an error through
// BLAZE_THROW_INVALID_ARGUMENT when the product is not square; otherwise returns a
// ReturnType object built from the product's two operands.
// NOTE(review): listing lines 9034 and 9040 are elided; ReturnType is presumably declared on
// line 9040 (with the lower flag set) -- confirm against the full header.
9026 template< typename MT1 // Type of the left-hand side dense matrix
9027  , typename MT2 // Type of the right-hand side dense matrix
9028  , bool SF // Symmetry flag
9029  , bool HF // Hermitian flag
9030  , bool LF // Lower flag
9031  , bool UF > // Upper flag
9032 inline decltype(auto) decllow( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9033 {
9035 
9036  if( !isSquare( dm ) ) {
9037  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
9038  }
9039 
9041  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9042 }
9044 //*************************************************************************************************
9045 
9046 
9047 //*************************************************************************************************
// declupp() overload for dense matrix products. Raises an error through
// BLAZE_THROW_INVALID_ARGUMENT when the product is not square; otherwise returns a
// ReturnType object built from the product's two operands.
// NOTE(review): listing lines 9078 and 9084 are elided; ReturnType is presumably declared on
// line 9084 (with the upper flag set) -- confirm against the full header.
9070 template< typename MT1 // Type of the left-hand side dense matrix
9071  , typename MT2 // Type of the right-hand side dense matrix
9072  , bool SF // Symmetry flag
9073  , bool HF // Hermitian flag
9074  , bool LF // Lower flag
9075  , bool UF > // Upper flag
9076 inline decltype(auto) declupp( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9077 {
9079 
9080  if( !isSquare( dm ) ) {
9081  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
9082  }
9083 
9085  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9086 }
9088 //*************************************************************************************************
9089 
9090 
9091 //*************************************************************************************************
// decldiag() overload for dense matrix products. Raises an error through
// BLAZE_THROW_INVALID_ARGUMENT when the product is not square; otherwise returns a
// ReturnType object built from the product's two operands.
// NOTE(review): listing lines 9122 and 9128 are elided; ReturnType is presumably declared on
// line 9128 (with both the lower and upper flags set) -- confirm against the full header.
9114 template< typename MT1 // Type of the left-hand side dense matrix
9115  , typename MT2 // Type of the right-hand side dense matrix
9116  , bool SF // Symmetry flag
9117  , bool HF // Hermitian flag
9118  , bool LF // Lower flag
9119  , bool UF > // Upper flag
9120 inline decltype(auto) decldiag( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9121 {
9123 
9124  if( !isSquare( dm ) ) {
9125  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
9126  }
9127 
9129  return ReturnType( dm.leftOperand(), dm.rightOperand() );
9130 }
9132 //*************************************************************************************************
9133 
9134 
9135 
9136 
9137 //=================================================================================================
9138 //
9139 // ROWS SPECIALIZATIONS
9140 //
9141 //=================================================================================================
9142 
9143 //*************************************************************************************************
// Rows trait for a dense matrix product: inherits from Rows<MT1>, i.e. the row count is
// taken from the left-hand side operand type.
9145 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9146 struct Rows< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9147  : public Rows<MT1>
9148 {};
9150 //*************************************************************************************************
9151 
9152 
9153 
9154 
9155 //=================================================================================================
9156 //
9157 // COLUMNS SPECIALIZATIONS
9158 //
9159 //=================================================================================================
9160 
9161 //*************************************************************************************************
// Columns trait for a dense matrix product: inherits from Columns<MT2>, i.e. the column
// count is taken from the right-hand side operand type.
9163 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9164 struct Columns< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9165  : public Columns<MT2>
9166 {};
9168 //*************************************************************************************************
9169 
9170 
9171 
9172 
9173 //=================================================================================================
9174 //
9175 // ISALIGNED SPECIALIZATIONS
9176 //
9177 //=================================================================================================
9178 
9179 //*************************************************************************************************
// IsAligned trait for a dense matrix product: true only when both operand types are aligned.
9181 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9182 struct IsAligned< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9183  : public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
9184 {};
9186 //*************************************************************************************************
9187 
9188 
9189 
9190 
9191 //=================================================================================================
9192 //
9193 // ISSYMMETRIC SPECIALIZATIONS
9194 //
9195 //=================================================================================================
9196 
9197 //*************************************************************************************************
// IsSymmetric trait for a dense matrix product. True when any of the following holds:
//  - the symmetry flag SF is set;
//  - the Hermitian flag HF is set and the element type of the Hermitian-declared product
//    (flags false,true,false,false) is a builtin type;
//  - both the lower flag LF and the upper flag UF are set.
9199 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9200 struct IsSymmetric< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9201  : public BoolConstant< Or< Bool<SF>
9202  , And< Bool<HF>
9203  , IsBuiltin< ElementType_< DMatDMatMultExpr<MT1,MT2,false,true,false,false> > > >
9204  , And< Bool<LF>, Bool<UF> > >::value >
9205 {};
9207 //*************************************************************************************************
9208 
9209 
9210 
9211 
9212 //=================================================================================================
9213 //
9214 // ISHERMITIAN SPECIALIZATIONS
9215 //
9216 //=================================================================================================
9217 
9218 //*************************************************************************************************
// IsHermitian trait for a dense matrix product: this partial specialization matches only
// products whose Hermitian flag (fourth bool argument) is true and derives from TrueType.
9220 template< typename MT1, typename MT2, bool SF, bool LF, bool UF >
9221 struct IsHermitian< DMatDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
9222  : public TrueType
9223 {};
9225 //*************************************************************************************************
9226 
9227 
9228 
9229 
9230 //=================================================================================================
9231 //
9232 // ISLOWER SPECIALIZATIONS
9233 //
9234 //=================================================================================================
9235 
9236 //*************************************************************************************************
// IsLower trait for a dense matrix product. True when any of the following holds:
//  - the lower flag LF is set;
//  - both operand types are lower;
//  - SF or HF is set and both operand types are upper.
9238 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9239 struct IsLower< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9240  : public BoolConstant< Or< Bool<LF>
9241  , And< IsLower<MT1>, IsLower<MT2> >
9242  , And< Or< Bool<SF>, Bool<HF> >
9243  , IsUpper<MT1>, IsUpper<MT2> > >::value >
9244 {};
9246 //*************************************************************************************************
9247 
9248 
9249 
9250 
9251 //=================================================================================================
9252 //
9253 // ISUNILOWER SPECIALIZATIONS
9254 //
9255 //=================================================================================================
9256 
9257 //*************************************************************************************************
// IsUniLower trait for a dense matrix product. True when both operand types are unilower,
// or when SF or HF is set and both operand types are uniupper.
9259 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9260 struct IsUniLower< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9261  : public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
9262  , And< Or< Bool<SF>, Bool<HF> >
9263  , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
9264 {};
9266 //*************************************************************************************************
9267 
9268 
9269 
9270 
9271 //=================================================================================================
9272 //
9273 // ISSTRICTLYLOWER SPECIALIZATIONS
9274 //
9275 //=================================================================================================
9276 
9277 //*************************************************************************************************
// IsStrictlyLower trait for a dense matrix product. True when any of the following holds:
//  - MT1 is strictly lower and MT2 is lower;
//  - MT2 is strictly lower and MT1 is lower;
//  - SF or HF is set and one operand is strictly upper while the other is upper.
9279 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9280 struct IsStrictlyLower< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9281  : public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9282  , And< IsStrictlyLower<MT2>, IsLower<MT1> >
9283  , And< Or< Bool<SF>, Bool<HF> >
9284  , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9285  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
9286 {};
9288 //*************************************************************************************************
9289 
9290 
9291 
9292 
9293 //=================================================================================================
9294 //
9295 // ISUPPER SPECIALIZATIONS
9296 //
9297 //=================================================================================================
9298 
9299 //*************************************************************************************************
// IsUpper trait for a dense matrix product. True when any of the following holds:
//  - the upper flag UF is set;
//  - both operand types are upper;
//  - SF or HF is set and both operand types are lower.
9301 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9302 struct IsUpper< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9303  : public BoolConstant< Or< Bool<UF>
9304  , And< IsUpper<MT1>, IsUpper<MT2> >
9305  , And< Or< Bool<SF>, Bool<HF> >
9306  , IsLower<MT1>, IsLower<MT2> > >::value >
9307 {};
9309 //*************************************************************************************************
9310 
9311 
9312 
9313 
9314 //=================================================================================================
9315 //
9316 // ISUNIUPPER SPECIALIZATIONS
9317 //
9318 //=================================================================================================
9319 
9320 //*************************************************************************************************
// IsUniUpper trait for a dense matrix product. True when both operand types are uniupper,
// or when SF or HF is set and both operand types are unilower.
9322 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9323 struct IsUniUpper< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9324  : public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
9325  , And< Or< Bool<SF>, Bool<HF> >
9326  , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
9327 {};
9329 //*************************************************************************************************
9330 
9331 
9332 
9333 
9334 //=================================================================================================
9335 //
9336 // ISSTRICTLYUPPER SPECIALIZATIONS
9337 //
9338 //=================================================================================================
9339 
9340 //*************************************************************************************************
// IsStrictlyUpper trait for a dense matrix product. True when any of the following holds:
//  - MT1 is strictly upper and MT2 is upper;
//  - MT2 is strictly upper and MT1 is upper;
//  - SF or HF is set and one operand is strictly lower while the other is lower.
9342 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
9343 struct IsStrictlyUpper< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9344  : public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
9345  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
9346  , And< Or< Bool<SF>, Bool<HF> >
9347  , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
9348  , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
9349 {};
9351 //*************************************************************************************************
9352 
9353 } // namespace blaze
9354 
9355 #endif
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:325
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Data type constraint.
Headerfile for the generic min algorithm.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:290
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:414
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:196
Flag for lower matrices.
Definition: DMatDMatMultExpr.h:180
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:286
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
Header file for the SparseVector base class.
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:157
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:490
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:547
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:299
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:161
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:620
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:158
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:537
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:458
Header file for the IsIntegral type trait.
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1762
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1027
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:284
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Column< MT > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:124
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Expression object for dense matrix-dense matrix multiplications.The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:151
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:434
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1809
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Flag for Hermitian matrices.
Definition: DMatDMatMultExpr.h:179
Header file for the IsFloat type trait.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:281
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:160
Row< MT > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:124
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDMatMultExpr.h:285
Flag for upper matrices.
Definition: DMatDMatMultExpr.h:181
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Headerfile for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:283
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:478
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1027
Header file for the IsLower type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:340
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsStrictlyTriangular type trait.
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:619
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:264
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:108
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:293
Header file for the conjugate shim.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
Compile time check for resizable data types.This type trait tests whether the given data type is a re...
Definition: IsResizable.h:75
System settings for the BLAS mode.
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Flag for symmetric matrices.
Definition: DMatDMatMultExpr.h:178
Header file for the MatScalarMultExpr base class.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:388
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:110
Utility type for generic codes.
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:162
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:159
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time type negation.The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1029
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Base class for matrices.The Matrix class is a base class for all dense and sparse matrix classes with...
Definition: Forward.h:101
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:287
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:446
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:424
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:296
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:790
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1029
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:468
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:75
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:75
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:404
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:742
Header file for the IsResizable type trait.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:491
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:282
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.