TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/dense/MMM.h>
53 #include <blaze/math/Exception.h>
66 #include <blaze/math/shims/Reset.h>
68 #include <blaze/math/SIMD.h>
94 #include <blaze/system/BLAS.h>
95 #include <blaze/system/Blocking.h>
96 #include <blaze/system/Debugging.h>
101 #include <blaze/util/Assert.h>
102 #include <blaze/util/Complex.h>
105 #include <blaze/util/DisableIf.h>
106 #include <blaze/util/EnableIf.h>
109 #include <blaze/util/InvalidType.h>
110 #include <blaze/util/mpl/And.h>
111 #include <blaze/util/mpl/Bool.h>
112 #include <blaze/util/mpl/If.h>
113 #include <blaze/util/mpl/Not.h>
114 #include <blaze/util/mpl/Or.h>
115 #include <blaze/util/TrueType.h>
116 #include <blaze/util/Types.h>
124 
125 
126 namespace blaze {
127 
128 //=================================================================================================
129 //
130 // CLASS TDMATDMATMULTEXPR
131 //
132 //=================================================================================================
133 
134 //*************************************************************************************************
141 template< typename MT1 // Type of the left-hand side dense matrix
142  , typename MT2 // Type of the right-hand side dense matrix
143  , bool SF // Symmetry flag
144  , bool HF // Hermitian flag
145  , bool LF // Lower flag
146  , bool UF > // Upper flag
147 class TDMatDMatMultExpr
148  : public MatMatMultExpr< DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
149  , private Computation
150 {
151  private:
152  //**Type definitions****************************************************************************
159  //**********************************************************************************************
160 
161  //**********************************************************************************************
// Compile-time flag for the left-hand side operand: true when MT1 is a computation
// (IsComputation) or requires an intermediate evaluation (RequiresEvaluation).
163  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
164  //**********************************************************************************************
165 
166  //**********************************************************************************************
// Compile-time flag for the right-hand side operand: the same test applied to MT2.
168  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
169  //**********************************************************************************************
170 
171  //**********************************************************************************************
173  enum : bool {
174  SYM = ( SF && !( HF || LF || UF ) ),
175  HERM = ( HF && !( LF || UF ) ),
176  LOW = ( LF || ( ( SF || HF ) && UF ) ),
177  UPP = ( UF || ( ( SF || HF ) && LF ) )
178  };
179  //**********************************************************************************************
180 
181  //**********************************************************************************************
183 
// Compile-time predicate: `value` is true when at least one of the two operands needs an
// intermediate evaluation (evaluateLeft || evaluateRight). The template parameters T1, T2
// and T3 are not used by the visible expression.
187  template< typename T1, typename T2, typename T3 >
188  struct IsEvaluationRequired {
189  enum : bool { value = ( evaluateLeft || evaluateRight ) };
190  };
192  //**********************************************************************************************
193 
194  //**********************************************************************************************
196 
// Compile-time predicate for the types T1, T2 and T3 (named after the BLAS kernel).
// As visible here, `value` requires BLAS mode and BLAS matrix/matrix multiplication to be
// enabled, none of the SYM/HERM/LOW/UPP structure flags to be set, all three types to be
// SIMD enabled, and T1 and T3 to have the same element type.
// NOTE(review): the embedded line numbers jump (202 -> 207 -> 212), so additional conditions
// of this expression appear to be missing from this listing; confirm against the full header.
199  template< typename T1, typename T2, typename T3 >
200  struct UseBlasKernel {
201  enum : bool { value = BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
202  !SYM && !HERM && !LOW && !UPP &&
207  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
212  IsSame< ElementType_<T1>, ElementType_<T3> >::value };
213  };
215  //**********************************************************************************************
216 
217  //**********************************************************************************************
219 
// Compile-time predicate for the types T1, T2 and T3 (named after the vectorized default
// kernel). The visible part requires useOptimizedKernels and SIMD support of all three types.
// NOTE(review): the embedded line numbers jump (224 -> 226 -> 229 -> 232); several lines of
// the condition, including the end of the enum, are missing from this listing, so the
// fragment below is not complete. Confirm the full condition against the original header.
222  template< typename T1, typename T2, typename T3 >
223  struct UseVectorizedDefaultKernel {
224  enum : bool { value = useOptimizedKernels &&
226  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
229  , ElementType_<T3> >::value &&
232  };
234  //**********************************************************************************************
235 
236  //**********************************************************************************************
238 
241  using ForwardFunctor = IfTrue_< HERM
242  , DeclHerm
243  , IfTrue_< SYM
244  , DeclSym
245  , IfTrue_< LOW
246  , IfTrue_< UPP
247  , DeclDiag
248  , DeclLow >
249  , IfTrue_< UPP
250  , DeclUpp
251  , Noop > > > >;
253  //**********************************************************************************************
254 
255  public:
256  //**Type definitions****************************************************************************
259 
// Return type used by the element access functions operator() and at().
// NOTE(review): ElementType and ResultType are declared on lines not contained in this listing.
265  using ReturnType = const ElementType;
// Composite type of the expression: a const ResultType.
266  using CompositeType = const ResultType;
267 
// Type of the left-hand side operand: `const MT1` if MT1 is an expression (IsExpression),
// otherwise `const MT1&`.
269  using LeftOperand = If_< IsExpression<MT1>, const MT1, const MT1& >;
270 
// Type of the right-hand side operand: `const MT2` if MT2 is an expression, otherwise
// `const MT2&`.
272  using RightOperand = If_< IsExpression<MT2>, const MT2, const MT2& >;
273 
276 
279  //**********************************************************************************************
280 
281  //**Compilation flags***************************************************************************
// SIMD compilation flag. The visible part requires that the two operands are not both
// diagonal and that both operand types are SIMD enabled.
// NOTE(review): the expression ends in `&&` and the embedded line numbers jump (284 -> 287),
// so the remaining conditions and the end of this enum are missing from this listing.
283  enum : bool { simdEnabled = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
284  MT1::simdEnabled && MT2::simdEnabled &&
287 
// SMP compilation flag: true only if neither operand needs an intermediate evaluation and
// both operand types are themselves SMP assignable.
289  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
290  !evaluateRight && MT2::smpAssignable };
291  //**********************************************************************************************
292 
293  //**SIMD properties*****************************************************************************
// Number of elements per SIMD vector for ElementType, taken from SIMDTrait.
295  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
296  //**********************************************************************************************
297 
298  //**Constructor*********************************************************************************
// Constructor: stores the two operands of the multiplication expression.
//   lhs: the left-hand side operand
//   rhs: the right-hand side operand
// The operand sizes are checked only by an internal assertion (columns of lhs must equal
// rows of rhs); no exception is thrown here, and the constructor is declared noexcept.
304  explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
305  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
306  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
307  {
308  BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
309  }
310  //**********************************************************************************************
311 
312  //**Access operator*****************************************************************************
// Element access: computes the element (i,j) of the product on demand.
//   i: row index, checked against lhs_.rows() by an internal assertion only
//   j: column index, checked against rhs_.columns() by an internal assertion only
// Returns the product element as ReturnType.
319  inline ReturnType operator()( size_t i, size_t j ) const {
320  BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
321  BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
322 
// Diagonal left operand: only the term with inner index i contributes.
323  if( IsDiagonal<MT1>::value ) {
324  return lhs_(i,i) * rhs_(i,j);
325  }
// Diagonal right operand: only the term with inner index j contributes.
326  else if( IsDiagonal<MT2>::value ) {
327  return lhs_(i,j) * rhs_(j,j);
328  }
// NOTE(review): the embedded line numbers jump from 328 to 330; the header of the branch
// that encloses the following statements is missing from this listing. The statements
// restrict the inner index to [begin,end) based on the IsUpper/IsLower/IsStrictly* traits
// of both operands.
330  const size_t begin( ( IsUpper<MT1>::value )
331  ?( ( IsLower<MT2>::value )
332  ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
333  , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
334  :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
335  :( ( IsLower<MT2>::value )
336  ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
337  :( 0UL ) ) );
338  const size_t end( ( IsLower<MT1>::value )
339  ?( ( IsUpper<MT2>::value )
340  ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
341  , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
342  :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
343  :( ( IsUpper<MT2>::value )
344  ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
345  :( lhs_.columns() ) ) );
346 
// Empty index range: the result is a default-constructed element.
347  if( begin >= end ) return ElementType();
348 
349  const size_t n( end - begin );
350 
// Inner product of the restricted parts of row i of lhs_ and column j of rhs_.
351  return subvector( row( lhs_, i ), begin, n ) * subvector( column( rhs_, j ), begin, n );
352  }
// General case: inner product of the complete row i and the complete column j.
353  else {
354  return row( lhs_, i ) * column( rhs_, j );
355  }
356  }
357  //**********************************************************************************************
358 
359  //**At function*********************************************************************************
367  inline ReturnType at( size_t i, size_t j ) const {
368  if( i >= lhs_.rows() ) {
369  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
370  }
371  if( j >= rhs_.columns() ) {
372  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
373  }
374  return (*this)(i,j);
375  }
376  //**********************************************************************************************
377 
378  //**Rows function*******************************************************************************
// Returns the number of rows of the product, i.e. the number of rows of the left operand.
383  inline size_t rows() const noexcept {
384  return lhs_.rows();
385  }
386  //**********************************************************************************************
387 
388  //**Columns function****************************************************************************
// Returns the number of columns of the product, i.e. the number of columns of the right operand.
393  inline size_t columns() const noexcept {
394  return rhs_.columns();
395  }
396  //**********************************************************************************************
397 
398  //**Left operand access*************************************************************************
// Returns the left-hand side operand of the multiplication expression.
403  inline LeftOperand leftOperand() const noexcept {
404  return lhs_;
405  }
406  //**********************************************************************************************
407 
408  //**Right operand access************************************************************************
// Returns the right-hand side operand of the multiplication expression.
413  inline RightOperand rightOperand() const noexcept {
414  return rhs_;
415  }
416  //**********************************************************************************************
417 
418  //**********************************************************************************************
424  template< typename T >
425  inline bool canAlias( const T* alias ) const noexcept {
426  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
427  }
428  //**********************************************************************************************
429 
430  //**********************************************************************************************
436  template< typename T >
437  inline bool isAliased( const T* alias ) const noexcept {
438  return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
439  }
440  //**********************************************************************************************
441 
442  //**********************************************************************************************
447  inline bool isAligned() const noexcept {
448  return lhs_.isAligned() && rhs_.isAligned();
449  }
450  //**********************************************************************************************
451 
452  //**********************************************************************************************
// Returns whether the expression can be used in SMP assignments.
// The visible conditions: BLAS mode or BLAS matrix/matrix multiplication is disabled, or the
// number of result elements is below TDMATDMATMULT_THRESHOLD; and, in addition, the number of
// result elements is at least SMP_TDMATDMATMULT_THRESHOLD.
// NOTE(review): the embedded line numbers skip 460 and 463 and the expression ends in `&&`,
// so further conditions are missing from this listing; confirm against the full header.
457  inline bool canSMPAssign() const noexcept {
458  return ( !BLAZE_BLAS_MODE ||
459  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
461  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
462  ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
464  }
465  //**********************************************************************************************
466 
467  private:
468  //**Member variables****************************************************************************
471  //**********************************************************************************************
472 
473  //**Assignment to dense matrices****************************************************************
486  template< typename MT // Type of the target dense matrix
487  , bool SO > // Storage order of the target dense matrix
488  friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
489  {
491 
492  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
493  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
494 
495  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
496  return;
497  }
498  else if( rhs.lhs_.columns() == 0UL ) {
499  reset( ~lhs );
500  return;
501  }
502 
503  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
504  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
505 
506  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
507  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
508  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
509  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
510  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
511  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
512 
513  TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
514  }
516  //**********************************************************************************************
517 
518  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = A * B.
//   C: the target matrix
//   A: the left-hand side operand
//   B: the right-hand side operand
// Depending on the condition below, either the small-matrix kernel selection or the BLAS
// kernel selection is called.
529  template< typename MT3 // Type of the left-hand side target matrix
530  , typename MT4 // Type of the left-hand side matrix operand
531  , typename MT5 > // Type of the right-hand side matrix operand
532  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
533  {
// NOTE(review): the embedded line number 534, which holds the start of the `if( ... ||`
// condition, is missing from this listing. The visible alternatives select the small kernel
// outside of debug mode when the relevant dimension is at most SIMDSIZE*10 (columns of B for
// a row-major target, rows of A for a column-major target), or when the number of elements
// of C is below TDMATDMATMULT_THRESHOLD.
535  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
536  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
537  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
538  selectSmallAssignKernel( C, A, B );
539  else
540  selectBlasAssignKernel( C, A, B );
541  }
543  //**********************************************************************************************
544 
545  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (scalar) assignment kernel C = A * B for a row-major target matrix.
//   C: the row-major target dense matrix
//   A: the left-hand side operand
//   B: the right-hand side operand
// NOTE(review): the embedded line numbers skip 562 (the return type line) and 590, 595, 624
// and 629 (one level of the nested conditional expressions for jbegin/jend), so parts of
// this function are missing from this listing.
559  template< typename MT3 // Type of the left-hand side target matrix
560  , typename MT4 // Type of the left-hand side matrix operand
561  , typename MT5 > // Type of the right-hand side matrix operand
563  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
564  {
565  const size_t M( A.rows() );
566  const size_t N( B.columns() );
567  const size_t K( A.columns() );
568 
// With any structure flag set, the result is expected to be square.
569  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
570 
// The target is processed row by row.
571  for( size_t i=0UL; i<M; ++i )
572  {
// [kbegin,kend) is the range of the inner index for row i, restricted by the triangular
// structure of A.
573  const size_t kbegin( ( IsUpper<MT4>::value )
574  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
575  :( 0UL ) );
576  const size_t kend( ( IsLower<MT4>::value )
577  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
578  :( K ) );
579  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
580 
// Strictly triangular A with an empty inner range: the whole row of C is reset.
581  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
582  for( size_t j=0UL; j<N; ++j ) {
583  reset( (~C)(i,j) );
584  }
585  continue;
586  }
587 
// First inner index (k == kbegin): elements in [jbegin,jend) are assigned (not accumulated),
// elements outside this range are reset where the structure requires it.
588  {
589  const size_t jbegin( ( IsUpper<MT5>::value )
591  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
592  :( UPP ? max(i,kbegin) : kbegin ) )
593  :( UPP ? i : 0UL ) );
594  const size_t jend( ( IsLower<MT5>::value )
596  ?( LOW ? min(i+1UL,kbegin) : kbegin )
597  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
598  :( LOW ? i+1UL : N ) );
599 
600  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
601  for( size_t j=0UL; j<jbegin; ++j ) {
602  reset( (~C)(i,j) );
603  }
604  }
605  else if( IsStrictlyUpper<MT5>::value ) {
606  reset( (~C)(i,0UL) );
607  }
608  for( size_t j=jbegin; j<jend; ++j ) {
609  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
610  }
611  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
612  for( size_t j=jend; j<N; ++j ) {
613  reset( (~C)(i,j) );
614  }
615  }
616  else if( IsStrictlyLower<MT5>::value ) {
617  reset( (~C)(i,N-1UL) );
618  }
619  }
620 
// Remaining inner indices: the products are accumulated into row i of C.
621  for( size_t k=kbegin+1UL; k<kend; ++k )
622  {
623  const size_t jbegin( ( IsUpper<MT5>::value )
625  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
626  :( SYM || HERM || UPP ? max( i, k ) : k ) )
627  :( SYM || HERM || UPP ? i : 0UL ) );
628  const size_t jend( ( IsLower<MT5>::value )
630  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
631  :( LOW ? min(i+1UL,k) : k ) )
632  :( LOW ? i+1UL : N ) );
633 
// With a structure flag set the column range may be empty; such inner indices are skipped.
634  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
635  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
636 
637  for( size_t j=jbegin; j<jend; ++j ) {
638  (~C)(i,j) += A(i,k) * B(k,j);
639  }
// For a lower B the element at column jend is assigned instead of accumulated.
640  if( IsLower<MT5>::value ) {
641  (~C)(i,jend) = A(i,k) * B(k,jend);
642  }
643  }
644  }
645 
// Symmetric/Hermitian result: the elements below the diagonal are copied from their mirrored
// counterparts above the diagonal (conjugated in the Hermitian case).
646  if( SYM || HERM ) {
647  for( size_t i=1UL; i<M; ++i ) {
648  for( size_t j=0UL; j<i; ++j ) {
649  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
650  }
651  }
652  }
653  }
655  //**********************************************************************************************
656 
657  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (scalar) assignment kernel C = A * B for a column-major target matrix. Enabled only
// if neither A nor B is diagonal.
//   C: the column-major target dense matrix
//   A: the left-hand side operand
//   B: the right-hand side operand
// NOTE(review): the embedded line numbers skip 702, 707, 736 and 741 (one level of the nested
// conditional expressions for ibegin/iend), so parts of this function are missing from this
// listing.
671  template< typename MT3 // Type of the left-hand side target matrix
672  , typename MT4 // Type of the left-hand side matrix operand
673  , typename MT5 > // Type of the right-hand side matrix operand
674  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
675  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
676  {
677  const size_t M( A.rows() );
678  const size_t N( B.columns() );
679  const size_t K( A.columns() );
680 
// With any structure flag set, the result is expected to be square.
681  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
682 
// The target is processed column by column.
683  for( size_t j=0UL; j<N; ++j )
684  {
// [kbegin,kend) is the range of the inner index for column j, restricted by the triangular
// structure of B.
685  const size_t kbegin( ( IsLower<MT5>::value )
686  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
687  :( 0UL ) );
688  const size_t kend( ( IsUpper<MT5>::value )
689  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
690  :( K ) );
691  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
692 
// Strictly triangular B with an empty inner range: the whole column of C is reset.
693  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
694  for( size_t i=0UL; i<M; ++i ) {
695  reset( (~C)(i,j) );
696  }
697  continue;
698  }
699 
// First inner index (k == kbegin): elements in [ibegin,iend) are assigned (not accumulated),
// elements outside this range are reset where the structure requires it.
700  {
701  const size_t ibegin( ( IsLower<MT4>::value )
703  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
704  :( LOW ? max(j,kbegin) : kbegin ) )
705  :( LOW ? j : 0UL ) );
706  const size_t iend( ( IsUpper<MT4>::value )
708  ?( UPP ? min(j+1UL,kbegin) : kbegin )
709  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
710  :( UPP ? j+1UL : M ) );
711 
712  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
713  for( size_t i=0UL; i<ibegin; ++i ) {
714  reset( (~C)(i,j) );
715  }
716  }
717  else if( IsStrictlyLower<MT4>::value ) {
718  reset( (~C)(0UL,j) );
719  }
720  for( size_t i=ibegin; i<iend; ++i ) {
721  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
722  }
723  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
724  for( size_t i=iend; i<M; ++i ) {
725  reset( (~C)(i,j) );
726  }
727  }
728  else if( IsStrictlyUpper<MT4>::value ) {
729  reset( (~C)(M-1UL,j) );
730  }
731  }
732 
// Remaining inner indices: the products are accumulated into column j of C.
733  for( size_t k=kbegin+1UL; k<kend; ++k )
734  {
735  const size_t ibegin( ( IsLower<MT4>::value )
737  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
738  :( SYM || HERM || LOW ? max( j, k ) : k ) )
739  :( SYM || HERM || LOW ? j : 0UL ) );
740  const size_t iend( ( IsUpper<MT4>::value )
742  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
743  :( UPP ? min(j+1UL,k) : k ) )
744  :( UPP ? j+1UL : M ) );
745 
// With a structure flag set the row range may be empty; such inner indices are skipped.
746  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
747  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
748 
749  for( size_t i=ibegin; i<iend; ++i ) {
750  (~C)(i,j) += A(i,k) * B(k,j);
751  }
// For an upper A the element at row iend is assigned instead of accumulated.
752  if( IsUpper<MT4>::value ) {
753  (~C)(iend,j) = A(iend,k) * B(k,j);
754  }
755  }
756  }
757 
// Symmetric/Hermitian result: the elements above the diagonal are copied from their mirrored
// counterparts below the diagonal (conjugated in the Hermitian case).
758  if( SYM || HERM ) {
759  for( size_t j=1UL; j<N; ++j ) {
760  for( size_t i=0UL; i<j; ++i ) {
761  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
762  }
763  }
764  }
765  }
767  //**********************************************************************************************
768 
769  //**Default assignment to row-major dense matrices (general/diagonal)***************************
783  template< typename MT3 // Type of the left-hand side target matrix
784  , typename MT4 // Type of the left-hand side matrix operand
785  , typename MT5 > // Type of the right-hand side matrix operand
786  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
787  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
788  {
789  constexpr size_t block( BLOCK_SIZE );
790 
791  const size_t M( A.rows() );
792  const size_t N( B.columns() );
793 
794  for( size_t ii=0UL; ii<M; ii+=block ) {
795  const size_t iend( min( M, ii+block ) );
796  for( size_t jj=0UL; jj<N; jj+=block ) {
797  const size_t jend( min( N, jj+block ) );
798  for( size_t i=ii; i<iend; ++i )
799  {
800  const size_t jbegin( ( IsUpper<MT4>::value )
801  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
802  :( jj ) );
803  const size_t jpos( ( IsLower<MT4>::value )
804  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
805  :( jend ) );
806 
807  if( IsUpper<MT4>::value ) {
808  for( size_t j=jj; j<jbegin; ++j ) {
809  reset( (~C)(i,j) );
810  }
811  }
812  for( size_t j=jbegin; j<jpos; ++j ) {
813  (~C)(i,j) = A(i,j) * B(j,j);
814  }
815  if( IsLower<MT4>::value ) {
816  for( size_t j=jpos; j<jend; ++j ) {
817  reset( (~C)(i,j) );
818  }
819  }
820  }
821  }
822  }
823  }
825  //**********************************************************************************************
826 
827  //**Default assignment to column-major dense matrices (general/diagonal)************************
841  template< typename MT3 // Type of the left-hand side target matrix
842  , typename MT4 // Type of the left-hand side matrix operand
843  , typename MT5 > // Type of the right-hand side matrix operand
844  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
845  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
846  {
847  const size_t M( A.rows() );
848  const size_t N( B.columns() );
849 
850  for( size_t j=0UL; j<N; ++j )
851  {
852  const size_t ibegin( ( IsLower<MT4>::value )
853  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
854  :( 0UL ) );
855  const size_t iend( ( IsUpper<MT4>::value )
856  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
857  :( M ) );
858  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
859 
860  if( IsLower<MT4>::value ) {
861  for( size_t i=0UL; i<ibegin; ++i ) {
862  reset( (~C)(i,j) );
863  }
864  }
865  for( size_t i=ibegin; i<iend; ++i ) {
866  (~C)(i,j) = A(i,j) * B(j,j);
867  }
868  if( IsUpper<MT4>::value ) {
869  for( size_t i=iend; i<M; ++i ) {
870  reset( (~C)(i,j) );
871  }
872  }
873  }
874  }
876  //**********************************************************************************************
877 
878  //**Default assignment to row-major dense matrices (diagonal/general)***************************
// Default assignment kernel C = A * B for a row-major target where, per the section title,
// A is diagonal and B is a general matrix. Each element reduces to C(i,j) = A(i,i) * B(i,j).
//   C: the row-major target dense matrix
//   A: the left-hand side operand
//   B: the right-hand side operand
// NOTE(review): the embedded line number 895 (the return type / enabling condition) is
// missing from this listing.
892  template< typename MT3 // Type of the left-hand side target matrix
893  , typename MT4 // Type of the left-hand side matrix operand
894  , typename MT5 > // Type of the right-hand side matrix operand
896  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
897  {
898  const size_t M( A.rows() );
899  const size_t N( B.columns() );
900 
901  for( size_t i=0UL; i<M; ++i )
902  {
// [jbegin,jend) is the column range of row i that holds a product, restricted by the
// triangular structure of B.
903  const size_t jbegin( ( IsUpper<MT5>::value )
904  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
905  :( 0UL ) );
906  const size_t jend( ( IsLower<MT5>::value )
907  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
908  :( N ) );
909  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
910 
// Upper B: the columns in front of the range are reset.
911  if( IsUpper<MT5>::value ) {
912  for( size_t j=0UL; j<jbegin; ++j ) {
913  reset( (~C)(i,j) );
914  }
915  }
916  for( size_t j=jbegin; j<jend; ++j ) {
917  (~C)(i,j) = A(i,i) * B(i,j);
918  }
// Lower B: the columns behind the range are reset.
919  if( IsLower<MT5>::value ) {
920  for( size_t j=jend; j<N; ++j ) {
921  reset( (~C)(i,j) );
922  }
923  }
924  }
925  }
927  //**********************************************************************************************
928 
929  //**Default assignment to column-major dense matrices (diagonal/general)************************
943  template< typename MT3 // Type of the left-hand side target matrix
944  , typename MT4 // Type of the left-hand side matrix operand
945  , typename MT5 > // Type of the right-hand side matrix operand
946  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
947  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
948  {
949  constexpr size_t block( BLOCK_SIZE );
950 
951  const size_t M( A.rows() );
952  const size_t N( B.columns() );
953 
954  for( size_t jj=0UL; jj<N; jj+=block ) {
955  const size_t jend( min( N, jj+block ) );
956  for( size_t ii=0UL; ii<M; ii+=block ) {
957  const size_t iend( min( M, ii+block ) );
958  for( size_t j=jj; j<jend; ++j )
959  {
960  const size_t ibegin( ( IsLower<MT5>::value )
961  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
962  :( ii ) );
963  const size_t ipos( ( IsUpper<MT5>::value )
964  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
965  :( iend ) );
966 
967  if( IsLower<MT5>::value ) {
968  for( size_t i=ii; i<ibegin; ++i ) {
969  reset( (~C)(i,j) );
970  }
971  }
972  for( size_t i=ibegin; i<ipos; ++i ) {
973  (~C)(i,j) = A(i,i) * B(i,j);
974  }
975  if( IsUpper<MT5>::value ) {
976  for( size_t i=ipos; i<iend; ++i ) {
977  reset( (~C)(i,j) );
978  }
979  }
980  }
981  }
982  }
983  }
985  //**********************************************************************************************
986 
987  //**Default assignment to dense matrices (diagonal/diagonal)************************************
1001  template< typename MT3 // Type of the left-hand side target matrix
1002  , typename MT4 // Type of the left-hand side matrix operand
1003  , typename MT5 > // Type of the right-hand side matrix operand
1004  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
1005  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
1006  {
1007  reset( C );
1008 
1009  for( size_t i=0UL; i<A.rows(); ++i ) {
1010  C(i,i) = A(i,i) * B(i,i);
1011  }
1012  }
1014  //**********************************************************************************************
1015 
1016  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel (default variant): forwards directly to the default
// assignment kernel selectDefaultAssignKernel.
//   C: the target matrix
//   A: the left-hand side operand
//   B: the right-hand side operand
// NOTE(review): the embedded line number 1033 (the return type / enabling condition) is
// missing from this listing.
1030  template< typename MT3 // Type of the left-hand side target matrix
1031  , typename MT4 // Type of the left-hand side matrix operand
1032  , typename MT5 > // Type of the right-hand side matrix operand
1034  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1035  {
1036  selectDefaultAssignKernel( ~C, A, B );
1037  }
1039  //**********************************************************************************************
1040 
1041  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
1056  template< typename MT3 // Type of the left-hand side target matrix
1057  , typename MT4 // Type of the left-hand side matrix operand
1058  , typename MT5 > // Type of the right-hand side matrix operand
1060  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1061  {
1062  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
1063 
1064  const size_t M( A.rows() );
1065  const size_t N( B.columns() );
1066  const size_t K( A.columns() );
1067 
1068  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1069 
1070  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1071  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1072 
1073  if( LOW && UPP && N > SIMDSIZE*3UL ) {
1074  reset( ~C );
1075  }
1076 
1077  {
1078  size_t j( 0UL );
1079 
1081  {
1082  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1083  for( size_t i=0UL; i<M; ++i )
1084  {
1085  const size_t kbegin( ( IsUpper<MT4>::value )
1086  ?( ( IsLower<MT5>::value )
1087  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1088  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1089  :( IsLower<MT5>::value ? j : 0UL ) );
1090  const size_t kend( ( IsLower<MT4>::value )
1091  ?( ( IsUpper<MT5>::value )
1092  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
1093  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1094  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
1095 
1096  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1097 
1098  for( size_t k=kbegin; k<kend; ++k ) {
1099  const SIMDType a1( set( A(i,k) ) );
1100  xmm1 += a1 * B.load(k,j );
1101  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1102  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1103  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1104  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1105  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
1106  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
1107  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
1108  }
1109 
1110  (~C).store( i, j , xmm1 );
1111  (~C).store( i, j+SIMDSIZE , xmm2 );
1112  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1113  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1114  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1115  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1116  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1117  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
1118  }
1119  }
1120  }
1121 
1122  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
1123  {
1124  size_t i( 0UL );
1125 
1126  for( ; (i+2UL) <= M; i+=2UL )
1127  {
1128  const size_t kbegin( ( IsUpper<MT4>::value )
1129  ?( ( IsLower<MT5>::value )
1130  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1131  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1132  :( IsLower<MT5>::value ? j : 0UL ) );
1133  const size_t kend( ( IsLower<MT4>::value )
1134  ?( ( IsUpper<MT5>::value )
1135  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
1136  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1137  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
1138 
1139  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1140 
1141  for( size_t k=kbegin; k<kend; ++k ) {
1142  const SIMDType a1( set( A(i ,k) ) );
1143  const SIMDType a2( set( A(i+1UL,k) ) );
1144  const SIMDType b1( B.load(k,j ) );
1145  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1146  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1147  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1148  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
1149  xmm1 += a1 * b1;
1150  xmm2 += a1 * b2;
1151  xmm3 += a1 * b3;
1152  xmm4 += a1 * b4;
1153  xmm5 += a1 * b5;
1154  xmm6 += a2 * b1;
1155  xmm7 += a2 * b2;
1156  xmm8 += a2 * b3;
1157  xmm9 += a2 * b4;
1158  xmm10 += a2 * b5;
1159  }
1160 
1161  (~C).store( i , j , xmm1 );
1162  (~C).store( i , j+SIMDSIZE , xmm2 );
1163  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1164  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1165  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
1166  (~C).store( i+1UL, j , xmm6 );
1167  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
1168  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
1169  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
1170  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
1171  }
1172 
1173  if( i < M )
1174  {
1175  const size_t kbegin( ( IsUpper<MT4>::value )
1176  ?( ( IsLower<MT5>::value )
1177  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1178  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1179  :( IsLower<MT5>::value ? j : 0UL ) );
1180  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
1181 
1182  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1183 
1184  for( size_t k=kbegin; k<kend; ++k ) {
1185  const SIMDType a1( set( A(i,k) ) );
1186  xmm1 += a1 * B.load(k,j );
1187  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1188  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1189  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1190  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1191  }
1192 
1193  (~C).store( i, j , xmm1 );
1194  (~C).store( i, j+SIMDSIZE , xmm2 );
1195  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1196  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1197  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1198  }
1199  }
1200 
1201  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1202  {
1203  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
1204  size_t i( LOW ? j : 0UL );
1205 
1206  for( ; (i+2UL) <= iend; i+=2UL )
1207  {
1208  const size_t kbegin( ( IsUpper<MT4>::value )
1209  ?( ( IsLower<MT5>::value )
1210  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1211  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1212  :( IsLower<MT5>::value ? j : 0UL ) );
1213  const size_t kend( ( IsLower<MT4>::value )
1214  ?( ( IsUpper<MT5>::value )
1215  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1216  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1217  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
1218 
1219  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1220 
1221  for( size_t k=kbegin; k<kend; ++k ) {
1222  const SIMDType a1( set( A(i ,k) ) );
1223  const SIMDType a2( set( A(i+1UL,k) ) );
1224  const SIMDType b1( B.load(k,j ) );
1225  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1226  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1227  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1228  xmm1 += a1 * b1;
1229  xmm2 += a1 * b2;
1230  xmm3 += a1 * b3;
1231  xmm4 += a1 * b4;
1232  xmm5 += a2 * b1;
1233  xmm6 += a2 * b2;
1234  xmm7 += a2 * b3;
1235  xmm8 += a2 * b4;
1236  }
1237 
1238  (~C).store( i , j , xmm1 );
1239  (~C).store( i , j+SIMDSIZE , xmm2 );
1240  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1241  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1242  (~C).store( i+1UL, j , xmm5 );
1243  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1244  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1245  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1246  }
1247 
1248  if( i < iend )
1249  {
1250  const size_t kbegin( ( IsUpper<MT4>::value )
1251  ?( ( IsLower<MT5>::value )
1252  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1253  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1254  :( IsLower<MT5>::value ? j : 0UL ) );
1255  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1256 
1257  SIMDType xmm1, xmm2, xmm3, xmm4;
1258 
1259  for( size_t k=kbegin; k<kend; ++k ) {
1260  const SIMDType a1( set( A(i,k) ) );
1261  xmm1 += a1 * B.load(k,j );
1262  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1263  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1264  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1265  }
1266 
1267  (~C).store( i, j , xmm1 );
1268  (~C).store( i, j+SIMDSIZE , xmm2 );
1269  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1270  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1271  }
1272  }
1273 
1274  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1275  {
1276  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
1277  size_t i( LOW ? j : 0UL );
1278 
1279  for( ; (i+2UL) <= iend; i+=2UL )
1280  {
1281  const size_t kbegin( ( IsUpper<MT4>::value )
1282  ?( ( IsLower<MT5>::value )
1283  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1284  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1285  :( IsLower<MT5>::value ? j : 0UL ) );
1286  const size_t kend( ( IsLower<MT4>::value )
1287  ?( ( IsUpper<MT5>::value )
1288  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
1289  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1290  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
1291 
1292  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1293 
1294  for( size_t k=kbegin; k<kend; ++k ) {
1295  const SIMDType a1( set( A(i ,k) ) );
1296  const SIMDType a2( set( A(i+1UL,k) ) );
1297  const SIMDType b1( B.load(k,j ) );
1298  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1299  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1300  xmm1 += a1 * b1;
1301  xmm2 += a1 * b2;
1302  xmm3 += a1 * b3;
1303  xmm4 += a2 * b1;
1304  xmm5 += a2 * b2;
1305  xmm6 += a2 * b3;
1306  }
1307 
1308  (~C).store( i , j , xmm1 );
1309  (~C).store( i , j+SIMDSIZE , xmm2 );
1310  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1311  (~C).store( i+1UL, j , xmm4 );
1312  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
1313  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1314  }
1315 
1316  if( i < iend )
1317  {
1318  const size_t kbegin( ( IsUpper<MT4>::value )
1319  ?( ( IsLower<MT5>::value )
1320  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1321  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1322  :( IsLower<MT5>::value ? j : 0UL ) );
1323  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
1324 
1325  SIMDType xmm1, xmm2, xmm3;
1326 
1327  for( size_t k=kbegin; k<kend; ++k ) {
1328  const SIMDType a1( set( A(i,k) ) );
1329  xmm1 += a1 * B.load(k,j );
1330  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1331  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1332  }
1333 
1334  (~C).store( i, j , xmm1 );
1335  (~C).store( i, j+SIMDSIZE , xmm2 );
1336  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1337  }
1338  }
1339 
1340  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1341  {
1342  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
1343  size_t i( LOW ? j : 0UL );
1344 
1345  for( ; (i+4UL) <= iend; i+=4UL )
1346  {
1347  const size_t kbegin( ( IsUpper<MT4>::value )
1348  ?( ( IsLower<MT5>::value )
1349  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1350  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1351  :( IsLower<MT5>::value ? j : 0UL ) );
1352  const size_t kend( ( IsLower<MT4>::value )
1353  ?( ( IsUpper<MT5>::value )
1354  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
1355  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
1356  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1357 
1358  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1359 
1360  for( size_t k=kbegin; k<kend; ++k ) {
1361  const SIMDType a1( set( A(i ,k) ) );
1362  const SIMDType a2( set( A(i+1UL,k) ) );
1363  const SIMDType a3( set( A(i+2UL,k) ) );
1364  const SIMDType a4( set( A(i+3UL,k) ) );
1365  const SIMDType b1( B.load(k,j ) );
1366  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1367  xmm1 += a1 * b1;
1368  xmm2 += a1 * b2;
1369  xmm3 += a2 * b1;
1370  xmm4 += a2 * b2;
1371  xmm5 += a3 * b1;
1372  xmm6 += a3 * b2;
1373  xmm7 += a4 * b1;
1374  xmm8 += a4 * b2;
1375  }
1376 
1377  (~C).store( i , j , xmm1 );
1378  (~C).store( i , j+SIMDSIZE, xmm2 );
1379  (~C).store( i+1UL, j , xmm3 );
1380  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1381  (~C).store( i+2UL, j , xmm5 );
1382  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1383  (~C).store( i+3UL, j , xmm7 );
1384  (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
1385  }
1386 
1387  for( ; (i+3UL) <= iend; i+=3UL )
1388  {
1389  const size_t kbegin( ( IsUpper<MT4>::value )
1390  ?( ( IsLower<MT5>::value )
1391  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1392  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1393  :( IsLower<MT5>::value ? j : 0UL ) );
1394  const size_t kend( ( IsLower<MT4>::value )
1395  ?( ( IsUpper<MT5>::value )
1396  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
1397  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
1398  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1399 
1400  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1401 
1402  for( size_t k=kbegin; k<kend; ++k ) {
1403  const SIMDType a1( set( A(i ,k) ) );
1404  const SIMDType a2( set( A(i+1UL,k) ) );
1405  const SIMDType a3( set( A(i+2UL,k) ) );
1406  const SIMDType b1( B.load(k,j ) );
1407  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1408  xmm1 += a1 * b1;
1409  xmm2 += a1 * b2;
1410  xmm3 += a2 * b1;
1411  xmm4 += a2 * b2;
1412  xmm5 += a3 * b1;
1413  xmm6 += a3 * b2;
1414  }
1415 
1416  (~C).store( i , j , xmm1 );
1417  (~C).store( i , j+SIMDSIZE, xmm2 );
1418  (~C).store( i+1UL, j , xmm3 );
1419  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1420  (~C).store( i+2UL, j , xmm5 );
1421  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1422  }
1423 
1424  for( ; (i+2UL) <= iend; i+=2UL )
1425  {
1426  const size_t kbegin( ( IsUpper<MT4>::value )
1427  ?( ( IsLower<MT5>::value )
1428  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1429  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1430  :( IsLower<MT5>::value ? j : 0UL ) );
1431  const size_t kend( ( IsLower<MT4>::value )
1432  ?( ( IsUpper<MT5>::value )
1433  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1434  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1435  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1436 
1437  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1438  size_t k( kbegin );
1439 
1440  for( ; (k+2UL) <= kend; k+=2UL ) {
1441  const SIMDType a1( set( A(i ,k ) ) );
1442  const SIMDType a2( set( A(i+1UL,k ) ) );
1443  const SIMDType a3( set( A(i ,k+1UL) ) );
1444  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
1445  const SIMDType b1( B.load(k ,j ) );
1446  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
1447  const SIMDType b3( B.load(k+1UL,j ) );
1448  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
1449  xmm1 += a1 * b1;
1450  xmm2 += a1 * b2;
1451  xmm3 += a2 * b1;
1452  xmm4 += a2 * b2;
1453  xmm5 += a3 * b3;
1454  xmm6 += a3 * b4;
1455  xmm7 += a4 * b3;
1456  xmm8 += a4 * b4;
1457  }
1458 
1459  for( ; k<kend; ++k ) {
1460  const SIMDType a1( set( A(i ,k) ) );
1461  const SIMDType a2( set( A(i+1UL,k) ) );
1462  const SIMDType b1( B.load(k,j ) );
1463  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1464  xmm1 += a1 * b1;
1465  xmm2 += a1 * b2;
1466  xmm3 += a2 * b1;
1467  xmm4 += a2 * b2;
1468  }
1469 
1470  (~C).store( i , j , xmm1+xmm5 );
1471  (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
1472  (~C).store( i+1UL, j , xmm3+xmm7 );
1473  (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
1474  }
1475 
1476  if( i < iend )
1477  {
1478  const size_t kbegin( ( IsUpper<MT4>::value )
1479  ?( ( IsLower<MT5>::value )
1480  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1481  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1482  :( IsLower<MT5>::value ? j : 0UL ) );
1483  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1484 
1485  SIMDType xmm1, xmm2, xmm3, xmm4;
1486  size_t k( kbegin );
1487 
1488  for( ; (k+2UL) <= kend; k+=2UL ) {
1489  const SIMDType a1( set( A(i,k ) ) );
1490  const SIMDType a2( set( A(i,k+1UL) ) );
1491  xmm1 += a1 * B.load(k ,j );
1492  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
1493  xmm3 += a2 * B.load(k+1UL,j );
1494  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
1495  }
1496 
1497  for( ; k<kend; ++k ) {
1498  const SIMDType a1( set( A(i,k) ) );
1499  xmm1 += a1 * B.load(k,j );
1500  xmm2 += a1 * B.load(k,j+SIMDSIZE);
1501  }
1502 
1503  (~C).store( i, j , xmm1+xmm3 );
1504  (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
1505  }
1506  }
1507 
1508  for( ; j<jpos; j+=SIMDSIZE )
1509  {
1510  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
1511  size_t i( LOW ? j : 0UL );
1512 
1513  for( ; (i+4UL) <= iend; i+=4UL )
1514  {
1515  const size_t kbegin( ( IsUpper<MT4>::value )
1516  ?( ( IsLower<MT5>::value )
1517  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1518  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1519  :( IsLower<MT5>::value ? j : 0UL ) );
1520  const size_t kend( ( IsLower<MT4>::value )
1521  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
1522  :( K ) );
1523 
1524  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1525  size_t k( kbegin );
1526 
1527  for( ; (k+2UL) <= kend; k+=2UL ) {
1528  const SIMDType b1( B.load(k ,j) );
1529  const SIMDType b2( B.load(k+1UL,j) );
1530  xmm1 += set( A(i ,k ) ) * b1;
1531  xmm2 += set( A(i+1UL,k ) ) * b1;
1532  xmm3 += set( A(i+2UL,k ) ) * b1;
1533  xmm4 += set( A(i+3UL,k ) ) * b1;
1534  xmm5 += set( A(i ,k+1UL) ) * b2;
1535  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
1536  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
1537  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
1538  }
1539 
1540  for( ; k<kend; ++k ) {
1541  const SIMDType b1( B.load(k,j) );
1542  xmm1 += set( A(i ,k) ) * b1;
1543  xmm2 += set( A(i+1UL,k) ) * b1;
1544  xmm3 += set( A(i+2UL,k) ) * b1;
1545  xmm4 += set( A(i+3UL,k) ) * b1;
1546  }
1547 
1548  (~C).store( i , j, xmm1+xmm5 );
1549  (~C).store( i+1UL, j, xmm2+xmm6 );
1550  (~C).store( i+2UL, j, xmm3+xmm7 );
1551  (~C).store( i+3UL, j, xmm4+xmm8 );
1552  }
1553 
1554  for( ; (i+3UL) <= iend; i+=3UL )
1555  {
1556  const size_t kbegin( ( IsUpper<MT4>::value )
1557  ?( ( IsLower<MT5>::value )
1558  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1559  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1560  :( IsLower<MT5>::value ? j : 0UL ) );
1561  const size_t kend( ( IsLower<MT4>::value )
1562  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
1563  :( K ) );
1564 
1565  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1566  size_t k( kbegin );
1567 
1568  for( ; (k+2UL) <= kend; k+=2UL ) {
1569  const SIMDType b1( B.load(k ,j) );
1570  const SIMDType b2( B.load(k+1UL,j) );
1571  xmm1 += set( A(i ,k ) ) * b1;
1572  xmm2 += set( A(i+1UL,k ) ) * b1;
1573  xmm3 += set( A(i+2UL,k ) ) * b1;
1574  xmm4 += set( A(i ,k+1UL) ) * b2;
1575  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
1576  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
1577  }
1578 
1579  for( ; k<kend; ++k ) {
1580  const SIMDType b1( B.load(k,j) );
1581  xmm1 += set( A(i ,k) ) * b1;
1582  xmm2 += set( A(i+1UL,k) ) * b1;
1583  xmm3 += set( A(i+2UL,k) ) * b1;
1584  }
1585 
1586  (~C).store( i , j, xmm1+xmm4 );
1587  (~C).store( i+1UL, j, xmm2+xmm5 );
1588  (~C).store( i+2UL, j, xmm3+xmm6 );
1589  }
1590 
1591  for( ; (i+2UL) <= iend; i+=2UL )
1592  {
1593  const size_t kbegin( ( IsUpper<MT4>::value )
1594  ?( ( IsLower<MT5>::value )
1595  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1596  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1597  :( IsLower<MT5>::value ? j : 0UL ) );
1598  const size_t kend( ( IsLower<MT4>::value )
1599  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1600  :( K ) );
1601 
1602  SIMDType xmm1, xmm2, xmm3, xmm4;
1603  size_t k( kbegin );
1604 
1605  for( ; (k+2UL) <= kend; k+=2UL ) {
1606  const SIMDType b1( B.load(k ,j) );
1607  const SIMDType b2( B.load(k+1UL,j) );
1608  xmm1 += set( A(i ,k ) ) * b1;
1609  xmm2 += set( A(i+1UL,k ) ) * b1;
1610  xmm3 += set( A(i ,k+1UL) ) * b2;
1611  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
1612  }
1613 
1614  for( ; k<kend; ++k ) {
1615  const SIMDType b1( B.load(k,j) );
1616  xmm1 += set( A(i ,k) ) * b1;
1617  xmm2 += set( A(i+1UL,k) ) * b1;
1618  }
1619 
1620  (~C).store( i , j, xmm1+xmm3 );
1621  (~C).store( i+1UL, j, xmm2+xmm4 );
1622  }
1623 
1624  if( i < iend )
1625  {
1626  const size_t kbegin( ( IsUpper<MT4>::value )
1627  ?( ( IsLower<MT5>::value )
1628  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1629  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1630  :( IsLower<MT5>::value ? j : 0UL ) );
1631 
1632  SIMDType xmm1, xmm2;
1633  size_t k( kbegin );
1634 
1635  for( ; (k+2UL) <= K; k+=2UL ) {
1636  xmm1 += set( A(i,k ) ) * B.load(k ,j);
1637  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
1638  }
1639 
1640  for( ; k<K; ++k ) {
1641  xmm1 += set( A(i,k) ) * B.load(k,j);
1642  }
1643 
1644  (~C).store( i, j, xmm1+xmm2 );
1645  }
1646  }
1647 
1648  for( ; remainder && j<N; ++j )
1649  {
1650  size_t i( LOW && UPP ? j : 0UL );
1651 
1652  for( ; (i+2UL) <= M; i+=2UL )
1653  {
1654  const size_t kbegin( ( IsUpper<MT4>::value )
1655  ?( ( IsLower<MT5>::value )
1656  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1657  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1658  :( IsLower<MT5>::value ? j : 0UL ) );
1659  const size_t kend( ( IsLower<MT4>::value )
1660  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1661  :( K ) );
1662 
1663  ElementType value1 = ElementType();
1664  ElementType value2 = ElementType();
1665 
1666  for( size_t k=kbegin; k<kend; ++k ) {
1667  value1 += A(i ,k) * B(k,j);
1668  value2 += A(i+1UL,k) * B(k,j);
1669  }
1670 
1671  (~C)(i ,j) = value1;
1672  (~C)(i+1UL,j) = value2;
1673  }
1674 
1675  if( i < M )
1676  {
1677  const size_t kbegin( ( IsUpper<MT4>::value )
1678  ?( ( IsLower<MT5>::value )
1679  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1680  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1681  :( IsLower<MT5>::value ? j : 0UL ) );
1682 
1683  ElementType value = ElementType();
1684 
1685  for( size_t k=kbegin; k<K; ++k ) {
1686  value += A(i,k) * B(k,j);
1687  }
1688 
1689  (~C)(i,j) = value;
1690  }
1691  }
1692  }
1693 
1694  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1695  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1696  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1697  for( size_t j=0UL; j<jend; ++j ) {
1698  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
1699  }
1700  }
1701  }
1702  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1703  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1704  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1705  for( size_t i=0UL; i<iend; ++i ) {
1706  reset( (~C)(i,j) );
1707  }
1708  }
1709  }
1710  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1711  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1712  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1713  for( size_t j=0UL; j<jend; ++j ) {
1714  reset( (~C)(i,j) );
1715  }
1716  }
1717  }
1718  }
1720  //**********************************************************************************************
1721 
1722  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
1737  template< typename MT3 // Type of the left-hand side target matrix
1738  , typename MT4 // Type of the left-hand side matrix operand
1739  , typename MT5 > // Type of the right-hand side matrix operand
1741  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1742  {
1743  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1744 
1745  const size_t M( A.rows() );
1746  const size_t N( B.columns() );
1747  const size_t K( A.columns() );
1748 
1749  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1750 
1751  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
1752  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1753 
1754  if( LOW && UPP && M > SIMDSIZE*3UL ) {
1755  reset( ~C );
1756  }
1757 
1758  {
1759  size_t i( 0UL );
1760 
1762  {
1763  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1764  for( size_t j=0UL; j<N; ++j )
1765  {
1766  const size_t kbegin( ( IsLower<MT5>::value )
1767  ?( ( IsUpper<MT4>::value )
1768  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1769  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1770  :( IsUpper<MT4>::value ? i : 0UL ) );
1771  const size_t kend( ( IsUpper<MT5>::value )
1772  ?( ( IsLower<MT4>::value )
1773  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1774  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1775  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
1776 
1777  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1778 
1779  for( size_t k=kbegin; k<kend; ++k ) {
1780  const SIMDType b1( set( B(k,j) ) );
1781  xmm1 += A.load(i ,k) * b1;
1782  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1783  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1784  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1785  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1786  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
1787  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
1788  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
1789  }
1790 
1791  (~C).store( i , j, xmm1 );
1792  (~C).store( i+SIMDSIZE , j, xmm2 );
1793  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1794  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1795  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1796  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1797  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1798  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
1799  }
1800  }
1801  }
1802 
1803  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
1804  {
1805  size_t j( 0UL );
1806 
1807  for( ; (j+2UL) <= N; j+=2UL )
1808  {
1809  const size_t kbegin( ( IsLower<MT5>::value )
1810  ?( ( IsUpper<MT4>::value )
1811  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1812  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1813  :( IsUpper<MT4>::value ? i : 0UL ) );
1814  const size_t kend( ( IsUpper<MT5>::value )
1815  ?( ( IsLower<MT4>::value )
1816  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1817  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1818  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
1819 
1820  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1821 
1822  for( size_t k=kbegin; k<kend; ++k ) {
1823  const SIMDType a1( A.load(i ,k) );
1824  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1825  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1826  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1827  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1828  const SIMDType b1( set( B(k,j ) ) );
1829  const SIMDType b2( set( B(k,j+1UL) ) );
1830  xmm1 += a1 * b1;
1831  xmm2 += a2 * b1;
1832  xmm3 += a3 * b1;
1833  xmm4 += a4 * b1;
1834  xmm5 += a5 * b1;
1835  xmm6 += a1 * b2;
1836  xmm7 += a2 * b2;
1837  xmm8 += a3 * b2;
1838  xmm9 += a4 * b2;
1839  xmm10 += a5 * b2;
1840  }
1841 
1842  (~C).store( i , j , xmm1 );
1843  (~C).store( i+SIMDSIZE , j , xmm2 );
1844  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1845  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1846  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1847  (~C).store( i , j+1UL, xmm6 );
1848  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1849  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1850  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1851  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1852  }
1853 
1854  if( j < N )
1855  {
1856  const size_t kbegin( ( IsLower<MT5>::value )
1857  ?( ( IsUpper<MT4>::value )
1858  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1859  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1860  :( IsUpper<MT4>::value ? i : 0UL ) );
1861  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
1862 
1863  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1864 
1865  for( size_t k=kbegin; k<kend; ++k ) {
1866  const SIMDType b1( set( B(k,j) ) );
1867  xmm1 += A.load(i ,k) * b1;
1868  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1869  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1870  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1871  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1872  }
1873 
1874  (~C).store( i , j, xmm1 );
1875  (~C).store( i+SIMDSIZE , j, xmm2 );
1876  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1877  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1878  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1879  }
1880  }
1881 
1882  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1883  {
1884  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
1885  size_t j( UPP ? i : 0UL );
1886 
1887  for( ; (j+2UL) <= jend; j+=2UL )
1888  {
1889  const size_t kbegin( ( IsLower<MT5>::value )
1890  ?( ( IsUpper<MT4>::value )
1891  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1892  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1893  :( IsUpper<MT4>::value ? i : 0UL ) );
1894  const size_t kend( ( IsUpper<MT5>::value )
1895  ?( ( IsLower<MT4>::value )
1896  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1897  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1898  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
1899 
1900  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1901 
1902  for( size_t k=kbegin; k<kend; ++k ) {
1903  const SIMDType a1( A.load(i ,k) );
1904  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1905  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1906  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1907  const SIMDType b1( set( B(k,j ) ) );
1908  const SIMDType b2( set( B(k,j+1UL) ) );
1909  xmm1 += a1 * b1;
1910  xmm2 += a2 * b1;
1911  xmm3 += a3 * b1;
1912  xmm4 += a4 * b1;
1913  xmm5 += a1 * b2;
1914  xmm6 += a2 * b2;
1915  xmm7 += a3 * b2;
1916  xmm8 += a4 * b2;
1917  }
1918 
1919  (~C).store( i , j , xmm1 );
1920  (~C).store( i+SIMDSIZE , j , xmm2 );
1921  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1922  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1923  (~C).store( i , j+1UL, xmm5 );
1924  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1925  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1926  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1927  }
1928 
1929  if( j < jend )
1930  {
1931  const size_t kbegin( ( IsLower<MT5>::value )
1932  ?( ( IsUpper<MT4>::value )
1933  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1934  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1935  :( IsUpper<MT4>::value ? i : 0UL ) );
1936  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
1937 
1938  SIMDType xmm1, xmm2, xmm3, xmm4;
1939 
1940  for( size_t k=kbegin; k<kend; ++k ) {
1941  const SIMDType b1( set( B(k,j) ) );
1942  xmm1 += A.load(i ,k) * b1;
1943  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1944  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1945  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1946  }
1947 
1948  (~C).store( i , j, xmm1 );
1949  (~C).store( i+SIMDSIZE , j, xmm2 );
1950  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1951  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1952  }
1953  }
1954 
1955  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1956  {
1957  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
1958  size_t j( UPP ? i : 0UL );
1959 
1960  for( ; (j+2UL) <= jend; j+=2UL )
1961  {
1962  const size_t kbegin( ( IsLower<MT5>::value )
1963  ?( ( IsUpper<MT4>::value )
1964  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1965  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1966  :( IsUpper<MT4>::value ? i : 0UL ) );
1967  const size_t kend( ( IsUpper<MT5>::value )
1968  ?( ( IsLower<MT4>::value )
1969  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1970  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1971  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
1972 
1973  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1974 
1975  for( size_t k=kbegin; k<kend; ++k ) {
1976  const SIMDType a1( A.load(i ,k) );
1977  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1978  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1979  const SIMDType b1( set( B(k,j ) ) );
1980  const SIMDType b2( set( B(k,j+1UL) ) );
1981  xmm1 += a1 * b1;
1982  xmm2 += a2 * b1;
1983  xmm3 += a3 * b1;
1984  xmm4 += a1 * b2;
1985  xmm5 += a2 * b2;
1986  xmm6 += a3 * b2;
1987  }
1988 
1989  (~C).store( i , j , xmm1 );
1990  (~C).store( i+SIMDSIZE , j , xmm2 );
1991  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1992  (~C).store( i , j+1UL, xmm4 );
1993  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1994  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1995  }
1996 
1997  if( j < jend )
1998  {
1999  const size_t kbegin( ( IsLower<MT5>::value )
2000  ?( ( IsUpper<MT4>::value )
2001  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2002  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2003  :( IsUpper<MT4>::value ? i : 0UL ) );
2004  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
2005 
2006  SIMDType xmm1, xmm2, xmm3;
2007 
2008  for( size_t k=kbegin; k<kend; ++k ) {
2009  const SIMDType b1( set( B(k,j) ) );
2010  xmm1 += A.load(i ,k) * b1;
2011  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2012  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2013  }
2014 
2015  (~C).store( i , j, xmm1 );
2016  (~C).store( i+SIMDSIZE , j, xmm2 );
2017  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2018  }
2019  }
2020 
2021  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2022  {
2023  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
2024  size_t j( UPP ? i : 0UL );
2025 
2026  for( ; (j+4UL) <= jend; j+=4UL )
2027  {
2028  const size_t kbegin( ( IsLower<MT5>::value )
2029  ?( ( IsUpper<MT4>::value )
2030  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2031  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2032  :( IsUpper<MT4>::value ? i : 0UL ) );
2033  const size_t kend( ( IsUpper<MT5>::value )
2034  ?( ( IsLower<MT4>::value )
2035  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
2036  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
2037  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2038 
2039  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2040 
2041  for( size_t k=kbegin; k<kend; ++k ) {
2042  const SIMDType a1( A.load(i ,k) );
2043  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2044  const SIMDType b1( set( B(k,j ) ) );
2045  const SIMDType b2( set( B(k,j+1UL) ) );
2046  const SIMDType b3( set( B(k,j+2UL) ) );
2047  const SIMDType b4( set( B(k,j+3UL) ) );
2048  xmm1 += a1 * b1;
2049  xmm2 += a2 * b1;
2050  xmm3 += a1 * b2;
2051  xmm4 += a2 * b2;
2052  xmm5 += a1 * b3;
2053  xmm6 += a2 * b3;
2054  xmm7 += a1 * b4;
2055  xmm8 += a2 * b4;
2056  }
2057 
2058  (~C).store( i , j , xmm1 );
2059  (~C).store( i+SIMDSIZE, j , xmm2 );
2060  (~C).store( i , j+1UL, xmm3 );
2061  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2062  (~C).store( i , j+2UL, xmm5 );
2063  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2064  (~C).store( i , j+3UL, xmm7 );
2065  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
2066  }
2067 
2068  for( ; (j+3UL) <= jend; j+=3UL )
2069  {
2070  const size_t kbegin( ( IsLower<MT5>::value )
2071  ?( ( IsUpper<MT4>::value )
2072  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2073  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2074  :( IsUpper<MT4>::value ? i : 0UL ) );
2075  const size_t kend( ( IsUpper<MT5>::value )
2076  ?( ( IsLower<MT4>::value )
2077  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
2078  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
2079  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2080 
2081  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2082 
2083  for( size_t k=kbegin; k<kend; ++k ) {
2084  const SIMDType a1( A.load(i ,k) );
2085  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2086  const SIMDType b1( set( B(k,j ) ) );
2087  const SIMDType b2( set( B(k,j+1UL) ) );
2088  const SIMDType b3( set( B(k,j+2UL) ) );
2089  xmm1 += a1 * b1;
2090  xmm2 += a2 * b1;
2091  xmm3 += a1 * b2;
2092  xmm4 += a2 * b2;
2093  xmm5 += a1 * b3;
2094  xmm6 += a2 * b3;
2095  }
2096 
2097  (~C).store( i , j , xmm1 );
2098  (~C).store( i+SIMDSIZE, j , xmm2 );
2099  (~C).store( i , j+1UL, xmm3 );
2100  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2101  (~C).store( i , j+2UL, xmm5 );
2102  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2103  }
2104 
2105  for( ; (j+2UL) <= jend; j+=2UL )
2106  {
2107  const size_t kbegin( ( IsLower<MT5>::value )
2108  ?( ( IsUpper<MT4>::value )
2109  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2110  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2111  :( IsUpper<MT4>::value ? i : 0UL ) );
2112  const size_t kend( ( IsUpper<MT5>::value )
2113  ?( ( IsLower<MT4>::value )
2114  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2115  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2116  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2117 
2118  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2119  size_t k( kbegin );
2120 
2121  for( ; (k+2UL) <= kend; k+=2UL ) {
2122  const SIMDType a1( A.load(i ,k ) );
2123  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2124  const SIMDType a3( A.load(i ,k+1UL) );
2125  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2126  const SIMDType b1( set( B(k ,j ) ) );
2127  const SIMDType b2( set( B(k ,j+1UL) ) );
2128  const SIMDType b3( set( B(k+1UL,j ) ) );
2129  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
2130  xmm1 += a1 * b1;
2131  xmm2 += a2 * b1;
2132  xmm3 += a1 * b2;
2133  xmm4 += a2 * b2;
2134  xmm5 += a3 * b3;
2135  xmm6 += a4 * b3;
2136  xmm7 += a3 * b4;
2137  xmm8 += a4 * b4;
2138  }
2139 
2140  for( ; k<kend; ++k ) {
2141  const SIMDType a1( A.load(i ,k) );
2142  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2143  const SIMDType b1( set( B(k,j ) ) );
2144  const SIMDType b2( set( B(k,j+1UL) ) );
2145  xmm1 += a1 * b1;
2146  xmm2 += a2 * b1;
2147  xmm3 += a1 * b2;
2148  xmm4 += a2 * b2;
2149  }
2150 
2151  (~C).store( i , j , xmm1+xmm5 );
2152  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
2153  (~C).store( i , j+1UL, xmm3+xmm7 );
2154  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2155  }
2156 
2157  if( j < jend )
2158  {
2159  const size_t kbegin( ( IsLower<MT5>::value )
2160  ?( ( IsUpper<MT4>::value )
2161  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2162  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2163  :( IsUpper<MT4>::value ? i : 0UL ) );
2164  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2165 
2166  SIMDType xmm1, xmm2, xmm3, xmm4;
2167  size_t k( kbegin );
2168 
2169  for( ; (k+2UL) <= kend; k+=2UL ) {
2170  const SIMDType b1( set( B(k ,j) ) );
2171  const SIMDType b2( set( B(k+1UL,j) ) );
2172  xmm1 += A.load(i ,k ) * b1;
2173  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2174  xmm3 += A.load(i ,k+1UL) * b2;
2175  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2176  }
2177 
2178  for( ; k<kend; ++k ) {
2179  const SIMDType b1( set( B(k,j) ) );
2180  xmm1 += A.load(i ,k) * b1;
2181  xmm2 += A.load(i+SIMDSIZE,k) * b1;
2182  }
2183 
2184  (~C).store( i , j, xmm1+xmm3 );
2185  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
2186  }
2187  }
2188 
2189  for( ; i<ipos; i+=SIMDSIZE )
2190  {
2191  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
2192  size_t j( UPP ? i : 0UL );
2193 
2194  for( ; (j+4UL) <= jend; j+=4UL )
2195  {
2196  const size_t kbegin( ( IsLower<MT5>::value )
2197  ?( ( IsUpper<MT4>::value )
2198  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2199  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2200  :( IsUpper<MT4>::value ? i : 0UL ) );
2201  const size_t kend( ( IsUpper<MT5>::value )
2202  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
2203  :( K ) );
2204 
2205  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2206  size_t k( kbegin );
2207 
2208  for( ; (k+2UL) <= kend; k+=2UL ) {
2209  const SIMDType a1( A.load(i,k ) );
2210  const SIMDType a2( A.load(i,k+1UL) );
2211  xmm1 += a1 * set( B(k ,j ) );
2212  xmm2 += a1 * set( B(k ,j+1UL) );
2213  xmm3 += a1 * set( B(k ,j+2UL) );
2214  xmm4 += a1 * set( B(k ,j+3UL) );
2215  xmm5 += a2 * set( B(k+1UL,j ) );
2216  xmm6 += a2 * set( B(k+1UL,j+1UL) );
2217  xmm7 += a2 * set( B(k+1UL,j+2UL) );
2218  xmm8 += a2 * set( B(k+1UL,j+3UL) );
2219  }
2220 
2221  for( ; k<kend; ++k ) {
2222  const SIMDType a1( A.load(i,k) );
2223  xmm1 += a1 * set( B(k,j ) );
2224  xmm2 += a1 * set( B(k,j+1UL) );
2225  xmm3 += a1 * set( B(k,j+2UL) );
2226  xmm4 += a1 * set( B(k,j+3UL) );
2227  }
2228 
2229  (~C).store( i, j , xmm1+xmm5 );
2230  (~C).store( i, j+1UL, xmm2+xmm6 );
2231  (~C).store( i, j+2UL, xmm3+xmm7 );
2232  (~C).store( i, j+3UL, xmm4+xmm8 );
2233  }
2234 
2235  for( ; (j+3UL) <= jend; j+=3UL )
2236  {
2237  const size_t kbegin( ( IsLower<MT5>::value )
2238  ?( ( IsUpper<MT4>::value )
2239  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2240  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2241  :( IsUpper<MT4>::value ? i : 0UL ) );
2242  const size_t kend( ( IsUpper<MT5>::value )
2243  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
2244  :( K ) );
2245 
2246  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2247  size_t k( kbegin );
2248 
2249  for( ; (k+2UL) <= kend; k+=2UL ) {
2250  const SIMDType a1( A.load(i,k ) );
2251  const SIMDType a2( A.load(i,k+1UL) );
2252  xmm1 += a1 * set( B(k ,j ) );
2253  xmm2 += a1 * set( B(k ,j+1UL) );
2254  xmm3 += a1 * set( B(k ,j+2UL) );
2255  xmm4 += a2 * set( B(k+1UL,j ) );
2256  xmm5 += a2 * set( B(k+1UL,j+1UL) );
2257  xmm6 += a2 * set( B(k+1UL,j+2UL) );
2258  }
2259 
2260  for( ; k<kend; ++k ) {
2261  const SIMDType a1( A.load(i,k) );
2262  xmm1 += a1 * set( B(k,j ) );
2263  xmm2 += a1 * set( B(k,j+1UL) );
2264  xmm3 += a1 * set( B(k,j+2UL) );
2265  }
2266 
2267  (~C).store( i, j , xmm1+xmm4 );
2268  (~C).store( i, j+1UL, xmm2+xmm5 );
2269  (~C).store( i, j+2UL, xmm3+xmm6 );
2270  }
2271 
2272  for( ; (j+2UL) <= jend; j+=2UL )
2273  {
2274  const size_t kbegin( ( IsLower<MT5>::value )
2275  ?( ( IsUpper<MT4>::value )
2276  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2277  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2278  :( IsUpper<MT4>::value ? i : 0UL ) );
2279  const size_t kend( ( IsUpper<MT5>::value )
2280  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2281  :( K ) );
2282 
2283  SIMDType xmm1, xmm2, xmm3, xmm4;
2284  size_t k( kbegin );
2285 
2286  for( ; (k+2UL) <= kend; k+=2UL ) {
2287  const SIMDType a1( A.load(i,k ) );
2288  const SIMDType a2( A.load(i,k+1UL) );
2289  xmm1 += a1 * set( B(k ,j ) );
2290  xmm2 += a1 * set( B(k ,j+1UL) );
2291  xmm3 += a2 * set( B(k+1UL,j ) );
2292  xmm4 += a2 * set( B(k+1UL,j+1UL) );
2293  }
2294 
2295  for( ; k<kend; ++k ) {
2296  const SIMDType a1( A.load(i,k) );
2297  xmm1 += a1 * set( B(k,j ) );
2298  xmm2 += a1 * set( B(k,j+1UL) );
2299  }
2300 
2301  (~C).store( i, j , xmm1+xmm3 );
2302  (~C).store( i, j+1UL, xmm2+xmm4 );
2303  }
2304 
2305  if( j < jend )
2306  {
2307  const size_t kbegin( ( IsLower<MT5>::value )
2308  ?( ( IsUpper<MT4>::value )
2309  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2310  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2311  :( IsUpper<MT4>::value ? i : 0UL ) );
2312 
2313  SIMDType xmm1, xmm2;
2314  size_t k( kbegin );
2315 
2316  for( ; (k+2UL) <= K; k+=2UL ) {
2317  xmm1 += A.load(i,k ) * set( B(k ,j) );
2318  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
2319  }
2320 
2321  for( ; k<K; ++k ) {
2322  xmm1 += A.load(i,k) * set( B(k,j) );
2323  }
2324 
2325  (~C).store( i, j, xmm1+xmm2 );
2326  }
2327  }
2328 
2329  for( ; remainder && i<M; ++i )
2330  {
2331  size_t j( LOW && UPP ? i : 0UL );
2332 
2333  for( ; (j+2UL) <= N; j+=2UL )
2334  {
2335  const size_t kbegin( ( IsLower<MT5>::value )
2336  ?( ( IsUpper<MT4>::value )
2337  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2338  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2339  :( IsUpper<MT4>::value ? i : 0UL ) );
2340  const size_t kend( ( IsUpper<MT5>::value )
2341  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2342  :( K ) );
2343 
2344  ElementType value1 = ElementType();
2345  ElementType value2 = ElementType();
2346 
2347  for( size_t k=kbegin; k<kend; ++k ) {
2348  value1 += A(i,k) * B(k,j );
2349  value2 += A(i,k) * B(k,j+1UL);
2350  }
2351 
2352  (~C)(i,j ) = value1;
2353  (~C)(i,j+1UL) = value2;
2354  }
2355 
2356  if( j < N )
2357  {
2358  const size_t kbegin( ( IsLower<MT5>::value )
2359  ?( ( IsUpper<MT4>::value )
2360  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2361  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2362  :( IsUpper<MT4>::value ? i : 0UL ) );
2363 
2364  ElementType value = ElementType();
2365 
2366  for( size_t k=kbegin; k<K; ++k ) {
2367  value += A(i,k) * B(k,j);
2368  }
2369 
2370  (~C)(i,j) = value;
2371  }
2372  }
2373  }
2374 
2375  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
2376  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
2377  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
2378  for( size_t i=0UL; i<iend; ++i ) {
2379  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
2380  }
2381  }
2382  }
2383  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
2384  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
2385  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
2386  for( size_t i=0UL; i<iend; ++i ) {
2387  reset( (~C)(i,j) );
2388  }
2389  }
2390  }
2391  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
2392  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
2393  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
2394  for( size_t j=0UL; j<jend; ++j ) {
2395  reset( (~C)(i,j) );
2396  }
2397  }
2398  }
2399  }
2401  //**********************************************************************************************
2402 
2403  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel, "default" variant (per the section banner): it performs no
// work of its own and forwards C, A and B unchanged to selectDefaultAssignKernel().
// NOTE(review): the line carrying the return type / overload condition (upstream line 2420) is
// not present in this listing, so what selects this overload over the vectorized one cannot be
// confirmed from here.
2417  template< typename MT3 // Type of the left-hand side target matrix
2418  , typename MT4 // Type of the left-hand side matrix operand
2419  , typename MT5 > // Type of the right-hand side matrix operand
2421  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2422  {
      // Plain delegation to the default kernel.
2423  selectDefaultAssignKernel( C, A, B );
2424  }
2426  //**********************************************************************************************
2427 
2428  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Large-matrix assignment kernel, "vectorized default" variant (per the section banner).
// Picks exactly one of the smmm/hmmm/lmmm/ummm/mmm helpers based on the SYM, HERM, LOW and UPP
// flags, checked in that order, so SYM wins over HERM, HERM over LOW, and LOW over UPP.
// NOTE(review): the helpers are presumably the ones from <blaze/math/dense/MMM.h>, and the
// scalar arguments presumably are the product scale factor (1) and the target scale factor (0)
// -- confirm against that header. The return type / overload condition line (upstream line
// 2446) is not present in this listing.
2443  template< typename MT3 // Type of the left-hand side target matrix
2444  , typename MT4 // Type of the left-hand side matrix operand
2445  , typename MT5 > // Type of the right-hand side matrix operand
2447  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2448  {
2449  if( SYM )
2450  smmm( C, A, B, ElementType(1) );
2451  else if( HERM )
2452  hmmm( C, A, B, ElementType(1) );
2453  else if( LOW )
2454  lmmm( C, A, B, ElementType(1), ElementType(0) );
2455  else if( UPP )
2456  ummm( C, A, B, ElementType(1), ElementType(0) );
2457  else
      // No structural flag is set: general product helper.
2458  mmm( C, A, B, ElementType(1), ElementType(0) );
2459  }
2461  //**********************************************************************************************
2462 
2463  //**BLAS-based assignment to dense matrices (default)*******************************************
// BLAS assignment kernel, "default" variant (per the section banner): it does not call any BLAS
// routine and simply forwards C, A and B to selectLargeAssignKernel().
// NOTE(review): the return type / overload condition line (upstream line 2480) is not present
// in this listing, so the condition under which this fallback is chosen cannot be confirmed here.
2477  template< typename MT3 // Type of the left-hand side target matrix
2478  , typename MT4 // Type of the left-hand side matrix operand
2479  , typename MT5 > // Type of the right-hand side matrix operand
2481  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2482  {
      // Plain delegation to the large-matrix kernel.
2483  selectLargeAssignKernel( C, A, B );
2484  }
2486  //**********************************************************************************************
2487 
2488  //**BLAS-based assignment to dense matrices*****************************************************
2489 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2490 
// BLAS-backed assignment kernel. It is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero (see the enclosing #if).
// Three cases, checked in order:
//   1. MT4 (type of A) is triangular: C is first assigned B, then trmm() is called with A on
//      the CblasLeft side.
//   2. otherwise MT5 (type of B) is triangular: C is first assigned A, then trmm() is called
//      with B on the CblasRight side.
//   3. otherwise: gemm() is called with scalar arguments ET(1) and ET(0).
// NOTE(review): trmm() presumably multiplies C in place by the triangular operand, and gemm()'s
// scalars presumably are alpha=1 / beta=0 -- confirm against the wrappers in
// <blaze/math/blas/trmm.h> and <blaze/math/blas/gemm.h>. The return type / overload condition
// line (upstream line 2506) is not present in this listing.
2503  template< typename MT3 // Type of the left-hand side target matrix
2504  , typename MT4 // Type of the left-hand side matrix operand
2505  , typename MT5 > // Type of the right-hand side matrix operand
2507  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2508  {
      // Scalars handed to the BLAS wrappers use the element type of the target matrix.
2509  using ET = ElementType_<MT3>;
2510 
2511  if( IsTriangular<MT4>::value ) {
      // Seed C with B, then apply the triangular A from the left; the lower/upper flag
      // follows IsLower<MT4>.
2512  assign( C, B );
2513  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2514  }
2515  else if( IsTriangular<MT5>::value ) {
      // Seed C with A, then apply the triangular B from the right; the lower/upper flag
      // follows IsLower<MT5>.
2516  assign( C, A );
2517  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2518  }
2519  else {
      // Neither operand type is triangular: general matrix-matrix routine.
2520  gemm( C, A, B, ET(1), ET(0) );
2521  }
2522  }
2524 #endif
2525  //**********************************************************************************************
2526 
2527  //**Assignment to sparse matrices***************************************************************
// Assignment of the multiplication expression to a sparse matrix.
// The expression is first evaluated into a temporary of type TmpType, the temporary is passed
// through a ForwardFunctor instance, and the result is assigned to the sparse target.
// \param lhs  The target sparse matrix; its dimensions must already match those of rhs (this is
//             only checked through internal assertions).
// \param rhs  The multiplication expression to be assigned.
// NOTE(review): several upstream lines (2544, 2546, 2548-2553) are not present in this listing;
// TmpType and ForwardFunctor are declared outside this view, so their exact definitions cannot
// be confirmed here.
2540  template< typename MT // Type of the target sparse matrix
2541  , bool SO > // Storage order of the target sparse matrix
2542  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2543  {
2545 
2547 
2554 
      // Sanity checks: the target must have the same shape as the expression.
2555  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2556  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2557 
2558  const ForwardFunctor fwd;
2559 
      // Evaluate the expression into the temporary, then assign the forwarded temporary.
      // NOTE(review): serial() presumably forces a non-parallel evaluation -- confirm.
2560  const TmpType tmp( serial( rhs ) );
2561  assign( ~lhs, fwd( tmp ) );
2562  }
2564  //**********************************************************************************************
2565 
2566  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the multiplication expression to a dense matrix.
// Evaluates both operands of the expression into local objects A and B and hands them, together
// with the target, to selectAddAssignKernel(). Returns without touching the target when the
// target has no rows or no columns, or when the left-hand operand has no columns.
// \param lhs  The target dense matrix; its dimensions must already match those of rhs (checked
//             only through internal assertions).
// \param rhs  The multiplication expression to be added.
// NOTE(review): upstream line 2583 is not present in this listing.
2579  template< typename MT // Type of the target dense matrix
2580  , bool SO > // Storage order of the target dense matrix
2581  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2582  {
2584 
2585  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2586  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2587 
      // Early exit for empty problems: any zero dimension means there is nothing to accumulate.
2588  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2589  return;
2590  }
2591 
2592  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2593  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2594 
      // The evaluated operands must keep the shapes of the original operands and must be
      // compatible with the target.
2595  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2596  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2597  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2598  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2599  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2600  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2601 
2602  TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
2603  }
2605  //**********************************************************************************************
2606 
2607  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment: chooses between selectSmallAddAssignKernel()
// and selectBlasAddAssignKernel().
// NOTE(review): the opening of the condition (upstream line 2623, which must contain the `if(`
// and possibly a further sub-condition) is not present in this listing. Only the sub-conditions
// visible below are documented.
2618  template< typename MT3 // Type of the left-hand side target matrix
2619  , typename MT4 // Type of the left-hand side matrix operand
2620  , typename MT5 > // Type of the right-hand side matrix operand
2621  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2622  {
      // Visible sub-conditions that lead to the small kernel:
      //  - outside debug mode, a row-major target with at most SIMDSIZE*10 columns in B;
      //  - outside debug mode, a column-major target with at most SIMDSIZE*10 rows in A;
      //  - a target with fewer than TDMATDMATMULT_THRESHOLD elements.
2624  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
2625  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
2626  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2627  selectSmallAddAssignKernel( C, A, B );
2628  else
2629  selectBlasAddAssignKernel( C, A, B );
2630  }
2632  //**********************************************************************************************
2633 
2634  //**Default addition assignment to row-major dense matrices (general/general)*******************
// Default addition-assignment kernel for a DenseMatrix<MT3,false> target (row-major per the
// section banner). Enabled only when neither MT4 nor MT5 is a diagonal matrix type.
// Accumulates C(i,j) += A(i,k) * B(k,j) with loop order i, k, j. The k range is narrowed by the
// upper/lower traits of MT4, and the j range by the upper/lower traits of MT5 together with the
// LOW/UPP flags. The innermost loop is unrolled by two.
// NOTE(review): upstream lines 2673 and 2678 (the inner conditions of the jbegin/jend
// ternaries) are not present in this listing.
2648  template< typename MT3 // Type of the left-hand side target matrix
2649  , typename MT4 // Type of the left-hand side matrix operand
2650  , typename MT5 > // Type of the right-hand side matrix operand
2651  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2652  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2653  {
2654  const size_t M( A.rows() );
2655  const size_t N( B.columns() );
2656  const size_t K( A.columns() );
2657 
      // With LOW or UPP set, the result is required to be square.
2658  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2659 
2660  for( size_t i=0UL; i<M; ++i )
2661  {
      // k range for row i of A: starts at i (i+1 if strictly upper) for upper MT4, ends at
      // i+1 (i if strictly lower) for lower MT4; otherwise the full range [0,K).
2662  const size_t kbegin( ( IsUpper<MT4>::value )
2663  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2664  :( 0UL ) );
2665  const size_t kend( ( IsLower<MT4>::value )
2666  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2667  :( K ) );
2668  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2669 
2670  for( size_t k=kbegin; k<kend; ++k )
2671  {
      // j range for row k of B, additionally clipped to j >= i when UPP is set and to
      // j <= i when LOW is set.
2672  const size_t jbegin( ( IsUpper<MT5>::value )
2674  ?( UPP ? max(i,k+1UL) : k+1UL )
2675  :( UPP ? max(i,k) : k ) )
2676  :( UPP ? i : 0UL ) );
2677  const size_t jend( ( IsLower<MT5>::value )
2679  ?( LOW ? min(i+1UL,k) : k )
2680  :( LOW ? min(i,k)+1UL : k+1UL ) )
2681  :( LOW ? i+1UL : N ) );
2682 
      // The clipping can produce an empty (or inverted) range; skip it before computing
      // the unsigned difference below.
2683  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
2684  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2685 
      // jnum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even
      // number; jpos marks the end of the two-at-a-time part.
2686  const size_t jnum( jend - jbegin );
2687  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2688 
2689  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2690  (~C)(i,j ) += A(i,k) * B(k,j );
2691  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2692  }
      // Odd count: one element is left over.
2693  if( jpos < jend ) {
2694  (~C)(i,jpos) += A(i,k) * B(k,jpos);
2695  }
2696  }
2697  }
2698  }
2700  //**********************************************************************************************
2701 
2702  //**Default addition assignment to column-major dense matrices (general/general)****************
// Default addition-assignment kernel for a DenseMatrix<MT3,true> target (column-major per the
// section banner). Enabled only when neither MT4 nor MT5 is a diagonal matrix type.
// Accumulates C(i,j) += A(i,k) * B(k,j) with loop order j, k, i. The k range is narrowed by the
// lower/upper traits of MT5, and the i range by the lower/upper traits of MT4 together with the
// LOW/UPP flags. The innermost loop is unrolled by two.
// NOTE(review): upstream lines 2741 and 2746 (the inner conditions of the ibegin/iend
// ternaries) are not present in this listing.
2716  template< typename MT3 // Type of the left-hand side target matrix
2717  , typename MT4 // Type of the left-hand side matrix operand
2718  , typename MT5 > // Type of the right-hand side matrix operand
2719  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2720  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2721  {
2722  const size_t M( A.rows() );
2723  const size_t N( B.columns() );
2724  const size_t K( A.columns() );
2725 
      // With LOW or UPP set, the result is required to be square.
2726  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2727 
2728  for( size_t j=0UL; j<N; ++j )
2729  {
      // k range for column j of B: starts at j (j+1 if strictly lower) for lower MT5, ends
      // at j+1 (j if strictly upper) for upper MT5; otherwise the full range [0,K).
2730  const size_t kbegin( ( IsLower<MT5>::value )
2731  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2732  :( 0UL ) );
2733  const size_t kend( ( IsUpper<MT5>::value )
2734  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2735  :( K ) );
2736  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2737 
2738  for( size_t k=kbegin; k<kend; ++k )
2739  {
      // i range for column k of A, additionally clipped to i >= j when LOW is set and to
      // i <= j when UPP is set.
2740  const size_t ibegin( ( IsLower<MT4>::value )
2742  ?( LOW ? max(j,k+1UL) : k+1UL )
2743  :( LOW ? max(j,k) : k ) )
2744  :( LOW ? j : 0UL ) );
2745  const size_t iend( ( IsUpper<MT4>::value )
2747  ?( UPP ? min(j+1UL,k) : k )
2748  :( UPP ? min(j,k)+1UL : k+1UL ) )
2749  :( UPP ? j+1UL : M ) );
2750 
      // The clipping can produce an empty (or inverted) range; skip it before computing
      // the unsigned difference below.
2751  if( ( LOW || UPP ) && ibegin >= iend ) continue;
2752  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2753 
      // inum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even
      // number; ipos marks the end of the two-at-a-time part.
2754  const size_t inum( iend - ibegin );
2755  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2756 
2757  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2758  (~C)(i ,j) += A(i ,k) * B(k,j);
2759  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2760  }
      // Odd count: one element is left over.
2761  if( ipos < iend ) {
2762  (~C)(ipos,j) += A(ipos,k) * B(k,j);
2763  }
2764  }
2765  }
2766  }
2768  //**********************************************************************************************
2769 
2770  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
// Default addition-assignment kernel for a DenseMatrix<MT3,false> target (row-major per the
// section banner). Enabled when MT4 is not diagonal and MT5 is diagonal.
// Only the diagonal element B(j,j) is read from B, so the update is
// C(i,j) += A(i,j) * B(j,j). The (i,j) index space is traversed in square tiles of
// BLOCK_SIZE x BLOCK_SIZE, and within each tile the j range is narrowed by the upper/lower
// traits of MT4.
2784  template< typename MT3 // Type of the left-hand side target matrix
2785  , typename MT4 // Type of the left-hand side matrix operand
2786  , typename MT5 > // Type of the right-hand side matrix operand
2787  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2788  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2789  {
2790  constexpr size_t block( BLOCK_SIZE );
2791 
2792  const size_t M( A.rows() );
2793  const size_t N( B.columns() );
2794 
      // ii/jj are the tile origins; iend/jend are the tile ends clipped to the matrix bounds.
2795  for( size_t ii=0UL; ii<M; ii+=block ) {
2796  const size_t iend( min( M, ii+block ) );
2797  for( size_t jj=0UL; jj<N; jj+=block ) {
2798  const size_t jend( min( N, jj+block ) );
2799  for( size_t i=ii; i<iend; ++i )
2800  {
      // Column range of row i inside the current tile: for upper MT4 it starts no earlier
      // than i (i+1 if strictly upper), for lower MT4 it ends no later than i+1 (i if
      // strictly lower). The range may be empty, in which case the loop below does nothing.
2801  const size_t jbegin( ( IsUpper<MT4>::value )
2802  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
2803  :( jj ) );
2804  const size_t jpos( ( IsLower<MT4>::value )
2805  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
2806  :( jend ) );
2807 
2808  for( size_t j=jbegin; j<jpos; ++j ) {
2809  (~C)(i,j) += A(i,j) * B(j,j);
2810  }
2811  }
2812  }
2813  }
2814  }
2816  //**********************************************************************************************
2817 
2818  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
// Default addition-assignment kernel for a DenseMatrix<MT3,true> target (column-major per the
// section banner). Enabled when MT4 is not diagonal and MT5 is diagonal.
// Only the diagonal element B(j,j) is read from B, so each column j of C is updated as
// C(i,j) += A(i,j) * B(j,j). The i range is narrowed by the lower/upper traits of MT4 and the
// loop over i is unrolled by two.
2832  template< typename MT3 // Type of the left-hand side target matrix
2833  , typename MT4 // Type of the left-hand side matrix operand
2834  , typename MT5 > // Type of the right-hand side matrix operand
2835  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2836  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2837  {
2838  const size_t M( A.rows() );
2839  const size_t N( B.columns() );
2840 
2841  for( size_t j=0UL; j<N; ++j )
2842  {
      // Row range for column j of A: starts at j (j+1 if strictly lower) for lower MT4,
      // ends at j+1 (j if strictly upper) for upper MT4; otherwise the full range [0,M).
2843  const size_t ibegin( ( IsLower<MT4>::value )
2844  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2845  :( 0UL ) );
2846  const size_t iend( ( IsUpper<MT4>::value )
2847  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
2848  :( M ) );
2849  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2850 
      // inum & size_t(-2) rounds the count down to an even number; ipos marks the end of
      // the two-at-a-time part.
2851  const size_t inum( iend - ibegin );
2852  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2853 
2854  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2855  (~C)(i ,j) += A(i ,j) * B(j,j);
2856  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
2857  }
      // Odd count: one element is left over.
2858  if( ipos < iend ) {
2859  (~C)(ipos,j) += A(ipos,j) * B(j,j);
2860  }
2861  }
2862  }
2864  //**********************************************************************************************
2865 
2866  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
// Default addition-assignment kernel for a DenseMatrix<MT3,false> target (row-major per the
// section banner). Enabled when MT4 is diagonal and MT5 is not diagonal.
// Only the diagonal element A(i,i) is read from A, so each row i of C is updated as
// C(i,j) += A(i,i) * B(i,j). The j range is narrowed by the upper/lower traits of MT5 and the
// loop over j is unrolled by two.
2880  template< typename MT3 // Type of the left-hand side target matrix
2881  , typename MT4 // Type of the left-hand side matrix operand
2882  , typename MT5 > // Type of the right-hand side matrix operand
2883  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2884  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2885  {
2886  const size_t M( A.rows() );
2887  const size_t N( B.columns() );
2888 
2889  for( size_t i=0UL; i<M; ++i )
2890  {
      // Column range for row i of B: starts at i (i+1 if strictly upper) for upper MT5,
      // ends at i+1 (i if strictly lower) for lower MT5; otherwise the full range [0,N).
2891  const size_t jbegin( ( IsUpper<MT5>::value )
2892  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2893  :( 0UL ) );
2894  const size_t jend( ( IsLower<MT5>::value )
2895  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
2896  :( N ) );
2897  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2898 
      // jnum & size_t(-2) rounds the count down to an even number; jpos marks the end of
      // the two-at-a-time part.
2899  const size_t jnum( jend - jbegin );
2900  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2901 
2902  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2903  (~C)(i,j ) += A(i,i) * B(i,j );
2904  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
2905  }
      // Odd count: one element is left over.
2906  if( jpos < jend ) {
2907  (~C)(i,jpos) += A(i,i) * B(i,jpos);
2908  }
2909  }
2910  }
2912  //**********************************************************************************************
2913 
2914  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
// Default addition-assignment kernel for a DenseMatrix<MT3,true> target (column-major per the
// section banner). Enabled when MT4 is diagonal and MT5 is not diagonal.
// Only the diagonal element A(i,i) is read from A, so the update is
// C(i,j) += A(i,i) * B(i,j). The (i,j) index space is traversed in square tiles of
// BLOCK_SIZE x BLOCK_SIZE (columns outermost), and within each tile the i range is narrowed by
// the lower/upper traits of MT5.
2928  template< typename MT3 // Type of the left-hand side target matrix
2929  , typename MT4 // Type of the left-hand side matrix operand
2930  , typename MT5 > // Type of the right-hand side matrix operand
2931  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2932  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2933  {
2934  constexpr size_t block( BLOCK_SIZE );
2935 
2936  const size_t M( A.rows() );
2937  const size_t N( B.columns() );
2938 
      // jj/ii are the tile origins; jend/iend are the tile ends clipped to the matrix bounds.
2939  for( size_t jj=0UL; jj<N; jj+=block ) {
2940  const size_t jend( min( N, jj+block ) );
2941  for( size_t ii=0UL; ii<M; ii+=block ) {
2942  const size_t iend( min( M, ii+block ) );
2943  for( size_t j=jj; j<jend; ++j )
2944  {
      // Row range of column j inside the current tile: for lower MT5 it starts no earlier
      // than j (j+1 if strictly lower), for upper MT5 it ends no later than j+1 (j if
      // strictly upper). The range may be empty, in which case the loop below does nothing.
2945  const size_t ibegin( ( IsLower<MT5>::value )
2946  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
2947  :( ii ) );
2948  const size_t ipos( ( IsUpper<MT5>::value )
2949  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
2950  :( iend ) );
2951 
2952  for( size_t i=ibegin; i<ipos; ++i ) {
2953  (~C)(i,j) += A(i,i) * B(i,j);
2954  }
2955  }
2956  }
2957  }
2958  }
2960  //**********************************************************************************************
2961 
2962  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default addition-assignment kernel for the case that both MT4 and MT5 are diagonal matrix
// types. Only diagonal elements are touched: C(i,i) += A(i,i) * B(i,i) for every row index i
// of A. Unlike the sibling overloads, the target is taken as a plain MT3& and is indexed
// directly, so the same overload serves either storage order.
2976  template< typename MT3 // Type of the left-hand side target matrix
2977  , typename MT4 // Type of the left-hand side matrix operand
2978  , typename MT5 > // Type of the right-hand side matrix operand
2979  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2980  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2981  {
2982  for( size_t i=0UL; i<A.rows(); ++i ) {
2983  C(i,i) += A(i,i) * B(i,i);
2984  }
2985  }
2987  //**********************************************************************************************
2988 
2989  //**Default addition assignment to dense matrices (small matrices)******************************
// Small-matrix addition-assignment kernel, "default" variant (per the section banner): it
// performs no work of its own and forwards C, A and B to selectDefaultAddAssignKernel().
// NOTE(review): the return type / overload condition line (upstream line 3006) is not present
// in this listing, so what selects this overload over the vectorized ones cannot be confirmed
// from here.
3003  template< typename MT3 // Type of the left-hand side target matrix
3004  , typename MT4 // Type of the left-hand side matrix operand
3005  , typename MT5 > // Type of the right-hand side matrix operand
3007  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3008  {
      // Plain delegation to the default kernel.
3009  selectDefaultAddAssignKernel( C, A, B );
3010  }
3012  //**********************************************************************************************
3013 
3014  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
3029  template< typename MT3 // Type of the left-hand side target matrix
3030  , typename MT4 // Type of the left-hand side matrix operand
3031  , typename MT5 > // Type of the right-hand side matrix operand
3033  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3034  {
3035  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
3036 
3037  const size_t M( A.rows() );
3038  const size_t N( B.columns() );
3039  const size_t K( A.columns() );
3040 
3041  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3042 
3043  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3044  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3045 
3046  size_t j( 0UL );
3047 
3049  {
3050  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3051  for( size_t i=0UL; i<M; ++i )
3052  {
3053  const size_t kbegin( ( IsUpper<MT4>::value )
3054  ?( ( IsLower<MT5>::value )
3055  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3056  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3057  :( IsLower<MT5>::value ? j : 0UL ) );
3058  const size_t kend( ( IsLower<MT4>::value )
3059  ?( ( IsUpper<MT5>::value )
3060  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
3061  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
3062  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
3063 
3064  SIMDType xmm1( (~C).load(i,j ) );
3065  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3066  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3067  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3068  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3069  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
3070  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
3071  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
3072 
3073  for( size_t k=kbegin; k<kend; ++k ) {
3074  const SIMDType a1( set( A(i,k) ) );
3075  xmm1 += a1 * B.load(k,j );
3076  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3077  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3078  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3079  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3080  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
3081  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
3082  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
3083  }
3084 
3085  (~C).store( i, j , xmm1 );
3086  (~C).store( i, j+SIMDSIZE , xmm2 );
3087  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3088  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3089  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
3090  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
3091  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
3092  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
3093  }
3094  }
3095  }
3096 
3097  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3098  {
3099  size_t i( 0UL );
3100 
3101  for( ; (i+2UL) <= M; i+=2UL )
3102  {
3103  const size_t kbegin( ( IsUpper<MT4>::value )
3104  ?( ( IsLower<MT5>::value )
3105  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3106  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3107  :( IsLower<MT5>::value ? j : 0UL ) );
3108  const size_t kend( ( IsLower<MT4>::value )
3109  ?( ( IsUpper<MT5>::value )
3110  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
3111  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3112  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
3113 
3114  SIMDType xmm1 ( (~C).load(i ,j ) );
3115  SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
3116  SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
3117  SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
3118  SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
3119  SIMDType xmm6 ( (~C).load(i+1UL,j ) );
3120  SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
3121  SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3122  SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3123  SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
3124 
3125  for( size_t k=kbegin; k<kend; ++k ) {
3126  const SIMDType a1( set( A(i ,k) ) );
3127  const SIMDType a2( set( A(i+1UL,k) ) );
3128  const SIMDType b1( B.load(k,j ) );
3129  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3130  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3131  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3132  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3133  xmm1 += a1 * b1;
3134  xmm2 += a1 * b2;
3135  xmm3 += a1 * b3;
3136  xmm4 += a1 * b4;
3137  xmm5 += a1 * b5;
3138  xmm6 += a2 * b1;
3139  xmm7 += a2 * b2;
3140  xmm8 += a2 * b3;
3141  xmm9 += a2 * b4;
3142  xmm10 += a2 * b5;
3143  }
3144 
3145  (~C).store( i , j , xmm1 );
3146  (~C).store( i , j+SIMDSIZE , xmm2 );
3147  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3148  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3149  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
3150  (~C).store( i+1UL, j , xmm6 );
3151  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
3152  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3153  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3154  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
3155  }
3156 
3157  if( i < M )
3158  {
3159  const size_t kbegin( ( IsUpper<MT4>::value )
3160  ?( ( IsLower<MT5>::value )
3161  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3162  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3163  :( IsLower<MT5>::value ? j : 0UL ) );
3164  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
3165 
3166  SIMDType xmm1( (~C).load(i,j ) );
3167  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3168  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3169  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3170  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3171 
3172  for( size_t k=kbegin; k<kend; ++k ) {
3173  const SIMDType a1( set( A(i,k) ) );
3174  xmm1 += a1 * B.load(k,j );
3175  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3176  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3177  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3178  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3179  }
3180 
3181  (~C).store( i, j , xmm1 );
3182  (~C).store( i, j+SIMDSIZE , xmm2 );
3183  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3184  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3185  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
3186  }
3187  }
3188 
3189  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3190  {
3191  size_t i( 0UL );
3192 
3193  for( ; (i+2UL) <= M; i+=2UL )
3194  {
3195  const size_t kbegin( ( IsUpper<MT4>::value )
3196  ?( ( IsLower<MT5>::value )
3197  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3198  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3199  :( IsLower<MT5>::value ? j : 0UL ) );
3200  const size_t kend( ( IsLower<MT4>::value )
3201  ?( ( IsUpper<MT5>::value )
3202  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3203  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3204  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
3205 
3206  SIMDType xmm1( (~C).load(i ,j ) );
3207  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3208  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3209  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
3210  SIMDType xmm5( (~C).load(i+1UL,j ) );
3211  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
3212  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3213  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3214 
3215  for( size_t k=kbegin; k<kend; ++k ) {
3216  const SIMDType a1( set( A(i ,k) ) );
3217  const SIMDType a2( set( A(i+1UL,k) ) );
3218  const SIMDType b1( B.load(k,j ) );
3219  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3220  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3221  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3222  xmm1 += a1 * b1;
3223  xmm2 += a1 * b2;
3224  xmm3 += a1 * b3;
3225  xmm4 += a1 * b4;
3226  xmm5 += a2 * b1;
3227  xmm6 += a2 * b2;
3228  xmm7 += a2 * b3;
3229  xmm8 += a2 * b4;
3230  }
3231 
3232  (~C).store( i , j , xmm1 );
3233  (~C).store( i , j+SIMDSIZE , xmm2 );
3234  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3235  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3236  (~C).store( i+1UL, j , xmm5 );
3237  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
3238  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3239  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3240  }
3241 
3242  if( i < M )
3243  {
3244  const size_t kbegin( ( IsUpper<MT4>::value )
3245  ?( ( IsLower<MT5>::value )
3246  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3247  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3248  :( IsLower<MT5>::value ? j : 0UL ) );
3249  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
3250 
3251  SIMDType xmm1( (~C).load(i,j ) );
3252  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3253  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3254  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3255 
3256  for( size_t k=kbegin; k<kend; ++k ) {
3257  const SIMDType a1( set( A(i,k) ) );
3258  xmm1 += a1 * B.load(k,j );
3259  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3260  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3261  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3262  }
3263 
3264  (~C).store( i, j , xmm1 );
3265  (~C).store( i, j+SIMDSIZE , xmm2 );
3266  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3267  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3268  }
3269  }
3270 
3271  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3272  {
3273  size_t i( 0UL );
3274 
3275  for( ; (i+2UL) <= M; i+=2UL )
3276  {
3277  const size_t kbegin( ( IsUpper<MT4>::value )
3278  ?( ( IsLower<MT5>::value )
3279  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3280  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3281  :( IsLower<MT5>::value ? j : 0UL ) );
3282  const size_t kend( ( IsLower<MT4>::value )
3283  ?( ( IsUpper<MT5>::value )
3284  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
3285  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3286  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
3287 
3288  SIMDType xmm1( (~C).load(i ,j ) );
3289  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3290  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3291  SIMDType xmm4( (~C).load(i+1UL,j ) );
3292  SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
3293  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3294 
3295  for( size_t k=kbegin; k<kend; ++k ) {
3296  const SIMDType a1( set( A(i ,k) ) );
3297  const SIMDType a2( set( A(i+1UL,k) ) );
3298  const SIMDType b1( B.load(k,j ) );
3299  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3300  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3301  xmm1 += a1 * b1;
3302  xmm2 += a1 * b2;
3303  xmm3 += a1 * b3;
3304  xmm4 += a2 * b1;
3305  xmm5 += a2 * b2;
3306  xmm6 += a2 * b3;
3307  }
3308 
3309  (~C).store( i , j , xmm1 );
3310  (~C).store( i , j+SIMDSIZE , xmm2 );
3311  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3312  (~C).store( i+1UL, j , xmm4 );
3313  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
3314  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
3315  }
3316 
3317  if( i < M )
3318  {
3319  const size_t kbegin( ( IsUpper<MT4>::value )
3320  ?( ( IsLower<MT5>::value )
3321  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3322  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3323  :( IsLower<MT5>::value ? j : 0UL ) );
3324  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
3325 
3326  SIMDType xmm1( (~C).load(i,j ) );
3327  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3328  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3329 
3330  for( size_t k=kbegin; k<kend; ++k ) {
3331  const SIMDType a1( set( A(i,k) ) );
3332  xmm1 += a1 * B.load(k,j );
3333  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3334  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3335  }
3336 
3337  (~C).store( i, j , xmm1 );
3338  (~C).store( i, j+SIMDSIZE , xmm2 );
3339  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3340  }
3341  }
3342 
3343  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3344  {
3345  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
3346  size_t i( LOW ? j : 0UL );
3347 
3348  for( ; (i+4UL) <= iend; i+=4UL )
3349  {
3350  const size_t kbegin( ( IsUpper<MT4>::value )
3351  ?( ( IsLower<MT5>::value )
3352  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3353  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3354  :( IsLower<MT5>::value ? j : 0UL ) );
3355  const size_t kend( ( IsLower<MT4>::value )
3356  ?( ( IsUpper<MT5>::value )
3357  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
3358  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
3359  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
3360 
3361  SIMDType xmm1( (~C).load(i ,j ) );
3362  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3363  SIMDType xmm3( (~C).load(i+1UL,j ) );
3364  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3365  SIMDType xmm5( (~C).load(i+2UL,j ) );
3366  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3367  SIMDType xmm7( (~C).load(i+3UL,j ) );
3368  SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
3369 
3370  for( size_t k=kbegin; k<kend; ++k ) {
3371  const SIMDType a1( set( A(i ,k) ) );
3372  const SIMDType a2( set( A(i+1UL,k) ) );
3373  const SIMDType a3( set( A(i+2UL,k) ) );
3374  const SIMDType a4( set( A(i+3UL,k) ) );
3375  const SIMDType b1( B.load(k,j ) );
3376  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3377  xmm1 += a1 * b1;
3378  xmm2 += a1 * b2;
3379  xmm3 += a2 * b1;
3380  xmm4 += a2 * b2;
3381  xmm5 += a3 * b1;
3382  xmm6 += a3 * b2;
3383  xmm7 += a4 * b1;
3384  xmm8 += a4 * b2;
3385  }
3386 
3387  (~C).store( i , j , xmm1 );
3388  (~C).store( i , j+SIMDSIZE, xmm2 );
3389  (~C).store( i+1UL, j , xmm3 );
3390  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3391  (~C).store( i+2UL, j , xmm5 );
3392  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3393  (~C).store( i+3UL, j , xmm7 );
3394  (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
3395  }
3396 
3397  for( ; (i+3UL) <= iend; i+=3UL )
3398  {
3399  const size_t kbegin( ( IsUpper<MT4>::value )
3400  ?( ( IsLower<MT5>::value )
3401  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3402  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3403  :( IsLower<MT5>::value ? j : 0UL ) );
3404  const size_t kend( ( IsLower<MT4>::value )
3405  ?( ( IsUpper<MT5>::value )
3406  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
3407  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
3408  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
3409 
3410  SIMDType xmm1( (~C).load(i ,j ) );
3411  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3412  SIMDType xmm3( (~C).load(i+1UL,j ) );
3413  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3414  SIMDType xmm5( (~C).load(i+2UL,j ) );
3415  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3416 
3417  for( size_t k=kbegin; k<kend; ++k ) {
3418  const SIMDType a1( set( A(i ,k) ) );
3419  const SIMDType a2( set( A(i+1UL,k) ) );
3420  const SIMDType a3( set( A(i+2UL,k) ) );
3421  const SIMDType b1( B.load(k,j ) );
3422  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3423  xmm1 += a1 * b1;
3424  xmm2 += a1 * b2;
3425  xmm3 += a2 * b1;
3426  xmm4 += a2 * b2;
3427  xmm5 += a3 * b1;
3428  xmm6 += a3 * b2;
3429  }
3430 
3431  (~C).store( i , j , xmm1 );
3432  (~C).store( i , j+SIMDSIZE, xmm2 );
3433  (~C).store( i+1UL, j , xmm3 );
3434  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3435  (~C).store( i+2UL, j , xmm5 );
3436  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3437  }
3438 
3439  for( ; (i+2UL) <= iend; i+=2UL )
3440  {
3441  const size_t kbegin( ( IsUpper<MT4>::value )
3442  ?( ( IsLower<MT5>::value )
3443  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3444  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3445  :( IsLower<MT5>::value ? j : 0UL ) );
3446  const size_t kend( ( IsLower<MT4>::value )
3447  ?( ( IsUpper<MT5>::value )
3448  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
3449  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3450  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
3451 
3452  SIMDType xmm1( (~C).load(i ,j ) );
3453  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3454  SIMDType xmm3( (~C).load(i+1UL,j ) );
3455  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3456  SIMDType xmm5, xmm6, xmm7, xmm8;
3457  size_t k( kbegin );
3458 
3459  for( ; (k+2UL) <= kend; k+=2UL ) {
3460  const SIMDType a1( set( A(i ,k ) ) );
3461  const SIMDType a2( set( A(i+1UL,k ) ) );
3462  const SIMDType a3( set( A(i ,k+1UL) ) );
3463  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
3464  const SIMDType b1( B.load(k ,j ) );
3465  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
3466  const SIMDType b3( B.load(k+1UL,j ) );
3467  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
3468  xmm1 += a1 * b1;
3469  xmm2 += a1 * b2;
3470  xmm3 += a2 * b1;
3471  xmm4 += a2 * b2;
3472  xmm5 += a3 * b3;
3473  xmm6 += a3 * b4;
3474  xmm7 += a4 * b3;
3475  xmm8 += a4 * b4;
3476  }
3477 
3478  for( ; k<kend; ++k ) {
3479  const SIMDType a1( set( A(i ,k) ) );
3480  const SIMDType a2( set( A(i+1UL,k) ) );
3481  const SIMDType b1( B.load(k,j ) );
3482  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3483  xmm1 += a1 * b1;
3484  xmm2 += a1 * b2;
3485  xmm3 += a2 * b1;
3486  xmm4 += a2 * b2;
3487  }
3488 
3489  (~C).store( i , j , xmm1+xmm5 );
3490  (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
3491  (~C).store( i+1UL, j , xmm3+xmm7 );
3492  (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
3493  }
3494 
3495  if( i < iend )
3496  {
3497  const size_t kbegin( ( IsUpper<MT4>::value )
3498  ?( ( IsLower<MT5>::value )
3499  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3500  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3501  :( IsLower<MT5>::value ? j : 0UL ) );
3502  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
3503 
3504  SIMDType xmm1( (~C).load(i,j ) );
3505  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3506  SIMDType xmm3, xmm4;
3507  size_t k( kbegin );
3508 
3509  for( ; (k+2UL) <= kend; k+=2UL ) {
3510  const SIMDType a1( set( A(i,k ) ) );
3511  const SIMDType a2( set( A(i,k+1UL) ) );
3512  xmm1 += a1 * B.load(k ,j );
3513  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
3514  xmm3 += a2 * B.load(k+1UL,j );
3515  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
3516  }
3517 
3518  for( ; k<kend; ++k ) {
3519  const SIMDType a1( set( A(i,k) ) );
3520  xmm1 += a1 * B.load(k,j );
3521  xmm2 += a1 * B.load(k,j+SIMDSIZE);
3522  }
3523 
3524  (~C).store( i, j , xmm1+xmm3 );
3525  (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
3526  }
3527  }
3528 
3529  for( ; j<jpos; j+=SIMDSIZE )
3530  {
3531  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
3532  size_t i( LOW ? j : 0UL );
3533 
3534  for( ; (i+4UL) <= iend; i+=4UL )
3535  {
3536  const size_t kbegin( ( IsUpper<MT4>::value )
3537  ?( ( IsLower<MT5>::value )
3538  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3539  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3540  :( IsLower<MT5>::value ? j : 0UL ) );
3541  const size_t kend( ( IsLower<MT4>::value )
3542  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
3543  :( K ) );
3544 
3545  SIMDType xmm1( (~C).load(i ,j) );
3546  SIMDType xmm2( (~C).load(i+1UL,j) );
3547  SIMDType xmm3( (~C).load(i+2UL,j) );
3548  SIMDType xmm4( (~C).load(i+3UL,j) );
3549  SIMDType xmm5, xmm6, xmm7, xmm8;
3550  size_t k( kbegin );
3551 
3552  for( ; (k+2UL) <= kend; k+=2UL ) {
3553  const SIMDType b1( B.load(k ,j) );
3554  const SIMDType b2( B.load(k+1UL,j) );
3555  xmm1 += set( A(i ,k ) ) * b1;
3556  xmm2 += set( A(i+1UL,k ) ) * b1;
3557  xmm3 += set( A(i+2UL,k ) ) * b1;
3558  xmm4 += set( A(i+3UL,k ) ) * b1;
3559  xmm5 += set( A(i ,k+1UL) ) * b2;
3560  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
3561  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
3562  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
3563  }
3564 
3565  for( ; k<kend; ++k ) {
3566  const SIMDType b1( B.load(k,j) );
3567  xmm1 += set( A(i ,k) ) * b1;
3568  xmm2 += set( A(i+1UL,k) ) * b1;
3569  xmm3 += set( A(i+2UL,k) ) * b1;
3570  xmm4 += set( A(i+3UL,k) ) * b1;
3571  }
3572 
3573  (~C).store( i , j, xmm1+xmm5 );
3574  (~C).store( i+1UL, j, xmm2+xmm6 );
3575  (~C).store( i+2UL, j, xmm3+xmm7 );
3576  (~C).store( i+3UL, j, xmm4+xmm8 );
3577  }
3578 
3579  for( ; (i+3UL) <= iend; i+=3UL )
3580  {
3581  const size_t kbegin( ( IsUpper<MT4>::value )
3582  ?( ( IsLower<MT5>::value )
3583  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3584  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3585  :( IsLower<MT5>::value ? j : 0UL ) );
3586  const size_t kend( ( IsLower<MT4>::value )
3587  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
3588  :( K ) );
3589 
3590  SIMDType xmm1( (~C).load(i ,j) );
3591  SIMDType xmm2( (~C).load(i+1UL,j) );
3592  SIMDType xmm3( (~C).load(i+2UL,j) );
3593  SIMDType xmm4, xmm5, xmm6;
3594  size_t k( kbegin );
3595 
3596  for( ; (k+2UL) <= kend; k+=2UL ) {
3597  const SIMDType b1( B.load(k ,j) );
3598  const SIMDType b2( B.load(k+1UL,j) );
3599  xmm1 += set( A(i ,k ) ) * b1;
3600  xmm2 += set( A(i+1UL,k ) ) * b1;
3601  xmm3 += set( A(i+2UL,k ) ) * b1;
3602  xmm4 += set( A(i ,k+1UL) ) * b2;
3603  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
3604  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
3605  }
3606 
3607  for( ; k<kend; ++k ) {
3608  const SIMDType b1( B.load(k,j) );
3609  xmm1 += set( A(i ,k) ) * b1;
3610  xmm2 += set( A(i+1UL,k) ) * b1;
3611  xmm3 += set( A(i+2UL,k) ) * b1;
3612  }
3613 
3614  (~C).store( i , j, xmm1+xmm4 );
3615  (~C).store( i+1UL, j, xmm2+xmm5 );
3616  (~C).store( i+2UL, j, xmm3+xmm6 );
3617  }
3618 
3619  for( ; (i+2UL) <= iend; i+=2UL )
3620  {
3621  const size_t kbegin( ( IsUpper<MT4>::value )
3622  ?( ( IsLower<MT5>::value )
3623  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3624  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3625  :( IsLower<MT5>::value ? j : 0UL ) );
3626  const size_t kend( ( IsLower<MT4>::value )
3627  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3628  :( K ) );
3629 
3630  SIMDType xmm1( (~C).load(i ,j) );
3631  SIMDType xmm2( (~C).load(i+1UL,j) );
3632  SIMDType xmm3, xmm4;
3633  size_t k( kbegin );
3634 
3635  for( ; (k+2UL) <= kend; k+=2UL ) {
3636  const SIMDType b1( B.load(k ,j) );
3637  const SIMDType b2( B.load(k+1UL,j) );
3638  xmm1 += set( A(i ,k ) ) * b1;
3639  xmm2 += set( A(i+1UL,k ) ) * b1;
3640  xmm3 += set( A(i ,k+1UL) ) * b2;
3641  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
3642  }
3643 
3644  for( ; k<kend; ++k ) {
3645  const SIMDType b1( B.load(k,j) );
3646  xmm1 += set( A(i ,k) ) * b1;
3647  xmm2 += set( A(i+1UL,k) ) * b1;
3648  }
3649 
3650  (~C).store( i , j, xmm1+xmm3 );
3651  (~C).store( i+1UL, j, xmm2+xmm4 );
3652  }
3653 
3654  if( i < iend )
3655  {
3656  const size_t kbegin( ( IsUpper<MT4>::value )
3657  ?( ( IsLower<MT5>::value )
3658  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3659  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3660  :( IsLower<MT5>::value ? j : 0UL ) );
3661 
3662  SIMDType xmm1( (~C).load(i,j) );
3663  SIMDType xmm2;
3664  size_t k( kbegin );
3665 
3666  for( ; (k+2UL) <= K; k+=2UL ) {
3667  xmm1 += set( A(i,k ) ) * B.load(k ,j);
3668  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
3669  }
3670 
3671  for( ; k<K; ++k ) {
3672  xmm1 += set( A(i,k) ) * B.load(k,j);
3673  }
3674 
3675  (~C).store( i, j, xmm1+xmm2 );
3676  }
3677  }
3678 
3679  for( ; remainder && j<N; ++j )
3680  {
3681  const size_t iend( UPP ? j+1UL : M );
3682  size_t i( LOW ? j : 0UL );
3683 
3684  for( ; (i+2UL) <= iend; i+=2UL )
3685  {
3686  const size_t kbegin( ( IsUpper<MT4>::value )
3687  ?( ( IsLower<MT5>::value )
3688  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3689  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3690  :( IsLower<MT5>::value ? j : 0UL ) );
3691  const size_t kend( ( IsLower<MT4>::value )
3692  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3693  :( K ) );
3694 
3695  ElementType value1( (~C)(i ,j) );
3696  ElementType value2( (~C)(i+1UL,j) );;
3697 
3698  for( size_t k=kbegin; k<kend; ++k ) {
3699  value1 += A(i ,k) * B(k,j);
3700  value2 += A(i+1UL,k) * B(k,j);
3701  }
3702 
3703  (~C)(i ,j) = value1;
3704  (~C)(i+1UL,j) = value2;
3705  }
3706 
3707  if( i < iend )
3708  {
3709  const size_t kbegin( ( IsUpper<MT4>::value )
3710  ?( ( IsLower<MT5>::value )
3711  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3712  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3713  :( IsLower<MT5>::value ? j : 0UL ) );
3714 
3715  ElementType value( (~C)(i,j) );
3716 
3717  for( size_t k=kbegin; k<K; ++k ) {
3718  value += A(i,k) * B(k,j);
3719  }
3720 
3721  (~C)(i,j) = value;
3722  }
3723  }
3724  }
3726  //**********************************************************************************************
3727 
3728  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
3743  template< typename MT3 // Type of the left-hand side target matrix
3744  , typename MT4 // Type of the left-hand side matrix operand
3745  , typename MT5 > // Type of the right-hand side matrix operand
3747  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3748  {
3749  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
3750 
3751  const size_t M( A.rows() );
3752  const size_t N( B.columns() );
3753  const size_t K( A.columns() );
3754 
3755  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3756 
3757  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3758  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3759 
3760  size_t i( 0UL );
3761 
3763  {
3764  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3765  for( size_t j=0UL; j<N; ++j )
3766  {
3767  const size_t kbegin( ( IsLower<MT5>::value )
3768  ?( ( IsUpper<MT4>::value )
3769  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3770  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3771  :( IsUpper<MT4>::value ? i : 0UL ) );
3772  const size_t kend( ( IsUpper<MT5>::value )
3773  ?( ( IsLower<MT4>::value )
3774  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3775  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3776  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
3777 
3778  SIMDType xmm1( (~C).load(i ,j) );
3779  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3780  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3781  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3782  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3783  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3784  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3785  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3786 
3787  for( size_t k=kbegin; k<kend; ++k ) {
3788  const SIMDType b1( set( B(k,j) ) );
3789  xmm1 += A.load(i ,k) * b1;
3790  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3791  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3792  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3793  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3794  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
3795  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
3796  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
3797  }
3798 
3799  (~C).store( i , j, xmm1 );
3800  (~C).store( i+SIMDSIZE , j, xmm2 );
3801  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3802  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3803  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3804  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3805  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3806  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
3807  }
3808  }
3809  }
3810 
3811  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3812  {
3813  size_t j( 0UL );
3814 
3815  for( ; (j+2UL) <= N; j+=2UL )
3816  {
3817  const size_t kbegin( ( IsLower<MT5>::value )
3818  ?( ( IsUpper<MT4>::value )
3819  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3820  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3821  :( IsUpper<MT4>::value ? i : 0UL ) );
3822  const size_t kend( ( IsUpper<MT5>::value )
3823  ?( ( IsLower<MT4>::value )
3824  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3825  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3826  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
3827 
3828  SIMDType xmm1 ( (~C).load(i ,j ) );
3829  SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3830  SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3831  SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3832  SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3833  SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3834  SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3835  SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3836  SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3837  SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
3838 
3839  for( size_t k=kbegin; k<kend; ++k ) {
3840  const SIMDType a1( A.load(i ,k) );
3841  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3842  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3843  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3844  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3845  const SIMDType b1( set( B(k,j ) ) );
3846  const SIMDType b2( set( B(k,j+1UL) ) );
3847  xmm1 += a1 * b1;
3848  xmm2 += a2 * b1;
3849  xmm3 += a3 * b1;
3850  xmm4 += a4 * b1;
3851  xmm5 += a5 * b1;
3852  xmm6 += a1 * b2;
3853  xmm7 += a2 * b2;
3854  xmm8 += a3 * b2;
3855  xmm9 += a4 * b2;
3856  xmm10 += a5 * b2;
3857  }
3858 
3859  (~C).store( i , j , xmm1 );
3860  (~C).store( i+SIMDSIZE , j , xmm2 );
3861  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3862  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3863  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3864  (~C).store( i , j+1UL, xmm6 );
3865  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3866  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3867  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3868  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
3869  }
3870 
3871  if( j < N )
3872  {
3873  const size_t kbegin( ( IsLower<MT5>::value )
3874  ?( ( IsUpper<MT4>::value )
3875  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3876  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3877  :( IsUpper<MT4>::value ? i : 0UL ) );
3878  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
3879 
3880  SIMDType xmm1( (~C).load(i ,j) );
3881  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3882  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3883  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3884  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3885 
3886  for( size_t k=kbegin; k<kend; ++k ) {
3887  const SIMDType b1( set( B(k,j) ) );
3888  xmm1 += A.load(i ,k) * b1;
3889  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3890  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3891  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3892  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3893  }
3894 
3895  (~C).store( i , j, xmm1 );
3896  (~C).store( i+SIMDSIZE , j, xmm2 );
3897  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3898  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3899  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3900  }
3901  }
3902 
3903  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3904  {
3905  size_t j( 0UL );
3906 
3907  for( ; (j+2UL) <= N; j+=2UL )
3908  {
3909  const size_t kbegin( ( IsLower<MT5>::value )
3910  ?( ( IsUpper<MT4>::value )
3911  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3912  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3913  :( IsUpper<MT4>::value ? i : 0UL ) );
3914  const size_t kend( ( IsUpper<MT5>::value )
3915  ?( ( IsLower<MT4>::value )
3916  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3917  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3918  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
3919 
3920  SIMDType xmm1( (~C).load(i ,j ) );
3921  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3922  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3923  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3924  SIMDType xmm5( (~C).load(i ,j+1UL) );
3925  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3926  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3927  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3928 
3929  for( size_t k=kbegin; k<kend; ++k ) {
3930  const SIMDType a1( A.load(i ,k) );
3931  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3932  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3933  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3934  const SIMDType b1( set( B(k,j ) ) );
3935  const SIMDType b2( set( B(k,j+1UL) ) );
3936  xmm1 += a1 * b1;
3937  xmm2 += a2 * b1;
3938  xmm3 += a3 * b1;
3939  xmm4 += a4 * b1;
3940  xmm5 += a1 * b2;
3941  xmm6 += a2 * b2;
3942  xmm7 += a3 * b2;
3943  xmm8 += a4 * b2;
3944  }
3945 
3946  (~C).store( i , j , xmm1 );
3947  (~C).store( i+SIMDSIZE , j , xmm2 );
3948  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3949  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3950  (~C).store( i , j+1UL, xmm5 );
3951  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3952  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3953  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3954  }
3955 
3956  if( j < N )
3957  {
3958  const size_t kbegin( ( IsLower<MT5>::value )
3959  ?( ( IsUpper<MT4>::value )
3960  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3961  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3962  :( IsUpper<MT4>::value ? i : 0UL ) );
3963  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3964 
3965  SIMDType xmm1( (~C).load(i ,j) );
3966  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3967  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3968  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3969 
3970  for( size_t k=kbegin; k<kend; ++k ) {
3971  const SIMDType b1( set( B(k,j) ) );
3972  xmm1 += A.load(i ,k) * b1;
3973  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3974  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3975  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3976  }
3977 
3978  (~C).store( i , j, xmm1 );
3979  (~C).store( i+SIMDSIZE , j, xmm2 );
3980  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3981  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3982  }
3983  }
3984 
3985  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3986  {
3987  size_t j( 0UL );
3988 
3989  for( ; (j+2UL) <= N; j+=2UL )
3990  {
3991  const size_t kbegin( ( IsLower<MT5>::value )
3992  ?( ( IsUpper<MT4>::value )
3993  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3994  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3995  :( IsUpper<MT4>::value ? i : 0UL ) );
3996  const size_t kend( ( IsUpper<MT5>::value )
3997  ?( ( IsLower<MT4>::value )
3998  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3999  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4000  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
4001 
4002  SIMDType xmm1( (~C).load(i ,j ) );
4003  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
4004  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
4005  SIMDType xmm4( (~C).load(i ,j+1UL) );
4006  SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
4007  SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4008 
4009  for( size_t k=kbegin; k<kend; ++k ) {
4010  const SIMDType a1( A.load(i ,k) );
4011  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4012  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4013  const SIMDType b1( set( B(k,j ) ) );
4014  const SIMDType b2( set( B(k,j+1UL) ) );
4015  xmm1 += a1 * b1;
4016  xmm2 += a2 * b1;
4017  xmm3 += a3 * b1;
4018  xmm4 += a1 * b2;
4019  xmm5 += a2 * b2;
4020  xmm6 += a3 * b2;
4021  }
4022 
4023  (~C).store( i , j , xmm1 );
4024  (~C).store( i+SIMDSIZE , j , xmm2 );
4025  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4026  (~C).store( i , j+1UL, xmm4 );
4027  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
4028  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
4029  }
4030 
4031  if( j < N )
4032  {
4033  const size_t kbegin( ( IsLower<MT5>::value )
4034  ?( ( IsUpper<MT4>::value )
4035  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4036  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4037  :( IsUpper<MT4>::value ? i : 0UL ) );
4038  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
4039 
4040  SIMDType xmm1( (~C).load(i ,j) );
4041  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4042  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4043 
4044  for( size_t k=kbegin; k<kend; ++k ) {
4045  const SIMDType b1( set( B(k,j) ) );
4046  xmm1 += A.load(i ,k) * b1;
4047  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4048  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4049  }
4050 
4051  (~C).store( i , j, xmm1 );
4052  (~C).store( i+SIMDSIZE , j, xmm2 );
4053  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4054  }
4055  }
4056 
4057  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4058  {
4059  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
4060  size_t j( UPP ? i : 0UL );
4061 
4062  for( ; (j+4UL) <= jend; j+=4UL )
4063  {
4064  const size_t kbegin( ( IsLower<MT5>::value )
4065  ?( ( IsUpper<MT4>::value )
4066  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4067  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4068  :( IsUpper<MT4>::value ? i : 0UL ) );
4069  const size_t kend( ( IsUpper<MT5>::value )
4070  ?( ( IsLower<MT4>::value )
4071  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
4072  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
4073  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
4074 
4075  SIMDType xmm1( (~C).load(i ,j ) );
4076  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4077  SIMDType xmm3( (~C).load(i ,j+1UL) );
4078  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4079  SIMDType xmm5( (~C).load(i ,j+2UL) );
4080  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
4081  SIMDType xmm7( (~C).load(i ,j+3UL) );
4082  SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
4083 
4084  for( size_t k=kbegin; k<kend; ++k ) {
4085  const SIMDType a1( A.load(i ,k) );
4086  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4087  const SIMDType b1( set( B(k,j ) ) );
4088  const SIMDType b2( set( B(k,j+1UL) ) );
4089  const SIMDType b3( set( B(k,j+2UL) ) );
4090  const SIMDType b4( set( B(k,j+3UL) ) );
4091  xmm1 += a1 * b1;
4092  xmm2 += a2 * b1;
4093  xmm3 += a1 * b2;
4094  xmm4 += a2 * b2;
4095  xmm5 += a1 * b3;
4096  xmm6 += a2 * b3;
4097  xmm7 += a1 * b4;
4098  xmm8 += a2 * b4;
4099  }
4100 
4101  (~C).store( i , j , xmm1 );
4102  (~C).store( i+SIMDSIZE, j , xmm2 );
4103  (~C).store( i , j+1UL, xmm3 );
4104  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
4105  (~C).store( i , j+2UL, xmm5 );
4106  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
4107  (~C).store( i , j+3UL, xmm7 );
4108  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
4109  }
4110 
4111  for( ; (j+3UL) <= jend; j+=3UL )
4112  {
4113  const size_t kbegin( ( IsLower<MT5>::value )
4114  ?( ( IsUpper<MT4>::value )
4115  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4116  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4117  :( IsUpper<MT4>::value ? i : 0UL ) );
4118  const size_t kend( ( IsUpper<MT5>::value )
4119  ?( ( IsLower<MT4>::value )
4120  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
4121  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
4122  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
4123 
4124  SIMDType xmm1( (~C).load(i ,j ) );
4125  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4126  SIMDType xmm3( (~C).load(i ,j+1UL) );
4127  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4128  SIMDType xmm5( (~C).load(i ,j+2UL) );
4129  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
4130 
4131  for( size_t k=kbegin; k<kend; ++k ) {
4132  const SIMDType a1( A.load(i ,k) );
4133  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4134  const SIMDType b1( set( B(k,j ) ) );
4135  const SIMDType b2( set( B(k,j+1UL) ) );
4136  const SIMDType b3( set( B(k,j+2UL) ) );
4137  xmm1 += a1 * b1;
4138  xmm2 += a2 * b1;
4139  xmm3 += a1 * b2;
4140  xmm4 += a2 * b2;
4141  xmm5 += a1 * b3;
4142  xmm6 += a2 * b3;
4143  }
4144 
4145  (~C).store( i , j , xmm1 );
4146  (~C).store( i+SIMDSIZE, j , xmm2 );
4147  (~C).store( i , j+1UL, xmm3 );
4148  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
4149  (~C).store( i , j+2UL, xmm5 );
4150  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
4151  }
4152 
4153  for( ; (j+2UL) <= jend; j+=2UL )
4154  {
4155  const size_t kbegin( ( IsLower<MT5>::value )
4156  ?( ( IsUpper<MT4>::value )
4157  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4158  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4159  :( IsUpper<MT4>::value ? i : 0UL ) );
4160  const size_t kend( ( IsUpper<MT5>::value )
4161  ?( ( IsLower<MT4>::value )
4162  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4163  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4164  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
4165 
4166  SIMDType xmm1( (~C).load(i ,j ) );
4167  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4168  SIMDType xmm3( (~C).load(i ,j+1UL) );
4169  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4170  SIMDType xmm5, xmm6, xmm7, xmm8;
4171  size_t k( kbegin );
4172 
4173  for( ; (k+2UL) < kend; k+=2UL ) {
4174  const SIMDType a1( A.load(i ,k ) );
4175  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
4176  const SIMDType a3( A.load(i ,k+1UL) );
4177  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
4178  const SIMDType b1( set( B(k ,j ) ) );
4179  const SIMDType b2( set( B(k ,j+1UL) ) );
4180  const SIMDType b3( set( B(k+1UL,j ) ) );
4181  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
4182  xmm1 += a1 * b1;
4183  xmm2 += a2 * b1;
4184  xmm3 += a1 * b2;
4185  xmm4 += a2 * b2;
4186  xmm5 += a3 * b3;
4187  xmm6 += a4 * b3;
4188  xmm7 += a3 * b4;
4189  xmm8 += a4 * b4;
4190  }
4191 
4192  for( ; k<kend; ++k ) {
4193  const SIMDType a1( A.load(i ,k) );
4194  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4195  const SIMDType b1( set( B(k,j ) ) );
4196  const SIMDType b2( set( B(k,j+1UL) ) );
4197  xmm1 += a1 * b1;
4198  xmm2 += a2 * b1;
4199  xmm3 += a1 * b2;
4200  xmm4 += a2 * b2;
4201  }
4202 
4203  (~C).store( i , j , xmm1+xmm5 );
4204  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
4205  (~C).store( i , j+1UL, xmm3+xmm7 );
4206  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
4207  }
4208 
4209  if( j < jend )
4210  {
4211  const size_t kbegin( ( IsLower<MT5>::value )
4212  ?( ( IsUpper<MT4>::value )
4213  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4214  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4215  :( IsUpper<MT4>::value ? i : 0UL ) );
4216  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
4217 
4218  SIMDType xmm1( (~C).load(i ,j) );
4219  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
4220  SIMDType xmm3, xmm4;
4221  size_t k( kbegin );
4222 
4223  for( ; (k+2UL) <= kend; k+=2UL ) {
4224  const SIMDType b1( set( B(k ,j) ) );
4225  const SIMDType b2( set( B(k+1UL,j) ) );
4226  xmm1 += A.load(i ,k ) * b1;
4227  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
4228  xmm3 += A.load(i ,k+1UL) * b2;
4229  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
4230  }
4231 
4232  for( ; k<kend; ++k ) {
4233  const SIMDType b1( set( B(k,j) ) );
4234  xmm1 += A.load(i ,k) * b1;
4235  xmm2 += A.load(i+SIMDSIZE,k) * b1;
4236  }
4237 
4238  (~C).store( i , j, xmm1+xmm3 );
4239  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
4240  }
4241  }
4242 
4243  for( ; i<ipos; i+=SIMDSIZE )
4244  {
4245  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
4246  size_t j( UPP ? i : 0UL );
4247 
4248  for( ; (j+4UL) <= jend; j+=4UL )
4249  {
4250  const size_t kbegin( ( IsLower<MT5>::value )
4251  ?( ( IsUpper<MT4>::value )
4252  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4253  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4254  :( IsUpper<MT4>::value ? i : 0UL ) );
4255  const size_t kend( ( IsUpper<MT5>::value )
4256  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
4257  :( K ) );
4258 
4259  SIMDType xmm1( (~C).load(i,j ) );
4260  SIMDType xmm2( (~C).load(i,j+1UL) );
4261  SIMDType xmm3( (~C).load(i,j+2UL) );
4262  SIMDType xmm4( (~C).load(i,j+3UL) );
4263  SIMDType xmm5, xmm6, xmm7, xmm8;
4264  size_t k( kbegin );
4265 
4266  for( ; (k+2UL) <= kend; k+=2UL ) {
4267  const SIMDType a1( A.load(i,k ) );
4268  const SIMDType a2( A.load(i,k+1UL) );
4269  xmm1 += a1 * set( B(k ,j ) );
4270  xmm2 += a1 * set( B(k ,j+1UL) );
4271  xmm3 += a1 * set( B(k ,j+2UL) );
4272  xmm4 += a1 * set( B(k ,j+3UL) );
4273  xmm5 += a2 * set( B(k+1UL,j ) );
4274  xmm6 += a2 * set( B(k+1UL,j+1UL) );
4275  xmm7 += a2 * set( B(k+1UL,j+2UL) );
4276  xmm8 += a2 * set( B(k+1UL,j+3UL) );
4277  }
4278 
4279  for( ; k<kend; ++k ) {
4280  const SIMDType a1( A.load(i,k) );
4281  xmm1 += a1 * set( B(k,j ) );
4282  xmm2 += a1 * set( B(k,j+1UL) );
4283  xmm3 += a1 * set( B(k,j+2UL) );
4284  xmm4 += a1 * set( B(k,j+3UL) );
4285  }
4286 
4287  (~C).store( i, j , xmm1+xmm5 );
4288  (~C).store( i, j+1UL, xmm2+xmm6 );
4289  (~C).store( i, j+2UL, xmm3+xmm7 );
4290  (~C).store( i, j+3UL, xmm4+xmm8 );
4291  }
4292 
4293  for( ; (j+3UL) <= jend; j+=3UL )
4294  {
4295  const size_t kbegin( ( IsLower<MT5>::value )
4296  ?( ( IsUpper<MT4>::value )
4297  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4298  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4299  :( IsUpper<MT4>::value ? i : 0UL ) );
4300  const size_t kend( ( IsUpper<MT5>::value )
4301  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
4302  :( K ) );
4303 
4304  SIMDType xmm1( (~C).load(i,j ) );
4305  SIMDType xmm2( (~C).load(i,j+1UL) );
4306  SIMDType xmm3( (~C).load(i,j+2UL) );
4307  SIMDType xmm4, xmm5, xmm6;
4308  size_t k( kbegin );
4309 
4310  for( ; (k+2UL) <= kend; k+=2UL ) {
4311  const SIMDType a1( A.load(i,k ) );
4312  const SIMDType a2( A.load(i,k+1UL) );
4313  xmm1 += a1 * set( B(k ,j ) );
4314  xmm2 += a1 * set( B(k ,j+1UL) );
4315  xmm3 += a1 * set( B(k ,j+2UL) );
4316  xmm4 += a2 * set( B(k+1UL,j ) );
4317  xmm5 += a2 * set( B(k+1UL,j+1UL) );
4318  xmm6 += a2 * set( B(k+1UL,j+2UL) );
4319  }
4320 
4321  for( ; k<kend; ++k ) {
4322  const SIMDType a1( A.load(i,k) );
4323  xmm1 += a1 * set( B(k,j ) );
4324  xmm2 += a1 * set( B(k,j+1UL) );
4325  xmm3 += a1 * set( B(k,j+2UL) );
4326  }
4327 
4328  (~C).store( i, j , xmm1+xmm4 );
4329  (~C).store( i, j+1UL, xmm2+xmm5 );
4330  (~C).store( i, j+2UL, xmm3+xmm6 );
4331  }
4332 
4333  for( ; (j+2UL) <= jend; j+=2UL )
4334  {
4335  const size_t kbegin( ( IsLower<MT5>::value )
4336  ?( ( IsUpper<MT4>::value )
4337  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4338  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4339  :( IsUpper<MT4>::value ? i : 0UL ) );
4340  const size_t kend( ( IsUpper<MT5>::value )
4341  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4342  :( K ) );
4343 
4344  SIMDType xmm1( (~C).load(i,j ) );
4345  SIMDType xmm2( (~C).load(i,j+1UL) );
4346  SIMDType xmm3, xmm4;
4347  size_t k( kbegin );
4348 
4349  for( ; (k+2UL) <= kend; k+=2UL ) {
4350  const SIMDType a1( A.load(i,k ) );
4351  const SIMDType a2( A.load(i,k+1UL) );
4352  xmm1 += a1 * set( B(k ,j ) );
4353  xmm2 += a1 * set( B(k ,j+1UL) );
4354  xmm3 += a2 * set( B(k+1UL,j ) );
4355  xmm4 += a2 * set( B(k+1UL,j+1UL) );
4356  }
4357 
4358  for( ; k<kend; ++k ) {
4359  const SIMDType a1( A.load(i,k) );
4360  xmm1 += a1 * set( B(k,j ) );
4361  xmm2 += a1 * set( B(k,j+1UL) );
4362  }
4363 
4364  (~C).store( i, j , xmm1+xmm3 );
4365  (~C).store( i, j+1UL, xmm2+xmm4 );
4366  }
4367 
4368  if( j < jend )
4369  {
4370  const size_t kbegin( ( IsLower<MT5>::value )
4371  ?( ( IsUpper<MT4>::value )
4372  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4373  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4374  :( IsUpper<MT4>::value ? i : 0UL ) );
4375 
4376  SIMDType xmm1( (~C).load(i,j) );
4377  SIMDType xmm2;
4378  size_t k( kbegin );
4379 
4380  for( ; (k+2UL) <= K; k+=2UL ) {
4381  xmm1 += A.load(i,k ) * set( B(k ,j) );
4382  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
4383  }
4384 
4385  for( ; k<K; ++k ) {
4386  xmm1 += A.load(i,k) * set( B(k,j) );
4387  }
4388 
4389  (~C).store( i, j, xmm1+xmm2 );
4390  }
4391  }
4392 
4393  for( ; remainder && i<M; ++i )
4394  {
4395  const size_t jend( LOW ? i+1UL : N );
4396  size_t j( UPP ? i : 0UL );
4397 
4398  for( ; (j+2UL) <= jend; j+=2UL )
4399  {
4400  const size_t kbegin( ( IsLower<MT5>::value )
4401  ?( ( IsUpper<MT4>::value )
4402  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4403  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4404  :( IsUpper<MT4>::value ? i : 0UL ) );
4405  const size_t kend( ( IsUpper<MT5>::value )
4406  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4407  :( K ) );
4408 
4409  ElementType value1( (~C)(i,j ) );
4410  ElementType value2( (~C)(i,j+1UL) );
4411 
4412  for( size_t k=kbegin; k<kend; ++k ) {
4413  value1 += A(i,k) * B(k,j );
4414  value2 += A(i,k) * B(k,j+1UL);
4415  }
4416 
4417  (~C)(i,j ) = value1;
4418  (~C)(i,j+1UL) = value2;
4419  }
4420 
4421  if( j < jend )
4422  {
4423  const size_t kbegin( ( IsLower<MT5>::value )
4424  ?( ( IsUpper<MT4>::value )
4425  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4426  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4427  :( IsUpper<MT4>::value ? i : 0UL ) );
4428 
4429  ElementType value( (~C)(i,j) );
4430 
4431  for( size_t k=kbegin; k<K; ++k ) {
4432  value += A(i,k) * B(k,j);
4433  }
4434 
4435  (~C)(i,j) = value;
4436  }
4437  }
4438  }
4440  //**********************************************************************************************
4441 
4442  //**Default addition assignment to dense matrices (large matrices)******************************
// Large-matrix addition-assignment kernel, default variant (see the section banner above):
// it performs no work of its own and simply forwards C, A and B to
// selectDefaultAddAssignKernel().
// NOTE(review): this listing jumps from line 4458 to 4460, so the declaration line between the
// template header and the function name (presumably the return type / overload-selection
// condition) is not visible here -- confirm against the original header.
4456  template< typename MT3 // Type of the left-hand side target matrix
4457  , typename MT4 // Type of the left-hand side matrix operand
4458  , typename MT5 > // Type of the right-hand side matrix operand
4460  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4461  {
4462  selectDefaultAddAssignKernel( C, A, B );
4463  }
4465  //**********************************************************************************************
4466 
4467  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Large-matrix addition-assignment kernel, vectorized variant (see the section banner above).
// Dispatches on the LOW and UPP flags (defined outside this excerpt) to one of the three
// routines lmmm(), ummm() or mmm(), always passing ElementType(1) for both scalar arguments.
// NOTE(review): the two scalars are presumably the scaling factors for the product and for
// the existing content of C, so that C += A*B results -- confirm against the declarations in
// blaze/math/dense/MMM.h.
// NOTE(review): this listing jumps from line 4484 to 4486; the declaration line carrying the
// return type is not visible here.
4482  template< typename MT3 // Type of the left-hand side target matrix
4483  , typename MT4 // Type of the left-hand side matrix operand
4484  , typename MT5 > // Type of the right-hand side matrix operand
4486  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4487  {
      // LOW is tested first, so it takes precedence if both flags are set.
4488  if( LOW )
4489  lmmm( C, A, B, ElementType(1), ElementType(1) );
4490  else if( UPP )
4491  ummm( C, A, B, ElementType(1), ElementType(1) );
4492  else
4493  mmm( C, A, B, ElementType(1), ElementType(1) );
4494  }
4496  //**********************************************************************************************
4497 
4498  //**BLAS-based addition assignment to dense matrices (default)**********************************
// BLAS addition-assignment kernel, default variant (see the section banner above): it does
// not call any BLAS routine itself and forwards C, A and B to selectLargeAddAssignKernel().
// NOTE(review): this listing jumps from line 4514 to 4516, so the declaration line between the
// template header and the function name (presumably the return type / overload-selection
// condition) is not visible here -- confirm against the original header.
4512  template< typename MT3 // Type of the left-hand side target matrix
4513  , typename MT4 // Type of the left-hand side matrix operand
4514  , typename MT5 > // Type of the right-hand side matrix operand
4516  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4517  {
4518  selectLargeAddAssignKernel( C, A, B );
4519  }
4521  //**********************************************************************************************
4522 
4523  //**BLAS-based addition assignment to dense matrices********************************************
4524 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4525 
// BLAS-based addition-assignment kernel. It is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero (see the enclosing #if).
// Three cases are distinguished at compile time via the IsTriangular trait:
//   - MT4 triangular: a temporary is built from B, passed to trmm() together with A
//     (CblasLeft), and then added to C;
//   - MT5 triangular: a temporary is built from A, passed to trmm() together with B
//     (CblasRight), and then added to C;
//   - otherwise: gemm() is called directly on C, A and B with both scalars set to ET(1).
// NOTE(review): trmm() presumably overwrites the temporary with the triangular product and
// gemm() presumably accumulates into C -- confirm against blaze/math/blas/trmm.h and gemm.h.
// NOTE(review): this listing jumps from line 4540 to 4542; the declaration line carrying the
// return type is not visible here.
4538  template< typename MT3 // Type of the left-hand side target matrix
4539  , typename MT4 // Type of the left-hand side matrix operand
4540  , typename MT5 > // Type of the right-hand side matrix operand
4542  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4543  {
4544  using ET = ElementType_<MT3>;
4545 
      // Left operand triangular: the temporary starts out as a serial evaluation of B.
4546  if( IsTriangular<MT4>::value ) {
4547  ResultType_<MT3> tmp( serial( B ) );
4548  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
4549  addAssign( C, tmp );
4550  }
      // Right operand triangular: the temporary starts out as a serial evaluation of A.
4551  else if( IsTriangular<MT5>::value ) {
4552  ResultType_<MT3> tmp( serial( A ) );
4553  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
4554  addAssign( C, tmp );
4555  }
      // Neither operand triangular: no temporary is needed.
4556  else {
4557  gemm( C, A, B, ET(1), ET(1) );
4558  }
4559  }
4561 #endif
4562  //**********************************************************************************************
4563 
4564  //**Addition assignment to sparse matrices******************************************************
4565  // No special implementation for the addition assignment to sparse matrices.
4566  //**********************************************************************************************
4567 
4568  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a TDMatDMatMultExpr to a dense matrix of either storage order.
// Steps visible in this listing:
//   1. assert that the target has the same number of rows and columns as the expression;
//   2. return immediately if the target has no rows or columns, or if the left operand has
//      no columns;
//   3. evaluate both operands via serial() into objects of type LT and RT (types declared
//      outside this excerpt);
//   4. assert that the evaluated operands match the sizes of the original operands and of
//      the target;
//   5. hand the target and both evaluated operands to selectSubAssignKernel().
// NOTE(review): this listing jumps from line 4584 to 4586; whatever statement sits on line
// 4585 directly after the opening brace is not visible here.
4581  template< typename MT // Type of the target dense matrix
4582  , bool SO > // Storage order of the target dense matrix
4583  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
4584  {
4586 
4587  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4588  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4589 
      // Nothing to subtract for an empty target or an empty inner dimension.
4590  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4591  return;
4592  }
4593 
4594  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
4595  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
4596 
4597  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4598  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4599  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4600  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4601  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4602  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4603 
4604  TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
4605  }
4607  //**********************************************************************************************
4608 
4609  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Kernel selection for the subtraction assignment: chooses between
// selectSmallSubAssignKernel() and selectBlasSubAssignKernel().
// The small kernel is taken when any of the visible conditions holds:
//   - BLAZE_DEBUG_MODE is off, MT3 is row-major and B has at most SIMDSIZE*10 columns;
//   - BLAZE_DEBUG_MODE is off, MT3 is column-major and A has at most SIMDSIZE*10 rows;
//   - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
// Otherwise the BLAS kernel is taken.
// NOTE(review): this listing jumps from line 4624 to 4626, so the line that opens the
// if-condition (and any first alternative it may contain) is not visible here -- the list
// above may therefore be incomplete; confirm against the original header.
4620  template< typename MT3 // Type of the left-hand side target matrix
4621  , typename MT4 // Type of the left-hand side matrix operand
4622  , typename MT5 > // Type of the right-hand side matrix operand
4623  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4624  {
4626  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
4627  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
4628  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
4629  selectSmallSubAssignKernel( C, A, B );
4630  else
4631  selectBlasSubAssignKernel( C, A, B );
4632  }
4634  //**********************************************************************************************
4635 
4636  //**Default subtraction assignment to row-major dense matrices (general/general)****************
// Default (scalar) subtraction-assignment kernel for a row-major target (DenseMatrix<MT3,false>),
// enabled when neither MT4 nor MT5 satisfies IsDiagonal.
// Performs (~C)(i,j) -= A(i,k) * B(k,j) in i-k-j loop order. The k range is narrowed by the
// IsUpper/IsLower (and strict) traits of MT4; the j range by the corresponding traits of MT5
// combined with the LOW/UPP flags (defined outside this excerpt).
// NOTE(review): this listing skips lines 4675 and 4680 inside the jbegin/jend initializers;
// the missing lines presumably hold the condition that chooses between the two nested
// alternatives shown -- confirm against the original header.
4650  template< typename MT3 // Type of the left-hand side target matrix
4651  , typename MT4 // Type of the left-hand side matrix operand
4652  , typename MT5 > // Type of the right-hand side matrix operand
4653  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4654  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4655  {
4656  const size_t M( A.rows() );
4657  const size_t N( B.columns() );
4658  const size_t K( A.columns() );
4659 
      // Whenever LOW or UPP is set the product is required to be square.
4660  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4661 
4662  for( size_t i=0UL; i<M; ++i )
4663  {
      // Range of k visited for row i, derived from the structure traits of MT4.
4664  const size_t kbegin( ( IsUpper<MT4>::value )
4665  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4666  :( 0UL ) );
4667  const size_t kend( ( IsLower<MT4>::value )
4668  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4669  :( K ) );
4670  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4671 
4672  for( size_t k=kbegin; k<kend; ++k )
4673  {
      // Range of j visited for this (i,k) pair, derived from MT5's traits and LOW/UPP.
4674  const size_t jbegin( ( IsUpper<MT5>::value )
4676  ?( UPP ? max(i,k+1UL) : k+1UL )
4677  :( UPP ? max(i,k) : k ) )
4678  :( UPP ? i : 0UL ) );
4679  const size_t jend( ( IsLower<MT5>::value )
4681  ?( LOW ? min(i+1UL,k) : k )
4682  :( LOW ? min(i,k)+1UL : k+1UL ) )
4683  :( LOW ? i+1UL : N ) );
4684 
      // With LOW or UPP set the range may be empty (or inverted); skip it before the
      // unsigned subtraction below.
4685  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
4686  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4687 
      // jpos marks the end of the part of the range that has an even number of elements:
      // the loop below handles two columns per iteration, the trailing 'if' at most one.
4688  const size_t jnum( jend - jbegin );
4689  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4690 
4691  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4692  (~C)(i,j ) -= A(i,k) * B(k,j );
4693  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
4694  }
4695  if( jpos < jend ) {
4696  (~C)(i,jpos) -= A(i,k) * B(k,jpos);
4697  }
4698  }
4699  }
4700  }
4702  //**********************************************************************************************
4703 
4704  //**Default subtraction assignment to column-major dense matrices (general/general)*************
// Default (scalar) subtraction-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>), enabled when neither MT4 nor MT5 satisfies IsDiagonal.
// Performs (~C)(i,j) -= A(i,k) * B(k,j) in j-k-i loop order. The k range is narrowed by the
// IsLower/IsUpper (and strict) traits of MT5; the i range by the corresponding traits of MT4
// combined with the LOW/UPP flags (defined outside this excerpt).
// NOTE(review): this listing skips lines 4743 and 4748 inside the ibegin/iend initializers;
// the missing lines presumably hold the condition that chooses between the two nested
// alternatives shown -- confirm against the original header.
4718  template< typename MT3 // Type of the left-hand side target matrix
4719  , typename MT4 // Type of the left-hand side matrix operand
4720  , typename MT5 > // Type of the right-hand side matrix operand
4721  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4722  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4723  {
4724  const size_t M( A.rows() );
4725  const size_t N( B.columns() );
4726  const size_t K( A.columns() );
4727 
      // Whenever LOW or UPP is set the product is required to be square.
4728  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4729 
4730  for( size_t j=0UL; j<N; ++j )
4731  {
      // Range of k visited for column j, derived from the structure traits of MT5.
4732  const size_t kbegin( ( IsLower<MT5>::value )
4733  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4734  :( 0UL ) );
4735  const size_t kend( ( IsUpper<MT5>::value )
4736  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4737  :( K ) );
4738  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4739 
4740  for( size_t k=kbegin; k<kend; ++k )
4741  {
      // Range of i visited for this (j,k) pair, derived from MT4's traits and LOW/UPP.
4742  const size_t ibegin( ( IsLower<MT4>::value )
4744  ?( LOW ? max(j,k+1UL) : k+1UL )
4745  :( LOW ? max(j,k) : k ) )
4746  :( LOW ? j : 0UL ) );
4747  const size_t iend( ( IsUpper<MT4>::value )
4749  ?( UPP ? min(j+1UL,k) : k )
4750  :( UPP ? min(j,k)+1UL : k+1UL ) )
4751  :( UPP ? j+1UL : M ) );
4752 
      // With LOW or UPP set the range may be empty (or inverted); skip it before the
      // unsigned subtraction below.
4753  if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
4754  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4755 
      // ipos marks the end of the part of the range that has an even number of elements:
      // the loop below handles two rows per iteration, the trailing 'if' at most one.
4756  const size_t inum( iend - ibegin );
4757  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4758 
4759  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4760  (~C)(i ,j) -= A(i ,k) * B(k,j);
4761  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
4762  }
4763  if( ipos < iend ) {
4764  (~C)(ipos,j) -= A(ipos,k) * B(k,j);
4765  }
4766  }
4767  }
4768  }
4770  //**********************************************************************************************
4771 
4772  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
// Default subtraction-assignment kernel for a row-major target (DenseMatrix<MT3,false>),
// enabled when MT5 satisfies IsDiagonal and MT4 does not.
// Only the diagonal element B(j,j) of the right operand is read, so each target element is
// updated exactly once: (~C)(i,j) -= A(i,j) * B(j,j).
// The traversal is tiled: rows and columns are walked in tiles of BLOCK_SIZE x BLOCK_SIZE
// (the last tile in each direction is clipped to M resp. N).
4786  template< typename MT3 // Type of the left-hand side target matrix
4787  , typename MT4 // Type of the left-hand side matrix operand
4788  , typename MT5 > // Type of the right-hand side matrix operand
4789  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4790  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4791  {
4792  constexpr size_t block( BLOCK_SIZE );
4793 
4794  const size_t M( A.rows() );
4795  const size_t N( B.columns() );
4796 
4797  for( size_t ii=0UL; ii<M; ii+=block ) {
4798  const size_t iend( min( M, ii+block ) );
4799  for( size_t jj=0UL; jj<N; jj+=block ) {
4800  const size_t jend( min( N, jj+block ) );
4801  for( size_t i=ii; i<iend; ++i )
4802  {
      // Column range inside the current tile, further restricted by the structure
      // traits of MT4: IsUpper raises the start to i (i+1 if strict), IsLower lowers
      // the end to i+1 (i if strict). The range may be empty; the loop then does nothing.
4803  const size_t jbegin( ( IsUpper<MT4>::value )
4804  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
4805  :( jj ) );
4806  const size_t jpos( ( IsLower<MT4>::value )
4807  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
4808  :( jend ) );
4809 
4810  for( size_t j=jbegin; j<jpos; ++j ) {
4811  (~C)(i,j) -= A(i,j) * B(j,j);
4812  }
4813  }
4814  }
4815  }
4816  }
4818  //**********************************************************************************************
4819 
4820  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
// Default subtraction-assignment kernel for a column-major target (DenseMatrix<MT3,true>),
// enabled when MT5 satisfies IsDiagonal and MT4 does not.
// Only the diagonal element B(j,j) of the right operand is read: for every column j the
// update (~C)(i,j) -= A(i,j) * B(j,j) is applied to the rows in [ibegin,iend).
4834  template< typename MT3 // Type of the left-hand side target matrix
4835  , typename MT4 // Type of the left-hand side matrix operand
4836  , typename MT5 > // Type of the right-hand side matrix operand
4837  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4838  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4839  {
4840  const size_t M( A.rows() );
4841  const size_t N( B.columns() );
4842 
4843  for( size_t j=0UL; j<N; ++j )
4844  {
      // Row range for column j, derived from the structure traits of MT4: IsLower raises
      // the start to j (j+1 if strict), IsUpper lowers the end to j+1 (j if strict).
4845  const size_t ibegin( ( IsLower<MT4>::value )
4846  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4847  :( 0UL ) );
4848  const size_t iend( ( IsUpper<MT4>::value )
4849  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
4850  :( M ) );
4851  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4852 
      // ipos marks the end of the part of the range that has an even number of elements:
      // the loop below handles two rows per iteration, the trailing 'if' at most one.
4853  const size_t inum( iend - ibegin );
4854  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4855 
4856  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4857  (~C)(i ,j) -= A(i ,j) * B(j,j);
4858  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
4859  }
4860  if( ipos < iend ) {
4861  (~C)(ipos,j) -= A(ipos,j) * B(j,j);
4862  }
4863  }
4864  }
4866  //**********************************************************************************************
4867 
4868  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
// Default subtraction-assignment kernel for a row-major target (DenseMatrix<MT3,false>),
// enabled when MT4 satisfies IsDiagonal and MT5 does not.
// Only the diagonal element A(i,i) of the left operand is read: for every row i the update
// (~C)(i,j) -= A(i,i) * B(i,j) is applied to the columns in [jbegin,jend).
4882  template< typename MT3 // Type of the left-hand side target matrix
4883  , typename MT4 // Type of the left-hand side matrix operand
4884  , typename MT5 > // Type of the right-hand side matrix operand
4885  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4886  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4887  {
4888  const size_t M( A.rows() );
4889  const size_t N( B.columns() );
4890 
4891  for( size_t i=0UL; i<M; ++i )
4892  {
      // Column range for row i, derived from the structure traits of MT5: IsUpper raises
      // the start to i (i+1 if strict), IsLower lowers the end to i+1 (i if strict).
4893  const size_t jbegin( ( IsUpper<MT5>::value )
4894  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4895  :( 0UL ) );
4896  const size_t jend( ( IsLower<MT5>::value )
4897  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
4898  :( N ) );
4899  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4900 
      // jpos marks the end of the part of the range that has an even number of elements:
      // the loop below handles two columns per iteration, the trailing 'if' at most one.
4901  const size_t jnum( jend - jbegin );
4902  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4903 
4904  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4905  (~C)(i,j ) -= A(i,i) * B(i,j );
4906  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
4907  }
4908  if( jpos < jend ) {
4909  (~C)(i,jpos) -= A(i,i) * B(i,jpos);
4910  }
4911  }
4912  }
4914  //**********************************************************************************************
4915 
4916  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
// Default subtraction-assignment kernel for a column-major target (DenseMatrix<MT3,true>),
// enabled when MT4 satisfies IsDiagonal and MT5 does not.
// Only the diagonal element A(i,i) of the left operand is read, so each target element is
// updated exactly once: (~C)(i,j) -= A(i,i) * B(i,j).
// The traversal is tiled: columns and rows are walked in tiles of BLOCK_SIZE x BLOCK_SIZE
// (the last tile in each direction is clipped to N resp. M).
4930  template< typename MT3 // Type of the left-hand side target matrix
4931  , typename MT4 // Type of the left-hand side matrix operand
4932  , typename MT5 > // Type of the right-hand side matrix operand
4933  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4934  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4935  {
4936  constexpr size_t block( BLOCK_SIZE );
4937 
4938  const size_t M( A.rows() );
4939  const size_t N( B.columns() );
4940 
4941  for( size_t jj=0UL; jj<N; jj+=block ) {
4942  const size_t jend( min( N, jj+block ) );
4943  for( size_t ii=0UL; ii<M; ii+=block ) {
4944  const size_t iend( min( M, ii+block ) );
4945  for( size_t j=jj; j<jend; ++j )
4946  {
      // Row range inside the current tile, further restricted by the structure
      // traits of MT5: IsLower raises the start to j (j+1 if strict), IsUpper lowers
      // the end to j+1 (j if strict). The range may be empty; the loop then does nothing.
4947  const size_t ibegin( ( IsLower<MT5>::value )
4948  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
4949  :( ii ) );
4950  const size_t ipos( ( IsUpper<MT5>::value )
4951  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
4952  :( iend ) );
4953 
4954  for( size_t i=ibegin; i<ipos; ++i ) {
4955  (~C)(i,j) -= A(i,i) * B(i,j);
4956  }
4957  }
4958  }
4959  }
4960  }
4962  //**********************************************************************************************
4963 
4964  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default subtraction-assignment kernel enabled when both MT4 and MT5 satisfy IsDiagonal.
// Only diagonal elements are touched: C(i,i) -= A(i,i) * B(i,i) for i in [0, A.rows()).
// Unlike the other default kernels in this section, C is taken as a plain MT3& and accessed
// directly (no DenseMatrix wrapper, no storage-order distinction).
4978  template< typename MT3 // Type of the left-hand side target matrix
4979  , typename MT4 // Type of the left-hand side matrix operand
4980  , typename MT5 > // Type of the right-hand side matrix operand
4981  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
4982  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4983  {
4984  for( size_t i=0UL; i<A.rows(); ++i ) {
4985  C(i,i) -= A(i,i) * B(i,i);
4986  }
4987  }
4989  //**********************************************************************************************
4990 
4991  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Small-matrix subtraction-assignment kernel, default variant (see the section banner above):
// it performs no work of its own and simply forwards C, A and B to
// selectDefaultSubAssignKernel().
// NOTE(review): this listing jumps from line 5007 to 5009, so the declaration line between the
// template header and the function name (presumably the return type / overload-selection
// condition) is not visible here -- confirm against the original header.
5005  template< typename MT3 // Type of the left-hand side target matrix
5006  , typename MT4 // Type of the left-hand side matrix operand
5007  , typename MT5 > // Type of the right-hand side matrix operand
5009  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5010  {
5011  selectDefaultSubAssignKernel( C, A, B );
5012  }
5014  //**********************************************************************************************
5015 
5016  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
5031  template< typename MT3 // Type of the left-hand side target matrix
5032  , typename MT4 // Type of the left-hand side matrix operand
5033  , typename MT5 > // Type of the right-hand side matrix operand
5035  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
5036  {
5037  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5038 
5039  const size_t M( A.rows() );
5040  const size_t N( B.columns() );
5041  const size_t K( A.columns() );
5042 
5043  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5044 
5045  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
5046  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
5047 
5048  size_t j( 0UL );
5049 
5051  {
5052  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5053  for( size_t i=0UL; i<M; ++i )
5054  {
5055  const size_t kbegin( ( IsUpper<MT4>::value )
5056  ?( ( IsLower<MT5>::value )
5057  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5058  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5059  :( IsLower<MT5>::value ? j : 0UL ) );
5060  const size_t kend( ( IsLower<MT4>::value )
5061  ?( ( IsUpper<MT5>::value )
5062  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5063  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
5064  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
5065 
5066  SIMDType xmm1( (~C).load(i,j ) );
5067  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5068  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5069  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5070  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
5071  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
5072  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
5073  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
5074 
5075  for( size_t k=kbegin; k<kend; ++k ) {
5076  const SIMDType a1( set( A(i,k) ) );
5077  xmm1 -= a1 * B.load(k,j );
5078  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5079  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5080  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5081  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5082  xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
5083  xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
5084  xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
5085  }
5086 
5087  (~C).store( i, j , xmm1 );
5088  (~C).store( i, j+SIMDSIZE , xmm2 );
5089  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5090  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
5091  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
5092  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
5093  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
5094  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
5095  }
5096  }
5097  }
5098 
5099  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5100  {
5101  size_t i( 0UL );
5102 
5103  for( ; (i+2UL) <= M; i+=2UL )
5104  {
5105  const size_t kbegin( ( IsUpper<MT4>::value )
5106  ?( ( IsLower<MT5>::value )
5107  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5108  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5109  :( IsLower<MT5>::value ? j : 0UL ) );
5110  const size_t kend( ( IsLower<MT4>::value )
5111  ?( ( IsUpper<MT5>::value )
5112  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
5113  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5114  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
5115 
5116  SIMDType xmm1 ( (~C).load(i ,j ) );
5117  SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
5118  SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
5119  SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
5120  SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
5121  SIMDType xmm6 ( (~C).load(i+1UL,j ) );
5122  SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
5123  SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5124  SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
5125  SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
5126 
5127  for( size_t k=kbegin; k<kend; ++k ) {
5128  const SIMDType a1( set( A(i ,k) ) );
5129  const SIMDType a2( set( A(i+1UL,k) ) );
5130  const SIMDType b1( B.load(k,j ) );
5131  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5132  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5133  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5134  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5135  xmm1 -= a1 * b1;
5136  xmm2 -= a1 * b2;
5137  xmm3 -= a1 * b3;
5138  xmm4 -= a1 * b4;
5139  xmm5 -= a1 * b5;
5140  xmm6 -= a2 * b1;
5141  xmm7 -= a2 * b2;
5142  xmm8 -= a2 * b3;
5143  xmm9 -= a2 * b4;
5144  xmm10 -= a2 * b5;
5145  }
5146 
5147  (~C).store( i , j , xmm1 );
5148  (~C).store( i , j+SIMDSIZE , xmm2 );
5149  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5150  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
5151  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
5152  (~C).store( i+1UL, j , xmm6 );
5153  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
5154  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
5155  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
5156  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
5157  }
5158 
5159  if( i < M )
5160  {
5161  const size_t kbegin( ( IsUpper<MT4>::value )
5162  ?( ( IsLower<MT5>::value )
5163  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5164  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5165  :( IsLower<MT5>::value ? j : 0UL ) );
5166  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
5167 
5168  SIMDType xmm1( (~C).load(i,j ) );
5169  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5170  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5171  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5172  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
5173 
5174  for( size_t k=kbegin; k<kend; ++k ) {
5175  const SIMDType a1( set( A(i,k) ) );
5176  xmm1 -= a1 * B.load(k,j );
5177  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5178  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5179  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5180  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5181  }
5182 
5183  (~C).store( i, j , xmm1 );
5184  (~C).store( i, j+SIMDSIZE , xmm2 );
5185  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5186  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
5187  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
5188  }
5189  }
5190 
5191  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5192  {
5193  size_t i( 0UL );
5194 
5195  for( ; (i+2UL) <= M; i+=2UL )
5196  {
5197  const size_t kbegin( ( IsUpper<MT4>::value )
5198  ?( ( IsLower<MT5>::value )
5199  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5200  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5201  :( IsLower<MT5>::value ? j : 0UL ) );
5202  const size_t kend( ( IsLower<MT4>::value )
5203  ?( ( IsUpper<MT5>::value )
5204  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5205  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5206  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
5207 
5208  SIMDType xmm1( (~C).load(i ,j ) );
5209  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
5210  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
5211  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
5212  SIMDType xmm5( (~C).load(i+1UL,j ) );
5213  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
5214  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5215  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
5216 
5217  for( size_t k=kbegin; k<kend; ++k ) {
5218  const SIMDType a1( set( A(i ,k) ) );
5219  const SIMDType a2( set( A(i+1UL,k) ) );
5220  const SIMDType b1( B.load(k,j ) );
5221  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5222  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5223  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5224  xmm1 -= a1 * b1;
5225  xmm2 -= a1 * b2;
5226  xmm3 -= a1 * b3;
5227  xmm4 -= a1 * b4;
5228  xmm5 -= a2 * b1;
5229  xmm6 -= a2 * b2;
5230  xmm7 -= a2 * b3;
5231  xmm8 -= a2 * b4;
5232  }
5233 
5234  (~C).store( i , j , xmm1 );
5235  (~C).store( i , j+SIMDSIZE , xmm2 );
5236  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5237  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
5238  (~C).store( i+1UL, j , xmm5 );
5239  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
5240  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
5241  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
5242  }
5243 
5244  if( i < M )
5245  {
5246  const size_t kbegin( ( IsUpper<MT4>::value )
5247  ?( ( IsLower<MT5>::value )
5248  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5249  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5250  :( IsLower<MT5>::value ? j : 0UL ) );
5251  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
5252 
5253  SIMDType xmm1( (~C).load(i,j ) );
5254  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5255  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5256  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5257 
5258  for( size_t k=kbegin; k<kend; ++k ) {
5259  const SIMDType a1( set( A(i,k) ) );
5260  xmm1 -= a1 * B.load(k,j );
5261  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5262  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5263  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5264  }
5265 
5266  (~C).store( i, j , xmm1 );
5267  (~C).store( i, j+SIMDSIZE , xmm2 );
5268  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5269  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
5270  }
5271  }
5272 
5273  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5274  {
5275  size_t i( 0UL );
5276 
5277  for( ; (i+2UL) <= M; i+=2UL )
5278  {
5279  const size_t kbegin( ( IsUpper<MT4>::value )
5280  ?( ( IsLower<MT5>::value )
5281  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5282  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5283  :( IsLower<MT5>::value ? j : 0UL ) );
5284  const size_t kend( ( IsLower<MT4>::value )
5285  ?( ( IsUpper<MT5>::value )
5286  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
5287  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5288  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
5289 
5290  SIMDType xmm1( (~C).load(i ,j ) );
5291  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
5292  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
5293  SIMDType xmm4( (~C).load(i+1UL,j ) );
5294  SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
5295  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5296 
5297  for( size_t k=kbegin; k<kend; ++k ) {
5298  const SIMDType a1( set( A(i ,k) ) );
5299  const SIMDType a2( set( A(i+1UL,k) ) );
5300  const SIMDType b1( B.load(k,j ) );
5301  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5302  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5303  xmm1 -= a1 * b1;
5304  xmm2 -= a1 * b2;
5305  xmm3 -= a1 * b3;
5306  xmm4 -= a2 * b1;
5307  xmm5 -= a2 * b2;
5308  xmm6 -= a2 * b3;
5309  }
5310 
5311  (~C).store( i , j , xmm1 );
5312  (~C).store( i , j+SIMDSIZE , xmm2 );
5313  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5314  (~C).store( i+1UL, j , xmm4 );
5315  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
5316  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
5317  }
5318 
5319  if( i < M )
5320  {
5321  const size_t kbegin( ( IsUpper<MT4>::value )
5322  ?( ( IsLower<MT5>::value )
5323  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5324  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5325  :( IsLower<MT5>::value ? j : 0UL ) );
5326  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
5327 
5328  SIMDType xmm1( (~C).load(i,j ) );
5329  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5330  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5331 
5332  for( size_t k=kbegin; k<kend; ++k ) {
5333  const SIMDType a1( set( A(i,k) ) );
5334  xmm1 -= a1 * B.load(k,j );
5335  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5336  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5337  }
5338 
5339  (~C).store( i, j , xmm1 );
5340  (~C).store( i, j+SIMDSIZE , xmm2 );
5341  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5342  }
5343  }
5344 
5345  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5346  {
5347  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
5348  size_t i( LOW ? j : 0UL );
5349 
5350  for( ; (i+4UL) <= iend; i+=4UL )
5351  {
5352  const size_t kbegin( ( IsUpper<MT4>::value )
5353  ?( ( IsLower<MT5>::value )
5354  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5355  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5356  :( IsLower<MT5>::value ? j : 0UL ) );
5357  const size_t kend( ( IsLower<MT4>::value )
5358  ?( ( IsUpper<MT5>::value )
5359  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
5360  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
5361  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
5362 
5363  SIMDType xmm1( (~C).load(i ,j ) );
5364  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5365  SIMDType xmm3( (~C).load(i+1UL,j ) );
5366  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5367  SIMDType xmm5( (~C).load(i+2UL,j ) );
5368  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
5369  SIMDType xmm7( (~C).load(i+3UL,j ) );
5370  SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
5371 
5372  for( size_t k=kbegin; k<kend; ++k ) {
5373  const SIMDType a1( set( A(i ,k) ) );
5374  const SIMDType a2( set( A(i+1UL,k) ) );
5375  const SIMDType a3( set( A(i+2UL,k) ) );
5376  const SIMDType a4( set( A(i+3UL,k) ) );
5377  const SIMDType b1( B.load(k,j ) );
5378  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5379  xmm1 -= a1 * b1;
5380  xmm2 -= a1 * b2;
5381  xmm3 -= a2 * b1;
5382  xmm4 -= a2 * b2;
5383  xmm5 -= a3 * b1;
5384  xmm6 -= a3 * b2;
5385  xmm7 -= a4 * b1;
5386  xmm8 -= a4 * b2;
5387  }
5388 
5389  (~C).store( i , j , xmm1 );
5390  (~C).store( i , j+SIMDSIZE, xmm2 );
5391  (~C).store( i+1UL, j , xmm3 );
5392  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
5393  (~C).store( i+2UL, j , xmm5 );
5394  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
5395  (~C).store( i+3UL, j , xmm7 );
5396  (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
5397  }
5398 
5399  for( ; (i+3UL) <= iend; i+=3UL )
5400  {
5401  const size_t kbegin( ( IsUpper<MT4>::value )
5402  ?( ( IsLower<MT5>::value )
5403  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5404  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5405  :( IsLower<MT5>::value ? j : 0UL ) );
5406  const size_t kend( ( IsLower<MT4>::value )
5407  ?( ( IsUpper<MT5>::value )
5408  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
5409  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
5410  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
5411 
5412  SIMDType xmm1( (~C).load(i ,j ) );
5413  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5414  SIMDType xmm3( (~C).load(i+1UL,j ) );
5415  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5416  SIMDType xmm5( (~C).load(i+2UL,j ) );
5417  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
5418 
5419  for( size_t k=kbegin; k<kend; ++k ) {
5420  const SIMDType a1( set( A(i ,k) ) );
5421  const SIMDType a2( set( A(i+1UL,k) ) );
5422  const SIMDType a3( set( A(i+2UL,k) ) );
5423  const SIMDType b1( B.load(k,j ) );
5424  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5425  xmm1 -= a1 * b1;
5426  xmm2 -= a1 * b2;
5427  xmm3 -= a2 * b1;
5428  xmm4 -= a2 * b2;
5429  xmm5 -= a3 * b1;
5430  xmm6 -= a3 * b2;
5431  }
5432 
5433  (~C).store( i , j , xmm1 );
5434  (~C).store( i , j+SIMDSIZE, xmm2 );
5435  (~C).store( i+1UL, j , xmm3 );
5436  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
5437  (~C).store( i+2UL, j , xmm5 );
5438  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
5439  }
5440 
5441  for( ; (i+2UL) <= iend; i+=2UL )
5442  {
5443  const size_t kbegin( ( IsUpper<MT4>::value )
5444  ?( ( IsLower<MT5>::value )
5445  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5446  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5447  :( IsLower<MT5>::value ? j : 0UL ) );
5448  const size_t kend( ( IsLower<MT4>::value )
5449  ?( ( IsUpper<MT5>::value )
5450  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
5451  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5452  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
5453 
5454  SIMDType xmm1( (~C).load(i ,j ) );
5455  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5456  SIMDType xmm3( (~C).load(i+1UL,j ) );
5457  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5458  SIMDType xmm5, xmm6, xmm7, xmm8;
5459  size_t k( kbegin );
5460 
5461  for( ; (k+2UL) <= kend; k+=2UL ) {
5462  const SIMDType a1( set( A(i ,k ) ) );
5463  const SIMDType a2( set( A(i+1UL,k ) ) );
5464  const SIMDType a3( set( A(i ,k+1UL) ) );
5465  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
5466  const SIMDType b1( B.load(k ,j ) );
5467  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
5468  const SIMDType b3( B.load(k+1UL,j ) );
5469  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
5470  xmm1 -= a1 * b1;
5471  xmm2 -= a1 * b2;
5472  xmm3 -= a2 * b1;
5473  xmm4 -= a2 * b2;
5474  xmm5 -= a3 * b3;
5475  xmm6 -= a3 * b4;
5476  xmm7 -= a4 * b3;
5477  xmm8 -= a4 * b4;
5478  }
5479 
5480  for( ; k<kend; ++k ) {
5481  const SIMDType a1( set( A(i ,k) ) );
5482  const SIMDType a2( set( A(i+1UL,k) ) );
5483  const SIMDType b1( B.load(k,j ) );
5484  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5485  xmm1 -= a1 * b1;
5486  xmm2 -= a1 * b2;
5487  xmm3 -= a2 * b1;
5488  xmm4 -= a2 * b2;
5489  }
5490 
5491  (~C).store( i , j , xmm1+xmm5 );
5492  (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
5493  (~C).store( i+1UL, j , xmm3+xmm7 );
5494  (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
5495  }
5496 
5497  if( i < iend )
5498  {
5499  const size_t kbegin( ( IsUpper<MT4>::value )
5500  ?( ( IsLower<MT5>::value )
5501  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5502  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5503  :( IsLower<MT5>::value ? j : 0UL ) );
5504  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
5505 
5506  SIMDType xmm1( (~C).load(i,j ) );
5507  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
5508  SIMDType xmm3, xmm4;
5509  size_t k( kbegin );
5510 
5511  for( ; (k+2UL) <= kend; k+=2UL ) {
5512  const SIMDType a1( set( A(i,k ) ) );
5513  const SIMDType a2( set( A(i,k+1UL) ) );
5514  xmm1 -= a1 * B.load(k ,j );
5515  xmm2 -= a1 * B.load(k ,j+SIMDSIZE);
5516  xmm3 -= a2 * B.load(k+1UL,j );
5517  xmm4 -= a2 * B.load(k+1UL,j+SIMDSIZE);
5518  }
5519 
5520  for( ; k<kend; ++k ) {
5521  const SIMDType a1( set( A(i,k) ) );
5522  xmm1 -= a1 * B.load(k,j );
5523  xmm2 -= a1 * B.load(k,j+SIMDSIZE);
5524  }
5525 
5526  (~C).store( i, j , xmm1+xmm3 );
5527  (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
5528  }
5529  }
5530 
5531  for( ; j<jpos; j+=SIMDSIZE )
5532  {
5533  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
5534  size_t i( LOW ? j : 0UL );
5535 
5536  for( ; (i+4UL) <= iend; i+=4UL )
5537  {
5538  const size_t kbegin( ( IsUpper<MT4>::value )
5539  ?( ( IsLower<MT5>::value )
5540  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5541  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5542  :( IsLower<MT5>::value ? j : 0UL ) );
5543  const size_t kend( ( IsLower<MT4>::value )
5544  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
5545  :( K ) );
5546 
5547  SIMDType xmm1( (~C).load(i ,j) );
5548  SIMDType xmm2( (~C).load(i+1UL,j) );
5549  SIMDType xmm3( (~C).load(i+2UL,j) );
5550  SIMDType xmm4( (~C).load(i+3UL,j) );
5551  SIMDType xmm5, xmm6, xmm7, xmm8;
5552  size_t k( kbegin );
5553 
5554  for( ; (k+2UL) <= kend; k+=2UL ) {
5555  const SIMDType b1( B.load(k ,j) );
5556  const SIMDType b2( B.load(k+1UL,j) );
5557  xmm1 -= set( A(i ,k ) ) * b1;
5558  xmm2 -= set( A(i+1UL,k ) ) * b1;
5559  xmm3 -= set( A(i+2UL,k ) ) * b1;
5560  xmm4 -= set( A(i+3UL,k ) ) * b1;
5561  xmm5 -= set( A(i ,k+1UL) ) * b2;
5562  xmm6 -= set( A(i+1UL,k+1UL) ) * b2;
5563  xmm7 -= set( A(i+2UL,k+1UL) ) * b2;
5564  xmm8 -= set( A(i+3UL,k+1UL) ) * b2;
5565  }
5566 
5567  for( ; k<kend; ++k ) {
5568  const SIMDType b1( B.load(k,j) );
5569  xmm1 -= set( A(i ,k) ) * b1;
5570  xmm2 -= set( A(i+1UL,k) ) * b1;
5571  xmm3 -= set( A(i+2UL,k) ) * b1;
5572  xmm4 -= set( A(i+3UL,k) ) * b1;
5573  }
5574 
5575  (~C).store( i , j, xmm1+xmm5 );
5576  (~C).store( i+1UL, j, xmm2+xmm6 );
5577  (~C).store( i+2UL, j, xmm3+xmm7 );
5578  (~C).store( i+3UL, j, xmm4+xmm8 );
5579  }
5580 
5581  for( ; (i+3UL) <= iend; i+=3UL )
5582  {
5583  const size_t kbegin( ( IsUpper<MT4>::value )
5584  ?( ( IsLower<MT5>::value )
5585  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5586  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5587  :( IsLower<MT5>::value ? j : 0UL ) );
5588  const size_t kend( ( IsLower<MT4>::value )
5589  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
5590  :( K ) );
5591 
5592  SIMDType xmm1( (~C).load(i ,j) );
5593  SIMDType xmm2( (~C).load(i+1UL,j) );
5594  SIMDType xmm3( (~C).load(i+2UL,j) );
5595  SIMDType xmm4, xmm5, xmm6;
5596  size_t k( kbegin );
5597 
5598  for( ; (k+2UL) <= kend; k+=2UL ) {
5599  const SIMDType b1( B.load(k ,j) );
5600  const SIMDType b2( B.load(k+1UL,j) );
5601  xmm1 -= set( A(i ,k ) ) * b1;
5602  xmm2 -= set( A(i+1UL,k ) ) * b1;
5603  xmm3 -= set( A(i+2UL,k ) ) * b1;
5604  xmm4 -= set( A(i ,k+1UL) ) * b2;
5605  xmm5 -= set( A(i+1UL,k+1UL) ) * b2;
5606  xmm6 -= set( A(i+2UL,k+1UL) ) * b2;
5607  }
5608 
5609  for( ; k<kend; ++k ) {
5610  const SIMDType b1( B.load(k,j) );
5611  xmm1 -= set( A(i ,k) ) * b1;
5612  xmm2 -= set( A(i+1UL,k) ) * b1;
5613  xmm3 -= set( A(i+2UL,k) ) * b1;
5614  }
5615 
5616  (~C).store( i , j, xmm1+xmm4 );
5617  (~C).store( i+1UL, j, xmm2+xmm5 );
5618  (~C).store( i+2UL, j, xmm3+xmm6 );
5619  }
5620 
5621  for( ; (i+2UL) <= iend; i+=2UL )
5622  {
5623  const size_t kbegin( ( IsUpper<MT4>::value )
5624  ?( ( IsLower<MT5>::value )
5625  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5626  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5627  :( IsLower<MT5>::value ? j : 0UL ) );
5628  const size_t kend( ( IsLower<MT4>::value )
5629  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5630  :( K ) );
5631 
5632  SIMDType xmm1( (~C).load(i ,j) );
5633  SIMDType xmm2( (~C).load(i+1UL,j) );
5634  SIMDType xmm3, xmm4;
5635  size_t k( kbegin );
5636 
5637  for( ; (k+2UL) <= kend; k+=2UL ) {
5638  const SIMDType b1( B.load(k ,j) );
5639  const SIMDType b2( B.load(k+1UL,j) );
5640  xmm1 -= set( A(i ,k ) ) * b1;
5641  xmm2 -= set( A(i+1UL,k ) ) * b1;
5642  xmm3 -= set( A(i ,k+1UL) ) * b2;
5643  xmm4 -= set( A(i+1UL,k+1UL) ) * b2;
5644  }
5645 
5646  for( ; k<kend; ++k ) {
5647  const SIMDType b1( B.load(k,j) );
5648  xmm1 -= set( A(i ,k) ) * b1;
5649  xmm2 -= set( A(i+1UL,k) ) * b1;
5650  }
5651 
5652  (~C).store( i , j, xmm1+xmm3 );
5653  (~C).store( i+1UL, j, xmm2+xmm4 );
5654  }
5655 
5656  if( i < iend )
5657  {
5658  const size_t kbegin( ( IsUpper<MT4>::value )
5659  ?( ( IsLower<MT5>::value )
5660  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5661  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5662  :( IsLower<MT5>::value ? j : 0UL ) );
5663 
5664  SIMDType xmm1( (~C).load(i,j) );
5665  SIMDType xmm2;
5666  size_t k( kbegin );
5667 
5668  for( ; (k+2UL) <= K; k+=2UL ) {
5669  xmm1 -= set( A(i,k ) ) * B.load(k ,j);
5670  xmm2 -= set( A(i,k+1UL) ) * B.load(k+1UL,j);
5671  }
5672 
5673  for( ; k<K; ++k ) {
5674  xmm1 -= set( A(i,k) ) * B.load(k,j);
5675  }
5676 
5677  (~C).store( i, j, xmm1+xmm2 );
5678  }
5679  }
5680 
5681  for( ; remainder && j<N; ++j )
5682  {
5683  const size_t iend( UPP ? j+1UL : M );
5684  size_t i( LOW ? j : 0UL );
5685 
5686  for( ; (i+2UL) <= iend; i+=2UL )
5687  {
5688  const size_t kbegin( ( IsUpper<MT4>::value )
5689  ?( ( IsLower<MT5>::value )
5690  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5691  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5692  :( IsLower<MT5>::value ? j : 0UL ) );
5693  const size_t kend( ( IsLower<MT4>::value )
5694  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5695  :( K ) );
5696 
5697  ElementType value1( (~C)(i ,j) );
5698  ElementType value2( (~C)(i+1UL,j) );
5699 
5700  for( size_t k=kbegin; k<kend; ++k ) {
5701  value1 -= A(i ,k) * B(k,j);
5702  value2 -= A(i+1UL,k) * B(k,j);
5703  }
5704 
5705  (~C)(i ,j) = value1;
5706  (~C)(i+1UL,j) = value2;
5707  }
5708 
5709  if( i < iend )
5710  {
5711  const size_t kbegin( ( IsUpper<MT4>::value )
5712  ?( ( IsLower<MT5>::value )
5713  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5714  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5715  :( IsLower<MT5>::value ? j : 0UL ) );
5716 
5717  ElementType value( (~C)(i,j) );
5718 
5719  for( size_t k=kbegin; k<K; ++k ) {
5720  value -= A(i,k) * B(k,j);
5721  }
5722 
5723  (~C)(i,j) = value;
5724  }
5725  }
5726  }
5728  //**********************************************************************************************
5729 
5730  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
5745  template< typename MT3 // Type of the left-hand side target matrix
5746  , typename MT4 // Type of the left-hand side matrix operand
5747  , typename MT5 > // Type of the right-hand side matrix operand
5749  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
5750  {
5751  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
5752 
5753  const size_t M( A.rows() );
5754  const size_t N( B.columns() );
5755  const size_t K( A.columns() );
5756 
5757  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5758 
5759  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
5760  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5761 
5762  size_t i( 0UL );
5763 
5765  {
5766  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5767  for( size_t j=0UL; j<N; ++j )
5768  {
5769  const size_t kbegin( ( IsLower<MT5>::value )
5770  ?( ( IsUpper<MT4>::value )
5771  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5772  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5773  :( IsUpper<MT4>::value ? i : 0UL ) );
5774  const size_t kend( ( IsUpper<MT5>::value )
5775  ?( ( IsLower<MT4>::value )
5776  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
5777  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
5778  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
5779 
5780  SIMDType xmm1( (~C).load(i ,j) );
5781  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5782  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5783  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5784  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
5785  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
5786  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
5787  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
5788 
5789  for( size_t k=kbegin; k<kend; ++k ) {
5790  const SIMDType b1( set( B(k,j) ) );
5791  xmm1 -= A.load(i ,k) * b1;
5792  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5793  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5794  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5795  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5796  xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
5797  xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
5798  xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
5799  }
5800 
5801  (~C).store( i , j, xmm1 );
5802  (~C).store( i+SIMDSIZE , j, xmm2 );
5803  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5804  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
5805  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
5806  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
5807  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
5808  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
5809  }
5810  }
5811  }
5812 
5813  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5814  {
5815  size_t j( 0UL );
5816 
5817  for( ; (j+2UL) <= N; j+=2UL )
5818  {
5819  const size_t kbegin( ( IsLower<MT5>::value )
5820  ?( ( IsUpper<MT4>::value )
5821  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5822  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5823  :( IsUpper<MT4>::value ? i : 0UL ) );
5824  const size_t kend( ( IsUpper<MT5>::value )
5825  ?( ( IsLower<MT4>::value )
5826  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5827  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5828  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
5829 
5830  SIMDType xmm1 ( (~C).load(i ,j ) );
5831  SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
5832  SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
5833  SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
5834  SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
5835  SIMDType xmm6 ( (~C).load(i ,j+1UL) );
5836  SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
5837  SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
5838  SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
5839  SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
5840 
5841  for( size_t k=kbegin; k<kend; ++k ) {
5842  const SIMDType a1( A.load(i ,k) );
5843  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5844  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5845  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5846  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5847  const SIMDType b1( set( B(k,j ) ) );
5848  const SIMDType b2( set( B(k,j+1UL) ) );
5849  xmm1 -= a1 * b1;
5850  xmm2 -= a2 * b1;
5851  xmm3 -= a3 * b1;
5852  xmm4 -= a4 * b1;
5853  xmm5 -= a5 * b1;
5854  xmm6 -= a1 * b2;
5855  xmm7 -= a2 * b2;
5856  xmm8 -= a3 * b2;
5857  xmm9 -= a4 * b2;
5858  xmm10 -= a5 * b2;
5859  }
5860 
5861  (~C).store( i , j , xmm1 );
5862  (~C).store( i+SIMDSIZE , j , xmm2 );
5863  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
5864  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
5865  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
5866  (~C).store( i , j+1UL, xmm6 );
5867  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
5868  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
5869  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
5870  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
5871  }
5872 
5873  if( j < N )
5874  {
5875  const size_t kbegin( ( IsLower<MT5>::value )
5876  ?( ( IsUpper<MT4>::value )
5877  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5878  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5879  :( IsUpper<MT4>::value ? i : 0UL ) );
5880  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
5881 
5882  SIMDType xmm1( (~C).load(i ,j) );
5883  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5884  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5885  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5886  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
5887 
5888  for( size_t k=kbegin; k<kend; ++k ) {
5889  const SIMDType b1( set( B(k,j) ) );
5890  xmm1 -= A.load(i ,k) * b1;
5891  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5892  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5893  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5894  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5895  }
5896 
5897  (~C).store( i , j, xmm1 );
5898  (~C).store( i+SIMDSIZE , j, xmm2 );
5899  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5900  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
5901  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
5902  }
5903  }
5904 
5905  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5906  {
5907  size_t j( 0UL );
5908 
5909  for( ; (j+2UL) <= N; j+=2UL )
5910  {
5911  const size_t kbegin( ( IsLower<MT5>::value )
5912  ?( ( IsUpper<MT4>::value )
5913  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5914  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5915  :( IsUpper<MT4>::value ? i : 0UL ) );
5916  const size_t kend( ( IsUpper<MT5>::value )
5917  ?( ( IsLower<MT4>::value )
5918  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5919  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5920  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
5921 
5922  SIMDType xmm1( (~C).load(i ,j ) );
5923  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
5924  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
5925  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
5926  SIMDType xmm5( (~C).load(i ,j+1UL) );
5927  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
5928  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
5929  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
5930 
5931  for( size_t k=kbegin; k<kend; ++k ) {
5932  const SIMDType a1( A.load(i ,k) );
5933  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5934  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5935  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5936  const SIMDType b1( set( B(k,j ) ) );
5937  const SIMDType b2( set( B(k,j+1UL) ) );
5938  xmm1 -= a1 * b1;
5939  xmm2 -= a2 * b1;
5940  xmm3 -= a3 * b1;
5941  xmm4 -= a4 * b1;
5942  xmm5 -= a1 * b2;
5943  xmm6 -= a2 * b2;
5944  xmm7 -= a3 * b2;
5945  xmm8 -= a4 * b2;
5946  }
5947 
5948  (~C).store( i , j , xmm1 );
5949  (~C).store( i+SIMDSIZE , j , xmm2 );
5950  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
5951  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
5952  (~C).store( i , j+1UL, xmm5 );
5953  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
5954  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
5955  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
5956  }
5957 
5958  if( j < N )
5959  {
5960  const size_t kbegin( ( IsLower<MT5>::value )
5961  ?( ( IsUpper<MT4>::value )
5962  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5963  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5964  :( IsUpper<MT4>::value ? i : 0UL ) );
5965  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
5966 
5967  SIMDType xmm1( (~C).load(i ,j) );
5968  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5969  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5970  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5971 
5972  for( size_t k=kbegin; k<kend; ++k ) {
5973  const SIMDType b1( set( B(k,j) ) );
5974  xmm1 -= A.load(i ,k) * b1;
5975  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5976  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5977  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5978  }
5979 
5980  (~C).store( i , j, xmm1 );
5981  (~C).store( i+SIMDSIZE , j, xmm2 );
5982  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5983  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
5984  }
5985  }
5986 
5987  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5988  {
5989  size_t j( 0UL );
5990 
5991  for( ; (j+2UL) <= N; j+=2UL )
5992  {
5993  const size_t kbegin( ( IsLower<MT5>::value )
5994  ?( ( IsUpper<MT4>::value )
5995  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5996  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5997  :( IsUpper<MT4>::value ? i : 0UL ) );
5998  const size_t kend( ( IsUpper<MT5>::value )
5999  ?( ( IsLower<MT4>::value )
6000  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6001  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6002  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
6003 
6004  SIMDType xmm1( (~C).load(i ,j ) );
6005  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
6006  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
6007  SIMDType xmm4( (~C).load(i ,j+1UL) );
6008  SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
6009  SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
6010 
6011  for( size_t k=kbegin; k<kend; ++k ) {
6012  const SIMDType a1( A.load(i ,k) );
6013  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6014  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6015  const SIMDType b1( set( B(k,j ) ) );
6016  const SIMDType b2( set( B(k,j+1UL) ) );
6017  xmm1 -= a1 * b1;
6018  xmm2 -= a2 * b1;
6019  xmm3 -= a3 * b1;
6020  xmm4 -= a1 * b2;
6021  xmm5 -= a2 * b2;
6022  xmm6 -= a3 * b2;
6023  }
6024 
6025  (~C).store( i , j , xmm1 );
6026  (~C).store( i+SIMDSIZE , j , xmm2 );
6027  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
6028  (~C).store( i , j+1UL, xmm4 );
6029  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
6030  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
6031  }
6032 
6033  if( j < N )
6034  {
6035  const size_t kbegin( ( IsLower<MT5>::value )
6036  ?( ( IsUpper<MT4>::value )
6037  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6038  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6039  :( IsUpper<MT4>::value ? i : 0UL ) );
6040  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
6041 
6042  SIMDType xmm1( (~C).load(i ,j) );
6043  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
6044  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
6045 
6046  for( size_t k=kbegin; k<kend; ++k ) {
6047  const SIMDType b1( set( B(k,j) ) );
6048  xmm1 -= A.load(i ,k) * b1;
6049  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6050  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6051  }
6052 
6053  (~C).store( i , j, xmm1 );
6054  (~C).store( i+SIMDSIZE , j, xmm2 );
6055  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
6056  }
6057  }
6058 
6059  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6060  {
6061  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
6062  size_t j( UPP ? i : 0UL );
6063 
6064  for( ; (j+4UL) <= jend; j+=4UL )
6065  {
6066  const size_t kbegin( ( IsLower<MT5>::value )
6067  ?( ( IsUpper<MT4>::value )
6068  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6069  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6070  :( IsUpper<MT4>::value ? i : 0UL ) );
6071  const size_t kend( ( IsUpper<MT5>::value )
6072  ?( ( IsLower<MT4>::value )
6073  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
6074  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
6075  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6076 
6077  SIMDType xmm1( (~C).load(i ,j ) );
6078  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6079  SIMDType xmm3( (~C).load(i ,j+1UL) );
6080  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6081  SIMDType xmm5( (~C).load(i ,j+2UL) );
6082  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
6083  SIMDType xmm7( (~C).load(i ,j+3UL) );
6084  SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
6085 
6086  for( size_t k=kbegin; k<kend; ++k ) {
6087  const SIMDType a1( A.load(i ,k) );
6088  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6089  const SIMDType b1( set( B(k,j ) ) );
6090  const SIMDType b2( set( B(k,j+1UL) ) );
6091  const SIMDType b3( set( B(k,j+2UL) ) );
6092  const SIMDType b4( set( B(k,j+3UL) ) );
6093  xmm1 -= a1 * b1;
6094  xmm2 -= a2 * b1;
6095  xmm3 -= a1 * b2;
6096  xmm4 -= a2 * b2;
6097  xmm5 -= a1 * b3;
6098  xmm6 -= a2 * b3;
6099  xmm7 -= a1 * b4;
6100  xmm8 -= a2 * b4;
6101  }
6102 
6103  (~C).store( i , j , xmm1 );
6104  (~C).store( i+SIMDSIZE, j , xmm2 );
6105  (~C).store( i , j+1UL, xmm3 );
6106  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
6107  (~C).store( i , j+2UL, xmm5 );
6108  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
6109  (~C).store( i , j+3UL, xmm7 );
6110  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
6111  }
6112 
6113  for( ; (j+3UL) <= jend; j+=3UL )
6114  {
6115  const size_t kbegin( ( IsLower<MT5>::value )
6116  ?( ( IsUpper<MT4>::value )
6117  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6118  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6119  :( IsUpper<MT4>::value ? i : 0UL ) );
6120  const size_t kend( ( IsUpper<MT5>::value )
6121  ?( ( IsLower<MT4>::value )
6122  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
6123  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
6124  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6125 
6126  SIMDType xmm1( (~C).load(i ,j ) );
6127  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6128  SIMDType xmm3( (~C).load(i ,j+1UL) );
6129  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6130  SIMDType xmm5( (~C).load(i ,j+2UL) );
6131  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
6132 
6133  for( size_t k=kbegin; k<kend; ++k ) {
6134  const SIMDType a1( A.load(i ,k) );
6135  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6136  const SIMDType b1( set( B(k,j ) ) );
6137  const SIMDType b2( set( B(k,j+1UL) ) );
6138  const SIMDType b3( set( B(k,j+2UL) ) );
6139  xmm1 -= a1 * b1;
6140  xmm2 -= a2 * b1;
6141  xmm3 -= a1 * b2;
6142  xmm4 -= a2 * b2;
6143  xmm5 -= a1 * b3;
6144  xmm6 -= a2 * b3;
6145  }
6146 
6147  (~C).store( i , j , xmm1 );
6148  (~C).store( i+SIMDSIZE, j , xmm2 );
6149  (~C).store( i , j+1UL, xmm3 );
6150  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
6151  (~C).store( i , j+2UL, xmm5 );
6152  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
6153  }
6154 
6155  for( ; (j+2UL) <= jend; j+=2UL )
6156  {
6157  const size_t kbegin( ( IsLower<MT5>::value )
6158  ?( ( IsUpper<MT4>::value )
6159  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6160  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6161  :( IsUpper<MT4>::value ? i : 0UL ) );
6162  const size_t kend( ( IsUpper<MT5>::value )
6163  ?( ( IsLower<MT4>::value )
6164  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6165  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6166  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6167 
6168  SIMDType xmm1( (~C).load(i ,j ) );
6169  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6170  SIMDType xmm3( (~C).load(i ,j+1UL) );
6171  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6172  SIMDType xmm5, xmm6, xmm7, xmm8;
6173  size_t k( kbegin );
6174 
6175  for( ; (k+2UL) <= kend; k+=2UL ) {
6176  const SIMDType a1( A.load(i ,k ) );
6177  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6178  const SIMDType a3( A.load(i ,k+1UL) );
6179  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6180  const SIMDType b1( set( B(k ,j ) ) );
6181  const SIMDType b2( set( B(k ,j+1UL) ) );
6182  const SIMDType b3( set( B(k+1UL,j ) ) );
6183  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
6184  xmm1 -= a1 * b1;
6185  xmm2 -= a2 * b1;
6186  xmm3 -= a1 * b2;
6187  xmm4 -= a2 * b2;
6188  xmm5 -= a3 * b3;
6189  xmm6 -= a4 * b3;
6190  xmm7 -= a3 * b4;
6191  xmm8 -= a4 * b4;
6192  }
6193 
6194  for( ; k<kend; ++k ) {
6195  const SIMDType a1( A.load(i ,k) );
6196  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6197  const SIMDType b1( set( B(k,j ) ) );
6198  const SIMDType b2( set( B(k,j+1UL) ) );
6199  xmm1 -= a1 * b1;
6200  xmm2 -= a2 * b1;
6201  xmm3 -= a1 * b2;
6202  xmm4 -= a2 * b2;
6203  }
6204 
6205  (~C).store( i , j , xmm1+xmm5 );
6206  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
6207  (~C).store( i , j+1UL, xmm3+xmm7 );
6208  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
6209  }
6210 
6211  if( j < jend )
6212  {
6213  const size_t kbegin( ( IsLower<MT5>::value )
6214  ?( ( IsUpper<MT4>::value )
6215  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6216  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6217  :( IsUpper<MT4>::value ? i : 0UL ) );
6218  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
6219 
6220  SIMDType xmm1( (~C).load(i ,j) );
6221  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
6222  SIMDType xmm3, xmm4;
6223  size_t k( kbegin );
6224 
6225  for( ; (k+2UL) <= kend; k+=2UL ) {
6226  const SIMDType b1( set( B(k ,j) ) );
6227  const SIMDType b2( set( B(k+1UL,j) ) );
6228  xmm1 -= A.load(i ,k ) * b1;
6229  xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
6230  xmm3 -= A.load(i ,k+1UL) * b2;
6231  xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
6232  }
6233 
6234  for( ; k<kend; ++k ) {
6235  const SIMDType b1( set( B(k,j) ) );
6236  xmm1 -= A.load(i ,k) * b1;
6237  xmm2 -= A.load(i+SIMDSIZE,k) * b1;
6238  }
6239 
6240  (~C).store( i , j, xmm1+xmm3 );
6241  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
6242  }
6243  }
6244 
6245  for( ; i<ipos; i+=SIMDSIZE )
6246  {
6247  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
6248  size_t j( UPP ? i : 0UL );
6249 
6250  for( ; (j+4UL) <= jend; j+=4UL )
6251  {
6252  const size_t kbegin( ( IsLower<MT5>::value )
6253  ?( ( IsUpper<MT4>::value )
6254  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6255  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6256  :( IsUpper<MT4>::value ? i : 0UL ) );
6257  const size_t kend( ( IsUpper<MT5>::value )
6258  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
6259  :( K ) );
6260 
6261  SIMDType xmm1( (~C).load(i,j ) );
6262  SIMDType xmm2( (~C).load(i,j+1UL) );
6263  SIMDType xmm3( (~C).load(i,j+2UL) );
6264  SIMDType xmm4( (~C).load(i,j+3UL) );
6265  SIMDType xmm5, xmm6, xmm7, xmm8;
6266  size_t k( kbegin );
6267 
6268  for( ; (k+2UL) <= kend; k+=2UL ) {
6269  const SIMDType a1( A.load(i,k ) );
6270  const SIMDType a2( A.load(i,k+1UL) );
6271  xmm1 -= a1 * set( B(k ,j ) );
6272  xmm2 -= a1 * set( B(k ,j+1UL) );
6273  xmm3 -= a1 * set( B(k ,j+2UL) );
6274  xmm4 -= a1 * set( B(k ,j+3UL) );
6275  xmm5 -= a2 * set( B(k+1UL,j ) );
6276  xmm6 -= a2 * set( B(k+1UL,j+1UL) );
6277  xmm7 -= a2 * set( B(k+1UL,j+2UL) );
6278  xmm8 -= a2 * set( B(k+1UL,j+3UL) );
6279  }
6280 
6281  for( ; k<kend; ++k ) {
6282  const SIMDType a1( A.load(i,k) );
6283  xmm1 -= a1 * set( B(k,j ) );
6284  xmm2 -= a1 * set( B(k,j+1UL) );
6285  xmm3 -= a1 * set( B(k,j+2UL) );
6286  xmm4 -= a1 * set( B(k,j+3UL) );
6287  }
6288 
6289  (~C).store( i, j , xmm1+xmm5 );
6290  (~C).store( i, j+1UL, xmm2+xmm6 );
6291  (~C).store( i, j+2UL, xmm3+xmm7 );
6292  (~C).store( i, j+3UL, xmm4+xmm8 );
6293  }
6294 
6295  for( ; (j+3UL) <= jend; j+=3UL )
6296  {
6297  const size_t kbegin( ( IsLower<MT5>::value )
6298  ?( ( IsUpper<MT4>::value )
6299  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6300  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6301  :( IsUpper<MT4>::value ? i : 0UL ) );
6302  const size_t kend( ( IsUpper<MT5>::value )
6303  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
6304  :( K ) );
6305 
6306  SIMDType xmm1( (~C).load(i,j ) );
6307  SIMDType xmm2( (~C).load(i,j+1UL) );
6308  SIMDType xmm3( (~C).load(i,j+2UL) );
6309  SIMDType xmm4, xmm5, xmm6;
6310  size_t k( kbegin );
6311 
6312  for( ; (k+2UL) <= kend; k+=2UL ) {
6313  const SIMDType a1( A.load(i,k ) );
6314  const SIMDType a2( A.load(i,k+1UL) );
6315  xmm1 -= a1 * set( B(k ,j ) );
6316  xmm2 -= a1 * set( B(k ,j+1UL) );
6317  xmm3 -= a1 * set( B(k ,j+2UL) );
6318  xmm4 -= a2 * set( B(k+1UL,j ) );
6319  xmm5 -= a2 * set( B(k+1UL,j+1UL) );
6320  xmm6 -= a2 * set( B(k+1UL,j+2UL) );
6321  }
6322 
6323  for( ; k<kend; ++k ) {
6324  const SIMDType a1( A.load(i,k) );
6325  xmm1 -= a1 * set( B(k,j ) );
6326  xmm2 -= a1 * set( B(k,j+1UL) );
6327  xmm3 -= a1 * set( B(k,j+2UL) );
6328  }
6329 
6330  (~C).store( i, j , xmm1+xmm4 );
6331  (~C).store( i, j+1UL, xmm2+xmm5 );
6332  (~C).store( i, j+2UL, xmm3+xmm6 );
6333  }
6334 
6335  for( ; (j+2UL) <= jend; j+=2UL )
6336  {
6337  const size_t kbegin( ( IsLower<MT5>::value )
6338  ?( ( IsUpper<MT4>::value )
6339  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6340  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6341  :( IsUpper<MT4>::value ? i : 0UL ) );
6342  const size_t kend( ( IsUpper<MT5>::value )
6343  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6344  :( K ) );
6345 
6346  SIMDType xmm1( (~C).load(i,j ) );
6347  SIMDType xmm2( (~C).load(i,j+1UL) );
6348  SIMDType xmm3, xmm4;
6349  size_t k( kbegin );
6350 
6351  for( ; (k+2UL) <= kend; k+=2UL ) {
6352  const SIMDType a1( A.load(i,k ) );
6353  const SIMDType a2( A.load(i,k+1UL) );
6354  xmm1 -= a1 * set( B(k ,j ) );
6355  xmm2 -= a1 * set( B(k ,j+1UL) );
6356  xmm3 -= a2 * set( B(k+1UL,j ) );
6357  xmm4 -= a2 * set( B(k+1UL,j+1UL) );
6358  }
6359 
6360  for( ; k<kend; ++k ) {
6361  const SIMDType a1( A.load(i,k) );
6362  xmm1 -= a1 * set( B(k,j ) );
6363  xmm2 -= a1 * set( B(k,j+1UL) );
6364  }
6365 
6366  (~C).store( i, j , xmm1+xmm3 );
6367  (~C).store( i, j+1UL, xmm2+xmm4 );
6368  }
6369 
6370  if( j < jend )
6371  {
6372  const size_t kbegin( ( IsLower<MT5>::value )
6373  ?( ( IsUpper<MT4>::value )
6374  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6375  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6376  :( IsUpper<MT4>::value ? i : 0UL ) );
6377 
6378  SIMDType xmm1( (~C).load(i,j) );
6379  SIMDType xmm2;
6380  size_t k( kbegin );
6381 
6382  for( ; (k+2UL) <= K; k+=2UL ) {
6383  xmm1 -= A.load(i,k ) * set( B(k ,j) );
6384  xmm2 -= A.load(i,k+1UL) * set( B(k+1UL,j) );
6385  }
6386 
6387  for( ; k<K; ++k ) {
6388  xmm1 -= A.load(i,k) * set( B(k,j) );
6389  }
6390 
6391  (~C).store( i, j, xmm1+xmm2 );
6392  }
6393  }
6394 
6395  for( ; remainder && i<M; ++i )
6396  {
6397  const size_t jend( LOW ? i+1UL : N );
6398  size_t j( UPP ? i : 0UL );
6399 
6400  for( ; (j+2UL) <= jend; j+=2UL )
6401  {
6402  const size_t kbegin( ( IsLower<MT5>::value )
6403  ?( ( IsUpper<MT4>::value )
6404  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6405  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6406  :( IsUpper<MT4>::value ? i : 0UL ) );
6407  const size_t kend( ( IsUpper<MT5>::value )
6408  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6409  :( K ) );
6410 
6411  ElementType value1( (~C)(i,j ) );
6412  ElementType value2( (~C)(i,j+1UL) );
6413 
6414  for( size_t k=kbegin; k<kend; ++k ) {
6415  value1 -= A(i,k) * B(k,j );
6416  value2 -= A(i,k) * B(k,j+1UL);
6417  }
6418 
6419  (~C)(i,j ) = value1;
6420  (~C)(i,j+1UL) = value2;
6421  }
6422 
6423  if( j < jend )
6424  {
6425  const size_t kbegin( ( IsLower<MT5>::value )
6426  ?( ( IsUpper<MT4>::value )
6427  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6428  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6429  :( IsUpper<MT4>::value ? i : 0UL ) );
6430 
6431  ElementType value( (~C)(i,j) );
6432 
6433  for( size_t k=kbegin; k<K; ++k ) {
6434  value -= A(i,k) * B(k,j);
6435  }
6436 
6437  (~C)(i,j) = value;
6438  }
6439  }
6440  }
6442  //**********************************************************************************************
6443 
6444  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Fallback large-matrix kernel for the subtraction assignment: it does no work
// of its own and forwards C, A and B unchanged to selectDefaultSubAssignKernel().
// NOTE(review): listing line 6461 (return type / overload-selection condition)
// is absent from this extract -- confirm against the original header.
6458  template< typename MT3 // Type of the left-hand side target matrix
6459  , typename MT4 // Type of the left-hand side matrix operand
6460  , typename MT5 > // Type of the right-hand side matrix operand
6462  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6463  {
6464  selectDefaultSubAssignKernel( C, A, B );
6465  }
6467  //**********************************************************************************************
6468 
6469  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Vectorized large-matrix kernel for the subtraction assignment: picks one of
// lmmm(), ummm() or mmm() based on the class flags LOW and UPP and calls it with
// the scalars ElementType(-1) and ElementType(1).
// NOTE(review): presumably the two scalars are the factors applied to A*B and to
// the existing C (i.e. C = C - A*B) -- confirm against the lmmm/ummm/mmm headers.
// NOTE(review): listing line 6487 (return type / overload-selection condition)
// is absent from this extract.
6484  template< typename MT3 // Type of the left-hand side target matrix
6485  , typename MT4 // Type of the left-hand side matrix operand
6486  , typename MT5 > // Type of the right-hand side matrix operand
6488  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6489  {
      // LOW is tested first, so lmmm() is also the choice when LOW and UPP are both set.
6490  if( LOW )
6491  lmmm( C, A, B, ElementType(-1), ElementType(1) );
6492  else if( UPP )
6493  ummm( C, A, B, ElementType(-1), ElementType(1) );
6494  else
6495  mmm( C, A, B, ElementType(-1), ElementType(1) );
6496  }
6498  //**********************************************************************************************
6499 
6500  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback BLAS kernel for the subtraction assignment: it forwards C, A and B
// unchanged to selectLargeSubAssignKernel().
// NOTE(review): listing line 6517 (return type / overload-selection condition)
// is absent from this extract -- confirm against the original header.
6514  template< typename MT3 // Type of the left-hand side target matrix
6515  , typename MT4 // Type of the left-hand side matrix operand
6516  , typename MT5 > // Type of the right-hand side matrix operand
6518  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6519  {
6520  selectLargeSubAssignKernel( C, A, B );
6521  }
6523  //**********************************************************************************************
6524 
6525  //**BLAS-based subtraction assignment to dense matrices*****************************************
6526 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6527 
// BLAS-backed kernel for the subtraction assignment. It sits inside the
// surrounding #if and is therefore only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled.
// Three paths: A triangular -> trmm from the left on a copy of B;
//              B triangular -> trmm from the right on a copy of A;
//              otherwise    -> a single gemm call directly on C.
// NOTE(review): listing line 6543 (return type / overload-selection condition)
// is absent from this extract.
6540  template< typename MT3 // Type of the left-hand side target matrix
6541  , typename MT4 // Type of the left-hand side matrix operand
6542  , typename MT5 > // Type of the right-hand side matrix operand
6544  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6545  {
6546  using ET = ElementType_<MT3>;
6547 
6548  if( IsTriangular<MT4>::value ) {
      // Copy B into a temporary, let trmm() combine it with A (left side), then
      // subtract the temporary from C.
      // NOTE(review): presumably trmm() overwrites tmp with A*tmp -- confirm.
6549  ResultType_<MT3> tmp( serial( B ) );
6550  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
6551  subAssign( C, tmp );
6552  }
6553  else if( IsTriangular<MT5>::value ) {
      // Mirror case: copy A, apply B via trmm() from the right, subtract from C.
6554  ResultType_<MT3> tmp( serial( A ) );
6555  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
6556  subAssign( C, tmp );
6557  }
6558  else {
      // General case: gemm() is handed C itself together with the scalars ET(-1)
      // and ET(1); no temporary is created here.
6559  gemm( C, A, B, ET(-1), ET(1) );
6560  }
6561  }
6563 #endif
6564  //**********************************************************************************************
6565 
6566  //**Subtraction assignment to sparse matrices***************************************************
6567  // No special implementation for the subtraction assignment to sparse matrices.
6568  //**********************************************************************************************
6569 
6570  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of a transpose dense matrix / dense matrix
// multiplication expression to a dense matrix.
//   lhs: target dense matrix; must already have the dimensions of rhs.
//   rhs: the multiplication expression.
// The expression is first evaluated into a temporary of type ResultType, and the
// temporary is then passed to schurAssign() together with the target.
// NOTE(review): listing lines 6587 and 6589-6591 are absent from this extract;
// they may hold trace/constraint macros -- confirm against the original header.
6583  template< typename MT // Type of the target dense matrix
6584  , bool SO > // Storage order of the target dense matrix
6585  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6586  {
6588 
6592 
6593  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6594  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6595 
      // rhs is wrapped in serial() before being evaluated into the temporary.
6596  const ResultType tmp( serial( rhs ) );
6597  schurAssign( ~lhs, tmp );
6598  }
6600  //**********************************************************************************************
6601 
6602  //**Schur product assignment to sparse matrices*************************************************
6603  // No special implementation for the Schur product assignment to sparse matrices.
6604  //**********************************************************************************************
6605 
6606  //**Multiplication assignment to dense matrices*************************************************
6607  // No special implementation for the multiplication assignment to dense matrices.
6608  //**********************************************************************************************
6609 
6610  //**Multiplication assignment to sparse matrices************************************************
6611  // No special implementation for the multiplication assignment to sparse matrices.
6612  //**********************************************************************************************
6613 
6614  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the multiplication expression to a dense matrix.
//   lhs: target dense matrix; must already have the dimensions of rhs.
//   rhs: the multiplication expression.
// Both operands are evaluated into local objects A and B, and the product A * B
// is handed to smpAssign() for the target.
// NOTE(review): listing line 6632 (return type / overload-selection condition)
// and line 6635 are absent from this extract. LT and RT are declared outside the
// visible range.
6630  template< typename MT // Type of the target dense matrix
6631  , bool SO > // Storage order of the target dense matrix
6633  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6634  {
6636 
6637  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6638  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6639 
      // An empty target needs no work; a zero inner dimension (no columns in the
      // left operand) resets the target instead of multiplying.
6640  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
6641  return;
6642  }
6643  else if( rhs.lhs_.columns() == 0UL ) {
6644  reset( ~lhs );
6645  return;
6646  }
6647 
6648  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6649  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6650 
6651  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6652  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6653  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6654  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6655  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6656  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6657 
6658  smpAssign( ~lhs, A * B );
6659  }
6661  //**********************************************************************************************
6662 
6663  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the multiplication expression to a sparse matrix.
//   lhs: target sparse matrix; must already have the dimensions of rhs.
//   rhs: the multiplication expression.
// The expression is evaluated into a temporary of type TmpType; the temporary is
// passed through a ForwardFunctor object and the result is handed to smpAssign().
// NOTE(review): listing line 6681 (return type / overload-selection condition)
// and lines 6684, 6686, 6688-6693 are absent from this extract; TmpType is
// presumably defined on the missing lines -- confirm against the original header.
6679  template< typename MT // Type of the target sparse matrix
6680  , bool SO > // Storage order of the target sparse matrix
6682  smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6683  {
6685 
6687 
6694 
6695  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6696  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6697 
6698  const ForwardFunctor fwd;
6699 
6700  const TmpType tmp( rhs );
6701  smpAssign( ~lhs, fwd( tmp ) );
6702  }
6704  //**********************************************************************************************
6705 
6706  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the multiplication expression to a dense matrix.
// Both operands are evaluated into local objects A and B, and the product A * B
// is handed to smpAddAssign() for the target.
// NOTE(review): listing lines 6724-6725 (return type and function signature) and
// line 6727 are absent from this extract; the body uses the names lhs and rhs,
// which are presumably the parameters declared there -- confirm against the
// original header. LT and RT are declared outside the visible range.
6722  template< typename MT // Type of the target dense matrix
6723  , bool SO > // Storage order of the target dense matrix
6726  {
6728 
6729  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6730  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6731 
      // Nothing to do for an empty target or a zero inner dimension; the target
      // is left untouched in both cases.
6732  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6733  return;
6734  }
6735 
6736  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6737  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6738 
6739  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6740  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6741  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6742  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6743  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6744  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6745 
6746  smpAddAssign( ~lhs, A * B );
6747  }
6749  //**********************************************************************************************
6750 
6751  //**SMP addition assignment to sparse matrices**************************************************
6752  // No special implementation for the SMP addition assignment to sparse matrices.
6753  //**********************************************************************************************
6754 
6755  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the multiplication expression to a dense matrix.
// Both operands are evaluated into local objects A and B, and the product A * B
// is handed to smpSubAssign() for the target.
// NOTE(review): listing lines 6773-6774 (return type and function signature) and
// line 6776 are absent from this extract; the body uses the names lhs and rhs,
// which are presumably the parameters declared there -- confirm against the
// original header. LT and RT are declared outside the visible range.
6771  template< typename MT // Type of the target dense matrix
6772  , bool SO > // Storage order of the target dense matrix
6775  {
6777 
6778  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6779  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6780 
      // Nothing to do for an empty target or a zero inner dimension; the target
      // is left untouched in both cases.
6781  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6782  return;
6783  }
6784 
6785  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6786  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6787 
6788  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6789  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6790  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6791  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6792  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6793  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6794 
6795  smpSubAssign( ~lhs, A * B );
6796  }
6798  //**********************************************************************************************
6799 
6800  //**SMP subtraction assignment to sparse matrices***********************************************
6801  // No special implementation for the SMP subtraction assignment to sparse matrices.
6802  //**********************************************************************************************
6803 
6804  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the multiplication expression to a dense matrix.
//   lhs: target dense matrix; must already have the dimensions of rhs.
//   rhs: the multiplication expression.
// The expression is first evaluated into a temporary of type ResultType, and the
// temporary is then passed to smpSchurAssign() together with the target.
// NOTE(review): listing lines 6821 and 6823-6825 are absent from this extract;
// they may hold trace/constraint macros -- confirm against the original header.
6817  template< typename MT // Type of the target dense matrix
6818  , bool SO > // Storage order of the target dense matrix
6819  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6820  {
6822 
6826 
6827  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6828  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6829 
      // Here rhs is passed to the temporary directly, without a serial() wrapper.
6830  const ResultType tmp( rhs );
6831  smpSchurAssign( ~lhs, tmp );
6832  }
6834  //**********************************************************************************************
6835 
6836  //**SMP Schur product assignment to sparse matrices*********************************************
6837  // No special implementation for the SMP Schur product assignment to sparse matrices.
6838  //**********************************************************************************************
6839 
6840  //**SMP multiplication assignment to dense matrices*********************************************
6841  // No special implementation for the SMP multiplication assignment to dense matrices.
6842  //**********************************************************************************************
6843 
6844  //**SMP multiplication assignment to sparse matrices********************************************
6845  // No special implementation for the SMP multiplication assignment to sparse matrices.
6846  //**********************************************************************************************
6847 
6848  //**Compile time checks*************************************************************************
6856  //**********************************************************************************************
6857 };
6858 //*************************************************************************************************
6859 
6860 
6861 
6862 
6863 //=================================================================================================
6864 //
6865 // DMATSCALARMULTEXPR SPECIALIZATION
6866 //
6867 //=================================================================================================
6868 
6869 //*************************************************************************************************
6877 template< typename MT1 // Type of the left-hand side dense matrix
6878  , typename MT2 // Type of the right-hand side dense matrix
6879  , bool SF // Symmetry flag
6880  , bool HF // Hermitian flag
6881  , bool LF // Lower flag
6882  , bool UF // Upper flag
6883  , typename ST > // Type of the right-hand side scalar value
6884 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
6885  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
6886  , private Computation
6887 {
6888  private:
6889  //**Type definitions****************************************************************************
6892 
6893  using RES = ResultType_<MMM>;
6894  using RT1 = ResultType_<MT1>;
6895  using RT2 = ResultType_<MT2>;
6896  using ET1 = ElementType_<RT1>;
6897  using ET2 = ElementType_<RT2>;
6898  using CT1 = CompositeType_<MT1>;
6899  using CT2 = CompositeType_<MT2>;
6900  //**********************************************************************************************
6901 
6902  //**********************************************************************************************
6904  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
6905  //**********************************************************************************************
6906 
6907  //**********************************************************************************************
6909  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
6910  //**********************************************************************************************
6911 
6912  //**********************************************************************************************
6914  enum : bool {
6915  SYM = ( SF && !( HF || LF || UF ) ),
6916  HERM = ( HF && !( LF || UF ) ),
6917  LOW = ( LF || ( ( SF || HF ) && UF ) ),
6918  UPP = ( UF || ( ( SF || HF ) && LF ) )
6919  };
6920  //**********************************************************************************************
6921 
6922  //**********************************************************************************************
6924 
6927  template< typename T1, typename T2, typename T3 >
6928  struct IsEvaluationRequired {
6929  enum : bool { value = ( evaluateLeft || evaluateRight ) };
6930  };
6931  //**********************************************************************************************
6932 
6933  //**********************************************************************************************
6935 
6937  template< typename T1, typename T2, typename T3, typename T4 >
6938  struct UseBlasKernel {
6939  enum : bool { value = BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
6940  !SYM && !HERM && !LOW && !UPP &&
6945  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
6950  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
6952  };
6953  //**********************************************************************************************
6954 
6955  //**********************************************************************************************
6957 
6959  template< typename T1, typename T2, typename T3, typename T4 >
6960  struct UseVectorizedDefaultKernel {
6961  enum : bool { value = useOptimizedKernels &&
6965  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
6969  , T4 >::value &&
6972  };
6973  //**********************************************************************************************
6974 
6975  //**********************************************************************************************
6977 
6979  using ForwardFunctor = IfTrue_< HERM
6980  , DeclHerm
6981  , IfTrue_< SYM
6982  , DeclSym
6983  , IfTrue_< LOW
6984  , IfTrue_< UPP
6985  , DeclDiag
6986  , DeclLow >
6987  , IfTrue_< UPP
6988  , DeclUpp
6989  , Noop > > > >;
6990  //**********************************************************************************************
6991 
6992  public:
6993  //**Type definitions****************************************************************************
6995  using ResultType = MultTrait_<RES,ST>;
7000  using ReturnType = const ElementType;
7001  using CompositeType = const ResultType;
7002 
7005 
7007  using RightOperand = ST;
7008 
7011 
7014  //**********************************************************************************************
7015 
7016  //**Compilation flags***************************************************************************
7018  enum : bool { simdEnabled = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
7019  MT1::simdEnabled && MT2::simdEnabled &&
7023 
7025  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
7026  !evaluateRight && MT2::smpAssignable };
7027  //**********************************************************************************************
7028 
7029  //**SIMD properties*****************************************************************************
7031  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
7032  //**********************************************************************************************
7033 
7034  //**Constructor*********************************************************************************
7040  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
7041  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
7042  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
7043  {}
7044  //**********************************************************************************************
7045 
7046  //**Access operator*****************************************************************************
7053  inline ReturnType operator()( size_t i, size_t j ) const {
7054  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
7055  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
7056  return matrix_(i,j) * scalar_;
7057  }
7058  //**********************************************************************************************
7059 
7060  //**At function*********************************************************************************
// Checked element access.
//   i: row index; BLAZE_THROW_OUT_OF_RANGE is invoked if i >= matrix_.rows()
//   j: column index; BLAZE_THROW_OUT_OF_RANGE is invoked if j >= matrix_.columns()
// Returns the same value as operator()(i,j) once both indices passed the range checks.
7068  inline ReturnType at( size_t i, size_t j ) const {
7069  if( i >= matrix_.rows() ) {
7070  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
7071  }
7072  if( j >= matrix_.columns() ) {
7073  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
7074  }
7075  return (*this)(i,j);
7076  }
7077  //**********************************************************************************************
7078 
7079  //**Rows function*******************************************************************************
// Returns the number of rows of the expression, i.e. the row count of the wrapped matrix_.
7084  inline size_t rows() const {
7085  return matrix_.rows();
7086  }
7087  //**********************************************************************************************
7088 
7089  //**Columns function****************************************************************************
// Returns the number of columns of the expression, i.e. the column count of the wrapped matrix_.
7094  inline size_t columns() const {
7095  return matrix_.columns();
7096  }
7097  //**********************************************************************************************
7098 
7099  //**Left operand access*************************************************************************
// Returns the left-hand side operand of the scaling, i.e. the stored matrix multiplication
// expression matrix_.
7104  inline LeftOperand leftOperand() const {
7105  return matrix_;
7106  }
7107  //**********************************************************************************************
7108 
7109  //**Right operand access************************************************************************
// Returns the right-hand side operand of the scaling, i.e. the stored scalar factor scalar_.
7114  inline RightOperand rightOperand() const {
7115  return scalar_;
7116  }
7117  //**********************************************************************************************
7118 
7119  //**********************************************************************************************
// Reports whether the expression can alias with the given address.
//   alias: address to be checked
// The query is forwarded unchanged to matrix_.canAlias(); the scalar operand is not consulted.
7125  template< typename T >
7126  inline bool canAlias( const T* alias ) const {
7127  return matrix_.canAlias( alias );
7128  }
7129  //**********************************************************************************************
7130 
7131  //**********************************************************************************************
// Reports whether the expression is aliased with the given address.
//   alias: address to be checked
// The query is forwarded unchanged to matrix_.isAliased(); the scalar operand is not consulted.
7137  template< typename T >
7138  inline bool isAliased( const T* alias ) const {
7139  return matrix_.isAliased( alias );
7140  }
7141  //**********************************************************************************************
7142 
7143  //**********************************************************************************************
// Reports whether the operands are properly aligned in memory; forwards to matrix_.isAligned().
7148  inline bool isAligned() const {
7149  return matrix_.isAligned();
7150  }
7151  //**********************************************************************************************
7152 
7153  //**********************************************************************************************
// Reports whether the expression can be used in an SMP (shared-memory parallel) assignment.
// The result is true only if the element count rows()*columns() is at least
// SMP_TDMATDMATMULT_THRESHOLD AND at least one of the BLAS-related alternatives holds:
// BLAS mode is off, BLAS matrix/matrix multiplication is not used, or the element count is
// below TDMATDMATMULT_THRESHOLD.
// NOTE(review): listing line 7161 is absent from this extract; upstream presumably lists one
// more alternative between the second and the third one -- confirm against the original header.
7158  inline bool canSMPAssign() const noexcept {
7159  return ( !BLAZE_BLAS_MODE ||
7160  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
7162  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
7163  ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD );
7164  }
7165  //**********************************************************************************************
7166 
7167  private:
7168  //**Member variables****************************************************************************
// Operands of the scaling expression, set once by the constructor:
// matrix_ is the matrix multiplication expression, scalar_ the scalar factor applied to it.
7169  LeftOperand matrix_;
7170  RightOperand scalar_;
7171  //**********************************************************************************************
7172 
7173  //**Assignment to dense matrices****************************************************************
// Assignment of the scaled matrix product to a dense matrix (C = s*A*B).
//   lhs: target dense matrix; must already have the dimensions of rhs (asserted)
//   rhs: the scaled multiplication expression to be assigned
// Steps: fetch both operands of the inner multiplication, return early for an empty target,
// reset the target if the inner dimension is zero, otherwise evaluate both operands serially
// and hand over to selectAssignKernel() together with the scalar.
// NOTE(review): listing line 7189 is absent from this extract.
7185  template< typename MT // Type of the target dense matrix
7186  , bool SO > // Storage order of the target dense matrix
7187  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7188  {
7190 
7191  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7192  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7193 
7194  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7195  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7196 
// Nothing to do for an empty target; with a zero inner dimension the product has no terms,
// so the target is only reset.
7197  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
7198  return;
7199  }
7200  else if( left.columns() == 0UL ) {
7201  reset( ~lhs );
7202  return;
7203  }
7204 
// LT / RT are declared outside this extract (presumably the evaluated operand types).
7205  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7206  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7207 
7208  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7209  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7210  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7211  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7212  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7213  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7214 
7215  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
7216  }
7217  //**********************************************************************************************
7218 
7219  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = scalar*A*B.
//   C: target matrix, A / B: evaluated operands, scalar: scaling factor
// Dispatches to selectSmallAssignKernel() if any of the visible conditions holds (outside debug
// mode: a row-major target with B.columns() <= SIMDSIZE*10 or a column-major target with
// A.rows() <= SIMDSIZE*10; or a target with fewer than TDMATDMATMULT_THRESHOLD elements) and to
// selectBlasAssignKernel() otherwise.
// NOTE(review): listing line 7236, which must open the if-condition (and may carry a further
// alternative), is absent from this extract -- restore it from the upstream header.
7230  template< typename MT3 // Type of the left-hand side target matrix
7231  , typename MT4 // Type of the left-hand side matrix operand
7232  , typename MT5 // Type of the right-hand side matrix operand
7233  , typename ST2 > // Type of the scalar value
7234  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7235  {
7237  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
7238  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
7239  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7240  selectSmallAssignKernel( C, A, B, scalar );
7241  else
7242  selectBlasAssignKernel( C, A, B, scalar );
7243  }
7244  //**********************************************************************************************
7245 
7246  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (non-vectorized) assignment C = scalar*A*B for a row-major target, general operands.
//   C: row-major target matrix, A / B: evaluated operands, scalar: scaling factor
// Works row by row: the k-range is narrowed by the triangular structure of A, the first k
// initializes the row by plain assignment, the remaining k accumulate, and finally the computed
// part of the row is multiplied by scalar. SYM/HERM/LOW/UPP (declared outside this extract)
// restrict the computed column range; for SYM/HERM the lower part is mirrored at the end.
// NOTE(review): listing lines 7264 (return type / enable-if constraint), 7292, 7297, 7326, 7331,
// 7349 and 7352 (conditions of the nested ?: expressions) are absent from this extract.
7260  template< typename MT3 // Type of the left-hand side target matrix
7261  , typename MT4 // Type of the left-hand side matrix operand
7262  , typename MT5 // Type of the right-hand side matrix operand
7263  , typename ST2 > // Type of the scalar value
7265  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7266  {
7267  const size_t M( A.rows() );
7268  const size_t N( B.columns() );
7269  const size_t K( A.columns() );
7270 
7271  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7272 
7273  for( size_t i=0UL; i<M; ++i )
7274  {
// Range of k for which A(i,k) lies inside the triangular structure of A.
7275  const size_t kbegin( ( IsUpper<MT4>::value )
7276  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
7277  :( 0UL ) );
7278  const size_t kend( ( IsLower<MT4>::value )
7279  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
7280  :( K ) );
7281  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
7282 
// Empty k-range for a strictly triangular A: the whole row of C is reset.
7283  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
7284  for( size_t j=0UL; j<N; ++j ) {
7285  reset( (~C)(i,j) );
7286  }
7287  continue;
7288  }
7289 
// First pass (k == kbegin): assigns instead of accumulating and resets the elements of the
// row that lie outside [jbegin,jend).
7290  {
7291  const size_t jbegin( ( IsUpper<MT5>::value )
7293  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
7294  :( UPP ? max(i,kbegin) : kbegin ) )
7295  :( UPP ? i : 0UL ) );
7296  const size_t jend( ( IsLower<MT5>::value )
7298  ?( LOW ? min(i+1UL,kbegin) : kbegin )
7299  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
7300  :( LOW ? i+1UL : N ) );
7301 
7302  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
7303  for( size_t j=0UL; j<jbegin; ++j ) {
7304  reset( (~C)(i,j) );
7305  }
7306  }
7307  else if( IsStrictlyUpper<MT5>::value ) {
7308  reset( (~C)(i,0UL) );
7309  }
7310  for( size_t j=jbegin; j<jend; ++j ) {
7311  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
7312  }
7313  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
7314  for( size_t j=jend; j<N; ++j ) {
7315  reset( (~C)(i,j) );
7316  }
7317  }
7318  else if( IsStrictlyLower<MT5>::value ) {
7319  reset( (~C)(i,N-1UL) );
7320  }
7321  }
7322 
// Remaining k: accumulate A(i,k)*B(k,j) into the row.
7323  for( size_t k=kbegin+1UL; k<kend; ++k )
7324  {
7325  const size_t jbegin( ( IsUpper<MT5>::value )
7327  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
7328  :( SYM || HERM || UPP ? max( i, k ) : k ) )
7329  :( SYM || HERM || UPP ? i : 0UL ) );
7330  const size_t jend( ( IsLower<MT5>::value )
7332  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
7333  :( LOW ? min(i+1UL,k) : k ) )
7334  :( LOW ? i+1UL : N ) );
7335 
7336  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
7337  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7338 
7339  for( size_t j=jbegin; j<jend; ++j ) {
7340  (~C)(i,j) += A(i,k) * B(k,j);
7341  }
// For a lower B the element in column jend is written by plain assignment (presumably
// because this k is its first contribution -- confirm against the upstream header).
7342  if( IsLower<MT5>::value ) {
7343  (~C)(i,jend) = A(i,k) * B(k,jend);
7344  }
7345  }
7346 
// Final pass: scale the computed part of the row by the scalar.
7347  {
7348  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
7350  :( SYM || HERM || UPP ? i : 0UL ) );
7351  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
7353  :( LOW ? i+1UL : N ) );
7354 
7355  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
7356  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7357 
7358  for( size_t j=jbegin; j<jend; ++j ) {
7359  (~C)(i,j) *= scalar;
7360  }
7361  }
7362  }
7363 
// Symmetric/Hermitian result: fill the part below the diagonal from the computed upper part,
// conjugated in the Hermitian case.
7364  if( SYM || HERM ) {
7365  for( size_t i=1UL; i<M; ++i ) {
7366  for( size_t j=0UL; j<i; ++j ) {
7367  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
7368  }
7369  }
7370  }
7371  }
7372  //**********************************************************************************************
7373 
7374  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (non-vectorized) assignment C = scalar*A*B for a column-major target; enabled only if
// neither A nor B is diagonal.
//   C: column-major target matrix, A / B: evaluated operands, scalar: scaling factor
// Works column by column: the k-range is narrowed by the triangular structure of B, the first k
// initializes the column by plain assignment, the remaining k accumulate, and finally the
// computed part of the column is multiplied by scalar. SYM/HERM/LOW/UPP (declared outside this
// extract) restrict the computed row range; for SYM/HERM the upper part is mirrored at the end.
// NOTE(review): listing lines 7420, 7425, 7454, 7459, 7477 and 7480 (conditions of the nested
// ?: expressions) are absent from this extract.
7388  template< typename MT3 // Type of the left-hand side target matrix
7389  , typename MT4 // Type of the left-hand side matrix operand
7390  , typename MT5 // Type of the right-hand side matrix operand
7391  , typename ST2 > // Type of the scalar value
7392  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
7393  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7394  {
7395  const size_t M( A.rows() );
7396  const size_t N( B.columns() );
7397  const size_t K( A.columns() );
7398 
7399  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7400 
7401  for( size_t j=0UL; j<N; ++j )
7402  {
// Range of k for which B(k,j) lies inside the triangular structure of B.
7403  const size_t kbegin( ( IsLower<MT5>::value )
7404  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
7405  :( 0UL ) );
7406  const size_t kend( ( IsUpper<MT5>::value )
7407  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
7408  :( K ) );
7409  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
7410 
// Empty k-range for a strictly triangular B: the whole column of C is reset.
7411  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
7412  for( size_t i=0UL; i<M; ++i ) {
7413  reset( (~C)(i,j) );
7414  }
7415  continue;
7416  }
7417 
// First pass (k == kbegin): assigns instead of accumulating and resets the elements of the
// column that lie outside [ibegin,iend).
7418  {
7419  const size_t ibegin( ( IsLower<MT4>::value )
7421  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
7422  :( LOW ? max(j,kbegin) : kbegin ) )
7423  :( LOW ? j : 0UL ) );
7424  const size_t iend( ( IsUpper<MT4>::value )
7426  ?( UPP ? min(j+1UL,kbegin) : kbegin )
7427  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
7428  :( UPP ? j+1UL : M ) );
7429 
7430  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
7431  for( size_t i=0UL; i<ibegin; ++i ) {
7432  reset( (~C)(i,j) );
7433  }
7434  }
7435  else if( IsStrictlyLower<MT4>::value ) {
7436  reset( (~C)(0UL,j) );
7437  }
7438  for( size_t i=ibegin; i<iend; ++i ) {
7439  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
7440  }
7441  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
7442  for( size_t i=iend; i<M; ++i ) {
7443  reset( (~C)(i,j) );
7444  }
7445  }
7446  else if( IsStrictlyUpper<MT4>::value ) {
7447  reset( (~C)(M-1UL,j) );
7448  }
7449  }
7450 
// Remaining k: accumulate A(i,k)*B(k,j) into the column.
7451  for( size_t k=kbegin+1UL; k<kend; ++k )
7452  {
7453  const size_t ibegin( ( IsLower<MT4>::value )
7455  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
7456  :( SYM || HERM || LOW ? max( j, k ) : k ) )
7457  :( SYM || HERM || LOW ? j : 0UL ) );
7458  const size_t iend( ( IsUpper<MT4>::value )
7460  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
7461  :( UPP ? min(j+1UL,k) : k ) )
7462  :( UPP ? j+1UL : M ) );
7463 
7464  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
7465  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7466 
7467  for( size_t i=ibegin; i<iend; ++i ) {
7468  (~C)(i,j) += A(i,k) * B(k,j);
7469  }
// For an upper A the element in row iend is written by plain assignment (presumably
// because this k is its first contribution -- confirm against the upstream header).
7470  if( IsUpper<MT4>::value ) {
7471  (~C)(iend,j) = A(iend,k) * B(k,j);
7472  }
7473  }
7474 
// Final pass: scale the computed part of the column by the scalar.
7475  {
7476  const size_t ibegin( ( ( IsLower<MT4>::value && IsLower<MT5>::value ) )
7478  :( SYM || HERM || LOW ? j : 0UL ) );
7479  const size_t iend( ( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) )
7481  :( UPP ? j+1UL : M ) );
7482 
7483  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
7484  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7485 
7486  for( size_t i=ibegin; i<iend; ++i ) {
7487  (~C)(i,j) *= scalar;
7488  }
7489  }
7490  }
7491 
// Symmetric/Hermitian result: fill the part above the diagonal from the computed lower part,
// conjugated in the Hermitian case.
7492  if( SYM || HERM ) {
7493  for( size_t j=1UL; j<N; ++j ) {
7494  for( size_t i=0UL; i<j; ++i ) {
7495  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
7496  }
7497  }
7498  }
7499  }
7500  //**********************************************************************************************
7501 
7502  //**Default assignment to row-major dense matrices (general/diagonal)***************************
// Default assignment C = scalar*A*B for a row-major target where B is diagonal and A is not.
//   C: row-major target matrix, A: general operand, B: diagonal operand, scalar: scaling factor
// With a diagonal B every element reduces to C(i,j) = A(i,j) * B(j,j) * scalar. The traversal is
// tiled into BLOCK_SIZE x BLOCK_SIZE blocks; within each block row the elements outside the
// triangular structure of A are reset.
7516  template< typename MT3 // Type of the left-hand side target matrix
7517  , typename MT4 // Type of the left-hand side matrix operand
7518  , typename MT5 // Type of the right-hand side matrix operand
7519  , typename ST2 > // Type of the scalar value
7520  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7521  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7522  {
7523  constexpr size_t block( BLOCK_SIZE );
7524 
7525  const size_t M( A.rows() );
7526  const size_t N( B.columns() );
7527 
7528  for( size_t ii=0UL; ii<M; ii+=block ) {
7529  const size_t iend( min( M, ii+block ) );
7530  for( size_t jj=0UL; jj<N; jj+=block ) {
7531  const size_t jend( min( N, jj+block ) );
7532  for( size_t i=ii; i<iend; ++i )
7533  {
// [jbegin,jpos) is the part of the current block row inside the structure of A, clamped
// to the block boundaries jj and jend.
7534  const size_t jbegin( ( IsUpper<MT4>::value )
7535  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
7536  :( jj ) );
7537  const size_t jpos( ( IsLower<MT4>::value )
7538  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
7539  :( jend ) );
7540 
7541  if( IsUpper<MT4>::value ) {
7542  for( size_t j=jj; j<jbegin; ++j ) {
7543  reset( (~C)(i,j) );
7544  }
7545  }
7546  for( size_t j=jbegin; j<jpos; ++j ) {
7547  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
7548  }
7549  if( IsLower<MT4>::value ) {
7550  for( size_t j=jpos; j<jend; ++j ) {
7551  reset( (~C)(i,j) );
7552  }
7553  }
7554  }
7555  }
7556  }
7557  }
7558  //**********************************************************************************************
7559 
7560  //**Default assignment to column-major dense matrices (general/diagonal)************************
// Default assignment C = scalar*A*B for a column-major target where B is diagonal and A is not.
//   C: column-major target matrix, A: general operand, B: diagonal operand, scalar: scaling factor
// With a diagonal B every element reduces to C(i,j) = A(i,j) * B(j,j) * scalar. Each column is
// traversed once; the elements outside the triangular structure of A are reset.
7574  template< typename MT3 // Type of the left-hand side target matrix
7575  , typename MT4 // Type of the left-hand side matrix operand
7576  , typename MT5 // Type of the right-hand side matrix operand
7577  , typename ST2 > // Type of the scalar value
7578  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7579  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7580  {
7581  const size_t M( A.rows() );
7582  const size_t N( B.columns() );
7583 
7584  for( size_t j=0UL; j<N; ++j )
7585  {
// [ibegin,iend) is the part of column j that lies inside the structure of A.
7586  const size_t ibegin( ( IsLower<MT4>::value )
7587  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
7588  :( 0UL ) );
7589  const size_t iend( ( IsUpper<MT4>::value )
7590  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
7591  :( M ) );
7592  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7593 
7594  if( IsLower<MT4>::value ) {
7595  for( size_t i=0UL; i<ibegin; ++i ) {
7596  reset( (~C)(i,j) );
7597  }
7598  }
7599  for( size_t i=ibegin; i<iend; ++i ) {
7600  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
7601  }
7602  if( IsUpper<MT4>::value ) {
7603  for( size_t i=iend; i<M; ++i ) {
7604  reset( (~C)(i,j) );
7605  }
7606  }
7607  }
7608  }
7609  //**********************************************************************************************
7610 
7611  //**Default assignment to row-major dense matrices (diagonal/general)***************************
// Default assignment C = scalar*A*B for a row-major target where A is diagonal (per the section
// heading "diagonal/general").
//   C: row-major target matrix, A: diagonal operand, B: general operand, scalar: scaling factor
// With a diagonal A every element reduces to C(i,j) = A(i,i) * B(i,j) * scalar. Each row is
// traversed once; the elements outside the triangular structure of B are reset.
// NOTE(review): listing line 7629 (return type / enable-if constraint) is absent from this
// extract -- restore it from the upstream header.
7625  template< typename MT3 // Type of the left-hand side target matrix
7626  , typename MT4 // Type of the left-hand side matrix operand
7627  , typename MT5 // Type of the right-hand side matrix operand
7628  , typename ST2 > // Type of the scalar value
7630  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7631  {
7632  const size_t M( A.rows() );
7633  const size_t N( B.columns() );
7634 
7635  for( size_t i=0UL; i<M; ++i )
7636  {
// [jbegin,jend) is the part of row i that lies inside the structure of B.
7637  const size_t jbegin( ( IsUpper<MT5>::value )
7638  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
7639  :( 0UL ) );
7640  const size_t jend( ( IsLower<MT5>::value )
7641  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
7642  :( N ) );
7643  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7644 
7645  if( IsUpper<MT5>::value ) {
7646  for( size_t j=0UL; j<jbegin; ++j ) {
7647  reset( (~C)(i,j) );
7648  }
7649  }
7650  for( size_t j=jbegin; j<jend; ++j ) {
7651  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
7652  }
7653  if( IsLower<MT5>::value ) {
7654  for( size_t j=jend; j<N; ++j ) {
7655  reset( (~C)(i,j) );
7656  }
7657  }
7658  }
7659  }
7660  //**********************************************************************************************
7661 
7662  //**Default assignment to column-major dense matrices (diagonal/general)************************
// Default assignment C = scalar*A*B for a column-major target where A is diagonal and B is not.
//   C: column-major target matrix, A: diagonal operand, B: general operand, scalar: scaling factor
// With a diagonal A every element reduces to C(i,j) = A(i,i) * B(i,j) * scalar. The traversal is
// tiled into BLOCK_SIZE x BLOCK_SIZE blocks; within each block column the elements outside the
// triangular structure of B are reset.
7676  template< typename MT3 // Type of the left-hand side target matrix
7677  , typename MT4 // Type of the left-hand side matrix operand
7678  , typename MT5 // Type of the right-hand side matrix operand
7679  , typename ST2 > // Type of the scalar value
7680  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
7681  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7682  {
7683  constexpr size_t block( BLOCK_SIZE );
7684 
7685  const size_t M( A.rows() );
7686  const size_t N( B.columns() );
7687 
7688  for( size_t jj=0UL; jj<N; jj+=block ) {
7689  const size_t jend( min( N, jj+block ) );
7690  for( size_t ii=0UL; ii<M; ii+=block ) {
7691  const size_t iend( min( M, ii+block ) );
7692  for( size_t j=jj; j<jend; ++j )
7693  {
// [ibegin,ipos) is the part of the current block column inside the structure of B,
// clamped to the block boundaries ii and iend.
7694  const size_t ibegin( ( IsLower<MT5>::value )
7695  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
7696  :( ii ) );
7697  const size_t ipos( ( IsUpper<MT5>::value )
7698  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
7699  :( iend ) );
7700 
7701  if( IsLower<MT5>::value ) {
7702  for( size_t i=ii; i<ibegin; ++i ) {
7703  reset( (~C)(i,j) );
7704  }
7705  }
7706  for( size_t i=ibegin; i<ipos; ++i ) {
7707  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
7708  }
7709  if( IsUpper<MT5>::value ) {
7710  for( size_t i=ipos; i<iend; ++i ) {
7711  reset( (~C)(i,j) );
7712  }
7713  }
7714  }
7715  }
7716  }
7717  }
7718  //**********************************************************************************************
7719 
7720  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default assignment C = scalar*A*B where both A and B are diagonal (any storage order of C).
//   C: target matrix, A / B: diagonal operands, scalar: scaling factor
// The target is reset as a whole first; afterwards only the diagonal is written with
// C(i,i) = A(i,i) * B(i,i) * scalar for i in [0, A.rows()).
7734  template< typename MT3 // Type of the left-hand side target matrix
7735  , typename MT4 // Type of the left-hand side matrix operand
7736  , typename MT5 // Type of the right-hand side matrix operand
7737  , typename ST2 > // Type of the scalar value
7738  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
7739  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7740  {
7741  reset( C );
7742 
7743  for( size_t i=0UL; i<A.rows(); ++i ) {
7744  C(i,i) = A(i,i) * B(i,i) * scalar;
7745  }
7746  }
7747  //**********************************************************************************************
7748 
7749  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel, default variant: simply forwards to
// selectDefaultAssignKernel( C, A, B, scalar ).
//   C: target matrix, A / B: evaluated operands, scalar: scaling factor
// NOTE(review): listing line 7767 (return type / enable-if constraint that separates this
// overload from the vectorized small kernels) is absent from this extract.
7763  template< typename MT3 // Type of the left-hand side target matrix
7764  , typename MT4 // Type of the left-hand side matrix operand
7765  , typename MT5 // Type of the right-hand side matrix operand
7766  , typename ST2 > // Type of the scalar value
7768  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7769  {
7770  selectDefaultAssignKernel( C, A, B, scalar );
7771  }
7772  //**********************************************************************************************
7773 
7774  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
7789  template< typename MT3 // Type of the left-hand side target matrix
7790  , typename MT4 // Type of the left-hand side matrix operand
7791  , typename MT5 // Type of the right-hand side matrix operand
7792  , typename ST2 > // Type of the scalar value
7794  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7795  {
7796  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
7797 
7798  const size_t M( A.rows() );
7799  const size_t N( B.columns() );
7800  const size_t K( A.columns() );
7801 
7802  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7803 
7804  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
7805  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
7806 
7807  const SIMDType factor( set( scalar ) );
7808 
7809  if( LOW && UPP && N > SIMDSIZE*3UL ) {
7810  reset( ~C );
7811  }
7812 
7813  {
7814  size_t j( 0UL );
7815 
7817  {
7818  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7819  for( size_t i=0UL; i<M; ++i )
7820  {
7821  const size_t kbegin( ( IsUpper<MT4>::value )
7822  ?( ( IsLower<MT5>::value )
7823  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7824  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7825  :( IsLower<MT5>::value ? j : 0UL ) );
7826  const size_t kend( ( IsLower<MT4>::value )
7827  ?( ( IsUpper<MT5>::value )
7828  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
7829  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
7830  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
7831 
7832  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7833 
7834  for( size_t k=kbegin; k<kend; ++k ) {
7835  const SIMDType a1( set( A(i,k) ) );
7836  xmm1 += a1 * B.load(k,j );
7837  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7838  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7839  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7840  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7841  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7842  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7843  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
7844  }
7845 
7846  (~C).store( i, j , xmm1 * factor );
7847  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
7848  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
7849  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
7850  (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
7851  (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
7852  (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
7853  (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
7854  }
7855  }
7856  }
7857 
7858  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7859  {
7860  size_t i( 0UL );
7861 
7862  for( ; (i+2UL) <= M; i+=2UL )
7863  {
7864  const size_t kbegin( ( IsUpper<MT4>::value )
7865  ?( ( IsLower<MT5>::value )
7866  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7867  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7868  :( IsLower<MT5>::value ? j : 0UL ) );
7869  const size_t kend( ( IsLower<MT4>::value )
7870  ?( ( IsUpper<MT5>::value )
7871  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
7872  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
7873  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
7874 
7875  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7876 
7877  for( size_t k=kbegin; k<kend; ++k ) {
7878  const SIMDType a1( set( A(i ,k) ) );
7879  const SIMDType a2( set( A(i+1UL,k) ) );
7880  const SIMDType b1( B.load(k,j ) );
7881  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7882  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7883  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7884  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7885  xmm1 += a1 * b1;
7886  xmm2 += a1 * b2;
7887  xmm3 += a1 * b3;
7888  xmm4 += a1 * b4;
7889  xmm5 += a1 * b5;
7890  xmm6 += a2 * b1;
7891  xmm7 += a2 * b2;
7892  xmm8 += a2 * b3;
7893  xmm9 += a2 * b4;
7894  xmm10 += a2 * b5;
7895  }
7896 
7897  (~C).store( i , j , xmm1 * factor );
7898  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
7899  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
7900  (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
7901  (~C).store( i , j+SIMDSIZE*4UL, xmm5 * factor );
7902  (~C).store( i+1UL, j , xmm6 * factor );
7903  (~C).store( i+1UL, j+SIMDSIZE , xmm7 * factor );
7904  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
7905  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
7906  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
7907  }
7908 
7909  if( i < M )
7910  {
7911  const size_t kbegin( ( IsUpper<MT4>::value )
7912  ?( ( IsLower<MT5>::value )
7913  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7914  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7915  :( IsLower<MT5>::value ? j : 0UL ) );
7916  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
7917 
7918  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7919 
7920  for( size_t k=kbegin; k<kend; ++k ) {
7921  const SIMDType a1( set( A(i,k) ) );
7922  xmm1 += a1 * B.load(k,j );
7923  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7924  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7925  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7926  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7927  }
7928 
7929  (~C).store( i, j , xmm1 * factor );
7930  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
7931  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
7932  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
7933  (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
7934  }
7935  }
7936 
7937  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7938  {
7939  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
7940  size_t i( LOW ? j : 0UL );
7941 
7942  for( ; (i+2UL) <= iend; i+=2UL )
7943  {
7944  const size_t kbegin( ( IsUpper<MT4>::value )
7945  ?( ( IsLower<MT5>::value )
7946  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7947  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7948  :( IsLower<MT5>::value ? j : 0UL ) );
7949  const size_t kend( ( IsLower<MT4>::value )
7950  ?( ( IsUpper<MT5>::value )
7951  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
7952  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
7953  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
7954 
7955  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7956 
7957  for( size_t k=kbegin; k<kend; ++k ) {
7958  const SIMDType a1( set( A(i ,k) ) );
7959  const SIMDType a2( set( A(i+1UL,k) ) );
7960  const SIMDType b1( B.load(k,j ) );
7961  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7962  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7963  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7964  xmm1 += a1 * b1;
7965  xmm2 += a1 * b2;
7966  xmm3 += a1 * b3;
7967  xmm4 += a1 * b4;
7968  xmm5 += a2 * b1;
7969  xmm6 += a2 * b2;
7970  xmm7 += a2 * b3;
7971  xmm8 += a2 * b4;
7972  }
7973 
7974  (~C).store( i , j , xmm1 * factor );
7975  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
7976  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
7977  (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
7978  (~C).store( i+1UL, j , xmm5 * factor );
7979  (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
7980  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
7981  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
7982  }
7983 
7984  if( i < iend )
7985  {
7986  const size_t kbegin( ( IsUpper<MT4>::value )
7987  ?( ( IsLower<MT5>::value )
7988  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7989  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7990  :( IsLower<MT5>::value ? j : 0UL ) );
7991  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
7992 
7993  SIMDType xmm1, xmm2, xmm3, xmm4;
7994 
7995  for( size_t k=kbegin; k<kend; ++k ) {
7996  const SIMDType a1( set( A(i,k) ) );
7997  xmm1 += a1 * B.load(k,j );
7998  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7999  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8000  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8001  }
8002 
8003  (~C).store( i, j , xmm1 * factor );
8004  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
8005  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8006  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
8007  }
8008  }
8009 
8010  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8011  {
8012  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
8013  size_t i( LOW ? j : 0UL );
8014 
8015  for( ; (i+2UL) <= iend; i+=2UL )
8016  {
8017  const size_t kbegin( ( IsUpper<MT4>::value )
8018  ?( ( IsLower<MT5>::value )
8019  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8020  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8021  :( IsLower<MT5>::value ? j : 0UL ) );
8022  const size_t kend( ( IsLower<MT4>::value )
8023  ?( ( IsUpper<MT5>::value )
8024  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
8025  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8026  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
8027 
8028  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8029 
8030  for( size_t k=kbegin; k<kend; ++k ) {
8031  const SIMDType a1( set( A(i ,k) ) );
8032  const SIMDType a2( set( A(i+1UL,k) ) );
8033  const SIMDType b1( B.load(k,j ) );
8034  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8035  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8036  xmm1 += a1 * b1;
8037  xmm2 += a1 * b2;
8038  xmm3 += a1 * b3;
8039  xmm4 += a2 * b1;
8040  xmm5 += a2 * b2;
8041  xmm6 += a2 * b3;
8042  }
8043 
8044  (~C).store( i , j , xmm1 * factor );
8045  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
8046  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8047  (~C).store( i+1UL, j , xmm4 * factor );
8048  (~C).store( i+1UL, j+SIMDSIZE , xmm5 * factor );
8049  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
8050  }
8051 
8052  if( i < iend )
8053  {
8054  const size_t kbegin( ( IsUpper<MT4>::value )
8055  ?( ( IsLower<MT5>::value )
8056  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8057  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8058  :( IsLower<MT5>::value ? j : 0UL ) );
8059  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
8060 
8061  SIMDType xmm1, xmm2, xmm3;
8062 
8063  for( size_t k=kbegin; k<kend; ++k ) {
8064  const SIMDType a1( set( A(i,k) ) );
8065  xmm1 += a1 * B.load(k,j );
8066  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8067  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8068  }
8069 
8070  (~C).store( i, j , xmm1 * factor );
8071  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
8072  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8073  }
8074  }
8075 
8076  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8077  {
8078  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
8079  size_t i( LOW ? j : 0UL );
8080 
8081  for( ; (i+4UL) <= iend; i+=4UL )
8082  {
8083  const size_t kbegin( ( IsUpper<MT4>::value )
8084  ?( ( IsLower<MT5>::value )
8085  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8086  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8087  :( IsLower<MT5>::value ? j : 0UL ) );
8088  const size_t kend( ( IsLower<MT4>::value )
8089  ?( ( IsUpper<MT5>::value )
8090  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
8091  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
8092  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
8093 
8094  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8095 
8096  for( size_t k=kbegin; k<kend; ++k ) {
8097  const SIMDType a1( set( A(i ,k) ) );
8098  const SIMDType a2( set( A(i+1UL,k) ) );
8099  const SIMDType a3( set( A(i+2UL,k) ) );
8100  const SIMDType a4( set( A(i+3UL,k) ) );
8101  const SIMDType b1( B.load(k,j ) );
8102  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8103  xmm1 += a1 * b1;
8104  xmm2 += a1 * b2;
8105  xmm3 += a2 * b1;
8106  xmm4 += a2 * b2;
8107  xmm5 += a3 * b1;
8108  xmm6 += a3 * b2;
8109  xmm7 += a4 * b1;
8110  xmm8 += a4 * b2;
8111  }
8112 
8113  (~C).store( i , j , xmm1 * factor );
8114  (~C).store( i , j+SIMDSIZE, xmm2 * factor );
8115  (~C).store( i+1UL, j , xmm3 * factor );
8116  (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8117  (~C).store( i+2UL, j , xmm5 * factor );
8118  (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8119  (~C).store( i+3UL, j , xmm7 * factor );
8120  (~C).store( i+3UL, j+SIMDSIZE, xmm8 * factor );
8121  }
8122 
8123  for( ; (i+3UL) <= iend; i+=3UL )
8124  {
8125  const size_t kbegin( ( IsUpper<MT4>::value )
8126  ?( ( IsLower<MT5>::value )
8127  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8128  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8129  :( IsLower<MT5>::value ? j : 0UL ) );
8130  const size_t kend( ( IsLower<MT4>::value )
8131  ?( ( IsUpper<MT5>::value )
8132  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
8133  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
8134  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
8135 
8136  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8137 
8138  for( size_t k=kbegin; k<kend; ++k ) {
8139  const SIMDType a1( set( A(i ,k) ) );
8140  const SIMDType a2( set( A(i+1UL,k) ) );
8141  const SIMDType a3( set( A(i+2UL,k) ) );
8142  const SIMDType b1( B.load(k,j ) );
8143  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8144  xmm1 += a1 * b1;
8145  xmm2 += a1 * b2;
8146  xmm3 += a2 * b1;
8147  xmm4 += a2 * b2;
8148  xmm5 += a3 * b1;
8149  xmm6 += a3 * b2;
8150  }
8151 
8152  (~C).store( i , j , xmm1 * factor );
8153  (~C).store( i , j+SIMDSIZE, xmm2 * factor );
8154  (~C).store( i+1UL, j , xmm3 * factor );
8155  (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8156  (~C).store( i+2UL, j , xmm5 * factor );
8157  (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8158  }
8159 
8160  for( ; (i+2UL) <= iend; i+=2UL )
8161  {
8162  const size_t kbegin( ( IsUpper<MT4>::value )
8163  ?( ( IsLower<MT5>::value )
8164  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8165  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8166  :( IsLower<MT5>::value ? j : 0UL ) );
8167  const size_t kend( ( IsLower<MT4>::value )
8168  ?( ( IsUpper<MT5>::value )
8169  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
8170  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8171  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
8172 
8173  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8174  size_t k( kbegin );
8175 
8176  for( ; (k+2UL) <= kend; k+=2UL ) {
8177  const SIMDType a1( set( A(i ,k ) ) );
8178  const SIMDType a2( set( A(i+1UL,k ) ) );
8179  const SIMDType a3( set( A(i ,k+1UL) ) );
8180  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
8181  const SIMDType b1( B.load(k ,j ) );
8182  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
8183  const SIMDType b3( B.load(k+1UL,j ) );
8184  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
8185  xmm1 += a1 * b1;
8186  xmm2 += a1 * b2;
8187  xmm3 += a2 * b1;
8188  xmm4 += a2 * b2;
8189  xmm5 += a3 * b3;
8190  xmm6 += a3 * b4;
8191  xmm7 += a4 * b3;
8192  xmm8 += a4 * b4;
8193  }
8194 
8195  for( ; k<kend; ++k ) {
8196  const SIMDType a1( set( A(i ,k) ) );
8197  const SIMDType a2( set( A(i+1UL,k) ) );
8198  const SIMDType b1( B.load(k,j ) );
8199  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8200  xmm1 += a1 * b1;
8201  xmm2 += a1 * b2;
8202  xmm3 += a2 * b1;
8203  xmm4 += a2 * b2;
8204  }
8205 
8206  (~C).store( i , j , (xmm1+xmm5) * factor );
8207  (~C).store( i , j+SIMDSIZE, (xmm2+xmm6) * factor );
8208  (~C).store( i+1UL, j , (xmm3+xmm7) * factor );
8209  (~C).store( i+1UL, j+SIMDSIZE, (xmm4+xmm8) * factor );
8210  }
8211 
8212  if( i < iend )
8213  {
8214  const size_t kbegin( ( IsUpper<MT4>::value )
8215  ?( ( IsLower<MT5>::value )
8216  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8217  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8218  :( IsLower<MT5>::value ? j : 0UL ) );
8219  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8220 
8221  SIMDType xmm1, xmm2, xmm3, xmm4;
8222  size_t k( kbegin );
8223 
8224  for( ; (k+2UL) <= kend; k+=2UL ) {
8225  const SIMDType a1( set( A(i,k ) ) );
8226  const SIMDType a2( set( A(i,k+1UL) ) );
8227  xmm1 += a1 * B.load(k ,j );
8228  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
8229  xmm3 += a2 * B.load(k+1UL,j );
8230  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
8231  }
8232 
8233  for( ; k<kend; ++k ) {
8234  const SIMDType a1( set( A(i,k) ) );
8235  xmm1 += a1 * B.load(k,j );
8236  xmm2 += a1 * B.load(k,j+SIMDSIZE);
8237  }
8238 
8239  (~C).store( i, j , (xmm1+xmm3) * factor );
8240  (~C).store( i, j+SIMDSIZE, (xmm2+xmm4) * factor );
8241  }
8242  }
8243 
8244  for( ; j<jpos; j+=SIMDSIZE )
8245  {
8246  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
8247  size_t i( LOW ? j : 0UL );
8248 
8249  for( ; (i+4UL) <= iend; i+=4UL )
8250  {
8251  const size_t kbegin( ( IsUpper<MT4>::value )
8252  ?( ( IsLower<MT5>::value )
8253  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8254  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8255  :( IsLower<MT5>::value ? j : 0UL ) );
8256  const size_t kend( ( IsLower<MT4>::value )
8257  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
8258  :( K ) );
8259 
8260  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8261  size_t k( kbegin );
8262 
8263  for( ; (k+2UL) <= kend; k+=2UL ) {
8264  const SIMDType b1( B.load(k ,j) );
8265  const SIMDType b2( B.load(k+1UL,j) );
8266  xmm1 += set( A(i ,k ) ) * b1;
8267  xmm2 += set( A(i+1UL,k ) ) * b1;
8268  xmm3 += set( A(i+2UL,k ) ) * b1;
8269  xmm4 += set( A(i+3UL,k ) ) * b1;
8270  xmm5 += set( A(i ,k+1UL) ) * b2;
8271  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
8272  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
8273  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
8274  }
8275 
8276  for( ; k<kend; ++k ) {
8277  const SIMDType b1( B.load(k,j) );
8278  xmm1 += set( A(i ,k) ) * b1;
8279  xmm2 += set( A(i+1UL,k) ) * b1;
8280  xmm3 += set( A(i+2UL,k) ) * b1;
8281  xmm4 += set( A(i+3UL,k) ) * b1;
8282  }
8283 
8284  (~C).store( i , j, (xmm1+xmm5) * factor );
8285  (~C).store( i+1UL, j, (xmm2+xmm6) * factor );
8286  (~C).store( i+2UL, j, (xmm3+xmm7) * factor );
8287  (~C).store( i+3UL, j, (xmm4+xmm8) * factor );
8288  }
8289 
8290  for( ; (i+3UL) <= iend; i+=3UL )
8291  {
8292  const size_t kbegin( ( IsUpper<MT4>::value )
8293  ?( ( IsLower<MT5>::value )
8294  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8295  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8296  :( IsLower<MT5>::value ? j : 0UL ) );
8297  const size_t kend( ( IsLower<MT4>::value )
8298  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
8299  :( K ) );
8300 
8301  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8302  size_t k( kbegin );
8303 
8304  for( ; (k+2UL) <= kend; k+=2UL ) {
8305  const SIMDType b1( B.load(k ,j) );
8306  const SIMDType b2( B.load(k+1UL,j) );
8307  xmm1 += set( A(i ,k ) ) * b1;
8308  xmm2 += set( A(i+1UL,k ) ) * b1;
8309  xmm3 += set( A(i+2UL,k ) ) * b1;
8310  xmm4 += set( A(i ,k+1UL) ) * b2;
8311  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
8312  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
8313  }
8314 
8315  for( ; k<kend; ++k ) {
8316  const SIMDType b1( B.load(k,j) );
8317  xmm1 += set( A(i ,k) ) * b1;
8318  xmm2 += set( A(i+1UL,k) ) * b1;
8319  xmm3 += set( A(i+2UL,k) ) * b1;
8320  }
8321 
8322  (~C).store( i , j, (xmm1+xmm4) * factor );
8323  (~C).store( i+1UL, j, (xmm2+xmm5) * factor );
8324  (~C).store( i+2UL, j, (xmm3+xmm6) * factor );
8325  }
8326 
8327  for( ; (i+2UL) <= iend; i+=2UL )
8328  {
8329  const size_t kbegin( ( IsUpper<MT4>::value )
8330  ?( ( IsLower<MT5>::value )
8331  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8332  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8333  :( IsLower<MT5>::value ? j : 0UL ) );
8334  const size_t kend( ( IsLower<MT4>::value )
8335  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8336  :( K ) );
8337 
8338  SIMDType xmm1, xmm2, xmm3, xmm4;
8339  size_t k( kbegin );
8340 
8341  for( ; (k+2UL) <= kend; k+=2UL ) {
8342  const SIMDType b1( B.load(k ,j) );
8343  const SIMDType b2( B.load(k+1UL,j) );
8344  xmm1 += set( A(i ,k ) ) * b1;
8345  xmm2 += set( A(i+1UL,k ) ) * b1;
8346  xmm3 += set( A(i ,k+1UL) ) * b2;
8347  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
8348  }
8349 
8350  for( ; k<kend; ++k ) {
8351  const SIMDType b1( B.load(k,j) );
8352  xmm1 += set( A(i ,k) ) * b1;
8353  xmm2 += set( A(i+1UL,k) ) * b1;
8354  }
8355 
8356  (~C).store( i , j, (xmm1+xmm3) * factor );
8357  (~C).store( i+1UL, j, (xmm2+xmm4) * factor );
8358  }
8359 
8360  if( i < iend )
8361  {
8362  const size_t kbegin( ( IsUpper<MT4>::value )
8363  ?( ( IsLower<MT5>::value )
8364  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8365  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8366  :( IsLower<MT5>::value ? j : 0UL ) );
8367 
8368  SIMDType xmm1, xmm2;
8369  size_t k( kbegin );
8370 
8371  for( ; (k+2UL) <= K; k+=2UL ) {
8372  xmm1 += set( A(i,k ) ) * B.load(k ,j);
8373  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
8374  }
8375 
8376  for( ; k<K; ++k ) {
8377  xmm1 += set( A(i,k) ) * B.load(k,j);
8378  }
8379 
8380  (~C).store( i, j, (xmm1+xmm2) * factor );
8381  }
8382  }
8383 
8384  for( ; remainder && j<N; ++j )
8385  {
8386  size_t i( LOW && UPP ? j : 0UL );
8387 
8388  for( ; (i+2UL) <= M; i+=2UL )
8389  {
8390  const size_t kbegin( ( IsUpper<MT4>::value )
8391  ?( ( IsLower<MT5>::value )
8392  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8393  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8394  :( IsLower<MT5>::value ? j : 0UL ) );
8395  const size_t kend( ( IsLower<MT4>::value )
8396  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8397  :( K ) );
8398 
8399  ElementType value1 = ElementType();
8400  ElementType value2 = ElementType();
8401 
8402  for( size_t k=kbegin; k<kend; ++k ) {
8403  value1 += A(i ,k) * B(k,j);
8404  value2 += A(i+1UL,k) * B(k,j);
8405  }
8406 
8407  (~C)(i ,j) = value1 * scalar;
8408  (~C)(i+1UL,j) = value2 * scalar;
8409  }
8410 
8411  if( i < M )
8412  {
8413  const size_t kbegin( ( IsUpper<MT4>::value )
8414  ?( ( IsLower<MT5>::value )
8415  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8416  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8417  :( IsLower<MT5>::value ? j : 0UL ) );
8418 
8419  ElementType value = ElementType();
8420 
8421  for( size_t k=kbegin; k<K; ++k ) {
8422  value += A(i,k) * B(k,j);
8423  }
8424 
8425  (~C)(i,j) = value * scalar;
8426  }
8427  }
8428  }
8429 
8430  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
8431  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
8432  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
8433  for( size_t j=0UL; j<jend; ++j ) {
8434  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
8435  }
8436  }
8437  }
8438  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
8439  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
8440  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
8441  for( size_t i=0UL; i<iend; ++i ) {
8442  reset( (~C)(i,j) );
8443  }
8444  }
8445  }
8446  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
8447  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
8448  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
8449  for( size_t j=0UL; j<jend; ++j ) {
8450  reset( (~C)(i,j) );
8451  }
8452  }
8453  }
8454  }
8455  //**********************************************************************************************
8456 
8457  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
8472  template< typename MT3 // Type of the left-hand side target matrix
8473  , typename MT4 // Type of the left-hand side matrix operand
8474  , typename MT5 // Type of the right-hand side matrix operand
8475  , typename ST2 > // Type of the scalar value
8477  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8478  {
8479  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
8480 
8481  const size_t M( A.rows() );
8482  const size_t N( B.columns() );
8483  const size_t K( A.columns() );
8484 
8485  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
8486 
8487  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
8488  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
8489 
8490  const SIMDType factor( set( scalar ) );
8491 
8492  if( LOW && UPP && M > SIMDSIZE*3UL ) {
8493  reset( ~C );
8494  }
8495 
8496  {
8497  size_t i( 0UL );
8498 
8500  {
8501  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8502  for( size_t j=0UL; j<N; ++j )
8503  {
8504  const size_t kbegin( ( IsLower<MT5>::value )
8505  ?( ( IsUpper<MT4>::value )
8506  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8507  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8508  :( IsUpper<MT4>::value ? i : 0UL ) );
8509  const size_t kend( ( IsUpper<MT5>::value )
8510  ?( ( IsLower<MT4>::value )
8511  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
8512  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
8513  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
8514 
8515  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8516 
8517  for( size_t k=kbegin; k<kend; ++k ) {
8518  const SIMDType b1( set( B(k,j) ) );
8519  xmm1 += A.load(i ,k) * b1;
8520  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8521  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8522  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8523  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8524  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
8525  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
8526  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
8527  }
8528 
8529  (~C).store( i , j, xmm1 * factor );
8530  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8531  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8532  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8533  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
8534  (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
8535  (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
8536  (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
8537  }
8538  }
8539  }
8540 
8541  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
8542  {
8543  size_t j( 0UL );
8544 
8545  for( ; (j+2UL) <= N; j+=2UL )
8546  {
8547  const size_t kbegin( ( IsLower<MT5>::value )
8548  ?( ( IsUpper<MT4>::value )
8549  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8550  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8551  :( IsUpper<MT4>::value ? i : 0UL ) );
8552  const size_t kend( ( IsUpper<MT5>::value )
8553  ?( ( IsLower<MT4>::value )
8554  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8555  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8556  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
8557 
8558  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8559 
8560  for( size_t k=kbegin; k<kend; ++k ) {
8561  const SIMDType a1( A.load(i ,k) );
8562  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8563  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8564  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8565  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
8566  const SIMDType b1( set( B(k,j ) ) );
8567  const SIMDType b2( set( B(k,j+1UL) ) );
8568  xmm1 += a1 * b1;
8569  xmm2 += a2 * b1;
8570  xmm3 += a3 * b1;
8571  xmm4 += a4 * b1;
8572  xmm5 += a5 * b1;
8573  xmm6 += a1 * b2;
8574  xmm7 += a2 * b2;
8575  xmm8 += a3 * b2;
8576  xmm9 += a4 * b2;
8577  xmm10 += a5 * b2;
8578  }
8579 
8580  (~C).store( i , j , xmm1 * factor );
8581  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8582  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8583  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8584  (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
8585  (~C).store( i , j+1UL, xmm6 * factor );
8586  (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
8587  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
8588  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
8589  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
8590  }
8591 
8592  if( j < N )
8593  {
8594  const size_t kbegin( ( IsLower<MT5>::value )
8595  ?( ( IsUpper<MT4>::value )
8596  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8597  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8598  :( IsUpper<MT4>::value ? i : 0UL ) );
8599  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
8600 
8601  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8602 
8603  for( size_t k=kbegin; k<kend; ++k ) {
8604  const SIMDType b1( set( B(k,j) ) );
8605  xmm1 += A.load(i ,k) * b1;
8606  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8607  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8608  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8609  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8610  }
8611 
8612  (~C).store( i , j, xmm1 * factor );
8613  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8614  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8615  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8616  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
8617  }
8618  }
8619 
8620  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
8621  {
8622  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
8623  size_t j( UPP ? i : 0UL );
8624 
8625  for( ; (j+2UL) <= jend; j+=2UL )
8626  {
8627  const size_t kbegin( ( IsLower<MT5>::value )
8628  ?( ( IsUpper<MT4>::value )
8629  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8630  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8631  :( IsUpper<MT4>::value ? i : 0UL ) );
8632  const size_t kend( ( IsUpper<MT5>::value )
8633  ?( ( IsLower<MT4>::value )
8634  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8635  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8636  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
8637 
8638  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8639 
8640  for( size_t k=kbegin; k<kend; ++k ) {
8641  const SIMDType a1( A.load(i ,k) );
8642  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8643  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8644  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8645  const SIMDType b1( set( B(k,j ) ) );
8646  const SIMDType b2( set( B(k,j+1UL) ) );
8647  xmm1 += a1 * b1;
8648  xmm2 += a2 * b1;
8649  xmm3 += a3 * b1;
8650  xmm4 += a4 * b1;
8651  xmm5 += a1 * b2;
8652  xmm6 += a2 * b2;
8653  xmm7 += a3 * b2;
8654  xmm8 += a4 * b2;
8655  }
8656 
8657  (~C).store( i , j , xmm1 * factor );
8658  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8659  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8660  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8661  (~C).store( i , j+1UL, xmm5 * factor );
8662  (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
8663  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
8664  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
8665  }
8666 
8667  if( j < jend )
8668  {
8669  const size_t kbegin( ( IsLower<MT5>::value )
8670  ?( ( IsUpper<MT4>::value )
8671  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8672  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8673  :( IsUpper<MT4>::value ? i : 0UL ) );
8674  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
8675 
8676  SIMDType xmm1, xmm2, xmm3, xmm4;
8677 
8678  for( size_t k=kbegin; k<kend; ++k ) {
8679  const SIMDType b1( set( B(k,j) ) );
8680  xmm1 += A.load(i ,k) * b1;
8681  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8682  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8683  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8684  }
8685 
8686  (~C).store( i , j, xmm1 * factor );
8687  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8688  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8689  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8690  }
8691  }
8692 
8693  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
8694  {
8695  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
8696  size_t j( UPP ? i : 0UL );
8697 
8698  for( ; (j+2UL) <= jend; j+=2UL )
8699  {
8700  const size_t kbegin( ( IsLower<MT5>::value )
8701  ?( ( IsUpper<MT4>::value )
8702  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8703  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8704  :( IsUpper<MT4>::value ? i : 0UL ) );
8705  const size_t kend( ( IsUpper<MT5>::value )
8706  ?( ( IsLower<MT4>::value )
8707  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8708  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8709  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
8710 
8711  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8712 
8713  for( size_t k=kbegin; k<kend; ++k ) {
8714  const SIMDType a1( A.load(i ,k) );
8715  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8716  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8717  const SIMDType b1( set( B(k,j ) ) );
8718  const SIMDType b2( set( B(k,j+1UL) ) );
8719  xmm1 += a1 * b1;
8720  xmm2 += a2 * b1;
8721  xmm3 += a3 * b1;
8722  xmm4 += a1 * b2;
8723  xmm5 += a2 * b2;
8724  xmm6 += a3 * b2;
8725  }
8726 
8727  (~C).store( i , j , xmm1 * factor );
8728  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8729  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8730  (~C).store( i , j+1UL, xmm4 * factor );
8731  (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
8732  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
8733  }
8734 
8735  if( j < jend )
8736  {
8737  const size_t kbegin( ( IsLower<MT5>::value )
8738  ?( ( IsUpper<MT4>::value )
8739  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8740  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8741  :( IsUpper<MT4>::value ? i : 0UL ) );
8742  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
8743 
8744  SIMDType xmm1, xmm2, xmm3;
8745 
8746  for( size_t k=kbegin; k<kend; ++k ) {
8747  const SIMDType b1( set( B(k,j) ) );
8748  xmm1 += A.load(i ,k) * b1;
8749  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8750  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8751  }
8752 
8753  (~C).store( i , j, xmm1 * factor );
8754  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8755  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8756  }
8757  }
8758 
8759  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
8760  {
8761  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
8762  size_t j( UPP ? i : 0UL );
8763 
8764  for( ; (j+4UL) <= jend; j+=4UL )
8765  {
8766  const size_t kbegin( ( IsLower<MT5>::value )
8767  ?( ( IsUpper<MT4>::value )
8768  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8769  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8770  :( IsUpper<MT4>::value ? i : 0UL ) );
8771  const size_t kend( ( IsUpper<MT5>::value )
8772  ?( ( IsLower<MT4>::value )
8773  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
8774  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
8775  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
8776 
8777  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8778 
8779  for( size_t k=kbegin; k<kend; ++k ) {
8780  const SIMDType a1( A.load(i ,k) );
8781  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8782  const SIMDType b1( set( B(k,j ) ) );
8783  const SIMDType b2( set( B(k,j+1UL) ) );
8784  const SIMDType b3( set( B(k,j+2UL) ) );
8785  const SIMDType b4( set( B(k,j+3UL) ) );
8786  xmm1 += a1 * b1;
8787  xmm2 += a2 * b1;
8788  xmm3 += a1 * b2;
8789  xmm4 += a2 * b2;
8790  xmm5 += a1 * b3;
8791  xmm6 += a2 * b3;
8792  xmm7 += a1 * b4;
8793  xmm8 += a2 * b4;
8794  }
8795 
8796  (~C).store( i , j , xmm1 * factor );
8797  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
8798  (~C).store( i , j+1UL, xmm3 * factor );
8799  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
8800  (~C).store( i , j+2UL, xmm5 * factor );
8801  (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
8802  (~C).store( i , j+3UL, xmm7 * factor );
8803  (~C).store( i+SIMDSIZE, j+3UL, xmm8 * factor );
8804  }
8805 
8806  for( ; (j+3UL) <= jend; j+=3UL )
8807  {
8808  const size_t kbegin( ( IsLower<MT5>::value )
8809  ?( ( IsUpper<MT4>::value )
8810  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8811  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8812  :( IsUpper<MT4>::value ? i : 0UL ) );
8813  const size_t kend( ( IsUpper<MT5>::value )
8814  ?( ( IsLower<MT4>::value )
8815  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
8816  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
8817  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
8818 
8819  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8820 
8821  for( size_t k=kbegin; k<kend; ++k ) {
8822  const SIMDType a1( A.load(i ,k) );
8823  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8824  const SIMDType b1( set( B(k,j ) ) );
8825  const SIMDType b2( set( B(k,j+1UL) ) );
8826  const SIMDType b3( set( B(k,j+2UL) ) );
8827  xmm1 += a1 * b1;
8828  xmm2 += a2 * b1;
8829  xmm3 += a1 * b2;
8830  xmm4 += a2 * b2;
8831  xmm5 += a1 * b3;
8832  xmm6 += a2 * b3;
8833  }
8834 
8835  (~C).store( i , j , xmm1 * factor );
8836  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
8837  (~C).store( i , j+1UL, xmm3 * factor );
8838  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
8839  (~C).store( i , j+2UL, xmm5 * factor );
8840  (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
8841  }
8842 
8843  for( ; (j+2UL) <= jend; j+=2UL )
8844  {
8845  const size_t kbegin( ( IsLower<MT5>::value )
8846  ?( ( IsUpper<MT4>::value )
8847  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8848  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8849  :( IsUpper<MT4>::value ? i : 0UL ) );
8850  const size_t kend( ( IsUpper<MT5>::value )
8851  ?( ( IsLower<MT4>::value )
8852  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8853  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8854  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
8855 
8856  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8857  size_t k( kbegin );
8858 
8859  for( ; (k+2UL) <= kend; k+=2UL ) {
8860  const SIMDType a1( A.load(i ,k ) );
8861  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8862  const SIMDType a3( A.load(i ,k+1UL) );
8863  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8864  const SIMDType b1( set( B(k ,j ) ) );
8865  const SIMDType b2( set( B(k ,j+1UL) ) );
8866  const SIMDType b3( set( B(k+1UL,j ) ) );
8867  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
8868  xmm1 += a1 * b1;
8869  xmm2 += a2 * b1;
8870  xmm3 += a1 * b2;
8871  xmm4 += a2 * b2;
8872  xmm5 += a3 * b3;
8873  xmm6 += a4 * b3;
8874  xmm7 += a3 * b4;
8875  xmm8 += a4 * b4;
8876  }
8877 
8878  for( ; k<kend; ++k ) {
8879  const SIMDType a1( A.load(i ,k) );
8880  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8881  const SIMDType b1( set( B(k,j ) ) );
8882  const SIMDType b2( set( B(k,j+1UL) ) );
8883  xmm1 += a1 * b1;
8884  xmm2 += a2 * b1;
8885  xmm3 += a1 * b2;
8886  xmm4 += a2 * b2;
8887  }
8888 
8889  (~C).store( i , j , (xmm1+xmm5) * factor );
8890  (~C).store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
8891  (~C).store( i , j+1UL, (xmm3+xmm7) * factor );
8892  (~C).store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
8893  }
8894 
8895  if( j < jend )
8896  {
8897  const size_t kbegin( ( IsLower<MT5>::value )
8898  ?( ( IsUpper<MT4>::value )
8899  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8900  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8901  :( IsUpper<MT4>::value ? i : 0UL ) );
8902  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
8903 
8904  SIMDType xmm1, xmm2, xmm3, xmm4;
8905  size_t k( kbegin );
8906 
8907  for( ; (k+2UL) <= kend; k+=2UL ) {
8908  const SIMDType b1( set( B(k ,j) ) );
8909  const SIMDType b2( set( B(k+1UL,j) ) );
8910  xmm1 += A.load(i ,k ) * b1;
8911  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8912  xmm3 += A.load(i ,k+1UL) * b2;
8913  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8914  }
8915 
8916  for( ; k<kend; ++k ) {
8917  const SIMDType b1( set( B(k,j) ) );
8918  xmm1 += A.load(i ,k) * b1;
8919  xmm2 += A.load(i+SIMDSIZE,k) * b1;
8920  }
8921 
8922  (~C).store( i , j, (xmm1+xmm3) * factor );
8923  (~C).store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
8924  }
8925  }
8926 
8927  for( ; i<ipos; i+=SIMDSIZE )
8928  {
8929  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
8930  size_t j( UPP ? i : 0UL );
8931 
8932  for( ; (j+4UL) <= jend; j+=4UL )
8933  {
8934  const size_t kbegin( ( IsLower<MT5>::value )
8935  ?( ( IsUpper<MT4>::value )
8936  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8937  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8938  :( IsUpper<MT4>::value ? i : 0UL ) );
8939  const size_t kend( ( IsUpper<MT5>::value )
8940  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
8941  :( K ) );
8942 
8943  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8944  size_t k( kbegin );
8945 
8946  for( ; (k+2UL) <= kend; k+=2UL ) {
8947  const SIMDType a1( A.load(i,k ) );
8948  const SIMDType a2( A.load(i,k+1UL) );
8949  xmm1 += a1 * set( B(k ,j ) );
8950  xmm2 += a1 * set( B(k ,j+1UL) );
8951  xmm3 += a1 * set( B(k ,j+2UL) );
8952  xmm4 += a1 * set( B(k ,j+3UL) );
8953  xmm5 += a2 * set( B(k+1UL,j ) );
8954  xmm6 += a2 * set( B(k+1UL,j+1UL) );
8955  xmm7 += a2 * set( B(k+1UL,j+2UL) );
8956  xmm8 += a2 * set( B(k+1UL,j+3UL) );
8957  }
8958 
8959  for( ; k<kend; ++k ) {
8960  const SIMDType a1( A.load(i,k) );
8961  xmm1 += a1 * set( B(k,j ) );
8962  xmm2 += a1 * set( B(k,j+1UL) );
8963  xmm3 += a1 * set( B(k,j+2UL) );
8964  xmm4 += a1 * set( B(k,j+3UL) );
8965  }
8966 
8967  (~C).store( i, j , (xmm1+xmm5) * factor );
8968  (~C).store( i, j+1UL, (xmm2+xmm6) * factor );
8969  (~C).store( i, j+2UL, (xmm3+xmm7) * factor );
8970  (~C).store( i, j+3UL, (xmm4+xmm8) * factor );
8971  }
8972 
8973  for( ; (j+3UL) <= jend; j+=3UL )
8974  {
8975  const size_t kbegin( ( IsLower<MT5>::value )
8976  ?( ( IsUpper<MT4>::value )
8977  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8978  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8979  :( IsUpper<MT4>::value ? i : 0UL ) );
8980  const size_t kend( ( IsUpper<MT5>::value )
8981  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
8982  :( K ) );
8983 
8984  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8985  size_t k( kbegin );
8986 
8987  for( ; (k+2UL) <= kend; k+=2UL ) {
8988  const SIMDType a1( A.load(i,k ) );
8989  const SIMDType a2( A.load(i,k+1UL) );
8990  xmm1 += a1 * set( B(k ,j ) );
8991  xmm2 += a1 * set( B(k ,j+1UL) );
8992  xmm3 += a1 * set( B(k ,j+2UL) );
8993  xmm4 += a2 * set( B(k+1UL,j ) );
8994  xmm5 += a2 * set( B(k+1UL,j+1UL) );
8995  xmm6 += a2 * set( B(k+1UL,j+2UL) );
8996  }
8997 
8998  for( ; k<kend; ++k ) {
8999  const SIMDType a1( A.load(i,k) );
9000  xmm1 += a1 * set( B(k,j ) );
9001  xmm2 += a1 * set( B(k,j+1UL) );
9002  xmm3 += a1 * set( B(k,j+2UL) );
9003  }
9004 
9005  (~C).store( i, j , (xmm1+xmm4) * factor );
9006  (~C).store( i, j+1UL, (xmm2+xmm5) * factor );
9007  (~C).store( i, j+2UL, (xmm3+xmm6) * factor );
9008  }
9009 
9010  for( ; (j+2UL) <= jend; j+=2UL )
9011  {
9012  const size_t kbegin( ( IsLower<MT5>::value )
9013  ?( ( IsUpper<MT4>::value )
9014  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9015  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9016  :( IsUpper<MT4>::value ? i : 0UL ) );
9017  const size_t kend( ( IsUpper<MT5>::value )
9018  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
9019  :( K ) );
9020 
9021  SIMDType xmm1, xmm2, xmm3, xmm4;
9022  size_t k( kbegin );
9023 
9024  for( ; k<kend; ++k ) {
9025  const SIMDType a1( A.load(i,k) );
9026  xmm1 += a1 * set( B(k,j ) );
9027  xmm2 += a1 * set( B(k,j+1UL) );
9028  }
9029 
9030  for( ; (k+2UL) <= kend; k+=2UL ) {
9031  const SIMDType a1( A.load(i,k ) );
9032  const SIMDType a2( A.load(i,k+1UL) );
9033  xmm1 += a1 * set( B(k ,j ) );
9034  xmm2 += a1 * set( B(k ,j+1UL) );
9035  xmm3 += a2 * set( B(k+1UL,j ) );
9036  xmm4 += a2 * set( B(k+1UL,j+1UL) );
9037  }
9038 
9039  (~C).store( i, j , (xmm1+xmm3) * factor );
9040  (~C).store( i, j+1UL, (xmm2+xmm4) * factor );
9041  }
9042 
9043  if( j < jend )
9044  {
9045  const size_t kbegin( ( IsLower<MT5>::value )
9046  ?( ( IsUpper<MT4>::value )
9047  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9048  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9049  :( IsUpper<MT4>::value ? i : 0UL ) );
9050 
9051  SIMDType xmm1, xmm2;
9052  size_t k( kbegin );
9053 
9054  for( ; (k+2UL) <= K; k+=2UL ) {
9055  xmm1 += A.load(i,k ) * set( B(k ,j) );
9056  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
9057  }
9058 
9059  for( ; k<K; ++k ) {
9060  xmm1 += A.load(i,k) * set( B(k,j) );
9061  }
9062 
9063  (~C).store( i, j, (xmm1+xmm2) * factor );
9064  }
9065  }
9066 
9067  for( ; remainder && i<M; ++i )
9068  {
9069  size_t j( LOW && UPP ? i : 0UL );
9070 
9071  for( ; (j+2UL) <= N; j+=2UL )
9072  {
9073  const size_t kbegin( ( IsLower<MT5>::value )
9074  ?( ( IsUpper<MT4>::value )
9075  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9076  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9077  :( IsUpper<MT4>::value ? i : 0UL ) );
9078  const size_t kend( ( IsUpper<MT5>::value )
9079  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
9080  :( K ) );
9081 
9082  ElementType value1 = ElementType();
9083  ElementType value2 = ElementType();
9084 
9085  for( size_t k=kbegin; k<kend; ++k ) {
9086  value1 += A(i,k) * B(k,j );
9087  value2 += A(i,k) * B(k,j+1UL);
9088  }
9089 
9090  (~C)(i,j ) = value1 * scalar;
9091  (~C)(i,j+1UL) = value2 * scalar;
9092  }
9093 
9094  if( j < N )
9095  {
9096  const size_t kbegin( ( IsLower<MT5>::value )
9097  ?( ( IsUpper<MT4>::value )
9098  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9099  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9100  :( IsUpper<MT4>::value ? i : 0UL ) );
9101 
9102  ElementType value = ElementType();
9103 
9104  for( size_t k=kbegin; k<K; ++k ) {
9105  value += A(i,k) * B(k,j);
9106  }
9107 
9108  (~C)(i,j) = value * scalar;
9109  }
9110  }
9111  }
9112 
9113  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
9114  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
9115  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
9116  for( size_t i=0UL; i<iend; ++i ) {
9117  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
9118  }
9119  }
9120  }
9121  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
9122  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
9123  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
9124  for( size_t i=0UL; i<iend; ++i ) {
9125  reset( (~C)(i,j) );
9126  }
9127  }
9128  }
9129  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
9130  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
9131  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
9132  for( size_t j=0UL; j<jend; ++j ) {
9133  reset( (~C)(i,j) );
9134  }
9135  }
9136  }
9137  }
9138  //**********************************************************************************************
9139 
9140  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel (default variant) for the scaled product.
// It performs no computation of its own: C, A, B and scalar are handed on unchanged to
// selectDefaultAssignKernel().
//
// C      : target matrix of the assignment
// A      : left-hand side multiplication operand
// B      : right-hand side multiplication operand
// scalar : scaling factor
//
// NOTE(review): the declaration line with the return type / overload constraint (listing line
// 9158) is absent from this extract; presumably it restricts this overload to operand types for
// which the vectorized kernel cannot be used -- confirm against the original header.
9154  template< typename MT3 // Type of the left-hand side target matrix
9155  , typename MT4 // Type of the left-hand side matrix operand
9156  , typename MT5 // Type of the right-hand side matrix operand
9157  , typename ST2 > // Type of the scalar value
9159  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9160  {
// Plain delegation to the default kernel.
9161  selectDefaultAssignKernel( C, A, B, scalar );
9162  }
9163  //**********************************************************************************************
9164 
9165  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Large-matrix assignment kernel (vectorized variant) for the scaled product.
// Chooses one of the matrix/matrix multiplication routines smmm/hmmm/lmmm/ummm/mmm according to
// the class flags SYM, HERM, LOW and UPP, tested in exactly that order; the first flag that is
// set decides, and mmm() is used when none is set.
//
// C      : target matrix of the assignment
// A      : left-hand side multiplication operand
// B      : right-hand side multiplication operand
// scalar : scaling factor
//
// NOTE(review): the declaration line with the return type / overload constraint (listing line
// 9184) is absent from this extract -- confirm against the original header.
// NOTE(review): lmmm/ummm/mmm receive an extra ST2(0) argument that smmm/hmmm do not take;
// presumably it is the factor applied to the previous content of C (0 => overwrite) -- verify
// against blaze/math/dense/MMM.h.
9180  template< typename MT3 // Type of the left-hand side target matrix
9181  , typename MT4 // Type of the left-hand side matrix operand
9182  , typename MT5 // Type of the right-hand side matrix operand
9183  , typename ST2 > // Type of the scalar value
9185  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9186  {
// SYM is checked before HERM, and both before the triangular flags LOW/UPP.
9187  if( SYM )
9188  smmm( C, A, B, scalar );
9189  else if( HERM )
9190  hmmm( C, A, B, scalar );
9191  else if( LOW )
9192  lmmm( C, A, B, scalar, ST2(0) );
9193  else if( UPP )
9194  ummm( C, A, B, scalar, ST2(0) );
9195  else
9196  mmm( C, A, B, scalar, ST2(0) );
9197  }
9198  //**********************************************************************************************
9199 
9200  //**BLAS-based assignment to dense matrices (default)*******************************************
// Fallback for the BLAS-based assignment: no BLAS routine is called here, the arguments are
// passed on unchanged to selectLargeAssignKernel().
//
// C      : target matrix of the assignment
// A      : left-hand side multiplication operand
// B      : right-hand side multiplication operand
// scalar : scaling factor
//
// NOTE(review): the declaration line with the return type / overload constraint (listing line
// 9218) is absent from this extract; presumably it selects this overload whenever the BLAS
// kernel is not applicable -- confirm against the original header.
9214  template< typename MT3 // Type of the left-hand side target matrix
9215  , typename MT4 // Type of the left-hand side matrix operand
9216  , typename MT5 // Type of the right-hand side matrix operand
9217  , typename ST2 > // Type of the scalar value
9219  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9220  {
// Plain delegation to the large-matrix kernel.
9221  selectLargeAssignKernel( C, A, B, scalar );
9222  }
9223  //**********************************************************************************************
9224 
9225  //**BLAS-based assignment to dense matrices*****************************************************
// BLAS-backed assignment kernel for the scaled product; only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
//
// Three cases, tested in this order:
//   1. A is triangular: C is first assigned B, then trmm() is called with A on the left side.
//   2. B is triangular: C is first assigned A, then trmm() is called with B on the right side.
//   3. otherwise:       gemm() is called with the factors ET(scalar) and ET(0).
// In every case the scalar is converted to the element type of the target matrix (ET).
//
// C      : target matrix of the assignment
// A      : left-hand side multiplication operand
// B      : right-hand side multiplication operand
// scalar : scaling factor
//
// NOTE(review): the declaration line with the return type / overload constraint (listing line
// 9244) is absent from this extract -- confirm against the original header.
// NOTE(review): presumably trmm() multiplies C in place by the triangular operand and scales
// by the last argument, and the trailing ET(0) passed to gemm() discards the previous content
// of C -- verify against blaze/math/blas/trmm.h and blaze/math/blas/gemm.h.
9226 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
9227 
9240  template< typename MT3 // Type of the left-hand side target matrix
9241  , typename MT4 // Type of the left-hand side matrix operand
9242  , typename MT5 // Type of the right-hand side matrix operand
9243  , typename ST2 > // Type of the scalar value
9245  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9246  {
9247  using ET = ElementType_<MT3>;
9248 
// Case 1: triangular left operand; the lower/upper flag follows IsLower<MT4>.
9249  if( IsTriangular<MT4>::value ) {
9250  assign( C, B );
9251  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9252  }
// Case 2: triangular right operand; the lower/upper flag follows IsLower<MT5>.
9253  else if( IsTriangular<MT5>::value ) {
9254  assign( C, A );
9255  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9256  }
// Case 3: neither operand is triangular.
9257  else {
9258  gemm( C, A, B, ET(scalar), ET(0) );
9259  }
9260  }
9261 #endif
9262  //**********************************************************************************************
9263 
9264  //**Assignment to sparse matrices***************************************************************
// Assignment of the scaled matrix product to a sparse matrix.
// The expression is not written into the sparse target directly: rhs is evaluated serially
// into a temporary of type TmpType, the temporary is passed through the ForwardFunctor and the
// result of that is assigned to the target.
//
// lhs : target sparse matrix; the internal assertions require it to have the same number of
//       rows and columns as rhs
// rhs : scaled multiplication expression to be assigned
//
// NOTE(review): listing lines 9280-9290 (among them the definition of TmpType and any
// compile-time constraints) are absent from this extract, and ForwardFunctor is defined
// elsewhere; what fwd() does to the temporary cannot be determined from here.
9276  template< typename MT // Type of the target sparse matrix
9277  , bool SO > // Storage order of the target sparse matrix
9278  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9279  {
9281 
9283 
9290 
9291  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9292  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9293 
9294  const ForwardFunctor fwd;
9295 
// Evaluate the whole expression serially into a temporary, then assign the forwarded temporary.
9296  const TmpType tmp( serial( rhs ) );
9297  assign( ~lhs, fwd( tmp ) );
9298  }
9299  //**********************************************************************************************
9300 
9301  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the scaled matrix product to a dense matrix.
// Steps:
//   1. fetch the two operands of the wrapped multiplication expression (rhs.matrix_),
//   2. return without touching lhs if lhs has no rows or no columns, or if the left operand has
//      no columns,
//   3. evaluate both operands serially into objects of type LT and RT,
//   4. hand the evaluated operands and rhs.scalar_ to selectAddAssignKernel().
//
// lhs : target dense matrix; the internal assertions require it to have the same number of
//       rows and columns as rhs
// rhs : scaled multiplication expression to be added
//
// NOTE(review): listing line 9317 is absent from this extract.
9313  template< typename MT // Type of the target dense matrix
9314  , bool SO > // Storage order of the target dense matrix
9315  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9316  {
9318 
9319  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9320  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9321 
9322  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
9323  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
9324 
// Nothing to add when the target is empty or the inner dimension of the product is zero.
9325  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
9326  return;
9327  }
9328 
9329  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
9330  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
9331 
// Sanity checks: evaluation must not change the operand sizes, and they must fit the target.
9332  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9333  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9334  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9335  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9336  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
9337  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
9338 
9339  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
9340  }
9341  //**********************************************************************************************
9342 
9343  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment of the scaled product.
// Calls selectSmallAddAssignKernel() when the condition below holds and
// selectBlasAddAssignKernel() otherwise.
//
// C      : target matrix of the addition assignment
// A      : left-hand side multiplication operand
// B      : right-hand side multiplication operand
// scalar : scaling factor
//
// Visible alternatives of the condition (joined by ||):
//   - not in debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//   - not in debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//   - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
//
// NOTE(review): the head of the if-statement (listing line 9360) is absent from this extract;
// it opens the condition and may contribute a further alternative -- confirm against the
// original header before relying on the list above being complete.
9354  template< typename MT3 // Type of the left-hand side target matrix
9355  , typename MT4 // Type of the left-hand side matrix operand
9356  , typename MT5 // Type of the right-hand side matrix operand
9357  , typename ST2 > // Type of the scalar value
9358  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9359  {
9361  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
9362  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
9363  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9364  selectSmallAddAssignKernel( C, A, B, scalar );
9365  else
9366  selectBlasAddAssignKernel( C, A, B, scalar );
9367  }
9368  //**********************************************************************************************
9369 
9370  //**Default addition assignment to dense matrices (general/general)*****************************
9384  template< typename MT3 // Type of the left-hand side target matrix
9385  , typename MT4 // Type of the left-hand side matrix operand
9386  , typename MT5 // Type of the right-hand side matrix operand
9387  , typename ST2 > // Type of the scalar value
9388  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
9389  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9390  {
9391  const ResultType tmp( serial( A * B * scalar ) );
9392  addAssign( C, tmp );
9393  }
9394  //**********************************************************************************************
9395 
9396  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
9410  template< typename MT3 // Type of the left-hand side target matrix
9411  , typename MT4 // Type of the left-hand side matrix operand
9412  , typename MT5 // Type of the right-hand side matrix operand
9413  , typename ST2 > // Type of the scalar value
9414  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
9415  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9416  {
9417  constexpr size_t block( BLOCK_SIZE );
9418 
9419  const size_t M( A.rows() );
9420  const size_t N( B.columns() );
9421 
9422  for( size_t ii=0UL; ii<M; ii+=block ) {
9423  const size_t iend( min( M, ii+block ) );
9424  for( size_t jj=0UL; jj<N; jj+=block ) {
9425  const size_t jend( min( N, jj+block ) );
9426  for( size_t i=ii; i<iend; ++i )
9427  {
9428  const size_t jbegin( ( IsUpper<MT4>::value )
9429  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
9430  :( jj ) );
9431  const size_t jpos( ( IsLower<MT4>::value )
9432  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
9433  :( jend ) );
9434 
9435  for( size_t j=jbegin; j<jpos; ++j ) {
9436  (~C)(i,j) += A(i,j) * B(j,j) * scalar;
9437  }
9438  }
9439  }
9440  }
9441  }
9442  //**********************************************************************************************
9443 
9444  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
9458  template< typename MT3 // Type of the left-hand side target matrix
9459  , typename MT4 // Type of the left-hand side matrix operand
9460  , typename MT5 // Type of the right-hand side matrix operand
9461  , typename ST2 > // Type of the scalar value
9462  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
9463  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9464  {
9465  const size_t M( A.rows() );
9466  const size_t N( B.columns() );
9467 
9468  for( size_t j=0UL; j<N; ++j )
9469  {
9470  const size_t ibegin( ( IsLower<MT4>::value )
9471  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
9472  :( 0UL ) );
9473  const size_t iend( ( IsUpper<MT4>::value )
9474  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
9475  :( M ) );
9476  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
9477 
9478  const size_t inum( iend - ibegin );
9479  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
9480 
9481  for( size_t i=ibegin; i<ipos; i+=2UL ) {
9482  (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
9483  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
9484  }
9485  if( ipos < iend ) {
9486  (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
9487  }
9488  }
9489  }
9490  //**********************************************************************************************
9491 
9492  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
9506  template< typename MT3 // Type of the left-hand side target matrix
9507  , typename MT4 // Type of the left-hand side matrix operand
9508  , typename MT5 // Type of the right-hand side matrix operand
9509  , typename ST2 > // Type of the scalar value
9510  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
9511  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9512  {
9513  const size_t M( A.rows() );
9514  const size_t N( B.columns() );
9515 
9516  for( size_t i=0UL; i<M; ++i )
9517  {
9518  const size_t jbegin( ( IsUpper<MT5>::value )
9519  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
9520  :( 0UL ) );
9521  const size_t jend( ( IsLower<MT5>::value )
9522  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
9523  :( N ) );
9524  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
9525 
9526  const size_t jnum( jend - jbegin );
9527  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
9528 
9529  for( size_t j=jbegin; j<jpos; j+=2UL ) {
9530  (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
9531  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
9532  }
9533  if( jpos < jend ) {
9534  (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
9535  }
9536  }
9537  }
9538  //**********************************************************************************************
9539 
9540  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
9554  template< typename MT3 // Type of the left-hand side target matrix
9555  , typename MT4 // Type of the left-hand side matrix operand
9556  , typename MT5 // Type of the right-hand side matrix operand
9557  , typename ST2 > // Type of the scalar value
9558  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
9559  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9560  {
9561  constexpr size_t block( BLOCK_SIZE );
9562 
9563  const size_t M( A.rows() );
9564  const size_t N( B.columns() );
9565 
9566  for( size_t jj=0UL; jj<N; jj+=block ) {
9567  const size_t jend( min( N, jj+block ) );
9568  for( size_t ii=0UL; ii<M; ii+=block ) {
9569  const size_t iend( min( M, ii+block ) );
9570  for( size_t j=jj; j<jend; ++j )
9571  {
9572  const size_t ibegin( ( IsLower<MT5>::value )
9573  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
9574  :( ii ) );
9575  const size_t ipos( ( IsUpper<MT5>::value )
9576  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
9577  :( iend ) );
9578 
9579  for( size_t i=ibegin; i<ipos; ++i ) {
9580  (~C)(i,j) += A(i,i) * B(i,j) * scalar;
9581  }
9582  }
9583  }
9584  }
9585  }
9586  //**********************************************************************************************
9587 
9588  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
9602  template< typename MT3 // Type of the left-hand side target matrix
9603  , typename MT4 // Type of the left-hand side matrix operand
9604  , typename MT5 // Type of the right-hand side matrix operand
9605  , typename ST2 > // Type of the scalar value
9606  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
9607  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9608  {
9609  for( size_t i=0UL; i<A.rows(); ++i ) {
9610  C(i,i) += A(i,i) * B(i,i) * scalar;
9611  }
9612  }
9613  //**********************************************************************************************
9614 
9615  //**Default addition assignment to dense matrices (small matrices)******************************
// Small-matrix addition assignment kernel (default variant) for the scaled product.
// It performs no computation of its own: C, A, B and scalar are handed on unchanged to
// selectDefaultAddAssignKernel().
//
// C      : target matrix of the addition assignment
// A      : left-hand side multiplication operand
// B      : right-hand side multiplication operand
// scalar : scaling factor
//
// NOTE(review): the declaration line with the return type / overload constraint (listing line
// 9633) is absent from this extract; presumably it restricts this overload to operand types for
// which the vectorized kernel cannot be used -- confirm against the original header.
9629  template< typename MT3 // Type of the left-hand side target matrix
9630  , typename MT4 // Type of the left-hand side matrix operand
9631  , typename MT5 // Type of the right-hand side matrix operand
9632  , typename ST2 > // Type of the scalar value
9634  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9635  {
// Plain delegation to the default kernel.
9636  selectDefaultAddAssignKernel( C, A, B, scalar );
9637  }
9638  //**********************************************************************************************
9639 
9640  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
9655  template< typename MT3 // Type of the left-hand side target matrix
9656  , typename MT4 // Type of the left-hand side matrix operand
9657  , typename MT5 // Type of the right-hand side matrix operand
9658  , typename ST2 > // Type of the scalar value
9660  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9661  {
9662  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
9663 
9664  const size_t M( A.rows() );
9665  const size_t N( B.columns() );
9666  const size_t K( A.columns() );
9667 
9668  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
9669 
9670  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
9671  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
9672 
9673  const SIMDType factor( set( scalar ) );
9674 
9675  size_t j( 0UL );
9676 
9678  {
9679  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
9680  for( size_t i=0UL; i<M; ++i )
9681  {
9682  const size_t kbegin( ( IsUpper<MT4>::value )
9683  ?( ( IsLower<MT5>::value )
9684  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9685  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9686  :( IsLower<MT5>::value ? j : 0UL ) );
9687  const size_t kend( ( IsLower<MT4>::value )
9688  ?( ( IsUpper<MT5>::value )
9689  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
9690  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
9691  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
9692 
9693  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9694 
9695  for( size_t k=kbegin; k<kend; ++k ) {
9696  const SIMDType a1( set( A(i,k) ) );
9697  xmm1 += a1 * B.load(k,j );
9698  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9699  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9700  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9701  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9702  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
9703  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
9704  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
9705  }
9706 
9707  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9708  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9709  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9710  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9711  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
9712  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
9713  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
9714  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
9715  }
9716  }
9717  }
9718 
9719  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
9720  {
9721  size_t i( 0UL );
9722 
9723  for( ; (i+2UL) <= M; i+=2UL )
9724  {
9725  const size_t kbegin( ( IsUpper<MT4>::value )
9726  ?( ( IsLower<MT5>::value )
9727  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9728  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9729  :( IsLower<MT5>::value ? j : 0UL ) );
9730  const size_t kend( ( IsLower<MT4>::value )
9731  ?( ( IsUpper<MT5>::value )
9732  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
9733  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9734  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
9735 
9736  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
9737 
9738  for( size_t k=kbegin; k<kend; ++k ) {
9739  const SIMDType a1( set( A(i ,k) ) );
9740  const SIMDType a2( set( A(i+1UL,k) ) );
9741  const SIMDType b1( B.load(k,j ) );
9742  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9743  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9744  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9745  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
9746  xmm1 += a1 * b1;
9747  xmm2 += a1 * b2;
9748  xmm3 += a1 * b3;
9749  xmm4 += a1 * b4;
9750  xmm5 += a1 * b5;
9751  xmm6 += a2 * b1;
9752  xmm7 += a2 * b2;
9753  xmm8 += a2 * b3;
9754  xmm9 += a2 * b4;
9755  xmm10 += a2 * b5;
9756  }
9757 
9758  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9759  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9760  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9761  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
9762  (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
9763  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm6 * factor );
9764  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
9765  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
9766  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
9767  (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
9768  }
9769 
9770  if( i < M )
9771  {
9772  const size_t kbegin( ( IsUpper<MT4>::value )
9773  ?( ( IsLower<MT5>::value )
9774  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9775  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9776  :( IsLower<MT5>::value ? j : 0UL ) );
9777  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
9778 
9779  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
9780 
9781  for( size_t k=kbegin; k<kend; ++k ) {
9782  const SIMDType a1( set( A(i,k) ) );
9783  xmm1 += a1 * B.load(k,j );
9784  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9785  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9786  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9787  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9788  }
9789 
9790  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9791  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9792  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9793  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9794  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
9795  }
9796  }
9797 
9798  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
9799  {
9800  size_t i( 0UL );
9801 
9802  for( ; (i+2UL) <= M; i+=2UL )
9803  {
9804  const size_t kbegin( ( IsUpper<MT4>::value )
9805  ?( ( IsLower<MT5>::value )
9806  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9807  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9808  :( IsLower<MT5>::value ? j : 0UL ) );
9809  const size_t kend( ( IsLower<MT4>::value )
9810  ?( ( IsUpper<MT5>::value )
9811  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
9812  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9813  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
9814 
9815  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9816 
9817  for( size_t k=kbegin; k<kend; ++k ) {
9818  const SIMDType a1( set( A(i ,k) ) );
9819  const SIMDType a2( set( A(i+1UL,k) ) );
9820  const SIMDType b1( B.load(k,j ) );
9821  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9822  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9823  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9824  xmm1 += a1 * b1;
9825  xmm2 += a1 * b2;
9826  xmm3 += a1 * b3;
9827  xmm4 += a1 * b4;
9828  xmm5 += a2 * b1;
9829  xmm6 += a2 * b2;
9830  xmm7 += a2 * b3;
9831  xmm8 += a2 * b4;
9832  }
9833 
9834  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9835  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9836  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9837  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
9838  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
9839  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
9840  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
9841  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
9842  }
9843 
9844  if( i < M )
9845  {
9846  const size_t kbegin( ( IsUpper<MT4>::value )
9847  ?( ( IsLower<MT5>::value )
9848  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9849  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9850  :( IsLower<MT5>::value ? j : 0UL ) );
9851  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
9852 
9853  SIMDType xmm1, xmm2, xmm3, xmm4;
9854 
9855  for( size_t k=kbegin; k<kend; ++k ) {
9856  const SIMDType a1( set( A(i,k) ) );
9857  xmm1 += a1 * B.load(k,j );
9858  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9859  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9860  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9861  }
9862 
9863  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9864  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9865  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9866  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9867  }
9868  }
9869 
9870  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
9871  {
9872  size_t i( 0UL );
9873 
9874  for( ; (i+2UL) <= M; i+=2UL )
9875  {
9876  const size_t kbegin( ( IsUpper<MT4>::value )
9877  ?( ( IsLower<MT5>::value )
9878  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9879  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9880  :( IsLower<MT5>::value ? j : 0UL ) );
9881  const size_t kend( ( IsLower<MT4>::value )
9882  ?( ( IsUpper<MT5>::value )
9883  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
9884  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9885  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
9886 
9887  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9888 
9889  for( size_t k=kbegin; k<kend; ++k ) {
9890  const SIMDType a1( set( A(i ,k) ) );
9891  const SIMDType a2( set( A(i+1UL,k) ) );
9892  const SIMDType b1( B.load(k,j ) );
9893  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9894  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9895  xmm1 += a1 * b1;
9896  xmm2 += a1 * b2;
9897  xmm3 += a1 * b3;
9898  xmm4 += a2 * b1;
9899  xmm5 += a2 * b2;
9900  xmm6 += a2 * b3;
9901  }
9902 
9903  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9904  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9905  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9906  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm4 * factor );
9907  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
9908  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
9909  }
9910 
9911  if( i < M )
9912  {
9913  const size_t kbegin( ( IsUpper<MT4>::value )
9914  ?( ( IsLower<MT5>::value )
9915  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9916  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9917  :( IsLower<MT5>::value ? j : 0UL ) );
9918  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
9919 
9920  SIMDType xmm1, xmm2, xmm3;
9921 
9922  for( size_t k=kbegin; k<kend; ++k ) {
9923  const SIMDType a1( set( A(i,k) ) );
9924  xmm1 += a1 * B.load(k,j );
9925  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9926  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9927  }
9928 
9929  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9930  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9931  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9932  }
9933  }
9934 
9935  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
9936  {
9937  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
9938  size_t i( LOW ? j : 0UL );
9939 
9940  for( ; (i+4UL) <= iend; i+=4UL )
9941  {
9942  const size_t kbegin( ( IsUpper<MT4>::value )
9943  ?( ( IsLower<MT5>::value )
9944  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9945  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9946  :( IsLower<MT5>::value ? j : 0UL ) );
9947  const size_t kend( ( IsLower<MT4>::value )
9948  ?( ( IsUpper<MT5>::value )
9949  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
9950  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
9951  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
9952 
9953  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9954 
9955  for( size_t k=kbegin; k<kend; ++k ) {
9956  const SIMDType a1( set( A(i ,k) ) );
9957  const SIMDType a2( set( A(i+1UL,k) ) );
9958  const SIMDType a3( set( A(i+2UL,k) ) );
9959  const SIMDType a4( set( A(i+3UL,k) ) );
9960  const SIMDType b1( B.load(k,j ) );
9961  const SIMDType b2( B.load(k,j+SIMDSIZE) );
9962  xmm1 += a1 * b1;
9963  xmm2 += a1 * b2;
9964  xmm3 += a2 * b1;
9965  xmm4 += a2 * b2;
9966  xmm5 += a3 * b1;
9967  xmm6 += a3 * b2;
9968  xmm7 += a4 * b1;
9969  xmm8 += a4 * b2;
9970  }
9971 
9972  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9973  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
9974  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9975  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
9976  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
9977  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
9978  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
9979  (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
9980  }
9981 
9982  for( ; (i+3UL) <= iend; i+=3UL )
9983  {
9984  const size_t kbegin( ( IsUpper<MT4>::value )
9985  ?( ( IsLower<MT5>::value )
9986  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9987  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9988  :( IsLower<MT5>::value ? j : 0UL ) );
9989  const size_t kend( ( IsLower<MT4>::value )
9990  ?( ( IsUpper<MT5>::value )
9991  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
9992  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
9993  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
9994 
9995  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9996 
9997  for( size_t k=kbegin; k<kend; ++k ) {
9998  const SIMDType a1( set( A(i ,k) ) );
9999  const SIMDType a2( set( A(i+1UL,k) ) );
10000  const SIMDType a3( set( A(i+2UL,k) ) );
10001  const SIMDType b1( B.load(k,j ) );
10002  const SIMDType b2( B.load(k,j+SIMDSIZE) );
10003  xmm1 += a1 * b1;
10004  xmm2 += a1 * b2;
10005  xmm3 += a2 * b1;
10006  xmm4 += a2 * b2;
10007  xmm5 += a3 * b1;
10008  xmm6 += a3 * b2;
10009  }
10010 
10011  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10012  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
10013  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
10014  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
10015  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
10016  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
10017  }
10018 
10019  for( ; (i+2UL) <= iend; i+=2UL )
10020  {
10021  const size_t kbegin( ( IsUpper<MT4>::value )
10022  ?( ( IsLower<MT5>::value )
10023  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10024  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10025  :( IsLower<MT5>::value ? j : 0UL ) );
10026  const size_t kend( ( IsLower<MT4>::value )
10027  ?( ( IsUpper<MT5>::value )
10028  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
10029  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
10030  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
10031 
10032  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10033  size_t k( kbegin );
10034 
10035  for( ; (k+2UL) <= kend; k+=2UL ) {
10036  const SIMDType a1( set( A(i ,k ) ) );
10037  const SIMDType a2( set( A(i+1UL,k ) ) );
10038  const SIMDType a3( set( A(i ,k+1UL) ) );
10039  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
10040  const SIMDType b1( B.load(k ,j ) );
10041  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
10042  const SIMDType b3( B.load(k+1UL,j ) );
10043  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
10044  xmm1 += a1 * b1;
10045  xmm2 += a1 * b2;
10046  xmm3 += a2 * b1;
10047  xmm4 += a2 * b2;
10048  xmm5 += a3 * b3;
10049  xmm6 += a3 * b4;
10050  xmm7 += a4 * b3;
10051  xmm8 += a4 * b4;
10052  }
10053 
10054  for( ; k<kend; ++k ) {
10055  const SIMDType a1( set( A(i ,k) ) );
10056  const SIMDType a2( set( A(i+1UL,k) ) );
10057  const SIMDType b1( B.load(k,j ) );
10058  const SIMDType b2( B.load(k,j+SIMDSIZE) );
10059  xmm1 += a1 * b1;
10060  xmm2 += a1 * b2;
10061  xmm3 += a2 * b1;
10062  xmm4 += a2 * b2;
10063  }
10064 
10065  (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
10066  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + (xmm2+xmm6) * factor );
10067  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + (xmm3+xmm7) * factor );
10068  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + (xmm4+xmm8) * factor );
10069  }
10070 
10071  if( i < iend )
10072  {
10073  const size_t kbegin( ( IsUpper<MT4>::value )
10074  ?( ( IsLower<MT5>::value )
10075  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10076  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10077  :( IsLower<MT5>::value ? j : 0UL ) );
10078  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
10079 
10080  SIMDType xmm1, xmm2, xmm3, xmm4;
10081  size_t k( kbegin );
10082 
10083  for( ; (k+2UL) <= kend; k+=2UL ) {
10084  const SIMDType a1( set( A(i,k ) ) );
10085  const SIMDType a2( set( A(i,k+1UL) ) );
10086  xmm1 += a1 * B.load(k ,j );
10087  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
10088  xmm3 += a2 * B.load(k+1UL,j );
10089  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
10090  }
10091 
10092  for( ; k<kend; ++k ) {
10093  const SIMDType a1( set( A(i,k) ) );
10094  xmm1 += a1 * B.load(k,j );
10095  xmm2 += a1 * B.load(k,j+SIMDSIZE);
10096  }
10097 
10098  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
10099  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + (xmm2+xmm4) * factor );
10100  }
10101  }
10102 
10103  for( ; j<jpos; j+=SIMDSIZE )
10104  {
10105  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
10106  size_t i( LOW ? j : 0UL );
10107 
10108  for( ; (i+4UL) <= iend; i+=4UL )
10109  {
10110  const size_t kbegin( ( IsUpper<MT4>::value )
10111  ?( ( IsLower<MT5>::value )
10112  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10113  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10114  :( IsLower<MT5>::value ? j : 0UL ) );
10115  const size_t kend( ( IsLower<MT4>::value )
10116  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
10117  :( K ) );
10118 
10119  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10120  size_t k( kbegin );
10121 
10122  for( ; (k+2UL) <= kend; k+=2UL ) {
10123  const SIMDType b1( B.load(k ,j) );
10124  const SIMDType b2( B.load(k+1UL,j) );
10125  xmm1 += set( A(i ,k ) ) * b1;
10126  xmm2 += set( A(i+1UL,k ) ) * b1;
10127  xmm3 += set( A(i+2UL,k ) ) * b1;
10128  xmm4 += set( A(i+3UL,k ) ) * b1;
10129  xmm5 += set( A(i ,k+1UL) ) * b2;
10130  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
10131  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
10132  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
10133  }
10134 
10135  for( ; k<kend; ++k ) {
10136  const SIMDType b1( B.load(k,j) );
10137  xmm1 += set( A(i ,k) ) * b1;
10138  xmm2 += set( A(i+1UL,k) ) * b1;
10139  xmm3 += set( A(i+2UL,k) ) * b1;
10140  xmm4 += set( A(i+3UL,k) ) * b1;
10141  }
10142 
10143  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm5) * factor );
10144  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm6) * factor );
10145  (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm7) * factor );
10146  (~C).store( i+3UL, j, (~C).load(i+3UL,j) + (xmm4+xmm8) * factor );
10147  }
10148 
10149  for( ; (i+3UL) <= iend; i+=3UL )
10150  {
10151  const size_t kbegin( ( IsUpper<MT4>::value )
10152  ?( ( IsLower<MT5>::value )
10153  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10154  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10155  :( IsLower<MT5>::value ? j : 0UL ) );
10156  const size_t kend( ( IsLower<MT4>::value )
10157  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
10158  :( K ) );
10159 
10160  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10161  size_t k( kbegin );
10162 
10163  for( ; (k+2UL) <= kend; k+=2UL ) {
10164  const SIMDType b1( B.load(k ,j) );
10165  const SIMDType b2( B.load(k+1UL,j) );
10166  xmm1 += set( A(i ,k ) ) * b1;
10167  xmm2 += set( A(i+1UL,k ) ) * b1;
10168  xmm3 += set( A(i+2UL,k ) ) * b1;
10169  xmm4 += set( A(i ,k+1UL) ) * b2;
10170  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
10171  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
10172  }
10173 
10174  for( ; k<kend; ++k ) {
10175  const SIMDType b1( B.load(k,j) );
10176  xmm1 += set( A(i ,k) ) * b1;
10177  xmm2 += set( A(i+1UL,k) ) * b1;
10178  xmm3 += set( A(i+2UL,k) ) * b1;
10179  }
10180 
10181  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm4) * factor );
10182  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm5) * factor );
10183  (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm6) * factor );
10184  }
10185 
10186  for( ; (i+2UL) <= iend; i+=2UL )
10187  {
10188  const size_t kbegin( ( IsUpper<MT4>::value )
10189  ?( ( IsLower<MT5>::value )
10190  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10191  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10192  :( IsLower<MT5>::value ? j : 0UL ) );
10193  const size_t kend( ( IsLower<MT4>::value )
10194  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
10195  :( K ) );
10196 
10197  SIMDType xmm1, xmm2, xmm3, xmm4;
10198  size_t k( kbegin );
10199 
10200  for( ; (k+2UL) <= kend; k+=2UL ) {
10201  const SIMDType b1( B.load(k ,j) );
10202  const SIMDType b2( B.load(k+1UL,j) );
10203  xmm1 += set( A(i ,k ) ) * b1;
10204  xmm2 += set( A(i+1UL,k ) ) * b1;
10205  xmm3 += set( A(i ,k+1UL) ) * b2;
10206  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
10207  }
10208 
10209  for( ; k<kend; ++k ) {
10210  const SIMDType b1( B.load(k,j) );
10211  xmm1 += set( A(i ,k) ) * b1;
10212  xmm2 += set( A(i+1UL,k) ) * b1;
10213  }
10214 
10215  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
10216  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm4) * factor );
10217  }
10218 
10219  if( i < iend )
10220  {
10221  const size_t kbegin( ( IsUpper<MT4>::value )
10222  ?( ( IsLower<MT5>::value )
10223  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10224  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10225  :( IsLower<MT5>::value ? j : 0UL ) );
10226 
10227  SIMDType xmm1, xmm2;
10228  size_t k( kbegin );
10229 
10230  for( ; (k+2UL) <= K; k+=2UL ) {
10231  xmm1 += set( A(i,k ) ) * B.load(k ,j);
10232  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
10233  }
10234 
10235  for( ; k<K; ++k ) {
10236  xmm1 += set( A(i,k) ) * B.load(k,j);
10237  }
10238 
10239  (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
10240  }
10241  }
10242 
10243  for( ; remainder && j<N; ++j )
10244  {
10245  const size_t iend( UPP ? j+1UL : M );
10246  size_t i( LOW ? j : 0UL );
10247 
10248  for( ; (i+2UL) <= iend; i+=2UL )
10249  {
10250  const size_t kbegin( ( IsUpper<MT4>::value )
10251  ?( ( IsLower<MT5>::value )
10252  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10253  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10254  :( IsLower<MT5>::value ? j : 0UL ) );
10255  const size_t kend( ( IsLower<MT4>::value )
10256  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
10257  :( K ) );
10258 
10259  ElementType value1 = ElementType();
10260  ElementType value2 = ElementType();
10261 
10262  for( size_t k=kbegin; k<kend; ++k ) {
10263  value1 += A(i ,k) * B(k,j);
10264  value2 += A(i+1UL,k) * B(k,j);
10265  }
10266 
10267  (~C)(i ,j) += value1 * scalar;
10268  (~C)(i+1UL,j) += value2 * scalar;
10269  }
10270 
10271  if( i < iend )
10272  {
10273  const size_t kbegin( ( IsUpper<MT4>::value )
10274  ?( ( IsLower<MT5>::value )
10275  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10276  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10277  :( IsLower<MT5>::value ? j : 0UL ) );
10278 
10279  ElementType value = ElementType();
10280 
10281  for( size_t k=kbegin; k<K; ++k ) {
10282  value += A(i,k) * B(k,j);
10283  }
10284 
10285  (~C)(i,j) += value * scalar;
10286  }
10287  }
10288  }
10289  //**********************************************************************************************
10290 
10291  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
10306  template< typename MT3 // Type of the left-hand side target matrix
10307  , typename MT4 // Type of the left-hand side matrix operand
10308  , typename MT5 // Type of the right-hand side matrix operand
10309  , typename ST2 > // Type of the scalar value
10311  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
10312  {
10313  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
10314 
10315  const size_t M( A.rows() );
10316  const size_t N( B.columns() );
10317  const size_t K( A.columns() );
10318 
10319  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
10320 
10321  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
10322  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
10323 
10324  const SIMDType factor( set( scalar ) );
10325 
10326  size_t i( 0UL );
10327 
10329  {
10330  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
10331  for( size_t j=0UL; j<N; ++j )
10332  {
10333  const size_t kbegin( ( IsLower<MT5>::value )
10334  ?( ( IsUpper<MT4>::value )
10335  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10336  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10337  :( IsUpper<MT4>::value ? i : 0UL ) );
10338  const size_t kend( ( IsUpper<MT5>::value )
10339  ?( ( IsLower<MT4>::value )
10340  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
10341  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
10342  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
10343 
10344  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10345 
10346  for( size_t k=kbegin; k<kend; ++k ) {
10347  const SIMDType b1( set( B(k,j) ) );
10348  xmm1 += A.load(i ,k) * b1;
10349  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10350  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10351  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10352  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10353  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
10354  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
10355  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
10356  }
10357 
10358  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10359  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10360  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10361  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10362  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
10363  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
10364  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
10365  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
10366  }
10367  }
10368  }
10369 
10370  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
10371  {
10372  size_t j( 0UL );
10373 
10374  for( ; (j+2UL) <= N; j+=2UL )
10375  {
10376  const size_t kbegin( ( IsLower<MT5>::value )
10377  ?( ( IsUpper<MT4>::value )
10378  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10379  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10380  :( IsUpper<MT4>::value ? i : 0UL ) );
10381  const size_t kend( ( IsUpper<MT5>::value )
10382  ?( ( IsLower<MT4>::value )
10383  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10384  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10385  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
10386 
10387  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
10388 
10389  for( size_t k=kbegin; k<kend; ++k ) {
10390  const SIMDType a1( A.load(i ,k) );
10391  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10392  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10393  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10394  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
10395  const SIMDType b1( set( B(k,j ) ) );
10396  const SIMDType b2( set( B(k,j+1UL) ) );
10397  xmm1 += a1 * b1;
10398  xmm2 += a2 * b1;
10399  xmm3 += a3 * b1;
10400  xmm4 += a4 * b1;
10401  xmm5 += a5 * b1;
10402  xmm6 += a1 * b2;
10403  xmm7 += a2 * b2;
10404  xmm8 += a3 * b2;
10405  xmm9 += a4 * b2;
10406  xmm10 += a5 * b2;
10407  }
10408 
10409  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10410  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10411  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10412  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10413  (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
10414  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
10415  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
10416  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
10417  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
10418  (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
10419  }
10420 
10421  if( j < N )
10422  {
10423  const size_t kbegin( ( IsLower<MT5>::value )
10424  ?( ( IsUpper<MT4>::value )
10425  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10426  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10427  :( IsUpper<MT4>::value ? i : 0UL ) );
10428  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
10429 
10430  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
10431 
10432  for( size_t k=kbegin; k<kend; ++k ) {
10433  const SIMDType b1( set( B(k,j) ) );
10434  xmm1 += A.load(i ,k) * b1;
10435  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10436  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10437  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10438  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10439  }
10440 
10441  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10442  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10443  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10444  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10445  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
10446  }
10447  }
10448 
10449  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
10450  {
10451  size_t j( 0UL );
10452 
10453  for( ; (j+2UL) <= N; j+=2UL )
10454  {
10455  const size_t kbegin( ( IsLower<MT5>::value )
10456  ?( ( IsUpper<MT4>::value )
10457  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10458  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10459  :( IsUpper<MT4>::value ? i : 0UL ) );
10460  const size_t kend( ( IsUpper<MT5>::value )
10461  ?( ( IsLower<MT4>::value )
10462  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10463  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10464  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
10465 
10466  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10467 
10468  for( size_t k=kbegin; k<kend; ++k ) {
10469  const SIMDType a1( A.load(i ,k) );
10470  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10471  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10472  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10473  const SIMDType b1( set( B(k,j ) ) );
10474  const SIMDType b2( set( B(k,j+1UL) ) );
10475  xmm1 += a1 * b1;
10476  xmm2 += a2 * b1;
10477  xmm3 += a3 * b1;
10478  xmm4 += a4 * b1;
10479  xmm5 += a1 * b2;
10480  xmm6 += a2 * b2;
10481  xmm7 += a3 * b2;
10482  xmm8 += a4 * b2;
10483  }
10484 
10485  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10486  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10487  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10488  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10489  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
10490  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
10491  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
10492  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
10493  }
10494 
10495  if( j < N )
10496  {
10497  const size_t kbegin( ( IsLower<MT5>::value )
10498  ?( ( IsUpper<MT4>::value )
10499  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10500  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10501  :( IsUpper<MT4>::value ? i : 0UL ) );
10502  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
10503 
10504  SIMDType xmm1, xmm2, xmm3, xmm4;
10505 
10506  for( size_t k=kbegin; k<kend; ++k ) {
10507  const SIMDType b1( set( B(k,j) ) );
10508  xmm1 += A.load(i ,k) * b1;
10509  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10510  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10511  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10512  }
10513 
10514  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10515  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10516  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10517  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10518  }
10519  }
10520 
10521  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
10522  {
10523  size_t j( 0UL );
10524 
10525  for( ; (j+2UL) <= N; j+=2UL )
10526  {
10527  const size_t kbegin( ( IsLower<MT5>::value )
10528  ?( ( IsUpper<MT4>::value )
10529  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10530  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10531  :( IsUpper<MT4>::value ? i : 0UL ) );
10532  const size_t kend( ( IsUpper<MT5>::value )
10533  ?( ( IsLower<MT4>::value )
10534  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10535  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10536  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
10537 
10538  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10539 
10540  for( size_t k=kbegin; k<kend; ++k ) {
10541  const SIMDType a1( A.load(i ,k) );
10542  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10543  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10544  const SIMDType b1( set( B(k,j ) ) );
10545  const SIMDType b2( set( B(k,j+1UL) ) );
10546  xmm1 += a1 * b1;
10547  xmm2 += a2 * b1;
10548  xmm3 += a3 * b1;
10549  xmm4 += a1 * b2;
10550  xmm5 += a2 * b2;
10551  xmm6 += a3 * b2;
10552  }
10553 
10554  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10555  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10556  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10557  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
10558  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
10559  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
10560  }
10561 
10562  if( j < N )
10563  {
10564  const size_t kbegin( ( IsLower<MT5>::value )
10565  ?( ( IsUpper<MT4>::value )
10566  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10567  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10568  :( IsUpper<MT4>::value ? i : 0UL ) );
10569  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
10570 
10571  SIMDType xmm1, xmm2, xmm3;
10572 
10573  for( size_t k=kbegin; k<kend; ++k ) {
10574  const SIMDType b1( set( B(k,j) ) );
10575  xmm1 += A.load(i ,k) * b1;
10576  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10577  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10578  }
10579 
10580  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10581  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10582  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10583  }
10584  }
10585 
10586  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
10587  {
10588  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
10589  size_t j( UPP ? i : 0UL );
10590 
10591  for( ; (j+4UL) <= jend; j+=4UL )
10592  {
10593  const size_t kbegin( ( IsLower<MT5>::value )
10594  ?( ( IsUpper<MT4>::value )
10595  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10596  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10597  :( IsUpper<MT4>::value ? i : 0UL ) );
10598  const size_t kend( ( IsUpper<MT5>::value )
10599  ?( ( IsLower<MT4>::value )
10600  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
10601  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
10602  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
10603 
10604  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10605 
10606  for( size_t k=kbegin; k<kend; ++k ) {
10607  const SIMDType a1( A.load(i ,k) );
10608  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10609  const SIMDType b1( set( B(k,j ) ) );
10610  const SIMDType b2( set( B(k,j+1UL) ) );
10611  const SIMDType b3( set( B(k,j+2UL) ) );
10612  const SIMDType b4( set( B(k,j+3UL) ) );
10613  xmm1 += a1 * b1;
10614  xmm2 += a2 * b1;
10615  xmm3 += a1 * b2;
10616  xmm4 += a2 * b2;
10617  xmm5 += a1 * b3;
10618  xmm6 += a2 * b3;
10619  xmm7 += a1 * b4;
10620  xmm8 += a2 * b4;
10621  }
10622 
10623  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10624  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
10625  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
10626  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
10627  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
10628  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
10629  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
10630  (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
10631  }
10632 
10633  for( ; (j+3UL) <= jend; j+=3UL )
10634  {
10635  const size_t kbegin( ( IsLower<MT5>::value )
10636  ?( ( IsUpper<MT4>::value )
10637  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10638  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10639  :( IsUpper<MT4>::value ? i : 0UL ) );
10640  const size_t kend( ( IsUpper<MT5>::value )
10641  ?( ( IsLower<MT4>::value )
10642  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
10643  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
10644  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
10645 
10646  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10647 
10648  for( size_t k=kbegin; k<kend; ++k ) {
10649  const SIMDType a1( A.load(i ,k) );
10650  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10651  const SIMDType b1( set( B(k,j ) ) );
10652  const SIMDType b2( set( B(k,j+1UL) ) );
10653  const SIMDType b3( set( B(k,j+2UL) ) );
10654  xmm1 += a1 * b1;
10655  xmm2 += a2 * b1;
10656  xmm3 += a1 * b2;
10657  xmm4 += a2 * b2;
10658  xmm5 += a1 * b3;
10659  xmm6 += a2 * b3;
10660  }
10661 
10662  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10663  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
10664  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
10665  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
10666  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
10667  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
10668  }
10669 
10670  for( ; (j+2UL) <= jend; j+=2UL )
10671  {
10672  const size_t kbegin( ( IsLower<MT5>::value )
10673  ?( ( IsUpper<MT4>::value )
10674  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10675  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10676  :( IsUpper<MT4>::value ? i : 0UL ) );
10677  const size_t kend( ( IsUpper<MT5>::value )
10678  ?( ( IsLower<MT4>::value )
10679  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10680  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10681  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
10682 
10683  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10684  size_t k( kbegin );
10685 
10686  for( ; (k+2UL) <= kend; k+=2UL ) {
10687  const SIMDType a1( A.load(i ,k ) );
10688  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
10689  const SIMDType a3( A.load(i ,k+1UL) );
10690  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
10691  const SIMDType b1( set( B(k ,j ) ) );
10692  const SIMDType b2( set( B(k ,j+1UL) ) );
10693  const SIMDType b3( set( B(k+1UL,j ) ) );
10694  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
10695  xmm1 += a1 * b1;
10696  xmm2 += a2 * b1;
10697  xmm3 += a1 * b2;
10698  xmm4 += a2 * b2;
10699  xmm5 += a3 * b3;
10700  xmm6 += a4 * b3;
10701  xmm7 += a3 * b4;
10702  xmm8 += a4 * b4;
10703  }
10704 
10705  for( ; k<kend; ++k ) {
10706  const SIMDType a1( A.load(i ,k) );
10707  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10708  const SIMDType b1( set( B(k,j ) ) );
10709  const SIMDType b2( set( B(k,j+1UL) ) );
10710  xmm1 += a1 * b1;
10711  xmm2 += a2 * b1;
10712  xmm3 += a1 * b2;
10713  xmm4 += a2 * b2;
10714  }
10715 
10716  (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
10717  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
10718  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + (xmm3+xmm7) * factor );
10719  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
10720  }
10721 
10722  if( j < jend )
10723  {
10724  const size_t kbegin( ( IsLower<MT5>::value )
10725  ?( ( IsUpper<MT4>::value )
10726  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10727  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10728  :( IsUpper<MT4>::value ? i : 0UL ) );
10729  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
10730 
10731  SIMDType xmm1, xmm2, xmm3, xmm4;
10732  size_t k( kbegin );
10733 
10734  for( ; (k+2UL) <= kend; k+=2UL ) {
10735  const SIMDType b1( set( B(k ,j) ) );
10736  const SIMDType b2( set( B(k+1UL,j) ) );
10737  xmm1 += A.load(i ,k ) * b1;
10738  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
10739  xmm3 += A.load(i ,k+1UL) * b2;
10740  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
10741  }
10742 
10743  for( ; k<kend; ++k ) {
10744  const SIMDType b1( set( B(k,j) ) );
10745  xmm1 += A.load(i ,k) * b1;
10746  xmm2 += A.load(i+SIMDSIZE,k) * b1;
10747  }
10748 
10749  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
10750  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
10751  }
10752  }
10753 
10754  for( ; i<ipos; i+=SIMDSIZE )
10755  {
10756  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
10757  size_t j( UPP ? i : 0UL );
10758 
10759  for( ; (j+4UL) <= jend; j+=4UL )
10760  {
10761  const size_t kbegin( ( IsLower<MT5>::value )
10762  ?( ( IsUpper<MT4>::value )
10763  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10764  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10765  :( IsUpper<MT4>::value ? i : 0UL ) );
10766  const size_t kend( ( IsUpper<MT5>::value )
10767  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
10768  :( K ) );
10769 
10770  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10771  size_t k( kbegin );
10772 
10773  for( ; (k+2UL) <= kend; k+=2UL ) {
10774  const SIMDType a1( A.load(i,k ) );
10775  const SIMDType a2( A.load(i,k+1UL) );
10776  xmm1 += a1 * set( B(k ,j ) );
10777  xmm2 += a1 * set( B(k ,j+1UL) );
10778  xmm3 += a1 * set( B(k ,j+2UL) );
10779  xmm4 += a1 * set( B(k ,j+3UL) );
10780  xmm5 += a2 * set( B(k+1UL,j ) );
10781  xmm6 += a2 * set( B(k+1UL,j+1UL) );
10782  xmm7 += a2 * set( B(k+1UL,j+2UL) );
10783  xmm8 += a2 * set( B(k+1UL,j+3UL) );
10784  }
10785 
10786  for( ; k<kend; ++k ) {
10787  const SIMDType a1( A.load(i,k) );
10788  xmm1 += a1 * set( B(k,j ) );
10789  xmm2 += a1 * set( B(k,j+1UL) );
10790  xmm3 += a1 * set( B(k,j+2UL) );
10791  xmm4 += a1 * set( B(k,j+3UL) );
10792  }
10793 
10794  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm5) * factor );
10795  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm6) * factor );
10796  (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm7) * factor );
10797  (~C).store( i, j+3UL, (~C).load(i,j+3UL) + (xmm4+xmm8) * factor );
10798  }
10799 
10800  for( ; (j+3UL) <= jend; j+=3UL )
10801  {
10802  const size_t kbegin( ( IsLower<MT5>::value )
10803  ?( ( IsUpper<MT4>::value )
10804  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10805  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10806  :( IsUpper<MT4>::value ? i : 0UL ) );
10807  const size_t kend( ( IsUpper<MT5>::value )
10808  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
10809  :( K ) );
10810 
10811  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10812  size_t k( kbegin );
10813 
10814  for( ; (k+2UL) <= kend; k+=2UL ) {
10815  const SIMDType a1( A.load(i,k ) );
10816  const SIMDType a2( A.load(i,k+1UL) );
10817  xmm1 += a1 * set( B(k ,j ) );
10818  xmm2 += a1 * set( B(k ,j+1UL) );
10819  xmm3 += a1 * set( B(k ,j+2UL) );
10820  xmm4 += a2 * set( B(k+1UL,j ) );
10821  xmm5 += a2 * set( B(k+1UL,j+1UL) );
10822  xmm6 += a2 * set( B(k+1UL,j+2UL) );
10823  }
10824 
10825  for( ; k<kend; ++k ) {
10826  const SIMDType a1( A.load(i,k) );
10827  xmm1 += a1 * set( B(k,j ) );
10828  xmm2 += a1 * set( B(k,j+1UL) );
10829  xmm3 += a1 * set( B(k,j+2UL) );
10830  }
10831 
10832  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm4) * factor );
10833  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm5) * factor );
10834  (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm6) * factor );
10835  }
10836 
10837  for( ; (j+2UL) <= jend; j+=2UL )
10838  {
10839  const size_t kbegin( ( IsLower<MT5>::value )
10840  ?( ( IsUpper<MT4>::value )
10841  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10842  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10843  :( IsUpper<MT4>::value ? i : 0UL ) );
10844  const size_t kend( ( IsUpper<MT5>::value )
10845  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10846  :( K ) );
10847 
10848  SIMDType xmm1, xmm2, xmm3, xmm4;
10849  size_t k( kbegin );
10850 
10851  for( ; (k+2UL) <= kend; k+=2UL ) {
10852  const SIMDType a1( A.load(i,k ) );
10853  const SIMDType a2( A.load(i,k+1UL) );
10854  xmm1 += a1 * set( B(k ,j ) );
10855  xmm2 += a1 * set( B(k ,j+1UL) );
10856  xmm3 += a2 * set( B(k+1UL,j ) );
10857  xmm4 += a2 * set( B(k+1UL,j+1UL) );
10858  }
10859 
10860  for( ; k<kend; ++k ) {
10861  const SIMDType a1( A.load(i,k) );
10862  xmm1 += a1 * set( B(k,j ) );
10863  xmm2 += a1 * set( B(k,j+1UL) );
10864  }
10865 
10866  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
10867  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm4) * factor );
10868  }
10869 
10870  if( j < jend )
10871  {
10872  const size_t kbegin( ( IsLower<MT5>::value )
10873  ?( ( IsUpper<MT4>::value )
10874  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10875  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10876  :( IsUpper<MT4>::value ? i : 0UL ) );
10877 
10878  SIMDType xmm1, xmm2;
10879  size_t k( kbegin );
10880 
10881  for( ; (k+2UL) <= K; k+=2UL ) {
10882  xmm1 += A.load(i,k ) * set( B(k ,j) );
10883  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
10884  }
10885 
10886  for( ; k<K; ++k ) {
10887  xmm1 += A.load(i,k) * set( B(k,j) );
10888  }
10889 
10890  (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
10891  }
10892  }
10893 
10894  for( ; remainder && i<M; ++i )
10895  {
10896  const size_t jend( LOW ? i+1UL : N );
10897  size_t j( UPP ? i : 0UL );
10898 
10899  for( ; (j+2UL) <= jend; j+=2UL )
10900  {
10901  const size_t kbegin( ( IsLower<MT5>::value )
10902  ?( ( IsUpper<MT4>::value )
10903  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10904  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10905  :( IsUpper<MT4>::value ? i : 0UL ) );
10906  const size_t kend( ( IsUpper<MT5>::value )
10907  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10908  :( K ) );
10909 
10910  ElementType value1 = ElementType();
10911  ElementType value2 = ElementType();
10912 
10913  for( size_t k=kbegin; k<kend; ++k ) {
10914  value1 += A(i,k) * B(k,j );
10915  value2 += A(i,k) * B(k,j+1UL);
10916  }
10917 
10918  (~C)(i,j ) += value1 * scalar;
10919  (~C)(i,j+1UL) += value2 * scalar;
10920  }
10921 
10922  if( j < jend )
10923  {
10924  const size_t kbegin( ( IsLower<MT5>::value )
10925  ?( ( IsUpper<MT4>::value )
10926  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10927  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10928  :( IsUpper<MT4>::value ? i : 0UL ) );
10929 
10930  ElementType value = ElementType();
10931 
10932  for( size_t k=kbegin; k<K; ++k ) {
10933  value += A(i,k) * B(k,j);
10934  }
10935 
10936  (~C)(i,j) += value * scalar;
10937  }
10938  }
10939  }
10940  //**********************************************************************************************
10941 
10942  //**Default addition assignment to dense matrices (large matrices)******************************
// Fallback kernel for the addition assignment of large matrices. It does no computation of
// its own: all four arguments are forwarded unchanged to selectDefaultAddAssignKernel().
// NOTE(review): listing line 10960 (between the template header and the function name,
// presumably the return type / enabling condition) is absent from this listing - confirm
// against the original header.
10956  template< typename MT3 // Type of the left-hand side target matrix
10957  , typename MT4 // Type of the left-hand side matrix operand
10958  , typename MT5 // Type of the right-hand side matrix operand
10959  , typename ST2 > // Type of the scalar value
10961  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10962  {
// Delegate to the default kernel.
10963  selectDefaultAddAssignKernel( C, A, B, scalar );
10964  }
10965  //**********************************************************************************************
10966 
10967  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Large-matrix addition assignment kernel that dispatches on the LOW and UPP flags:
// lmmm() when LOW is set, ummm() when only UPP is set, and mmm() otherwise. Each call
// receives the target C, both operands, the scalar, and ST2(1) as the trailing argument.
// NOTE(review): presumably the last two arguments scale the product and the existing
// content of C respectively, so ST2(1) turns the call into an accumulation - confirm
// against the mmm() documentation. Listing line 10986 (return type / enabling condition)
// is absent from this listing.
10982  template< typename MT3 // Type of the left-hand side target matrix
10983  , typename MT4 // Type of the left-hand side matrix operand
10984  , typename MT5 // Type of the right-hand side matrix operand
10985  , typename ST2 > // Type of the scalar value
10987  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10988  {
// LOW is tested first, so it takes precedence when both flags are set.
10989  if( LOW )
10990  lmmm( C, A, B, scalar, ST2(1) );
10991  else if( UPP )
10992  ummm( C, A, B, scalar, ST2(1) );
10993  else
10994  mmm( C, A, B, scalar, ST2(1) );
10995  }
10996  //**********************************************************************************************
10997 
10998  //**BLAS-based addition assignment to dense matrices (default)**********************************
// Default variant of the BLAS addition assignment kernel. It does not call BLAS; the
// arguments are forwarded unchanged to selectLargeAddAssignKernel().
// NOTE(review): listing line 11016 (presumably the return type / enabling condition that
// separates this overload from the BLAS-based one) is absent from this listing.
11012  template< typename MT3 // Type of the left-hand side target matrix
11013  , typename MT4 // Type of the left-hand side matrix operand
11014  , typename MT5 // Type of the right-hand side matrix operand
11015  , typename ST2 > // Type of the scalar value
11017  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11018  {
// Delegate to the large-matrix kernel.
11019  selectLargeAddAssignKernel( C, A, B, scalar );
11020  }
11021  //**********************************************************************************************
11022 
11023  //**BLAS-based addition assignment to dense matrices********************************************
11024 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
11025 
// BLAS-based addition assignment kernel (only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled). Three cases:
//  - A is triangular: B is evaluated serially into a temporary, trmm() is applied to it with
//    A on the left side, and the temporary is added to C via addAssign().
//  - otherwise, B is triangular: A is evaluated serially into a temporary, trmm() is applied
//    to it with B on the right side, and the temporary is added to C via addAssign().
//  - otherwise: gemm() is called directly on C with ET(scalar) and ET(1).
// The scalar is converted to the element type of the target (ET) before each BLAS call.
// NOTE(review): listing line 11042 (return type / enabling condition) is absent from this
// listing; presumably ET(1) makes gemm() accumulate into C - confirm against the gemm wrapper.
11038  template< typename MT3 // Type of the left-hand side target matrix
11039  , typename MT4 // Type of the left-hand side matrix operand
11040  , typename MT5 // Type of the right-hand side matrix operand
11041  , typename ST2 > // Type of the scalar value
11043  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11044  {
11045  using ET = ElementType_<MT3>;
11046 
// Triangular left operand: the product is built in 'tmp', then added to C.
11047  if( IsTriangular<MT4>::value ) {
11048  ResultType_<MT3> tmp( serial( B ) );
11049  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
11050  addAssign( C, tmp );
11051  }
// Triangular right operand: same scheme with the roles of A and B swapped.
11052  else if( IsTriangular<MT5>::value ) {
11053  ResultType_<MT3> tmp( serial( A ) );
11054  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
11055  addAssign( C, tmp );
11056  }
// General case: no temporary, gemm() operates on C directly.
11057  else {
11058  gemm( C, A, B, ET(scalar), ET(1) );
11059  }
11060  }
11061 #endif
11062  //**********************************************************************************************
11063 
11064  //**Addition assignment to sparse matrices******************************************************
11065  // No special implementation for the addition assignment to sparse matrices.
11066  //**********************************************************************************************
11067 
11068  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a scaled matrix product (DMatScalarMultExpr) to a dense matrix.
// Steps visible here: check that the dimensions of lhs and rhs agree, fetch both operands
// of the wrapped product (rhs.matrix_), return early if the target or the shared dimension
// is empty, evaluate both operands serially into A and B, and hand the work to
// selectSubAssignKernel() together with rhs.scalar_.
// NOTE(review): listing line 11084 (the first statement of the body) is absent from this
// listing. LT, RT and MMM are declared outside this view.
11080  template< typename MT // Type of the target dense matrix
11081  , bool SO > // Storage order of the target dense matrix
11082  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11083  {
11085 
11086  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11087  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11088 
11089  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
11090  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
11091 
// Nothing to subtract when the target has no elements or the inner dimension is zero.
11092  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
11093  return;
11094  }
11095 
11096  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
11097  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
11098 
// The evaluated operands must keep the operand dimensions and match the target.
11099  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11100  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11101  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11102  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11103  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11104  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11105 
11106  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
11107  }
11108  //**********************************************************************************************
11109 
11110  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Kernel selection for the subtraction assignment. The visible part of the condition picks
// selectSmallSubAssignKernel() when any of the following holds:
//  - not in debug mode, C is row-major and B.columns() <= SIMDSIZE*10UL
//  - not in debug mode, C is column-major and A.rows() <= SIMDSIZE*10UL
//  - C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD
// In all other cases selectBlasSubAssignKernel() is used.
// NOTE(review): listing line 11127 (the opening 'if(' and possibly a first operand of the
// condition) is absent from this listing - confirm against the original header.
11121  template< typename MT3 // Type of the left-hand side target matrix
11122  , typename MT4 // Type of the left-hand side matrix operand
11123  , typename MT5 // Type of the right-hand side matrix operand
11124  , typename ST2 > // Type of the scalar value
11125  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11126  {
11128  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
11129  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
11130  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
11131  selectSmallSubAssignKernel( C, A, B, scalar );
11132  else
11133  selectBlasSubAssignKernel( C, A, B, scalar );
11134  }
11135  //**********************************************************************************************
11136 
11137  //**Default subtraction assignment to dense matrices********************************************
11151  template< typename MT3 // Type of the left-hand side target matrix
11152  , typename MT4 // Type of the left-hand side matrix operand
11153  , typename MT5 // Type of the right-hand side matrix operand
11154  , typename ST2 > // Type of the scalar value
11155  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
11156  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11157  {
11158  const ResultType tmp( serial( A * B * scalar ) );
11159  subAssign( C, tmp );
11160  }
11161  //**********************************************************************************************
11162 
11163  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
11177  template< typename MT3 // Type of the left-hand side target matrix
11178  , typename MT4 // Type of the left-hand side matrix operand
11179  , typename MT5 // Type of the right-hand side matrix operand
11180  , typename ST2 > // Type of the scalar value
11181  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
11182  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
11183  {
11184  constexpr size_t block( BLOCK_SIZE );
11185 
11186  const size_t M( A.rows() );
11187  const size_t N( B.columns() );
11188 
11189  for( size_t ii=0UL; ii<M; ii+=block ) {
11190  const size_t iend( min( M, ii+block ) );
11191  for( size_t jj=0UL; jj<N; jj+=block ) {
11192  const size_t jend( min( N, jj+block ) );
11193  for( size_t i=ii; i<iend; ++i )
11194  {
11195  const size_t jbegin( ( IsUpper<MT4>::value )
11196  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
11197  :( jj ) );
11198  const size_t jpos( ( IsLower<MT4>::value )
11199  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
11200  :( jend ) );
11201 
11202  for( size_t j=jbegin; j<jpos; ++j ) {
11203  (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
11204  }
11205  }
11206  }
11207  }
11208  }
11209  //**********************************************************************************************
11210 
11211  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
11225  template< typename MT3 // Type of the left-hand side target matrix
11226  , typename MT4 // Type of the left-hand side matrix operand
11227  , typename MT5 // Type of the right-hand side matrix operand
11228  , typename ST2 > // Type of the scalar value
11229  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
11230  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
11231  {
11232  const size_t M( A.rows() );
11233  const size_t N( B.columns() );
11234 
11235  for( size_t j=0UL; j<N; ++j )
11236  {
11237  const size_t ibegin( ( IsLower<MT4>::value )
11238  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
11239  :( 0UL ) );
11240  const size_t iend( ( IsUpper<MT4>::value )
11241  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
11242  :( M ) );
11243  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
11244 
11245  const size_t inum( iend - ibegin );
11246  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
11247 
11248  for( size_t i=ibegin; i<ipos; i+=2UL ) {
11249  (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
11250  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
11251  }
11252  if( ipos < iend ) {
11253  (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
11254  }
11255  }
11256  }
11257  //**********************************************************************************************
11258 
11259  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
11273  template< typename MT3 // Type of the left-hand side target matrix
11274  , typename MT4 // Type of the left-hand side matrix operand
11275  , typename MT5 // Type of the right-hand side matrix operand
11276  , typename ST2 > // Type of the scalar value
11277  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
11278  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
11279  {
11280  const size_t M( A.rows() );
11281  const size_t N( B.columns() );
11282 
11283  for( size_t i=0UL; i<M; ++i )
11284  {
11285  const size_t jbegin( ( IsUpper<MT5>::value )
11286  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
11287  :( 0UL ) );
11288  const size_t jend( ( IsLower<MT5>::value )
11289  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
11290  :( N ) );
11291  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
11292 
11293  const size_t jnum( jend - jbegin );
11294  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
11295 
11296  for( size_t j=jbegin; j<jpos; j+=2UL ) {
11297  (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
11298  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
11299  }
11300  if( jpos < jend ) {
11301  (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
11302  }
11303  }
11304  }
11305  //**********************************************************************************************
11306 
11307  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
11321  template< typename MT3 // Type of the left-hand side target matrix
11322  , typename MT4 // Type of the left-hand side matrix operand
11323  , typename MT5 // Type of the right-hand side matrix operand
11324  , typename ST2 > // Type of the scalar value
11325  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
11326  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
11327  {
11328  constexpr size_t block( BLOCK_SIZE );
11329 
11330  const size_t M( A.rows() );
11331  const size_t N( B.columns() );
11332 
11333  for( size_t jj=0UL; jj<N; jj+=block ) {
11334  const size_t jend( min( N, jj+block ) );
11335  for( size_t ii=0UL; ii<M; ii+=block ) {
11336  const size_t iend( min( M, ii+block ) );
11337  for( size_t j=jj; j<jend; ++j )
11338  {
11339  const size_t ibegin( ( IsLower<MT5>::value )
11340  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
11341  :( ii ) );
11342  const size_t ipos( ( IsUpper<MT5>::value )
11343  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
11344  :( iend ) );
11345 
11346  for( size_t i=ibegin; i<ipos; ++i ) {
11347  (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
11348  }
11349  }
11350  }
11351  }
11352  }
11353  //**********************************************************************************************
11354 
11355  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
11369  template< typename MT3 // Type of the left-hand side target matrix
11370  , typename MT4 // Type of the left-hand side matrix operand
11371  , typename MT5 // Type of the right-hand side matrix operand
11372  , typename ST2 > // Type of the scalar value
11373  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
11374  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11375  {
11376  for( size_t i=0UL; i<A.rows(); ++i ) {
11377  C(i,i) -= A(i,i) * B(i,i) * scalar;
11378  }
11379  }
11380  //**********************************************************************************************
11381 
11382  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Fallback kernel for the subtraction assignment of small matrices. It does no computation
// of its own: all four arguments are forwarded unchanged to selectDefaultSubAssignKernel().
// NOTE(review): listing line 11400 (between the template header and the function name,
// presumably the return type / enabling condition) is absent from this listing - confirm
// against the original header.
11396  template< typename MT3 // Type of the left-hand side target matrix
11397  , typename MT4 // Type of the left-hand side matrix operand
11398  , typename MT5 // Type of the right-hand side matrix operand
11399  , typename ST2 > // Type of the scalar value
11401  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11402  {
// Delegate to the default kernel.
11403  selectDefaultSubAssignKernel( C, A, B, scalar );
11404  }
11405  //**********************************************************************************************
11406 
11407  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
11422  template< typename MT3 // Type of the left-hand side target matrix
11423  , typename MT4 // Type of the left-hand side matrix operand
11424  , typename MT5 // Type of the right-hand side matrix operand
11425  , typename ST2 > // Type of the scalar value
11427  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
11428  {
11429  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
11430 
11431  const size_t M( A.rows() );
11432  const size_t N( B.columns() );
11433  const size_t K( A.columns() );
11434 
11435  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
11436 
11437  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
11438  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
11439 
11440  const SIMDType factor( set( scalar ) );
11441 
11442  size_t j( 0UL );
11443 
11445  {
11446  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
11447  for( size_t i=0UL; i<M; ++i )
11448  {
11449  const size_t kbegin( ( IsUpper<MT4>::value )
11450  ?( ( IsLower<MT5>::value )
11451  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11452  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11453  :( IsLower<MT5>::value ? j : 0UL ) );
11454  const size_t kend( ( IsLower<MT4>::value )
11455  ?( ( IsUpper<MT5>::value )
11456  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
11457  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
11458  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
11459 
11460  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11461 
11462  for( size_t k=kbegin; k<kend; ++k ) {
11463  const SIMDType a1( set( A(i,k) ) );
11464  xmm1 += a1 * B.load(k,j );
11465  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11466  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11467  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11468  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11469  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
11470  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
11471  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
11472  }
11473 
11474  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11475  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11476  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11477  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11478  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11479  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
11480  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
11481  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
11482  }
11483  }
11484  }
11485 
11486  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
11487  {
11488  size_t i( 0UL );
11489 
11490  for( ; (i+2UL) <= M; i+=2UL )
11491  {
11492  const size_t kbegin( ( IsUpper<MT4>::value )
11493  ?( ( IsLower<MT5>::value )
11494  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11495  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11496  :( IsLower<MT5>::value ? j : 0UL ) );
11497  const size_t kend( ( IsLower<MT4>::value )
11498  ?( ( IsUpper<MT5>::value )
11499  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
11500  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
11501  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
11502 
11503  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
11504 
11505  for( size_t k=kbegin; k<kend; ++k ) {
11506  const SIMDType a1( set( A(i ,k) ) );
11507  const SIMDType a2( set( A(i+1UL,k) ) );
11508  const SIMDType b1( B.load(k,j ) );
11509  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11510  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11511  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11512  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
11513  xmm1 += a1 * b1;
11514  xmm2 += a1 * b2;
11515  xmm3 += a1 * b3;
11516  xmm4 += a1 * b4;
11517  xmm5 += a1 * b5;
11518  xmm6 += a2 * b1;
11519  xmm7 += a2 * b2;
11520  xmm8 += a2 * b3;
11521  xmm9 += a2 * b4;
11522  xmm10 += a2 * b5;
11523  }
11524 
11525  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11526  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11527  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11528  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11529  (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
11530  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm6 * factor );
11531  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
11532  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
11533  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
11534  (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
11535  }
11536 
11537  if( i < M )
11538  {
11539  const size_t kbegin( ( IsUpper<MT4>::value )
11540  ?( ( IsLower<MT5>::value )
11541  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11542  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11543  :( IsLower<MT5>::value ? j : 0UL ) );
11544  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
11545 
11546  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
11547 
11548  for( size_t k=kbegin; k<kend; ++k ) {
11549  const SIMDType a1( set( A(i,k) ) );
11550  xmm1 += a1 * B.load(k,j );
11551  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11552  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11553  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11554  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11555  }
11556 
11557  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11558  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11559  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11560  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11561  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11562  }
11563  }
11564 
11565  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
11566  {
11567  size_t i( 0UL );
11568 
11569  for( ; (i+2UL) <= M; i+=2UL )
11570  {
11571  const size_t kbegin( ( IsUpper<MT4>::value )
11572  ?( ( IsLower<MT5>::value )
11573  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11574  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11575  :( IsLower<MT5>::value ? j : 0UL ) );
11576  const size_t kend( ( IsLower<MT4>::value )
11577  ?( ( IsUpper<MT5>::value )
11578  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
11579  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
11580  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
11581 
11582  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11583 
11584  for( size_t k=kbegin; k<kend; ++k ) {
11585  const SIMDType a1( set( A(i ,k) ) );
11586  const SIMDType a2( set( A(i+1UL,k) ) );
11587  const SIMDType b1( B.load(k,j ) );
11588  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11589  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11590  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11591  xmm1 += a1 * b1;
11592  xmm2 += a1 * b2;
11593  xmm3 += a1 * b3;
11594  xmm4 += a1 * b4;
11595  xmm5 += a2 * b1;
11596  xmm6 += a2 * b2;
11597  xmm7 += a2 * b3;
11598  xmm8 += a2 * b4;
11599  }
11600 
11601  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11602  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11603  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11604  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11605  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
11606  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
11607  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
11608  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
11609  }
11610 
11611  if( i < M )
11612  {
11613  const size_t kbegin( ( IsUpper<MT4>::value )
11614  ?( ( IsLower<MT5>::value )
11615  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11616  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11617  :( IsLower<MT5>::value ? j : 0UL ) );
11618  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
11619 
11620  SIMDType xmm1, xmm2, xmm3, xmm4;
11621 
11622  for( size_t k=kbegin; k<kend; ++k ) {
11623  const SIMDType a1( set( A(i,k) ) );
11624  xmm1 += a1 * B.load(k,j );
11625  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11626  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11627  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11628  }
11629 
11630  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11631  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11632  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11633  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11634  }
11635  }
11636 
11637  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
11638  {
11639  size_t i( 0UL );
11640 
11641  for( ; (i+2UL) <= M; i+=2UL )
11642  {
11643  const size_t kbegin( ( IsUpper<MT4>::value )
11644  ?( ( IsLower<MT5>::value )
11645  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11646  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11647  :( IsLower<MT5>::value ? j : 0UL ) );
11648  const size_t kend( ( IsLower<MT4>::value )
11649  ?( ( IsUpper<MT5>::value )
11650  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
11651  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
11652  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
11653 
11654  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11655 
11656  for( size_t k=kbegin; k<kend; ++k ) {
11657  const SIMDType a1( set( A(i ,k) ) );
11658  const SIMDType a2( set( A(i+1UL,k) ) );
11659  const SIMDType b1( B.load(k,j ) );
11660  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11661  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11662  xmm1 += a1 * b1;
11663  xmm2 += a1 * b2;
11664  xmm3 += a1 * b3;
11665  xmm4 += a2 * b1;
11666  xmm5 += a2 * b2;
11667  xmm6 += a2 * b3;
11668  }
11669 
11670  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11671  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11672  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11673  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm4 * factor );
11674  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
11675  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
11676  }
11677 
11678  if( i < M )
11679  {
11680  const size_t kbegin( ( IsUpper<MT4>::value )
11681  ?( ( IsLower<MT5>::value )
11682  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11683  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11684  :( IsLower<MT5>::value ? j : 0UL ) );
11685  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
11686 
11687  SIMDType xmm1, xmm2, xmm3;
11688 
11689  for( size_t k=kbegin; k<kend; ++k ) {
11690  const SIMDType a1( set( A(i,k) ) );
11691  xmm1 += a1 * B.load(k,j );
11692  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11693  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11694  }
11695 
11696  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11697  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11698  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11699  }
11700  }
11701 
11702  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
11703  {
11704  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
11705  size_t i( LOW ? j : 0UL );
11706 
11707  for( ; (i+4UL) <= iend; i+=4UL )
11708  {
11709  const size_t kbegin( ( IsUpper<MT4>::value )
11710  ?( ( IsLower<MT5>::value )
11711  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11712  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11713  :( IsLower<MT5>::value ? j : 0UL ) );
11714  const size_t kend( ( IsLower<MT4>::value )
11715  ?( ( IsUpper<MT5>::value )
11716  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
11717  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
11718  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
11719 
11720  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11721 
11722  for( size_t k=kbegin; k<kend; ++k ) {
11723  const SIMDType a1( set( A(i ,k) ) );
11724  const SIMDType a2( set( A(i+1UL,k) ) );
11725  const SIMDType a3( set( A(i+2UL,k) ) );
11726  const SIMDType a4( set( A(i+3UL,k) ) );
11727  const SIMDType b1( B.load(k,j ) );
11728  const SIMDType b2( B.load(k,j+SIMDSIZE) );
11729  xmm1 += a1 * b1;
11730  xmm2 += a1 * b2;
11731  xmm3 += a2 * b1;
11732  xmm4 += a2 * b2;
11733  xmm5 += a3 * b1;
11734  xmm6 += a3 * b2;
11735  xmm7 += a4 * b1;
11736  xmm8 += a4 * b2;
11737  }
11738 
11739  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11740  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
11741  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
11742  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
11743  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
11744  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
11745  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
11746  (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
11747  }
11748 
11749  for( ; (i+3UL) <= iend; i+=3UL )
11750  {
11751  const size_t kbegin( ( IsUpper<MT4>::value )
11752  ?( ( IsLower<MT5>::value )
11753  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11754  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11755  :( IsLower<MT5>::value ? j : 0UL ) );
11756  const size_t kend( ( IsLower<MT4>::value )
11757  ?( ( IsUpper<MT5>::value )
11758  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
11759  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
11760  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
11761 
11762  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11763 
11764  for( size_t k=kbegin; k<kend; ++k ) {
11765  const SIMDType a1( set( A(i ,k) ) );
11766  const SIMDType a2( set( A(i+1UL,k) ) );
11767  const SIMDType a3( set( A(i+2UL,k) ) );
11768  const SIMDType b1( B.load(k,j ) );
11769  const SIMDType b2( B.load(k,j+SIMDSIZE) );
11770  xmm1 += a1 * b1;
11771  xmm2 += a1 * b2;
11772  xmm3 += a2 * b1;
11773  xmm4 += a2 * b2;
11774  xmm5 += a3 * b1;
11775  xmm6 += a3 * b2;
11776  }
11777 
11778  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11779  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
11780  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
11781  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
11782  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
11783  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
11784  }
11785 
11786  for( ; (i+2UL) <= iend; i+=2UL )
11787  {
11788  const size_t kbegin( ( IsUpper<MT4>::value )
11789  ?( ( IsLower<MT5>::value )
11790  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11791  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11792  :( IsLower<MT5>::value ? j : 0UL ) );
11793  const size_t kend( ( IsLower<MT4>::value )
11794  ?( ( IsUpper<MT5>::value )
11795  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
11796  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
11797  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
11798 
11799  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11800  size_t k( kbegin );
11801 
11802  for( ; (k+2UL) <= kend; k+=2UL ) {
11803  const SIMDType a1( set( A(i ,k ) ) );
11804  const SIMDType a2( set( A(i+1UL,k ) ) );
11805  const SIMDType a3( set( A(i ,k+1UL) ) );
11806  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
11807  const SIMDType b1( B.load(k ,j ) );
11808  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
11809  const SIMDType b3( B.load(k+1UL,j ) );
11810  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
11811  xmm1 += a1 * b1;
11812  xmm2 += a1 * b2;
11813  xmm3 += a2 * b1;
11814  xmm4 += a2 * b2;
11815  xmm5 += a3 * b3;
11816  xmm6 += a3 * b4;
11817  xmm7 += a4 * b3;
11818  xmm8 += a4 * b4;
11819  }
11820 
11821  for( ; k<kend; ++k ) {
11822  const SIMDType a1( set( A(i ,k) ) );
11823  const SIMDType a2( set( A(i+1UL,k) ) );
11824  const SIMDType b1( B.load(k,j ) );
11825  const SIMDType b2( B.load(k,j+SIMDSIZE) );
11826  xmm1 += a1 * b1;
11827  xmm2 += a1 * b2;
11828  xmm3 += a2 * b1;
11829  xmm4 += a2 * b2;
11830  }
11831 
11832  (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
11833  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - (xmm2+xmm6) * factor );
11834  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - (xmm3+xmm7) * factor );
11835  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - (xmm4+xmm8) * factor );
11836  }
11837 
11838  if( i < iend )
11839  {
11840  const size_t kbegin( ( IsUpper<MT4>::value )
11841  ?( ( IsLower<MT5>::value )
11842  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11843  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11844  :( IsLower<MT5>::value ? j : 0UL ) );
11845  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
11846 
11847  SIMDType xmm1, xmm2, xmm3, xmm4;
11848  size_t k( kbegin );
11849 
11850  for( ; (k+2UL) <= kend; k+=2UL ) {
11851  const SIMDType a1( set( A(i,k ) ) );
11852  const SIMDType a2( set( A(i,k+1UL) ) );
11853  xmm1 += a1 * B.load(k ,j );
11854  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
11855  xmm3 += a2 * B.load(k+1UL,j );
11856  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
11857  }
11858 
11859  for( ; k<kend; ++k ) {
11860  const SIMDType a1( set( A(i,k) ) );
11861  xmm1 += a1 * B.load(k,j );
11862  xmm2 += a1 * B.load(k,j+SIMDSIZE);
11863  }
11864 
11865  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
11866  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - (xmm2+xmm4) * factor );
11867  }
11868  }
11869 
11870  for( ; j<jpos; j+=SIMDSIZE )
11871  {
11872  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
11873  size_t i( LOW ? j : 0UL );
11874 
11875  for( ; (i+4UL) <= iend; i+=4UL )
11876  {
11877  const size_t kbegin( ( IsUpper<MT4>::value )
11878  ?( ( IsLower<MT5>::value )
11879  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11880  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11881  :( IsLower<MT5>::value ? j : 0UL ) );
11882  const size_t kend( ( IsLower<MT4>::value )
11883  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
11884  :( K ) );
11885 
11886  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11887  size_t k( kbegin );
11888 
11889  for( ; (k+2UL) <= kend; k+=2UL ) {
11890  const SIMDType b1( B.load(k ,j) );
11891  const SIMDType b2( B.load(k+1UL,j) );
11892  xmm1 += set( A(i ,k ) ) * b1;
11893  xmm2 += set( A(i+1UL,k ) ) * b1;
11894  xmm3 += set( A(i+2UL,k ) ) * b1;
11895  xmm4 += set( A(i+3UL,k ) ) * b1;
11896  xmm5 += set( A(i ,k+1UL) ) * b2;
11897  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
11898  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
11899  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
11900  }
11901 
11902  for( ; k<kend; ++k ) {
11903  const SIMDType b1( B.load(k,j) );
11904  xmm1 += set( A(i ,k) ) * b1;
11905  xmm2 += set( A(i+1UL,k) ) * b1;
11906  xmm3 += set( A(i+2UL,k) ) * b1;
11907  xmm4 += set( A(i+3UL,k) ) * b1;
11908  }
11909 
11910  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm5) * factor );
11911  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm6) * factor );
11912  (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm7) * factor );
11913  (~C).store( i+3UL, j, (~C).load(i+3UL,j) - (xmm4+xmm8) * factor );
11914  }
11915 
11916  for( ; (i+3UL) <= iend; i+=3UL )
11917  {
11918  const size_t kbegin( ( IsUpper<MT4>::value )
11919  ?( ( IsLower<MT5>::value )
11920  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11921  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11922  :( IsLower<MT5>::value ? j : 0UL ) );
11923  const size_t kend( ( IsLower<MT4>::value )
11924  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
11925  :( K ) );
11926 
11927  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11928  size_t k( kbegin );
11929 
11930  for( ; (k+2UL) <= kend; k+=2UL ) {
11931  const SIMDType b1( B.load(k ,j) );
11932  const SIMDType b2( B.load(k+1UL,j) );
11933  xmm1 += set( A(i ,k ) ) * b1;
11934  xmm2 += set( A(i+1UL,k ) ) * b1;
11935  xmm3 += set( A(i+2UL,k ) ) * b1;
11936  xmm4 += set( A(i ,k+1UL) ) * b2;
11937  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
11938  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
11939  }
11940 
11941  for( ; k<kend; ++k ) {
11942  const SIMDType b1( B.load(k,j) );
11943  xmm1 += set( A(i ,k) ) * b1;
11944  xmm2 += set( A(i+1UL,k) ) * b1;
11945  xmm3 += set( A(i+2UL,k) ) * b1;
11946  }
11947 
11948  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm4) * factor );
11949  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm5) * factor );
11950  (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm6) * factor );
11951  }
11952 
11953  for( ; (i+2UL) <= iend; i+=2UL )
11954  {
11955  const size_t kbegin( ( IsUpper<MT4>::value )
11956  ?( ( IsLower<MT5>::value )
11957  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11958  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11959  :( IsLower<MT5>::value ? j : 0UL ) );
11960  const size_t kend( ( IsLower<MT4>::value )
11961  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
11962  :( K ) );
11963 
11964  SIMDType xmm1, xmm2, xmm3, xmm4;
11965  size_t k( kbegin );
11966 
11967  for( ; (k+2UL) <= kend; k+=2UL ) {
11968  const SIMDType b1( B.load(k ,j) );
11969  const SIMDType b2( B.load(k+1UL,j) );
11970  xmm1 += set( A(i ,k ) ) * b1;
11971  xmm2 += set( A(i+1UL,k ) ) * b1;
11972  xmm3 += set( A(i ,k+1UL) ) * b2;
11973  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
11974  }
11975 
11976  for( ; k<kend; ++k ) {
11977  const SIMDType b1( B.load(k,j) );
11978  xmm1 += set( A(i ,k) ) * b1;
11979  xmm2 += set( A(i+1UL,k) ) * b1;
11980  }
11981 
11982  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
11983  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm4) * factor );
11984  }
11985 
11986  if( i < iend )
11987  {
11988  const size_t kbegin( ( IsUpper<MT4>::value )
11989  ?( ( IsLower<MT5>::value )
11990  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11991  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11992  :( IsLower<MT5>::value ? j : 0UL ) );
11993 
11994  SIMDType xmm1, xmm2;
11995  size_t k( kbegin );
11996 
11997  for( ; (k+2UL) <= K; k+=2UL ) {
11998  xmm1 += set( A(i,k ) ) * B.load(k ,j);
11999  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
12000  }
12001 
12002  for( ; k<K; ++k ) {
12003  xmm1 += set( A(i,k) ) * B.load(k,j);
12004  }
12005 
12006  (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
12007  }
12008  }
12009 
12010  for( ; remainder && j<N; ++j )
12011  {
12012  const size_t iend( UPP ? j+1UL : M );
12013  size_t i( LOW ? j : 0UL );
12014 
12015  for( ; (i+2UL) <= iend; i+=2UL )
12016  {
12017  const size_t kbegin( ( IsUpper<MT4>::value )
12018  ?( ( IsLower<MT5>::value )
12019  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
12020  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
12021  :( IsLower<MT5>::value ? j : 0UL ) );
12022  const size_t kend( ( IsLower<MT4>::value )
12023  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
12024  :( K ) );
12025 
12026  ElementType value1 = ElementType();
12027  ElementType value2 = ElementType();
12028 
12029  for( size_t k=kbegin; k<kend; ++k ) {
12030  value1 += A(i ,k) * B(k,j);
12031  value2 += A(i+1UL,k) * B(k,j);
12032  }
12033 
12034  (~C)(i ,j) -= value1 * scalar;
12035  (~C)(i+1UL,j) -= value2 * scalar;
12036  }
12037 
12038  if( i < iend )
12039  {
12040  const size_t kbegin( ( IsUpper<MT4>::value )
12041  ?( ( IsLower<MT5>::value )
12042  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
12043  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
12044  :( IsLower<MT5>::value ? j : 0UL ) );
12045 
12046  ElementType value = ElementType();
12047 
12048  for( size_t k=kbegin; k<K; ++k ) {
12049  value += A(i,k) * B(k,j);
12050  }
12051 
12052  (~C)(i,j) -= value * scalar;
12053  }
12054  }
12055  }
12056  //**********************************************************************************************
12057 
12058  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
12073  template< typename MT3 // Type of the left-hand side target matrix
12074  , typename MT4 // Type of the left-hand side matrix operand
12075  , typename MT5 // Type of the right-hand side matrix operand
12076  , typename ST2 > // Type of the scalar value
12078  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
12079  {
12080  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
12081 
12082  const size_t M( A.rows() );
12083  const size_t N( B.columns() );
12084  const size_t K( A.columns() );
12085 
12086  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
12087 
12088  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
12089  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
12090 
12091  const SIMDType factor( set( scalar ) );
12092 
12093  size_t i( 0UL );
12094 
12096  {
12097  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
12098  for( size_t j=0UL; j<N; ++j )
12099  {
12100  const size_t kbegin( ( IsLower<MT5>::value )
12101  ?( ( IsUpper<MT4>::value )
12102  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12103  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12104  :( IsUpper<MT4>::value ? i : 0UL ) );
12105  const size_t kend( ( IsUpper<MT5>::value )
12106  ?( ( IsLower<MT4>::value )
12107  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
12108  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
12109  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
12110 
12111  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12112 
12113  for( size_t k=kbegin; k<kend; ++k ) {
12114  const SIMDType b1( set( B(k,j) ) );
12115  xmm1 += A.load(i ,k) * b1;
12116  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12117  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12118  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12119  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12120  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
12121  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
12122  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
12123  }
12124 
12125  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12126  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12127  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12128  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12129  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12130  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
12131  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
12132  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
12133  }
12134  }
12135  }
12136 
12137  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
12138  {
12139  size_t j( 0UL );
12140 
12141  for( ; (j+2UL) <= N; j+=2UL )
12142  {
12143  const size_t kbegin( ( IsLower<MT5>::value )
12144  ?( ( IsUpper<MT4>::value )
12145  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12146  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12147  :( IsUpper<MT4>::value ? i : 0UL ) );
12148  const size_t kend( ( IsUpper<MT5>::value )
12149  ?( ( IsLower<MT4>::value )
12150  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
12151  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
12152  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
12153 
12154  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
12155 
12156  for( size_t k=kbegin; k<kend; ++k ) {
12157  const SIMDType a1( A.load(i ,k) );
12158  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12159  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12160  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12161  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
12162  const SIMDType b1( set( B(k,j ) ) );
12163  const SIMDType b2( set( B(k,j+1UL) ) );
12164  xmm1 += a1 * b1;
12165  xmm2 += a2 * b1;
12166  xmm3 += a3 * b1;
12167  xmm4 += a4 * b1;
12168  xmm5 += a5 * b1;
12169  xmm6 += a1 * b2;
12170  xmm7 += a2 * b2;
12171  xmm8 += a3 * b2;
12172  xmm9 += a4 * b2;
12173  xmm10 += a5 * b2;
12174  }
12175 
12176  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12177  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12178  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12179  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12180  (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
12181  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
12182  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
12183  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
12184  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
12185  (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
12186  }
12187 
12188  if( j < N )
12189  {
12190  const size_t kbegin( ( IsLower<MT5>::value )
12191  ?( ( IsUpper<MT4>::value )
12192  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12193  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12194  :( IsUpper<MT4>::value ? i : 0UL ) );
12195  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
12196 
12197  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
12198 
12199  for( size_t k=kbegin; k<kend; ++k ) {
12200  const SIMDType b1( set( B(k,j) ) );
12201  xmm1 += A.load(i ,k) * b1;
12202  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12203  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12204  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12205  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12206  }
12207 
12208  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12209  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12210  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12211  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12212  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12213  }
12214  }
12215 
12216  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
12217  {
12218  size_t j( 0UL );
12219 
12220  for( ; (j+2UL) <= N; j+=2UL )
12221  {
12222  const size_t kbegin( ( IsLower<MT5>::value )
12223  ?( ( IsUpper<MT4>::value )
12224  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12225  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12226  :( IsUpper<MT4>::value ? i : 0UL ) );
12227  const size_t kend( ( IsUpper<MT5>::value )
12228  ?( ( IsLower<MT4>::value )
12229  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
12230  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
12231  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
12232 
12233  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12234 
12235  for( size_t k=kbegin; k<kend; ++k ) {
12236  const SIMDType a1( A.load(i ,k) );
12237  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12238  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12239  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12240  const SIMDType b1( set( B(k,j ) ) );
12241  const SIMDType b2( set( B(k,j+1UL) ) );
12242  xmm1 += a1 * b1;
12243  xmm2 += a2 * b1;
12244  xmm3 += a3 * b1;
12245  xmm4 += a4 * b1;
12246  xmm5 += a1 * b2;
12247  xmm6 += a2 * b2;
12248  xmm7 += a3 * b2;
12249  xmm8 += a4 * b2;
12250  }
12251 
12252  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12253  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12254  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12255  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12256  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
12257  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
12258  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
12259  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
12260  }
12261 
12262  if( j < N )
12263  {
12264  const size_t kbegin( ( IsLower<MT5>::value )
12265  ?( ( IsUpper<MT4>::value )
12266  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12267  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12268  :( IsUpper<MT4>::value ? i : 0UL ) );
12269  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
12270 
12271  SIMDType xmm1, xmm2, xmm3, xmm4;
12272 
12273  for( size_t k=kbegin; k<kend; ++k ) {
12274  const SIMDType b1( set( B(k,j) ) );
12275  xmm1 += A.load(i ,k) * b1;
12276  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12277  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12278  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12279  }
12280 
12281  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12282  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12283  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12284  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12285  }
12286  }
12287 
12288  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
12289  {
12290  size_t j( 0UL );
12291 
12292  for( ; (j+2UL) <= N; j+=2UL )
12293  {
12294  const size_t kbegin( ( IsLower<MT5>::value )
12295  ?( ( IsUpper<MT4>::value )
12296  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12297  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12298  :( IsUpper<MT4>::value ? i : 0UL ) );
12299  const size_t kend( ( IsUpper<MT5>::value )
12300  ?( ( IsLower<MT4>::value )
12301  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
12302  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
12303  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
12304 
12305  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12306 
12307  for( size_t k=kbegin; k<kend; ++k ) {
12308  const SIMDType a1( A.load(i ,k) );
12309  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12310  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12311  const SIMDType b1( set( B(k,j ) ) );
12312  const SIMDType b2( set( B(k,j+1UL) ) );
12313  xmm1 += a1 * b1;
12314  xmm2 += a2 * b1;
12315  xmm3 += a3 * b1;
12316  xmm4 += a1 * b2;
12317  xmm5 += a2 * b2;
12318  xmm6 += a3 * b2;
12319  }
12320 
12321  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12322  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12323  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12324  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
12325  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
12326  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
12327  }
12328 
12329  if( j < N )
12330  {
12331  const size_t kbegin( ( IsLower<MT5>::value )
12332  ?( ( IsUpper<MT4>::value )
12333  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12334  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12335  :( IsUpper<MT4>::value ? i : 0UL ) );
12336  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
12337 
12338  SIMDType xmm1, xmm2, xmm3;
12339 
12340  for( size_t k=kbegin; k<kend; ++k ) {
12341  const SIMDType b1( set( B(k,j) ) );
12342  xmm1 += A.load(i ,k) * b1;
12343  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12344  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12345  }
12346 
12347  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12348  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12349  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12350  }
12351  }
12352 
12353  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
12354  {
12355  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
12356  size_t j( UPP ? i : 0UL );
12357 
12358  for( ; (j+4UL) <= jend; j+=4UL )
12359  {
12360  const size_t kbegin( ( IsLower<MT5>::value )
12361  ?( ( IsUpper<MT4>::value )
12362  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12363  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12364  :( IsUpper<MT4>::value ? i : 0UL ) );
12365  const size_t kend( ( IsUpper<MT5>::value )
12366  ?( ( IsLower<MT4>::value )
12367  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
12368  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
12369  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
12370 
12371  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12372 
12373  for( size_t k=kbegin; k<kend; ++k ) {
12374  const SIMDType a1( A.load(i ,k) );
12375  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12376  const SIMDType b1( set( B(k,j ) ) );
12377  const SIMDType b2( set( B(k,j+1UL) ) );
12378  const SIMDType b3( set( B(k,j+2UL) ) );
12379  const SIMDType b4( set( B(k,j+3UL) ) );
12380  xmm1 += a1 * b1;
12381  xmm2 += a2 * b1;
12382  xmm3 += a1 * b2;
12383  xmm4 += a2 * b2;
12384  xmm5 += a1 * b3;
12385  xmm6 += a2 * b3;
12386  xmm7 += a1 * b4;
12387  xmm8 += a2 * b4;
12388  }
12389 
12390  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12391  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
12392  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
12393  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12394  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
12395  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12396  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
12397  (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
12398  }
12399 
12400  for( ; (j+3UL) <= jend; j+=3UL )
12401  {
12402  const size_t kbegin( ( IsLower<MT5>::value )
12403  ?( ( IsUpper<MT4>::value )
12404  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12405  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12406  :( IsUpper<MT4>::value ? i : 0UL ) );
12407  const size_t kend( ( IsUpper<MT5>::value )
12408  ?( ( IsLower<MT4>::value )
12409  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
12410  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
12411  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
12412 
12413  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12414 
12415  for( size_t k=kbegin; k<kend; ++k ) {
12416  const SIMDType a1( A.load(i ,k) );
12417  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12418  const SIMDType b1( set( B(k,j ) ) );
12419  const SIMDType b2( set( B(k,j+1UL) ) );
12420  const SIMDType b3( set( B(k,j+2UL) ) );
12421  xmm1 += a1 * b1;
12422  xmm2 += a2 * b1;
12423  xmm3 += a1 * b2;
12424  xmm4 += a2 * b2;
12425  xmm5 += a1 * b3;
12426  xmm6 += a2 * b3;
12427  }
12428 
12429  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12430  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
12431  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
12432  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12433  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
12434  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12435  }
12436 
12437  for( ; (j+2UL) <= jend; j+=2UL )
12438  {
12439  const size_t kbegin( ( IsLower<MT5>::value )
12440  ?( ( IsUpper<MT4>::value )
12441  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12442  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12443  :( IsUpper<MT4>::value ? i : 0UL ) );
12444  const size_t kend( ( IsUpper<MT5>::value )
12445  ?( ( IsLower<MT4>::value )
12446  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
12447  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
12448  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
12449 
12450  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12451  size_t k( kbegin );
12452 
12453  for( ; (k+2UL) <= kend; k+=2UL ) {
12454  const SIMDType a1( A.load(i ,k ) );
12455  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
12456  const SIMDType a3( A.load(i ,k+1UL) );
12457  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
12458  const SIMDType b1( set( B(k ,j ) ) );
12459  const SIMDType b2( set( B(k ,j+1UL) ) );
12460  const SIMDType b3( set( B(k+1UL,j ) ) );
12461  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
12462  xmm1 += a1 * b1;
12463  xmm2 += a2 * b1;
12464  xmm3 += a1 * b2;
12465  xmm4 += a2 * b2;
12466  xmm5 += a3 * b3;
12467  xmm6 += a4 * b3;
12468  xmm7 += a3 * b4;
12469  xmm8 += a4 * b4;
12470  }
12471 
12472  for( ; k<kend; ++k ) {
12473  const SIMDType a1( A.load(i ,k) );
12474  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12475  const SIMDType b1( set( B(k,j ) ) );
12476  const SIMDType b2( set( B(k,j+1UL) ) );
12477  xmm1 += a1 * b1;
12478  xmm2 += a2 * b1;
12479  xmm3 += a1 * b2;
12480  xmm4 += a2 * b2;
12481  }
12482 
12483  (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
12484  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
12485  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - (xmm3+xmm7) * factor );
12486  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
12487  }
12488 
12489  if( j < jend )
12490  {
12491  const size_t kbegin( ( IsLower<MT5>::value )
12492  ?( ( IsUpper<MT4>::value )
12493  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12494  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12495  :( IsUpper<MT4>::value ? i : 0UL ) );
12496  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
12497 
12498  SIMDType xmm1, xmm2, xmm3, xmm4;
12499  size_t k( kbegin );
12500 
12501  for( ; (k+2UL) <= kend; k+=2UL ) {
12502  const SIMDType b1( set( B(k ,j) ) );
12503  const SIMDType b2( set( B(k+1UL,j) ) );
12504  xmm1 += A.load(i ,k ) * b1;
12505  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
12506  xmm3 += A.load(i ,k+1UL) * b2;
12507  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
12508  }
12509 
12510  for( ; k<kend; ++k ) {
12511  const SIMDType b1( set( B(k,j) ) );
12512  xmm1 += A.load(i ,k) * b1;
12513  xmm2 += A.load(i+SIMDSIZE,k) * b1;
12514  }
12515 
12516  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
12517  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
12518  }
12519  }
12520 
12521  for( ; i<ipos; i+=SIMDSIZE )
12522  {
12523  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
12524  size_t j( UPP ? i : 0UL );
12525 
12526  for( ; (j+4UL) <= jend; j+=4UL )
12527  {
12528  const size_t kbegin( ( IsLower<MT5>::value )
12529  ?( ( IsUpper<MT4>::value )
12530  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12531  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12532  :( IsUpper<MT4>::value ? i : 0UL ) );
12533  const size_t kend( ( IsUpper<MT5>::value )
12534  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
12535  :( K ) );
12536 
12537  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12538  size_t k( kbegin );
12539 
12540  for( ; (k+2UL) <= kend; k+=2UL ) {
12541  const SIMDType a1( A.load(i,k ) );
12542  const SIMDType a2( A.load(i,k+1UL) );
12543  xmm1 += a1 * set( B(k ,j ) );
12544  xmm2 += a1 * set( B(k ,j+1UL) );
12545  xmm3 += a1 * set( B(k ,j+2UL) );
12546  xmm4 += a1 * set( B(k ,j+3UL) );
12547  xmm5 += a2 * set( B(k+1UL,j ) );
12548  xmm6 += a2 * set( B(k+1UL,j+1UL) );
12549  xmm7 += a2 * set( B(k+1UL,j+2UL) );
12550  xmm8 += a2 * set( B(k+1UL,j+3UL) );
12551  }
12552 
12553  for( ; k<kend; ++k ) {
12554  const SIMDType a1( A.load(i,k) );
12555  xmm1 += a1 * set( B(k,j ) );
12556  xmm2 += a1 * set( B(k,j+1UL) );
12557  xmm3 += a1 * set( B(k,j+2UL) );
12558  xmm4 += a1 * set( B(k,j+3UL) );
12559  }
12560 
12561  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm5) * factor );
12562  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm6) * factor );
12563  (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm7) * factor );
12564  (~C).store( i, j+3UL, (~C).load(i,j+3UL) - (xmm4+xmm8) * factor );
12565  }
12566 
12567  for( ; (j+3UL) <= jend; j+=3UL )
12568  {
12569  const size_t kbegin( ( IsLower<MT5>::value )
12570  ?( ( IsUpper<MT4>::value )
12571  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12572  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12573  :( IsUpper<MT4>::value ? i : 0UL ) );
12574  const size_t kend( ( IsUpper<MT5>::value )
12575  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
12576  :( K ) );
12577 
12578  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12579  size_t k( kbegin );
12580 
12581  for( ; (k+2UL) <= kend; k+=2UL ) {
12582  const SIMDType a1( A.load(i,k ) );
12583  const SIMDType a2( A.load(i,k+1UL) );
12584  xmm1 += a1 * set( B(k ,j ) );
12585  xmm2 += a1 * set( B(k ,j+1UL) );
12586  xmm3 += a1 * set( B(k ,j+2UL) );
12587  xmm4 += a2 * set( B(k+1UL,j ) );
12588  xmm5 += a2 * set( B(k+1UL,j+1UL) );
12589  xmm6 += a2 * set( B(k+1UL,j+2UL) );
12590  }
12591 
12592  for( ; k<kend; ++k ) {
12593  const SIMDType a1( A.load(i,k) );
12594  xmm1 += a1 * set( B(k,j ) );
12595  xmm2 += a1 * set( B(k,j+1UL) );
12596  xmm3 += a1 * set( B(k,j+2UL) );
12597  }
12598 
12599  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm4) * factor );
12600  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm5) * factor );
12601  (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm6) * factor );
12602  }
12603 
12604  for( ; (j+2UL) <= jend; j+=2UL )
12605  {
12606  const size_t kbegin( ( IsLower<MT5>::value )
12607  ?( ( IsUpper<MT4>::value )
12608  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12609  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12610  :( IsUpper<MT4>::value ? i : 0UL ) );
12611  const size_t kend( ( IsUpper<MT5>::value )
12612  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
12613  :( K ) );
12614 
12615  SIMDType xmm1, xmm2, xmm3, xmm4;
12616  size_t k( kbegin );
12617 
12618  for( ; (k+2UL) <= kend; k+=2UL ) {
12619  const SIMDType a1( A.load(i,k ) );
12620  const SIMDType a2( A.load(i,k+1UL) );
12621  xmm1 += a1 * set( B(k ,j ) );
12622  xmm2 += a1 * set( B(k ,j+1UL) );
12623  xmm3 += a2 * set( B(k+1UL,j ) );
12624  xmm4 += a2 * set( B(k+1UL,j+1UL) );
12625  }
12626 
12627  for( ; k<kend; ++k ) {
12628  const SIMDType a1( A.load(i,k) );
12629  xmm1 += a1 * set( B(k,j ) );
12630  xmm2 += a1 * set( B(k,j+1UL) );
12631  }
12632 
12633  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
12634  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm4) * factor );
12635  }
12636 
12637  if( j < jend )
12638  {
12639  const size_t kbegin( ( IsLower<MT5>::value )
12640  ?( ( IsUpper<MT4>::value )
12641  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12642  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12643  :( IsUpper<MT4>::value ? i : 0UL ) );
12644 
12645  SIMDType xmm1, xmm2;
12646  size_t k( kbegin );
12647 
12648  for( ; (k+2UL) <= K; k+=2UL ) {
12649  xmm1 += A.load(i,k ) * set( B(k ,j) );
12650  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
12651  }
12652 
12653  for( ; k<K; ++k ) {
12654  xmm1 += A.load(i,k) * set( B(k,j) );
12655  }
12656 
12657  (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
12658  }
12659  }
12660 
12661  for( ; remainder && i<M; ++i )
12662  {
12663  const size_t jend( LOW ? i+1UL : N );
12664  size_t j( UPP ? i : 0UL );
12665 
12666  for( ; (j+2UL) <= jend; j+=2UL )
12667  {
12668  const size_t kbegin( ( IsLower<MT5>::value )
12669  ?( ( IsUpper<MT4>::value )
12670  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12671  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12672  :( IsUpper<MT4>::value ? i : 0UL ) );
12673  const size_t kend( ( IsUpper<MT5>::value )
12674  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
12675  :( K ) );
12676 
12677  ElementType value1 = ElementType();
12678  ElementType value2 = ElementType();
12679 
12680  for( size_t k=kbegin; k<kend; ++k ) {
12681  value1 += A(i,k) * B(k,j );
12682  value2 += A(i,k) * B(k,j+1UL);
12683  }
12684 
12685  (~C)(i,j ) -= value1 * scalar;
12686  (~C)(i,j+1UL) -= value2 * scalar;
12687  }
12688 
12689  if( j < jend )
12690  {
12691  const size_t kbegin( ( IsLower<MT5>::value )
12692  ?( ( IsUpper<MT4>::value )
12693  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12694  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12695  :( IsUpper<MT4>::value ? i : 0UL ) );
12696 
12697  ElementType value = ElementType();
12698 
12699  for( size_t k=kbegin; k<K; ++k ) {
12700  value += A(i,k) * B(k,j);
12701  }
12702 
12703  (~C)(i,j) -= value * scalar;
12704  }
12705  }
12706  }
12707  //**********************************************************************************************
12708 
12709  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Fallback large-matrix kernel for the scaled subtraction assignment: it does no work of its
// own and forwards all four arguments unchanged to selectDefaultSubAssignKernel().
// NOTE(review): the listing skips source line 12727 between the template header and the
// function name, so the declaration specifiers/return type are not visible here -- confirm
// against the original header.
//
// C      : target matrix (see the MT3 template parameter)
// A, B   : left-hand and right-hand side matrix operands
// scalar : scalar value
12723  template< typename MT3 // Type of the left-hand side target matrix
12724  , typename MT4 // Type of the left-hand side matrix operand
12725  , typename MT5 // Type of the right-hand side matrix operand
12726  , typename ST2 > // Type of the scalar value
12728  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12729  {
12730  selectDefaultSubAssignKernel( C, A, B, scalar );
12731  }
12732  //**********************************************************************************************
12733 
12734  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Large-matrix kernel for the scaled subtraction assignment (the vectorized overload according
// to the section banner). It selects one of three multiplication routines from the LOW/UPP
// flags, which are declared outside the visible lines:
//   LOW set           -> lmmm()
//   UPP set (not LOW) -> ummm()
//   neither           -> mmm()
// Every routine receives the negated scalar and ST2(1).
// NOTE(review): presumably these two values act as the factors of the product and of the
// existing target, i.e. C = C - scalar*A*B -- confirm against blaze/math/dense/MMM.h.
// NOTE(review): the listing skips source line 12753, so the declaration specifiers/return type
// are not visible here.
12749  template< typename MT3 // Type of the left-hand side target matrix
12750  , typename MT4 // Type of the left-hand side matrix operand
12751  , typename MT5 // Type of the right-hand side matrix operand
12752  , typename ST2 > // Type of the scalar value
12754  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12755  {
12756  if( LOW )
12757  lmmm( C, A, B, -scalar, ST2(1) );
12758  else if( UPP )
12759  ummm( C, A, B, -scalar, ST2(1) );
12760  else
12761  mmm( C, A, B, -scalar, ST2(1) );
12762  }
12763  //**********************************************************************************************
12764 
12765  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Fallback BLAS kernel for the scaled subtraction assignment: it does not call any BLAS routine
// itself and forwards all four arguments unchanged to selectLargeSubAssignKernel().
// NOTE(review): the listing skips source line 12783, so the declaration specifiers/return type
// (and with them the condition that selects this overload) are not visible here.
12779  template< typename MT3 // Type of the left-hand side target matrix
12780  , typename MT4 // Type of the left-hand side matrix operand
12781  , typename MT5 // Type of the right-hand side matrix operand
12782  , typename ST2 > // Type of the scalar value
12784  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12785  {
12786  selectLargeSubAssignKernel( C, A, B, scalar );
12787  }
12788  //**********************************************************************************************
12789 
12790  //**BLAS-based subtraction assignment to dense matrices*****************************************
12791 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
12792 
// BLAS-based kernel for the scaled subtraction assignment. Only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled (see the
// surrounding #if/#endif). Three cases are distinguished at compile time:
//   1. A is triangular: B is copied into a temporary, trmm() is applied to the temporary with A
//      from the left, and the temporary is subtracted from C.
//   2. B is triangular: A is copied into a temporary, trmm() is applied to the temporary with B
//      from the right, and the temporary is subtracted from C.
//   3. otherwise: a single gemm() call on C with the factors ET(-scalar) and ET(1).
// NOTE(review): presumably trmm() overwrites the temporary with the scaled triangular product
// and gemm() computes C = -scalar*A*B + 1*C -- confirm against blaze/math/blas/trmm.h and
// blaze/math/blas/gemm.h.
// NOTE(review): the listing skips source line 12809, so the declaration specifiers/return type
// are not visible here.
12805  template< typename MT3 // Type of the left-hand side target matrix
12806  , typename MT4 // Type of the left-hand side matrix operand
12807  , typename MT5 // Type of the right-hand side matrix operand
12808  , typename ST2 > // Type of the scalar value
12810  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12811  {
       // All scalar factors are converted to the element type of the target matrix.
12812  using ET = ElementType_<MT3>;
12813 
12814  if( IsTriangular<MT4>::value ) {
       // The temporary has the result type of the target and starts out as a copy of B.
12815  ResultType_<MT3> tmp( serial( B ) );
       // Lower/upper flag is chosen from the structure of A; the scalar is passed unnegated
       // because the subtraction happens in the subAssign() call below.
12816  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
12817  subAssign( C, tmp );
12818  }
12819  else if( IsTriangular<MT5>::value ) {
       // Mirror image of the first case: the temporary starts out as a copy of A.
12820  ResultType_<MT3> tmp( serial( A ) );
12821  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
12822  subAssign( C, tmp );
12823  }
12824  else {
       // No temporary needed: the sign is folded into the first scalar factor.
12825  gemm( C, A, B, ET(-scalar), ET(1) );
12826  }
12827  }
12828 #endif
12829  //**********************************************************************************************
12830 
12831  //**Subtraction assignment to sparse matrices***************************************************
12832  // No special implementation for the subtraction assignment to sparse matrices.
12833  //**********************************************************************************************
12834 
12835  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of a scaled multiplication expression to a dense matrix.
// The expression rhs is first evaluated (wrapped in serial()) into a temporary of type
// ResultType; the temporary is then passed on to schurAssign() for the target matrix.
//
// lhs : target dense matrix
// rhs : right-hand side DMatScalarMultExpr expression
// NOTE(review): the listing skips source lines 12851 and 12853-12855 inside the body, so
// statements located there are not visible in this chunk.
12847  template< typename MT // Type of the target dense matrix
12848  , bool SO > // Storage order of the target dense matrix
12849  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
12850  {
12852 
12856 
12857  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12858  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12859 
       // Evaluate the whole expression once, then apply the element-wise product to the target.
12860  const ResultType tmp( serial( rhs ) );
12861  schurAssign( ~lhs, tmp );
12862  }
12863  //**********************************************************************************************
12864 
12865  //**Schur product assignment to sparse matrices*************************************************
12866  // No special implementation for the Schur product assignment to sparse matrices.
12867  //**********************************************************************************************
12868 
12869  //**Multiplication assignment to dense matrices*************************************************
12870  // No special implementation for the multiplication assignment to dense matrices.
12871  //**********************************************************************************************
12872 
12873  //**Multiplication assignment to sparse matrices************************************************
12874  // No special implementation for the multiplication assignment to sparse matrices.
12875  //**********************************************************************************************
12876 
12877  //**SMP assignment to dense matrices************************************************************
// SMP assignment of a scaled multiplication expression to a dense matrix.
// The two operands of the wrapped multiplication (rhs.matrix_) are evaluated into A and B, and
// the work is re-dispatched as smpAssign( ~lhs, A * B * rhs.scalar_ ).
//
// lhs : target dense matrix
// rhs : right-hand side DMatScalarMultExpr expression
// NOTE(review): MMM, LT and RT are declared outside the visible lines. The listing also skips
// source lines 12894 and 12897, so the declaration specifiers/return type are not visible here.
12892  template< typename MT // Type of the target dense matrix
12893  , bool SO > // Storage order of the target dense matrix
12895  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
12896  {
12898 
12899  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12900  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12901 
12902  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
12903  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
12904 
       // An empty target needs no work. A zero inner dimension (left.columns() == 0) leaves no
       // terms to sum, so the target is only reset.
12905  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
12906  return;
12907  }
12908  else if( left.columns() == 0UL ) {
12909  reset( ~lhs );
12910  return;
12911  }
12912 
12913  LT A( left ); // Evaluation of the left-hand side dense matrix operand
12914  RT B( right ); // Evaluation of the right-hand side dense matrix operand
12915 
12916  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
12917  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
12918  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
12919  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
12920  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
12921  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
12922 
       // Rebuild the scaled product from the evaluated operands and hand it to smpAssign().
12923  smpAssign( ~lhs, A * B * rhs.scalar_ );
12924  }
12925  //**********************************************************************************************
12926 
12927  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of a scaled multiplication expression to a sparse matrix (per the section
// banner). The expression rhs is evaluated into a temporary of type TmpType, which is passed
// through the functor fwd and then handed to smpAssign() for the target.
// NOTE(review): the listing skips source lines 12944-12945 (the function signature) as well as
// 12947, 12949 and 12951-12956, so the function name, the parameter list and the declarations
// of TmpType and ForwardFunctor are not visible in this chunk -- confirm against the original
// header.
12942  template< typename MT // Type of the target sparse matrix
12943  , bool SO > // Storage order of the target sparse matrix
12946  {
12948 
12950 
12957 
12958  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12959  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12960 
12961  const ForwardFunctor fwd;
12962 
       // Evaluate the expression first; the target receives fwd( tmp ).
12963  const TmpType tmp( rhs );
12964  smpAssign( ~lhs, fwd( tmp ) );
12965  }
12966  //**********************************************************************************************
12967 
12968  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of a scaled multiplication expression to a dense matrix (per the
// section banner). The two operands of the wrapped multiplication (rhs.matrix_) are evaluated
// into A and B, and the work is re-dispatched as smpAddAssign( ~lhs, A * B * rhs.scalar_ ).
// NOTE(review): the listing skips source lines 12985-12986 (the function signature) and 12988,
// so the function name and parameter list are not visible in this chunk. MMM, LT and RT are
// declared outside the visible lines.
12983  template< typename MT // Type of the target dense matrix
12984  , bool SO > // Storage order of the target dense matrix
12987  {
12989 
12990  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12991  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12992 
12993  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
12994  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
12995 
       // Return without touching the target if it is empty or if the inner dimension
       // (left.columns()) is zero.
12996  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
12997  return;
12998  }
12999 
13000  LT A( left ); // Evaluation of the left-hand side dense matrix operand
13001  RT B( right ); // Evaluation of the right-hand side dense matrix operand
13002 
13003  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13004  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13005  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13006  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13007  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
13008  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
13009 
       // Rebuild the scaled product from the evaluated operands and hand it to smpAddAssign().
13010  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
13011  }
13012  //**********************************************************************************************
13013 
13014  //**SMP addition assignment to sparse matrices**************************************************
13015  // No special implementation for the SMP addition assignment to sparse matrices.
13016  //**********************************************************************************************
13017 
13018  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of a scaled multiplication expression to a dense matrix (per the
// section banner). The two operands of the wrapped multiplication (rhs.matrix_) are evaluated
// into A and B, and the work is re-dispatched as smpSubAssign( ~lhs, A * B * rhs.scalar_ ).
// NOTE(review): the listing skips source lines 13035-13036 (the function signature) and 13038,
// so the function name and parameter list are not visible in this chunk. MMM, LT and RT are
// declared outside the visible lines.
13033  template< typename MT // Type of the target dense matrix
13034  , bool SO > // Storage order of the target dense matrix
13037  {
13039 
13040  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13041  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13042 
13043  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
13044  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
13045 
       // Return without touching the target if it is empty or if the inner dimension
       // (left.columns()) is zero.
13046  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
13047  return;
13048  }
13049 
13050  LT A( left ); // Evaluation of the left-hand side dense matrix operand
13051  RT B( right ); // Evaluation of the right-hand side dense matrix operand
13052 
13053  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13054  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13055  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13056  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13057  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
13058  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
13059 
       // Rebuild the scaled product from the evaluated operands and hand it to smpSubAssign().
13060  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
13061  }
13062  //**********************************************************************************************
13063 
13064  //**SMP subtraction assignment to sparse matrices***********************************************
13065  // No special implementation for the SMP subtraction assignment to sparse matrices.
13066  //**********************************************************************************************
13067 
13068  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of a scaled multiplication expression to a dense matrix.
// The expression rhs is first evaluated into a temporary of type ResultType; the temporary is
// then passed on to smpSchurAssign() for the target matrix. Unlike schurAssign(), the
// evaluation of rhs is not wrapped in serial().
//
// lhs : target dense matrix
// rhs : right-hand side DMatScalarMultExpr expression
// NOTE(review): the listing skips source lines 13084 and 13086-13088 inside the body, so
// statements located there are not visible in this chunk.
13080  template< typename MT // Type of the target dense matrix
13081  , bool SO > // Storage order of the target dense matrix
13082  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13083  {
13085 
13089 
13090  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13091  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13092 
       // Evaluate the whole expression once, then apply the element-wise product to the target.
13093  const ResultType tmp( rhs );
13094  smpSchurAssign( ~lhs, tmp );
13095  }
13096  //**********************************************************************************************
13097 
13098  //**SMP Schur product assignment to sparse matrices*********************************************
13099  // No special implementation for the SMP Schur product assignment to sparse matrices.
13100  //**********************************************************************************************
13101 
13102  //**SMP multiplication assignment to dense matrices*********************************************
13103  // No special implementation for the SMP multiplication assignment to dense matrices.
13104  //**********************************************************************************************
13105 
13106  //**SMP multiplication assignment to sparse matrices********************************************
13107  // No special implementation for the SMP multiplication assignment to sparse matrices.
13108  //**********************************************************************************************
13109 
13110  //**Compile time checks*************************************************************************
13119  //**********************************************************************************************
13120 };
13122 //*************************************************************************************************
13123 
13124 
13125 
13126 
13127 //=================================================================================================
13128 //
13129 // GLOBAL BINARY ARITHMETIC OPERATORS
13130 //
13131 //=================================================================================================
13132 
13133 //*************************************************************************************************
// Multiplication operator for two dense matrices whose storage-order flags are 'true' (lhs) and
// 'false' (rhs).
//
// lhs : left-hand side dense matrix
// rhs : right-hand side dense matrix
// Returns an object of type ReturnType constructed from the two operands.
// Emits an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT if the number of columns of
// lhs differs from the number of rows of rhs.
// NOTE(review): the listing skips source lines 13168 and 13174; ReturnType is not declared in
// the visible lines and is presumably introduced on line 13174 -- confirm against the original
// header.
13163 template< typename MT1 // Type of the left-hand side dense matrix
13164  , typename MT2 > // Type of the right-hand side dense matrix
13165 inline decltype(auto)
13166  operator*( const DenseMatrix<MT1,true>& lhs, const DenseMatrix<MT2,false>& rhs )
13167 {
13169 
       // The inner dimensions of the product have to agree.
13170  if( (~lhs).columns() != (~rhs).rows() ) {
13171  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
13172  }
13173 
13175  return ReturnType( ~lhs, ~rhs );
13176 }
13177 //*************************************************************************************************
13178 
13179 
13180 
13181 
13182 //=================================================================================================
13183 //
13184 // GLOBAL FUNCTIONS
13185 //
13186 //=================================================================================================
13187 
13188 //*************************************************************************************************
// declsym() overload for transpose dense matrix/dense matrix multiplication expressions.
//
// dm : the multiplication expression
// Returns an object of type ReturnType built from the left and right operand of dm.
// Emits an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT if dm is not square.
// NOTE(review): the listing skips source lines 13221 and 13227; ReturnType is not declared in
// the visible lines and is presumably introduced on line 13227 (likely the same expression
// type with the symmetry flag set) -- confirm against the original header.
13213 template< typename MT1 // Type of the left-hand side dense matrix
13214  , typename MT2 // Type of the right-hand side dense matrix
13215  , bool SF // Symmetry flag
13216  , bool HF // Hermitian flag
13217  , bool LF // Lower flag
13218  , bool UF > // Upper flag
13219 inline decltype(auto) declsym( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13220 {
13222 
13223  if( !isSquare( dm ) ) {
13224  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
13225  }
13226 
13228  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13229 }
13231 //*************************************************************************************************
13232 
13233 
13234 //*************************************************************************************************
// declherm() overload for transpose dense matrix/dense matrix multiplication expressions.
//
// dm : the multiplication expression
// Returns an object of type ReturnType built from the left and right operand of dm.
// Emits an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT if dm is not square.
// NOTE(review): the listing skips source lines 13267 and 13273; ReturnType is not declared in
// the visible lines and is presumably introduced on line 13273 (likely the same expression
// type with the Hermitian flag set) -- confirm against the original header.
13259 template< typename MT1 // Type of the left-hand side dense matrix
13260  , typename MT2 // Type of the right-hand side dense matrix
13261  , bool SF // Symmetry flag
13262  , bool HF // Hermitian flag
13263  , bool LF // Lower flag
13264  , bool UF > // Upper flag
13265 inline decltype(auto) declherm( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13266 {
13268 
13269  if( !isSquare( dm ) ) {
13270  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
13271  }
13272 
13274  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13275 }
13277 //*************************************************************************************************
13278 
13279 
13280 //*************************************************************************************************
// decllow() overload for transpose dense matrix/dense matrix multiplication expressions.
//
// dm : the multiplication expression
// Returns an object of type ReturnType built from the left and right operand of dm.
// Emits an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT if dm is not square.
// NOTE(review): the listing skips source lines 13313 and 13319; ReturnType is not declared in
// the visible lines and is presumably introduced on line 13319 (likely the same expression
// type with the lower flag set) -- confirm against the original header.
13305 template< typename MT1 // Type of the left-hand side dense matrix
13306  , typename MT2 // Type of the right-hand side dense matrix
13307  , bool SF // Symmetry flag
13308  , bool HF // Hermitian flag
13309  , bool LF // Lower flag
13310  , bool UF > // Upper flag
13311 inline decltype(auto) decllow( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13312 {
13314 
13315  if( !isSquare( dm ) ) {
13316  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
13317  }
13318 
13320  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13321 }
13323 //*************************************************************************************************
13324 
13325 
13326 //*************************************************************************************************
// declupp() overload for transpose dense matrix/dense matrix multiplication expressions.
//
// dm : the multiplication expression
// Returns an object of type ReturnType built from the left and right operand of dm.
// Emits an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT if dm is not square.
// NOTE(review): the listing skips source lines 13359 and 13365; ReturnType is not declared in
// the visible lines and is presumably introduced on line 13365 (likely the same expression
// type with the upper flag set) -- confirm against the original header.
13351 template< typename MT1 // Type of the left-hand side dense matrix
13352  , typename MT2 // Type of the right-hand side dense matrix
13353  , bool SF // Symmetry flag
13354  , bool HF // Hermitian flag
13355  , bool LF // Lower flag
13356  , bool UF > // Upper flag
13357 inline decltype(auto) declupp( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13358 {
13360 
13361  if( !isSquare( dm ) ) {
13362  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
13363  }
13364 
13366  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13367 }
13369 //*************************************************************************************************
13370 
13371 
13372 //*************************************************************************************************
// decldiag() overload for transpose dense matrix/dense matrix multiplication expressions.
//
// dm : the multiplication expression
// Returns an object of type ReturnType built from the left and right operand of dm.
// Emits an invalid-argument error via BLAZE_THROW_INVALID_ARGUMENT if dm is not square.
// NOTE(review): the listing skips source lines 13405 and 13411; ReturnType is not declared in
// the visible lines and is presumably introduced on line 13411 (likely the same expression
// type with both the lower and the upper flag set) -- confirm against the original header.
13397 template< typename MT1 // Type of the left-hand side dense matrix
13398  , typename MT2 // Type of the right-hand side dense matrix
13399  , bool SF // Symmetry flag
13400  , bool HF // Hermitian flag
13401  , bool LF // Lower flag
13402  , bool UF > // Upper flag
13403 inline decltype(auto) decldiag( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13404 {
13406 
13407  if( !isSquare( dm ) ) {
13408  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
13409  }
13410 
13412  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13413 }
13415 //*************************************************************************************************
13416 
13417 
13418 
13419 
13420 //=================================================================================================
13421 //
13422 // ROWS SPECIALIZATIONS
13423 //
13424 //=================================================================================================
13425 
13426 //*************************************************************************************************
13428 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13429 struct Rows< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13430  : public Rows<MT1>
13431 {};
13433 //*************************************************************************************************
13434 
13435 
13436 
13437 
13438 //=================================================================================================
13439 //
13440 // COLUMNS SPECIALIZATIONS
13441 //
13442 //=================================================================================================
13443 
13444 //*************************************************************************************************
13446 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13447 struct Columns< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13448  : public Columns<MT2>
13449 {};
13451 //*************************************************************************************************
13452 
13453 
13454 
13455 
13456 //=================================================================================================
13457 //
13458 // ISALIGNED SPECIALIZATIONS
13459 //
13460 //=================================================================================================
13461 
13462 //*************************************************************************************************
13464 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13465 struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13466  : public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
13467 {};
13469 //*************************************************************************************************
13470 
13471 
13472 
13473 
13474 //=================================================================================================
13475 //
13476 // ISSYMMETRIC SPECIALIZATIONS
13477 //
13478 //=================================================================================================
13479 
13480 //*************************************************************************************************
13482 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13483 struct IsSymmetric< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13484  : public BoolConstant< Or< Bool<SF>
13485  , And< Bool<HF>
13486  , IsBuiltin< ElementType_< TDMatDMatMultExpr<MT1,MT2,false,true,false,false> > > >
13487  , And< Bool<LF>, Bool<UF> > >::value >
13488 {};
13490 //*************************************************************************************************
13491 
13492 
13493 
13494 
13495 //=================================================================================================
13496 //
13497 // ISHERMITIAN SPECIALIZATIONS
13498 //
13499 //=================================================================================================
13500 
13501 //*************************************************************************************************
13503 template< typename MT1, typename MT2, bool SF, bool LF, bool UF >
13504 struct IsHermitian< TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
13505  : public TrueType
13506 {};
13508 //*************************************************************************************************
13509 
13510 
13511 
13512 
13513 //=================================================================================================
13514 //
13515 // ISLOWER SPECIALIZATIONS
13516 //
13517 //=================================================================================================
13518 
13519 //*************************************************************************************************
13521 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13522 struct IsLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13523  : public BoolConstant< Or< Bool<LF>
13524  , And< IsLower<MT1>, IsLower<MT2> >
13525  , And< Or< Bool<SF>, Bool<HF> >
13526  , IsUpper<MT1>, IsUpper<MT2> > >::value >
13527 {};
13529 //*************************************************************************************************
13530 
13531 
13532 
13533 
13534 //=================================================================================================
13535 //
13536 // ISUNILOWER SPECIALIZATIONS
13537 //
13538 //=================================================================================================
13539 
13540 //*************************************************************************************************
13542 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13543 struct IsUniLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13544  : public BoolConstant< Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
13545  , And< Or< Bool<SF>, Bool<HF> >
13546  , IsUniUpper<MT1>, IsUniUpper<MT2> > >::value >
13547 {};
13549 //*************************************************************************************************
13550 
13551 
13552 
13553 
13554 //=================================================================================================
13555 //
13556 // ISSTRICTLYLOWER SPECIALIZATIONS
13557 //
13558 //=================================================================================================
13559 
13560 //*************************************************************************************************
13562 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13563 struct IsStrictlyLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13564  : public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
13565  , And< IsStrictlyLower<MT2>, IsLower<MT1> >
13566  , And< Or< Bool<SF>, Bool<HF> >
13567  , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
13568  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >::value >
13569 {};
13571 //*************************************************************************************************
13572 
13573 
13574 
13575 
13576 //=================================================================================================
13577 //
13578 // ISUPPER SPECIALIZATIONS
13579 //
13580 //=================================================================================================
13581 
13582 //*************************************************************************************************
13584 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13585 struct IsUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13586  : public BoolConstant< Or< Bool<UF>
13587  , And< IsUpper<MT1>, IsUpper<MT2> >
13588  , And< Or< Bool<SF>, Bool<HF> >
13589  , IsLower<MT1>, IsLower<MT2> > >::value >
13590 {};
13592 //*************************************************************************************************
13593 
13594 
13595 
13596 
13597 //=================================================================================================
13598 //
13599 // ISUNIUPPER SPECIALIZATIONS
13600 //
13601 //=================================================================================================
13602 
13603 //*************************************************************************************************
13605 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13606 struct IsUniUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13607  : public BoolConstant< Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
13608  , And< Or< Bool<SF>, Bool<HF> >
13609  , IsUniLower<MT1>, IsUniLower<MT2> > >::value >
13610 {};
13612 //*************************************************************************************************
13613 
13614 
13615 
13616 
13617 //=================================================================================================
13618 //
13619 // ISSTRICTLYUPPER SPECIALIZATIONS
13620 //
13621 //=================================================================================================
13622 
13623 //*************************************************************************************************
13625 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13626 struct IsStrictlyUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13627  : public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
13628  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
13629  , And< Or< Bool<SF>, Bool<HF> >
13630  , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
13631  , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >::value >
13632 {};
13634 //*************************************************************************************************
13635 
13636 } // namespace blaze
13637 
13638 #endif
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Data type constraint.
Header file for the generic min algorithm.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:272
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:266
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:196
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:469
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:547
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:470
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:620
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:537
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1762
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:278
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1027
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
Column< MT > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:124
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:157
Constraints on the storage order of matrix types.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:403
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:413
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1809
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:262
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:393
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Row< MT > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:124
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Expression object for transpose dense matrix-dense matrix multiplications.The TDMatDMatMultExpr class...
Definition: Forward.h:148
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:263
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Header file for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
Header file for the If class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:425
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:110
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:110
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:265
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:260
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:383
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:275
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:158
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1027
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsStrictlyTriangular type trait.
Generic wrapper for the null function.
Definition: Noop.h:58
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:175
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:619
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:264
Header file for the DeclDiag functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:457
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:108
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:437
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:367
Header file for the conjugate shim.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:174
Header file for the MatScalarMultExpr base class.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:264
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:110
Utility type for generic codes.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:261
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:304
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time type negation.The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1029
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
Header file for BLAS triangular matrix/matrix multiplication functions (trmm).
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:269
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1029
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:447
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:319
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Header file for BLAS general matrix/matrix multiplication functions (gemm).
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix.The Columns type trait evaluates the num...
Definition: Columns.h:75
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Compile time evaluation of the number of rows of a matrix.The Rows type trait evaluates the number of...
Definition: Rows.h:75
Header file for the IsComplex type trait.
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:176
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:742
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:177
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.