TDMatDMatMultExpr.h
Go to the documentation of this file.
1 //=================================================================================================
33 //=================================================================================================
34 
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
37 
38 
39 //*************************************************************************************************
40 // Includes
41 //*************************************************************************************************
42 
43 #include <blaze/math/blas/gemm.h>
44 #include <blaze/math/blas/trmm.h>
45 #include <blaze/math/Aliases.h>
52 #include <blaze/math/dense/MMM.h>
53 #include <blaze/math/Exception.h>
66 #include <blaze/math/shims/Reset.h>
68 #include <blaze/math/SIMD.h>
93 #include <blaze/math/views/Check.h>
94 #include <blaze/system/BLAS.h>
95 #include <blaze/system/Blocking.h>
96 #include <blaze/system/Debugging.h>
101 #include <blaze/util/Assert.h>
102 #include <blaze/util/Complex.h>
105 #include <blaze/util/DisableIf.h>
106 #include <blaze/util/EnableIf.h>
108 #include <blaze/util/mpl/And.h>
109 #include <blaze/util/mpl/Bool.h>
110 #include <blaze/util/mpl/If.h>
111 #include <blaze/util/mpl/Not.h>
112 #include <blaze/util/mpl/Or.h>
113 #include <blaze/util/TrueType.h>
114 #include <blaze/util/Types.h>
122 
123 
124 namespace blaze {
125 
126 //=================================================================================================
127 //
128 // CLASS TDMATDMATMULTEXPR
129 //
130 //=================================================================================================
131 
132 //*************************************************************************************************
139 template< typename MT1 // Type of the left-hand side dense matrix
140  , typename MT2 // Type of the right-hand side dense matrix
141  , bool SF // Symmetry flag
142  , bool HF // Hermitian flag
143  , bool LF // Lower flag
144  , bool UF > // Upper flag
145 class TDMatDMatMultExpr
146  : public MatMatMultExpr< DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
147  , private Computation
148 {
149  private:
150  //**Type definitions****************************************************************************
157  //**********************************************************************************************
158 
159  //**********************************************************************************************
// Compile-time flag for the left operand: true if MT1 is classified as a computation
// (IsComputation) or as requiring an intermediate evaluation (RequiresEvaluation).
161 enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
162  //**********************************************************************************************
163 
164  //**********************************************************************************************
// Compile-time flag for the right operand: true if MT2 is classified as a computation
// (IsComputation) or as requiring an intermediate evaluation (RequiresEvaluation).
166 enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
167  //**********************************************************************************************
168 
169  //**********************************************************************************************
// Normalized structure flags derived from the SF/HF/LF/UF template parameters:
//   SYM  - symmetric flag set and none of HF, LF, UF set
//   HERM - Hermitian flag set and neither LF nor UF set
//   LOW  - lower flag set, or the upper flag combined with SF or HF
//   UPP  - upper flag set, or the lower flag combined with SF or HF
// LOW and UPP may both be true at the same time (e.g. LF together with UF).
171 enum : bool {
172 SYM = ( SF && !( HF || LF || UF ) ),
173 HERM = ( HF && !( LF || UF ) ),
174 LOW = ( LF || ( ( SF || HF ) && UF ) ),
175 UPP = ( UF || ( ( SF || HF ) && LF ) )
176 };
177  //**********************************************************************************************
178 
179  //**********************************************************************************************
181 
// Helper trait: value is true when at least one of the two operands has to be evaluated
// (evaluateLeft || evaluateRight). The template parameters T1, T2 and T3 do not take part
// in the visible computation of value.
185 template< typename T1, typename T2, typename T3 >
186 struct IsEvaluationRequired {
187 enum : bool { value = ( evaluateLeft || evaluateRight ) };
188 };
190  //**********************************************************************************************
191 
192  //**********************************************************************************************
194 
// Helper trait deciding whether a BLAS kernel may be used for the types T1, T2, T3.
// Visible requirements: BLAS mode and BLAS matrix/matrix multiplication are enabled, none of
// SYM/HERM/LOW/UPP is set, all three types are SIMD enabled, and T1 and T3 share the same
// element type.
// NOTE(review): the embedded listing numbers jump (200 -> 205 -> 210), so additional conditions
// of this expression appear to be elided from this listing - confirm against the full header.
197 template< typename T1, typename T2, typename T3 >
198 struct UseBlasKernel {
199 enum : bool { value = BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
200 !SYM && !HERM && !LOW && !UPP &&
205 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
210 IsSame< ElementType_<T1>, ElementType_<T3> >::value };
211 };
213  //**********************************************************************************************
214 
215  //**********************************************************************************************
217 
// Helper trait deciding whether the vectorized default kernel may be used for T1, T2, T3.
// Visible requirements: useOptimizedKernels and SIMD support in all three types.
// NOTE(review): the expression is truncated in this listing (embedded numbers jump
// 222 -> 224 -> 227 -> 230), so the remaining conditions and the closing of the enum are not
// visible here - consult the full header before relying on this description.
220 template< typename T1, typename T2, typename T3 >
221 struct UseVectorizedDefaultKernel {
222 enum : bool { value = useOptimizedKernels &&
224 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
227  , ElementType_<T3> >::value &&
230 };
232  //**********************************************************************************************
233 
234  //**********************************************************************************************
236 
// Functor type selected from the structure flags, evaluated in this order:
//   HERM         -> DeclHerm
//   SYM          -> DeclSym
//   LOW && UPP   -> DeclDiag
//   LOW only     -> DeclLow
//   UPP only     -> DeclUpp
//   none         -> Noop
239 using ForwardFunctor = IfTrue_< HERM
240  , DeclHerm
241  , IfTrue_< SYM
242  , DeclSym
243  , IfTrue_< LOW
244  , IfTrue_< UPP
245  , DeclDiag
246  , DeclLow >
247  , IfTrue_< UPP
248  , DeclUpp
249  , Noop > > > >;
251  //**********************************************************************************************
252 
253  public:
254  //**Type definitions****************************************************************************
257 
// Public type aliases.
// NOTE(review): ElementType and ResultType are used here but their declarations are not part
// of this listing (embedded numbers jump before 263).
// ReturnType: element access returns a const ElementType by value.
263 using ReturnType = const ElementType;
// CompositeType: the expression is handed on as a const ResultType.
264 using CompositeType = const ResultType;
265 
// LeftOperand: stored by value if MT1 is an expression, otherwise by const reference.
267 using LeftOperand = If_< IsExpression<MT1>, const MT1, const MT1& >;
268 
// RightOperand: stored by value if MT2 is an expression, otherwise by const reference.
270 using RightOperand = If_< IsExpression<MT2>, const MT2, const MT2& >;
271 
274 
277  //**********************************************************************************************
278 
279  //**Compilation flags***************************************************************************
// Compilation flag for SIMD support. Visible conditions: not both operands are diagonal, and
// both operand types are SIMD enabled.
// NOTE(review): the remainder of this expression is elided from the listing (embedded numbers
// jump 282 -> 285), including the closing of the enum.
281 enum : bool { simdEnabled = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
282 MT1::simdEnabled && MT2::simdEnabled &&
285 
// Compilation flag for SMP assignment: neither operand needs evaluation and both operand
// types are themselves SMP assignable.
287 enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
288 !evaluateRight && MT2::smpAssignable };
289  //**********************************************************************************************
290 
291  //**SIMD properties*****************************************************************************
// SIMD width taken from SIMDTrait<ElementType>::size; used below as the column step of the
// vectorized kernels and in the kernel-selection thresholds.
293 enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
294  //**********************************************************************************************
295 
296  //**Constructor*********************************************************************************
// Constructor: stores both operands of the multiplication expression.
//   lhs - left-hand side dense matrix operand
//   rhs - right-hand side dense matrix operand
// The size compatibility (lhs.columns() == rhs.rows()) is only verified by an internal
// assertion; no exception is thrown from here.
302 explicit inline TDMatDMatMultExpr( const MT1& lhs, const MT2& rhs ) noexcept
303  : lhs_( lhs ) // Left-hand side dense matrix of the multiplication expression
304  , rhs_( rhs ) // Right-hand side dense matrix of the multiplication expression
305 {
306 BLAZE_INTERNAL_ASSERT( lhs.columns() == rhs.rows(), "Invalid matrix sizes" );
307 }
308  //**********************************************************************************************
309 
310  //**Access operator*****************************************************************************
// Element access: computes the (i,j) element of the product on demand.
//   i - row index, expected to be smaller than lhs_.rows()
//   j - column index, expected to be smaller than rhs_.columns()
// The indices are only checked via internal assertions; use at() for checked access.
317 inline ReturnType operator()( size_t i, size_t j ) const {
318 BLAZE_INTERNAL_ASSERT( i < lhs_.rows() , "Invalid row access index" );
319 BLAZE_INTERNAL_ASSERT( j < rhs_.columns(), "Invalid column access index" );
320 
// Diagonal left operand: only the term with inner index i is used.
321 if( IsDiagonal<MT1>::value ) {
322 return lhs_(i,i) * rhs_(i,j);
323 }
// Diagonal right operand: only the term with inner index j is used.
324 else if( IsDiagonal<MT2>::value ) {
325 return lhs_(i,j) * rhs_(j,j);
326 }
// NOTE(review): the embedded listing numbers jump from 326 to 328, so the branch header that
// opens the block below (closed by the brace on listing line 351) is not visible here.
// 'begin' is the first inner index to visit: raised to i (i+1 if strict) for an upper left
// operand and to j (j+1 if strict) for a lower right operand; the maximum is used if both apply.
328 const size_t begin( ( IsUpper<MT1>::value )
329 ?( ( IsLower<MT2>::value )
330 ?( max( ( IsStrictlyUpper<MT1>::value ? i+1UL : i )
331 , ( IsStrictlyLower<MT2>::value ? j+1UL : j ) ) )
332 :( IsStrictlyUpper<MT1>::value ? i+1UL : i ) )
333 :( ( IsLower<MT2>::value )
334 ?( IsStrictlyLower<MT2>::value ? j+1UL : j )
335 :( 0UL ) ) );
// 'end' is one past the last inner index to visit: capped at i+1 (i if strict) for a lower
// left operand and at j+1 (j if strict) for an upper right operand; the minimum is used if
// both apply, otherwise the full inner dimension lhs_.columns().
336 const size_t end( ( IsLower<MT1>::value )
337 ?( ( IsUpper<MT2>::value )
338 ?( min( ( IsStrictlyLower<MT1>::value ? i : i+1UL )
339 , ( IsStrictlyUpper<MT2>::value ? j : j+1UL ) ) )
340 :( IsStrictlyLower<MT1>::value ? i : i+1UL ) )
341 :( ( IsUpper<MT2>::value )
342 ?( IsStrictlyUpper<MT2>::value ? j : j+1UL )
343 :( lhs_.columns() ) ) );
344 
// Empty inner range: return a default-constructed element.
345 if( begin >= end ) return ElementType();
346 
347 const size_t n( end - begin );
348 
// Product of the restricted part of row i of lhs_ and the restricted part of column j of rhs_.
349 return subvector( row( lhs_, i, unchecked ), begin, n, unchecked ) *
350 subvector( column( rhs_, j, unchecked ), begin, n, unchecked );
351 }
// General case: full row i of lhs_ times full column j of rhs_.
352 else {
353 return row( lhs_, i, unchecked ) * column( rhs_, j, unchecked );
354 }
355 }
356  //**********************************************************************************************
357 
358  //**At function*********************************************************************************
// Checked element access.
//   i - row index
//   j - column index
// Raises an out-of-range error via BLAZE_THROW_OUT_OF_RANGE if i >= lhs_.rows() or
// j >= rhs_.columns(); otherwise forwards to operator()(i,j).
366 inline ReturnType at( size_t i, size_t j ) const {
367 if( i >= lhs_.rows() ) {
368 BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
369 }
370 if( j >= rhs_.columns() ) {
371 BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
372 }
373 return (*this)(i,j);
374 }
375  //**********************************************************************************************
376 
377  //**Rows function*******************************************************************************
// Returns the number of rows of the product, i.e. the number of rows of the left operand.
382 inline size_t rows() const noexcept {
383 return lhs_.rows();
384 }
385  //**********************************************************************************************
386 
387  //**Columns function****************************************************************************
// Returns the number of columns of the product, i.e. the number of columns of the right operand.
392 inline size_t columns() const noexcept {
393 return rhs_.columns();
394 }
395  //**********************************************************************************************
396 
397  //**Left operand access*************************************************************************
// Returns the stored left-hand side operand (lhs_) as LeftOperand.
402 inline LeftOperand leftOperand() const noexcept {
403 return lhs_;
404 }
405  //**********************************************************************************************
406 
407  //**Right operand access************************************************************************
// Returns the stored right-hand side operand (rhs_) as RightOperand.
412 inline RightOperand rightOperand() const noexcept {
413 return rhs_;
414 }
415  //**********************************************************************************************
416 
417  //**********************************************************************************************
// Aliasing query: returns true if either operand reports being aliased with the given address.
//   alias - address to check against both operands
// Note that this uses the operands' isAliased() and is therefore identical to isAliased() below.
423 template< typename T >
424 inline bool canAlias( const T* alias ) const noexcept {
425 return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
426 }
427  //**********************************************************************************************
428 
429  //**********************************************************************************************
// Aliasing query: returns true if either operand reports being aliased with the given address.
//   alias - address to check against both operands
435 template< typename T >
436 inline bool isAliased( const T* alias ) const noexcept {
437 return ( lhs_.isAliased( alias ) || rhs_.isAliased( alias ) );
438 }
439  //**********************************************************************************************
440 
441  //**********************************************************************************************
// Returns true only if both operands report being aligned.
446 inline bool isAligned() const noexcept {
447 return lhs_.isAligned() && rhs_.isAligned();
448 }
449  //**********************************************************************************************
450 
451  //**********************************************************************************************
// Runtime query whether the expression can be used in an SMP assignment.
// Visible conditions: either BLAS mode / BLAS matrix-matrix multiplication is disabled or the
// result has fewer than TDMATDMATMULT_THRESHOLD elements, and the result has at least
// SMP_TDMATDMATMULT_THRESHOLD elements.
// NOTE(review): the expression is truncated in this listing (embedded numbers jump
// 458 -> 460 and 461 -> 463; the last visible line ends in '&&'), so further conditions are
// not visible here.
456 inline bool canSMPAssign() const noexcept {
457 return ( !BLAZE_BLAS_MODE ||
458 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
460 ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
461 ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
463 }
464  //**********************************************************************************************
465 
466  private:
467  //**Member variables****************************************************************************
470  //**********************************************************************************************
471 
472  //**Assignment to dense matrices****************************************************************
// Assignment of the multiplication expression to a dense matrix.
//   lhs - target dense matrix; its size is expected to match the expression (asserted only)
//   rhs - the multiplication expression to be assigned
// Steps: return early for an empty target, reset the target if the inner dimension is zero,
// evaluate both operands serially and dispatch to selectAssignKernel().
// NOTE(review): the operand types LT and RT are declared outside this listing, and listing
// line 489 is elided.
485 template< typename MT // Type of the target dense matrix
486  , bool SO > // Storage order of the target dense matrix
487 friend inline void assign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
488 {
490 
491 BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
492 BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
493 
// Nothing to do for an empty target.
494 if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
495 return;
496 }
// Inner dimension of zero: there are no terms to sum, so the target is reset.
497 else if( rhs.lhs_.columns() == 0UL ) {
498 reset( ~lhs );
499 return;
500 }
501 
502 LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
503 RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
504 
// The evaluated operands must have the same sizes as the stored operands and fit the target.
505 BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
506 BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
507 BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
508 BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
509 BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
510 BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
511 
512 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
513 }
515  //**********************************************************************************************
516 
517  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = A * B.
//   C - target matrix, A - left operand, B - right operand
// Visible conditions for choosing the small-matrix kernel: outside debug mode a row-major
// target with B.columns() <= SIMDSIZE*10 or a column-major target with A.rows() <= SIMDSIZE*10,
// or a target with fewer than TDMATDMATMULT_THRESHOLD elements. Otherwise the BLAS-based
// kernel selection is used.
// NOTE(review): listing line 533, which opens the 'if(' condition, is elided here, so the
// first part of the condition is not visible.
528 template< typename MT3 // Type of the left-hand side target matrix
529  , typename MT4 // Type of the left-hand side matrix operand
530  , typename MT5 > // Type of the right-hand side matrix operand
531 static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B )
532 {
534 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
535 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
536 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
537 selectSmallAssignKernel( C, A, B );
538 else
539 selectBlasAssignKernel( C, A, B );
540 }
542  //**********************************************************************************************
543 
544  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (non-vectorized) assignment kernel C = A * B for a row-major target.
//   C - row-major target matrix, A - left operand, B - right operand
// The result is built row by row: the first contributing inner index initializes the row,
// the remaining inner indices accumulate into it.
// NOTE(review): the return-type line (listing line 561) and one line in each of the nested
// jbegin/jend ternaries (589, 594, 623, 628) are elided from this listing.
558 template< typename MT3 // Type of the left-hand side target matrix
559  , typename MT4 // Type of the left-hand side matrix operand
560  , typename MT5 > // Type of the right-hand side matrix operand
562 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
563 {
564 const size_t M( A.rows() );
565 const size_t N( B.columns() );
566 const size_t K( A.columns() );
567 
// Any structure flag implies a square result.
568 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
569 
570 for( size_t i=0UL; i<M; ++i )
571 {
// Inner index range [kbegin,kend) for row i, restricted by the triangular structure of A.
572 const size_t kbegin( ( IsUpper<MT4>::value )
573 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
574 :( 0UL ) );
575 const size_t kend( ( IsLower<MT4>::value )
576 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
577 :( K ) );
578 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
579 
// Strictly triangular A with an empty inner range: the whole row of C is reset.
580 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
581 for( size_t j=0UL; j<N; ++j ) {
582 reset( (~C)(i,j) );
583 }
584 continue;
585 }
586 
// First inner index (k == kbegin): initializes row i of C by plain assignment and resets
// the entries outside [jbegin,jend) where the branches below apply.
587 {
588 const size_t jbegin( ( IsUpper<MT5>::value )
590 ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
591 :( UPP ? max(i,kbegin) : kbegin ) )
592 :( UPP ? i : 0UL ) );
593 const size_t jend( ( IsLower<MT5>::value )
595 ?( LOW ? min(i+1UL,kbegin) : kbegin )
596 :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
597 :( LOW ? i+1UL : N ) );
598 
599 if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
600 for( size_t j=0UL; j<jbegin; ++j ) {
601 reset( (~C)(i,j) );
602 }
603 }
604 else if( IsStrictlyUpper<MT5>::value ) {
605 reset( (~C)(i,0UL) );
606 }
607 for( size_t j=jbegin; j<jend; ++j ) {
608 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
609 }
610 if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
611 for( size_t j=jend; j<N; ++j ) {
612 reset( (~C)(i,j) );
613 }
614 }
615 else if( IsStrictlyLower<MT5>::value ) {
616 reset( (~C)(i,N-1UL) );
617 }
618 }
619 
// Remaining inner indices: accumulate A(i,k) * B(k,j) into the already initialized row.
620 for( size_t k=kbegin+1UL; k<kend; ++k )
621 {
622 const size_t jbegin( ( IsUpper<MT5>::value )
624 ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
625 :( SYM || HERM || UPP ? max( i, k ) : k ) )
626 :( SYM || HERM || UPP ? i : 0UL ) );
627 const size_t jend( ( IsLower<MT5>::value )
629 ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
630 :( LOW ? min(i+1UL,k) : k ) )
631 :( LOW ? i+1UL : N ) );
632 
// With structure flags the column range may be empty or inverted; skip this k in that case.
633 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
634 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
635 
636 for( size_t j=jbegin; j<jend; ++j ) {
637 (~C)(i,j) += A(i,k) * B(k,j);
638 }
// For a lower B the element at column jend is assigned (not accumulated) for this k.
639 if( IsLower<MT5>::value ) {
640 (~C)(i,jend) = A(i,k) * B(k,jend);
641 }
642 }
643 }
644 
// Symmetric/Hermitian result: fill the elements below the diagonal from their mirrored
// counterparts (conjugated in the Hermitian case).
645 if( SYM || HERM ) {
646 for( size_t i=1UL; i<M; ++i ) {
647 for( size_t j=0UL; j<i; ++j ) {
648 (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
649 }
650 }
651 }
652 }
654  //**********************************************************************************************
655 
656  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (non-vectorized) assignment kernel C = A * B for a column-major target; enabled only
// if neither A nor B is diagonal.
//   C - column-major target matrix, A - left operand, B - right operand
// The result is built column by column: the first contributing inner index initializes the
// column, the remaining inner indices accumulate into it.
// NOTE(review): one line in each of the nested ibegin/iend ternaries (listing lines 701, 706,
// 735, 740) is elided from this listing.
670 template< typename MT3 // Type of the left-hand side target matrix
671  , typename MT4 // Type of the left-hand side matrix operand
672  , typename MT5 > // Type of the right-hand side matrix operand
673 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
674 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
675 {
676 const size_t M( A.rows() );
677 const size_t N( B.columns() );
678 const size_t K( A.columns() );
679 
// Any structure flag implies a square result.
680 BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
681 
682 for( size_t j=0UL; j<N; ++j )
683 {
// Inner index range [kbegin,kend) for column j, restricted by the triangular structure of B.
684 const size_t kbegin( ( IsLower<MT5>::value )
685 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
686 :( 0UL ) );
687 const size_t kend( ( IsUpper<MT5>::value )
688 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
689 :( K ) );
690 BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
691 
// Strictly triangular B with an empty inner range: the whole column of C is reset.
692 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
693 for( size_t i=0UL; i<M; ++i ) {
694 reset( (~C)(i,j) );
695 }
696 continue;
697 }
698 
// First inner index (k == kbegin): initializes column j of C by plain assignment and resets
// the entries outside [ibegin,iend) where the branches below apply.
699 {
700 const size_t ibegin( ( IsLower<MT4>::value )
702 ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
703 :( LOW ? max(j,kbegin) : kbegin ) )
704 :( LOW ? j : 0UL ) );
705 const size_t iend( ( IsUpper<MT4>::value )
707 ?( UPP ? min(j+1UL,kbegin) : kbegin )
708 :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
709 :( UPP ? j+1UL : M ) );
710 
711 if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
712 for( size_t i=0UL; i<ibegin; ++i ) {
713 reset( (~C)(i,j) );
714 }
715 }
716 else if( IsStrictlyLower<MT4>::value ) {
717 reset( (~C)(0UL,j) );
718 }
719 for( size_t i=ibegin; i<iend; ++i ) {
720 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
721 }
722 if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
723 for( size_t i=iend; i<M; ++i ) {
724 reset( (~C)(i,j) );
725 }
726 }
727 else if( IsStrictlyUpper<MT4>::value ) {
728 reset( (~C)(M-1UL,j) );
729 }
730 }
731 
// Remaining inner indices: accumulate A(i,k) * B(k,j) into the already initialized column.
732 for( size_t k=kbegin+1UL; k<kend; ++k )
733 {
734 const size_t ibegin( ( IsLower<MT4>::value )
736 ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
737 :( SYM || HERM || LOW ? max( j, k ) : k ) )
738 :( SYM || HERM || LOW ? j : 0UL ) );
739 const size_t iend( ( IsUpper<MT4>::value )
741 ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
742 :( UPP ? min(j+1UL,k) : k ) )
743 :( UPP ? j+1UL : M ) );
744 
// With structure flags the row range may be empty or inverted; skip this k in that case.
745 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
746 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
747 
748 for( size_t i=ibegin; i<iend; ++i ) {
749 (~C)(i,j) += A(i,k) * B(k,j);
750 }
// For an upper A the element at row iend is assigned (not accumulated) for this k.
751 if( IsUpper<MT4>::value ) {
752 (~C)(iend,j) = A(iend,k) * B(k,j);
753 }
754 }
755 }
756 
// Symmetric/Hermitian result: fill the elements above the diagonal from their mirrored
// counterparts (conjugated in the Hermitian case).
757 if( SYM || HERM ) {
758 for( size_t j=1UL; j<N; ++j ) {
759 for( size_t i=0UL; i<j; ++i ) {
760 (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
761 }
762 }
763 }
764 }
766  //**********************************************************************************************
767 
768  //**Default assignment to row-major dense matrices (general/diagonal)***************************
// Default assignment kernel C = A * B for a row-major target where B is diagonal and A is not.
//   C - row-major target matrix, A - general left operand, B - diagonal right operand
// Each result element is C(i,j) = A(i,j) * B(j,j). The iteration is tiled into
// BLOCK_SIZE x BLOCK_SIZE blocks of C.
782 template< typename MT3 // Type of the left-hand side target matrix
783  , typename MT4 // Type of the left-hand side matrix operand
784  , typename MT5 > // Type of the right-hand side matrix operand
785 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
786 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
787 {
788 constexpr size_t block( BLOCK_SIZE );
789 
790 const size_t M( A.rows() );
791 const size_t N( B.columns() );
792 
793 for( size_t ii=0UL; ii<M; ii+=block ) {
794 const size_t iend( min( M, ii+block ) );
795 for( size_t jj=0UL; jj<N; jj+=block ) {
796 const size_t jend( min( N, jj+block ) );
797 for( size_t i=ii; i<iend; ++i )
798 {
// Column range [jbegin,jpos) of row i inside the current block that is computed; it is
// narrowed according to the triangular structure of A.
799 const size_t jbegin( ( IsUpper<MT4>::value )
800 ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
801 :( jj ) );
802 const size_t jpos( ( IsLower<MT4>::value )
803 ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
804 :( jend ) );
805 
// Upper A: block entries in front of jbegin are reset.
806 if( IsUpper<MT4>::value ) {
807 for( size_t j=jj; j<jbegin; ++j ) {
808 reset( (~C)(i,j) );
809 }
810 }
811 for( size_t j=jbegin; j<jpos; ++j ) {
812 (~C)(i,j) = A(i,j) * B(j,j);
813 }
// Lower A: block entries from jpos onwards are reset.
814 if( IsLower<MT4>::value ) {
815 for( size_t j=jpos; j<jend; ++j ) {
816 reset( (~C)(i,j) );
817 }
818 }
819 }
820 }
821 }
822 }
824  //**********************************************************************************************
825 
826  //**Default assignment to column-major dense matrices (general/diagonal)************************
// Default assignment kernel C = A * B for a column-major target where B is diagonal and A is not.
//   C - column-major target matrix, A - general left operand, B - diagonal right operand
// Each result element is C(i,j) = A(i,j) * B(j,j), computed column by column.
840 template< typename MT3 // Type of the left-hand side target matrix
841  , typename MT4 // Type of the left-hand side matrix operand
842  , typename MT5 > // Type of the right-hand side matrix operand
843 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
844 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
845 {
846 const size_t M( A.rows() );
847 const size_t N( B.columns() );
848 
849 for( size_t j=0UL; j<N; ++j )
850 {
// Row range [ibegin,iend) of column j that is computed; narrowed according to the
// triangular structure of A.
851 const size_t ibegin( ( IsLower<MT4>::value )
852 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
853 :( 0UL ) );
854 const size_t iend( ( IsUpper<MT4>::value )
855 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
856 :( M ) );
857 BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
858 
// Lower A: entries above ibegin are reset.
859 if( IsLower<MT4>::value ) {
860 for( size_t i=0UL; i<ibegin; ++i ) {
861 reset( (~C)(i,j) );
862 }
863 }
864 for( size_t i=ibegin; i<iend; ++i ) {
865 (~C)(i,j) = A(i,j) * B(j,j);
866 }
// Upper A: entries from iend onwards are reset.
867 if( IsUpper<MT4>::value ) {
868 for( size_t i=iend; i<M; ++i ) {
869 reset( (~C)(i,j) );
870 }
871 }
872 }
873 }
875  //**********************************************************************************************
876 
877  //**Default assignment to row-major dense matrices (diagonal/general)***************************
// Default assignment kernel C = A * B for a row-major target where the diagonal entries of A
// scale the rows of B: each result element is C(i,j) = A(i,i) * B(i,j), computed row by row.
//   C - row-major target matrix, A - left operand, B - right operand
// NOTE(review): the return-type line (listing line 894) is elided; judging by the section
// banner (diagonal/general) it presumably restricts this overload to a diagonal A and a
// non-diagonal B - confirm against the full header.
891 template< typename MT3 // Type of the left-hand side target matrix
892  , typename MT4 // Type of the left-hand side matrix operand
893  , typename MT5 > // Type of the right-hand side matrix operand
895 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
896 {
897 const size_t M( A.rows() );
898 const size_t N( B.columns() );
899 
900 for( size_t i=0UL; i<M; ++i )
901 {
// Column range [jbegin,jend) of row i that is computed; narrowed according to the
// triangular structure of B.
902 const size_t jbegin( ( IsUpper<MT5>::value )
903 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
904 :( 0UL ) );
905 const size_t jend( ( IsLower<MT5>::value )
906 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
907 :( N ) );
908 BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
909 
// Upper B: entries in front of jbegin are reset.
910 if( IsUpper<MT5>::value ) {
911 for( size_t j=0UL; j<jbegin; ++j ) {
912 reset( (~C)(i,j) );
913 }
914 }
915 for( size_t j=jbegin; j<jend; ++j ) {
916 (~C)(i,j) = A(i,i) * B(i,j);
917 }
// Lower B: entries from jend onwards are reset.
918 if( IsLower<MT5>::value ) {
919 for( size_t j=jend; j<N; ++j ) {
920 reset( (~C)(i,j) );
921 }
922 }
923 }
924 }
926  //**********************************************************************************************
927 
928  //**Default assignment to column-major dense matrices (diagonal/general)************************
// Default assignment kernel C = A * B for a column-major target where A is diagonal and B is not.
//   C - column-major target matrix, A - diagonal left operand, B - general right operand
// Each result element is C(i,j) = A(i,i) * B(i,j). The iteration is tiled into
// BLOCK_SIZE x BLOCK_SIZE blocks of C.
942 template< typename MT3 // Type of the left-hand side target matrix
943  , typename MT4 // Type of the left-hand side matrix operand
944  , typename MT5 > // Type of the right-hand side matrix operand
945 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
946 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
947 {
948 constexpr size_t block( BLOCK_SIZE );
949 
950 const size_t M( A.rows() );
951 const size_t N( B.columns() );
952 
953 for( size_t jj=0UL; jj<N; jj+=block ) {
954 const size_t jend( min( N, jj+block ) );
955 for( size_t ii=0UL; ii<M; ii+=block ) {
956 const size_t iend( min( M, ii+block ) );
957 for( size_t j=jj; j<jend; ++j )
958 {
// Row range [ibegin,ipos) of column j inside the current block that is computed; it is
// narrowed according to the triangular structure of B.
959 const size_t ibegin( ( IsLower<MT5>::value )
960 ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
961 :( ii ) );
962 const size_t ipos( ( IsUpper<MT5>::value )
963 ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
964 :( iend ) );
965 
// Lower B: block entries above ibegin are reset.
966 if( IsLower<MT5>::value ) {
967 for( size_t i=ii; i<ibegin; ++i ) {
968 reset( (~C)(i,j) );
969 }
970 }
971 for( size_t i=ibegin; i<ipos; ++i ) {
972 (~C)(i,j) = A(i,i) * B(i,j);
973 }
// Upper B: block entries from ipos onwards are reset.
974 if( IsUpper<MT5>::value ) {
975 for( size_t i=ipos; i<iend; ++i ) {
976 reset( (~C)(i,j) );
977 }
978 }
979 }
980 }
981 }
982 }
984  //**********************************************************************************************
985 
986  //**Default assignment to dense matrices (diagonal/diagonal)************************************
// Default assignment kernel C = A * B for two diagonal operands (any target storage order).
//   C - target matrix, A - diagonal left operand, B - diagonal right operand
// The target is reset first; afterwards only the diagonal is written:
// C(i,i) = A(i,i) * B(i,i) for all i < A.rows().
1000 template< typename MT3 // Type of the left-hand side target matrix
1001  , typename MT4 // Type of the left-hand side matrix operand
1002  , typename MT5 > // Type of the right-hand side matrix operand
1003 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
1004 selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B )
1005 {
1006 reset( C );
1007 
1008 for( size_t i=0UL; i<A.rows(); ++i ) {
1009 C(i,i) = A(i,i) * B(i,i);
1010 }
1011 }
1013  //**********************************************************************************************
1014 
1015  //**Default assignment to dense matrices (small matrices)***************************************
// Small-matrix assignment kernel, fallback variant: simply forwards to
// selectDefaultAssignKernel().
//   C - target matrix, A - left operand, B - right operand
// NOTE(review): the return-type line (listing line 1032) is elided from this listing, so the
// condition under which this overload is chosen is not visible here.
1029 template< typename MT3 // Type of the left-hand side target matrix
1030  , typename MT4 // Type of the left-hand side matrix operand
1031  , typename MT5 > // Type of the right-hand side matrix operand
1033 selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B )
1034 {
1035 selectDefaultAssignKernel( ~C, A, B );
1036 }
1038  //**********************************************************************************************
1039 
1040  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
1055  template< typename MT3 // Type of the left-hand side target matrix
1056  , typename MT4 // Type of the left-hand side matrix operand
1057  , typename MT5 > // Type of the right-hand side matrix operand
1059  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
1060  {
1061  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
1062 
1063  const size_t M( A.rows() );
1064  const size_t N( B.columns() );
1065  const size_t K( A.columns() );
1066 
1067  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1068 
1069  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
1070  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
1071 
1072  if( LOW && UPP && N > SIMDSIZE*3UL ) {
1073  reset( ~C );
1074  }
1075 
1076  {
1077  size_t j( 0UL );
1078 
1080  {
1081  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1082  for( size_t i=0UL; i<M; ++i )
1083  {
1084  const size_t kbegin( ( IsUpper<MT4>::value )
1085  ?( ( IsLower<MT5>::value )
1086  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1087  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1088  :( IsLower<MT5>::value ? j : 0UL ) );
1089  const size_t kend( ( IsLower<MT4>::value )
1090  ?( ( IsUpper<MT5>::value )
1091  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
1092  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1093  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
1094 
1095  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1096 
1097  for( size_t k=kbegin; k<kend; ++k ) {
1098  const SIMDType a1( set( A(i,k) ) );
1099  xmm1 += a1 * B.load(k,j );
1100  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1101  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1102  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1103  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1104  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
1105  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
1106  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
1107  }
1108 
1109  (~C).store( i, j , xmm1 );
1110  (~C).store( i, j+SIMDSIZE , xmm2 );
1111  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1112  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1113  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1114  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1115  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1116  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
1117  }
1118  }
1119  }
1120 
1121  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
1122  {
1123  size_t i( 0UL );
1124 
1125  for( ; (i+2UL) <= M; i+=2UL )
1126  {
1127  const size_t kbegin( ( IsUpper<MT4>::value )
1128  ?( ( IsLower<MT5>::value )
1129  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1130  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1131  :( IsLower<MT5>::value ? j : 0UL ) );
1132  const size_t kend( ( IsLower<MT4>::value )
1133  ?( ( IsUpper<MT5>::value )
1134  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
1135  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1136  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
1137 
1138  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1139 
1140  for( size_t k=kbegin; k<kend; ++k ) {
1141  const SIMDType a1( set( A(i ,k) ) );
1142  const SIMDType a2( set( A(i+1UL,k) ) );
1143  const SIMDType b1( B.load(k,j ) );
1144  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1145  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1146  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1147  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
1148  xmm1 += a1 * b1;
1149  xmm2 += a1 * b2;
1150  xmm3 += a1 * b3;
1151  xmm4 += a1 * b4;
1152  xmm5 += a1 * b5;
1153  xmm6 += a2 * b1;
1154  xmm7 += a2 * b2;
1155  xmm8 += a2 * b3;
1156  xmm9 += a2 * b4;
1157  xmm10 += a2 * b5;
1158  }
1159 
1160  (~C).store( i , j , xmm1 );
1161  (~C).store( i , j+SIMDSIZE , xmm2 );
1162  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1163  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1164  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
1165  (~C).store( i+1UL, j , xmm6 );
1166  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
1167  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
1168  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
1169  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
1170  }
1171 
1172  if( i < M )
1173  {
1174  const size_t kbegin( ( IsUpper<MT4>::value )
1175  ?( ( IsLower<MT5>::value )
1176  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1177  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1178  :( IsLower<MT5>::value ? j : 0UL ) );
1179  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
1180 
1181  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1182 
1183  for( size_t k=kbegin; k<kend; ++k ) {
1184  const SIMDType a1( set( A(i,k) ) );
1185  xmm1 += a1 * B.load(k,j );
1186  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1187  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1188  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1189  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
1190  }
1191 
1192  (~C).store( i, j , xmm1 );
1193  (~C).store( i, j+SIMDSIZE , xmm2 );
1194  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1195  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1196  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1197  }
1198  }
1199 
1200  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1201  {
1202  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
1203  size_t i( LOW ? j : 0UL );
1204 
1205  for( ; (i+2UL) <= iend; i+=2UL )
1206  {
1207  const size_t kbegin( ( IsUpper<MT4>::value )
1208  ?( ( IsLower<MT5>::value )
1209  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1210  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1211  :( IsLower<MT5>::value ? j : 0UL ) );
1212  const size_t kend( ( IsLower<MT4>::value )
1213  ?( ( IsUpper<MT5>::value )
1214  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1215  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1216  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
1217 
1218  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1219 
1220  for( size_t k=kbegin; k<kend; ++k ) {
1221  const SIMDType a1( set( A(i ,k) ) );
1222  const SIMDType a2( set( A(i+1UL,k) ) );
1223  const SIMDType b1( B.load(k,j ) );
1224  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1225  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1226  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1227  xmm1 += a1 * b1;
1228  xmm2 += a1 * b2;
1229  xmm3 += a1 * b3;
1230  xmm4 += a1 * b4;
1231  xmm5 += a2 * b1;
1232  xmm6 += a2 * b2;
1233  xmm7 += a2 * b3;
1234  xmm8 += a2 * b4;
1235  }
1236 
1237  (~C).store( i , j , xmm1 );
1238  (~C).store( i , j+SIMDSIZE , xmm2 );
1239  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1240  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1241  (~C).store( i+1UL, j , xmm5 );
1242  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1243  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1244  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
1245  }
1246 
1247  if( i < iend )
1248  {
1249  const size_t kbegin( ( IsUpper<MT4>::value )
1250  ?( ( IsLower<MT5>::value )
1251  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1252  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1253  :( IsLower<MT5>::value ? j : 0UL ) );
1254  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
1255 
1256  SIMDType xmm1, xmm2, xmm3, xmm4;
1257 
1258  for( size_t k=kbegin; k<kend; ++k ) {
1259  const SIMDType a1( set( A(i,k) ) );
1260  xmm1 += a1 * B.load(k,j );
1261  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1262  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1263  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
1264  }
1265 
1266  (~C).store( i, j , xmm1 );
1267  (~C).store( i, j+SIMDSIZE , xmm2 );
1268  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1269  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1270  }
1271  }
1272 
1273  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1274  {
1275  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
1276  size_t i( LOW ? j : 0UL );
1277 
1278  for( ; (i+2UL) <= iend; i+=2UL )
1279  {
1280  const size_t kbegin( ( IsUpper<MT4>::value )
1281  ?( ( IsLower<MT5>::value )
1282  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1283  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1284  :( IsLower<MT5>::value ? j : 0UL ) );
1285  const size_t kend( ( IsLower<MT4>::value )
1286  ?( ( IsUpper<MT5>::value )
1287  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
1288  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1289  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
1290 
1291  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1292 
1293  for( size_t k=kbegin; k<kend; ++k ) {
1294  const SIMDType a1( set( A(i ,k) ) );
1295  const SIMDType a2( set( A(i+1UL,k) ) );
1296  const SIMDType b1( B.load(k,j ) );
1297  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1298  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1299  xmm1 += a1 * b1;
1300  xmm2 += a1 * b2;
1301  xmm3 += a1 * b3;
1302  xmm4 += a2 * b1;
1303  xmm5 += a2 * b2;
1304  xmm6 += a2 * b3;
1305  }
1306 
1307  (~C).store( i , j , xmm1 );
1308  (~C).store( i , j+SIMDSIZE , xmm2 );
1309  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1310  (~C).store( i+1UL, j , xmm4 );
1311  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
1312  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
1313  }
1314 
1315  if( i < iend )
1316  {
1317  const size_t kbegin( ( IsUpper<MT4>::value )
1318  ?( ( IsLower<MT5>::value )
1319  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1320  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1321  :( IsLower<MT5>::value ? j : 0UL ) );
1322  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
1323 
1324  SIMDType xmm1, xmm2, xmm3;
1325 
1326  for( size_t k=kbegin; k<kend; ++k ) {
1327  const SIMDType a1( set( A(i,k) ) );
1328  xmm1 += a1 * B.load(k,j );
1329  xmm2 += a1 * B.load(k,j+SIMDSIZE );
1330  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
1331  }
1332 
1333  (~C).store( i, j , xmm1 );
1334  (~C).store( i, j+SIMDSIZE , xmm2 );
1335  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1336  }
1337  }
1338 
1339  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1340  {
1341  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
1342  size_t i( LOW ? j : 0UL );
1343 
1344  for( ; (i+4UL) <= iend; i+=4UL )
1345  {
1346  const size_t kbegin( ( IsUpper<MT4>::value )
1347  ?( ( IsLower<MT5>::value )
1348  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1349  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1350  :( IsLower<MT5>::value ? j : 0UL ) );
1351  const size_t kend( ( IsLower<MT4>::value )
1352  ?( ( IsUpper<MT5>::value )
1353  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
1354  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
1355  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1356 
1357  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1358 
1359  for( size_t k=kbegin; k<kend; ++k ) {
1360  const SIMDType a1( set( A(i ,k) ) );
1361  const SIMDType a2( set( A(i+1UL,k) ) );
1362  const SIMDType a3( set( A(i+2UL,k) ) );
1363  const SIMDType a4( set( A(i+3UL,k) ) );
1364  const SIMDType b1( B.load(k,j ) );
1365  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1366  xmm1 += a1 * b1;
1367  xmm2 += a1 * b2;
1368  xmm3 += a2 * b1;
1369  xmm4 += a2 * b2;
1370  xmm5 += a3 * b1;
1371  xmm6 += a3 * b2;
1372  xmm7 += a4 * b1;
1373  xmm8 += a4 * b2;
1374  }
1375 
1376  (~C).store( i , j , xmm1 );
1377  (~C).store( i , j+SIMDSIZE, xmm2 );
1378  (~C).store( i+1UL, j , xmm3 );
1379  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1380  (~C).store( i+2UL, j , xmm5 );
1381  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1382  (~C).store( i+3UL, j , xmm7 );
1383  (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
1384  }
1385 
1386  for( ; (i+3UL) <= iend; i+=3UL )
1387  {
1388  const size_t kbegin( ( IsUpper<MT4>::value )
1389  ?( ( IsLower<MT5>::value )
1390  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1391  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1392  :( IsLower<MT5>::value ? j : 0UL ) );
1393  const size_t kend( ( IsLower<MT4>::value )
1394  ?( ( IsUpper<MT5>::value )
1395  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
1396  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
1397  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1398 
1399  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1400 
1401  for( size_t k=kbegin; k<kend; ++k ) {
1402  const SIMDType a1( set( A(i ,k) ) );
1403  const SIMDType a2( set( A(i+1UL,k) ) );
1404  const SIMDType a3( set( A(i+2UL,k) ) );
1405  const SIMDType b1( B.load(k,j ) );
1406  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1407  xmm1 += a1 * b1;
1408  xmm2 += a1 * b2;
1409  xmm3 += a2 * b1;
1410  xmm4 += a2 * b2;
1411  xmm5 += a3 * b1;
1412  xmm6 += a3 * b2;
1413  }
1414 
1415  (~C).store( i , j , xmm1 );
1416  (~C).store( i , j+SIMDSIZE, xmm2 );
1417  (~C).store( i+1UL, j , xmm3 );
1418  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
1419  (~C).store( i+2UL, j , xmm5 );
1420  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
1421  }
1422 
1423  for( ; (i+2UL) <= iend; i+=2UL )
1424  {
1425  const size_t kbegin( ( IsUpper<MT4>::value )
1426  ?( ( IsLower<MT5>::value )
1427  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1428  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1429  :( IsLower<MT5>::value ? j : 0UL ) );
1430  const size_t kend( ( IsLower<MT4>::value )
1431  ?( ( IsUpper<MT5>::value )
1432  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1433  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1434  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
1435 
1436  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1437  size_t k( kbegin );
1438 
1439  for( ; (k+2UL) <= kend; k+=2UL ) {
1440  const SIMDType a1( set( A(i ,k ) ) );
1441  const SIMDType a2( set( A(i+1UL,k ) ) );
1442  const SIMDType a3( set( A(i ,k+1UL) ) );
1443  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
1444  const SIMDType b1( B.load(k ,j ) );
1445  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
1446  const SIMDType b3( B.load(k+1UL,j ) );
1447  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
1448  xmm1 += a1 * b1;
1449  xmm2 += a1 * b2;
1450  xmm3 += a2 * b1;
1451  xmm4 += a2 * b2;
1452  xmm5 += a3 * b3;
1453  xmm6 += a3 * b4;
1454  xmm7 += a4 * b3;
1455  xmm8 += a4 * b4;
1456  }
1457 
1458  for( ; k<kend; ++k ) {
1459  const SIMDType a1( set( A(i ,k) ) );
1460  const SIMDType a2( set( A(i+1UL,k) ) );
1461  const SIMDType b1( B.load(k,j ) );
1462  const SIMDType b2( B.load(k,j+SIMDSIZE) );
1463  xmm1 += a1 * b1;
1464  xmm2 += a1 * b2;
1465  xmm3 += a2 * b1;
1466  xmm4 += a2 * b2;
1467  }
1468 
1469  (~C).store( i , j , xmm1+xmm5 );
1470  (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
1471  (~C).store( i+1UL, j , xmm3+xmm7 );
1472  (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
1473  }
1474 
1475  if( i < iend )
1476  {
1477  const size_t kbegin( ( IsUpper<MT4>::value )
1478  ?( ( IsLower<MT5>::value )
1479  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1480  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1481  :( IsLower<MT5>::value ? j : 0UL ) );
1482  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
1483 
1484  SIMDType xmm1, xmm2, xmm3, xmm4;
1485  size_t k( kbegin );
1486 
1487  for( ; (k+2UL) <= kend; k+=2UL ) {
1488  const SIMDType a1( set( A(i,k ) ) );
1489  const SIMDType a2( set( A(i,k+1UL) ) );
1490  xmm1 += a1 * B.load(k ,j );
1491  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
1492  xmm3 += a2 * B.load(k+1UL,j );
1493  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
1494  }
1495 
1496  for( ; k<kend; ++k ) {
1497  const SIMDType a1( set( A(i,k) ) );
1498  xmm1 += a1 * B.load(k,j );
1499  xmm2 += a1 * B.load(k,j+SIMDSIZE);
1500  }
1501 
1502  (~C).store( i, j , xmm1+xmm3 );
1503  (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
1504  }
1505  }
1506 
1507  for( ; j<jpos; j+=SIMDSIZE )
1508  {
1509  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
1510  size_t i( LOW ? j : 0UL );
1511 
1512  for( ; (i+4UL) <= iend; i+=4UL )
1513  {
1514  const size_t kbegin( ( IsUpper<MT4>::value )
1515  ?( ( IsLower<MT5>::value )
1516  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1517  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1518  :( IsLower<MT5>::value ? j : 0UL ) );
1519  const size_t kend( ( IsLower<MT4>::value )
1520  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
1521  :( K ) );
1522 
1523  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1524  size_t k( kbegin );
1525 
1526  for( ; (k+2UL) <= kend; k+=2UL ) {
1527  const SIMDType b1( B.load(k ,j) );
1528  const SIMDType b2( B.load(k+1UL,j) );
1529  xmm1 += set( A(i ,k ) ) * b1;
1530  xmm2 += set( A(i+1UL,k ) ) * b1;
1531  xmm3 += set( A(i+2UL,k ) ) * b1;
1532  xmm4 += set( A(i+3UL,k ) ) * b1;
1533  xmm5 += set( A(i ,k+1UL) ) * b2;
1534  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
1535  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
1536  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
1537  }
1538 
1539  for( ; k<kend; ++k ) {
1540  const SIMDType b1( B.load(k,j) );
1541  xmm1 += set( A(i ,k) ) * b1;
1542  xmm2 += set( A(i+1UL,k) ) * b1;
1543  xmm3 += set( A(i+2UL,k) ) * b1;
1544  xmm4 += set( A(i+3UL,k) ) * b1;
1545  }
1546 
1547  (~C).store( i , j, xmm1+xmm5 );
1548  (~C).store( i+1UL, j, xmm2+xmm6 );
1549  (~C).store( i+2UL, j, xmm3+xmm7 );
1550  (~C).store( i+3UL, j, xmm4+xmm8 );
1551  }
1552 
1553  for( ; (i+3UL) <= iend; i+=3UL )
1554  {
1555  const size_t kbegin( ( IsUpper<MT4>::value )
1556  ?( ( IsLower<MT5>::value )
1557  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1558  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1559  :( IsLower<MT5>::value ? j : 0UL ) );
1560  const size_t kend( ( IsLower<MT4>::value )
1561  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
1562  :( K ) );
1563 
1564  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1565  size_t k( kbegin );
1566 
1567  for( ; (k+2UL) <= kend; k+=2UL ) {
1568  const SIMDType b1( B.load(k ,j) );
1569  const SIMDType b2( B.load(k+1UL,j) );
1570  xmm1 += set( A(i ,k ) ) * b1;
1571  xmm2 += set( A(i+1UL,k ) ) * b1;
1572  xmm3 += set( A(i+2UL,k ) ) * b1;
1573  xmm4 += set( A(i ,k+1UL) ) * b2;
1574  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
1575  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
1576  }
1577 
1578  for( ; k<kend; ++k ) {
1579  const SIMDType b1( B.load(k,j) );
1580  xmm1 += set( A(i ,k) ) * b1;
1581  xmm2 += set( A(i+1UL,k) ) * b1;
1582  xmm3 += set( A(i+2UL,k) ) * b1;
1583  }
1584 
1585  (~C).store( i , j, xmm1+xmm4 );
1586  (~C).store( i+1UL, j, xmm2+xmm5 );
1587  (~C).store( i+2UL, j, xmm3+xmm6 );
1588  }
1589 
1590  for( ; (i+2UL) <= iend; i+=2UL )
1591  {
1592  const size_t kbegin( ( IsUpper<MT4>::value )
1593  ?( ( IsLower<MT5>::value )
1594  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1595  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1596  :( IsLower<MT5>::value ? j : 0UL ) );
1597  const size_t kend( ( IsLower<MT4>::value )
1598  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1599  :( K ) );
1600 
1601  SIMDType xmm1, xmm2, xmm3, xmm4;
1602  size_t k( kbegin );
1603 
1604  for( ; (k+2UL) <= kend; k+=2UL ) {
1605  const SIMDType b1( B.load(k ,j) );
1606  const SIMDType b2( B.load(k+1UL,j) );
1607  xmm1 += set( A(i ,k ) ) * b1;
1608  xmm2 += set( A(i+1UL,k ) ) * b1;
1609  xmm3 += set( A(i ,k+1UL) ) * b2;
1610  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
1611  }
1612 
1613  for( ; k<kend; ++k ) {
1614  const SIMDType b1( B.load(k,j) );
1615  xmm1 += set( A(i ,k) ) * b1;
1616  xmm2 += set( A(i+1UL,k) ) * b1;
1617  }
1618 
1619  (~C).store( i , j, xmm1+xmm3 );
1620  (~C).store( i+1UL, j, xmm2+xmm4 );
1621  }
1622 
1623  if( i < iend )
1624  {
1625  const size_t kbegin( ( IsUpper<MT4>::value )
1626  ?( ( IsLower<MT5>::value )
1627  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1628  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1629  :( IsLower<MT5>::value ? j : 0UL ) );
1630 
1631  SIMDType xmm1, xmm2;
1632  size_t k( kbegin );
1633 
1634  for( ; (k+2UL) <= K; k+=2UL ) {
1635  xmm1 += set( A(i,k ) ) * B.load(k ,j);
1636  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
1637  }
1638 
1639  for( ; k<K; ++k ) {
1640  xmm1 += set( A(i,k) ) * B.load(k,j);
1641  }
1642 
1643  (~C).store( i, j, xmm1+xmm2 );
1644  }
1645  }
1646 
1647  for( ; remainder && j<N; ++j )
1648  {
1649  size_t i( LOW && UPP ? j : 0UL );
1650 
1651  for( ; (i+2UL) <= M; i+=2UL )
1652  {
1653  const size_t kbegin( ( IsUpper<MT4>::value )
1654  ?( ( IsLower<MT5>::value )
1655  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1656  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1657  :( IsLower<MT5>::value ? j : 0UL ) );
1658  const size_t kend( ( IsLower<MT4>::value )
1659  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1660  :( K ) );
1661 
1662  ElementType value1{};
1663  ElementType value2{};
1664 
1665  for( size_t k=kbegin; k<kend; ++k ) {
1666  value1 += A(i ,k) * B(k,j);
1667  value2 += A(i+1UL,k) * B(k,j);
1668  }
1669 
1670  (~C)(i ,j) = value1;
1671  (~C)(i+1UL,j) = value2;
1672  }
1673 
1674  if( i < M )
1675  {
1676  const size_t kbegin( ( IsUpper<MT4>::value )
1677  ?( ( IsLower<MT5>::value )
1678  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1679  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1680  :( IsLower<MT5>::value ? j : 0UL ) );
1681 
1682  ElementType value{};
1683 
1684  for( size_t k=kbegin; k<K; ++k ) {
1685  value += A(i,k) * B(k,j);
1686  }
1687 
1688  (~C)(i,j) = value;
1689  }
1690  }
1691  }
1692 
1693  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
1694  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1695  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1696  for( size_t j=0UL; j<jend; ++j ) {
1697  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
1698  }
1699  }
1700  }
1701  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
1702  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
1703  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
1704  for( size_t i=0UL; i<iend; ++i ) {
1705  reset( (~C)(i,j) );
1706  }
1707  }
1708  }
1709  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
1710  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
1711  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
1712  for( size_t j=0UL; j<jend; ++j ) {
1713  reset( (~C)(i,j) );
1714  }
1715  }
1716  }
1717  }
1719  //**********************************************************************************************
1720 
1721  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
1736  template< typename MT3 // Type of the left-hand side target matrix
1737  , typename MT4 // Type of the left-hand side matrix operand
1738  , typename MT5 > // Type of the right-hand side matrix operand
1740  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
1741  {
1742  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1743 
1744  const size_t M( A.rows() );
1745  const size_t N( B.columns() );
1746  const size_t K( A.columns() );
1747 
1748  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
1749 
1750  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
1751  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
1752 
1753  if( LOW && UPP && M > SIMDSIZE*3UL ) {
1754  reset( ~C );
1755  }
1756 
1757  {
1758  size_t i( 0UL );
1759 
1761  {
1762  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1763  for( size_t j=0UL; j<N; ++j )
1764  {
1765  const size_t kbegin( ( IsLower<MT5>::value )
1766  ?( ( IsUpper<MT4>::value )
1767  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1768  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1769  :( IsUpper<MT4>::value ? i : 0UL ) );
1770  const size_t kend( ( IsUpper<MT5>::value )
1771  ?( ( IsLower<MT4>::value )
1772  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1773  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1774  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
1775 
1776  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1777 
1778  for( size_t k=kbegin; k<kend; ++k ) {
1779  const SIMDType b1( set( B(k,j) ) );
1780  xmm1 += A.load(i ,k) * b1;
1781  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1782  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1783  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1784  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1785  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
1786  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
1787  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
1788  }
1789 
1790  (~C).store( i , j, xmm1 );
1791  (~C).store( i+SIMDSIZE , j, xmm2 );
1792  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1793  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1794  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1795  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1796  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1797  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
1798  }
1799  }
1800  }
1801 
1802  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
1803  {
1804  size_t j( 0UL );
1805 
1806  for( ; (j+2UL) <= N; j+=2UL )
1807  {
1808  const size_t kbegin( ( IsLower<MT5>::value )
1809  ?( ( IsUpper<MT4>::value )
1810  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1811  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1812  :( IsUpper<MT4>::value ? i : 0UL ) );
1813  const size_t kend( ( IsUpper<MT5>::value )
1814  ?( ( IsLower<MT4>::value )
1815  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1816  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1817  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
1818 
1819  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1820 
1821  for( size_t k=kbegin; k<kend; ++k ) {
1822  const SIMDType a1( A.load(i ,k) );
1823  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1824  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1825  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1826  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
1827  const SIMDType b1( set( B(k,j ) ) );
1828  const SIMDType b2( set( B(k,j+1UL) ) );
1829  xmm1 += a1 * b1;
1830  xmm2 += a2 * b1;
1831  xmm3 += a3 * b1;
1832  xmm4 += a4 * b1;
1833  xmm5 += a5 * b1;
1834  xmm6 += a1 * b2;
1835  xmm7 += a2 * b2;
1836  xmm8 += a3 * b2;
1837  xmm9 += a4 * b2;
1838  xmm10 += a5 * b2;
1839  }
1840 
1841  (~C).store( i , j , xmm1 );
1842  (~C).store( i+SIMDSIZE , j , xmm2 );
1843  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1844  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1845  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
1846  (~C).store( i , j+1UL, xmm6 );
1847  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
1848  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
1849  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
1850  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
1851  }
1852 
1853  if( j < N )
1854  {
1855  const size_t kbegin( ( IsLower<MT5>::value )
1856  ?( ( IsUpper<MT4>::value )
1857  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1858  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1859  :( IsUpper<MT4>::value ? i : 0UL ) );
1860  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
1861 
1862  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1863 
1864  for( size_t k=kbegin; k<kend; ++k ) {
1865  const SIMDType b1( set( B(k,j) ) );
1866  xmm1 += A.load(i ,k) * b1;
1867  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1868  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1869  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1870  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
1871  }
1872 
1873  (~C).store( i , j, xmm1 );
1874  (~C).store( i+SIMDSIZE , j, xmm2 );
1875  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1876  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1877  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1878  }
1879  }
1880 
1881  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1882  {
1883  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
1884  size_t j( UPP ? i : 0UL );
1885 
1886  for( ; (j+2UL) <= jend; j+=2UL )
1887  {
1888  const size_t kbegin( ( IsLower<MT5>::value )
1889  ?( ( IsUpper<MT4>::value )
1890  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1891  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1892  :( IsUpper<MT4>::value ? i : 0UL ) );
1893  const size_t kend( ( IsUpper<MT5>::value )
1894  ?( ( IsLower<MT4>::value )
1895  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1896  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1897  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
1898 
1899  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1900 
1901  for( size_t k=kbegin; k<kend; ++k ) {
1902  const SIMDType a1( A.load(i ,k) );
1903  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1904  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1905  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1906  const SIMDType b1( set( B(k,j ) ) );
1907  const SIMDType b2( set( B(k,j+1UL) ) );
1908  xmm1 += a1 * b1;
1909  xmm2 += a2 * b1;
1910  xmm3 += a3 * b1;
1911  xmm4 += a4 * b1;
1912  xmm5 += a1 * b2;
1913  xmm6 += a2 * b2;
1914  xmm7 += a3 * b2;
1915  xmm8 += a4 * b2;
1916  }
1917 
1918  (~C).store( i , j , xmm1 );
1919  (~C).store( i+SIMDSIZE , j , xmm2 );
1920  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1921  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1922  (~C).store( i , j+1UL, xmm5 );
1923  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1924  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1925  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
1926  }
1927 
1928  if( j < jend )
1929  {
1930  const size_t kbegin( ( IsLower<MT5>::value )
1931  ?( ( IsUpper<MT4>::value )
1932  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1933  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1934  :( IsUpper<MT4>::value ? i : 0UL ) );
1935  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
1936 
1937  SIMDType xmm1, xmm2, xmm3, xmm4;
1938 
1939  for( size_t k=kbegin; k<kend; ++k ) {
1940  const SIMDType b1( set( B(k,j) ) );
1941  xmm1 += A.load(i ,k) * b1;
1942  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
1943  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
1944  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
1945  }
1946 
1947  (~C).store( i , j, xmm1 );
1948  (~C).store( i+SIMDSIZE , j, xmm2 );
1949  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1950  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1951  }
1952  }
1953 
1954  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1955  {
1956  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
1957  size_t j( UPP ? i : 0UL );
1958 
1959  for( ; (j+2UL) <= jend; j+=2UL )
1960  {
1961  const size_t kbegin( ( IsLower<MT5>::value )
1962  ?( ( IsUpper<MT4>::value )
1963  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1964  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1965  :( IsUpper<MT4>::value ? i : 0UL ) );
1966  const size_t kend( ( IsUpper<MT5>::value )
1967  ?( ( IsLower<MT4>::value )
1968  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1969  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1970  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
1971 
1972  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1973 
1974  for( size_t k=kbegin; k<kend; ++k ) {
1975  const SIMDType a1( A.load(i ,k) );
1976  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1977  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1978  const SIMDType b1( set( B(k,j ) ) );
1979  const SIMDType b2( set( B(k,j+1UL) ) );
1980  xmm1 += a1 * b1;
1981  xmm2 += a2 * b1;
1982  xmm3 += a3 * b1;
1983  xmm4 += a1 * b2;
1984  xmm5 += a2 * b2;
1985  xmm6 += a3 * b2;
1986  }
1987 
1988  (~C).store( i , j , xmm1 );
1989  (~C).store( i+SIMDSIZE , j , xmm2 );
1990  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1991  (~C).store( i , j+1UL, xmm4 );
1992  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
1993  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
1994  }
1995 
1996  if( j < jend )
1997  {
1998  const size_t kbegin( ( IsLower<MT5>::value )
1999  ?( ( IsUpper<MT4>::value )
2000  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2001  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2002  :( IsUpper<MT4>::value ? i : 0UL ) );
2003  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
2004 
2005  SIMDType xmm1, xmm2, xmm3;
2006 
2007  for( size_t k=kbegin; k<kend; ++k ) {
2008  const SIMDType b1( set( B(k,j) ) );
2009  xmm1 += A.load(i ,k) * b1;
2010  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
2011  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
2012  }
2013 
2014  (~C).store( i , j, xmm1 );
2015  (~C).store( i+SIMDSIZE , j, xmm2 );
2016  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2017  }
2018  }
2019 
2020  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2021  {
2022  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
2023  size_t j( UPP ? i : 0UL );
2024 
2025  for( ; (j+4UL) <= jend; j+=4UL )
2026  {
2027  const size_t kbegin( ( IsLower<MT5>::value )
2028  ?( ( IsUpper<MT4>::value )
2029  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2030  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2031  :( IsUpper<MT4>::value ? i : 0UL ) );
2032  const size_t kend( ( IsUpper<MT5>::value )
2033  ?( ( IsLower<MT4>::value )
2034  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
2035  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
2036  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2037 
2038  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2039 
2040  for( size_t k=kbegin; k<kend; ++k ) {
2041  const SIMDType a1( A.load(i ,k) );
2042  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2043  const SIMDType b1( set( B(k,j ) ) );
2044  const SIMDType b2( set( B(k,j+1UL) ) );
2045  const SIMDType b3( set( B(k,j+2UL) ) );
2046  const SIMDType b4( set( B(k,j+3UL) ) );
2047  xmm1 += a1 * b1;
2048  xmm2 += a2 * b1;
2049  xmm3 += a1 * b2;
2050  xmm4 += a2 * b2;
2051  xmm5 += a1 * b3;
2052  xmm6 += a2 * b3;
2053  xmm7 += a1 * b4;
2054  xmm8 += a2 * b4;
2055  }
2056 
2057  (~C).store( i , j , xmm1 );
2058  (~C).store( i+SIMDSIZE, j , xmm2 );
2059  (~C).store( i , j+1UL, xmm3 );
2060  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2061  (~C).store( i , j+2UL, xmm5 );
2062  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2063  (~C).store( i , j+3UL, xmm7 );
2064  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
2065  }
2066 
2067  for( ; (j+3UL) <= jend; j+=3UL )
2068  {
2069  const size_t kbegin( ( IsLower<MT5>::value )
2070  ?( ( IsUpper<MT4>::value )
2071  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2072  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2073  :( IsUpper<MT4>::value ? i : 0UL ) );
2074  const size_t kend( ( IsUpper<MT5>::value )
2075  ?( ( IsLower<MT4>::value )
2076  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
2077  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
2078  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2079 
2080  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2081 
2082  for( size_t k=kbegin; k<kend; ++k ) {
2083  const SIMDType a1( A.load(i ,k) );
2084  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2085  const SIMDType b1( set( B(k,j ) ) );
2086  const SIMDType b2( set( B(k,j+1UL) ) );
2087  const SIMDType b3( set( B(k,j+2UL) ) );
2088  xmm1 += a1 * b1;
2089  xmm2 += a2 * b1;
2090  xmm3 += a1 * b2;
2091  xmm4 += a2 * b2;
2092  xmm5 += a1 * b3;
2093  xmm6 += a2 * b3;
2094  }
2095 
2096  (~C).store( i , j , xmm1 );
2097  (~C).store( i+SIMDSIZE, j , xmm2 );
2098  (~C).store( i , j+1UL, xmm3 );
2099  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
2100  (~C).store( i , j+2UL, xmm5 );
2101  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
2102  }
2103 
2104  for( ; (j+2UL) <= jend; j+=2UL )
2105  {
2106  const size_t kbegin( ( IsLower<MT5>::value )
2107  ?( ( IsUpper<MT4>::value )
2108  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2109  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2110  :( IsUpper<MT4>::value ? i : 0UL ) );
2111  const size_t kend( ( IsUpper<MT5>::value )
2112  ?( ( IsLower<MT4>::value )
2113  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2114  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2115  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
2116 
2117  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2118  size_t k( kbegin );
2119 
2120  for( ; (k+2UL) <= kend; k+=2UL ) {
2121  const SIMDType a1( A.load(i ,k ) );
2122  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
2123  const SIMDType a3( A.load(i ,k+1UL) );
2124  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
2125  const SIMDType b1( set( B(k ,j ) ) );
2126  const SIMDType b2( set( B(k ,j+1UL) ) );
2127  const SIMDType b3( set( B(k+1UL,j ) ) );
2128  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
2129  xmm1 += a1 * b1;
2130  xmm2 += a2 * b1;
2131  xmm3 += a1 * b2;
2132  xmm4 += a2 * b2;
2133  xmm5 += a3 * b3;
2134  xmm6 += a4 * b3;
2135  xmm7 += a3 * b4;
2136  xmm8 += a4 * b4;
2137  }
2138 
2139  for( ; k<kend; ++k ) {
2140  const SIMDType a1( A.load(i ,k) );
2141  const SIMDType a2( A.load(i+SIMDSIZE,k) );
2142  const SIMDType b1( set( B(k,j ) ) );
2143  const SIMDType b2( set( B(k,j+1UL) ) );
2144  xmm1 += a1 * b1;
2145  xmm2 += a2 * b1;
2146  xmm3 += a1 * b2;
2147  xmm4 += a2 * b2;
2148  }
2149 
2150  (~C).store( i , j , xmm1+xmm5 );
2151  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
2152  (~C).store( i , j+1UL, xmm3+xmm7 );
2153  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
2154  }
2155 
2156  if( j < jend )
2157  {
2158  const size_t kbegin( ( IsLower<MT5>::value )
2159  ?( ( IsUpper<MT4>::value )
2160  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2161  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2162  :( IsUpper<MT4>::value ? i : 0UL ) );
2163  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
2164 
2165  SIMDType xmm1, xmm2, xmm3, xmm4;
2166  size_t k( kbegin );
2167 
2168  for( ; (k+2UL) <= kend; k+=2UL ) {
2169  const SIMDType b1( set( B(k ,j) ) );
2170  const SIMDType b2( set( B(k+1UL,j) ) );
2171  xmm1 += A.load(i ,k ) * b1;
2172  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
2173  xmm3 += A.load(i ,k+1UL) * b2;
2174  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
2175  }
2176 
2177  for( ; k<kend; ++k ) {
2178  const SIMDType b1( set( B(k,j) ) );
2179  xmm1 += A.load(i ,k) * b1;
2180  xmm2 += A.load(i+SIMDSIZE,k) * b1;
2181  }
2182 
2183  (~C).store( i , j, xmm1+xmm3 );
2184  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
2185  }
2186  }
2187 
2188  for( ; i<ipos; i+=SIMDSIZE )
2189  {
2190  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
2191  size_t j( UPP ? i : 0UL );
2192 
2193  for( ; (j+4UL) <= jend; j+=4UL )
2194  {
2195  const size_t kbegin( ( IsLower<MT5>::value )
2196  ?( ( IsUpper<MT4>::value )
2197  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2198  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2199  :( IsUpper<MT4>::value ? i : 0UL ) );
2200  const size_t kend( ( IsUpper<MT5>::value )
2201  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
2202  :( K ) );
2203 
2204  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2205  size_t k( kbegin );
2206 
2207  for( ; (k+2UL) <= kend; k+=2UL ) {
2208  const SIMDType a1( A.load(i,k ) );
2209  const SIMDType a2( A.load(i,k+1UL) );
2210  xmm1 += a1 * set( B(k ,j ) );
2211  xmm2 += a1 * set( B(k ,j+1UL) );
2212  xmm3 += a1 * set( B(k ,j+2UL) );
2213  xmm4 += a1 * set( B(k ,j+3UL) );
2214  xmm5 += a2 * set( B(k+1UL,j ) );
2215  xmm6 += a2 * set( B(k+1UL,j+1UL) );
2216  xmm7 += a2 * set( B(k+1UL,j+2UL) );
2217  xmm8 += a2 * set( B(k+1UL,j+3UL) );
2218  }
2219 
2220  for( ; k<kend; ++k ) {
2221  const SIMDType a1( A.load(i,k) );
2222  xmm1 += a1 * set( B(k,j ) );
2223  xmm2 += a1 * set( B(k,j+1UL) );
2224  xmm3 += a1 * set( B(k,j+2UL) );
2225  xmm4 += a1 * set( B(k,j+3UL) );
2226  }
2227 
2228  (~C).store( i, j , xmm1+xmm5 );
2229  (~C).store( i, j+1UL, xmm2+xmm6 );
2230  (~C).store( i, j+2UL, xmm3+xmm7 );
2231  (~C).store( i, j+3UL, xmm4+xmm8 );
2232  }
2233 
2234  for( ; (j+3UL) <= jend; j+=3UL )
2235  {
2236  const size_t kbegin( ( IsLower<MT5>::value )
2237  ?( ( IsUpper<MT4>::value )
2238  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2239  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2240  :( IsUpper<MT4>::value ? i : 0UL ) );
2241  const size_t kend( ( IsUpper<MT5>::value )
2242  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
2243  :( K ) );
2244 
2245  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2246  size_t k( kbegin );
2247 
2248  for( ; (k+2UL) <= kend; k+=2UL ) {
2249  const SIMDType a1( A.load(i,k ) );
2250  const SIMDType a2( A.load(i,k+1UL) );
2251  xmm1 += a1 * set( B(k ,j ) );
2252  xmm2 += a1 * set( B(k ,j+1UL) );
2253  xmm3 += a1 * set( B(k ,j+2UL) );
2254  xmm4 += a2 * set( B(k+1UL,j ) );
2255  xmm5 += a2 * set( B(k+1UL,j+1UL) );
2256  xmm6 += a2 * set( B(k+1UL,j+2UL) );
2257  }
2258 
2259  for( ; k<kend; ++k ) {
2260  const SIMDType a1( A.load(i,k) );
2261  xmm1 += a1 * set( B(k,j ) );
2262  xmm2 += a1 * set( B(k,j+1UL) );
2263  xmm3 += a1 * set( B(k,j+2UL) );
2264  }
2265 
2266  (~C).store( i, j , xmm1+xmm4 );
2267  (~C).store( i, j+1UL, xmm2+xmm5 );
2268  (~C).store( i, j+2UL, xmm3+xmm6 );
2269  }
2270 
2271  for( ; (j+2UL) <= jend; j+=2UL )
2272  {
2273  const size_t kbegin( ( IsLower<MT5>::value )
2274  ?( ( IsUpper<MT4>::value )
2275  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2276  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2277  :( IsUpper<MT4>::value ? i : 0UL ) );
2278  const size_t kend( ( IsUpper<MT5>::value )
2279  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2280  :( K ) );
2281 
2282  SIMDType xmm1, xmm2, xmm3, xmm4;
2283  size_t k( kbegin );
2284 
2285  for( ; (k+2UL) <= kend; k+=2UL ) {
2286  const SIMDType a1( A.load(i,k ) );
2287  const SIMDType a2( A.load(i,k+1UL) );
2288  xmm1 += a1 * set( B(k ,j ) );
2289  xmm2 += a1 * set( B(k ,j+1UL) );
2290  xmm3 += a2 * set( B(k+1UL,j ) );
2291  xmm4 += a2 * set( B(k+1UL,j+1UL) );
2292  }
2293 
2294  for( ; k<kend; ++k ) {
2295  const SIMDType a1( A.load(i,k) );
2296  xmm1 += a1 * set( B(k,j ) );
2297  xmm2 += a1 * set( B(k,j+1UL) );
2298  }
2299 
2300  (~C).store( i, j , xmm1+xmm3 );
2301  (~C).store( i, j+1UL, xmm2+xmm4 );
2302  }
2303 
2304  if( j < jend )
2305  {
2306  const size_t kbegin( ( IsLower<MT5>::value )
2307  ?( ( IsUpper<MT4>::value )
2308  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2309  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2310  :( IsUpper<MT4>::value ? i : 0UL ) );
2311 
2312  SIMDType xmm1, xmm2;
2313  size_t k( kbegin );
2314 
2315  for( ; (k+2UL) <= K; k+=2UL ) {
2316  xmm1 += A.load(i,k ) * set( B(k ,j) );
2317  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
2318  }
2319 
2320  for( ; k<K; ++k ) {
2321  xmm1 += A.load(i,k) * set( B(k,j) );
2322  }
2323 
2324  (~C).store( i, j, xmm1+xmm2 );
2325  }
2326  }
2327 
2328  for( ; remainder && i<M; ++i )
2329  {
2330  size_t j( LOW && UPP ? i : 0UL );
2331 
2332  for( ; (j+2UL) <= N; j+=2UL )
2333  {
2334  const size_t kbegin( ( IsLower<MT5>::value )
2335  ?( ( IsUpper<MT4>::value )
2336  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2337  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2338  :( IsUpper<MT4>::value ? i : 0UL ) );
2339  const size_t kend( ( IsUpper<MT5>::value )
2340  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2341  :( K ) );
2342 
2343  ElementType value1{};
2344  ElementType value2{};
2345 
2346  for( size_t k=kbegin; k<kend; ++k ) {
2347  value1 += A(i,k) * B(k,j );
2348  value2 += A(i,k) * B(k,j+1UL);
2349  }
2350 
2351  (~C)(i,j ) = value1;
2352  (~C)(i,j+1UL) = value2;
2353  }
2354 
2355  if( j < N )
2356  {
2357  const size_t kbegin( ( IsLower<MT5>::value )
2358  ?( ( IsUpper<MT4>::value )
2359  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2360  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2361  :( IsUpper<MT4>::value ? i : 0UL ) );
2362 
2363  ElementType value{};
2364 
2365  for( size_t k=kbegin; k<K; ++k ) {
2366  value += A(i,k) * B(k,j);
2367  }
2368 
2369  (~C)(i,j) = value;
2370  }
2371  }
2372  }
2373 
2374  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
2375  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
2376  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
2377  for( size_t i=0UL; i<iend; ++i ) {
2378  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
2379  }
2380  }
2381  }
2382  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
2383  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
2384  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
2385  for( size_t i=0UL; i<iend; ++i ) {
2386  reset( (~C)(i,j) );
2387  }
2388  }
2389  }
2390  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
2391  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
2392  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
2393  for( size_t j=0UL; j<jend; ++j ) {
2394  reset( (~C)(i,j) );
2395  }
2396  }
2397  }
2398  }
2400  //**********************************************************************************************
2401 
2402  //**Default assignment to dense matrices (large matrices)***************************************
// Default large-matrix assignment kernel: does no work of its own and simply forwards the
// target C and the two operands A and B to selectDefaultAssignKernel().
// NOTE(review): the embedded listing numbers jump from 2418 to 2420, so the line carrying the
// return type of this overload is not part of this listing -- confirm it in the original header.
2416  template< typename MT3 // Type of the left-hand side target matrix
2417  , typename MT4 // Type of the left-hand side matrix operand
2418  , typename MT5 > // Type of the right-hand side matrix operand
2420  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2421  {
2422  selectDefaultAssignKernel( C, A, B );
2423  }
2425  //**********************************************************************************************
2426 
2427  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Vectorized large-matrix assignment kernel: picks one of the smmm/hmmm/lmmm/ummm/mmm kernels
// based on the SYM, HERM, LOW and UPP flags (declared outside this listing). The flags are
// tested in that order, so SYM takes precedence over HERM, HERM over LOW, and LOW over UPP.
// NOTE(review): listing line 2445 (the return type of this overload) is not part of this
// listing -- confirm it in the original header.
2442  template< typename MT3 // Type of the left-hand side target matrix
2443  , typename MT4 // Type of the left-hand side matrix operand
2444  , typename MT5 > // Type of the right-hand side matrix operand
2446  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B )
2447  {
// smmm/hmmm receive a single scalar argument, ElementType(1).
2448  if( SYM )
2449  smmm( C, A, B, ElementType(1) );
2450  else if( HERM )
2451  hmmm( C, A, B, ElementType(1) );
// lmmm/ummm/mmm receive two scalars, ElementType(1) and ElementType(0); presumably the
// scaling factors for A*B and for the previous content of C -- TODO confirm against MMM.h.
2452  else if( LOW )
2453  lmmm( C, A, B, ElementType(1), ElementType(0) );
2454  else if( UPP )
2455  ummm( C, A, B, ElementType(1), ElementType(0) );
2456  else
2457  mmm( C, A, B, ElementType(1), ElementType(0) );
2458  }
2460  //**********************************************************************************************
2461 
2462  //**BLAS-based assignment to dense matrices (default)*******************************************
// Default BLAS-stage assignment kernel: no BLAS call is made here, the call is forwarded
// unchanged to selectLargeAssignKernel().
// NOTE(review): listing line 2479 (the return type of this overload) is not part of this
// listing -- confirm it in the original header.
2476  template< typename MT3 // Type of the left-hand side target matrix
2477  , typename MT4 // Type of the left-hand side matrix operand
2478  , typename MT5 > // Type of the right-hand side matrix operand
2480  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2481  {
2482  selectLargeAssignKernel( C, A, B );
2483  }
2485  //**********************************************************************************************
2486 
2487  //**BLAS-based assignment to dense matrices*****************************************************
2488 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2489 
// BLAS-based assignment kernel. Chooses between trmm() and gemm() depending on whether one of
// the operands is triangular; a triangular A is checked first and therefore wins if both are.
// NOTE(review): listing line 2505 (the return type of this overload) is not part of this
// listing -- confirm it in the original header.
2502  template< typename MT3 // Type of the left-hand side target matrix
2503  , typename MT4 // Type of the left-hand side matrix operand
2504  , typename MT5 > // Type of the right-hand side matrix operand
2506  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B )
2507  {
// Scalars passed to the BLAS wrappers are built from the element type of the target.
2508  using ET = ElementType_<MT3>;
2509 
// Triangular A: assign B to C first, then call trmm() on C with A on the left side
// (presumably an in-place C = A*C -- TODO confirm against blaze/math/blas/trmm.h).
2510  if( IsTriangular<MT4>::value ) {
2511  assign( C, B );
2512  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2513  }
// Triangular B: assign A to C first, then call trmm() on C with B on the right side.
2514  else if( IsTriangular<MT5>::value ) {
2515  assign( C, A );
2516  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2517  }
// General case: plain gemm() with the scalars ET(1) and ET(0).
2518  else {
2519  gemm( C, A, B, ET(1), ET(0) );
2520  }
2521  }
2523 #endif
2524  //**********************************************************************************************
2525 
2526  //**Assignment to sparse matrices***************************************************************
// Assignment of the multiplication expression to a sparse matrix: the expression is evaluated
// serially into a temporary of type TmpType, which is then passed through the forward functor
// and assigned to the sparse target.
// NOTE(review): listing lines 2543, 2545 and 2547-2552 are not part of this listing; the
// definition of TmpType (and anything else declared there) is therefore not visible here.
2539  template< typename MT // Type of the target sparse matrix
2540  , bool SO > // Storage order of the target sparse matrix
2541  friend inline void assign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2542  {
2544 
2546 
2553 
// The target must already have the dimensions of the expression.
2554  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2555  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2556 
2557  const ForwardFunctor fwd;
2558 
// serial() wraps the expression before it is evaluated into the temporary.
2559  const TmpType tmp( serial( rhs ) );
2560  assign( ~lhs, fwd( tmp ) );
2561  }
2563  //**********************************************************************************************
2564 
2565  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the multiplication expression to a dense matrix (lhs += A*B).
// Evaluates both operands and hands them to selectAddAssignKernel(), which picks the kernel.
2578  template< typename MT // Type of the target dense matrix
2579  , bool SO > // Storage order of the target dense matrix
2580  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
2581  {
2583 
2584  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
2585  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
2586 
// Nothing to add if the target is empty or the inner dimension of the product is zero.
2587  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2588  return;
2589  }
2590 
2591  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
2592  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
2593 
// Sanity checks: the evaluated operands keep the operand dimensions and match the target.
2594  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
2595  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
2596  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
2597  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
2598  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
2599  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
2600 
2601  TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
2602  }
2604  //**********************************************************************************************
2605 
2606  //**Addition assignment to dense matrices (kernel selection)************************************
// Kernel selection for the addition assignment: calls selectSmallAddAssignKernel() when one of
// the listed conditions holds, otherwise selectBlasAddAssignKernel().
// NOTE(review): listing line 2622 is not part of this listing; it holds the opening of the
// `if` statement and possibly a further leading condition -- confirm in the original header.
2617  template< typename MT3 // Type of the left-hand side target matrix
2618  , typename MT4 // Type of the left-hand side matrix operand
2619  , typename MT5 > // Type of the right-hand side matrix operand
2620  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2621  {
// Visible conditions for the small kernel (the first two only outside of debug mode):
//  - row-major target and B has at most SIMDSIZE*10 columns
//  - column-major target and A has at most SIMDSIZE*10 rows
//  - the number of target elements is below TDMATDMATMULT_THRESHOLD
2623  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
2624  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
2625  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2626  selectSmallAddAssignKernel( C, A, B );
2627  else
2628  selectBlasAddAssignKernel( C, A, B );
2629  }
2631  //**********************************************************************************************
2632 
2633  //**Default addition assignment to row-major dense matrices (general/general)*******************
// Default addition assignment C += A*B for a row-major target where neither operand is
// diagonal. Loop order is i (row of C), k (inner index), j (column of C); the k and j ranges
// are narrowed according to the triangular structure of A and B and to the LOW/UPP flags
// (declared outside this listing; presumably they mark a lower/upper result -- TODO confirm).
// NOTE(review): listing lines 2672 and 2677 are not part of this listing; each holds the
// condition of the inner ternary whose two branches follow.
2647  template< typename MT3 // Type of the left-hand side target matrix
2648  , typename MT4 // Type of the left-hand side matrix operand
2649  , typename MT5 > // Type of the right-hand side matrix operand
2650  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2651  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2652  {
2653  const size_t M( A.rows() );
2654  const size_t N( B.columns() );
2655  const size_t K( A.columns() );
2656 
// The LOW/UPP index arithmetic below relies on a square result.
2657  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2658 
2659  for( size_t i=0UL; i<M; ++i )
2660  {
// k range for row i of A: starts at i (i+1 if strictly upper) when A is upper, ends at
// i+1 (i if strictly lower) when A is lower; otherwise the full range [0,K).
2661  const size_t kbegin( ( IsUpper<MT4>::value )
2662  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2663  :( 0UL ) );
2664  const size_t kend( ( IsLower<MT4>::value )
2665  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2666  :( K ) );
2667  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2668 
2669  for( size_t k=kbegin; k<kend; ++k )
2670  {
// j range for row k of B, additionally clipped to j >= i when UPP is set ...
2671  const size_t jbegin( ( IsUpper<MT5>::value )
2673  ?( UPP ? max(i,k+1UL) : k+1UL )
2674  :( UPP ? max(i,k) : k ) )
2675  :( UPP ? i : 0UL ) );
// ... and to j <= i when LOW is set.
2676  const size_t jend( ( IsLower<MT5>::value )
2678  ?( LOW ? min(i+1UL,k) : k )
2679  :( LOW ? min(i,k)+1UL : k+1UL ) )
2680  :( LOW ? i+1UL : N ) );
2681 
// With LOW/UPP clipping the range may be empty or inverted; skip it before computing
// the unsigned difference below.
2682  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
2683  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2684 
// jnum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even number,
// so the loop below can process two columns per iteration.
2685  const size_t jnum( jend - jbegin );
2686  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2687 
2688  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2689  (~C)(i,j ) += A(i,k) * B(k,j );
2690  (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2691  }
// Remaining single column when the count is odd.
2692  if( jpos < jend ) {
2693  (~C)(i,jpos) += A(i,k) * B(k,jpos);
2694  }
2695  }
2696  }
2697  }
2699  //**********************************************************************************************
2700 
2701  //**Default addition assignment to column-major dense matrices (general/general)****************
// Default addition assignment C += A*B for a column-major target where neither operand is
// diagonal. Loop order is j (column of C), k (inner index), i (row of C); the k and i ranges
// are narrowed according to the triangular structure of B and A and to the LOW/UPP flags
// (declared outside this listing; presumably they mark a lower/upper result -- TODO confirm).
// NOTE(review): listing lines 2740 and 2745 are not part of this listing; each holds the
// condition of the inner ternary whose two branches follow.
2715  template< typename MT3 // Type of the left-hand side target matrix
2716  , typename MT4 // Type of the left-hand side matrix operand
2717  , typename MT5 > // Type of the right-hand side matrix operand
2718  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2719  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2720  {
2721  const size_t M( A.rows() );
2722  const size_t N( B.columns() );
2723  const size_t K( A.columns() );
2724 
// The LOW/UPP index arithmetic below relies on a square result.
2725  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
2726 
2727  for( size_t j=0UL; j<N; ++j )
2728  {
// k range for column j of B: starts at j (j+1 if strictly lower) when B is lower, ends at
// j+1 (j if strictly upper) when B is upper; otherwise the full range [0,K).
2729  const size_t kbegin( ( IsLower<MT5>::value )
2730  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2731  :( 0UL ) );
2732  const size_t kend( ( IsUpper<MT5>::value )
2733  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2734  :( K ) );
2735  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
2736 
2737  for( size_t k=kbegin; k<kend; ++k )
2738  {
// i range for column k of A, additionally clipped to i >= j when LOW is set ...
2739  const size_t ibegin( ( IsLower<MT4>::value )
2741  ?( LOW ? max(j,k+1UL) : k+1UL )
2742  :( LOW ? max(j,k) : k ) )
2743  :( LOW ? j : 0UL ) );
// ... and to i <= j when UPP is set.
2744  const size_t iend( ( IsUpper<MT4>::value )
2746  ?( UPP ? min(j+1UL,k) : k )
2747  :( UPP ? min(j,k)+1UL : k+1UL ) )
2748  :( UPP ? j+1UL : M ) );
2749 
// With LOW/UPP clipping the range may be empty or inverted; skip it before computing
// the unsigned difference below.
2750  if( ( LOW || UPP ) && ibegin >= iend ) continue;
2751  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2752 
// inum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even number,
// so the loop below can process two rows per iteration.
2753  const size_t inum( iend - ibegin );
2754  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2755 
2756  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2757  (~C)(i ,j) += A(i ,k) * B(k,j);
2758  (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2759  }
// Remaining single row when the count is odd.
2760  if( ipos < iend ) {
2761  (~C)(ipos,j) += A(ipos,k) * B(k,j);
2762  }
2763  }
2764  }
2765  }
2767  //**********************************************************************************************
2768 
2769  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
// Default addition assignment C += A*B for a row-major target with a non-diagonal A and a
// diagonal B. Only the diagonal element B(j,j) is read for column j, so each target element
// is updated as C(i,j) += A(i,j) * B(j,j). The index space is traversed in square tiles of
// BLOCK_SIZE x BLOCK_SIZE elements.
2783  template< typename MT3 // Type of the left-hand side target matrix
2784  , typename MT4 // Type of the left-hand side matrix operand
2785  , typename MT5 > // Type of the right-hand side matrix operand
2786  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2787  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2788  {
2789  constexpr size_t block( BLOCK_SIZE );
2790 
2791  const size_t M( A.rows() );
2792  const size_t N( B.columns() );
2793 
// ii/jj are the top-left corner of the current tile; iend/jend clip the tile to the matrix.
2794  for( size_t ii=0UL; ii<M; ii+=block ) {
2795  const size_t iend( min( M, ii+block ) );
2796  for( size_t jj=0UL; jj<N; jj+=block ) {
2797  const size_t jend( min( N, jj+block ) );
2798  for( size_t i=ii; i<iend; ++i )
2799  {
// Column range inside the tile, narrowed by the triangular structure of A: columns
// start at i (i+1 if strictly upper) for an upper A and end at i+1 (i if strictly
// lower) for a lower A. If jbegin >= jpos the loop below simply does not run.
2800  const size_t jbegin( ( IsUpper<MT4>::value )
2801  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
2802  :( jj ) );
2803  const size_t jpos( ( IsLower<MT4>::value )
2804  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
2805  :( jend ) );
2806 
2807  for( size_t j=jbegin; j<jpos; ++j ) {
2808  (~C)(i,j) += A(i,j) * B(j,j);
2809  }
2810  }
2811  }
2812  }
2813  }
2815  //**********************************************************************************************
2816 
2817  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
// Default addition assignment C += A*B for a column-major target with a non-diagonal A and a
// diagonal B. Only the diagonal element B(j,j) is read for column j, so each target element
// is updated as C(i,j) += A(i,j) * B(j,j), column by column.
2831  template< typename MT3 // Type of the left-hand side target matrix
2832  , typename MT4 // Type of the left-hand side matrix operand
2833  , typename MT5 > // Type of the right-hand side matrix operand
2834  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2835  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2836  {
2837  const size_t M( A.rows() );
2838  const size_t N( B.columns() );
2839 
2840  for( size_t j=0UL; j<N; ++j )
2841  {
// Row range for column j of A: starts at j (j+1 if strictly lower) for a lower A and
// ends at j+1 (j if strictly upper) for an upper A; otherwise the full range [0,M).
2842  const size_t ibegin( ( IsLower<MT4>::value )
2843  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2844  :( 0UL ) );
2845  const size_t iend( ( IsUpper<MT4>::value )
2846  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
2847  :( M ) );
2848  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
2849 
// inum & size_t(-2) rounds the row count down to an even number for the two-row loop.
2850  const size_t inum( iend - ibegin );
2851  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
2852 
2853  for( size_t i=ibegin; i<ipos; i+=2UL ) {
2854  (~C)(i ,j) += A(i ,j) * B(j,j);
2855  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
2856  }
// Remaining single row when the count is odd.
2857  if( ipos < iend ) {
2858  (~C)(ipos,j) += A(ipos,j) * B(j,j);
2859  }
2860  }
2861  }
2863  //**********************************************************************************************
2864 
2865  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
// Default addition assignment C += A*B for a row-major target with a diagonal A and a
// non-diagonal B. Only the diagonal element A(i,i) is read for row i, so each target element
// is updated as C(i,j) += A(i,i) * B(i,j), row by row.
2879  template< typename MT3 // Type of the left-hand side target matrix
2880  , typename MT4 // Type of the left-hand side matrix operand
2881  , typename MT5 > // Type of the right-hand side matrix operand
2882  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2883  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
2884  {
2885  const size_t M( A.rows() );
2886  const size_t N( B.columns() );
2887 
2888  for( size_t i=0UL; i<M; ++i )
2889  {
// Column range for row i of B: starts at i (i+1 if strictly upper) for an upper B and
// ends at i+1 (i if strictly lower) for a lower B; otherwise the full range [0,N).
2890  const size_t jbegin( ( IsUpper<MT5>::value )
2891  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2892  :( 0UL ) );
2893  const size_t jend( ( IsLower<MT5>::value )
2894  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
2895  :( N ) );
2896  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
2897 
// jnum & size_t(-2) rounds the column count down to an even number for the two-column loop.
2898  const size_t jnum( jend - jbegin );
2899  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
2900 
2901  for( size_t j=jbegin; j<jpos; j+=2UL ) {
2902  (~C)(i,j ) += A(i,i) * B(i,j );
2903  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
2904  }
// Remaining single column when the count is odd.
2905  if( jpos < jend ) {
2906  (~C)(i,jpos) += A(i,i) * B(i,jpos);
2907  }
2908  }
2909  }
2911  //**********************************************************************************************
2912 
2913  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
// Default addition assignment C += A*B for a column-major target with a diagonal A and a
// non-diagonal B. Only the diagonal element A(i,i) is read for row i, so each target element
// is updated as C(i,j) += A(i,i) * B(i,j). The index space is traversed in square tiles of
// BLOCK_SIZE x BLOCK_SIZE elements.
2927  template< typename MT3 // Type of the left-hand side target matrix
2928  , typename MT4 // Type of the left-hand side matrix operand
2929  , typename MT5 > // Type of the right-hand side matrix operand
2930  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2931  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
2932  {
2933  constexpr size_t block( BLOCK_SIZE );
2934 
2935  const size_t M( A.rows() );
2936  const size_t N( B.columns() );
2937 
// jj/ii are the top-left corner of the current tile; jend/iend clip the tile to the matrix.
2938  for( size_t jj=0UL; jj<N; jj+=block ) {
2939  const size_t jend( min( N, jj+block ) );
2940  for( size_t ii=0UL; ii<M; ii+=block ) {
2941  const size_t iend( min( M, ii+block ) );
2942  for( size_t j=jj; j<jend; ++j )
2943  {
// Row range inside the tile, narrowed by the triangular structure of B: rows start at
// j (j+1 if strictly lower) for a lower B and end at j+1 (j if strictly upper) for an
// upper B. If ibegin >= ipos the loop below simply does not run.
2944  const size_t ibegin( ( IsLower<MT5>::value )
2945  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
2946  :( ii ) );
2947  const size_t ipos( ( IsUpper<MT5>::value )
2948  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
2949  :( iend ) );
2950 
2951  for( size_t i=ibegin; i<ipos; ++i ) {
2952  (~C)(i,j) += A(i,i) * B(i,j);
2953  }
2954  }
2955  }
2956  }
2957  }
2959  //**********************************************************************************************
2960 
2961  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default addition assignment C += A*B when both operands are diagonal: only the diagonal
// of the target is touched, C(i,i) += A(i,i) * B(i,i) for every row i of A.
// Unlike the other overloads, C is taken as plain MT3& and indexed directly (no `~` operator).
2975  template< typename MT3 // Type of the left-hand side target matrix
2976  , typename MT4 // Type of the left-hand side matrix operand
2977  , typename MT5 > // Type of the right-hand side matrix operand
2978  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2979  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
2980  {
2981  for( size_t i=0UL; i<A.rows(); ++i ) {
2982  C(i,i) += A(i,i) * B(i,i);
2983  }
2984  }
2986  //**********************************************************************************************
2987 
2988  //**Default addition assignment to dense matrices (small matrices)******************************
// Default small-matrix addition assignment kernel: does no work of its own and simply
// forwards the target C and the operands A and B to selectDefaultAddAssignKernel().
// NOTE(review): the embedded listing numbers jump from 3004 to 3006, so the line carrying the
// return type of this overload is not part of this listing -- confirm it in the original header.
3002  template< typename MT3 // Type of the left-hand side target matrix
3003  , typename MT4 // Type of the left-hand side matrix operand
3004  , typename MT5 > // Type of the right-hand side matrix operand
3006  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
3007  {
3008  selectDefaultAddAssignKernel( C, A, B );
3009  }
3011  //**********************************************************************************************
3012 
3013  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
3028  template< typename MT3 // Type of the left-hand side target matrix
3029  , typename MT4 // Type of the left-hand side matrix operand
3030  , typename MT5 > // Type of the right-hand side matrix operand
3032  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
3033  {
3034  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
3035 
3036  const size_t M( A.rows() );
3037  const size_t N( B.columns() );
3038  const size_t K( A.columns() );
3039 
3040  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3041 
3042  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
3043  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
3044 
3045  size_t j( 0UL );
3046 
3048  {
3049  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
3050  for( size_t i=0UL; i<M; ++i )
3051  {
3052  const size_t kbegin( ( IsUpper<MT4>::value )
3053  ?( ( IsLower<MT5>::value )
3054  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3055  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3056  :( IsLower<MT5>::value ? j : 0UL ) );
3057  const size_t kend( ( IsLower<MT4>::value )
3058  ?( ( IsUpper<MT5>::value )
3059  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
3060  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
3061  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
3062 
3063  SIMDType xmm1( (~C).load(i,j ) );
3064  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3065  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3066  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3067  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3068  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
3069  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
3070  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
3071 
3072  for( size_t k=kbegin; k<kend; ++k ) {
3073  const SIMDType a1( set( A(i,k) ) );
3074  xmm1 += a1 * B.load(k,j );
3075  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3076  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3077  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3078  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3079  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
3080  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
3081  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
3082  }
3083 
3084  (~C).store( i, j , xmm1 );
3085  (~C).store( i, j+SIMDSIZE , xmm2 );
3086  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3087  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3088  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
3089  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
3090  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
3091  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
3092  }
3093  }
3094  }
3095 
3096  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
3097  {
3098  size_t i( 0UL );
3099 
3100  for( ; (i+2UL) <= M; i+=2UL )
3101  {
3102  const size_t kbegin( ( IsUpper<MT4>::value )
3103  ?( ( IsLower<MT5>::value )
3104  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3105  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3106  :( IsLower<MT5>::value ? j : 0UL ) );
3107  const size_t kend( ( IsLower<MT4>::value )
3108  ?( ( IsUpper<MT5>::value )
3109  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
3110  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3111  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
3112 
3113  SIMDType xmm1 ( (~C).load(i ,j ) );
3114  SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
3115  SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
3116  SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
3117  SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
3118  SIMDType xmm6 ( (~C).load(i+1UL,j ) );
3119  SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
3120  SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3121  SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3122  SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
3123 
3124  for( size_t k=kbegin; k<kend; ++k ) {
3125  const SIMDType a1( set( A(i ,k) ) );
3126  const SIMDType a2( set( A(i+1UL,k) ) );
3127  const SIMDType b1( B.load(k,j ) );
3128  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3129  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3130  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3131  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
3132  xmm1 += a1 * b1;
3133  xmm2 += a1 * b2;
3134  xmm3 += a1 * b3;
3135  xmm4 += a1 * b4;
3136  xmm5 += a1 * b5;
3137  xmm6 += a2 * b1;
3138  xmm7 += a2 * b2;
3139  xmm8 += a2 * b3;
3140  xmm9 += a2 * b4;
3141  xmm10 += a2 * b5;
3142  }
3143 
3144  (~C).store( i , j , xmm1 );
3145  (~C).store( i , j+SIMDSIZE , xmm2 );
3146  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3147  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3148  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
3149  (~C).store( i+1UL, j , xmm6 );
3150  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
3151  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
3152  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
3153  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
3154  }
3155 
3156  if( i < M )
3157  {
3158  const size_t kbegin( ( IsUpper<MT4>::value )
3159  ?( ( IsLower<MT5>::value )
3160  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3161  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3162  :( IsLower<MT5>::value ? j : 0UL ) );
3163  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
3164 
3165  SIMDType xmm1( (~C).load(i,j ) );
3166  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3167  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3168  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3169  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
3170 
3171  for( size_t k=kbegin; k<kend; ++k ) {
3172  const SIMDType a1( set( A(i,k) ) );
3173  xmm1 += a1 * B.load(k,j );
3174  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3175  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3176  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3177  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
3178  }
3179 
3180  (~C).store( i, j , xmm1 );
3181  (~C).store( i, j+SIMDSIZE , xmm2 );
3182  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3183  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3184  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
3185  }
3186  }
3187 
3188  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3189  {
3190  size_t i( 0UL );
3191 
3192  for( ; (i+2UL) <= M; i+=2UL )
3193  {
3194  const size_t kbegin( ( IsUpper<MT4>::value )
3195  ?( ( IsLower<MT5>::value )
3196  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3197  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3198  :( IsLower<MT5>::value ? j : 0UL ) );
3199  const size_t kend( ( IsLower<MT4>::value )
3200  ?( ( IsUpper<MT5>::value )
3201  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
3202  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3203  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
3204 
3205  SIMDType xmm1( (~C).load(i ,j ) );
3206  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3207  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3208  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
3209  SIMDType xmm5( (~C).load(i+1UL,j ) );
3210  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
3211  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3212  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
3213 
3214  for( size_t k=kbegin; k<kend; ++k ) {
3215  const SIMDType a1( set( A(i ,k) ) );
3216  const SIMDType a2( set( A(i+1UL,k) ) );
3217  const SIMDType b1( B.load(k,j ) );
3218  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3219  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3220  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
3221  xmm1 += a1 * b1;
3222  xmm2 += a1 * b2;
3223  xmm3 += a1 * b3;
3224  xmm4 += a1 * b4;
3225  xmm5 += a2 * b1;
3226  xmm6 += a2 * b2;
3227  xmm7 += a2 * b3;
3228  xmm8 += a2 * b4;
3229  }
3230 
3231  (~C).store( i , j , xmm1 );
3232  (~C).store( i , j+SIMDSIZE , xmm2 );
3233  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3234  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
3235  (~C).store( i+1UL, j , xmm5 );
3236  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
3237  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
3238  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
3239  }
3240 
3241  if( i < M )
3242  {
3243  const size_t kbegin( ( IsUpper<MT4>::value )
3244  ?( ( IsLower<MT5>::value )
3245  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3246  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3247  :( IsLower<MT5>::value ? j : 0UL ) );
3248  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
3249 
3250  SIMDType xmm1( (~C).load(i,j ) );
3251  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3252  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3253  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
3254 
3255  for( size_t k=kbegin; k<kend; ++k ) {
3256  const SIMDType a1( set( A(i,k) ) );
3257  xmm1 += a1 * B.load(k,j );
3258  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3259  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3260  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
3261  }
3262 
3263  (~C).store( i, j , xmm1 );
3264  (~C).store( i, j+SIMDSIZE , xmm2 );
3265  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3266  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
3267  }
3268  }
3269 
3270  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3271  {
3272  size_t i( 0UL );
3273 
3274  for( ; (i+2UL) <= M; i+=2UL )
3275  {
3276  const size_t kbegin( ( IsUpper<MT4>::value )
3277  ?( ( IsLower<MT5>::value )
3278  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3279  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3280  :( IsLower<MT5>::value ? j : 0UL ) );
3281  const size_t kend( ( IsLower<MT4>::value )
3282  ?( ( IsUpper<MT5>::value )
3283  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
3284  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3285  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
3286 
3287  SIMDType xmm1( (~C).load(i ,j ) );
3288  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
3289  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
3290  SIMDType xmm4( (~C).load(i+1UL,j ) );
3291  SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
3292  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
3293 
3294  for( size_t k=kbegin; k<kend; ++k ) {
3295  const SIMDType a1( set( A(i ,k) ) );
3296  const SIMDType a2( set( A(i+1UL,k) ) );
3297  const SIMDType b1( B.load(k,j ) );
3298  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
3299  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
3300  xmm1 += a1 * b1;
3301  xmm2 += a1 * b2;
3302  xmm3 += a1 * b3;
3303  xmm4 += a2 * b1;
3304  xmm5 += a2 * b2;
3305  xmm6 += a2 * b3;
3306  }
3307 
3308  (~C).store( i , j , xmm1 );
3309  (~C).store( i , j+SIMDSIZE , xmm2 );
3310  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
3311  (~C).store( i+1UL, j , xmm4 );
3312  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
3313  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
3314  }
3315 
3316  if( i < M )
3317  {
3318  const size_t kbegin( ( IsUpper<MT4>::value )
3319  ?( ( IsLower<MT5>::value )
3320  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3321  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3322  :( IsLower<MT5>::value ? j : 0UL ) );
3323  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
3324 
3325  SIMDType xmm1( (~C).load(i,j ) );
3326  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
3327  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
3328 
3329  for( size_t k=kbegin; k<kend; ++k ) {
3330  const SIMDType a1( set( A(i,k) ) );
3331  xmm1 += a1 * B.load(k,j );
3332  xmm2 += a1 * B.load(k,j+SIMDSIZE );
3333  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
3334  }
3335 
3336  (~C).store( i, j , xmm1 );
3337  (~C).store( i, j+SIMDSIZE , xmm2 );
3338  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
3339  }
3340  }
3341 
3342  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3343  {
3344  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
3345  size_t i( LOW ? j : 0UL );
3346 
3347  for( ; (i+4UL) <= iend; i+=4UL )
3348  {
3349  const size_t kbegin( ( IsUpper<MT4>::value )
3350  ?( ( IsLower<MT5>::value )
3351  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3352  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3353  :( IsLower<MT5>::value ? j : 0UL ) );
3354  const size_t kend( ( IsLower<MT4>::value )
3355  ?( ( IsUpper<MT5>::value )
3356  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
3357  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
3358  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
3359 
3360  SIMDType xmm1( (~C).load(i ,j ) );
3361  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3362  SIMDType xmm3( (~C).load(i+1UL,j ) );
3363  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3364  SIMDType xmm5( (~C).load(i+2UL,j ) );
3365  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3366  SIMDType xmm7( (~C).load(i+3UL,j ) );
3367  SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
3368 
3369  for( size_t k=kbegin; k<kend; ++k ) {
3370  const SIMDType a1( set( A(i ,k) ) );
3371  const SIMDType a2( set( A(i+1UL,k) ) );
3372  const SIMDType a3( set( A(i+2UL,k) ) );
3373  const SIMDType a4( set( A(i+3UL,k) ) );
3374  const SIMDType b1( B.load(k,j ) );
3375  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3376  xmm1 += a1 * b1;
3377  xmm2 += a1 * b2;
3378  xmm3 += a2 * b1;
3379  xmm4 += a2 * b2;
3380  xmm5 += a3 * b1;
3381  xmm6 += a3 * b2;
3382  xmm7 += a4 * b1;
3383  xmm8 += a4 * b2;
3384  }
3385 
3386  (~C).store( i , j , xmm1 );
3387  (~C).store( i , j+SIMDSIZE, xmm2 );
3388  (~C).store( i+1UL, j , xmm3 );
3389  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3390  (~C).store( i+2UL, j , xmm5 );
3391  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3392  (~C).store( i+3UL, j , xmm7 );
3393  (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
3394  }
3395 
3396  for( ; (i+3UL) <= iend; i+=3UL )
3397  {
3398  const size_t kbegin( ( IsUpper<MT4>::value )
3399  ?( ( IsLower<MT5>::value )
3400  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3401  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3402  :( IsLower<MT5>::value ? j : 0UL ) );
3403  const size_t kend( ( IsLower<MT4>::value )
3404  ?( ( IsUpper<MT5>::value )
3405  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
3406  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
3407  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
3408 
3409  SIMDType xmm1( (~C).load(i ,j ) );
3410  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3411  SIMDType xmm3( (~C).load(i+1UL,j ) );
3412  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3413  SIMDType xmm5( (~C).load(i+2UL,j ) );
3414  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
3415 
3416  for( size_t k=kbegin; k<kend; ++k ) {
3417  const SIMDType a1( set( A(i ,k) ) );
3418  const SIMDType a2( set( A(i+1UL,k) ) );
3419  const SIMDType a3( set( A(i+2UL,k) ) );
3420  const SIMDType b1( B.load(k,j ) );
3421  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3422  xmm1 += a1 * b1;
3423  xmm2 += a1 * b2;
3424  xmm3 += a2 * b1;
3425  xmm4 += a2 * b2;
3426  xmm5 += a3 * b1;
3427  xmm6 += a3 * b2;
3428  }
3429 
3430  (~C).store( i , j , xmm1 );
3431  (~C).store( i , j+SIMDSIZE, xmm2 );
3432  (~C).store( i+1UL, j , xmm3 );
3433  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
3434  (~C).store( i+2UL, j , xmm5 );
3435  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
3436  }
3437 
3438  for( ; (i+2UL) <= iend; i+=2UL )
3439  {
3440  const size_t kbegin( ( IsUpper<MT4>::value )
3441  ?( ( IsLower<MT5>::value )
3442  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3443  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3444  :( IsLower<MT5>::value ? j : 0UL ) );
3445  const size_t kend( ( IsLower<MT4>::value )
3446  ?( ( IsUpper<MT5>::value )
3447  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
3448  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3449  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
3450 
3451  SIMDType xmm1( (~C).load(i ,j ) );
3452  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
3453  SIMDType xmm3( (~C).load(i+1UL,j ) );
3454  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
3455  SIMDType xmm5, xmm6, xmm7, xmm8;
3456  size_t k( kbegin );
3457 
3458  for( ; (k+2UL) <= kend; k+=2UL ) {
3459  const SIMDType a1( set( A(i ,k ) ) );
3460  const SIMDType a2( set( A(i+1UL,k ) ) );
3461  const SIMDType a3( set( A(i ,k+1UL) ) );
3462  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
3463  const SIMDType b1( B.load(k ,j ) );
3464  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
3465  const SIMDType b3( B.load(k+1UL,j ) );
3466  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
3467  xmm1 += a1 * b1;
3468  xmm2 += a1 * b2;
3469  xmm3 += a2 * b1;
3470  xmm4 += a2 * b2;
3471  xmm5 += a3 * b3;
3472  xmm6 += a3 * b4;
3473  xmm7 += a4 * b3;
3474  xmm8 += a4 * b4;
3475  }
3476 
3477  for( ; k<kend; ++k ) {
3478  const SIMDType a1( set( A(i ,k) ) );
3479  const SIMDType a2( set( A(i+1UL,k) ) );
3480  const SIMDType b1( B.load(k,j ) );
3481  const SIMDType b2( B.load(k,j+SIMDSIZE) );
3482  xmm1 += a1 * b1;
3483  xmm2 += a1 * b2;
3484  xmm3 += a2 * b1;
3485  xmm4 += a2 * b2;
3486  }
3487 
3488  (~C).store( i , j , xmm1+xmm5 );
3489  (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
3490  (~C).store( i+1UL, j , xmm3+xmm7 );
3491  (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
3492  }
3493 
3494  if( i < iend )
3495  {
3496  const size_t kbegin( ( IsUpper<MT4>::value )
3497  ?( ( IsLower<MT5>::value )
3498  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3499  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3500  :( IsLower<MT5>::value ? j : 0UL ) );
3501  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
3502 
3503  SIMDType xmm1( (~C).load(i,j ) );
3504  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
3505  SIMDType xmm3, xmm4;
3506  size_t k( kbegin );
3507 
3508  for( ; (k+2UL) <= kend; k+=2UL ) {
3509  const SIMDType a1( set( A(i,k ) ) );
3510  const SIMDType a2( set( A(i,k+1UL) ) );
3511  xmm1 += a1 * B.load(k ,j );
3512  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
3513  xmm3 += a2 * B.load(k+1UL,j );
3514  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
3515  }
3516 
3517  for( ; k<kend; ++k ) {
3518  const SIMDType a1( set( A(i,k) ) );
3519  xmm1 += a1 * B.load(k,j );
3520  xmm2 += a1 * B.load(k,j+SIMDSIZE);
3521  }
3522 
3523  (~C).store( i, j , xmm1+xmm3 );
3524  (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
3525  }
3526  }
3527 
3528  for( ; j<jpos; j+=SIMDSIZE )
3529  {
3530  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
3531  size_t i( LOW ? j : 0UL );
3532 
3533  for( ; (i+4UL) <= iend; i+=4UL )
3534  {
3535  const size_t kbegin( ( IsUpper<MT4>::value )
3536  ?( ( IsLower<MT5>::value )
3537  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3538  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3539  :( IsLower<MT5>::value ? j : 0UL ) );
3540  const size_t kend( ( IsLower<MT4>::value )
3541  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
3542  :( K ) );
3543 
3544  SIMDType xmm1( (~C).load(i ,j) );
3545  SIMDType xmm2( (~C).load(i+1UL,j) );
3546  SIMDType xmm3( (~C).load(i+2UL,j) );
3547  SIMDType xmm4( (~C).load(i+3UL,j) );
3548  SIMDType xmm5, xmm6, xmm7, xmm8;
3549  size_t k( kbegin );
3550 
3551  for( ; (k+2UL) <= kend; k+=2UL ) {
3552  const SIMDType b1( B.load(k ,j) );
3553  const SIMDType b2( B.load(k+1UL,j) );
3554  xmm1 += set( A(i ,k ) ) * b1;
3555  xmm2 += set( A(i+1UL,k ) ) * b1;
3556  xmm3 += set( A(i+2UL,k ) ) * b1;
3557  xmm4 += set( A(i+3UL,k ) ) * b1;
3558  xmm5 += set( A(i ,k+1UL) ) * b2;
3559  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
3560  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
3561  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
3562  }
3563 
3564  for( ; k<kend; ++k ) {
3565  const SIMDType b1( B.load(k,j) );
3566  xmm1 += set( A(i ,k) ) * b1;
3567  xmm2 += set( A(i+1UL,k) ) * b1;
3568  xmm3 += set( A(i+2UL,k) ) * b1;
3569  xmm4 += set( A(i+3UL,k) ) * b1;
3570  }
3571 
3572  (~C).store( i , j, xmm1+xmm5 );
3573  (~C).store( i+1UL, j, xmm2+xmm6 );
3574  (~C).store( i+2UL, j, xmm3+xmm7 );
3575  (~C).store( i+3UL, j, xmm4+xmm8 );
3576  }
3577 
3578  for( ; (i+3UL) <= iend; i+=3UL )
3579  {
3580  const size_t kbegin( ( IsUpper<MT4>::value )
3581  ?( ( IsLower<MT5>::value )
3582  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3583  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3584  :( IsLower<MT5>::value ? j : 0UL ) );
3585  const size_t kend( ( IsLower<MT4>::value )
3586  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
3587  :( K ) );
3588 
3589  SIMDType xmm1( (~C).load(i ,j) );
3590  SIMDType xmm2( (~C).load(i+1UL,j) );
3591  SIMDType xmm3( (~C).load(i+2UL,j) );
3592  SIMDType xmm4, xmm5, xmm6;
3593  size_t k( kbegin );
3594 
3595  for( ; (k+2UL) <= kend; k+=2UL ) {
3596  const SIMDType b1( B.load(k ,j) );
3597  const SIMDType b2( B.load(k+1UL,j) );
3598  xmm1 += set( A(i ,k ) ) * b1;
3599  xmm2 += set( A(i+1UL,k ) ) * b1;
3600  xmm3 += set( A(i+2UL,k ) ) * b1;
3601  xmm4 += set( A(i ,k+1UL) ) * b2;
3602  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
3603  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
3604  }
3605 
3606  for( ; k<kend; ++k ) {
3607  const SIMDType b1( B.load(k,j) );
3608  xmm1 += set( A(i ,k) ) * b1;
3609  xmm2 += set( A(i+1UL,k) ) * b1;
3610  xmm3 += set( A(i+2UL,k) ) * b1;
3611  }
3612 
3613  (~C).store( i , j, xmm1+xmm4 );
3614  (~C).store( i+1UL, j, xmm2+xmm5 );
3615  (~C).store( i+2UL, j, xmm3+xmm6 );
3616  }
3617 
3618  for( ; (i+2UL) <= iend; i+=2UL )
3619  {
3620  const size_t kbegin( ( IsUpper<MT4>::value )
3621  ?( ( IsLower<MT5>::value )
3622  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3623  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3624  :( IsLower<MT5>::value ? j : 0UL ) );
3625  const size_t kend( ( IsLower<MT4>::value )
3626  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3627  :( K ) );
3628 
3629  SIMDType xmm1( (~C).load(i ,j) );
3630  SIMDType xmm2( (~C).load(i+1UL,j) );
3631  SIMDType xmm3, xmm4;
3632  size_t k( kbegin );
3633 
3634  for( ; (k+2UL) <= kend; k+=2UL ) {
3635  const SIMDType b1( B.load(k ,j) );
3636  const SIMDType b2( B.load(k+1UL,j) );
3637  xmm1 += set( A(i ,k ) ) * b1;
3638  xmm2 += set( A(i+1UL,k ) ) * b1;
3639  xmm3 += set( A(i ,k+1UL) ) * b2;
3640  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
3641  }
3642 
3643  for( ; k<kend; ++k ) {
3644  const SIMDType b1( B.load(k,j) );
3645  xmm1 += set( A(i ,k) ) * b1;
3646  xmm2 += set( A(i+1UL,k) ) * b1;
3647  }
3648 
3649  (~C).store( i , j, xmm1+xmm3 );
3650  (~C).store( i+1UL, j, xmm2+xmm4 );
3651  }
3652 
3653  if( i < iend )
3654  {
3655  const size_t kbegin( ( IsUpper<MT4>::value )
3656  ?( ( IsLower<MT5>::value )
3657  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3658  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3659  :( IsLower<MT5>::value ? j : 0UL ) );
3660 
3661  SIMDType xmm1( (~C).load(i,j) );
3662  SIMDType xmm2;
3663  size_t k( kbegin );
3664 
3665  for( ; (k+2UL) <= K; k+=2UL ) {
3666  xmm1 += set( A(i,k ) ) * B.load(k ,j);
3667  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
3668  }
3669 
3670  for( ; k<K; ++k ) {
3671  xmm1 += set( A(i,k) ) * B.load(k,j);
3672  }
3673 
3674  (~C).store( i, j, xmm1+xmm2 );
3675  }
3676  }
3677 
3678  for( ; remainder && j<N; ++j )
3679  {
3680  const size_t iend( UPP ? j+1UL : M );
3681  size_t i( LOW ? j : 0UL );
3682 
3683  for( ; (i+2UL) <= iend; i+=2UL )
3684  {
3685  const size_t kbegin( ( IsUpper<MT4>::value )
3686  ?( ( IsLower<MT5>::value )
3687  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3688  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3689  :( IsLower<MT5>::value ? j : 0UL ) );
3690  const size_t kend( ( IsLower<MT4>::value )
3691  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3692  :( K ) );
3693 
3694  ElementType value1( (~C)(i ,j) );
3695  ElementType value2( (~C)(i+1UL,j) );;
3696 
3697  for( size_t k=kbegin; k<kend; ++k ) {
3698  value1 += A(i ,k) * B(k,j);
3699  value2 += A(i+1UL,k) * B(k,j);
3700  }
3701 
3702  (~C)(i ,j) = value1;
3703  (~C)(i+1UL,j) = value2;
3704  }
3705 
3706  if( i < iend )
3707  {
3708  const size_t kbegin( ( IsUpper<MT4>::value )
3709  ?( ( IsLower<MT5>::value )
3710  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3711  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3712  :( IsLower<MT5>::value ? j : 0UL ) );
3713 
3714  ElementType value( (~C)(i,j) );
3715 
3716  for( size_t k=kbegin; k<K; ++k ) {
3717  value += A(i,k) * B(k,j);
3718  }
3719 
3720  (~C)(i,j) = value;
3721  }
3722  }
3723  }
3725  //**********************************************************************************************
3726 
3727  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
3742  template< typename MT3 // Type of the left-hand side target matrix
3743  , typename MT4 // Type of the left-hand side matrix operand
3744  , typename MT5 > // Type of the right-hand side matrix operand
3746  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
3747  {
3748  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
3749 
3750  const size_t M( A.rows() );
3751  const size_t N( B.columns() );
3752  const size_t K( A.columns() );
3753 
3754  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
3755 
3756  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
3757  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
3758 
3759  size_t i( 0UL );
3760 
3762  {
3763  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3764  for( size_t j=0UL; j<N; ++j )
3765  {
3766  const size_t kbegin( ( IsLower<MT5>::value )
3767  ?( ( IsUpper<MT4>::value )
3768  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3769  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3770  :( IsUpper<MT4>::value ? i : 0UL ) );
3771  const size_t kend( ( IsUpper<MT5>::value )
3772  ?( ( IsLower<MT4>::value )
3773  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3774  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3775  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
3776 
3777  SIMDType xmm1( (~C).load(i ,j) );
3778  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3779  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3780  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3781  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3782  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3783  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3784  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3785 
3786  for( size_t k=kbegin; k<kend; ++k ) {
3787  const SIMDType b1( set( B(k,j) ) );
3788  xmm1 += A.load(i ,k) * b1;
3789  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3790  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3791  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3792  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3793  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
3794  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
3795  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
3796  }
3797 
3798  (~C).store( i , j, xmm1 );
3799  (~C).store( i+SIMDSIZE , j, xmm2 );
3800  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3801  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3802  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3803  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3804  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3805  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
3806  }
3807  }
3808  }
3809 
3810  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
3811  {
3812  size_t j( 0UL );
3813 
3814  for( ; (j+2UL) <= N; j+=2UL )
3815  {
3816  const size_t kbegin( ( IsLower<MT5>::value )
3817  ?( ( IsUpper<MT4>::value )
3818  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3819  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3820  :( IsUpper<MT4>::value ? i : 0UL ) );
3821  const size_t kend( ( IsUpper<MT5>::value )
3822  ?( ( IsLower<MT4>::value )
3823  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3824  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3825  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
3826 
3827  SIMDType xmm1 ( (~C).load(i ,j ) );
3828  SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
3829  SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
3830  SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
3831  SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
3832  SIMDType xmm6 ( (~C).load(i ,j+1UL) );
3833  SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
3834  SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3835  SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3836  SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
3837 
3838  for( size_t k=kbegin; k<kend; ++k ) {
3839  const SIMDType a1( A.load(i ,k) );
3840  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3841  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3842  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3843  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
3844  const SIMDType b1( set( B(k,j ) ) );
3845  const SIMDType b2( set( B(k,j+1UL) ) );
3846  xmm1 += a1 * b1;
3847  xmm2 += a2 * b1;
3848  xmm3 += a3 * b1;
3849  xmm4 += a4 * b1;
3850  xmm5 += a5 * b1;
3851  xmm6 += a1 * b2;
3852  xmm7 += a2 * b2;
3853  xmm8 += a3 * b2;
3854  xmm9 += a4 * b2;
3855  xmm10 += a5 * b2;
3856  }
3857 
3858  (~C).store( i , j , xmm1 );
3859  (~C).store( i+SIMDSIZE , j , xmm2 );
3860  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3861  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3862  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
3863  (~C).store( i , j+1UL, xmm6 );
3864  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
3865  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
3866  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
3867  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
3868  }
3869 
3870  if( j < N )
3871  {
3872  const size_t kbegin( ( IsLower<MT5>::value )
3873  ?( ( IsUpper<MT4>::value )
3874  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3875  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3876  :( IsUpper<MT4>::value ? i : 0UL ) );
3877  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
3878 
3879  SIMDType xmm1( (~C).load(i ,j) );
3880  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3881  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3882  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3883  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3884 
3885  for( size_t k=kbegin; k<kend; ++k ) {
3886  const SIMDType b1( set( B(k,j) ) );
3887  xmm1 += A.load(i ,k) * b1;
3888  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3889  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3890  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3891  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
3892  }
3893 
3894  (~C).store( i , j, xmm1 );
3895  (~C).store( i+SIMDSIZE , j, xmm2 );
3896  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3897  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3898  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3899  }
3900  }
3901 
3902  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3903  {
3904  size_t j( 0UL );
3905 
3906  for( ; (j+2UL) <= N; j+=2UL )
3907  {
3908  const size_t kbegin( ( IsLower<MT5>::value )
3909  ?( ( IsUpper<MT4>::value )
3910  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3911  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3912  :( IsUpper<MT4>::value ? i : 0UL ) );
3913  const size_t kend( ( IsUpper<MT5>::value )
3914  ?( ( IsLower<MT4>::value )
3915  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3916  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3917  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
3918 
3919  SIMDType xmm1( (~C).load(i ,j ) );
3920  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3921  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3922  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3923  SIMDType xmm5( (~C).load(i ,j+1UL) );
3924  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3925  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3926  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3927 
3928  for( size_t k=kbegin; k<kend; ++k ) {
3929  const SIMDType a1( A.load(i ,k) );
3930  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3931  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3932  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3933  const SIMDType b1( set( B(k,j ) ) );
3934  const SIMDType b2( set( B(k,j+1UL) ) );
3935  xmm1 += a1 * b1;
3936  xmm2 += a2 * b1;
3937  xmm3 += a3 * b1;
3938  xmm4 += a4 * b1;
3939  xmm5 += a1 * b2;
3940  xmm6 += a2 * b2;
3941  xmm7 += a3 * b2;
3942  xmm8 += a4 * b2;
3943  }
3944 
3945  (~C).store( i , j , xmm1 );
3946  (~C).store( i+SIMDSIZE , j , xmm2 );
3947  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3948  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3949  (~C).store( i , j+1UL, xmm5 );
3950  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3951  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3952  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
3953  }
3954 
3955  if( j < N )
3956  {
3957  const size_t kbegin( ( IsLower<MT5>::value )
3958  ?( ( IsUpper<MT4>::value )
3959  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3960  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3961  :( IsUpper<MT4>::value ? i : 0UL ) );
3962  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
3963 
3964  SIMDType xmm1( (~C).load(i ,j) );
3965  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3966  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3967  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3968 
3969  for( size_t k=kbegin; k<kend; ++k ) {
3970  const SIMDType b1( set( B(k,j) ) );
3971  xmm1 += A.load(i ,k) * b1;
3972  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
3973  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
3974  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
3975  }
3976 
3977  (~C).store( i , j, xmm1 );
3978  (~C).store( i+SIMDSIZE , j, xmm2 );
3979  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3980  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3981  }
3982  }
3983 
3984  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3985  {
3986  size_t j( 0UL );
3987 
3988  for( ; (j+2UL) <= N; j+=2UL )
3989  {
3990  const size_t kbegin( ( IsLower<MT5>::value )
3991  ?( ( IsUpper<MT4>::value )
3992  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3993  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3994  :( IsUpper<MT4>::value ? i : 0UL ) );
3995  const size_t kend( ( IsUpper<MT5>::value )
3996  ?( ( IsLower<MT4>::value )
3997  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3998  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3999  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
4000 
4001  SIMDType xmm1( (~C).load(i ,j ) );
4002  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
4003  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
4004  SIMDType xmm4( (~C).load(i ,j+1UL) );
4005  SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
4006  SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4007 
4008  for( size_t k=kbegin; k<kend; ++k ) {
4009  const SIMDType a1( A.load(i ,k) );
4010  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4011  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4012  const SIMDType b1( set( B(k,j ) ) );
4013  const SIMDType b2( set( B(k,j+1UL) ) );
4014  xmm1 += a1 * b1;
4015  xmm2 += a2 * b1;
4016  xmm3 += a3 * b1;
4017  xmm4 += a1 * b2;
4018  xmm5 += a2 * b2;
4019  xmm6 += a3 * b2;
4020  }
4021 
4022  (~C).store( i , j , xmm1 );
4023  (~C).store( i+SIMDSIZE , j , xmm2 );
4024  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4025  (~C).store( i , j+1UL, xmm4 );
4026  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
4027  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
4028  }
4029 
4030  if( j < N )
4031  {
4032  const size_t kbegin( ( IsLower<MT5>::value )
4033  ?( ( IsUpper<MT4>::value )
4034  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4035  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4036  :( IsUpper<MT4>::value ? i : 0UL ) );
4037  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
4038 
4039  SIMDType xmm1( (~C).load(i ,j) );
4040  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4041  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4042 
4043  for( size_t k=kbegin; k<kend; ++k ) {
4044  const SIMDType b1( set( B(k,j) ) );
4045  xmm1 += A.load(i ,k) * b1;
4046  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
4047  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
4048  }
4049 
4050  (~C).store( i , j, xmm1 );
4051  (~C).store( i+SIMDSIZE , j, xmm2 );
4052  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4053  }
4054  }
4055 
4056  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4057  {
4058  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
4059  size_t j( UPP ? i : 0UL );
4060 
4061  for( ; (j+4UL) <= jend; j+=4UL )
4062  {
4063  const size_t kbegin( ( IsLower<MT5>::value )
4064  ?( ( IsUpper<MT4>::value )
4065  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4066  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4067  :( IsUpper<MT4>::value ? i : 0UL ) );
4068  const size_t kend( ( IsUpper<MT5>::value )
4069  ?( ( IsLower<MT4>::value )
4070  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
4071  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
4072  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
4073 
4074  SIMDType xmm1( (~C).load(i ,j ) );
4075  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4076  SIMDType xmm3( (~C).load(i ,j+1UL) );
4077  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4078  SIMDType xmm5( (~C).load(i ,j+2UL) );
4079  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
4080  SIMDType xmm7( (~C).load(i ,j+3UL) );
4081  SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
4082 
4083  for( size_t k=kbegin; k<kend; ++k ) {
4084  const SIMDType a1( A.load(i ,k) );
4085  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4086  const SIMDType b1( set( B(k,j ) ) );
4087  const SIMDType b2( set( B(k,j+1UL) ) );
4088  const SIMDType b3( set( B(k,j+2UL) ) );
4089  const SIMDType b4( set( B(k,j+3UL) ) );
4090  xmm1 += a1 * b1;
4091  xmm2 += a2 * b1;
4092  xmm3 += a1 * b2;
4093  xmm4 += a2 * b2;
4094  xmm5 += a1 * b3;
4095  xmm6 += a2 * b3;
4096  xmm7 += a1 * b4;
4097  xmm8 += a2 * b4;
4098  }
4099 
4100  (~C).store( i , j , xmm1 );
4101  (~C).store( i+SIMDSIZE, j , xmm2 );
4102  (~C).store( i , j+1UL, xmm3 );
4103  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
4104  (~C).store( i , j+2UL, xmm5 );
4105  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
4106  (~C).store( i , j+3UL, xmm7 );
4107  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
4108  }
4109 
4110  for( ; (j+3UL) <= jend; j+=3UL )
4111  {
4112  const size_t kbegin( ( IsLower<MT5>::value )
4113  ?( ( IsUpper<MT4>::value )
4114  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4115  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4116  :( IsUpper<MT4>::value ? i : 0UL ) );
4117  const size_t kend( ( IsUpper<MT5>::value )
4118  ?( ( IsLower<MT4>::value )
4119  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
4120  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
4121  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
4122 
4123  SIMDType xmm1( (~C).load(i ,j ) );
4124  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4125  SIMDType xmm3( (~C).load(i ,j+1UL) );
4126  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4127  SIMDType xmm5( (~C).load(i ,j+2UL) );
4128  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
4129 
4130  for( size_t k=kbegin; k<kend; ++k ) {
4131  const SIMDType a1( A.load(i ,k) );
4132  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4133  const SIMDType b1( set( B(k,j ) ) );
4134  const SIMDType b2( set( B(k,j+1UL) ) );
4135  const SIMDType b3( set( B(k,j+2UL) ) );
4136  xmm1 += a1 * b1;
4137  xmm2 += a2 * b1;
4138  xmm3 += a1 * b2;
4139  xmm4 += a2 * b2;
4140  xmm5 += a1 * b3;
4141  xmm6 += a2 * b3;
4142  }
4143 
4144  (~C).store( i , j , xmm1 );
4145  (~C).store( i+SIMDSIZE, j , xmm2 );
4146  (~C).store( i , j+1UL, xmm3 );
4147  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
4148  (~C).store( i , j+2UL, xmm5 );
4149  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
4150  }
4151 
4152  for( ; (j+2UL) <= jend; j+=2UL )
4153  {
4154  const size_t kbegin( ( IsLower<MT5>::value )
4155  ?( ( IsUpper<MT4>::value )
4156  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4157  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4158  :( IsUpper<MT4>::value ? i : 0UL ) );
4159  const size_t kend( ( IsUpper<MT5>::value )
4160  ?( ( IsLower<MT4>::value )
4161  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4162  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4163  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
4164 
4165  SIMDType xmm1( (~C).load(i ,j ) );
4166  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4167  SIMDType xmm3( (~C).load(i ,j+1UL) );
4168  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4169  SIMDType xmm5, xmm6, xmm7, xmm8;
4170  size_t k( kbegin );
4171 
4172  for( ; (k+2UL) < kend; k+=2UL ) {
4173  const SIMDType a1( A.load(i ,k ) );
4174  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
4175  const SIMDType a3( A.load(i ,k+1UL) );
4176  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
4177  const SIMDType b1( set( B(k ,j ) ) );
4178  const SIMDType b2( set( B(k ,j+1UL) ) );
4179  const SIMDType b3( set( B(k+1UL,j ) ) );
4180  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
4181  xmm1 += a1 * b1;
4182  xmm2 += a2 * b1;
4183  xmm3 += a1 * b2;
4184  xmm4 += a2 * b2;
4185  xmm5 += a3 * b3;
4186  xmm6 += a4 * b3;
4187  xmm7 += a3 * b4;
4188  xmm8 += a4 * b4;
4189  }
4190 
4191  for( ; k<kend; ++k ) {
4192  const SIMDType a1( A.load(i ,k) );
4193  const SIMDType a2( A.load(i+SIMDSIZE,k) );
4194  const SIMDType b1( set( B(k,j ) ) );
4195  const SIMDType b2( set( B(k,j+1UL) ) );
4196  xmm1 += a1 * b1;
4197  xmm2 += a2 * b1;
4198  xmm3 += a1 * b2;
4199  xmm4 += a2 * b2;
4200  }
4201 
4202  (~C).store( i , j , xmm1+xmm5 );
4203  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
4204  (~C).store( i , j+1UL, xmm3+xmm7 );
4205  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
4206  }
4207 
4208  if( j < jend )
4209  {
4210  const size_t kbegin( ( IsLower<MT5>::value )
4211  ?( ( IsUpper<MT4>::value )
4212  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4213  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4214  :( IsUpper<MT4>::value ? i : 0UL ) );
4215  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
4216 
4217  SIMDType xmm1( (~C).load(i ,j) );
4218  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
4219  SIMDType xmm3, xmm4;
4220  size_t k( kbegin );
4221 
4222  for( ; (k+2UL) <= kend; k+=2UL ) {
4223  const SIMDType b1( set( B(k ,j) ) );
4224  const SIMDType b2( set( B(k+1UL,j) ) );
4225  xmm1 += A.load(i ,k ) * b1;
4226  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
4227  xmm3 += A.load(i ,k+1UL) * b2;
4228  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
4229  }
4230 
4231  for( ; k<kend; ++k ) {
4232  const SIMDType b1( set( B(k,j) ) );
4233  xmm1 += A.load(i ,k) * b1;
4234  xmm2 += A.load(i+SIMDSIZE,k) * b1;
4235  }
4236 
4237  (~C).store( i , j, xmm1+xmm3 );
4238  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
4239  }
4240  }
4241 
4242  for( ; i<ipos; i+=SIMDSIZE )
4243  {
4244  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
4245  size_t j( UPP ? i : 0UL );
4246 
4247  for( ; (j+4UL) <= jend; j+=4UL )
4248  {
4249  const size_t kbegin( ( IsLower<MT5>::value )
4250  ?( ( IsUpper<MT4>::value )
4251  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4252  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4253  :( IsUpper<MT4>::value ? i : 0UL ) );
4254  const size_t kend( ( IsUpper<MT5>::value )
4255  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
4256  :( K ) );
4257 
4258  SIMDType xmm1( (~C).load(i,j ) );
4259  SIMDType xmm2( (~C).load(i,j+1UL) );
4260  SIMDType xmm3( (~C).load(i,j+2UL) );
4261  SIMDType xmm4( (~C).load(i,j+3UL) );
4262  SIMDType xmm5, xmm6, xmm7, xmm8;
4263  size_t k( kbegin );
4264 
4265  for( ; (k+2UL) <= kend; k+=2UL ) {
4266  const SIMDType a1( A.load(i,k ) );
4267  const SIMDType a2( A.load(i,k+1UL) );
4268  xmm1 += a1 * set( B(k ,j ) );
4269  xmm2 += a1 * set( B(k ,j+1UL) );
4270  xmm3 += a1 * set( B(k ,j+2UL) );
4271  xmm4 += a1 * set( B(k ,j+3UL) );
4272  xmm5 += a2 * set( B(k+1UL,j ) );
4273  xmm6 += a2 * set( B(k+1UL,j+1UL) );
4274  xmm7 += a2 * set( B(k+1UL,j+2UL) );
4275  xmm8 += a2 * set( B(k+1UL,j+3UL) );
4276  }
4277 
4278  for( ; k<kend; ++k ) {
4279  const SIMDType a1( A.load(i,k) );
4280  xmm1 += a1 * set( B(k,j ) );
4281  xmm2 += a1 * set( B(k,j+1UL) );
4282  xmm3 += a1 * set( B(k,j+2UL) );
4283  xmm4 += a1 * set( B(k,j+3UL) );
4284  }
4285 
4286  (~C).store( i, j , xmm1+xmm5 );
4287  (~C).store( i, j+1UL, xmm2+xmm6 );
4288  (~C).store( i, j+2UL, xmm3+xmm7 );
4289  (~C).store( i, j+3UL, xmm4+xmm8 );
4290  }
4291 
4292  for( ; (j+3UL) <= jend; j+=3UL )
4293  {
4294  const size_t kbegin( ( IsLower<MT5>::value )
4295  ?( ( IsUpper<MT4>::value )
4296  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4297  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4298  :( IsUpper<MT4>::value ? i : 0UL ) );
4299  const size_t kend( ( IsUpper<MT5>::value )
4300  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
4301  :( K ) );
4302 
4303  SIMDType xmm1( (~C).load(i,j ) );
4304  SIMDType xmm2( (~C).load(i,j+1UL) );
4305  SIMDType xmm3( (~C).load(i,j+2UL) );
4306  SIMDType xmm4, xmm5, xmm6;
4307  size_t k( kbegin );
4308 
4309  for( ; (k+2UL) <= kend; k+=2UL ) {
4310  const SIMDType a1( A.load(i,k ) );
4311  const SIMDType a2( A.load(i,k+1UL) );
4312  xmm1 += a1 * set( B(k ,j ) );
4313  xmm2 += a1 * set( B(k ,j+1UL) );
4314  xmm3 += a1 * set( B(k ,j+2UL) );
4315  xmm4 += a2 * set( B(k+1UL,j ) );
4316  xmm5 += a2 * set( B(k+1UL,j+1UL) );
4317  xmm6 += a2 * set( B(k+1UL,j+2UL) );
4318  }
4319 
4320  for( ; k<kend; ++k ) {
4321  const SIMDType a1( A.load(i,k) );
4322  xmm1 += a1 * set( B(k,j ) );
4323  xmm2 += a1 * set( B(k,j+1UL) );
4324  xmm3 += a1 * set( B(k,j+2UL) );
4325  }
4326 
4327  (~C).store( i, j , xmm1+xmm4 );
4328  (~C).store( i, j+1UL, xmm2+xmm5 );
4329  (~C).store( i, j+2UL, xmm3+xmm6 );
4330  }
4331 
4332  for( ; (j+2UL) <= jend; j+=2UL )
4333  {
4334  const size_t kbegin( ( IsLower<MT5>::value )
4335  ?( ( IsUpper<MT4>::value )
4336  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4337  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4338  :( IsUpper<MT4>::value ? i : 0UL ) );
4339  const size_t kend( ( IsUpper<MT5>::value )
4340  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4341  :( K ) );
4342 
4343  SIMDType xmm1( (~C).load(i,j ) );
4344  SIMDType xmm2( (~C).load(i,j+1UL) );
4345  SIMDType xmm3, xmm4;
4346  size_t k( kbegin );
4347 
4348  for( ; (k+2UL) <= kend; k+=2UL ) {
4349  const SIMDType a1( A.load(i,k ) );
4350  const SIMDType a2( A.load(i,k+1UL) );
4351  xmm1 += a1 * set( B(k ,j ) );
4352  xmm2 += a1 * set( B(k ,j+1UL) );
4353  xmm3 += a2 * set( B(k+1UL,j ) );
4354  xmm4 += a2 * set( B(k+1UL,j+1UL) );
4355  }
4356 
4357  for( ; k<kend; ++k ) {
4358  const SIMDType a1( A.load(i,k) );
4359  xmm1 += a1 * set( B(k,j ) );
4360  xmm2 += a1 * set( B(k,j+1UL) );
4361  }
4362 
4363  (~C).store( i, j , xmm1+xmm3 );
4364  (~C).store( i, j+1UL, xmm2+xmm4 );
4365  }
4366 
4367  if( j < jend )
4368  {
4369  const size_t kbegin( ( IsLower<MT5>::value )
4370  ?( ( IsUpper<MT4>::value )
4371  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4372  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4373  :( IsUpper<MT4>::value ? i : 0UL ) );
4374 
4375  SIMDType xmm1( (~C).load(i,j) );
4376  SIMDType xmm2;
4377  size_t k( kbegin );
4378 
4379  for( ; (k+2UL) <= K; k+=2UL ) {
4380  xmm1 += A.load(i,k ) * set( B(k ,j) );
4381  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
4382  }
4383 
4384  for( ; k<K; ++k ) {
4385  xmm1 += A.load(i,k) * set( B(k,j) );
4386  }
4387 
4388  (~C).store( i, j, xmm1+xmm2 );
4389  }
4390  }
4391 
4392  for( ; remainder && i<M; ++i )
4393  {
4394  const size_t jend( LOW ? i+1UL : N );
4395  size_t j( UPP ? i : 0UL );
4396 
4397  for( ; (j+2UL) <= jend; j+=2UL )
4398  {
4399  const size_t kbegin( ( IsLower<MT5>::value )
4400  ?( ( IsUpper<MT4>::value )
4401  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4402  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4403  :( IsUpper<MT4>::value ? i : 0UL ) );
4404  const size_t kend( ( IsUpper<MT5>::value )
4405  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4406  :( K ) );
4407 
4408  ElementType value1( (~C)(i,j ) );
4409  ElementType value2( (~C)(i,j+1UL) );
4410 
4411  for( size_t k=kbegin; k<kend; ++k ) {
4412  value1 += A(i,k) * B(k,j );
4413  value2 += A(i,k) * B(k,j+1UL);
4414  }
4415 
4416  (~C)(i,j ) = value1;
4417  (~C)(i,j+1UL) = value2;
4418  }
4419 
4420  if( j < jend )
4421  {
4422  const size_t kbegin( ( IsLower<MT5>::value )
4423  ?( ( IsUpper<MT4>::value )
4424  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4425  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4426  :( IsUpper<MT4>::value ? i : 0UL ) );
4427 
4428  ElementType value( (~C)(i,j) );
4429 
4430  for( size_t k=kbegin; k<K; ++k ) {
4431  value += A(i,k) * B(k,j);
4432  }
4433 
4434  (~C)(i,j) = value;
4435  }
4436  }
4437  }
4439  //**********************************************************************************************
4440 
4441  //**Default addition assignment to dense matrices (large matrices)******************************
4455  template< typename MT3 // Type of the left-hand side target matrix
4456  , typename MT4 // Type of the left-hand side matrix operand
4457  , typename MT5 > // Type of the right-hand side matrix operand
4459  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4460  {
4461  selectDefaultAddAssignKernel( C, A, B );
4462  }
4464  //**********************************************************************************************
4465 
4466  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
4481  template< typename MT3 // Type of the left-hand side target matrix
4482  , typename MT4 // Type of the left-hand side matrix operand
4483  , typename MT5 > // Type of the right-hand side matrix operand
4485  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4486  {
4487  if( LOW )
4488  lmmm( C, A, B, ElementType(1), ElementType(1) );
4489  else if( UPP )
4490  ummm( C, A, B, ElementType(1), ElementType(1) );
4491  else
4492  mmm( C, A, B, ElementType(1), ElementType(1) );
4493  }
4495  //**********************************************************************************************
4496 
4497  //**BLAS-based addition assignment to dense matrices (default)**********************************
4511  template< typename MT3 // Type of the left-hand side target matrix
4512  , typename MT4 // Type of the left-hand side matrix operand
4513  , typename MT5 > // Type of the right-hand side matrix operand
4515  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4516  {
4517  selectLargeAddAssignKernel( C, A, B );
4518  }
4520  //**********************************************************************************************
4521 
4522  //**BLAS-based addition assignment to dense matrices********************************************
4523 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
4524 
4537  template< typename MT3 // Type of the left-hand side target matrix
4538  , typename MT4 // Type of the left-hand side matrix operand
4539  , typename MT5 > // Type of the right-hand side matrix operand
4541  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B )
4542  {
4543  using ET = ElementType_<MT3>;
4544 
4545  if( IsTriangular<MT4>::value ) {
4546  ResultType_<MT3> tmp( serial( B ) );
4547  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
4548  addAssign( C, tmp );
4549  }
4550  else if( IsTriangular<MT5>::value ) {
4551  ResultType_<MT3> tmp( serial( A ) );
4552  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
4553  addAssign( C, tmp );
4554  }
4555  else {
4556  gemm( C, A, B, ET(1), ET(1) );
4557  }
4558  }
4560 #endif
4561  //**********************************************************************************************
4562 
4563  //**Addition assignment to sparse matrices******************************************************
4564  // No special implementation for the addition assignment to sparse matrices.
4565  //**********************************************************************************************
4566 
4567  //**Subtraction assignment to dense matrices****************************************************
4580  template< typename MT // Type of the target dense matrix
4581  , bool SO > // Storage order of the target dense matrix
4582  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
4583  {
4585 
4586  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
4587  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
4588 
4589  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4590  return;
4591  }
4592 
4593  LT A( serial( rhs.lhs_ ) ); // Evaluation of the left-hand side dense matrix operand
4594  RT B( serial( rhs.rhs_ ) ); // Evaluation of the right-hand side dense matrix operand
4595 
4596  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
4597  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
4598  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
4599  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
4600  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
4601  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
4602 
4603  TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
4604  }
4606  //**********************************************************************************************
4607 
4608  //**Subtraction assignment to dense matrices (kernel selection)*********************************
4619  template< typename MT3 // Type of the left-hand side target matrix
4620  , typename MT4 // Type of the left-hand side matrix operand
4621  , typename MT5 > // Type of the right-hand side matrix operand
4622  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4623  {
4625  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
4626  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
4627  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
4628  selectSmallSubAssignKernel( C, A, B );
4629  else
4630  selectBlasSubAssignKernel( C, A, B );
4631  }
4633  //**********************************************************************************************
4634 
4635  //**Default subtraction assignment to row-major dense matrices (general/general)****************
4649  template< typename MT3 // Type of the left-hand side target matrix
4650  , typename MT4 // Type of the left-hand side matrix operand
4651  , typename MT5 > // Type of the right-hand side matrix operand
4652  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4653  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4654  {
4655  const size_t M( A.rows() );
4656  const size_t N( B.columns() );
4657  const size_t K( A.columns() );
4658 
4659  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4660 
4661  for( size_t i=0UL; i<M; ++i )
4662  {
4663  const size_t kbegin( ( IsUpper<MT4>::value )
4664  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4665  :( 0UL ) );
4666  const size_t kend( ( IsLower<MT4>::value )
4667  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4668  :( K ) );
4669  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4670 
4671  for( size_t k=kbegin; k<kend; ++k )
4672  {
4673  const size_t jbegin( ( IsUpper<MT5>::value )
4675  ?( UPP ? max(i,k+1UL) : k+1UL )
4676  :( UPP ? max(i,k) : k ) )
4677  :( UPP ? i : 0UL ) );
4678  const size_t jend( ( IsLower<MT5>::value )
4680  ?( LOW ? min(i+1UL,k) : k )
4681  :( LOW ? min(i,k)+1UL : k+1UL ) )
4682  :( LOW ? i+1UL : N ) );
4683 
4684  if( ( LOW || UPP ) && ( jbegin >= jend ) ) continue;
4685  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4686 
4687  const size_t jnum( jend - jbegin );
4688  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4689 
4690  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4691  (~C)(i,j ) -= A(i,k) * B(k,j );
4692  (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
4693  }
4694  if( jpos < jend ) {
4695  (~C)(i,jpos) -= A(i,k) * B(k,jpos);
4696  }
4697  }
4698  }
4699  }
4701  //**********************************************************************************************
4702 
4703  //**Default subtraction assignment to column-major dense matrices (general/general)*************
4717  template< typename MT3 // Type of the left-hand side target matrix
4718  , typename MT4 // Type of the left-hand side matrix operand
4719  , typename MT5 > // Type of the right-hand side matrix operand
4720  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4721  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4722  {
4723  const size_t M( A.rows() );
4724  const size_t N( B.columns() );
4725  const size_t K( A.columns() );
4726 
4727  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
4728 
4729  for( size_t j=0UL; j<N; ++j )
4730  {
4731  const size_t kbegin( ( IsLower<MT5>::value )
4732  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4733  :( 0UL ) );
4734  const size_t kend( ( IsUpper<MT5>::value )
4735  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4736  :( K ) );
4737  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
4738 
4739  for( size_t k=kbegin; k<kend; ++k )
4740  {
4741  const size_t ibegin( ( IsLower<MT4>::value )
4743  ?( LOW ? max(j,k+1UL) : k+1UL )
4744  :( LOW ? max(j,k) : k ) )
4745  :( LOW ? j : 0UL ) );
4746  const size_t iend( ( IsUpper<MT4>::value )
4748  ?( UPP ? min(j+1UL,k) : k )
4749  :( UPP ? min(j,k)+1UL : k+1UL ) )
4750  :( UPP ? j+1UL : M ) );
4751 
4752  if( ( LOW || UPP ) && ( ibegin >= iend ) ) continue;
4753  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4754 
4755  const size_t inum( iend - ibegin );
4756  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4757 
4758  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4759  (~C)(i ,j) -= A(i ,k) * B(k,j);
4760  (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
4761  }
4762  if( ipos < iend ) {
4763  (~C)(ipos,j) -= A(ipos,k) * B(k,j);
4764  }
4765  }
4766  }
4767  }
4769  //**********************************************************************************************
4770 
4771  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
4785  template< typename MT3 // Type of the left-hand side target matrix
4786  , typename MT4 // Type of the left-hand side matrix operand
4787  , typename MT5 > // Type of the right-hand side matrix operand
4788  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4789  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4790  {
4791  constexpr size_t block( BLOCK_SIZE );
4792 
4793  const size_t M( A.rows() );
4794  const size_t N( B.columns() );
4795 
4796  for( size_t ii=0UL; ii<M; ii+=block ) {
4797  const size_t iend( min( M, ii+block ) );
4798  for( size_t jj=0UL; jj<N; jj+=block ) {
4799  const size_t jend( min( N, jj+block ) );
4800  for( size_t i=ii; i<iend; ++i )
4801  {
4802  const size_t jbegin( ( IsUpper<MT4>::value )
4803  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
4804  :( jj ) );
4805  const size_t jpos( ( IsLower<MT4>::value )
4806  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
4807  :( jend ) );
4808 
4809  for( size_t j=jbegin; j<jpos; ++j ) {
4810  (~C)(i,j) -= A(i,j) * B(j,j);
4811  }
4812  }
4813  }
4814  }
4815  }
4817  //**********************************************************************************************
4818 
4819  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
4833  template< typename MT3 // Type of the left-hand side target matrix
4834  , typename MT4 // Type of the left-hand side matrix operand
4835  , typename MT5 > // Type of the right-hand side matrix operand
4836  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4837  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4838  {
4839  const size_t M( A.rows() );
4840  const size_t N( B.columns() );
4841 
4842  for( size_t j=0UL; j<N; ++j )
4843  {
4844  const size_t ibegin( ( IsLower<MT4>::value )
4845  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4846  :( 0UL ) );
4847  const size_t iend( ( IsUpper<MT4>::value )
4848  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
4849  :( M ) );
4850  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
4851 
4852  const size_t inum( iend - ibegin );
4853  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
4854 
4855  for( size_t i=ibegin; i<ipos; i+=2UL ) {
4856  (~C)(i ,j) -= A(i ,j) * B(j,j);
4857  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
4858  }
4859  if( ipos < iend ) {
4860  (~C)(ipos,j) -= A(ipos,j) * B(j,j);
4861  }
4862  }
4863  }
4865  //**********************************************************************************************
4866 
4867  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
4881  template< typename MT3 // Type of the left-hand side target matrix
4882  , typename MT4 // Type of the left-hand side matrix operand
4883  , typename MT5 > // Type of the right-hand side matrix operand
4884  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4885  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
4886  {
4887  const size_t M( A.rows() );
4888  const size_t N( B.columns() );
4889 
4890  for( size_t i=0UL; i<M; ++i )
4891  {
4892  const size_t jbegin( ( IsUpper<MT5>::value )
4893  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4894  :( 0UL ) );
4895  const size_t jend( ( IsLower<MT5>::value )
4896  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
4897  :( N ) );
4898  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
4899 
4900  const size_t jnum( jend - jbegin );
4901  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
4902 
4903  for( size_t j=jbegin; j<jpos; j+=2UL ) {
4904  (~C)(i,j ) -= A(i,i) * B(i,j );
4905  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
4906  }
4907  if( jpos < jend ) {
4908  (~C)(i,jpos) -= A(i,i) * B(i,jpos);
4909  }
4910  }
4911  }
4913  //**********************************************************************************************
4914 
4915  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
4929  template< typename MT3 // Type of the left-hand side target matrix
4930  , typename MT4 // Type of the left-hand side matrix operand
4931  , typename MT5 > // Type of the right-hand side matrix operand
4932  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4933  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
4934  {
4935  constexpr size_t block( BLOCK_SIZE );
4936 
4937  const size_t M( A.rows() );
4938  const size_t N( B.columns() );
4939 
4940  for( size_t jj=0UL; jj<N; jj+=block ) {
4941  const size_t jend( min( N, jj+block ) );
4942  for( size_t ii=0UL; ii<M; ii+=block ) {
4943  const size_t iend( min( M, ii+block ) );
4944  for( size_t j=jj; j<jend; ++j )
4945  {
4946  const size_t ibegin( ( IsLower<MT5>::value )
4947  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
4948  :( ii ) );
4949  const size_t ipos( ( IsUpper<MT5>::value )
4950  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
4951  :( iend ) );
4952 
4953  for( size_t i=ibegin; i<ipos; ++i ) {
4954  (~C)(i,j) -= A(i,i) * B(i,j);
4955  }
4956  }
4957  }
4958  }
4959  }
4961  //**********************************************************************************************
4962 
4963  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
4977  template< typename MT3 // Type of the left-hand side target matrix
4978  , typename MT4 // Type of the left-hand side matrix operand
4979  , typename MT5 > // Type of the right-hand side matrix operand
4980  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
4981  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
4982  {
4983  for( size_t i=0UL; i<A.rows(); ++i ) {
4984  C(i,i) -= A(i,i) * B(i,i);
4985  }
4986  }
4988  //**********************************************************************************************
4989 
4990  //**Default subtraction assignment to dense matrices (small matrices)***************************
5004  template< typename MT3 // Type of the left-hand side target matrix
5005  , typename MT4 // Type of the left-hand side matrix operand
5006  , typename MT5 > // Type of the right-hand side matrix operand
5008  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
5009  {
5010  selectDefaultSubAssignKernel( C, A, B );
5011  }
5013  //**********************************************************************************************
5014 
5015  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
5030  template< typename MT3 // Type of the left-hand side target matrix
5031  , typename MT4 // Type of the left-hand side matrix operand
5032  , typename MT5 > // Type of the right-hand side matrix operand
5034  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B )
5035  {
5036  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
5037 
5038  const size_t M( A.rows() );
5039  const size_t N( B.columns() );
5040  const size_t K( A.columns() );
5041 
5042  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5043 
5044  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
5045  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
5046 
5047  size_t j( 0UL );
5048 
5050  {
5051  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
5052  for( size_t i=0UL; i<M; ++i )
5053  {
5054  const size_t kbegin( ( IsUpper<MT4>::value )
5055  ?( ( IsLower<MT5>::value )
5056  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5057  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5058  :( IsLower<MT5>::value ? j : 0UL ) );
5059  const size_t kend( ( IsLower<MT4>::value )
5060  ?( ( IsUpper<MT5>::value )
5061  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
5062  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
5063  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
5064 
5065  SIMDType xmm1( (~C).load(i,j ) );
5066  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5067  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5068  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5069  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
5070  SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
5071  SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
5072  SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
5073 
5074  for( size_t k=kbegin; k<kend; ++k ) {
5075  const SIMDType a1( set( A(i,k) ) );
5076  xmm1 -= a1 * B.load(k,j );
5077  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5078  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5079  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5080  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5081  xmm6 -= a1 * B.load(k,j+SIMDSIZE*5UL);
5082  xmm7 -= a1 * B.load(k,j+SIMDSIZE*6UL);
5083  xmm8 -= a1 * B.load(k,j+SIMDSIZE*7UL);
5084  }
5085 
5086  (~C).store( i, j , xmm1 );
5087  (~C).store( i, j+SIMDSIZE , xmm2 );
5088  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5089  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
5090  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
5091  (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
5092  (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
5093  (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
5094  }
5095  }
5096  }
5097 
5098  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
5099  {
5100  size_t i( 0UL );
5101 
5102  for( ; (i+2UL) <= M; i+=2UL )
5103  {
5104  const size_t kbegin( ( IsUpper<MT4>::value )
5105  ?( ( IsLower<MT5>::value )
5106  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5107  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5108  :( IsLower<MT5>::value ? j : 0UL ) );
5109  const size_t kend( ( IsLower<MT4>::value )
5110  ?( ( IsUpper<MT5>::value )
5111  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
5112  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5113  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
5114 
5115  SIMDType xmm1 ( (~C).load(i ,j ) );
5116  SIMDType xmm2 ( (~C).load(i ,j+SIMDSIZE ) );
5117  SIMDType xmm3 ( (~C).load(i ,j+SIMDSIZE*2UL) );
5118  SIMDType xmm4 ( (~C).load(i ,j+SIMDSIZE*3UL) );
5119  SIMDType xmm5 ( (~C).load(i ,j+SIMDSIZE*4UL) );
5120  SIMDType xmm6 ( (~C).load(i+1UL,j ) );
5121  SIMDType xmm7 ( (~C).load(i+1UL,j+SIMDSIZE ) );
5122  SIMDType xmm8 ( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5123  SIMDType xmm9 ( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
5124  SIMDType xmm10( (~C).load(i+1UL,j+SIMDSIZE*4UL) );
5125 
5126  for( size_t k=kbegin; k<kend; ++k ) {
5127  const SIMDType a1( set( A(i ,k) ) );
5128  const SIMDType a2( set( A(i+1UL,k) ) );
5129  const SIMDType b1( B.load(k,j ) );
5130  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5131  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5132  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5133  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
5134  xmm1 -= a1 * b1;
5135  xmm2 -= a1 * b2;
5136  xmm3 -= a1 * b3;
5137  xmm4 -= a1 * b4;
5138  xmm5 -= a1 * b5;
5139  xmm6 -= a2 * b1;
5140  xmm7 -= a2 * b2;
5141  xmm8 -= a2 * b3;
5142  xmm9 -= a2 * b4;
5143  xmm10 -= a2 * b5;
5144  }
5145 
5146  (~C).store( i , j , xmm1 );
5147  (~C).store( i , j+SIMDSIZE , xmm2 );
5148  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5149  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
5150  (~C).store( i , j+SIMDSIZE*4UL, xmm5 );
5151  (~C).store( i+1UL, j , xmm6 );
5152  (~C).store( i+1UL, j+SIMDSIZE , xmm7 );
5153  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 );
5154  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 );
5155  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 );
5156  }
5157 
5158  if( i < M )
5159  {
5160  const size_t kbegin( ( IsUpper<MT4>::value )
5161  ?( ( IsLower<MT5>::value )
5162  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5163  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5164  :( IsLower<MT5>::value ? j : 0UL ) );
5165  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
5166 
5167  SIMDType xmm1( (~C).load(i,j ) );
5168  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5169  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5170  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5171  SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
5172 
5173  for( size_t k=kbegin; k<kend; ++k ) {
5174  const SIMDType a1( set( A(i,k) ) );
5175  xmm1 -= a1 * B.load(k,j );
5176  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5177  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5178  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5179  xmm5 -= a1 * B.load(k,j+SIMDSIZE*4UL);
5180  }
5181 
5182  (~C).store( i, j , xmm1 );
5183  (~C).store( i, j+SIMDSIZE , xmm2 );
5184  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5185  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
5186  (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
5187  }
5188  }
5189 
5190  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5191  {
5192  size_t i( 0UL );
5193 
5194  for( ; (i+2UL) <= M; i+=2UL )
5195  {
5196  const size_t kbegin( ( IsUpper<MT4>::value )
5197  ?( ( IsLower<MT5>::value )
5198  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5199  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5200  :( IsLower<MT5>::value ? j : 0UL ) );
5201  const size_t kend( ( IsLower<MT4>::value )
5202  ?( ( IsUpper<MT5>::value )
5203  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
5204  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5205  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
5206 
5207  SIMDType xmm1( (~C).load(i ,j ) );
5208  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
5209  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
5210  SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
5211  SIMDType xmm5( (~C).load(i+1UL,j ) );
5212  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
5213  SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5214  SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
5215 
5216  for( size_t k=kbegin; k<kend; ++k ) {
5217  const SIMDType a1( set( A(i ,k) ) );
5218  const SIMDType a2( set( A(i+1UL,k) ) );
5219  const SIMDType b1( B.load(k,j ) );
5220  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5221  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5222  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
5223  xmm1 -= a1 * b1;
5224  xmm2 -= a1 * b2;
5225  xmm3 -= a1 * b3;
5226  xmm4 -= a1 * b4;
5227  xmm5 -= a2 * b1;
5228  xmm6 -= a2 * b2;
5229  xmm7 -= a2 * b3;
5230  xmm8 -= a2 * b4;
5231  }
5232 
5233  (~C).store( i , j , xmm1 );
5234  (~C).store( i , j+SIMDSIZE , xmm2 );
5235  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5236  (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
5237  (~C).store( i+1UL, j , xmm5 );
5238  (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
5239  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
5240  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
5241  }
5242 
5243  if( i < M )
5244  {
5245  const size_t kbegin( ( IsUpper<MT4>::value )
5246  ?( ( IsLower<MT5>::value )
5247  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5248  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5249  :( IsLower<MT5>::value ? j : 0UL ) );
5250  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
5251 
5252  SIMDType xmm1( (~C).load(i,j ) );
5253  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5254  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5255  SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
5256 
5257  for( size_t k=kbegin; k<kend; ++k ) {
5258  const SIMDType a1( set( A(i,k) ) );
5259  xmm1 -= a1 * B.load(k,j );
5260  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5261  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5262  xmm4 -= a1 * B.load(k,j+SIMDSIZE*3UL);
5263  }
5264 
5265  (~C).store( i, j , xmm1 );
5266  (~C).store( i, j+SIMDSIZE , xmm2 );
5267  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5268  (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
5269  }
5270  }
5271 
5272  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
5273  {
5274  size_t i( 0UL );
5275 
5276  for( ; (i+2UL) <= M; i+=2UL )
5277  {
5278  const size_t kbegin( ( IsUpper<MT4>::value )
5279  ?( ( IsLower<MT5>::value )
5280  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5281  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5282  :( IsLower<MT5>::value ? j : 0UL ) );
5283  const size_t kend( ( IsLower<MT4>::value )
5284  ?( ( IsUpper<MT5>::value )
5285  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
5286  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5287  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
5288 
5289  SIMDType xmm1( (~C).load(i ,j ) );
5290  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
5291  SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
5292  SIMDType xmm4( (~C).load(i+1UL,j ) );
5293  SIMDType xmm5( (~C).load(i+1UL,j+SIMDSIZE ) );
5294  SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
5295 
5296  for( size_t k=kbegin; k<kend; ++k ) {
5297  const SIMDType a1( set( A(i ,k) ) );
5298  const SIMDType a2( set( A(i+1UL,k) ) );
5299  const SIMDType b1( B.load(k,j ) );
5300  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
5301  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
5302  xmm1 -= a1 * b1;
5303  xmm2 -= a1 * b2;
5304  xmm3 -= a1 * b3;
5305  xmm4 -= a2 * b1;
5306  xmm5 -= a2 * b2;
5307  xmm6 -= a2 * b3;
5308  }
5309 
5310  (~C).store( i , j , xmm1 );
5311  (~C).store( i , j+SIMDSIZE , xmm2 );
5312  (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
5313  (~C).store( i+1UL, j , xmm4 );
5314  (~C).store( i+1UL, j+SIMDSIZE , xmm5 );
5315  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 );
5316  }
5317 
5318  if( i < M )
5319  {
5320  const size_t kbegin( ( IsUpper<MT4>::value )
5321  ?( ( IsLower<MT5>::value )
5322  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5323  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5324  :( IsLower<MT5>::value ? j : 0UL ) );
5325  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
5326 
5327  SIMDType xmm1( (~C).load(i,j ) );
5328  SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
5329  SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
5330 
5331  for( size_t k=kbegin; k<kend; ++k ) {
5332  const SIMDType a1( set( A(i,k) ) );
5333  xmm1 -= a1 * B.load(k,j );
5334  xmm2 -= a1 * B.load(k,j+SIMDSIZE );
5335  xmm3 -= a1 * B.load(k,j+SIMDSIZE*2UL);
5336  }
5337 
5338  (~C).store( i, j , xmm1 );
5339  (~C).store( i, j+SIMDSIZE , xmm2 );
5340  (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
5341  }
5342  }
5343 
5344  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5345  {
5346  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
5347  size_t i( LOW ? j : 0UL );
5348 
5349  for( ; (i+4UL) <= iend; i+=4UL )
5350  {
5351  const size_t kbegin( ( IsUpper<MT4>::value )
5352  ?( ( IsLower<MT5>::value )
5353  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5354  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5355  :( IsLower<MT5>::value ? j : 0UL ) );
5356  const size_t kend( ( IsLower<MT4>::value )
5357  ?( ( IsUpper<MT5>::value )
5358  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
5359  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
5360  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
5361 
5362  SIMDType xmm1( (~C).load(i ,j ) );
5363  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5364  SIMDType xmm3( (~C).load(i+1UL,j ) );
5365  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5366  SIMDType xmm5( (~C).load(i+2UL,j ) );
5367  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
5368  SIMDType xmm7( (~C).load(i+3UL,j ) );
5369  SIMDType xmm8( (~C).load(i+3UL,j+SIMDSIZE) );
5370 
5371  for( size_t k=kbegin; k<kend; ++k ) {
5372  const SIMDType a1( set( A(i ,k) ) );
5373  const SIMDType a2( set( A(i+1UL,k) ) );
5374  const SIMDType a3( set( A(i+2UL,k) ) );
5375  const SIMDType a4( set( A(i+3UL,k) ) );
5376  const SIMDType b1( B.load(k,j ) );
5377  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5378  xmm1 -= a1 * b1;
5379  xmm2 -= a1 * b2;
5380  xmm3 -= a2 * b1;
5381  xmm4 -= a2 * b2;
5382  xmm5 -= a3 * b1;
5383  xmm6 -= a3 * b2;
5384  xmm7 -= a4 * b1;
5385  xmm8 -= a4 * b2;
5386  }
5387 
5388  (~C).store( i , j , xmm1 );
5389  (~C).store( i , j+SIMDSIZE, xmm2 );
5390  (~C).store( i+1UL, j , xmm3 );
5391  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
5392  (~C).store( i+2UL, j , xmm5 );
5393  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
5394  (~C).store( i+3UL, j , xmm7 );
5395  (~C).store( i+3UL, j+SIMDSIZE, xmm8 );
5396  }
5397 
5398  for( ; (i+3UL) <= iend; i+=3UL )
5399  {
5400  const size_t kbegin( ( IsUpper<MT4>::value )
5401  ?( ( IsLower<MT5>::value )
5402  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5403  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5404  :( IsLower<MT5>::value ? j : 0UL ) );
5405  const size_t kend( ( IsLower<MT4>::value )
5406  ?( ( IsUpper<MT5>::value )
5407  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
5408  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
5409  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
5410 
5411  SIMDType xmm1( (~C).load(i ,j ) );
5412  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5413  SIMDType xmm3( (~C).load(i+1UL,j ) );
5414  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5415  SIMDType xmm5( (~C).load(i+2UL,j ) );
5416  SIMDType xmm6( (~C).load(i+2UL,j+SIMDSIZE) );
5417 
5418  for( size_t k=kbegin; k<kend; ++k ) {
5419  const SIMDType a1( set( A(i ,k) ) );
5420  const SIMDType a2( set( A(i+1UL,k) ) );
5421  const SIMDType a3( set( A(i+2UL,k) ) );
5422  const SIMDType b1( B.load(k,j ) );
5423  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5424  xmm1 -= a1 * b1;
5425  xmm2 -= a1 * b2;
5426  xmm3 -= a2 * b1;
5427  xmm4 -= a2 * b2;
5428  xmm5 -= a3 * b1;
5429  xmm6 -= a3 * b2;
5430  }
5431 
5432  (~C).store( i , j , xmm1 );
5433  (~C).store( i , j+SIMDSIZE, xmm2 );
5434  (~C).store( i+1UL, j , xmm3 );
5435  (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
5436  (~C).store( i+2UL, j , xmm5 );
5437  (~C).store( i+2UL, j+SIMDSIZE, xmm6 );
5438  }
5439 
5440  for( ; (i+2UL) <= iend; i+=2UL )
5441  {
5442  const size_t kbegin( ( IsUpper<MT4>::value )
5443  ?( ( IsLower<MT5>::value )
5444  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5445  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5446  :( IsLower<MT5>::value ? j : 0UL ) );
5447  const size_t kend( ( IsLower<MT4>::value )
5448  ?( ( IsUpper<MT5>::value )
5449  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
5450  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5451  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
5452 
5453  SIMDType xmm1( (~C).load(i ,j ) );
5454  SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
5455  SIMDType xmm3( (~C).load(i+1UL,j ) );
5456  SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
5457  SIMDType xmm5, xmm6, xmm7, xmm8;
5458  size_t k( kbegin );
5459 
5460  for( ; (k+2UL) <= kend; k+=2UL ) {
5461  const SIMDType a1( set( A(i ,k ) ) );
5462  const SIMDType a2( set( A(i+1UL,k ) ) );
5463  const SIMDType a3( set( A(i ,k+1UL) ) );
5464  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
5465  const SIMDType b1( B.load(k ,j ) );
5466  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
5467  const SIMDType b3( B.load(k+1UL,j ) );
5468  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
5469  xmm1 -= a1 * b1;
5470  xmm2 -= a1 * b2;
5471  xmm3 -= a2 * b1;
5472  xmm4 -= a2 * b2;
5473  xmm5 -= a3 * b3;
5474  xmm6 -= a3 * b4;
5475  xmm7 -= a4 * b3;
5476  xmm8 -= a4 * b4;
5477  }
5478 
5479  for( ; k<kend; ++k ) {
5480  const SIMDType a1( set( A(i ,k) ) );
5481  const SIMDType a2( set( A(i+1UL,k) ) );
5482  const SIMDType b1( B.load(k,j ) );
5483  const SIMDType b2( B.load(k,j+SIMDSIZE) );
5484  xmm1 -= a1 * b1;
5485  xmm2 -= a1 * b2;
5486  xmm3 -= a2 * b1;
5487  xmm4 -= a2 * b2;
5488  }
5489 
5490  (~C).store( i , j , xmm1+xmm5 );
5491  (~C).store( i , j+SIMDSIZE, xmm2+xmm6 );
5492  (~C).store( i+1UL, j , xmm3+xmm7 );
5493  (~C).store( i+1UL, j+SIMDSIZE, xmm4+xmm8 );
5494  }
5495 
5496  if( i < iend )
5497  {
5498  const size_t kbegin( ( IsUpper<MT4>::value )
5499  ?( ( IsLower<MT5>::value )
5500  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5501  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5502  :( IsLower<MT5>::value ? j : 0UL ) );
5503  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
5504 
5505  SIMDType xmm1( (~C).load(i,j ) );
5506  SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
5507  SIMDType xmm3, xmm4;
5508  size_t k( kbegin );
5509 
5510  for( ; (k+2UL) <= kend; k+=2UL ) {
5511  const SIMDType a1( set( A(i,k ) ) );
5512  const SIMDType a2( set( A(i,k+1UL) ) );
5513  xmm1 -= a1 * B.load(k ,j );
5514  xmm2 -= a1 * B.load(k ,j+SIMDSIZE);
5515  xmm3 -= a2 * B.load(k+1UL,j );
5516  xmm4 -= a2 * B.load(k+1UL,j+SIMDSIZE);
5517  }
5518 
5519  for( ; k<kend; ++k ) {
5520  const SIMDType a1( set( A(i,k) ) );
5521  xmm1 -= a1 * B.load(k,j );
5522  xmm2 -= a1 * B.load(k,j+SIMDSIZE);
5523  }
5524 
5525  (~C).store( i, j , xmm1+xmm3 );
5526  (~C).store( i, j+SIMDSIZE, xmm2+xmm4 );
5527  }
5528  }
5529 
5530  for( ; j<jpos; j+=SIMDSIZE )
5531  {
5532  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
5533  size_t i( LOW ? j : 0UL );
5534 
5535  for( ; (i+4UL) <= iend; i+=4UL )
5536  {
5537  const size_t kbegin( ( IsUpper<MT4>::value )
5538  ?( ( IsLower<MT5>::value )
5539  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5540  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5541  :( IsLower<MT5>::value ? j : 0UL ) );
5542  const size_t kend( ( IsLower<MT4>::value )
5543  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
5544  :( K ) );
5545 
5546  SIMDType xmm1( (~C).load(i ,j) );
5547  SIMDType xmm2( (~C).load(i+1UL,j) );
5548  SIMDType xmm3( (~C).load(i+2UL,j) );
5549  SIMDType xmm4( (~C).load(i+3UL,j) );
5550  SIMDType xmm5, xmm6, xmm7, xmm8;
5551  size_t k( kbegin );
5552 
5553  for( ; (k+2UL) <= kend; k+=2UL ) {
5554  const SIMDType b1( B.load(k ,j) );
5555  const SIMDType b2( B.load(k+1UL,j) );
5556  xmm1 -= set( A(i ,k ) ) * b1;
5557  xmm2 -= set( A(i+1UL,k ) ) * b1;
5558  xmm3 -= set( A(i+2UL,k ) ) * b1;
5559  xmm4 -= set( A(i+3UL,k ) ) * b1;
5560  xmm5 -= set( A(i ,k+1UL) ) * b2;
5561  xmm6 -= set( A(i+1UL,k+1UL) ) * b2;
5562  xmm7 -= set( A(i+2UL,k+1UL) ) * b2;
5563  xmm8 -= set( A(i+3UL,k+1UL) ) * b2;
5564  }
5565 
5566  for( ; k<kend; ++k ) {
5567  const SIMDType b1( B.load(k,j) );
5568  xmm1 -= set( A(i ,k) ) * b1;
5569  xmm2 -= set( A(i+1UL,k) ) * b1;
5570  xmm3 -= set( A(i+2UL,k) ) * b1;
5571  xmm4 -= set( A(i+3UL,k) ) * b1;
5572  }
5573 
5574  (~C).store( i , j, xmm1+xmm5 );
5575  (~C).store( i+1UL, j, xmm2+xmm6 );
5576  (~C).store( i+2UL, j, xmm3+xmm7 );
5577  (~C).store( i+3UL, j, xmm4+xmm8 );
5578  }
5579 
5580  for( ; (i+3UL) <= iend; i+=3UL )
5581  {
5582  const size_t kbegin( ( IsUpper<MT4>::value )
5583  ?( ( IsLower<MT5>::value )
5584  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5585  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5586  :( IsLower<MT5>::value ? j : 0UL ) );
5587  const size_t kend( ( IsLower<MT4>::value )
5588  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
5589  :( K ) );
5590 
5591  SIMDType xmm1( (~C).load(i ,j) );
5592  SIMDType xmm2( (~C).load(i+1UL,j) );
5593  SIMDType xmm3( (~C).load(i+2UL,j) );
5594  SIMDType xmm4, xmm5, xmm6;
5595  size_t k( kbegin );
5596 
5597  for( ; (k+2UL) <= kend; k+=2UL ) {
5598  const SIMDType b1( B.load(k ,j) );
5599  const SIMDType b2( B.load(k+1UL,j) );
5600  xmm1 -= set( A(i ,k ) ) * b1;
5601  xmm2 -= set( A(i+1UL,k ) ) * b1;
5602  xmm3 -= set( A(i+2UL,k ) ) * b1;
5603  xmm4 -= set( A(i ,k+1UL) ) * b2;
5604  xmm5 -= set( A(i+1UL,k+1UL) ) * b2;
5605  xmm6 -= set( A(i+2UL,k+1UL) ) * b2;
5606  }
5607 
5608  for( ; k<kend; ++k ) {
5609  const SIMDType b1( B.load(k,j) );
5610  xmm1 -= set( A(i ,k) ) * b1;
5611  xmm2 -= set( A(i+1UL,k) ) * b1;
5612  xmm3 -= set( A(i+2UL,k) ) * b1;
5613  }
5614 
5615  (~C).store( i , j, xmm1+xmm4 );
5616  (~C).store( i+1UL, j, xmm2+xmm5 );
5617  (~C).store( i+2UL, j, xmm3+xmm6 );
5618  }
5619 
5620  for( ; (i+2UL) <= iend; i+=2UL )
5621  {
5622  const size_t kbegin( ( IsUpper<MT4>::value )
5623  ?( ( IsLower<MT5>::value )
5624  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5625  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5626  :( IsLower<MT5>::value ? j : 0UL ) );
5627  const size_t kend( ( IsLower<MT4>::value )
5628  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5629  :( K ) );
5630 
5631  SIMDType xmm1( (~C).load(i ,j) );
5632  SIMDType xmm2( (~C).load(i+1UL,j) );
5633  SIMDType xmm3, xmm4;
5634  size_t k( kbegin );
5635 
5636  for( ; (k+2UL) <= kend; k+=2UL ) {
5637  const SIMDType b1( B.load(k ,j) );
5638  const SIMDType b2( B.load(k+1UL,j) );
5639  xmm1 -= set( A(i ,k ) ) * b1;
5640  xmm2 -= set( A(i+1UL,k ) ) * b1;
5641  xmm3 -= set( A(i ,k+1UL) ) * b2;
5642  xmm4 -= set( A(i+1UL,k+1UL) ) * b2;
5643  }
5644 
5645  for( ; k<kend; ++k ) {
5646  const SIMDType b1( B.load(k,j) );
5647  xmm1 -= set( A(i ,k) ) * b1;
5648  xmm2 -= set( A(i+1UL,k) ) * b1;
5649  }
5650 
5651  (~C).store( i , j, xmm1+xmm3 );
5652  (~C).store( i+1UL, j, xmm2+xmm4 );
5653  }
5654 
5655  if( i < iend )
5656  {
5657  const size_t kbegin( ( IsUpper<MT4>::value )
5658  ?( ( IsLower<MT5>::value )
5659  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5660  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5661  :( IsLower<MT5>::value ? j : 0UL ) );
5662 
5663  SIMDType xmm1( (~C).load(i,j) );
5664  SIMDType xmm2;
5665  size_t k( kbegin );
5666 
5667  for( ; (k+2UL) <= K; k+=2UL ) {
5668  xmm1 -= set( A(i,k ) ) * B.load(k ,j);
5669  xmm2 -= set( A(i,k+1UL) ) * B.load(k+1UL,j);
5670  }
5671 
5672  for( ; k<K; ++k ) {
5673  xmm1 -= set( A(i,k) ) * B.load(k,j);
5674  }
5675 
5676  (~C).store( i, j, xmm1+xmm2 );
5677  }
5678  }
5679 
5680  for( ; remainder && j<N; ++j )
5681  {
5682  const size_t iend( UPP ? j+1UL : M );
5683  size_t i( LOW ? j : 0UL );
5684 
5685  for( ; (i+2UL) <= iend; i+=2UL )
5686  {
5687  const size_t kbegin( ( IsUpper<MT4>::value )
5688  ?( ( IsLower<MT5>::value )
5689  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5690  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5691  :( IsLower<MT5>::value ? j : 0UL ) );
5692  const size_t kend( ( IsLower<MT4>::value )
5693  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5694  :( K ) );
5695 
5696  ElementType value1( (~C)(i ,j) );
5697  ElementType value2( (~C)(i+1UL,j) );
5698 
5699  for( size_t k=kbegin; k<kend; ++k ) {
5700  value1 -= A(i ,k) * B(k,j);
5701  value2 -= A(i+1UL,k) * B(k,j);
5702  }
5703 
5704  (~C)(i ,j) = value1;
5705  (~C)(i+1UL,j) = value2;
5706  }
5707 
5708  if( i < iend )
5709  {
5710  const size_t kbegin( ( IsUpper<MT4>::value )
5711  ?( ( IsLower<MT5>::value )
5712  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5713  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5714  :( IsLower<MT5>::value ? j : 0UL ) );
5715 
5716  ElementType value( (~C)(i,j) );
5717 
5718  for( size_t k=kbegin; k<K; ++k ) {
5719  value -= A(i,k) * B(k,j);
5720  }
5721 
5722  (~C)(i,j) = value;
5723  }
5724  }
5725  }
5727  //**********************************************************************************************
5728 
5729  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
5744  template< typename MT3 // Type of the left-hand side target matrix
5745  , typename MT4 // Type of the left-hand side matrix operand
5746  , typename MT5 > // Type of the right-hand side matrix operand
5748  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B )
5749  {
5750  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
5751 
5752  const size_t M( A.rows() );
5753  const size_t N( B.columns() );
5754  const size_t K( A.columns() );
5755 
5756  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
5757 
5758  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
5759  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
5760 
5761  size_t i( 0UL );
5762 
5764  {
5765  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5766  for( size_t j=0UL; j<N; ++j )
5767  {
5768  const size_t kbegin( ( IsLower<MT5>::value )
5769  ?( ( IsUpper<MT4>::value )
5770  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5771  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5772  :( IsUpper<MT4>::value ? i : 0UL ) );
5773  const size_t kend( ( IsUpper<MT5>::value )
5774  ?( ( IsLower<MT4>::value )
5775  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
5776  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
5777  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
5778 
5779  SIMDType xmm1( (~C).load(i ,j) );
5780  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5781  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5782  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5783  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
5784  SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
5785  SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
5786  SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
5787 
5788  for( size_t k=kbegin; k<kend; ++k ) {
5789  const SIMDType b1( set( B(k,j) ) );
5790  xmm1 -= A.load(i ,k) * b1;
5791  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5792  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5793  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5794  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5795  xmm6 -= A.load(i+SIMDSIZE*5UL,k) * b1;
5796  xmm7 -= A.load(i+SIMDSIZE*6UL,k) * b1;
5797  xmm8 -= A.load(i+SIMDSIZE*7UL,k) * b1;
5798  }
5799 
5800  (~C).store( i , j, xmm1 );
5801  (~C).store( i+SIMDSIZE , j, xmm2 );
5802  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5803  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
5804  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
5805  (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
5806  (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
5807  (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
5808  }
5809  }
5810  }
5811 
5812  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
5813  {
5814  size_t j( 0UL );
5815 
5816  for( ; (j+2UL) <= N; j+=2UL )
5817  {
5818  const size_t kbegin( ( IsLower<MT5>::value )
5819  ?( ( IsUpper<MT4>::value )
5820  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5821  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5822  :( IsUpper<MT4>::value ? i : 0UL ) );
5823  const size_t kend( ( IsUpper<MT5>::value )
5824  ?( ( IsLower<MT4>::value )
5825  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5826  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5827  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
5828 
5829  SIMDType xmm1 ( (~C).load(i ,j ) );
5830  SIMDType xmm2 ( (~C).load(i+SIMDSIZE ,j ) );
5831  SIMDType xmm3 ( (~C).load(i+SIMDSIZE*2UL,j ) );
5832  SIMDType xmm4 ( (~C).load(i+SIMDSIZE*3UL,j ) );
5833  SIMDType xmm5 ( (~C).load(i+SIMDSIZE*4UL,j ) );
5834  SIMDType xmm6 ( (~C).load(i ,j+1UL) );
5835  SIMDType xmm7 ( (~C).load(i+SIMDSIZE ,j+1UL) );
5836  SIMDType xmm8 ( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
5837  SIMDType xmm9 ( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
5838  SIMDType xmm10( (~C).load(i+SIMDSIZE*4UL,j+1UL) );
5839 
5840  for( size_t k=kbegin; k<kend; ++k ) {
5841  const SIMDType a1( A.load(i ,k) );
5842  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5843  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5844  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5845  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
5846  const SIMDType b1( set( B(k,j ) ) );
5847  const SIMDType b2( set( B(k,j+1UL) ) );
5848  xmm1 -= a1 * b1;
5849  xmm2 -= a2 * b1;
5850  xmm3 -= a3 * b1;
5851  xmm4 -= a4 * b1;
5852  xmm5 -= a5 * b1;
5853  xmm6 -= a1 * b2;
5854  xmm7 -= a2 * b2;
5855  xmm8 -= a3 * b2;
5856  xmm9 -= a4 * b2;
5857  xmm10 -= a5 * b2;
5858  }
5859 
5860  (~C).store( i , j , xmm1 );
5861  (~C).store( i+SIMDSIZE , j , xmm2 );
5862  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
5863  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
5864  (~C).store( i+SIMDSIZE*4UL, j , xmm5 );
5865  (~C).store( i , j+1UL, xmm6 );
5866  (~C).store( i+SIMDSIZE , j+1UL, xmm7 );
5867  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 );
5868  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 );
5869  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 );
5870  }
5871 
5872  if( j < N )
5873  {
5874  const size_t kbegin( ( IsLower<MT5>::value )
5875  ?( ( IsUpper<MT4>::value )
5876  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5877  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5878  :( IsUpper<MT4>::value ? i : 0UL ) );
5879  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
5880 
5881  SIMDType xmm1( (~C).load(i ,j) );
5882  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5883  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5884  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5885  SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
5886 
5887  for( size_t k=kbegin; k<kend; ++k ) {
5888  const SIMDType b1( set( B(k,j) ) );
5889  xmm1 -= A.load(i ,k) * b1;
5890  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5891  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5892  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5893  xmm5 -= A.load(i+SIMDSIZE*4UL,k) * b1;
5894  }
5895 
5896  (~C).store( i , j, xmm1 );
5897  (~C).store( i+SIMDSIZE , j, xmm2 );
5898  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5899  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
5900  (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
5901  }
5902  }
5903 
5904  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5905  {
5906  size_t j( 0UL );
5907 
5908  for( ; (j+2UL) <= N; j+=2UL )
5909  {
5910  const size_t kbegin( ( IsLower<MT5>::value )
5911  ?( ( IsUpper<MT4>::value )
5912  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5913  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5914  :( IsUpper<MT4>::value ? i : 0UL ) );
5915  const size_t kend( ( IsUpper<MT5>::value )
5916  ?( ( IsLower<MT4>::value )
5917  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5918  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5919  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
5920 
5921  SIMDType xmm1( (~C).load(i ,j ) );
5922  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
5923  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
5924  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
5925  SIMDType xmm5( (~C).load(i ,j+1UL) );
5926  SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
5927  SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
5928  SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
5929 
5930  for( size_t k=kbegin; k<kend; ++k ) {
5931  const SIMDType a1( A.load(i ,k) );
5932  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5933  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5934  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5935  const SIMDType b1( set( B(k,j ) ) );
5936  const SIMDType b2( set( B(k,j+1UL) ) );
5937  xmm1 -= a1 * b1;
5938  xmm2 -= a2 * b1;
5939  xmm3 -= a3 * b1;
5940  xmm4 -= a4 * b1;
5941  xmm5 -= a1 * b2;
5942  xmm6 -= a2 * b2;
5943  xmm7 -= a3 * b2;
5944  xmm8 -= a4 * b2;
5945  }
5946 
5947  (~C).store( i , j , xmm1 );
5948  (~C).store( i+SIMDSIZE , j , xmm2 );
5949  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
5950  (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
5951  (~C).store( i , j+1UL, xmm5 );
5952  (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
5953  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
5954  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
5955  }
5956 
5957  if( j < N )
5958  {
5959  const size_t kbegin( ( IsLower<MT5>::value )
5960  ?( ( IsUpper<MT4>::value )
5961  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5962  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5963  :( IsUpper<MT4>::value ? i : 0UL ) );
5964  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
5965 
5966  SIMDType xmm1( (~C).load(i ,j) );
5967  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
5968  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
5969  SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
5970 
5971  for( size_t k=kbegin; k<kend; ++k ) {
5972  const SIMDType b1( set( B(k,j) ) );
5973  xmm1 -= A.load(i ,k) * b1;
5974  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
5975  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
5976  xmm4 -= A.load(i+SIMDSIZE*3UL,k) * b1;
5977  }
5978 
5979  (~C).store( i , j, xmm1 );
5980  (~C).store( i+SIMDSIZE , j, xmm2 );
5981  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
5982  (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
5983  }
5984  }
5985 
5986  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
5987  {
5988  size_t j( 0UL );
5989 
5990  for( ; (j+2UL) <= N; j+=2UL )
5991  {
5992  const size_t kbegin( ( IsLower<MT5>::value )
5993  ?( ( IsUpper<MT4>::value )
5994  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5995  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5996  :( IsUpper<MT4>::value ? i : 0UL ) );
5997  const size_t kend( ( IsUpper<MT5>::value )
5998  ?( ( IsLower<MT4>::value )
5999  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6000  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6001  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
6002 
6003  SIMDType xmm1( (~C).load(i ,j ) );
6004  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
6005  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
6006  SIMDType xmm4( (~C).load(i ,j+1UL) );
6007  SIMDType xmm5( (~C).load(i+SIMDSIZE ,j+1UL) );
6008  SIMDType xmm6( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
6009 
6010  for( size_t k=kbegin; k<kend; ++k ) {
6011  const SIMDType a1( A.load(i ,k) );
6012  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6013  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6014  const SIMDType b1( set( B(k,j ) ) );
6015  const SIMDType b2( set( B(k,j+1UL) ) );
6016  xmm1 -= a1 * b1;
6017  xmm2 -= a2 * b1;
6018  xmm3 -= a3 * b1;
6019  xmm4 -= a1 * b2;
6020  xmm5 -= a2 * b2;
6021  xmm6 -= a3 * b2;
6022  }
6023 
6024  (~C).store( i , j , xmm1 );
6025  (~C).store( i+SIMDSIZE , j , xmm2 );
6026  (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
6027  (~C).store( i , j+1UL, xmm4 );
6028  (~C).store( i+SIMDSIZE , j+1UL, xmm5 );
6029  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 );
6030  }
6031 
6032  if( j < N )
6033  {
6034  const size_t kbegin( ( IsLower<MT5>::value )
6035  ?( ( IsUpper<MT4>::value )
6036  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6037  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6038  :( IsUpper<MT4>::value ? i : 0UL ) );
6039  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
6040 
6041  SIMDType xmm1( (~C).load(i ,j) );
6042  SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
6043  SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
6044 
6045  for( size_t k=kbegin; k<kend; ++k ) {
6046  const SIMDType b1( set( B(k,j) ) );
6047  xmm1 -= A.load(i ,k) * b1;
6048  xmm2 -= A.load(i+SIMDSIZE ,k) * b1;
6049  xmm3 -= A.load(i+SIMDSIZE*2UL,k) * b1;
6050  }
6051 
6052  (~C).store( i , j, xmm1 );
6053  (~C).store( i+SIMDSIZE , j, xmm2 );
6054  (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
6055  }
6056  }
6057 
6058  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6059  {
6060  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
6061  size_t j( UPP ? i : 0UL );
6062 
6063  for( ; (j+4UL) <= jend; j+=4UL )
6064  {
6065  const size_t kbegin( ( IsLower<MT5>::value )
6066  ?( ( IsUpper<MT4>::value )
6067  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6068  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6069  :( IsUpper<MT4>::value ? i : 0UL ) );
6070  const size_t kend( ( IsUpper<MT5>::value )
6071  ?( ( IsLower<MT4>::value )
6072  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
6073  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
6074  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6075 
6076  SIMDType xmm1( (~C).load(i ,j ) );
6077  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6078  SIMDType xmm3( (~C).load(i ,j+1UL) );
6079  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6080  SIMDType xmm5( (~C).load(i ,j+2UL) );
6081  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
6082  SIMDType xmm7( (~C).load(i ,j+3UL) );
6083  SIMDType xmm8( (~C).load(i+SIMDSIZE,j+3UL) );
6084 
6085  for( size_t k=kbegin; k<kend; ++k ) {
6086  const SIMDType a1( A.load(i ,k) );
6087  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6088  const SIMDType b1( set( B(k,j ) ) );
6089  const SIMDType b2( set( B(k,j+1UL) ) );
6090  const SIMDType b3( set( B(k,j+2UL) ) );
6091  const SIMDType b4( set( B(k,j+3UL) ) );
6092  xmm1 -= a1 * b1;
6093  xmm2 -= a2 * b1;
6094  xmm3 -= a1 * b2;
6095  xmm4 -= a2 * b2;
6096  xmm5 -= a1 * b3;
6097  xmm6 -= a2 * b3;
6098  xmm7 -= a1 * b4;
6099  xmm8 -= a2 * b4;
6100  }
6101 
6102  (~C).store( i , j , xmm1 );
6103  (~C).store( i+SIMDSIZE, j , xmm2 );
6104  (~C).store( i , j+1UL, xmm3 );
6105  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
6106  (~C).store( i , j+2UL, xmm5 );
6107  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
6108  (~C).store( i , j+3UL, xmm7 );
6109  (~C).store( i+SIMDSIZE, j+3UL, xmm8 );
6110  }
6111 
6112  for( ; (j+3UL) <= jend; j+=3UL )
6113  {
6114  const size_t kbegin( ( IsLower<MT5>::value )
6115  ?( ( IsUpper<MT4>::value )
6116  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6117  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6118  :( IsUpper<MT4>::value ? i : 0UL ) );
6119  const size_t kend( ( IsUpper<MT5>::value )
6120  ?( ( IsLower<MT4>::value )
6121  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
6122  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
6123  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6124 
6125  SIMDType xmm1( (~C).load(i ,j ) );
6126  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6127  SIMDType xmm3( (~C).load(i ,j+1UL) );
6128  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6129  SIMDType xmm5( (~C).load(i ,j+2UL) );
6130  SIMDType xmm6( (~C).load(i+SIMDSIZE,j+2UL) );
6131 
6132  for( size_t k=kbegin; k<kend; ++k ) {
6133  const SIMDType a1( A.load(i ,k) );
6134  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6135  const SIMDType b1( set( B(k,j ) ) );
6136  const SIMDType b2( set( B(k,j+1UL) ) );
6137  const SIMDType b3( set( B(k,j+2UL) ) );
6138  xmm1 -= a1 * b1;
6139  xmm2 -= a2 * b1;
6140  xmm3 -= a1 * b2;
6141  xmm4 -= a2 * b2;
6142  xmm5 -= a1 * b3;
6143  xmm6 -= a2 * b3;
6144  }
6145 
6146  (~C).store( i , j , xmm1 );
6147  (~C).store( i+SIMDSIZE, j , xmm2 );
6148  (~C).store( i , j+1UL, xmm3 );
6149  (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
6150  (~C).store( i , j+2UL, xmm5 );
6151  (~C).store( i+SIMDSIZE, j+2UL, xmm6 );
6152  }
6153 
6154  for( ; (j+2UL) <= jend; j+=2UL )
6155  {
6156  const size_t kbegin( ( IsLower<MT5>::value )
6157  ?( ( IsUpper<MT4>::value )
6158  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6159  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6160  :( IsUpper<MT4>::value ? i : 0UL ) );
6161  const size_t kend( ( IsUpper<MT5>::value )
6162  ?( ( IsLower<MT4>::value )
6163  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6164  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6165  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
6166 
6167  SIMDType xmm1( (~C).load(i ,j ) );
6168  SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
6169  SIMDType xmm3( (~C).load(i ,j+1UL) );
6170  SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
6171  SIMDType xmm5, xmm6, xmm7, xmm8;
6172  size_t k( kbegin );
6173 
6174  for( ; (k+2UL) <= kend; k+=2UL ) {
6175  const SIMDType a1( A.load(i ,k ) );
6176  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
6177  const SIMDType a3( A.load(i ,k+1UL) );
6178  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
6179  const SIMDType b1( set( B(k ,j ) ) );
6180  const SIMDType b2( set( B(k ,j+1UL) ) );
6181  const SIMDType b3( set( B(k+1UL,j ) ) );
6182  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
6183  xmm1 -= a1 * b1;
6184  xmm2 -= a2 * b1;
6185  xmm3 -= a1 * b2;
6186  xmm4 -= a2 * b2;
6187  xmm5 -= a3 * b3;
6188  xmm6 -= a4 * b3;
6189  xmm7 -= a3 * b4;
6190  xmm8 -= a4 * b4;
6191  }
6192 
6193  for( ; k<kend; ++k ) {
6194  const SIMDType a1( A.load(i ,k) );
6195  const SIMDType a2( A.load(i+SIMDSIZE,k) );
6196  const SIMDType b1( set( B(k,j ) ) );
6197  const SIMDType b2( set( B(k,j+1UL) ) );
6198  xmm1 -= a1 * b1;
6199  xmm2 -= a2 * b1;
6200  xmm3 -= a1 * b2;
6201  xmm4 -= a2 * b2;
6202  }
6203 
6204  (~C).store( i , j , xmm1+xmm5 );
6205  (~C).store( i+SIMDSIZE, j , xmm2+xmm6 );
6206  (~C).store( i , j+1UL, xmm3+xmm7 );
6207  (~C).store( i+SIMDSIZE, j+1UL, xmm4+xmm8 );
6208  }
6209 
6210  if( j < jend )
6211  {
6212  const size_t kbegin( ( IsLower<MT5>::value )
6213  ?( ( IsUpper<MT4>::value )
6214  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6215  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6216  :( IsUpper<MT4>::value ? i : 0UL ) );
6217  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
6218 
6219  SIMDType xmm1( (~C).load(i ,j) );
6220  SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
6221  SIMDType xmm3, xmm4;
6222  size_t k( kbegin );
6223 
6224  for( ; (k+2UL) <= kend; k+=2UL ) {
6225  const SIMDType b1( set( B(k ,j) ) );
6226  const SIMDType b2( set( B(k+1UL,j) ) );
6227  xmm1 -= A.load(i ,k ) * b1;
6228  xmm2 -= A.load(i+SIMDSIZE,k ) * b1;
6229  xmm3 -= A.load(i ,k+1UL) * b2;
6230  xmm4 -= A.load(i+SIMDSIZE,k+1UL) * b2;
6231  }
6232 
6233  for( ; k<kend; ++k ) {
6234  const SIMDType b1( set( B(k,j) ) );
6235  xmm1 -= A.load(i ,k) * b1;
6236  xmm2 -= A.load(i+SIMDSIZE,k) * b1;
6237  }
6238 
6239  (~C).store( i , j, xmm1+xmm3 );
6240  (~C).store( i+SIMDSIZE, j, xmm2+xmm4 );
6241  }
6242  }
6243 
6244  for( ; i<ipos; i+=SIMDSIZE )
6245  {
6246  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
6247  size_t j( UPP ? i : 0UL );
6248 
6249  for( ; (j+4UL) <= jend; j+=4UL )
6250  {
6251  const size_t kbegin( ( IsLower<MT5>::value )
6252  ?( ( IsUpper<MT4>::value )
6253  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6254  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6255  :( IsUpper<MT4>::value ? i : 0UL ) );
6256  const size_t kend( ( IsUpper<MT5>::value )
6257  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
6258  :( K ) );
6259 
6260  SIMDType xmm1( (~C).load(i,j ) );
6261  SIMDType xmm2( (~C).load(i,j+1UL) );
6262  SIMDType xmm3( (~C).load(i,j+2UL) );
6263  SIMDType xmm4( (~C).load(i,j+3UL) );
6264  SIMDType xmm5, xmm6, xmm7, xmm8;
6265  size_t k( kbegin );
6266 
6267  for( ; (k+2UL) <= kend; k+=2UL ) {
6268  const SIMDType a1( A.load(i,k ) );
6269  const SIMDType a2( A.load(i,k+1UL) );
6270  xmm1 -= a1 * set( B(k ,j ) );
6271  xmm2 -= a1 * set( B(k ,j+1UL) );
6272  xmm3 -= a1 * set( B(k ,j+2UL) );
6273  xmm4 -= a1 * set( B(k ,j+3UL) );
6274  xmm5 -= a2 * set( B(k+1UL,j ) );
6275  xmm6 -= a2 * set( B(k+1UL,j+1UL) );
6276  xmm7 -= a2 * set( B(k+1UL,j+2UL) );
6277  xmm8 -= a2 * set( B(k+1UL,j+3UL) );
6278  }
6279 
6280  for( ; k<kend; ++k ) {
6281  const SIMDType a1( A.load(i,k) );
6282  xmm1 -= a1 * set( B(k,j ) );
6283  xmm2 -= a1 * set( B(k,j+1UL) );
6284  xmm3 -= a1 * set( B(k,j+2UL) );
6285  xmm4 -= a1 * set( B(k,j+3UL) );
6286  }
6287 
6288  (~C).store( i, j , xmm1+xmm5 );
6289  (~C).store( i, j+1UL, xmm2+xmm6 );
6290  (~C).store( i, j+2UL, xmm3+xmm7 );
6291  (~C).store( i, j+3UL, xmm4+xmm8 );
6292  }
6293 
6294  for( ; (j+3UL) <= jend; j+=3UL )
6295  {
6296  const size_t kbegin( ( IsLower<MT5>::value )
6297  ?( ( IsUpper<MT4>::value )
6298  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6299  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6300  :( IsUpper<MT4>::value ? i : 0UL ) );
6301  const size_t kend( ( IsUpper<MT5>::value )
6302  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
6303  :( K ) );
6304 
6305  SIMDType xmm1( (~C).load(i,j ) );
6306  SIMDType xmm2( (~C).load(i,j+1UL) );
6307  SIMDType xmm3( (~C).load(i,j+2UL) );
6308  SIMDType xmm4, xmm5, xmm6;
6309  size_t k( kbegin );
6310 
6311  for( ; (k+2UL) <= kend; k+=2UL ) {
6312  const SIMDType a1( A.load(i,k ) );
6313  const SIMDType a2( A.load(i,k+1UL) );
6314  xmm1 -= a1 * set( B(k ,j ) );
6315  xmm2 -= a1 * set( B(k ,j+1UL) );
6316  xmm3 -= a1 * set( B(k ,j+2UL) );
6317  xmm4 -= a2 * set( B(k+1UL,j ) );
6318  xmm5 -= a2 * set( B(k+1UL,j+1UL) );
6319  xmm6 -= a2 * set( B(k+1UL,j+2UL) );
6320  }
6321 
6322  for( ; k<kend; ++k ) {
6323  const SIMDType a1( A.load(i,k) );
6324  xmm1 -= a1 * set( B(k,j ) );
6325  xmm2 -= a1 * set( B(k,j+1UL) );
6326  xmm3 -= a1 * set( B(k,j+2UL) );
6327  }
6328 
6329  (~C).store( i, j , xmm1+xmm4 );
6330  (~C).store( i, j+1UL, xmm2+xmm5 );
6331  (~C).store( i, j+2UL, xmm3+xmm6 );
6332  }
6333 
6334  for( ; (j+2UL) <= jend; j+=2UL )
6335  {
6336  const size_t kbegin( ( IsLower<MT5>::value )
6337  ?( ( IsUpper<MT4>::value )
6338  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6339  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6340  :( IsUpper<MT4>::value ? i : 0UL ) );
6341  const size_t kend( ( IsUpper<MT5>::value )
6342  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6343  :( K ) );
6344 
6345  SIMDType xmm1( (~C).load(i,j ) );
6346  SIMDType xmm2( (~C).load(i,j+1UL) );
6347  SIMDType xmm3, xmm4;
6348  size_t k( kbegin );
6349 
6350  for( ; (k+2UL) <= kend; k+=2UL ) {
6351  const SIMDType a1( A.load(i,k ) );
6352  const SIMDType a2( A.load(i,k+1UL) );
6353  xmm1 -= a1 * set( B(k ,j ) );
6354  xmm2 -= a1 * set( B(k ,j+1UL) );
6355  xmm3 -= a2 * set( B(k+1UL,j ) );
6356  xmm4 -= a2 * set( B(k+1UL,j+1UL) );
6357  }
6358 
6359  for( ; k<kend; ++k ) {
6360  const SIMDType a1( A.load(i,k) );
6361  xmm1 -= a1 * set( B(k,j ) );
6362  xmm2 -= a1 * set( B(k,j+1UL) );
6363  }
6364 
6365  (~C).store( i, j , xmm1+xmm3 );
6366  (~C).store( i, j+1UL, xmm2+xmm4 );
6367  }
6368 
6369  if( j < jend )
6370  {
6371  const size_t kbegin( ( IsLower<MT5>::value )
6372  ?( ( IsUpper<MT4>::value )
6373  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6374  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6375  :( IsUpper<MT4>::value ? i : 0UL ) );
6376 
6377  SIMDType xmm1( (~C).load(i,j) );
6378  SIMDType xmm2;
6379  size_t k( kbegin );
6380 
6381  for( ; (k+2UL) <= K; k+=2UL ) {
6382  xmm1 -= A.load(i,k ) * set( B(k ,j) );
6383  xmm2 -= A.load(i,k+1UL) * set( B(k+1UL,j) );
6384  }
6385 
6386  for( ; k<K; ++k ) {
6387  xmm1 -= A.load(i,k) * set( B(k,j) );
6388  }
6389 
6390  (~C).store( i, j, xmm1+xmm2 );
6391  }
6392  }
6393 
6394  for( ; remainder && i<M; ++i )
6395  {
6396  const size_t jend( LOW ? i+1UL : N );
6397  size_t j( UPP ? i : 0UL );
6398 
6399  for( ; (j+2UL) <= jend; j+=2UL )
6400  {
6401  const size_t kbegin( ( IsLower<MT5>::value )
6402  ?( ( IsUpper<MT4>::value )
6403  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6404  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6405  :( IsUpper<MT4>::value ? i : 0UL ) );
6406  const size_t kend( ( IsUpper<MT5>::value )
6407  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6408  :( K ) );
6409 
6410  ElementType value1( (~C)(i,j ) );
6411  ElementType value2( (~C)(i,j+1UL) );
6412 
6413  for( size_t k=kbegin; k<kend; ++k ) {
6414  value1 -= A(i,k) * B(k,j );
6415  value2 -= A(i,k) * B(k,j+1UL);
6416  }
6417 
6418  (~C)(i,j ) = value1;
6419  (~C)(i,j+1UL) = value2;
6420  }
6421 
6422  if( j < jend )
6423  {
6424  const size_t kbegin( ( IsLower<MT5>::value )
6425  ?( ( IsUpper<MT4>::value )
6426  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6427  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6428  :( IsUpper<MT4>::value ? i : 0UL ) );
6429 
6430  ElementType value( (~C)(i,j) );
6431 
6432  for( size_t k=kbegin; k<K; ++k ) {
6433  value -= A(i,k) * B(k,j);
6434  }
6435 
6436  (~C)(i,j) = value;
6437  }
6438  }
6439  }
6441  //**********************************************************************************************
6442 
6443  //**Default subtraction assignment to dense matrices (large matrices)***************************
6457  template< typename MT3 // Type of the left-hand side target matrix
6458  , typename MT4 // Type of the left-hand side matrix operand
6459  , typename MT5 > // Type of the right-hand side matrix operand
      // Fallback large-matrix kernel for the subtraction assignment: it performs no work of
      // its own and hands C, A and B straight to selectDefaultSubAssignKernel().
      // NOTE(review): the return-type line (listing line 6460) is not visible in this listing;
      // presumably it carries the condition that separates this overload from the vectorized
      // one defined right after it -- confirm against the full header.
6461  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6462  {
6463  selectDefaultSubAssignKernel( C, A, B );
6464  }
6466  //**********************************************************************************************
6467 
6468  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
6483  template< typename MT3 // Type of the left-hand side target matrix
6484  , typename MT4 // Type of the left-hand side matrix operand
6485  , typename MT5 > // Type of the right-hand side matrix operand
      // Vectorized large-matrix kernel for the subtraction assignment. Dispatches to one of the
      // three matrix/matrix multiplication routines depending on the LOW/UPP flags of the
      // expression. LOW is tested first, so lmmm() is also used when both flags are set.
      // NOTE(review): the return-type line (listing line 6486) is not visible in this listing.
      // NOTE(review): ElementType(-1) and ElementType(1) are presumably the scaling factors of
      // the product and of the target (C = -1*A*B + 1*C) -- confirm against
      // blaze/math/dense/MMM.h.
6487  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6488  {
6489  if( LOW )
6490  lmmm( C, A, B, ElementType(-1), ElementType(1) );
6491  else if( UPP )
6492  ummm( C, A, B, ElementType(-1), ElementType(1) );
6493  else
6494  mmm( C, A, B, ElementType(-1), ElementType(1) );
6495  }
6497  //**********************************************************************************************
6498 
6499  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
6513  template< typename MT3 // Type of the left-hand side target matrix
6514  , typename MT4 // Type of the left-hand side matrix operand
6515  , typename MT5 > // Type of the right-hand side matrix operand
      // Fallback overload of the BLAS kernel selection for the subtraction assignment: it does
      // not call any BLAS routine and simply forwards C, A and B to selectLargeSubAssignKernel().
      // NOTE(review): the return-type line (listing line 6516) is not visible in this listing;
      // presumably it restricts this overload to cases where the BLAS kernel is not usable --
      // confirm against the full header.
6517  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6518  {
6519  selectLargeSubAssignKernel( C, A, B );
6520  }
6522  //**********************************************************************************************
6523 
6524  //**BLAS-based subtraction assignment to dense matrices*****************************************
6525 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
6526 
6539  template< typename MT3 // Type of the left-hand side target matrix
6540  , typename MT4 // Type of the left-hand side matrix operand
6541  , typename MT5 > // Type of the right-hand side matrix operand
      // BLAS-backed kernel for the subtraction assignment. It is only compiled when both
      // BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled (see the
      // surrounding #if). Three cases are distinguished:
      //  - A is triangular: B is copied into a temporary, trmm() is applied with A from the
      //    left, and the temporary is subtracted from C;
      //  - B is triangular: A is copied into a temporary, trmm() is applied with B from the
      //    right, and the temporary is subtracted from C;
      //  - otherwise: gemm() is called directly on C.
      // NOTE(review): the return-type line (listing line 6542) is not visible in this listing.
6543  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B )
6544  {
6545  using ET = ElementType_<MT3>;
6546 
6547  if( IsTriangular<MT4>::value ) {
      // serial() is applied to B before it is copied into the temporary
6548  ResultType_<MT3> tmp( serial( B ) );
      // presumably overwrites tmp in place with 1*A*tmp -- confirm against blaze/math/blas/trmm.h
6549  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
6550  subAssign( C, tmp );
6551  }
6552  else if( IsTriangular<MT5>::value ) {
6553  ResultType_<MT3> tmp( serial( A ) );
      // presumably overwrites tmp in place with 1*tmp*B -- confirm against blaze/math/blas/trmm.h
6554  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
6555  subAssign( C, tmp );
6556  }
6557  else {
      // presumably C = -1*A*B + 1*C -- confirm against blaze/math/blas/gemm.h
6558  gemm( C, A, B, ET(-1), ET(1) );
6559  }
6560  }
6562 #endif
6563  //**********************************************************************************************
6564 
6565  //**Subtraction assignment to sparse matrices***************************************************
6566  // No special implementation for the subtraction assignment to sparse matrices.
6567  //**********************************************************************************************
6568 
6569  //**Schur product assignment to dense matrices**************************************************
6582  template< typename MT // Type of the target dense matrix
6583  , bool SO > // Storage order of the target dense matrix
      // Schur product assignment of the multiplication expression to a dense matrix. There is
      // no dedicated kernel: the expression is first evaluated into a temporary of type
      // ResultType and that temporary is then Schur-assigned to the target.
      // NOTE(review): listing lines 6586 and 6588-6590 are not visible in this listing.
6584  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6585  {
6587 
6591 
      // The target must already have the dimensions of the expression
6592  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6593  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6594 
      // rhs is wrapped in serial() before it is evaluated into the temporary
6595  const ResultType tmp( serial( rhs ) );
6596  schurAssign( ~lhs, tmp );
6597  }
6599  //**********************************************************************************************
6600 
6601  //**Schur product assignment to sparse matrices*************************************************
6602  // No special implementation for the Schur product assignment to sparse matrices.
6603  //**********************************************************************************************
6604 
6605  //**Multiplication assignment to dense matrices*************************************************
6606  // No special implementation for the multiplication assignment to dense matrices.
6607  //**********************************************************************************************
6608 
6609  //**Multiplication assignment to sparse matrices************************************************
6610  // No special implementation for the multiplication assignment to sparse matrices.
6611  //**********************************************************************************************
6612 
6613  //**SMP assignment to dense matrices************************************************************
6629  template< typename MT // Type of the target dense matrix
6630  , bool SO > // Storage order of the target dense matrix
      // SMP assignment of the multiplication expression to a dense matrix. Both operands are
      // evaluated into local objects of type LT and RT, and the product of those evaluated
      // operands is then passed to smpAssign() again.
      // NOTE(review): the return-type line (listing line 6631) and listing line 6634 are not
      // visible in this listing; LT and RT are declared outside the visible part of the class.
6632  smpAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6633  {
6635 
6636  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6637  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6638 
      // Empty target: nothing to assign
6639  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
6640  return;
6641  }
      // Inner dimension of the product is zero: the target is reset instead of computed
6642  else if( rhs.lhs_.columns() == 0UL ) {
6643  reset( ~lhs );
6644  return;
6645  }
6646 
6647  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6648  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6649 
      // The evaluated operands must match the original operands and the target in size
6650  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6651  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6652  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6653  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6654  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6655  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6656 
6657  smpAssign( ~lhs, A * B );
6658  }
6660  //**********************************************************************************************
6661 
6662  //**SMP assignment to sparse matrices***********************************************************
6678  template< typename MT // Type of the target sparse matrix
6679  , bool SO > // Storage order of the target sparse matrix
      // SMP assignment of the multiplication expression to a sparse matrix. The expression is
      // evaluated into a temporary of type TmpType, the temporary is passed through the
      // ForwardFunctor, and the result is SMP-assigned to the target.
      // NOTE(review): the return-type line (listing line 6680) and the definition of TmpType
      // (presumably among the elided listing lines 6685-6692) are not visible in this listing.
6681  smpAssign( SparseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6682  {
6684 
6686 
6693 
6694  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6695  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6696 
6697  const ForwardFunctor fwd;
6698 
6699  const TmpType tmp( rhs );
6700  smpAssign( ~lhs, fwd( tmp ) );
6701  }
6703  //**********************************************************************************************
6704 
6705  //**SMP addition assignment to dense matrices***************************************************
6721  template< typename MT // Type of the target dense matrix
6722  , bool SO > // Storage order of the target dense matrix
      // SMP addition assignment of the multiplication expression to a dense matrix. Both
      // operands are evaluated into local objects of type LT and RT, and the product of those
      // evaluated operands is passed to smpAddAssign().
      // NOTE(review): the declaration lines carrying the return type, the function name and
      // the parameter list (listing lines 6723-6724) are not visible in this listing. The body
      // refers to the parameters as lhs (target) and rhs (expression).
6725  {
6727 
6728  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6729  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6730 
      // Empty target or zero inner dimension: the target is left unchanged
6731  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6732  return;
6733  }
6734 
6735  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6736  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6737 
      // The evaluated operands must match the original operands and the target in size
6738  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6739  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6740  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6741  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6742  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6743  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6744 
6745  smpAddAssign( ~lhs, A * B );
6746  }
6748  //**********************************************************************************************
6749 
6750  //**SMP addition assignment to sparse matrices**************************************************
6751  // No special implementation for the SMP addition assignment to sparse matrices.
6752  //**********************************************************************************************
6753 
6754  //**SMP subtraction assignment to dense matrices************************************************
6770  template< typename MT // Type of the target dense matrix
6771  , bool SO > // Storage order of the target dense matrix
      // SMP subtraction assignment of the multiplication expression to a dense matrix. Both
      // operands are evaluated into local objects of type LT and RT, and the product of those
      // evaluated operands is passed to smpSubAssign().
      // NOTE(review): the declaration lines carrying the return type, the function name and
      // the parameter list (listing lines 6772-6773) are not visible in this listing. The body
      // refers to the parameters as lhs (target) and rhs (expression).
6774  {
6776 
6777  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6778  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6779 
      // Empty target or zero inner dimension: the target is left unchanged
6780  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6781  return;
6782  }
6783 
6784  LT A( rhs.lhs_ ); // Evaluation of the left-hand side dense matrix operand
6785  RT B( rhs.rhs_ ); // Evaluation of the right-hand side dense matrix operand
6786 
      // The evaluated operands must match the original operands and the target in size
6787  BLAZE_INTERNAL_ASSERT( A.rows() == rhs.lhs_.rows() , "Invalid number of rows" );
6788  BLAZE_INTERNAL_ASSERT( A.columns() == rhs.lhs_.columns(), "Invalid number of columns" );
6789  BLAZE_INTERNAL_ASSERT( B.rows() == rhs.rhs_.rows() , "Invalid number of rows" );
6790  BLAZE_INTERNAL_ASSERT( B.columns() == rhs.rhs_.columns(), "Invalid number of columns" );
6791  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
6792  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns() , "Invalid number of columns" );
6793 
6794  smpSubAssign( ~lhs, A * B );
6795  }
6797  //**********************************************************************************************
6798 
6799  //**SMP subtraction assignment to sparse matrices***********************************************
6800  // No special implementation for the SMP subtraction assignment to sparse matrices.
6801  //**********************************************************************************************
6802 
6803  //**SMP Schur product assignment to dense matrices**********************************************
6816  template< typename MT // Type of the target dense matrix
6817  , bool SO > // Storage order of the target dense matrix
      // SMP Schur product assignment of the multiplication expression to a dense matrix. The
      // expression is evaluated into a temporary of type ResultType, which is then passed to
      // smpSchurAssign() together with the target.
      // NOTE(review): listing lines 6820 and 6822-6824 are not visible in this listing.
6818  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const TDMatDMatMultExpr& rhs )
6819  {
6821 
6825 
6826  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
6827  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
6828 
      // Unlike the non-SMP schurAssign(), rhs is not wrapped in serial() here
6829  const ResultType tmp( rhs );
6830  smpSchurAssign( ~lhs, tmp );
6831  }
6833  //**********************************************************************************************
6834 
6835  //**SMP Schur product assignment to sparse matrices*********************************************
6836  // No special implementation for the SMP Schur product assignment to sparse matrices.
6837  //**********************************************************************************************
6838 
6839  //**SMP multiplication assignment to dense matrices*********************************************
6840  // No special implementation for the SMP multiplication assignment to dense matrices.
6841  //**********************************************************************************************
6842 
6843  //**SMP multiplication assignment to sparse matrices********************************************
6844  // No special implementation for the SMP multiplication assignment to sparse matrices.
6845  //**********************************************************************************************
6846 
6847  //**Compile time checks*************************************************************************
6855  //**********************************************************************************************
6856 };
6857 //*************************************************************************************************
6858 
6859 
6860 
6861 
6862 //=================================================================================================
6863 //
6864 // DMATSCALARMULTEXPR SPECIALIZATION
6865 //
6866 //=================================================================================================
6867 
6868 //*************************************************************************************************
6876 template< typename MT1 // Type of the left-hand side dense matrix
6877  , typename MT2 // Type of the right-hand side dense matrix
6878  , bool SF // Symmetry flag
6879  , bool HF // Hermitian flag
6880  , bool LF // Lower flag
6881  , bool UF // Upper flag
6882  , typename ST > // Type of the right-hand side scalar value
6883 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
6884  : public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
6885  , private Computation
6886 {
6887  private:
6888  //**Type definitions****************************************************************************
6891 
6892  using RES = ResultType_<MMM>;
6893  using RT1 = ResultType_<MT1>;
6894  using RT2 = ResultType_<MT2>;
6895  using ET1 = ElementType_<RT1>;
6896  using ET2 = ElementType_<RT2>;
6897  using CT1 = CompositeType_<MT1>;
6898  using CT2 = CompositeType_<MT2>;
6899  //**********************************************************************************************
6900 
6901  //**********************************************************************************************
6903  enum : bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };  // Set if MT1 is a computation or requires evaluation
6904  //**********************************************************************************************
6905 
6906  //**********************************************************************************************
6908  enum : bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };  // Set if MT2 is a computation or requires evaluation
6909  //**********************************************************************************************
6910 
6911  //**********************************************************************************************
6913  enum : bool {
      // Effective structure flags derived from the template flags SF (symmetry), HF (Hermitian),
      // LF (lower) and UF (upper):
      //  - SYM is set only if SF is set and none of HF, LF, UF is set;
      //  - HERM is set only if HF is set and neither LF nor UF is set;
      //  - LOW is set if LF is set, or if UF is combined with SF or HF;
      //  - UPP is set if UF is set, or if LF is combined with SF or HF.
      // A symmetric or Hermitian matrix that is upper is necessarily also lower (and vice
      // versa), which is why such combinations set both LOW and UPP.
6914  SYM = ( SF && !( HF || LF || UF ) ),
6915  HERM = ( HF && !( LF || UF ) ),
6916  LOW = ( LF || ( ( SF || HF ) && UF ) ),
6917  UPP = ( UF || ( ( SF || HF ) && LF ) )
6918  };
6919  //**********************************************************************************************
6920 
6921  //**********************************************************************************************
6923 
6926  template< typename T1, typename T2, typename T3 >
      // Helper whose value is set if at least one of the two operands has to be evaluated
      // (evaluateLeft or evaluateRight). The value does not depend on T1, T2 or T3 in the
      // visible definition.
6927  struct IsEvaluationRequired {
6928  enum : bool { value = ( evaluateLeft || evaluateRight ) };
6929  };
6930  //**********************************************************************************************
6931 
6932  //**********************************************************************************************
6934 
6936  template< typename T1, typename T2, typename T3, typename T4 >
6937  struct UseBlasKernel {
6938  enum : bool { value = BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION &&
6939  !SYM && !HERM && !LOW && !UPP &&
6944  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
6949  IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
6951  };
6952  //**********************************************************************************************
6953 
6954  //**********************************************************************************************
6956 
6958  template< typename T1, typename T2, typename T3, typename T4 >
6959  struct UseVectorizedDefaultKernel {
6960  enum : bool { value = useOptimizedKernels &&
6964  T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
6968  , T4 >::value &&
6971  };
6972  //**********************************************************************************************
6973 
6974  //**********************************************************************************************
6976 
6978  using ForwardFunctor = IfTrue_< HERM  // Functor type chosen from the structure flags, tested in this order:
6979  , DeclHerm                            // HERM          -> DeclHerm
6980  , IfTrue_< SYM
6981  , DeclSym                             // SYM           -> DeclSym
6982  , IfTrue_< LOW
6983  , IfTrue_< UPP
6984  , DeclDiag                            // LOW and UPP   -> DeclDiag
6985  , DeclLow >                           // LOW only      -> DeclLow
6986  , IfTrue_< UPP
6987  , DeclUpp                             // UPP only      -> DeclUpp
6988  , Noop > > > >;                       // no flag set   -> Noop
6989  //**********************************************************************************************
6990 
6991  public:
6992  //**Type definitions****************************************************************************
6994  using ResultType = MultTrait_<RES,ST>;
6999  using ReturnType = const ElementType;
7000  using CompositeType = const ResultType;
7001 
7004 
7006  using RightOperand = ST;
7007 
7010 
7013  //**********************************************************************************************
7014 
7015  //**Compilation flags***************************************************************************
7017  enum : bool { simdEnabled = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
7018  MT1::simdEnabled && MT2::simdEnabled &&
7022 
7024  enum : bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
7025  !evaluateRight && MT2::smpAssignable };
7026  //**********************************************************************************************
7027 
7028  //**SIMD properties*****************************************************************************
7030  enum : size_t { SIMDSIZE = SIMDTrait<ElementType>::size };
7031  //**********************************************************************************************
7032 
7033  //**Constructor*********************************************************************************
7039  explicit inline DMatScalarMultExpr( const MMM& matrix, ST scalar )
      // Stores the given matrix multiplication expression and the scalar in the members
      // matrix_ and scalar_; the constructor body itself is empty.
7040  : matrix_( matrix ) // Left-hand side dense matrix of the multiplication expression
7041  , scalar_( scalar ) // Right-hand side scalar of the multiplication expression
7042  {}
7043  //**********************************************************************************************
7044 
7045  //**Access operator*****************************************************************************
7052  inline ReturnType operator()( size_t i, size_t j ) const {
      // Element access: returns element (i,j) of the wrapped matrix expression multiplied by
      // the scalar. The indices are only checked via internal assertions; use at() for
      // access that throws on invalid indices.
7053  BLAZE_INTERNAL_ASSERT( i < matrix_.rows() , "Invalid row access index" );
7054  BLAZE_INTERNAL_ASSERT( j < matrix_.columns(), "Invalid column access index" );
7055  return matrix_(i,j) * scalar_;
7056  }
7057  //**********************************************************************************************
7058 
7059  //**At function*********************************************************************************
7067  inline ReturnType at( size_t i, size_t j ) const {
      // Checked element access: reports an out-of-range error via BLAZE_THROW_OUT_OF_RANGE if
      // i is not a valid row index or j is not a valid column index (the row index is checked
      // first), otherwise returns the same value as operator()(i,j).
7068  if( i >= matrix_.rows() ) {
7069  BLAZE_THROW_OUT_OF_RANGE( "Invalid row access index" );
7070  }
7071  if( j >= matrix_.columns() ) {
7072  BLAZE_THROW_OUT_OF_RANGE( "Invalid column access index" );
7073  }
7074  return (*this)(i,j);
7075  }
7076  //**********************************************************************************************
7077 
7078  //**Rows function*******************************************************************************
7083  inline size_t rows() const {
7084  return matrix_.rows();
7085  }
7086  //**********************************************************************************************
7087 
7088  //**Columns function****************************************************************************
7093  inline size_t columns() const {
7094  return matrix_.columns();
7095  }
7096  //**********************************************************************************************
7097 
7098  //**Left operand access*************************************************************************
7103  inline LeftOperand leftOperand() const {
7104  return matrix_;
7105  }
7106  //**********************************************************************************************
7107 
7108  //**Right operand access************************************************************************
7113  inline RightOperand rightOperand() const {
7114  return scalar_;
7115  }
7116  //**********************************************************************************************
7117 
7118  //**********************************************************************************************
7124  template< typename T >
7125  inline bool canAlias( const T* alias ) const {
7126  return matrix_.canAlias( alias );
7127  }
7128  //**********************************************************************************************
7129 
7130  //**********************************************************************************************
7136  template< typename T >
7137  inline bool isAliased( const T* alias ) const {
7138  return matrix_.isAliased( alias );
7139  }
7140  //**********************************************************************************************
7141 
7142  //**********************************************************************************************
7147  inline bool isAligned() const {
7148  return matrix_.isAligned();
7149  }
7150  //**********************************************************************************************
7151 
7152  //**********************************************************************************************
// Reports whether the expression should be assigned via the SMP (multi-threaded) path.
// Two conditions have to hold:
//   1. the BLAS-related group is satisfied: BLAS mode is off, or BLAS is not used for
//      matrix/matrix multiplication, or the result has fewer than TDMATDMATMULT_THRESHOLD
//      elements;
//   2. the result has at least SMP_TDMATDMATMULT_THRESHOLD elements.
// NOTE(review): the embedded listing numbering skips 7160 inside the first group, so one
// more alternative of that group may be missing here - confirm against the upstream header.
7157  inline bool canSMPAssign() const noexcept {
7158  return ( !BLAZE_BLAS_MODE ||
7159  !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
7161  ( rows() * columns() < TDMATDMATMULT_THRESHOLD ) ) &&
7162  ( rows() * columns() >= SMP_TDMATDMATMULT_THRESHOLD );
7163  }
7164  //**********************************************************************************************
7165 
7166  private:
7167  //**Member variables****************************************************************************
// Left-hand side operand: the dense matrix multiplication expression that is scaled.
7168  LeftOperand matrix_;
// Right-hand side operand: the scalar of the multiplication expression.
7169  RightOperand scalar_;
7170  //**********************************************************************************************
7171 
7172  //**Assignment to dense matrices****************************************************************
// Assignment of the scaled matrix product to a dense matrix (lhs = scalar * A * B).
// The two operands of the wrapped multiplication expression are extracted, trivial sizes
// are handled up front, both operands are evaluated via serial() and the work is handed to
// selectAssignKernel() together with the scalar.
//   lhs : target dense matrix; its dimensions are expected to match those of rhs
//   rhs : the scaled multiplication expression to be assigned
// NOTE(review): the embedded listing numbering skips 7188 at the top of the body - confirm
// against the upstream header what was dropped there.
7184  template< typename MT // Type of the target dense matrix
7185  , bool SO > // Storage order of the target dense matrix
7186  friend inline void assign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
7187  {
7189 
7190  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
7191  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
7192 
// Operands of the wrapped matrix multiplication expression
7193  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7194  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7195 
// An empty target needs no work; an empty inner dimension leaves only a reset of the target
7196  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
7197  return;
7198  }
7199  else if( left.columns() == 0UL ) {
7200  reset( ~lhs );
7201  return;
7202  }
7203 
// LT and RT are declared outside this excerpt
7204  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
7205  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
7206 
7207  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
7208  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
7209  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
7210  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
7211  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
7212  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
7213 
7214  DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
7215  }
7216  //**********************************************************************************************
7217 
7218  //**Assignment to dense matrices (kernel selection)*********************************************
// Kernel selection for the assignment C = scalar * A * B: dispatches either to
// selectSmallAssignKernel() or to selectBlasAssignKernel().
// The visible alternatives that select the small kernel are:
//   - not in debug mode, row-major target, and B has at most SIMDSIZE*10 columns
//   - not in debug mode, column-major target, and A has at most SIMDSIZE*10 rows
//   - the target has fewer than TDMATDMATMULT_THRESHOLD elements
// NOTE(review): the opening of the if condition (embedded listing line 7235) is missing
// from this listing, so there is at least one further alternative that is not visible
// here - restore it from the upstream header.
//   C      : target matrix
//   A      : left-hand side matrix operand
//   B      : right-hand side matrix operand
//   scalar : scaling factor
7229  template< typename MT3 // Type of the left-hand side target matrix
7230  , typename MT4 // Type of the left-hand side matrix operand
7231  , typename MT5 // Type of the right-hand side matrix operand
7232  , typename ST2 > // Type of the scalar value
7233  static inline void selectAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7234  {
7236  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
7237  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
7238  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7239  selectSmallAssignKernel( C, A, B, scalar );
7240  else
7241  selectBlasAssignKernel( C, A, B, scalar );
7242  }
7243  //**********************************************************************************************
7244 
7245  //**Default assignment to row-major dense matrices (general/general)****************************
// Default (scalar, non-vectorized) kernel for C = scalar * A * B with a row-major target.
// Each row i of C is built in three passes: an initialization pass with the first
// contributing k, an accumulation pass over the remaining k, and a final scaling pass.
// SYM, HERM, LOW and UPP are flags declared outside this excerpt; they narrow the column
// range that is computed per row.
//   C      : target matrix (row-major)
//   A      : left-hand side matrix operand
//   B      : right-hand side matrix operand
//   scalar : scaling factor
// NOTE(review): this listing has gaps in its embedded numbering (7263, 7291, 7296, 7325,
// 7330, 7348, 7351): the return-type line and the inner condition of several nested
// conditional expressions are missing and have to be restored from the upstream header.
7259  template< typename MT3 // Type of the left-hand side target matrix
7260  , typename MT4 // Type of the left-hand side matrix operand
7261  , typename MT5 // Type of the right-hand side matrix operand
7262  , typename ST2 > // Type of the scalar value
7264  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7265  {
7266  const size_t M( A.rows() );
7267  const size_t N( B.columns() );
7268  const size_t K( A.columns() );
7269 
7270  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7271 
7272  for( size_t i=0UL; i<M; ++i )
7273  {
// Range of k contributing to row i: an upper A starts at i (i+1 if strictly upper), a
// lower A ends at i+1 (i if strictly lower); otherwise the full range [0,K) is used.
7274  const size_t kbegin( ( IsUpper<MT4>::value )
7275  ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
7276  :( 0UL ) );
7277  const size_t kend( ( IsLower<MT4>::value )
7278  ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
7279  :( K ) );
7280  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
7281 
// A strictly triangular A with an empty k range yields a row of default values
7282  if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
7283  for( size_t j=0UL; j<N; ++j ) {
7284  reset( (~C)(i,j) );
7285  }
7286  continue;
7287  }
7288 
// Pass 1: initialize row i with the product for k == kbegin. Columns in [jbegin,jend) are
// assigned; columns outside of that range are reset where the structure flags require it.
7289  {
7290  const size_t jbegin( ( IsUpper<MT5>::value )
7292  ?( UPP ? max(i,kbegin+1UL) : kbegin+1UL )
7293  :( UPP ? max(i,kbegin) : kbegin ) )
7294  :( UPP ? i : 0UL ) );
7295  const size_t jend( ( IsLower<MT5>::value )
7297  ?( LOW ? min(i+1UL,kbegin) : kbegin )
7298  :( LOW ? min(i,kbegin)+1UL : kbegin+1UL ) )
7299  :( LOW ? i+1UL : N ) );
7300 
7301  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
7302  for( size_t j=0UL; j<jbegin; ++j ) {
7303  reset( (~C)(i,j) );
7304  }
7305  }
7306  else if( IsStrictlyUpper<MT5>::value ) {
7307  reset( (~C)(i,0UL) );
7308  }
7309  for( size_t j=jbegin; j<jend; ++j ) {
7310  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
7311  }
7312  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
7313  for( size_t j=jend; j<N; ++j ) {
7314  reset( (~C)(i,j) );
7315  }
7316  }
7317  else if( IsStrictlyLower<MT5>::value ) {
7318  reset( (~C)(i,N-1UL) );
7319  }
7320  }
7321 
// Pass 2: accumulate the products for the remaining k. For a lower B the element in
// column jend is assigned (not accumulated) in each step.
7322  for( size_t k=kbegin+1UL; k<kend; ++k )
7323  {
7324  const size_t jbegin( ( IsUpper<MT5>::value )
7326  ?( SYM || HERM || UPP ? max( i, k+1UL ) : k+1UL )
7327  :( SYM || HERM || UPP ? max( i, k ) : k ) )
7328  :( SYM || HERM || UPP ? i : 0UL ) );
7329  const size_t jend( ( IsLower<MT5>::value )
7331  ?( LOW ? min(i+1UL,k-1UL) : k-1UL )
7332  :( LOW ? min(i+1UL,k) : k ) )
7333  :( LOW ? i+1UL : N ) );
7334 
7335  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
7336  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7337 
7338  for( size_t j=jbegin; j<jend; ++j ) {
7339  (~C)(i,j) += A(i,k) * B(k,j);
7340  }
7341  if( IsLower<MT5>::value ) {
7342  (~C)(i,jend) = A(i,k) * B(k,jend);
7343  }
7344  }
7345 
// Pass 3: scale the computed part of row i by the scalar
7346  {
7347  const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
7349  :( SYM || HERM || UPP ? i : 0UL ) );
7350  const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
7352  :( LOW ? i+1UL : N ) );
7353 
7354  if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) continue;
7355  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7356 
7357  for( size_t j=jbegin; j<jend; ++j ) {
7358  (~C)(i,j) *= scalar;
7359  }
7360  }
7361  }
7362 
// For SYM/HERM results the elements below the diagonal are filled from their transposed
// counterparts (conjugated for HERM).
7363  if( SYM || HERM ) {
7364  for( size_t i=1UL; i<M; ++i ) {
7365  for( size_t j=0UL; j<i; ++j ) {
7366  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
7367  }
7368  }
7369  }
7370  }
7371  //**********************************************************************************************
7372 
7373  //**Default assignment to column-major dense matrices (general/general)*************************
// Default (scalar, non-vectorized) kernel for C = scalar * A * B with a column-major
// target; enabled only if neither A nor B is diagonal.
// Each column j of C is built in three passes: an initialization pass with the first
// contributing k, an accumulation pass over the remaining k, and a final scaling pass.
// SYM, HERM, LOW and UPP are flags declared outside this excerpt; they narrow the row
// range that is computed per column.
//   C      : target matrix (column-major)
//   A      : left-hand side matrix operand
//   B      : right-hand side matrix operand
//   scalar : scaling factor
// NOTE(review): this listing has gaps in its embedded numbering (7419, 7424, 7453, 7458,
// 7476, 7479): the inner condition of several nested conditional expressions is missing
// and has to be restored from the upstream header.
7387  template< typename MT3 // Type of the left-hand side target matrix
7388  , typename MT4 // Type of the left-hand side matrix operand
7389  , typename MT5 // Type of the right-hand side matrix operand
7390  , typename ST2 > // Type of the scalar value
7391  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
7392  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7393  {
7394  const size_t M( A.rows() );
7395  const size_t N( B.columns() );
7396  const size_t K( A.columns() );
7397 
7398  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7399 
7400  for( size_t j=0UL; j<N; ++j )
7401  {
// Range of k contributing to column j: a lower B starts at j (j+1 if strictly lower), an
// upper B ends at j+1 (j if strictly upper); otherwise the full range [0,K) is used.
7402  const size_t kbegin( ( IsLower<MT5>::value )
7403  ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
7404  :( 0UL ) );
7405  const size_t kend( ( IsUpper<MT5>::value )
7406  ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
7407  :( K ) );
7408  BLAZE_INTERNAL_ASSERT( kbegin <= kend, "Invalid loop indices detected" );
7409 
// A strictly triangular B with an empty k range yields a column of default values
7410  if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
7411  for( size_t i=0UL; i<M; ++i ) {
7412  reset( (~C)(i,j) );
7413  }
7414  continue;
7415  }
7416 
// Pass 1: initialize column j with the product for k == kbegin. Rows in [ibegin,iend) are
// assigned; rows outside of that range are reset where the structure flags require it.
7417  {
7418  const size_t ibegin( ( IsLower<MT4>::value )
7420  ?( LOW ? max(j,kbegin+1UL) : kbegin+1UL )
7421  :( LOW ? max(j,kbegin) : kbegin ) )
7422  :( LOW ? j : 0UL ) );
7423  const size_t iend( ( IsUpper<MT4>::value )
7425  ?( UPP ? min(j+1UL,kbegin) : kbegin )
7426  :( UPP ? min(j,kbegin)+1UL : kbegin+1UL ) )
7427  :( UPP ? j+1UL : M ) );
7428 
7429  if( ( IsLower<MT4>::value && IsLower<MT5>::value ) || LOW ) {
7430  for( size_t i=0UL; i<ibegin; ++i ) {
7431  reset( (~C)(i,j) );
7432  }
7433  }
7434  else if( IsStrictlyLower<MT4>::value ) {
7435  reset( (~C)(0UL,j) );
7436  }
7437  for( size_t i=ibegin; i<iend; ++i ) {
7438  (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
7439  }
7440  if( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) || UPP ) {
7441  for( size_t i=iend; i<M; ++i ) {
7442  reset( (~C)(i,j) );
7443  }
7444  }
7445  else if( IsStrictlyUpper<MT4>::value ) {
7446  reset( (~C)(M-1UL,j) );
7447  }
7448  }
7449 
// Pass 2: accumulate the products for the remaining k. For an upper A the element in
// row iend is assigned (not accumulated) in each step.
7450  for( size_t k=kbegin+1UL; k<kend; ++k )
7451  {
7452  const size_t ibegin( ( IsLower<MT4>::value )
7454  ?( SYM || HERM || LOW ? max( j, k+1UL ) : k+1UL )
7455  :( SYM || HERM || LOW ? max( j, k ) : k ) )
7456  :( SYM || HERM || LOW ? j : 0UL ) );
7457  const size_t iend( ( IsUpper<MT4>::value )
7459  ?( UPP ? min(j+1UL,k-1UL) : k-1UL )
7460  :( UPP ? min(j+1UL,k) : k ) )
7461  :( UPP ? j+1UL : M ) );
7462 
7463  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
7464  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7465 
7466  for( size_t i=ibegin; i<iend; ++i ) {
7467  (~C)(i,j) += A(i,k) * B(k,j);
7468  }
7469  if( IsUpper<MT4>::value ) {
7470  (~C)(iend,j) = A(iend,k) * B(k,j);
7471  }
7472  }
7473 
// Pass 3: scale the computed part of column j by the scalar
7474  {
7475  const size_t ibegin( ( ( IsLower<MT4>::value && IsLower<MT5>::value ) )
7477  :( SYM || HERM || LOW ? j : 0UL ) );
7478  const size_t iend( ( ( IsUpper<MT4>::value && IsUpper<MT5>::value ) )
7480  :( UPP ? j+1UL : M ) );
7481 
7482  if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) continue;
7483  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7484 
7485  for( size_t i=ibegin; i<iend; ++i ) {
7486  (~C)(i,j) *= scalar;
7487  }
7488  }
7489  }
7490 
// For SYM/HERM results the elements above the diagonal are filled from their transposed
// counterparts (conjugated for HERM).
7491  if( SYM || HERM ) {
7492  for( size_t j=1UL; j<N; ++j ) {
7493  for( size_t i=0UL; i<j; ++i ) {
7494  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
7495  }
7496  }
7497  }
7498  }
7499  //**********************************************************************************************
7500 
7501  //**Default assignment to row-major dense matrices (general/diagonal)***************************
7515  template< typename MT3 // Type of the left-hand side target matrix
7516  , typename MT4 // Type of the left-hand side matrix operand
7517  , typename MT5 // Type of the right-hand side matrix operand
7518  , typename ST2 > // Type of the scalar value
7519  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7520  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7521  {
7522  constexpr size_t block( BLOCK_SIZE );
7523 
7524  const size_t M( A.rows() );
7525  const size_t N( B.columns() );
7526 
7527  for( size_t ii=0UL; ii<M; ii+=block ) {
7528  const size_t iend( min( M, ii+block ) );
7529  for( size_t jj=0UL; jj<N; jj+=block ) {
7530  const size_t jend( min( N, jj+block ) );
7531  for( size_t i=ii; i<iend; ++i )
7532  {
7533  const size_t jbegin( ( IsUpper<MT4>::value )
7534  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
7535  :( jj ) );
7536  const size_t jpos( ( IsLower<MT4>::value )
7537  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
7538  :( jend ) );
7539 
7540  if( IsUpper<MT4>::value ) {
7541  for( size_t j=jj; j<jbegin; ++j ) {
7542  reset( (~C)(i,j) );
7543  }
7544  }
7545  for( size_t j=jbegin; j<jpos; ++j ) {
7546  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
7547  }
7548  if( IsLower<MT4>::value ) {
7549  for( size_t j=jpos; j<jend; ++j ) {
7550  reset( (~C)(i,j) );
7551  }
7552  }
7553  }
7554  }
7555  }
7556  }
7557  //**********************************************************************************************
7558 
7559  //**Default assignment to column-major dense matrices (general/diagonal)************************
7573  template< typename MT3 // Type of the left-hand side target matrix
7574  , typename MT4 // Type of the left-hand side matrix operand
7575  , typename MT5 // Type of the right-hand side matrix operand
7576  , typename ST2 > // Type of the scalar value
7577  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7578  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7579  {
7580  const size_t M( A.rows() );
7581  const size_t N( B.columns() );
7582 
7583  for( size_t j=0UL; j<N; ++j )
7584  {
7585  const size_t ibegin( ( IsLower<MT4>::value )
7586  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
7587  :( 0UL ) );
7588  const size_t iend( ( IsUpper<MT4>::value )
7589  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
7590  :( M ) );
7591  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
7592 
7593  if( IsLower<MT4>::value ) {
7594  for( size_t i=0UL; i<ibegin; ++i ) {
7595  reset( (~C)(i,j) );
7596  }
7597  }
7598  for( size_t i=ibegin; i<iend; ++i ) {
7599  (~C)(i,j) = A(i,j) * B(j,j) * scalar;
7600  }
7601  if( IsUpper<MT4>::value ) {
7602  for( size_t i=iend; i<M; ++i ) {
7603  reset( (~C)(i,j) );
7604  }
7605  }
7606  }
7607  }
7608  //**********************************************************************************************
7609 
7610  //**Default assignment to row-major dense matrices (diagonal/general)***************************
// Default kernel for C = scalar * A * B with a row-major target; the section header of this
// block labels it as the diagonal/general case (A diagonal, B general). Every element is
// computed as C(i,j) = A(i,i) * B(i,j) * scalar, row by row.
//   C      : target matrix (row-major)
//   A      : left-hand side matrix operand
//   B      : right-hand side matrix operand
//   scalar : scaling factor
// NOTE(review): the line carrying the return type / enabling condition (embedded listing
// line 7628) is missing from this listing - restore it from the upstream header.
7624  template< typename MT3 // Type of the left-hand side target matrix
7625  , typename MT4 // Type of the left-hand side matrix operand
7626  , typename MT5 // Type of the right-hand side matrix operand
7627  , typename ST2 > // Type of the scalar value
7629  selectDefaultAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7630  {
7631  const size_t M( A.rows() );
7632  const size_t N( B.columns() );
7633 
7634  for( size_t i=0UL; i<M; ++i )
7635  {
// Computed column range of row i: an upper B starts at i (i+1 if strictly upper), a lower
// B ends at i+1 (i if strictly lower); otherwise the full range [0,N) is used.
7636  const size_t jbegin( ( IsUpper<MT5>::value )
7637  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
7638  :( 0UL ) );
7639  const size_t jend( ( IsLower<MT5>::value )
7640  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
7641  :( N ) );
7642  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
7643 
// Columns in front of the computed range are reset for an upper B
7644  if( IsUpper<MT5>::value ) {
7645  for( size_t j=0UL; j<jbegin; ++j ) {
7646  reset( (~C)(i,j) );
7647  }
7648  }
7649  for( size_t j=jbegin; j<jend; ++j ) {
7650  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
7651  }
// Columns behind the computed range are reset for a lower B
7652  if( IsLower<MT5>::value ) {
7653  for( size_t j=jend; j<N; ++j ) {
7654  reset( (~C)(i,j) );
7655  }
7656  }
7657  }
7658  }
7659  //**********************************************************************************************
7660 
7661  //**Default assignment to column-major dense matrices (diagonal/general)************************
7675  template< typename MT3 // Type of the left-hand side target matrix
7676  , typename MT4 // Type of the left-hand side matrix operand
7677  , typename MT5 // Type of the right-hand side matrix operand
7678  , typename ST2 > // Type of the scalar value
7679  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
7680  selectDefaultAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
7681  {
7682  constexpr size_t block( BLOCK_SIZE );
7683 
7684  const size_t M( A.rows() );
7685  const size_t N( B.columns() );
7686 
7687  for( size_t jj=0UL; jj<N; jj+=block ) {
7688  const size_t jend( min( N, jj+block ) );
7689  for( size_t ii=0UL; ii<M; ii+=block ) {
7690  const size_t iend( min( M, ii+block ) );
7691  for( size_t j=jj; j<jend; ++j )
7692  {
7693  const size_t ibegin( ( IsLower<MT5>::value )
7694  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
7695  :( ii ) );
7696  const size_t ipos( ( IsUpper<MT5>::value )
7697  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
7698  :( iend ) );
7699 
7700  if( IsLower<MT5>::value ) {
7701  for( size_t i=ii; i<ibegin; ++i ) {
7702  reset( (~C)(i,j) );
7703  }
7704  }
7705  for( size_t i=ibegin; i<ipos; ++i ) {
7706  (~C)(i,j) = A(i,i) * B(i,j) * scalar;
7707  }
7708  if( IsUpper<MT5>::value ) {
7709  for( size_t i=ipos; i<iend; ++i ) {
7710  reset( (~C)(i,j) );
7711  }
7712  }
7713  }
7714  }
7715  }
7716  }
7717  //**********************************************************************************************
7718 
7719  //**Default assignment to dense matrices (diagonal/diagonal)************************************
7733  template< typename MT3 // Type of the left-hand side target matrix
7734  , typename MT4 // Type of the left-hand side matrix operand
7735  , typename MT5 // Type of the right-hand side matrix operand
7736  , typename ST2 > // Type of the scalar value
7737  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
7738  selectDefaultAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7739  {
7740  reset( C );
7741 
7742  for( size_t i=0UL; i<A.rows(); ++i ) {
7743  C(i,i) = A(i,i) * B(i,i) * scalar;
7744  }
7745  }
7746  //**********************************************************************************************
7747 
7748  //**Default assignment to dense matrices (small matrices)***************************************
// Fallback of the small-matrix kernel: forwards unchanged to selectDefaultAssignKernel().
//   C      : target matrix
//   A      : left-hand side matrix operand
//   B      : right-hand side matrix operand
//   scalar : scaling factor
// NOTE(review): the line carrying the return type / enabling condition (embedded listing
// line 7766) is missing from this listing - restore it from the upstream header.
7762  template< typename MT3 // Type of the left-hand side target matrix
7763  , typename MT4 // Type of the left-hand side matrix operand
7764  , typename MT5 // Type of the right-hand side matrix operand
7765  , typename ST2 > // Type of the scalar value
7767  selectSmallAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
7768  {
7769  selectDefaultAssignKernel( C, A, B, scalar );
7770  }
7771  //**********************************************************************************************
7772 
7773  //**Vectorized default assignment to row-major dense matrices (small matrices)******************
7788  template< typename MT3 // Type of the left-hand side target matrix
7789  , typename MT4 // Type of the left-hand side matrix operand
7790  , typename MT5 // Type of the right-hand side matrix operand
7791  , typename ST2 > // Type of the scalar value
7793  selectSmallAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
7794  {
7795  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
7796 
7797  const size_t M( A.rows() );
7798  const size_t N( B.columns() );
7799  const size_t K( A.columns() );
7800 
7801  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
7802 
7803  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
7804  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
7805 
7806  const SIMDType factor( set( scalar ) );
7807 
7808  if( LOW && UPP && N > SIMDSIZE*3UL ) {
7809  reset( ~C );
7810  }
7811 
7812  {
7813  size_t j( 0UL );
7814 
7816  {
7817  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
7818  for( size_t i=0UL; i<M; ++i )
7819  {
7820  const size_t kbegin( ( IsUpper<MT4>::value )
7821  ?( ( IsLower<MT5>::value )
7822  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7823  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7824  :( IsLower<MT5>::value ? j : 0UL ) );
7825  const size_t kend( ( IsLower<MT4>::value )
7826  ?( ( IsUpper<MT5>::value )
7827  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
7828  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
7829  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
7830 
7831  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7832 
7833  for( size_t k=kbegin; k<kend; ++k ) {
7834  const SIMDType a1( set( A(i,k) ) );
7835  xmm1 += a1 * B.load(k,j );
7836  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7837  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7838  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7839  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7840  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
7841  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
7842  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
7843  }
7844 
7845  (~C).store( i, j , xmm1 * factor );
7846  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
7847  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
7848  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
7849  (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
7850  (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
7851  (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
7852  (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
7853  }
7854  }
7855  }
7856 
7857  for( ; !SYM && !HERM && !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
7858  {
7859  size_t i( 0UL );
7860 
7861  for( ; (i+2UL) <= M; i+=2UL )
7862  {
7863  const size_t kbegin( ( IsUpper<MT4>::value )
7864  ?( ( IsLower<MT5>::value )
7865  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7866  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7867  :( IsLower<MT5>::value ? j : 0UL ) );
7868  const size_t kend( ( IsLower<MT4>::value )
7869  ?( ( IsUpper<MT5>::value )
7870  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
7871  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
7872  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
7873 
7874  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7875 
7876  for( size_t k=kbegin; k<kend; ++k ) {
7877  const SIMDType a1( set( A(i ,k) ) );
7878  const SIMDType a2( set( A(i+1UL,k) ) );
7879  const SIMDType b1( B.load(k,j ) );
7880  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7881  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7882  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7883  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
7884  xmm1 += a1 * b1;
7885  xmm2 += a1 * b2;
7886  xmm3 += a1 * b3;
7887  xmm4 += a1 * b4;
7888  xmm5 += a1 * b5;
7889  xmm6 += a2 * b1;
7890  xmm7 += a2 * b2;
7891  xmm8 += a2 * b3;
7892  xmm9 += a2 * b4;
7893  xmm10 += a2 * b5;
7894  }
7895 
7896  (~C).store( i , j , xmm1 * factor );
7897  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
7898  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
7899  (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
7900  (~C).store( i , j+SIMDSIZE*4UL, xmm5 * factor );
7901  (~C).store( i+1UL, j , xmm6 * factor );
7902  (~C).store( i+1UL, j+SIMDSIZE , xmm7 * factor );
7903  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm8 * factor );
7904  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm9 * factor );
7905  (~C).store( i+1UL, j+SIMDSIZE*4UL, xmm10 * factor );
7906  }
7907 
7908  if( i < M )
7909  {
7910  const size_t kbegin( ( IsUpper<MT4>::value )
7911  ?( ( IsLower<MT5>::value )
7912  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7913  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7914  :( IsLower<MT5>::value ? j : 0UL ) );
7915  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
7916 
7917  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7918 
7919  for( size_t k=kbegin; k<kend; ++k ) {
7920  const SIMDType a1( set( A(i,k) ) );
7921  xmm1 += a1 * B.load(k,j );
7922  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7923  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7924  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
7925  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
7926  }
7927 
7928  (~C).store( i, j , xmm1 * factor );
7929  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
7930  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
7931  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
7932  (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
7933  }
7934  }
7935 
7936  for( ; !( LOW && UPP ) && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7937  {
7938  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*4UL,M) : M );
7939  size_t i( LOW ? j : 0UL );
7940 
7941  for( ; (i+2UL) <= iend; i+=2UL )
7942  {
7943  const size_t kbegin( ( IsUpper<MT4>::value )
7944  ?( ( IsLower<MT5>::value )
7945  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7946  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7947  :( IsLower<MT5>::value ? j : 0UL ) );
7948  const size_t kend( ( IsLower<MT4>::value )
7949  ?( ( IsUpper<MT5>::value )
7950  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
7951  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
7952  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
7953 
7954  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7955 
7956  for( size_t k=kbegin; k<kend; ++k ) {
7957  const SIMDType a1( set( A(i ,k) ) );
7958  const SIMDType a2( set( A(i+1UL,k) ) );
7959  const SIMDType b1( B.load(k,j ) );
7960  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
7961  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
7962  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
7963  xmm1 += a1 * b1;
7964  xmm2 += a1 * b2;
7965  xmm3 += a1 * b3;
7966  xmm4 += a1 * b4;
7967  xmm5 += a2 * b1;
7968  xmm6 += a2 * b2;
7969  xmm7 += a2 * b3;
7970  xmm8 += a2 * b4;
7971  }
7972 
7973  (~C).store( i , j , xmm1 * factor );
7974  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
7975  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
7976  (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
7977  (~C).store( i+1UL, j , xmm5 * factor );
7978  (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
7979  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
7980  (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
7981  }
7982 
7983  if( i < iend )
7984  {
7985  const size_t kbegin( ( IsUpper<MT4>::value )
7986  ?( ( IsLower<MT5>::value )
7987  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
7988  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
7989  :( IsLower<MT5>::value ? j : 0UL ) );
7990  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
7991 
7992  SIMDType xmm1, xmm2, xmm3, xmm4;
7993 
7994  for( size_t k=kbegin; k<kend; ++k ) {
7995  const SIMDType a1( set( A(i,k) ) );
7996  xmm1 += a1 * B.load(k,j );
7997  xmm2 += a1 * B.load(k,j+SIMDSIZE );
7998  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
7999  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
8000  }
8001 
8002  (~C).store( i, j , xmm1 * factor );
8003  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
8004  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8005  (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
8006  }
8007  }
8008 
8009  for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
8010  {
8011  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*3UL,M) : M );
8012  size_t i( LOW ? j : 0UL );
8013 
8014  for( ; (i+2UL) <= iend; i+=2UL )
8015  {
8016  const size_t kbegin( ( IsUpper<MT4>::value )
8017  ?( ( IsLower<MT5>::value )
8018  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8019  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8020  :( IsLower<MT5>::value ? j : 0UL ) );
8021  const size_t kend( ( IsLower<MT4>::value )
8022  ?( ( IsUpper<MT5>::value )
8023  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
8024  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8025  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
8026 
8027  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8028 
8029  for( size_t k=kbegin; k<kend; ++k ) {
8030  const SIMDType a1( set( A(i ,k) ) );
8031  const SIMDType a2( set( A(i+1UL,k) ) );
8032  const SIMDType b1( B.load(k,j ) );
8033  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8034  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8035  xmm1 += a1 * b1;
8036  xmm2 += a1 * b2;
8037  xmm3 += a1 * b3;
8038  xmm4 += a2 * b1;
8039  xmm5 += a2 * b2;
8040  xmm6 += a2 * b3;
8041  }
8042 
8043  (~C).store( i , j , xmm1 * factor );
8044  (~C).store( i , j+SIMDSIZE , xmm2 * factor );
8045  (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
8046  (~C).store( i+1UL, j , xmm4 * factor );
8047  (~C).store( i+1UL, j+SIMDSIZE , xmm5 * factor );
8048  (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm6 * factor );
8049  }
8050 
8051  if( i < iend )
8052  {
8053  const size_t kbegin( ( IsUpper<MT4>::value )
8054  ?( ( IsLower<MT5>::value )
8055  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8056  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8057  :( IsLower<MT5>::value ? j : 0UL ) );
8058  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
8059 
8060  SIMDType xmm1, xmm2, xmm3;
8061 
8062  for( size_t k=kbegin; k<kend; ++k ) {
8063  const SIMDType a1( set( A(i,k) ) );
8064  xmm1 += a1 * B.load(k,j );
8065  xmm2 += a1 * B.load(k,j+SIMDSIZE );
8066  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
8067  }
8068 
8069  (~C).store( i, j , xmm1 * factor );
8070  (~C).store( i, j+SIMDSIZE , xmm2 * factor );
8071  (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
8072  }
8073  }
8074 
8075  for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8076  {
8077  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE*2UL,M) : M );
8078  size_t i( LOW ? j : 0UL );
8079 
8080  for( ; (i+4UL) <= iend; i+=4UL )
8081  {
8082  const size_t kbegin( ( IsUpper<MT4>::value )
8083  ?( ( IsLower<MT5>::value )
8084  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8085  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8086  :( IsLower<MT5>::value ? j : 0UL ) );
8087  const size_t kend( ( IsLower<MT4>::value )
8088  ?( ( IsUpper<MT5>::value )
8089  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
8090  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
8091  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
8092 
8093  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8094 
8095  for( size_t k=kbegin; k<kend; ++k ) {
8096  const SIMDType a1( set( A(i ,k) ) );
8097  const SIMDType a2( set( A(i+1UL,k) ) );
8098  const SIMDType a3( set( A(i+2UL,k) ) );
8099  const SIMDType a4( set( A(i+3UL,k) ) );
8100  const SIMDType b1( B.load(k,j ) );
8101  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8102  xmm1 += a1 * b1;
8103  xmm2 += a1 * b2;
8104  xmm3 += a2 * b1;
8105  xmm4 += a2 * b2;
8106  xmm5 += a3 * b1;
8107  xmm6 += a3 * b2;
8108  xmm7 += a4 * b1;
8109  xmm8 += a4 * b2;
8110  }
8111 
8112  (~C).store( i , j , xmm1 * factor );
8113  (~C).store( i , j+SIMDSIZE, xmm2 * factor );
8114  (~C).store( i+1UL, j , xmm3 * factor );
8115  (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8116  (~C).store( i+2UL, j , xmm5 * factor );
8117  (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8118  (~C).store( i+3UL, j , xmm7 * factor );
8119  (~C).store( i+3UL, j+SIMDSIZE, xmm8 * factor );
8120  }
8121 
8122  for( ; (i+3UL) <= iend; i+=3UL )
8123  {
8124  const size_t kbegin( ( IsUpper<MT4>::value )
8125  ?( ( IsLower<MT5>::value )
8126  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8127  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8128  :( IsLower<MT5>::value ? j : 0UL ) );
8129  const size_t kend( ( IsLower<MT4>::value )
8130  ?( ( IsUpper<MT5>::value )
8131  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
8132  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
8133  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
8134 
8135  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8136 
8137  for( size_t k=kbegin; k<kend; ++k ) {
8138  const SIMDType a1( set( A(i ,k) ) );
8139  const SIMDType a2( set( A(i+1UL,k) ) );
8140  const SIMDType a3( set( A(i+2UL,k) ) );
8141  const SIMDType b1( B.load(k,j ) );
8142  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8143  xmm1 += a1 * b1;
8144  xmm2 += a1 * b2;
8145  xmm3 += a2 * b1;
8146  xmm4 += a2 * b2;
8147  xmm5 += a3 * b1;
8148  xmm6 += a3 * b2;
8149  }
8150 
8151  (~C).store( i , j , xmm1 * factor );
8152  (~C).store( i , j+SIMDSIZE, xmm2 * factor );
8153  (~C).store( i+1UL, j , xmm3 * factor );
8154  (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
8155  (~C).store( i+2UL, j , xmm5 * factor );
8156  (~C).store( i+2UL, j+SIMDSIZE, xmm6 * factor );
8157  }
8158 
8159  for( ; (i+2UL) <= iend; i+=2UL )
8160  {
8161  const size_t kbegin( ( IsUpper<MT4>::value )
8162  ?( ( IsLower<MT5>::value )
8163  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8164  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8165  :( IsLower<MT5>::value ? j : 0UL ) );
8166  const size_t kend( ( IsLower<MT4>::value )
8167  ?( ( IsUpper<MT5>::value )
8168  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
8169  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8170  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
8171 
8172  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8173  size_t k( kbegin );
8174 
8175  for( ; (k+2UL) <= kend; k+=2UL ) {
8176  const SIMDType a1( set( A(i ,k ) ) );
8177  const SIMDType a2( set( A(i+1UL,k ) ) );
8178  const SIMDType a3( set( A(i ,k+1UL) ) );
8179  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
8180  const SIMDType b1( B.load(k ,j ) );
8181  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
8182  const SIMDType b3( B.load(k+1UL,j ) );
8183  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
8184  xmm1 += a1 * b1;
8185  xmm2 += a1 * b2;
8186  xmm3 += a2 * b1;
8187  xmm4 += a2 * b2;
8188  xmm5 += a3 * b3;
8189  xmm6 += a3 * b4;
8190  xmm7 += a4 * b3;
8191  xmm8 += a4 * b4;
8192  }
8193 
8194  for( ; k<kend; ++k ) {
8195  const SIMDType a1( set( A(i ,k) ) );
8196  const SIMDType a2( set( A(i+1UL,k) ) );
8197  const SIMDType b1( B.load(k,j ) );
8198  const SIMDType b2( B.load(k,j+SIMDSIZE) );
8199  xmm1 += a1 * b1;
8200  xmm2 += a1 * b2;
8201  xmm3 += a2 * b1;
8202  xmm4 += a2 * b2;
8203  }
8204 
8205  (~C).store( i , j , (xmm1+xmm5) * factor );
8206  (~C).store( i , j+SIMDSIZE, (xmm2+xmm6) * factor );
8207  (~C).store( i+1UL, j , (xmm3+xmm7) * factor );
8208  (~C).store( i+1UL, j+SIMDSIZE, (xmm4+xmm8) * factor );
8209  }
8210 
8211  if( i < iend )
8212  {
8213  const size_t kbegin( ( IsUpper<MT4>::value )
8214  ?( ( IsLower<MT5>::value )
8215  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8216  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8217  :( IsLower<MT5>::value ? j : 0UL ) );
8218  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
8219 
8220  SIMDType xmm1, xmm2, xmm3, xmm4;
8221  size_t k( kbegin );
8222 
8223  for( ; (k+2UL) <= kend; k+=2UL ) {
8224  const SIMDType a1( set( A(i,k ) ) );
8225  const SIMDType a2( set( A(i,k+1UL) ) );
8226  xmm1 += a1 * B.load(k ,j );
8227  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
8228  xmm3 += a2 * B.load(k+1UL,j );
8229  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
8230  }
8231 
8232  for( ; k<kend; ++k ) {
8233  const SIMDType a1( set( A(i,k) ) );
8234  xmm1 += a1 * B.load(k,j );
8235  xmm2 += a1 * B.load(k,j+SIMDSIZE);
8236  }
8237 
8238  (~C).store( i, j , (xmm1+xmm3) * factor );
8239  (~C).store( i, j+SIMDSIZE, (xmm2+xmm4) * factor );
8240  }
8241  }
8242 
8243  for( ; j<jpos; j+=SIMDSIZE )
8244  {
8245  const size_t iend( SYM || HERM || UPP ? min(j+SIMDSIZE,M) : M );
8246  size_t i( LOW ? j : 0UL );
8247 
8248  for( ; (i+4UL) <= iend; i+=4UL )
8249  {
8250  const size_t kbegin( ( IsUpper<MT4>::value )
8251  ?( ( IsLower<MT5>::value )
8252  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8253  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8254  :( IsLower<MT5>::value ? j : 0UL ) );
8255  const size_t kend( ( IsLower<MT4>::value )
8256  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
8257  :( K ) );
8258 
8259  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8260  size_t k( kbegin );
8261 
8262  for( ; (k+2UL) <= kend; k+=2UL ) {
8263  const SIMDType b1( B.load(k ,j) );
8264  const SIMDType b2( B.load(k+1UL,j) );
8265  xmm1 += set( A(i ,k ) ) * b1;
8266  xmm2 += set( A(i+1UL,k ) ) * b1;
8267  xmm3 += set( A(i+2UL,k ) ) * b1;
8268  xmm4 += set( A(i+3UL,k ) ) * b1;
8269  xmm5 += set( A(i ,k+1UL) ) * b2;
8270  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
8271  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
8272  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
8273  }
8274 
8275  for( ; k<kend; ++k ) {
8276  const SIMDType b1( B.load(k,j) );
8277  xmm1 += set( A(i ,k) ) * b1;
8278  xmm2 += set( A(i+1UL,k) ) * b1;
8279  xmm3 += set( A(i+2UL,k) ) * b1;
8280  xmm4 += set( A(i+3UL,k) ) * b1;
8281  }
8282 
8283  (~C).store( i , j, (xmm1+xmm5) * factor );
8284  (~C).store( i+1UL, j, (xmm2+xmm6) * factor );
8285  (~C).store( i+2UL, j, (xmm3+xmm7) * factor );
8286  (~C).store( i+3UL, j, (xmm4+xmm8) * factor );
8287  }
8288 
8289  for( ; (i+3UL) <= iend; i+=3UL )
8290  {
8291  const size_t kbegin( ( IsUpper<MT4>::value )
8292  ?( ( IsLower<MT5>::value )
8293  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8294  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8295  :( IsLower<MT5>::value ? j : 0UL ) );
8296  const size_t kend( ( IsLower<MT4>::value )
8297  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
8298  :( K ) );
8299 
8300  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8301  size_t k( kbegin );
8302 
8303  for( ; (k+2UL) <= kend; k+=2UL ) {
8304  const SIMDType b1( B.load(k ,j) );
8305  const SIMDType b2( B.load(k+1UL,j) );
8306  xmm1 += set( A(i ,k ) ) * b1;
8307  xmm2 += set( A(i+1UL,k ) ) * b1;
8308  xmm3 += set( A(i+2UL,k ) ) * b1;
8309  xmm4 += set( A(i ,k+1UL) ) * b2;
8310  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
8311  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
8312  }
8313 
8314  for( ; k<kend; ++k ) {
8315  const SIMDType b1( B.load(k,j) );
8316  xmm1 += set( A(i ,k) ) * b1;
8317  xmm2 += set( A(i+1UL,k) ) * b1;
8318  xmm3 += set( A(i+2UL,k) ) * b1;
8319  }
8320 
8321  (~C).store( i , j, (xmm1+xmm4) * factor );
8322  (~C).store( i+1UL, j, (xmm2+xmm5) * factor );
8323  (~C).store( i+2UL, j, (xmm3+xmm6) * factor );
8324  }
8325 
8326  for( ; (i+2UL) <= iend; i+=2UL )
8327  {
8328  const size_t kbegin( ( IsUpper<MT4>::value )
8329  ?( ( IsLower<MT5>::value )
8330  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8331  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8332  :( IsLower<MT5>::value ? j : 0UL ) );
8333  const size_t kend( ( IsLower<MT4>::value )
8334  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8335  :( K ) );
8336 
8337  SIMDType xmm1, xmm2, xmm3, xmm4;
8338  size_t k( kbegin );
8339 
8340  for( ; (k+2UL) <= kend; k+=2UL ) {
8341  const SIMDType b1( B.load(k ,j) );
8342  const SIMDType b2( B.load(k+1UL,j) );
8343  xmm1 += set( A(i ,k ) ) * b1;
8344  xmm2 += set( A(i+1UL,k ) ) * b1;
8345  xmm3 += set( A(i ,k+1UL) ) * b2;
8346  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
8347  }
8348 
8349  for( ; k<kend; ++k ) {
8350  const SIMDType b1( B.load(k,j) );
8351  xmm1 += set( A(i ,k) ) * b1;
8352  xmm2 += set( A(i+1UL,k) ) * b1;
8353  }
8354 
8355  (~C).store( i , j, (xmm1+xmm3) * factor );
8356  (~C).store( i+1UL, j, (xmm2+xmm4) * factor );
8357  }
8358 
8359  if( i < iend )
8360  {
8361  const size_t kbegin( ( IsUpper<MT4>::value )
8362  ?( ( IsLower<MT5>::value )
8363  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8364  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8365  :( IsLower<MT5>::value ? j : 0UL ) );
8366 
8367  SIMDType xmm1, xmm2;
8368  size_t k( kbegin );
8369 
8370  for( ; (k+2UL) <= K; k+=2UL ) {
8371  xmm1 += set( A(i,k ) ) * B.load(k ,j);
8372  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
8373  }
8374 
8375  for( ; k<K; ++k ) {
8376  xmm1 += set( A(i,k) ) * B.load(k,j);
8377  }
8378 
8379  (~C).store( i, j, (xmm1+xmm2) * factor );
8380  }
8381  }
8382 
8383  for( ; remainder && j<N; ++j )
8384  {
8385  size_t i( LOW && UPP ? j : 0UL );
8386 
8387  for( ; (i+2UL) <= M; i+=2UL )
8388  {
8389  const size_t kbegin( ( IsUpper<MT4>::value )
8390  ?( ( IsLower<MT5>::value )
8391  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8392  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8393  :( IsLower<MT5>::value ? j : 0UL ) );
8394  const size_t kend( ( IsLower<MT4>::value )
8395  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8396  :( K ) );
8397 
8398  ElementType value1{};
8399  ElementType value2{};
8400 
8401  for( size_t k=kbegin; k<kend; ++k ) {
8402  value1 += A(i ,k) * B(k,j);
8403  value2 += A(i+1UL,k) * B(k,j);
8404  }
8405 
8406  (~C)(i ,j) = value1 * scalar;
8407  (~C)(i+1UL,j) = value2 * scalar;
8408  }
8409 
8410  if( i < M )
8411  {
8412  const size_t kbegin( ( IsUpper<MT4>::value )
8413  ?( ( IsLower<MT5>::value )
8414  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8415  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8416  :( IsLower<MT5>::value ? j : 0UL ) );
8417 
8418  ElementType value{};
8419 
8420  for( size_t k=kbegin; k<K; ++k ) {
8421  value += A(i,k) * B(k,j);
8422  }
8423 
8424  (~C)(i,j) = value * scalar;
8425  }
8426  }
8427  }
8428 
8429  if( ( SYM || HERM ) && ( N > SIMDSIZE*4UL ) ) {
8430  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
8431  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
8432  for( size_t j=0UL; j<jend; ++j ) {
8433  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
8434  }
8435  }
8436  }
8437  else if( LOW && !UPP && N > SIMDSIZE*4UL ) {
8438  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
8439  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
8440  for( size_t i=0UL; i<iend; ++i ) {
8441  reset( (~C)(i,j) );
8442  }
8443  }
8444  }
8445  else if( !LOW && UPP && N > SIMDSIZE*4UL ) {
8446  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
8447  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
8448  for( size_t j=0UL; j<jend; ++j ) {
8449  reset( (~C)(i,j) );
8450  }
8451  }
8452  }
8453  }
8454  //**********************************************************************************************
8455 
8456  //**Vectorized default assignment to column-major dense matrices (small matrices)***************
8471  template< typename MT3 // Type of the left-hand side target matrix
8472  , typename MT4 // Type of the left-hand side matrix operand
8473  , typename MT5 // Type of the right-hand side matrix operand
8474  , typename ST2 > // Type of the scalar value
8476  selectSmallAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
8477  {
8478  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
8479 
8480  const size_t M( A.rows() );
8481  const size_t N( B.columns() );
8482  const size_t K( A.columns() );
8483 
8484  BLAZE_INTERNAL_ASSERT( !( SYM || HERM || LOW || UPP ) || ( M == N ), "Broken invariant detected" );
8485 
8486  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
8487  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
8488 
8489  const SIMDType factor( set( scalar ) );
8490 
8491  if( LOW && UPP && M > SIMDSIZE*3UL ) {
8492  reset( ~C );
8493  }
8494 
8495  {
8496  size_t i( 0UL );
8497 
8499  {
8500  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8501  for( size_t j=0UL; j<N; ++j )
8502  {
8503  const size_t kbegin( ( IsLower<MT5>::value )
8504  ?( ( IsUpper<MT4>::value )
8505  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8506  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8507  :( IsUpper<MT4>::value ? i : 0UL ) );
8508  const size_t kend( ( IsUpper<MT5>::value )
8509  ?( ( IsLower<MT4>::value )
8510  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
8511  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
8512  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
8513 
8514  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8515 
8516  for( size_t k=kbegin; k<kend; ++k ) {
8517  const SIMDType b1( set( B(k,j) ) );
8518  xmm1 += A.load(i ,k) * b1;
8519  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8520  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8521  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8522  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8523  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
8524  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
8525  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
8526  }
8527 
8528  (~C).store( i , j, xmm1 * factor );
8529  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8530  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8531  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8532  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
8533  (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
8534  (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
8535  (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
8536  }
8537  }
8538  }
8539 
8540  for( ; !SYM && !HERM && !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
8541  {
8542  size_t j( 0UL );
8543 
8544  for( ; (j+2UL) <= N; j+=2UL )
8545  {
8546  const size_t kbegin( ( IsLower<MT5>::value )
8547  ?( ( IsUpper<MT4>::value )
8548  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8549  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8550  :( IsUpper<MT4>::value ? i : 0UL ) );
8551  const size_t kend( ( IsUpper<MT5>::value )
8552  ?( ( IsLower<MT4>::value )
8553  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8554  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8555  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
8556 
8557  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8558 
8559  for( size_t k=kbegin; k<kend; ++k ) {
8560  const SIMDType a1( A.load(i ,k) );
8561  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8562  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8563  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8564  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
8565  const SIMDType b1( set( B(k,j ) ) );
8566  const SIMDType b2( set( B(k,j+1UL) ) );
8567  xmm1 += a1 * b1;
8568  xmm2 += a2 * b1;
8569  xmm3 += a3 * b1;
8570  xmm4 += a4 * b1;
8571  xmm5 += a5 * b1;
8572  xmm6 += a1 * b2;
8573  xmm7 += a2 * b2;
8574  xmm8 += a3 * b2;
8575  xmm9 += a4 * b2;
8576  xmm10 += a5 * b2;
8577  }
8578 
8579  (~C).store( i , j , xmm1 * factor );
8580  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8581  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8582  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8583  (~C).store( i+SIMDSIZE*4UL, j , xmm5 * factor );
8584  (~C).store( i , j+1UL, xmm6 * factor );
8585  (~C).store( i+SIMDSIZE , j+1UL, xmm7 * factor );
8586  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm8 * factor );
8587  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm9 * factor );
8588  (~C).store( i+SIMDSIZE*4UL, j+1UL, xmm10 * factor );
8589  }
8590 
8591  if( j < N )
8592  {
8593  const size_t kbegin( ( IsLower<MT5>::value )
8594  ?( ( IsUpper<MT4>::value )
8595  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8596  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8597  :( IsUpper<MT4>::value ? i : 0UL ) );
8598  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
8599 
8600  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8601 
8602  for( size_t k=kbegin; k<kend; ++k ) {
8603  const SIMDType b1( set( B(k,j) ) );
8604  xmm1 += A.load(i ,k) * b1;
8605  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8606  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8607  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8608  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
8609  }
8610 
8611  (~C).store( i , j, xmm1 * factor );
8612  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8613  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8614  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8615  (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
8616  }
8617  }
8618 
8619  for( ; !( LOW && UPP ) && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
8620  {
8621  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*4UL,N) : N );
8622  size_t j( UPP ? i : 0UL );
8623 
8624  for( ; (j+2UL) <= jend; j+=2UL )
8625  {
8626  const size_t kbegin( ( IsLower<MT5>::value )
8627  ?( ( IsUpper<MT4>::value )
8628  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8629  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8630  :( IsUpper<MT4>::value ? i : 0UL ) );
8631  const size_t kend( ( IsUpper<MT5>::value )
8632  ?( ( IsLower<MT4>::value )
8633  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8634  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8635  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
8636 
8637  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8638 
8639  for( size_t k=kbegin; k<kend; ++k ) {
8640  const SIMDType a1( A.load(i ,k) );
8641  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8642  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8643  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8644  const SIMDType b1( set( B(k,j ) ) );
8645  const SIMDType b2( set( B(k,j+1UL) ) );
8646  xmm1 += a1 * b1;
8647  xmm2 += a2 * b1;
8648  xmm3 += a3 * b1;
8649  xmm4 += a4 * b1;
8650  xmm5 += a1 * b2;
8651  xmm6 += a2 * b2;
8652  xmm7 += a3 * b2;
8653  xmm8 += a4 * b2;
8654  }
8655 
8656  (~C).store( i , j , xmm1 * factor );
8657  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8658  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8659  (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
8660  (~C).store( i , j+1UL, xmm5 * factor );
8661  (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
8662  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
8663  (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
8664  }
8665 
8666  if( j < jend )
8667  {
8668  const size_t kbegin( ( IsLower<MT5>::value )
8669  ?( ( IsUpper<MT4>::value )
8670  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8671  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8672  :( IsUpper<MT4>::value ? i : 0UL ) );
8673  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
8674 
8675  SIMDType xmm1, xmm2, xmm3, xmm4;
8676 
8677  for( size_t k=kbegin; k<kend; ++k ) {
8678  const SIMDType b1( set( B(k,j) ) );
8679  xmm1 += A.load(i ,k) * b1;
8680  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8681  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8682  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
8683  }
8684 
8685  (~C).store( i , j, xmm1 * factor );
8686  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8687  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8688  (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
8689  }
8690  }
8691 
8692  for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
8693  {
8694  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*3UL,N) : N );
8695  size_t j( UPP ? i : 0UL );
8696 
8697  for( ; (j+2UL) <= jend; j+=2UL )
8698  {
8699  const size_t kbegin( ( IsLower<MT5>::value )
8700  ?( ( IsUpper<MT4>::value )
8701  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8702  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8703  :( IsUpper<MT4>::value ? i : 0UL ) );
8704  const size_t kend( ( IsUpper<MT5>::value )
8705  ?( ( IsLower<MT4>::value )
8706  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8707  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8708  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
8709 
8710  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8711 
8712  for( size_t k=kbegin; k<kend; ++k ) {
8713  const SIMDType a1( A.load(i ,k) );
8714  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8715  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8716  const SIMDType b1( set( B(k,j ) ) );
8717  const SIMDType b2( set( B(k,j+1UL) ) );
8718  xmm1 += a1 * b1;
8719  xmm2 += a2 * b1;
8720  xmm3 += a3 * b1;
8721  xmm4 += a1 * b2;
8722  xmm5 += a2 * b2;
8723  xmm6 += a3 * b2;
8724  }
8725 
8726  (~C).store( i , j , xmm1 * factor );
8727  (~C).store( i+SIMDSIZE , j , xmm2 * factor );
8728  (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
8729  (~C).store( i , j+1UL, xmm4 * factor );
8730  (~C).store( i+SIMDSIZE , j+1UL, xmm5 * factor );
8731  (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm6 * factor );
8732  }
8733 
8734  if( j < jend )
8735  {
8736  const size_t kbegin( ( IsLower<MT5>::value )
8737  ?( ( IsUpper<MT4>::value )
8738  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8739  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8740  :( IsUpper<MT4>::value ? i : 0UL ) );
8741  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
8742 
8743  SIMDType xmm1, xmm2, xmm3;
8744 
8745  for( size_t k=kbegin; k<kend; ++k ) {
8746  const SIMDType b1( set( B(k,j) ) );
8747  xmm1 += A.load(i ,k) * b1;
8748  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
8749  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
8750  }
8751 
8752  (~C).store( i , j, xmm1 * factor );
8753  (~C).store( i+SIMDSIZE , j, xmm2 * factor );
8754  (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
8755  }
8756  }
8757 
8758  for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
8759  {
8760  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE*2UL,N) : N );
8761  size_t j( UPP ? i : 0UL );
8762 
8763  for( ; (j+4UL) <= jend; j+=4UL )
8764  {
8765  const size_t kbegin( ( IsLower<MT5>::value )
8766  ?( ( IsUpper<MT4>::value )
8767  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8768  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8769  :( IsUpper<MT4>::value ? i : 0UL ) );
8770  const size_t kend( ( IsUpper<MT5>::value )
8771  ?( ( IsLower<MT4>::value )
8772  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
8773  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
8774  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
8775 
8776  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8777 
8778  for( size_t k=kbegin; k<kend; ++k ) {
8779  const SIMDType a1( A.load(i ,k) );
8780  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8781  const SIMDType b1( set( B(k,j ) ) );
8782  const SIMDType b2( set( B(k,j+1UL) ) );
8783  const SIMDType b3( set( B(k,j+2UL) ) );
8784  const SIMDType b4( set( B(k,j+3UL) ) );
8785  xmm1 += a1 * b1;
8786  xmm2 += a2 * b1;
8787  xmm3 += a1 * b2;
8788  xmm4 += a2 * b2;
8789  xmm5 += a1 * b3;
8790  xmm6 += a2 * b3;
8791  xmm7 += a1 * b4;
8792  xmm8 += a2 * b4;
8793  }
8794 
8795  (~C).store( i , j , xmm1 * factor );
8796  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
8797  (~C).store( i , j+1UL, xmm3 * factor );
8798  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
8799  (~C).store( i , j+2UL, xmm5 * factor );
8800  (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
8801  (~C).store( i , j+3UL, xmm7 * factor );
8802  (~C).store( i+SIMDSIZE, j+3UL, xmm8 * factor );
8803  }
8804 
8805  for( ; (j+3UL) <= jend; j+=3UL )
8806  {
8807  const size_t kbegin( ( IsLower<MT5>::value )
8808  ?( ( IsUpper<MT4>::value )
8809  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8810  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8811  :( IsUpper<MT4>::value ? i : 0UL ) );
8812  const size_t kend( ( IsUpper<MT5>::value )
8813  ?( ( IsLower<MT4>::value )
8814  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
8815  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
8816  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
8817 
8818  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8819 
8820  for( size_t k=kbegin; k<kend; ++k ) {
8821  const SIMDType a1( A.load(i ,k) );
8822  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8823  const SIMDType b1( set( B(k,j ) ) );
8824  const SIMDType b2( set( B(k,j+1UL) ) );
8825  const SIMDType b3( set( B(k,j+2UL) ) );
8826  xmm1 += a1 * b1;
8827  xmm2 += a2 * b1;
8828  xmm3 += a1 * b2;
8829  xmm4 += a2 * b2;
8830  xmm5 += a1 * b3;
8831  xmm6 += a2 * b3;
8832  }
8833 
8834  (~C).store( i , j , xmm1 * factor );
8835  (~C).store( i+SIMDSIZE, j , xmm2 * factor );
8836  (~C).store( i , j+1UL, xmm3 * factor );
8837  (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
8838  (~C).store( i , j+2UL, xmm5 * factor );
8839  (~C).store( i+SIMDSIZE, j+2UL, xmm6 * factor );
8840  }
8841 
8842  for( ; (j+2UL) <= jend; j+=2UL )
8843  {
8844  const size_t kbegin( ( IsLower<MT5>::value )
8845  ?( ( IsUpper<MT4>::value )
8846  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8847  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8848  :( IsUpper<MT4>::value ? i : 0UL ) );
8849  const size_t kend( ( IsUpper<MT5>::value )
8850  ?( ( IsLower<MT4>::value )
8851  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8852  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8853  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
8854 
8855  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8856  size_t k( kbegin );
8857 
8858  for( ; (k+2UL) <= kend; k+=2UL ) {
8859  const SIMDType a1( A.load(i ,k ) );
8860  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
8861  const SIMDType a3( A.load(i ,k+1UL) );
8862  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
8863  const SIMDType b1( set( B(k ,j ) ) );
8864  const SIMDType b2( set( B(k ,j+1UL) ) );
8865  const SIMDType b3( set( B(k+1UL,j ) ) );
8866  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
8867  xmm1 += a1 * b1;
8868  xmm2 += a2 * b1;
8869  xmm3 += a1 * b2;
8870  xmm4 += a2 * b2;
8871  xmm5 += a3 * b3;
8872  xmm6 += a4 * b3;
8873  xmm7 += a3 * b4;
8874  xmm8 += a4 * b4;
8875  }
8876 
8877  for( ; k<kend; ++k ) {
8878  const SIMDType a1( A.load(i ,k) );
8879  const SIMDType a2( A.load(i+SIMDSIZE,k) );
8880  const SIMDType b1( set( B(k,j ) ) );
8881  const SIMDType b2( set( B(k,j+1UL) ) );
8882  xmm1 += a1 * b1;
8883  xmm2 += a2 * b1;
8884  xmm3 += a1 * b2;
8885  xmm4 += a2 * b2;
8886  }
8887 
8888  (~C).store( i , j , (xmm1+xmm5) * factor );
8889  (~C).store( i+SIMDSIZE, j , (xmm2+xmm6) * factor );
8890  (~C).store( i , j+1UL, (xmm3+xmm7) * factor );
8891  (~C).store( i+SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
8892  }
8893 
8894  if( j < jend )
8895  {
8896  const size_t kbegin( ( IsLower<MT5>::value )
8897  ?( ( IsUpper<MT4>::value )
8898  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8899  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8900  :( IsUpper<MT4>::value ? i : 0UL ) );
8901  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
8902 
8903  SIMDType xmm1, xmm2, xmm3, xmm4;
8904  size_t k( kbegin );
8905 
8906  for( ; (k+2UL) <= kend; k+=2UL ) {
8907  const SIMDType b1( set( B(k ,j) ) );
8908  const SIMDType b2( set( B(k+1UL,j) ) );
8909  xmm1 += A.load(i ,k ) * b1;
8910  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
8911  xmm3 += A.load(i ,k+1UL) * b2;
8912  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
8913  }
8914 
8915  for( ; k<kend; ++k ) {
8916  const SIMDType b1( set( B(k,j) ) );
8917  xmm1 += A.load(i ,k) * b1;
8918  xmm2 += A.load(i+SIMDSIZE,k) * b1;
8919  }
8920 
8921  (~C).store( i , j, (xmm1+xmm3) * factor );
8922  (~C).store( i+SIMDSIZE, j, (xmm2+xmm4) * factor );
8923  }
8924  }
8925 
8926  for( ; i<ipos; i+=SIMDSIZE )
8927  {
8928  const size_t jend( SYM || HERM || LOW ? min(i+SIMDSIZE,N) : N );
8929  size_t j( UPP ? i : 0UL );
8930 
8931  for( ; (j+4UL) <= jend; j+=4UL )
8932  {
8933  const size_t kbegin( ( IsLower<MT5>::value )
8934  ?( ( IsUpper<MT4>::value )
8935  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8936  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8937  :( IsUpper<MT4>::value ? i : 0UL ) );
8938  const size_t kend( ( IsUpper<MT5>::value )
8939  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
8940  :( K ) );
8941 
8942  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8943  size_t k( kbegin );
8944 
8945  for( ; (k+2UL) <= kend; k+=2UL ) {
8946  const SIMDType a1( A.load(i,k ) );
8947  const SIMDType a2( A.load(i,k+1UL) );
8948  xmm1 += a1 * set( B(k ,j ) );
8949  xmm2 += a1 * set( B(k ,j+1UL) );
8950  xmm3 += a1 * set( B(k ,j+2UL) );
8951  xmm4 += a1 * set( B(k ,j+3UL) );
8952  xmm5 += a2 * set( B(k+1UL,j ) );
8953  xmm6 += a2 * set( B(k+1UL,j+1UL) );
8954  xmm7 += a2 * set( B(k+1UL,j+2UL) );
8955  xmm8 += a2 * set( B(k+1UL,j+3UL) );
8956  }
8957 
8958  for( ; k<kend; ++k ) {
8959  const SIMDType a1( A.load(i,k) );
8960  xmm1 += a1 * set( B(k,j ) );
8961  xmm2 += a1 * set( B(k,j+1UL) );
8962  xmm3 += a1 * set( B(k,j+2UL) );
8963  xmm4 += a1 * set( B(k,j+3UL) );
8964  }
8965 
8966  (~C).store( i, j , (xmm1+xmm5) * factor );
8967  (~C).store( i, j+1UL, (xmm2+xmm6) * factor );
8968  (~C).store( i, j+2UL, (xmm3+xmm7) * factor );
8969  (~C).store( i, j+3UL, (xmm4+xmm8) * factor );
8970  }
8971 
8972  for( ; (j+3UL) <= jend; j+=3UL )
8973  {
8974  const size_t kbegin( ( IsLower<MT5>::value )
8975  ?( ( IsUpper<MT4>::value )
8976  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8977  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8978  :( IsUpper<MT4>::value ? i : 0UL ) );
8979  const size_t kend( ( IsUpper<MT5>::value )
8980  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
8981  :( K ) );
8982 
8983  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8984  size_t k( kbegin );
8985 
8986  for( ; (k+2UL) <= kend; k+=2UL ) {
8987  const SIMDType a1( A.load(i,k ) );
8988  const SIMDType a2( A.load(i,k+1UL) );
8989  xmm1 += a1 * set( B(k ,j ) );
8990  xmm2 += a1 * set( B(k ,j+1UL) );
8991  xmm3 += a1 * set( B(k ,j+2UL) );
8992  xmm4 += a2 * set( B(k+1UL,j ) );
8993  xmm5 += a2 * set( B(k+1UL,j+1UL) );
8994  xmm6 += a2 * set( B(k+1UL,j+2UL) );
8995  }
8996 
8997  for( ; k<kend; ++k ) {
8998  const SIMDType a1( A.load(i,k) );
8999  xmm1 += a1 * set( B(k,j ) );
9000  xmm2 += a1 * set( B(k,j+1UL) );
9001  xmm3 += a1 * set( B(k,j+2UL) );
9002  }
9003 
9004  (~C).store( i, j , (xmm1+xmm4) * factor );
9005  (~C).store( i, j+1UL, (xmm2+xmm5) * factor );
9006  (~C).store( i, j+2UL, (xmm3+xmm6) * factor );
9007  }
9008 
9009  for( ; (j+2UL) <= jend; j+=2UL )
9010  {
9011  const size_t kbegin( ( IsLower<MT5>::value )
9012  ?( ( IsUpper<MT4>::value )
9013  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9014  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9015  :( IsUpper<MT4>::value ? i : 0UL ) );
9016  const size_t kend( ( IsUpper<MT5>::value )
9017  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
9018  :( K ) );
9019 
9020  SIMDType xmm1, xmm2, xmm3, xmm4;
9021  size_t k( kbegin );
9022 
9023  for( ; k<kend; ++k ) {
9024  const SIMDType a1( A.load(i,k) );
9025  xmm1 += a1 * set( B(k,j ) );
9026  xmm2 += a1 * set( B(k,j+1UL) );
9027  }
9028 
9029  for( ; (k+2UL) <= kend; k+=2UL ) {
9030  const SIMDType a1( A.load(i,k ) );
9031  const SIMDType a2( A.load(i,k+1UL) );
9032  xmm1 += a1 * set( B(k ,j ) );
9033  xmm2 += a1 * set( B(k ,j+1UL) );
9034  xmm3 += a2 * set( B(k+1UL,j ) );
9035  xmm4 += a2 * set( B(k+1UL,j+1UL) );
9036  }
9037 
9038  (~C).store( i, j , (xmm1+xmm3) * factor );
9039  (~C).store( i, j+1UL, (xmm2+xmm4) * factor );
9040  }
9041 
9042  if( j < jend )
9043  {
9044  const size_t kbegin( ( IsLower<MT5>::value )
9045  ?( ( IsUpper<MT4>::value )
9046  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9047  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9048  :( IsUpper<MT4>::value ? i : 0UL ) );
9049 
9050  SIMDType xmm1, xmm2;
9051  size_t k( kbegin );
9052 
9053  for( ; (k+2UL) <= K; k+=2UL ) {
9054  xmm1 += A.load(i,k ) * set( B(k ,j) );
9055  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
9056  }
9057 
9058  for( ; k<K; ++k ) {
9059  xmm1 += A.load(i,k) * set( B(k,j) );
9060  }
9061 
9062  (~C).store( i, j, (xmm1+xmm2) * factor );
9063  }
9064  }
9065 
9066  for( ; remainder && i<M; ++i )
9067  {
9068  size_t j( LOW && UPP ? i : 0UL );
9069 
9070  for( ; (j+2UL) <= N; j+=2UL )
9071  {
9072  const size_t kbegin( ( IsLower<MT5>::value )
9073  ?( ( IsUpper<MT4>::value )
9074  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9075  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9076  :( IsUpper<MT4>::value ? i : 0UL ) );
9077  const size_t kend( ( IsUpper<MT5>::value )
9078  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
9079  :( K ) );
9080 
9081  ElementType value1{};
9082  ElementType value2{};
9083 
9084  for( size_t k=kbegin; k<kend; ++k ) {
9085  value1 += A(i,k) * B(k,j );
9086  value2 += A(i,k) * B(k,j+1UL);
9087  }
9088 
9089  (~C)(i,j ) = value1 * scalar;
9090  (~C)(i,j+1UL) = value2 * scalar;
9091  }
9092 
9093  if( j < N )
9094  {
9095  const size_t kbegin( ( IsLower<MT5>::value )
9096  ?( ( IsUpper<MT4>::value )
9097  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
9098  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
9099  :( IsUpper<MT4>::value ? i : 0UL ) );
9100 
9101  ElementType value{};
9102 
9103  for( size_t k=kbegin; k<K; ++k ) {
9104  value += A(i,k) * B(k,j);
9105  }
9106 
9107  (~C)(i,j) = value * scalar;
9108  }
9109  }
9110  }
9111 
9112  if( ( SYM || HERM ) && ( M > SIMDSIZE*4UL ) ) {
9113  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
9114  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
9115  for( size_t i=0UL; i<iend; ++i ) {
9116  (~C)(i,j) = HERM ? conj( (~C)(j,i) ) : (~C)(j,i);
9117  }
9118  }
9119  }
9120  else if( LOW && !UPP && M > SIMDSIZE*4UL ) {
9121  for( size_t j=SIMDSIZE*4UL; j<N; ++j ) {
9122  const size_t iend( ( SIMDSIZE*4UL ) * ( j / (SIMDSIZE*4UL) ) );
9123  for( size_t i=0UL; i<iend; ++i ) {
9124  reset( (~C)(i,j) );
9125  }
9126  }
9127  }
9128  else if( !LOW && UPP && M > SIMDSIZE*4UL ) {
9129  for( size_t i=SIMDSIZE*4UL; i<M; ++i ) {
9130  const size_t jend( ( SIMDSIZE*4UL ) * ( i / (SIMDSIZE*4UL) ) );
9131  for( size_t j=0UL; j<jend; ++j ) {
9132  reset( (~C)(i,j) );
9133  }
9134  }
9135  }
9136  }
9137  //**********************************************************************************************
9138 
9139  //**Default assignment to dense matrices (large matrices)***************************************
// Large-matrix assignment kernel, fallback overload.
// Does no computation of its own: C, A, B and scalar are handed on unchanged to
// selectDefaultAssignKernel().
// NOTE(review): the declaration line carrying the return type (listing line 9157) is missing
// from this extract; presumably it holds the compile-time guard that selects this overload
// when the vectorized kernel is not applicable -- confirm against the upstream header.
9153  template< typename MT3 // Type of the left-hand side target matrix
9154  , typename MT4 // Type of the left-hand side matrix operand
9155  , typename MT5 // Type of the right-hand side matrix operand
9156  , typename ST2 > // Type of the scalar value
9158  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9159  {
9160  selectDefaultAssignKernel( C, A, B, scalar );
9161  }
9162  //**********************************************************************************************
9163 
9164  //**Vectorized default assignment to dense matrices (large matrices)****************************
// Large-matrix assignment kernel, vectorized overload.
// Picks exactly one multiplication routine from the flags SYM, HERM, LOW and UPP. The flags are
// tested in that order, so SYM takes precedence over HERM, HERM over LOW, and LOW over UPP:
//   SYM -> smmm, HERM -> hmmm, LOW -> lmmm, UPP -> ummm, none of them -> mmm.
// NOTE(review): the declaration line carrying the return type (listing line 9183) is missing
// from this extract -- confirm against the upstream header.
// NOTE(review): SYM, HERM, LOW and UPP are not defined in the visible text; presumably they are
// compile-time constants of the enclosing expression class -- confirm.
9179  template< typename MT3 // Type of the left-hand side target matrix
9180  , typename MT4 // Type of the left-hand side matrix operand
9181  , typename MT5 // Type of the right-hand side matrix operand
9182  , typename ST2 > // Type of the scalar value
9184  selectLargeAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9185  {
9186  if( SYM )
9187  smmm( C, A, B, scalar );
9188  else if( HERM )
9189  hmmm( C, A, B, scalar );
// lmmm, ummm and mmm receive an additional ST2(0) argument that smmm and hmmm do not.
// Presumably it is the factor applied to the previous contents of C (zero meaning
// "overwrite") -- TODO confirm in blaze/math/dense/MMM.h.
9190  else if( LOW )
9191  lmmm( C, A, B, scalar, ST2(0) );
9192  else if( UPP )
9193  ummm( C, A, B, scalar, ST2(0) );
9194  else
9195  mmm( C, A, B, scalar, ST2(0) );
9196  }
9197  //**********************************************************************************************
9198 
9199  //**BLAS-based assignment to dense matrices (default)*******************************************
// BLAS assignment kernel, fallback overload.
// Performs no BLAS call: all four arguments are forwarded unchanged to
// selectLargeAssignKernel().
// NOTE(review): the declaration line carrying the return type (listing line 9217) is missing
// from this extract; presumably it restricts this overload to operand combinations that
// cannot be handed to BLAS -- confirm against the upstream header.
9213  template< typename MT3 // Type of the left-hand side target matrix
9214  , typename MT4 // Type of the left-hand side matrix operand
9215  , typename MT5 // Type of the right-hand side matrix operand
9216  , typename ST2 > // Type of the scalar value
9218  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9219  {
9220  selectLargeAssignKernel( C, A, B, scalar );
9221  }
9222  //**********************************************************************************************
9223 
9224  //**BLAS-based assignment to dense matrices*****************************************************
9225 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
9226 
// BLAS-backed assignment kernel. Only compiled when the enclosing
// "#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION" condition holds.
// Three mutually exclusive paths, checked in this order:
//   1. A is triangular : C is assigned B, then trmm() is called with A as the CblasLeft operand.
//   2. B is triangular : C is assigned A, then trmm() is called with B as the CblasRight operand.
//   3. otherwise       : gemm() is called with the factors ET(scalar) and ET(0).
// In every path the scalar is converted to ET, the element type of the target MT3.
// NOTE(review): the declaration line carrying the return type (listing line 9243) is missing
// from this extract -- confirm against the upstream header.
9239  template< typename MT3 // Type of the left-hand side target matrix
9240  , typename MT4 // Type of the left-hand side matrix operand
9241  , typename MT5 // Type of the right-hand side matrix operand
9242  , typename ST2 > // Type of the scalar value
9244  selectBlasAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9245  {
9246  using ET = ElementType_<MT3>;
9247 
// The uplo argument is CblasLower when IsLower holds for the triangular operand, and
// CblasUpper otherwise. C is pre-filled with the other operand before trmm() is called;
// presumably trmm() multiplies in place -- TODO confirm in blaze/math/blas/trmm.h.
9248  if( IsTriangular<MT4>::value ) {
9249  assign( C, B );
9250  trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9251  }
9252  else if( IsTriangular<MT5>::value ) {
9253  assign( C, A );
9254  trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9255  }
9256  else {
9257  gemm( C, A, B, ET(scalar), ET(0) );
9258  }
9259  }
9260 #endif
9261  //**********************************************************************************************
9262 
9263  //**Assignment to sparse matrices***************************************************************
// Assignment of the scaled product expression rhs to a sparse matrix lhs.
// The expression is first evaluated serially into a temporary of type TmpType; that temporary
// is passed through a ForwardFunctor instance and the result is assigned to the target.
// NOTE(review): listing lines 9279-9288 are missing from this extract. TmpType and
// ForwardFunctor are therefore not visible here; presumably the missing lines hold the TmpType
// alias and compile-time constraints -- confirm against the upstream header.
9275  template< typename MT // Type of the target sparse matrix
9276  , bool SO > // Storage order of the target sparse matrix
9277  friend inline void assign( SparseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9278  {
9280 
9282 
9289 
// Target and expression must already agree in both dimensions.
9290  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9291  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9292 
9293  const ForwardFunctor fwd;
9294 
// serial( rhs ) is evaluated into tmp before anything is written to lhs.
9295  const TmpType tmp( serial( rhs ) );
9296  assign( ~lhs, fwd( tmp ) );
9297  }
9298  //**********************************************************************************************
9299 
9300  //**Addition assignment to dense matrices*******************************************************
// Addition assignment of the scaled product expression rhs to a dense matrix lhs.
// Sequence:
//   1. take the left and right operand of the wrapped multiplication rhs.matrix_,
//   2. return immediately if the target has no rows or no columns, or if the left operand has
//      no columns,
//   3. evaluate both operands via serial() into A and B,
//   4. call selectAddAssignKernel() with the target, A, B and rhs.scalar_.
// NOTE(review): MMM, LT and RT are declared outside this extract; presumably MMM is the type of
// rhs.matrix_ and LT/RT are the evaluation types of its operands -- confirm.
// NOTE(review): listing line 9316 is missing from this extract.
9312  template< typename MT // Type of the target dense matrix
9313  , bool SO > // Storage order of the target dense matrix
9314  friend inline void addAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
9315  {
9317 
9318  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
9319  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
9320 
9321  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
9322  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
9323 
// Early exit: an empty target or a zero inner dimension leaves lhs untouched.
9324  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
9325  return;
9326  }
9327 
9328  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
9329  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
9330 
// The evaluated operands must keep the sizes of the unevaluated ones and match the target.
9331  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
9332  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
9333  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
9334  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
9335  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
9336  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
9337 
9338  DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
9339  }
9340  //**********************************************************************************************
9341 
9342  //**Addition assignment to dense matrices (kernel selection)************************************
// Chooses the kernel used for the addition assignment of scalar*A*B to C.
// Visible alternatives of the condition that lead to selectSmallAddAssignKernel():
//   - BLAZE_DEBUG_MODE is off, C is row-major and B.columns() <= SIMDSIZE*10,
//   - BLAZE_DEBUG_MODE is off, C is column-major and A.rows() <= SIMDSIZE*10,
//   - C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD.
// If the condition is false, selectBlasAddAssignKernel() is called instead.
// NOTE(review): the line that opens the if-condition (listing line 9359) is missing from this
// extract, so the first alternative of the condition is not visible here -- confirm against
// the upstream header before relying on the list above being complete.
9353  template< typename MT3 // Type of the left-hand side target matrix
9354  , typename MT4 // Type of the left-hand side matrix operand
9355  , typename MT5 // Type of the right-hand side matrix operand
9356  , typename ST2 > // Type of the scalar value
9357  static inline void selectAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9358  {
9360  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
9361  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
9362  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9363  selectSmallAddAssignKernel( C, A, B, scalar );
9364  else
9365  selectBlasAddAssignKernel( C, A, B, scalar );
9366  }
9367  //**********************************************************************************************
9368 
9369  //**Default addition assignment to dense matrices (general/general)*****************************
// Default addition-assignment kernel for the case that neither A nor B is diagonal
// (see the EnableIf_ condition).
// The complete scaled product A * B * scalar is evaluated serially into a temporary of type
// ResultType, and that temporary is then added to C with a single addAssign() call.
9383  template< typename MT3 // Type of the left-hand side target matrix
9384  , typename MT4 // Type of the left-hand side matrix operand
9385  , typename MT5 // Type of the right-hand side matrix operand
9386  , typename ST2 > // Type of the scalar value
9387  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
9388  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9389  {
9390  const ResultType tmp( serial( A * B * scalar ) );
9391  addAssign( C, tmp );
9392  }
9393  //**********************************************************************************************
9394 
9395  //**Default addition assignment to row-major dense matrices (general/diagonal)******************
// Default addition-assignment kernel for a row-major target when A is not diagonal and B is
// diagonal (see the EnableIf_ condition).
// Only the diagonal of B is read, so each update is C(i,j) += A(i,j) * B(j,j) * scalar.
// The index space is walked in tiles of BLOCK_SIZE x BLOCK_SIZE (rows ii..iend, columns
// jj..jend); inside a tile the column range of every row is clipped to the part of A that its
// triangular structure allows.
9409  template< typename MT3 // Type of the left-hand side target matrix
9410  , typename MT4 // Type of the left-hand side matrix operand
9411  , typename MT5 // Type of the right-hand side matrix operand
9412  , typename ST2 > // Type of the scalar value
9413  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
9414  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9415  {
9416  constexpr size_t block( BLOCK_SIZE );
9417 
9418  const size_t M( A.rows() );
9419  const size_t N( B.columns() );
9420 
9421  for( size_t ii=0UL; ii<M; ii+=block ) {
9422  const size_t iend( min( M, ii+block ) );
9423  for( size_t jj=0UL; jj<N; jj+=block ) {
9424  const size_t jend( min( N, jj+block ) );
9425  for( size_t i=ii; i<iend; ++i )
9426  {
// Upper A: columns left of the diagonal (and the diagonal itself if strictly upper) are
// skipped, but never start before the current tile.
9427  const size_t jbegin( ( IsUpper<MT4>::value )
9428  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
9429  :( jj ) );
// Lower A: stop at the diagonal (exclusive if strictly lower, inclusive otherwise), but
// never run past the end of the current tile.
9430  const size_t jpos( ( IsLower<MT4>::value )
9431  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
9432  :( jend ) );
9433 
9434  for( size_t j=jbegin; j<jpos; ++j ) {
9435  (~C)(i,j) += A(i,j) * B(j,j) * scalar;
9436  }
9437  }
9438  }
9439  }
9440  }
9441  //**********************************************************************************************
9442 
9443  //**Default addition assignment to column-major dense matrices (general/diagonal)***************
// Default addition-assignment kernel for a column-major target when A is not diagonal and B is
// diagonal (see the EnableIf_ condition).
// Only the diagonal of B is read, so each update is C(i,j) += A(i,j) * B(j,j) * scalar.
// Columns are processed one after another; within a column the row range [ibegin,iend) is
// derived from the triangular structure of A and is traversed two rows per iteration.
9457  template< typename MT3 // Type of the left-hand side target matrix
9458  , typename MT4 // Type of the left-hand side matrix operand
9459  , typename MT5 // Type of the right-hand side matrix operand
9460  , typename ST2 > // Type of the scalar value
9461  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
9462  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9463  {
9464  const size_t M( A.rows() );
9465  const size_t N( B.columns() );
9466 
9467  for( size_t j=0UL; j<N; ++j )
9468  {
// Lower A: start at the diagonal (one past it if strictly lower); otherwise at row 0.
9469  const size_t ibegin( ( IsLower<MT4>::value )
9470  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
9471  :( 0UL ) );
// Upper A: end at the diagonal (exclusive if strictly upper, inclusive otherwise);
// otherwise at row M.
9472  const size_t iend( ( IsUpper<MT4>::value )
9473  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
9474  :( M ) );
9475  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
9476 
// inum & size_t(-2) clears the lowest bit, i.e. rounds the row count down to an even
// number; ipos is therefore the end of the part that can be handled in pairs.
9477  const size_t inum( iend - ibegin );
9478  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
9479 
9480  for( size_t i=ibegin; i<ipos; i+=2UL ) {
9481  (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
9482  (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
9483  }
// At most one row is left over when the row count is odd.
9484  if( ipos < iend ) {
9485  (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
9486  }
9487  }
9488  }
9489  //**********************************************************************************************
9490 
9491  //**Default addition assignment to row-major dense matrices (diagonal/general)******************
// Default addition-assignment kernel for a row-major target when A is diagonal and B is not
// diagonal (see the EnableIf_ condition).
// Only the diagonal of A is read, so each update is C(i,j) += A(i,i) * B(i,j) * scalar.
// Rows are processed one after another; within a row the column range [jbegin,jend) is derived
// from the triangular structure of B and is traversed two columns per iteration.
9505  template< typename MT3 // Type of the left-hand side target matrix
9506  , typename MT4 // Type of the left-hand side matrix operand
9507  , typename MT5 // Type of the right-hand side matrix operand
9508  , typename ST2 > // Type of the scalar value
9509  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
9510  selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9511  {
9512  const size_t M( A.rows() );
9513  const size_t N( B.columns() );
9514 
9515  for( size_t i=0UL; i<M; ++i )
9516  {
// Upper B: start at the diagonal (one past it if strictly upper); otherwise at column 0.
9517  const size_t jbegin( ( IsUpper<MT5>::value )
9518  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
9519  :( 0UL ) );
// Lower B: end at the diagonal (exclusive if strictly lower, inclusive otherwise);
// otherwise at column N.
9520  const size_t jend( ( IsLower<MT5>::value )
9521  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
9522  :( N ) );
9523  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
9524 
// jnum & size_t(-2) clears the lowest bit, i.e. rounds the column count down to an even
// number; jpos is therefore the end of the part that can be handled in pairs.
9525  const size_t jnum( jend - jbegin );
9526  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
9527 
9528  for( size_t j=jbegin; j<jpos; j+=2UL ) {
9529  (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
9530  (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
9531  }
// At most one column is left over when the column count is odd.
9532  if( jpos < jend ) {
9533  (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
9534  }
9535  }
9536  }
9537  //**********************************************************************************************
9538 
9539  //**Default addition assignment to column-major dense matrices (diagonal/general)***************
// Default addition-assignment kernel for a column-major target when A is diagonal and B is not
// diagonal (see the EnableIf_ condition).
// Only the diagonal of A is read, so each update is C(i,j) += A(i,i) * B(i,j) * scalar.
// The index space is walked in tiles of BLOCK_SIZE x BLOCK_SIZE (columns jj..jend, rows
// ii..iend); inside a tile the row range of every column is clipped to the part of B that its
// triangular structure allows.
9553  template< typename MT3 // Type of the left-hand side target matrix
9554  , typename MT4 // Type of the left-hand side matrix operand
9555  , typename MT5 // Type of the right-hand side matrix operand
9556  , typename ST2 > // Type of the scalar value
9557  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
9558  selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
9559  {
9560  constexpr size_t block( BLOCK_SIZE );
9561 
9562  const size_t M( A.rows() );
9563  const size_t N( B.columns() );
9564 
9565  for( size_t jj=0UL; jj<N; jj+=block ) {
9566  const size_t jend( min( N, jj+block ) );
9567  for( size_t ii=0UL; ii<M; ii+=block ) {
9568  const size_t iend( min( M, ii+block ) );
9569  for( size_t j=jj; j<jend; ++j )
9570  {
// Lower B: rows above the diagonal (and the diagonal itself if strictly lower) are
// skipped, but never start before the current tile.
9571  const size_t ibegin( ( IsLower<MT5>::value )
9572  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
9573  :( ii ) );
// Upper B: stop at the diagonal (exclusive if strictly upper, inclusive otherwise), but
// never run past the end of the current tile.
9574  const size_t ipos( ( IsUpper<MT5>::value )
9575  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
9576  :( iend ) );
9577 
9578  for( size_t i=ibegin; i<ipos; ++i ) {
9579  (~C)(i,j) += A(i,i) * B(i,j) * scalar;
9580  }
9581  }
9582  }
9583  }
9584  }
9585  //**********************************************************************************************
9586 
9587  //**Default addition assignment to dense matrices (diagonal/diagonal)***************************
// Default addition-assignment kernel for the case that both A and B are diagonal
// (see the EnableIf_ condition).
// Only diagonal elements are touched: C(i,i) += A(i,i) * B(i,i) * scalar for every
// i in [0, A.rows()).
9601  template< typename MT3 // Type of the left-hand side target matrix
9602  , typename MT4 // Type of the left-hand side matrix operand
9603  , typename MT5 // Type of the right-hand side matrix operand
9604  , typename ST2 > // Type of the scalar value
9605  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
9606  selectDefaultAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9607  {
9608  for( size_t i=0UL; i<A.rows(); ++i ) {
9609  C(i,i) += A(i,i) * B(i,i) * scalar;
9610  }
9611  }
9612  //**********************************************************************************************
9613 
9614  //**Default addition assignment to dense matrices (small matrices)******************************
// Small-matrix addition-assignment kernel, fallback overload.
// Does no computation of its own: C, A, B and scalar are handed on unchanged to
// selectDefaultAddAssignKernel().
// NOTE(review): the declaration line carrying the return type (listing line 9632) is missing
// from this extract; presumably it holds the compile-time guard that selects this overload
// when the vectorized small kernel is not applicable -- confirm against the upstream header.
9628  template< typename MT3 // Type of the left-hand side target matrix
9629  , typename MT4 // Type of the left-hand side matrix operand
9630  , typename MT5 // Type of the right-hand side matrix operand
9631  , typename ST2 > // Type of the scalar value
9633  selectSmallAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
9634  {
9635  selectDefaultAddAssignKernel( C, A, B, scalar );
9636  }
9637  //**********************************************************************************************
9638 
9639  //**Vectorized default addition assignment to row-major dense matrices (small matrices)*********
9654  template< typename MT3 // Type of the left-hand side target matrix
9655  , typename MT4 // Type of the left-hand side matrix operand
9656  , typename MT5 // Type of the right-hand side matrix operand
9657  , typename ST2 > // Type of the scalar value
9659  selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
9660  {
9661  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
9662 
9663  const size_t M( A.rows() );
9664  const size_t N( B.columns() );
9665  const size_t K( A.columns() );
9666 
9667  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
9668 
9669  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
9670  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
9671 
9672  const SIMDType factor( set( scalar ) );
9673 
9674  size_t j( 0UL );
9675 
9677  {
9678  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
9679  for( size_t i=0UL; i<M; ++i )
9680  {
9681  const size_t kbegin( ( IsUpper<MT4>::value )
9682  ?( ( IsLower<MT5>::value )
9683  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9684  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9685  :( IsLower<MT5>::value ? j : 0UL ) );
9686  const size_t kend( ( IsLower<MT4>::value )
9687  ?( ( IsUpper<MT5>::value )
9688  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
9689  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
9690  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
9691 
9692  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9693 
9694  for( size_t k=kbegin; k<kend; ++k ) {
9695  const SIMDType a1( set( A(i,k) ) );
9696  xmm1 += a1 * B.load(k,j );
9697  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9698  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9699  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9700  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9701  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
9702  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
9703  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
9704  }
9705 
9706  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9707  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9708  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9709  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9710  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
9711  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
9712  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
9713  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
9714  }
9715  }
9716  }
9717 
9718  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
9719  {
9720  size_t i( 0UL );
9721 
9722  for( ; (i+2UL) <= M; i+=2UL )
9723  {
9724  const size_t kbegin( ( IsUpper<MT4>::value )
9725  ?( ( IsLower<MT5>::value )
9726  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9727  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9728  :( IsLower<MT5>::value ? j : 0UL ) );
9729  const size_t kend( ( IsLower<MT4>::value )
9730  ?( ( IsUpper<MT5>::value )
9731  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
9732  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9733  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
9734 
9735  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
9736 
9737  for( size_t k=kbegin; k<kend; ++k ) {
9738  const SIMDType a1( set( A(i ,k) ) );
9739  const SIMDType a2( set( A(i+1UL,k) ) );
9740  const SIMDType b1( B.load(k,j ) );
9741  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9742  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9743  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9744  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
9745  xmm1 += a1 * b1;
9746  xmm2 += a1 * b2;
9747  xmm3 += a1 * b3;
9748  xmm4 += a1 * b4;
9749  xmm5 += a1 * b5;
9750  xmm6 += a2 * b1;
9751  xmm7 += a2 * b2;
9752  xmm8 += a2 * b3;
9753  xmm9 += a2 * b4;
9754  xmm10 += a2 * b5;
9755  }
9756 
9757  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9758  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9759  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9760  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
9761  (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) + xmm5 * factor );
9762  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm6 * factor );
9763  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm7 * factor );
9764  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm8 * factor );
9765  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm9 * factor );
9766  (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) + xmm10 * factor );
9767  }
9768 
9769  if( i < M )
9770  {
9771  const size_t kbegin( ( IsUpper<MT4>::value )
9772  ?( ( IsLower<MT5>::value )
9773  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9774  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9775  :( IsLower<MT5>::value ? j : 0UL ) );
9776  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
9777 
9778  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
9779 
9780  for( size_t k=kbegin; k<kend; ++k ) {
9781  const SIMDType a1( set( A(i,k) ) );
9782  xmm1 += a1 * B.load(k,j );
9783  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9784  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9785  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9786  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
9787  }
9788 
9789  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9790  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9791  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9792  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9793  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
9794  }
9795  }
9796 
9797  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
9798  {
9799  size_t i( 0UL );
9800 
9801  for( ; (i+2UL) <= M; i+=2UL )
9802  {
9803  const size_t kbegin( ( IsUpper<MT4>::value )
9804  ?( ( IsLower<MT5>::value )
9805  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9806  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9807  :( IsLower<MT5>::value ? j : 0UL ) );
9808  const size_t kend( ( IsLower<MT4>::value )
9809  ?( ( IsUpper<MT5>::value )
9810  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
9811  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9812  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
9813 
9814  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9815 
9816  for( size_t k=kbegin; k<kend; ++k ) {
9817  const SIMDType a1( set( A(i ,k) ) );
9818  const SIMDType a2( set( A(i+1UL,k) ) );
9819  const SIMDType b1( B.load(k,j ) );
9820  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9821  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9822  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9823  xmm1 += a1 * b1;
9824  xmm2 += a1 * b2;
9825  xmm3 += a1 * b3;
9826  xmm4 += a1 * b4;
9827  xmm5 += a2 * b1;
9828  xmm6 += a2 * b2;
9829  xmm7 += a2 * b3;
9830  xmm8 += a2 * b4;
9831  }
9832 
9833  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9834  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9835  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9836  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
9837  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
9838  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
9839  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
9840  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
9841  }
9842 
9843  if( i < M )
9844  {
9845  const size_t kbegin( ( IsUpper<MT4>::value )
9846  ?( ( IsLower<MT5>::value )
9847  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9848  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9849  :( IsLower<MT5>::value ? j : 0UL ) );
9850  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
9851 
9852  SIMDType xmm1, xmm2, xmm3, xmm4;
9853 
9854  for( size_t k=kbegin; k<kend; ++k ) {
9855  const SIMDType a1( set( A(i,k) ) );
9856  xmm1 += a1 * B.load(k,j );
9857  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9858  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9859  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
9860  }
9861 
9862  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9863  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9864  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9865  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
9866  }
9867  }
9868 
9869  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
9870  {
9871  size_t i( 0UL );
9872 
9873  for( ; (i+2UL) <= M; i+=2UL )
9874  {
9875  const size_t kbegin( ( IsUpper<MT4>::value )
9876  ?( ( IsLower<MT5>::value )
9877  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9878  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9879  :( IsLower<MT5>::value ? j : 0UL ) );
9880  const size_t kend( ( IsLower<MT4>::value )
9881  ?( ( IsUpper<MT5>::value )
9882  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
9883  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9884  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
9885 
9886  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9887 
9888  for( size_t k=kbegin; k<kend; ++k ) {
9889  const SIMDType a1( set( A(i ,k) ) );
9890  const SIMDType a2( set( A(i+1UL,k) ) );
9891  const SIMDType b1( B.load(k,j ) );
9892  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9893  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9894  xmm1 += a1 * b1;
9895  xmm2 += a1 * b2;
9896  xmm3 += a1 * b3;
9897  xmm4 += a2 * b1;
9898  xmm5 += a2 * b2;
9899  xmm6 += a2 * b3;
9900  }
9901 
9902  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9903  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
9904  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
9905  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm4 * factor );
9906  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm5 * factor );
9907  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm6 * factor );
9908  }
9909 
9910  if( i < M )
9911  {
9912  const size_t kbegin( ( IsUpper<MT4>::value )
9913  ?( ( IsLower<MT5>::value )
9914  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9915  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9916  :( IsLower<MT5>::value ? j : 0UL ) );
9917  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
9918 
9919  SIMDType xmm1, xmm2, xmm3;
9920 
9921  for( size_t k=kbegin; k<kend; ++k ) {
9922  const SIMDType a1( set( A(i,k) ) );
9923  xmm1 += a1 * B.load(k,j );
9924  xmm2 += a1 * B.load(k,j+SIMDSIZE );
9925  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
9926  }
9927 
9928  (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9929  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
9930  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
9931  }
9932  }
9933 
9934  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
9935  {
9936  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
9937  size_t i( LOW ? j : 0UL );
9938 
9939  for( ; (i+4UL) <= iend; i+=4UL )
9940  {
9941  const size_t kbegin( ( IsUpper<MT4>::value )
9942  ?( ( IsLower<MT5>::value )
9943  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9944  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9945  :( IsLower<MT5>::value ? j : 0UL ) );
9946  const size_t kend( ( IsLower<MT4>::value )
9947  ?( ( IsUpper<MT5>::value )
9948  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
9949  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
9950  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
9951 
9952  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9953 
9954  for( size_t k=kbegin; k<kend; ++k ) {
9955  const SIMDType a1( set( A(i ,k) ) );
9956  const SIMDType a2( set( A(i+1UL,k) ) );
9957  const SIMDType a3( set( A(i+2UL,k) ) );
9958  const SIMDType a4( set( A(i+3UL,k) ) );
9959  const SIMDType b1( B.load(k,j ) );
9960  const SIMDType b2( B.load(k,j+SIMDSIZE) );
9961  xmm1 += a1 * b1;
9962  xmm2 += a1 * b2;
9963  xmm3 += a2 * b1;
9964  xmm4 += a2 * b2;
9965  xmm5 += a3 * b1;
9966  xmm6 += a3 * b2;
9967  xmm7 += a4 * b1;
9968  xmm8 += a4 * b2;
9969  }
9970 
9971  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9972  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
9973  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9974  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
9975  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
9976  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
9977  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
9978  (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) + xmm8 * factor );
9979  }
9980 
9981  for( ; (i+3UL) <= iend; i+=3UL )
9982  {
9983  const size_t kbegin( ( IsUpper<MT4>::value )
9984  ?( ( IsLower<MT5>::value )
9985  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9986  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9987  :( IsLower<MT5>::value ? j : 0UL ) );
9988  const size_t kend( ( IsLower<MT4>::value )
9989  ?( ( IsUpper<MT5>::value )
9990  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
9991  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
9992  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
9993 
9994  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9995 
9996  for( size_t k=kbegin; k<kend; ++k ) {
9997  const SIMDType a1( set( A(i ,k) ) );
9998  const SIMDType a2( set( A(i+1UL,k) ) );
9999  const SIMDType a3( set( A(i+2UL,k) ) );
10000  const SIMDType b1( B.load(k,j ) );
10001  const SIMDType b2( B.load(k,j+SIMDSIZE) );
10002  xmm1 += a1 * b1;
10003  xmm2 += a1 * b2;
10004  xmm3 += a2 * b1;
10005  xmm4 += a2 * b2;
10006  xmm5 += a3 * b1;
10007  xmm6 += a3 * b2;
10008  }
10009 
10010  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10011  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
10012  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
10013  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
10014  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
10015  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) + xmm6 * factor );
10016  }
10017 
10018  for( ; (i+2UL) <= iend; i+=2UL )
10019  {
10020  const size_t kbegin( ( IsUpper<MT4>::value )
10021  ?( ( IsLower<MT5>::value )
10022  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10023  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10024  :( IsLower<MT5>::value ? j : 0UL ) );
10025  const size_t kend( ( IsLower<MT4>::value )
10026  ?( ( IsUpper<MT5>::value )
10027  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
10028  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
10029  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
10030 
10031  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10032  size_t k( kbegin );
10033 
10034  for( ; (k+2UL) <= kend; k+=2UL ) {
10035  const SIMDType a1( set( A(i ,k ) ) );
10036  const SIMDType a2( set( A(i+1UL,k ) ) );
10037  const SIMDType a3( set( A(i ,k+1UL) ) );
10038  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
10039  const SIMDType b1( B.load(k ,j ) );
10040  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
10041  const SIMDType b3( B.load(k+1UL,j ) );
10042  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
10043  xmm1 += a1 * b1;
10044  xmm2 += a1 * b2;
10045  xmm3 += a2 * b1;
10046  xmm4 += a2 * b2;
10047  xmm5 += a3 * b3;
10048  xmm6 += a3 * b4;
10049  xmm7 += a4 * b3;
10050  xmm8 += a4 * b4;
10051  }
10052 
10053  for( ; k<kend; ++k ) {
10054  const SIMDType a1( set( A(i ,k) ) );
10055  const SIMDType a2( set( A(i+1UL,k) ) );
10056  const SIMDType b1( B.load(k,j ) );
10057  const SIMDType b2( B.load(k,j+SIMDSIZE) );
10058  xmm1 += a1 * b1;
10059  xmm2 += a1 * b2;
10060  xmm3 += a2 * b1;
10061  xmm4 += a2 * b2;
10062  }
10063 
10064  (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
10065  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + (xmm2+xmm6) * factor );
10066  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + (xmm3+xmm7) * factor );
10067  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + (xmm4+xmm8) * factor );
10068  }
10069 
10070  if( i < iend )
10071  {
10072  const size_t kbegin( ( IsUpper<MT4>::value )
10073  ?( ( IsLower<MT5>::value )
10074  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10075  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10076  :( IsLower<MT5>::value ? j : 0UL ) );
10077  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
10078 
10079  SIMDType xmm1, xmm2, xmm3, xmm4;
10080  size_t k( kbegin );
10081 
10082  for( ; (k+2UL) <= kend; k+=2UL ) {
10083  const SIMDType a1( set( A(i,k ) ) );
10084  const SIMDType a2( set( A(i,k+1UL) ) );
10085  xmm1 += a1 * B.load(k ,j );
10086  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
10087  xmm3 += a2 * B.load(k+1UL,j );
10088  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
10089  }
10090 
10091  for( ; k<kend; ++k ) {
10092  const SIMDType a1( set( A(i,k) ) );
10093  xmm1 += a1 * B.load(k,j );
10094  xmm2 += a1 * B.load(k,j+SIMDSIZE);
10095  }
10096 
10097  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
10098  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + (xmm2+xmm4) * factor );
10099  }
10100  }
10101 
10102  for( ; j<jpos; j+=SIMDSIZE )
10103  {
10104  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
10105  size_t i( LOW ? j : 0UL );
10106 
10107  for( ; (i+4UL) <= iend; i+=4UL )
10108  {
10109  const size_t kbegin( ( IsUpper<MT4>::value )
10110  ?( ( IsLower<MT5>::value )
10111  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10112  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10113  :( IsLower<MT5>::value ? j : 0UL ) );
10114  const size_t kend( ( IsLower<MT4>::value )
10115  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
10116  :( K ) );
10117 
10118  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10119  size_t k( kbegin );
10120 
10121  for( ; (k+2UL) <= kend; k+=2UL ) {
10122  const SIMDType b1( B.load(k ,j) );
10123  const SIMDType b2( B.load(k+1UL,j) );
10124  xmm1 += set( A(i ,k ) ) * b1;
10125  xmm2 += set( A(i+1UL,k ) ) * b1;
10126  xmm3 += set( A(i+2UL,k ) ) * b1;
10127  xmm4 += set( A(i+3UL,k ) ) * b1;
10128  xmm5 += set( A(i ,k+1UL) ) * b2;
10129  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
10130  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
10131  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
10132  }
10133 
10134  for( ; k<kend; ++k ) {
10135  const SIMDType b1( B.load(k,j) );
10136  xmm1 += set( A(i ,k) ) * b1;
10137  xmm2 += set( A(i+1UL,k) ) * b1;
10138  xmm3 += set( A(i+2UL,k) ) * b1;
10139  xmm4 += set( A(i+3UL,k) ) * b1;
10140  }
10141 
10142  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm5) * factor );
10143  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm6) * factor );
10144  (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm7) * factor );
10145  (~C).store( i+3UL, j, (~C).load(i+3UL,j) + (xmm4+xmm8) * factor );
10146  }
10147 
10148  for( ; (i+3UL) <= iend; i+=3UL )
10149  {
10150  const size_t kbegin( ( IsUpper<MT4>::value )
10151  ?( ( IsLower<MT5>::value )
10152  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10153  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10154  :( IsLower<MT5>::value ? j : 0UL ) );
10155  const size_t kend( ( IsLower<MT4>::value )
10156  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
10157  :( K ) );
10158 
10159  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10160  size_t k( kbegin );
10161 
10162  for( ; (k+2UL) <= kend; k+=2UL ) {
10163  const SIMDType b1( B.load(k ,j) );
10164  const SIMDType b2( B.load(k+1UL,j) );
10165  xmm1 += set( A(i ,k ) ) * b1;
10166  xmm2 += set( A(i+1UL,k ) ) * b1;
10167  xmm3 += set( A(i+2UL,k ) ) * b1;
10168  xmm4 += set( A(i ,k+1UL) ) * b2;
10169  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
10170  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
10171  }
10172 
10173  for( ; k<kend; ++k ) {
10174  const SIMDType b1( B.load(k,j) );
10175  xmm1 += set( A(i ,k) ) * b1;
10176  xmm2 += set( A(i+1UL,k) ) * b1;
10177  xmm3 += set( A(i+2UL,k) ) * b1;
10178  }
10179 
10180  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm4) * factor );
10181  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm5) * factor );
10182  (~C).store( i+2UL, j, (~C).load(i+2UL,j) + (xmm3+xmm6) * factor );
10183  }
10184 
10185  for( ; (i+2UL) <= iend; i+=2UL )
10186  {
10187  const size_t kbegin( ( IsUpper<MT4>::value )
10188  ?( ( IsLower<MT5>::value )
10189  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10190  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10191  :( IsLower<MT5>::value ? j : 0UL ) );
10192  const size_t kend( ( IsLower<MT4>::value )
10193  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
10194  :( K ) );
10195 
10196  SIMDType xmm1, xmm2, xmm3, xmm4;
10197  size_t k( kbegin );
10198 
10199  for( ; (k+2UL) <= kend; k+=2UL ) {
10200  const SIMDType b1( B.load(k ,j) );
10201  const SIMDType b2( B.load(k+1UL,j) );
10202  xmm1 += set( A(i ,k ) ) * b1;
10203  xmm2 += set( A(i+1UL,k ) ) * b1;
10204  xmm3 += set( A(i ,k+1UL) ) * b2;
10205  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
10206  }
10207 
10208  for( ; k<kend; ++k ) {
10209  const SIMDType b1( B.load(k,j) );
10210  xmm1 += set( A(i ,k) ) * b1;
10211  xmm2 += set( A(i+1UL,k) ) * b1;
10212  }
10213 
10214  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
10215  (~C).store( i+1UL, j, (~C).load(i+1UL,j) + (xmm2+xmm4) * factor );
10216  }
10217 
10218  if( i < iend )
10219  {
10220  const size_t kbegin( ( IsUpper<MT4>::value )
10221  ?( ( IsLower<MT5>::value )
10222  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10223  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10224  :( IsLower<MT5>::value ? j : 0UL ) );
10225 
10226  SIMDType xmm1, xmm2;
10227  size_t k( kbegin );
10228 
10229  for( ; (k+2UL) <= K; k+=2UL ) {
10230  xmm1 += set( A(i,k ) ) * B.load(k ,j);
10231  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
10232  }
10233 
10234  for( ; k<K; ++k ) {
10235  xmm1 += set( A(i,k) ) * B.load(k,j);
10236  }
10237 
10238  (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
10239  }
10240  }
10241 
10242  for( ; remainder && j<N; ++j )
10243  {
10244  const size_t iend( UPP ? j+1UL : M );
10245  size_t i( LOW ? j : 0UL );
10246 
10247  for( ; (i+2UL) <= iend; i+=2UL )
10248  {
10249  const size_t kbegin( ( IsUpper<MT4>::value )
10250  ?( ( IsLower<MT5>::value )
10251  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10252  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10253  :( IsLower<MT5>::value ? j : 0UL ) );
10254  const size_t kend( ( IsLower<MT4>::value )
10255  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
10256  :( K ) );
10257 
10258  ElementType value1{};
10259  ElementType value2{};
10260 
10261  for( size_t k=kbegin; k<kend; ++k ) {
10262  value1 += A(i ,k) * B(k,j);
10263  value2 += A(i+1UL,k) * B(k,j);
10264  }
10265 
10266  (~C)(i ,j) += value1 * scalar;
10267  (~C)(i+1UL,j) += value2 * scalar;
10268  }
10269 
10270  if( i < iend )
10271  {
10272  const size_t kbegin( ( IsUpper<MT4>::value )
10273  ?( ( IsLower<MT5>::value )
10274  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10275  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10276  :( IsLower<MT5>::value ? j : 0UL ) );
10277 
10278  ElementType value{};
10279 
10280  for( size_t k=kbegin; k<K; ++k ) {
10281  value += A(i,k) * B(k,j);
10282  }
10283 
10284  (~C)(i,j) += value * scalar;
10285  }
10286  }
10287  }
10288  //**********************************************************************************************
10289 
10290  //**Vectorized default addition assignment to column-major dense matrices (small matrices)******
10305  template< typename MT3 // Type of the left-hand side target matrix
10306  , typename MT4 // Type of the left-hand side matrix operand
10307  , typename MT5 // Type of the right-hand side matrix operand
10308  , typename ST2 > // Type of the scalar value
10310  selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
10311  {
10312  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
10313 
10314  const size_t M( A.rows() );
10315  const size_t N( B.columns() );
10316  const size_t K( A.columns() );
10317 
10318  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
10319 
10320  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
10321  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
10322 
10323  const SIMDType factor( set( scalar ) );
10324 
10325  size_t i( 0UL );
10326 
10328  {
10329  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
10330  for( size_t j=0UL; j<N; ++j )
10331  {
10332  const size_t kbegin( ( IsLower<MT5>::value )
10333  ?( ( IsUpper<MT4>::value )
10334  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10335  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10336  :( IsUpper<MT4>::value ? i : 0UL ) );
10337  const size_t kend( ( IsUpper<MT5>::value )
10338  ?( ( IsLower<MT4>::value )
10339  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
10340  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
10341  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
10342 
10343  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10344 
10345  for( size_t k=kbegin; k<kend; ++k ) {
10346  const SIMDType b1( set( B(k,j) ) );
10347  xmm1 += A.load(i ,k) * b1;
10348  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10349  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10350  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10351  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10352  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
10353  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
10354  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
10355  }
10356 
10357  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10358  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10359  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10360  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10361  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
10362  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
10363  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
10364  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
10365  }
10366  }
10367  }
10368 
10369  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
10370  {
10371  size_t j( 0UL );
10372 
10373  for( ; (j+2UL) <= N; j+=2UL )
10374  {
10375  const size_t kbegin( ( IsLower<MT5>::value )
10376  ?( ( IsUpper<MT4>::value )
10377  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10378  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10379  :( IsUpper<MT4>::value ? i : 0UL ) );
10380  const size_t kend( ( IsUpper<MT5>::value )
10381  ?( ( IsLower<MT4>::value )
10382  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10383  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10384  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
10385 
10386  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
10387 
10388  for( size_t k=kbegin; k<kend; ++k ) {
10389  const SIMDType a1( A.load(i ,k) );
10390  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10391  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10392  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10393  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
10394  const SIMDType b1( set( B(k,j ) ) );
10395  const SIMDType b2( set( B(k,j+1UL) ) );
10396  xmm1 += a1 * b1;
10397  xmm2 += a2 * b1;
10398  xmm3 += a3 * b1;
10399  xmm4 += a4 * b1;
10400  xmm5 += a5 * b1;
10401  xmm6 += a1 * b2;
10402  xmm7 += a2 * b2;
10403  xmm8 += a3 * b2;
10404  xmm9 += a4 * b2;
10405  xmm10 += a5 * b2;
10406  }
10407 
10408  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10409  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10410  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10411  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10412  (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) + xmm5 * factor );
10413  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm6 * factor );
10414  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm7 * factor );
10415  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
10416  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
10417  (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
10418  }
10419 
10420  if( j < N )
10421  {
10422  const size_t kbegin( ( IsLower<MT5>::value )
10423  ?( ( IsUpper<MT4>::value )
10424  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10425  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10426  :( IsUpper<MT4>::value ? i : 0UL ) );
10427  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
10428 
10429  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
10430 
10431  for( size_t k=kbegin; k<kend; ++k ) {
10432  const SIMDType b1( set( B(k,j) ) );
10433  xmm1 += A.load(i ,k) * b1;
10434  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10435  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10436  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10437  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
10438  }
10439 
10440  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10441  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10442  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10443  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10444  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
10445  }
10446  }
10447 
10448  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
10449  {
10450  size_t j( 0UL );
10451 
10452  for( ; (j+2UL) <= N; j+=2UL )
10453  {
10454  const size_t kbegin( ( IsLower<MT5>::value )
10455  ?( ( IsUpper<MT4>::value )
10456  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10457  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10458  :( IsUpper<MT4>::value ? i : 0UL ) );
10459  const size_t kend( ( IsUpper<MT5>::value )
10460  ?( ( IsLower<MT4>::value )
10461  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10462  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10463  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
10464 
10465  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10466 
10467  for( size_t k=kbegin; k<kend; ++k ) {
10468  const SIMDType a1( A.load(i ,k) );
10469  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10470  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10471  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10472  const SIMDType b1( set( B(k,j ) ) );
10473  const SIMDType b2( set( B(k,j+1UL) ) );
10474  xmm1 += a1 * b1;
10475  xmm2 += a2 * b1;
10476  xmm3 += a3 * b1;
10477  xmm4 += a4 * b1;
10478  xmm5 += a1 * b2;
10479  xmm6 += a2 * b2;
10480  xmm7 += a3 * b2;
10481  xmm8 += a4 * b2;
10482  }
10483 
10484  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10485  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10486  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10487  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
10488  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
10489  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
10490  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
10491  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
10492  }
10493 
10494  if( j < N )
10495  {
10496  const size_t kbegin( ( IsLower<MT5>::value )
10497  ?( ( IsUpper<MT4>::value )
10498  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10499  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10500  :( IsUpper<MT4>::value ? i : 0UL ) );
10501  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
10502 
10503  SIMDType xmm1, xmm2, xmm3, xmm4;
10504 
10505  for( size_t k=kbegin; k<kend; ++k ) {
10506  const SIMDType b1( set( B(k,j) ) );
10507  xmm1 += A.load(i ,k) * b1;
10508  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10509  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10510  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
10511  }
10512 
10513  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10514  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10515  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10516  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
10517  }
10518  }
10519 
10520  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
10521  {
10522  size_t j( 0UL );
10523 
10524  for( ; (j+2UL) <= N; j+=2UL )
10525  {
10526  const size_t kbegin( ( IsLower<MT5>::value )
10527  ?( ( IsUpper<MT4>::value )
10528  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10529  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10530  :( IsUpper<MT4>::value ? i : 0UL ) );
10531  const size_t kend( ( IsUpper<MT5>::value )
10532  ?( ( IsLower<MT4>::value )
10533  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10534  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10535  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
10536 
10537  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10538 
10539  for( size_t k=kbegin; k<kend; ++k ) {
10540  const SIMDType a1( A.load(i ,k) );
10541  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10542  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10543  const SIMDType b1( set( B(k,j ) ) );
10544  const SIMDType b2( set( B(k,j+1UL) ) );
10545  xmm1 += a1 * b1;
10546  xmm2 += a2 * b1;
10547  xmm3 += a3 * b1;
10548  xmm4 += a1 * b2;
10549  xmm5 += a2 * b2;
10550  xmm6 += a3 * b2;
10551  }
10552 
10553  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10554  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
10555  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
10556  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm4 * factor );
10557  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm5 * factor );
10558  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
10559  }
10560 
10561  if( j < N )
10562  {
10563  const size_t kbegin( ( IsLower<MT5>::value )
10564  ?( ( IsUpper<MT4>::value )
10565  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10566  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10567  :( IsUpper<MT4>::value ? i : 0UL ) );
10568  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
10569 
10570  SIMDType xmm1, xmm2, xmm3;
10571 
10572  for( size_t k=kbegin; k<kend; ++k ) {
10573  const SIMDType b1( set( B(k,j) ) );
10574  xmm1 += A.load(i ,k) * b1;
10575  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
10576  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
10577  }
10578 
10579  (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
10580  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
10581  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
10582  }
10583  }
10584 
10585  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
10586  {
10587  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
10588  size_t j( UPP ? i : 0UL );
10589 
10590  for( ; (j+4UL) <= jend; j+=4UL )
10591  {
10592  const size_t kbegin( ( IsLower<MT5>::value )
10593  ?( ( IsUpper<MT4>::value )
10594  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10595  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10596  :( IsUpper<MT4>::value ? i : 0UL ) );
10597  const size_t kend( ( IsUpper<MT5>::value )
10598  ?( ( IsLower<MT4>::value )
10599  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
10600  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
10601  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
10602 
10603  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10604 
10605  for( size_t k=kbegin; k<kend; ++k ) {
10606  const SIMDType a1( A.load(i ,k) );
10607  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10608  const SIMDType b1( set( B(k,j ) ) );
10609  const SIMDType b2( set( B(k,j+1UL) ) );
10610  const SIMDType b3( set( B(k,j+2UL) ) );
10611  const SIMDType b4( set( B(k,j+3UL) ) );
10612  xmm1 += a1 * b1;
10613  xmm2 += a2 * b1;
10614  xmm3 += a1 * b2;
10615  xmm4 += a2 * b2;
10616  xmm5 += a1 * b3;
10617  xmm6 += a2 * b3;
10618  xmm7 += a1 * b4;
10619  xmm8 += a2 * b4;
10620  }
10621 
10622  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10623  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
10624  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
10625  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
10626  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
10627  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
10628  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
10629  (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) + xmm8 * factor );
10630  }
10631 
10632  for( ; (j+3UL) <= jend; j+=3UL )
10633  {
10634  const size_t kbegin( ( IsLower<MT5>::value )
10635  ?( ( IsUpper<MT4>::value )
10636  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10637  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10638  :( IsUpper<MT4>::value ? i : 0UL ) );
10639  const size_t kend( ( IsUpper<MT5>::value )
10640  ?( ( IsLower<MT4>::value )
10641  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
10642  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
10643  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
10644 
10645  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10646 
10647  for( size_t k=kbegin; k<kend; ++k ) {
10648  const SIMDType a1( A.load(i ,k) );
10649  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10650  const SIMDType b1( set( B(k,j ) ) );
10651  const SIMDType b2( set( B(k,j+1UL) ) );
10652  const SIMDType b3( set( B(k,j+2UL) ) );
10653  xmm1 += a1 * b1;
10654  xmm2 += a2 * b1;
10655  xmm3 += a1 * b2;
10656  xmm4 += a2 * b2;
10657  xmm5 += a1 * b3;
10658  xmm6 += a2 * b3;
10659  }
10660 
10661  (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
10662  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
10663  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
10664  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
10665  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
10666  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) + xmm6 * factor );
10667  }
10668 
10669  for( ; (j+2UL) <= jend; j+=2UL )
10670  {
10671  const size_t kbegin( ( IsLower<MT5>::value )
10672  ?( ( IsUpper<MT4>::value )
10673  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10674  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10675  :( IsUpper<MT4>::value ? i : 0UL ) );
10676  const size_t kend( ( IsUpper<MT5>::value )
10677  ?( ( IsLower<MT4>::value )
10678  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10679  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10680  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
10681 
10682  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10683  size_t k( kbegin );
10684 
10685  for( ; (k+2UL) <= kend; k+=2UL ) {
10686  const SIMDType a1( A.load(i ,k ) );
10687  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
10688  const SIMDType a3( A.load(i ,k+1UL) );
10689  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
10690  const SIMDType b1( set( B(k ,j ) ) );
10691  const SIMDType b2( set( B(k ,j+1UL) ) );
10692  const SIMDType b3( set( B(k+1UL,j ) ) );
10693  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
10694  xmm1 += a1 * b1;
10695  xmm2 += a2 * b1;
10696  xmm3 += a1 * b2;
10697  xmm4 += a2 * b2;
10698  xmm5 += a3 * b3;
10699  xmm6 += a4 * b3;
10700  xmm7 += a3 * b4;
10701  xmm8 += a4 * b4;
10702  }
10703 
10704  for( ; k<kend; ++k ) {
10705  const SIMDType a1( A.load(i ,k) );
10706  const SIMDType a2( A.load(i+SIMDSIZE,k) );
10707  const SIMDType b1( set( B(k,j ) ) );
10708  const SIMDType b2( set( B(k,j+1UL) ) );
10709  xmm1 += a1 * b1;
10710  xmm2 += a2 * b1;
10711  xmm3 += a1 * b2;
10712  xmm4 += a2 * b2;
10713  }
10714 
10715  (~C).store( i , j , (~C).load(i ,j ) + (xmm1+xmm5) * factor );
10716  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + (xmm2+xmm6) * factor );
10717  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + (xmm3+xmm7) * factor );
10718  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
10719  }
10720 
10721  if( j < jend )
10722  {
10723  const size_t kbegin( ( IsLower<MT5>::value )
10724  ?( ( IsUpper<MT4>::value )
10725  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10726  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10727  :( IsUpper<MT4>::value ? i : 0UL ) );
10728  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
10729 
10730  SIMDType xmm1, xmm2, xmm3, xmm4;
10731  size_t k( kbegin );
10732 
10733  for( ; (k+2UL) <= kend; k+=2UL ) {
10734  const SIMDType b1( set( B(k ,j) ) );
10735  const SIMDType b2( set( B(k+1UL,j) ) );
10736  xmm1 += A.load(i ,k ) * b1;
10737  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
10738  xmm3 += A.load(i ,k+1UL) * b2;
10739  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
10740  }
10741 
10742  for( ; k<kend; ++k ) {
10743  const SIMDType b1( set( B(k,j) ) );
10744  xmm1 += A.load(i ,k) * b1;
10745  xmm2 += A.load(i+SIMDSIZE,k) * b1;
10746  }
10747 
10748  (~C).store( i , j, (~C).load(i ,j) + (xmm1+xmm3) * factor );
10749  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + (xmm2+xmm4) * factor );
10750  }
10751  }
10752 
10753  for( ; i<ipos; i+=SIMDSIZE )
10754  {
10755  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
10756  size_t j( UPP ? i : 0UL );
10757 
10758  for( ; (j+4UL) <= jend; j+=4UL )
10759  {
10760  const size_t kbegin( ( IsLower<MT5>::value )
10761  ?( ( IsUpper<MT4>::value )
10762  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10763  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10764  :( IsUpper<MT4>::value ? i : 0UL ) );
10765  const size_t kend( ( IsUpper<MT5>::value )
10766  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
10767  :( K ) );
10768 
10769  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10770  size_t k( kbegin );
10771 
10772  for( ; (k+2UL) <= kend; k+=2UL ) {
10773  const SIMDType a1( A.load(i,k ) );
10774  const SIMDType a2( A.load(i,k+1UL) );
10775  xmm1 += a1 * set( B(k ,j ) );
10776  xmm2 += a1 * set( B(k ,j+1UL) );
10777  xmm3 += a1 * set( B(k ,j+2UL) );
10778  xmm4 += a1 * set( B(k ,j+3UL) );
10779  xmm5 += a2 * set( B(k+1UL,j ) );
10780  xmm6 += a2 * set( B(k+1UL,j+1UL) );
10781  xmm7 += a2 * set( B(k+1UL,j+2UL) );
10782  xmm8 += a2 * set( B(k+1UL,j+3UL) );
10783  }
10784 
10785  for( ; k<kend; ++k ) {
10786  const SIMDType a1( A.load(i,k) );
10787  xmm1 += a1 * set( B(k,j ) );
10788  xmm2 += a1 * set( B(k,j+1UL) );
10789  xmm3 += a1 * set( B(k,j+2UL) );
10790  xmm4 += a1 * set( B(k,j+3UL) );
10791  }
10792 
10793  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm5) * factor );
10794  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm6) * factor );
10795  (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm7) * factor );
10796  (~C).store( i, j+3UL, (~C).load(i,j+3UL) + (xmm4+xmm8) * factor );
10797  }
10798 
10799  for( ; (j+3UL) <= jend; j+=3UL )
10800  {
10801  const size_t kbegin( ( IsLower<MT5>::value )
10802  ?( ( IsUpper<MT4>::value )
10803  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10804  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10805  :( IsUpper<MT4>::value ? i : 0UL ) );
10806  const size_t kend( ( IsUpper<MT5>::value )
10807  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
10808  :( K ) );
10809 
10810  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10811  size_t k( kbegin );
10812 
10813  for( ; (k+2UL) <= kend; k+=2UL ) {
10814  const SIMDType a1( A.load(i,k ) );
10815  const SIMDType a2( A.load(i,k+1UL) );
10816  xmm1 += a1 * set( B(k ,j ) );
10817  xmm2 += a1 * set( B(k ,j+1UL) );
10818  xmm3 += a1 * set( B(k ,j+2UL) );
10819  xmm4 += a2 * set( B(k+1UL,j ) );
10820  xmm5 += a2 * set( B(k+1UL,j+1UL) );
10821  xmm6 += a2 * set( B(k+1UL,j+2UL) );
10822  }
10823 
10824  for( ; k<kend; ++k ) {
10825  const SIMDType a1( A.load(i,k) );
10826  xmm1 += a1 * set( B(k,j ) );
10827  xmm2 += a1 * set( B(k,j+1UL) );
10828  xmm3 += a1 * set( B(k,j+2UL) );
10829  }
10830 
10831  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm4) * factor );
10832  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm5) * factor );
10833  (~C).store( i, j+2UL, (~C).load(i,j+2UL) + (xmm3+xmm6) * factor );
10834  }
10835 
10836  for( ; (j+2UL) <= jend; j+=2UL )
10837  {
10838  const size_t kbegin( ( IsLower<MT5>::value )
10839  ?( ( IsUpper<MT4>::value )
10840  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10841  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10842  :( IsUpper<MT4>::value ? i : 0UL ) );
10843  const size_t kend( ( IsUpper<MT5>::value )
10844  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10845  :( K ) );
10846 
10847  SIMDType xmm1, xmm2, xmm3, xmm4;
10848  size_t k( kbegin );
10849 
10850  for( ; (k+2UL) <= kend; k+=2UL ) {
10851  const SIMDType a1( A.load(i,k ) );
10852  const SIMDType a2( A.load(i,k+1UL) );
10853  xmm1 += a1 * set( B(k ,j ) );
10854  xmm2 += a1 * set( B(k ,j+1UL) );
10855  xmm3 += a2 * set( B(k+1UL,j ) );
10856  xmm4 += a2 * set( B(k+1UL,j+1UL) );
10857  }
10858 
10859  for( ; k<kend; ++k ) {
10860  const SIMDType a1( A.load(i,k) );
10861  xmm1 += a1 * set( B(k,j ) );
10862  xmm2 += a1 * set( B(k,j+1UL) );
10863  }
10864 
10865  (~C).store( i, j , (~C).load(i,j ) + (xmm1+xmm3) * factor );
10866  (~C).store( i, j+1UL, (~C).load(i,j+1UL) + (xmm2+xmm4) * factor );
10867  }
10868 
10869  if( j < jend )
10870  {
10871  const size_t kbegin( ( IsLower<MT5>::value )
10872  ?( ( IsUpper<MT4>::value )
10873  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10874  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10875  :( IsUpper<MT4>::value ? i : 0UL ) );
10876 
10877  SIMDType xmm1, xmm2;
10878  size_t k( kbegin );
10879 
10880  for( ; (k+2UL) <= K; k+=2UL ) {
10881  xmm1 += A.load(i,k ) * set( B(k ,j) );
10882  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
10883  }
10884 
10885  for( ; k<K; ++k ) {
10886  xmm1 += A.load(i,k) * set( B(k,j) );
10887  }
10888 
10889  (~C).store( i, j, (~C).load(i,j) + (xmm1+xmm2) * factor );
10890  }
10891  }
10892 
10893  for( ; remainder && i<M; ++i )
10894  {
10895  const size_t jend( LOW ? i+1UL : N );
10896  size_t j( UPP ? i : 0UL );
10897 
10898  for( ; (j+2UL) <= jend; j+=2UL )
10899  {
10900  const size_t kbegin( ( IsLower<MT5>::value )
10901  ?( ( IsUpper<MT4>::value )
10902  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10903  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10904  :( IsUpper<MT4>::value ? i : 0UL ) );
10905  const size_t kend( ( IsUpper<MT5>::value )
10906  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10907  :( K ) );
10908 
10909  ElementType value1{};
10910  ElementType value2{};
10911 
10912  for( size_t k=kbegin; k<kend; ++k ) {
10913  value1 += A(i,k) * B(k,j );
10914  value2 += A(i,k) * B(k,j+1UL);
10915  }
10916 
10917  (~C)(i,j ) += value1 * scalar;
10918  (~C)(i,j+1UL) += value2 * scalar;
10919  }
10920 
10921  if( j < jend )
10922  {
10923  const size_t kbegin( ( IsLower<MT5>::value )
10924  ?( ( IsUpper<MT4>::value )
10925  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10926  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10927  :( IsUpper<MT4>::value ? i : 0UL ) );
10928 
10929  ElementType value{};
10930 
10931  for( size_t k=kbegin; k<K; ++k ) {
10932  value += A(i,k) * B(k,j);
10933  }
10934 
10935  (~C)(i,j) += value * scalar;
10936  }
10937  }
10938  }
10939  //**********************************************************************************************
10940 
10941  //**Default addition assignment to dense matrices (large matrices)******************************
// Fallback overload of the large-matrix addition assignment kernel. It performs no computation
// of its own: all four arguments are forwarded unchanged to selectDefaultAddAssignKernel().
// NOTE(review): the line carrying the return type / enable condition (it would sit between the
// template header and the function name) is absent from this listing, so the condition under
// which this overload is chosen cannot be confirmed from here.
10955  template< typename MT3 // Type of the left-hand side target matrix
10956  , typename MT4 // Type of the left-hand side matrix operand
10957  , typename MT5 // Type of the right-hand side matrix operand
10958  , typename ST2 > // Type of the scalar value
10960  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10961  {
// Pure delegation: target, both operands and the scaling factor are passed through as-is.
10962  selectDefaultAddAssignKernel( C, A, B, scalar );
10963  }
10964  //**********************************************************************************************
10965 
10966  //**Vectorized default addition assignment to dense matrices (large matrices)*******************
// Large-matrix addition assignment kernel that dispatches to one of the three matrix/matrix
// multiplication routines lmmm(), ummm() or mmm(), depending on the flags LOW and UPP.
// NOTE(review): LOW and UPP are defined outside this block; presumably they state that the
// result is known to be lower resp. upper triangular -- confirm against the class definition.
// NOTE(review): the return type / enable condition line is absent from this listing.
10981  template< typename MT3 // Type of the left-hand side target matrix
10982  , typename MT4 // Type of the left-hand side matrix operand
10983  , typename MT5 // Type of the right-hand side matrix operand
10984  , typename ST2 > // Type of the scalar value
10986  selectLargeAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
10987  {
// All three routines receive the same arguments: (C, A, B, scalar, ST2(1)).
// Presumably the last two are the factors applied to A*B and to the existing C, i.e.
// C = scalar*A*B + 1*C -- TODO confirm against blaze/math/dense/MMM.h.
10988  if( LOW )
10989  lmmm( C, A, B, scalar, ST2(1) );
10990  else if( UPP )
10991  ummm( C, A, B, scalar, ST2(1) );
10992  else
10993  mmm( C, A, B, scalar, ST2(1) );
10994  }
10995  //**********************************************************************************************
10996 
10997  //**BLAS-based addition assignment to dense matrices (default)**********************************
// Fallback overload of the BLAS-based addition assignment kernel. It does not call any BLAS
// routine; it forwards all four arguments unchanged to selectLargeAddAssignKernel().
// NOTE(review): the return type / enable condition line is absent from this listing, so the
// condition under which this overload is selected cannot be confirmed from here.
11011  template< typename MT3 // Type of the left-hand side target matrix
11012  , typename MT4 // Type of the left-hand side matrix operand
11013  , typename MT5 // Type of the right-hand side matrix operand
11014  , typename ST2 > // Type of the scalar value
11016  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11017  {
// Pure delegation to the large-matrix kernel.
11018  selectLargeAddAssignKernel( C, A, B, scalar );
11019  }
11020  //**********************************************************************************************
11021 
11022  //**BLAS-based addition assignment to dense matrices********************************************
11023 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
11024 
// BLAS-based addition assignment kernel (only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both set, see the surrounding #if).
// Three cases, decided at compile time from the operand types:
//   - A is triangular: copy B into a temporary, call trmm() with A from the left, add to C.
//   - B is triangular: copy A into a temporary, call trmm() with B from the right, add to C.
//   - otherwise      : call gemm() directly on C.
// NOTE(review): the return type / enable condition line is absent from this listing.
11037  template< typename MT3 // Type of the left-hand side target matrix
11038  , typename MT4 // Type of the left-hand side matrix operand
11039  , typename MT5 // Type of the right-hand side matrix operand
11040  , typename ST2 > // Type of the scalar value
11042  selectBlasAddAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11043  {
// The scalar is converted to the element type of the target before it is handed to BLAS.
11044  using ET = ElementType_<MT3>;
11045 
11046  if( IsTriangular<MT4>::value ) {
// tmp starts as a serially evaluated copy of B; trmm() is then applied with A on the left
// (CblasLeft) and the lower/upper flag taken from A. Presumably trmm() overwrites tmp with
// scalar*A*tmp -- TODO confirm against blaze/math/blas/trmm.h.
11047  ResultType_<MT3> tmp( serial( B ) );
11048  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
11049  addAssign( C, tmp );
11050  }
11051  else if( IsTriangular<MT5>::value ) {
// Mirror case: tmp starts as a copy of A and B is applied from the right (CblasRight).
11052  ResultType_<MT3> tmp( serial( A ) );
11053  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
11054  addAssign( C, tmp );
11055  }
11056  else {
// General case: no temporary. Presumably C = scalar*A*B + 1*C -- TODO confirm against
// blaze/math/blas/gemm.h.
11057  gemm( C, A, B, ET(scalar), ET(1) );
11058  }
11059  }
11060 #endif
11061  //**********************************************************************************************
11062 
11063  //**Addition assignment to sparse matrices******************************************************
11064  // No special implementation for the addition assignment to sparse matrices.
11065  //**********************************************************************************************
11066 
11067  //**Subtraction assignment to dense matrices****************************************************
// Subtraction assignment of a scaled matrix/matrix multiplication expression to a dense matrix.
// lhs is the target matrix, rhs the scaled multiplication expression. The function extracts the
// two multiplication operands from rhs, evaluates them serially, and hands them together with
// the scalar rhs.scalar_ to selectSubAssignKernel().
11079  template< typename MT // Type of the target dense matrix
11080  , bool SO > // Storage order of the target dense matrix
11081  friend inline void subAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
11082  {
11084 
// The target must already have the dimensions of the expression.
11085  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
11086  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
11087 
// Operands of the wrapped multiplication expression rhs.matrix_.
11088  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
11089  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
11090 
// Nothing to do if the target has no rows or columns, or if the inner dimension is zero.
11091  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
11092  return;
11093  }
11094 
11095  LT A( serial( left ) ); // Evaluation of the left-hand side dense matrix operand
11096  RT B( serial( right ) ); // Evaluation of the right-hand side dense matrix operand
11097 
// Sanity checks: evaluation must not change the operand sizes, and they must fit the target.
11098  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
11099  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
11100  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
11101  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
11102  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
11103  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
11104 
// Dispatch to the kernel selection with the evaluated operands and the scaling factor.
11105  DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
11106  }
11107  //**********************************************************************************************
11108 
11109  //**Subtraction assignment to dense matrices (kernel selection)*********************************
// Kernel selection for the subtraction assignment C -= scalar * A * B.
// Chooses between selectSmallSubAssignKernel() and selectBlasSubAssignKernel().
// NOTE(review): the line that opens the condition (the `if(` and whatever first clause it
// carries) is absent from this listing; only the three clauses below are visible.
11120  template< typename MT3 // Type of the left-hand side target matrix
11121  , typename MT4 // Type of the left-hand side matrix operand
11122  , typename MT5 // Type of the right-hand side matrix operand
11123  , typename ST2 > // Type of the scalar value
11124  static inline void selectSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11125  {
// Visible clauses that select the small kernel (any one suffices):
//   - not in debug mode, row-major target, and B has at most SIMDSIZE*10 columns;
//   - not in debug mode, column-major target, and A has at most SIMDSIZE*10 rows;
//   - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
11127  ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix<MT3>::value && B.columns() <= SIMDSIZE*10UL ) ||
11128  ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix<MT3>::value && A.rows() <= SIMDSIZE*10UL ) ||
11129  ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
11130  selectSmallSubAssignKernel( C, A, B, scalar );
11131  else
// Everything else goes to the BLAS kernel selection.
11132  selectBlasSubAssignKernel( C, A, B, scalar );
11133  }
11134  //**********************************************************************************************
11135 
11136  //**Default subtraction assignment to dense matrices********************************************
// Default subtraction assignment kernel for the case that neither A nor B is a diagonal matrix
// (enabled via the EnableIf_ condition below). The scaled product is first evaluated into a
// temporary and the temporary is then subtracted from C.
11150  template< typename MT3 // Type of the left-hand side target matrix
11151  , typename MT4 // Type of the left-hand side matrix operand
11152  , typename MT5 // Type of the right-hand side matrix operand
11153  , typename ST2 > // Type of the scalar value
11154  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
11155  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11156  {
// Serial evaluation of A * B * scalar into a temporary of the expression's ResultType.
11157  const ResultType tmp( serial( A * B * scalar ) );
11158  subAssign( C, tmp );
11159  }
11160  //**********************************************************************************************
11161 
11162  //**Default subtraction assignment to row-major dense matrices (general/diagonal)***************
// Default subtraction assignment kernel for a row-major target, a non-diagonal A and a
// diagonal B. Because B is diagonal, each element needs a single product only:
//   C(i,j) -= A(i,j) * B(j,j) * scalar
// The index space is traversed in square tiles of BLOCK_SIZE x BLOCK_SIZE elements.
11176  template< typename MT3 // Type of the left-hand side target matrix
11177  , typename MT4 // Type of the left-hand side matrix operand
11178  , typename MT5 // Type of the right-hand side matrix operand
11179  , typename ST2 > // Type of the scalar value
11180  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
11181  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
11182  {
11183  constexpr size_t block( BLOCK_SIZE );
11184 
11185  const size_t M( A.rows() );
11186  const size_t N( B.columns() );
11187 
// Outer loops walk the tiles; iend/jend clip the last tile to the matrix bounds.
11188  for( size_t ii=0UL; ii<M; ii+=block ) {
11189  const size_t iend( min( M, ii+block ) );
11190  for( size_t jj=0UL; jj<N; jj+=block ) {
11191  const size_t jend( min( N, jj+block ) );
11192  for( size_t i=ii; i<iend; ++i )
11193  {
// Restrict the column range of row i to the part of A that can be non-zero:
// for an upper A start at the diagonal (one past it if strictly upper) ...
11194  const size_t jbegin( ( IsUpper<MT4>::value )
11195  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
11196  :( jj ) );
// ... and for a lower A stop after the diagonal (before it if strictly lower).
11197  const size_t jpos( ( IsLower<MT4>::value )
11198  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
11199  :( jend ) );
11200 
11201  for( size_t j=jbegin; j<jpos; ++j ) {
11202  (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
11203  }
11204  }
11205  }
11206  }
11207  }
11208  //**********************************************************************************************
11209 
11210  //**Default subtraction assignment to column-major dense matrices (general/diagonal)************
// Default subtraction assignment kernel for a column-major target, a non-diagonal A and a
// diagonal B. Each element needs a single product:
//   C(i,j) -= A(i,j) * B(j,j) * scalar
// The matrix is processed column by column, two rows per iteration.
11224  template< typename MT3 // Type of the left-hand side target matrix
11225  , typename MT4 // Type of the left-hand side matrix operand
11226  , typename MT5 // Type of the right-hand side matrix operand
11227  , typename ST2 > // Type of the scalar value
11228  static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
11229  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
11230  {
11231  const size_t M( A.rows() );
11232  const size_t N( B.columns() );
11233 
11234  for( size_t j=0UL; j<N; ++j )
11235  {
// Row range of column j, restricted by the triangular structure of A:
// a lower A starts at the diagonal (one below it if strictly lower) ...
11236  const size_t ibegin( ( IsLower<MT4>::value )
11237  ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
11238  :( 0UL ) );
// ... an upper A ends after the diagonal (before it if strictly upper).
11239  const size_t iend( ( IsUpper<MT4>::value )
11240  ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
11241  :( M ) );
11242  BLAZE_INTERNAL_ASSERT( ibegin <= iend, "Invalid loop indices detected" );
11243 
// inum & size_t(-2) clears the lowest bit, i.e. rounds the row count down to an even number;
// ipos is therefore the end of the part that can be handled two rows at a time.
11244  const size_t inum( iend - ibegin );
11245  const size_t ipos( ibegin + ( inum & size_t(-2) ) );
11246 
11247  for( size_t i=ibegin; i<ipos; i+=2UL ) {
11248  (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
11249  (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
11250  }
// At most one row is left over when the row count is odd.
11251  if( ipos < iend ) {
11252  (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
11253  }
11254  }
11255  }
11256  //**********************************************************************************************
11257 
11258  //**Default subtraction assignment to row-major dense matrices (diagonal/general)***************
// Default subtraction assignment kernel for a row-major target, a diagonal A and a
// non-diagonal B. Each element needs a single product:
//   C(i,j) -= A(i,i) * B(i,j) * scalar
// The matrix is processed row by row, two columns per iteration.
11272  template< typename MT3 // Type of the left-hand side target matrix
11273  , typename MT4 // Type of the left-hand side matrix operand
11274  , typename MT5 // Type of the right-hand side matrix operand
11275  , typename ST2 > // Type of the scalar value
11276  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
11277  selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
11278  {
11279  const size_t M( A.rows() );
11280  const size_t N( B.columns() );
11281 
11282  for( size_t i=0UL; i<M; ++i )
11283  {
// Column range of row i, restricted by the triangular structure of B:
// an upper B starts at the diagonal (one past it if strictly upper) ...
11284  const size_t jbegin( ( IsUpper<MT5>::value )
11285  ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
11286  :( 0UL ) );
// ... a lower B ends after the diagonal (before it if strictly lower).
11287  const size_t jend( ( IsLower<MT5>::value )
11288  ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
11289  :( N ) );
11290  BLAZE_INTERNAL_ASSERT( jbegin <= jend, "Invalid loop indices detected" );
11291 
// jnum & size_t(-2) rounds the column count down to an even number; jpos marks the end of
// the part that can be handled two columns at a time.
11292  const size_t jnum( jend - jbegin );
11293  const size_t jpos( jbegin + ( jnum & size_t(-2) ) );
11294 
11295  for( size_t j=jbegin; j<jpos; j+=2UL ) {
11296  (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
11297  (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
11298  }
// At most one column is left over when the column count is odd.
11299  if( jpos < jend ) {
11300  (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
11301  }
11302  }
11303  }
11304  //**********************************************************************************************
11305 
11306  //**Default subtraction assignment to column-major dense matrices (diagonal/general)************
// Default subtraction assignment kernel for a column-major target, a diagonal A and a
// non-diagonal B. Each element needs a single product:
//   C(i,j) -= A(i,i) * B(i,j) * scalar
// The index space is traversed in square tiles of BLOCK_SIZE x BLOCK_SIZE elements.
11320  template< typename MT3 // Type of the left-hand side target matrix
11321  , typename MT4 // Type of the left-hand side matrix operand
11322  , typename MT5 // Type of the right-hand side matrix operand
11323  , typename ST2 > // Type of the scalar value
11324  static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
11325  selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
11326  {
11327  constexpr size_t block( BLOCK_SIZE );
11328 
11329  const size_t M( A.rows() );
11330  const size_t N( B.columns() );
11331 
// Outer loops walk the tiles (columns first); jend/iend clip the last tile to the bounds.
11332  for( size_t jj=0UL; jj<N; jj+=block ) {
11333  const size_t jend( min( N, jj+block ) );
11334  for( size_t ii=0UL; ii<M; ii+=block ) {
11335  const size_t iend( min( M, ii+block ) );
11336  for( size_t j=jj; j<jend; ++j )
11337  {
// Restrict the row range of column j to the part of B that can be non-zero:
// for a lower B start at the diagonal (one below it if strictly lower) ...
11338  const size_t ibegin( ( IsLower<MT5>::value )
11339  ?( max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
11340  :( ii ) );
// ... and for an upper B stop after the diagonal (before it if strictly upper).
11341  const size_t ipos( ( IsUpper<MT5>::value )
11342  ?( min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
11343  :( iend ) );
11344 
11345  for( size_t i=ibegin; i<ipos; ++i ) {
11346  (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
11347  }
11348  }
11349  }
11350  }
11351  }
11352  //**********************************************************************************************
11353 
11354  //**Default subtraction assignment to dense matrices (diagonal/diagonal)************************
// Default subtraction assignment kernel for the case that both A and B are diagonal matrices
// (enabled via the EnableIf_ condition below). Only the diagonal elements of C are touched:
//   C(i,i) -= A(i,i) * B(i,i) * scalar
11368  template< typename MT3 // Type of the left-hand side target matrix
11369  , typename MT4 // Type of the left-hand side matrix operand
11370  , typename MT5 // Type of the right-hand side matrix operand
11371  , typename ST2 > // Type of the scalar value
11372  static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
11373  selectDefaultSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11374  {
// One update per diagonal position; the loop bound is the number of rows of A.
11375  for( size_t i=0UL; i<A.rows(); ++i ) {
11376  C(i,i) -= A(i,i) * B(i,i) * scalar;
11377  }
11378  }
11379  //**********************************************************************************************
11380 
11381  //**Default subtraction assignment to dense matrices (small matrices)***************************
// Fallback overload of the small-matrix subtraction assignment kernel. It performs no
// computation of its own: all four arguments are forwarded unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): the line carrying the return type / enable condition is absent from this
// listing, so the condition under which this overload is chosen cannot be confirmed from here.
11395  template< typename MT3 // Type of the left-hand side target matrix
11396  , typename MT4 // Type of the left-hand side matrix operand
11397  , typename MT5 // Type of the right-hand side matrix operand
11398  , typename ST2 > // Type of the scalar value
11400  selectSmallSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
11401  {
// Pure delegation to the default kernel.
11402  selectDefaultSubAssignKernel( C, A, B, scalar );
11403  }
11404  //**********************************************************************************************
11405 
11406  //**Vectorized default subtraction assignment to row-major dense matrices (small matrices)******
11421  template< typename MT3 // Type of the left-hand side target matrix
11422  , typename MT4 // Type of the left-hand side matrix operand
11423  , typename MT5 // Type of the right-hand side matrix operand
11424  , typename ST2 > // Type of the scalar value
11426  selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C, const MT4& A, const MT5& B, ST2 scalar )
11427  {
11428  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
11429 
11430  const size_t M( A.rows() );
11431  const size_t N( B.columns() );
11432  const size_t K( A.columns() );
11433 
11434  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
11435 
11436  const size_t jpos( remainder ? ( N & size_t(-SIMDSIZE) ) : N );
11437  BLAZE_INTERNAL_ASSERT( !remainder || ( N - ( N % SIMDSIZE ) ) == jpos, "Invalid end calculation" );
11438 
11439  const SIMDType factor( set( scalar ) );
11440 
11441  size_t j( 0UL );
11442 
11444  {
11445  for( ; !LOW && !UPP && (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
11446  for( size_t i=0UL; i<M; ++i )
11447  {
11448  const size_t kbegin( ( IsUpper<MT4>::value )
11449  ?( ( IsLower<MT5>::value )
11450  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11451  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11452  :( IsLower<MT5>::value ? j : 0UL ) );
11453  const size_t kend( ( IsLower<MT4>::value )
11454  ?( ( IsUpper<MT5>::value )
11455  ?( min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
11456  :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
11457  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*8UL, K ) : K ) );
11458 
11459  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11460 
11461  for( size_t k=kbegin; k<kend; ++k ) {
11462  const SIMDType a1( set( A(i,k) ) );
11463  xmm1 += a1 * B.load(k,j );
11464  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11465  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11466  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11467  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11468  xmm6 += a1 * B.load(k,j+SIMDSIZE*5UL);
11469  xmm7 += a1 * B.load(k,j+SIMDSIZE*6UL);
11470  xmm8 += a1 * B.load(k,j+SIMDSIZE*7UL);
11471  }
11472 
11473  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11474  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11475  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11476  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11477  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11478  (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
11479  (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
11480  (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
11481  }
11482  }
11483  }
11484 
11485  for( ; !LOW && !UPP && (j+SIMDSIZE*4UL) < jpos; j+=SIMDSIZE*5UL )
11486  {
11487  size_t i( 0UL );
11488 
11489  for( ; (i+2UL) <= M; i+=2UL )
11490  {
11491  const size_t kbegin( ( IsUpper<MT4>::value )
11492  ?( ( IsLower<MT5>::value )
11493  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11494  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11495  :( IsLower<MT5>::value ? j : 0UL ) );
11496  const size_t kend( ( IsLower<MT4>::value )
11497  ?( ( IsUpper<MT5>::value )
11498  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*5UL, K ) )
11499  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
11500  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*5UL, K ) : K ) );
11501 
11502  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
11503 
11504  for( size_t k=kbegin; k<kend; ++k ) {
11505  const SIMDType a1( set( A(i ,k) ) );
11506  const SIMDType a2( set( A(i+1UL,k) ) );
11507  const SIMDType b1( B.load(k,j ) );
11508  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11509  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11510  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11511  const SIMDType b5( B.load(k,j+SIMDSIZE*4UL) );
11512  xmm1 += a1 * b1;
11513  xmm2 += a1 * b2;
11514  xmm3 += a1 * b3;
11515  xmm4 += a1 * b4;
11516  xmm5 += a1 * b5;
11517  xmm6 += a2 * b1;
11518  xmm7 += a2 * b2;
11519  xmm8 += a2 * b3;
11520  xmm9 += a2 * b4;
11521  xmm10 += a2 * b5;
11522  }
11523 
11524  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11525  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11526  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11527  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11528  (~C).store( i , j+SIMDSIZE*4UL, (~C).load(i ,j+SIMDSIZE*4UL) - xmm5 * factor );
11529  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm6 * factor );
11530  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm7 * factor );
11531  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm8 * factor );
11532  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm9 * factor );
11533  (~C).store( i+1UL, j+SIMDSIZE*4UL, (~C).load(i+1UL,j+SIMDSIZE*4UL) - xmm10 * factor );
11534  }
11535 
11536  if( i < M )
11537  {
11538  const size_t kbegin( ( IsUpper<MT4>::value )
11539  ?( ( IsLower<MT5>::value )
11540  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11541  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11542  :( IsLower<MT5>::value ? j : 0UL ) );
11543  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*5UL, K ) ):( K ) );
11544 
11545  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
11546 
11547  for( size_t k=kbegin; k<kend; ++k ) {
11548  const SIMDType a1( set( A(i,k) ) );
11549  xmm1 += a1 * B.load(k,j );
11550  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11551  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11552  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11553  xmm5 += a1 * B.load(k,j+SIMDSIZE*4UL);
11554  }
11555 
11556  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11557  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11558  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11559  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11560  (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
11561  }
11562  }
11563 
11564  for( ; !LOW && !UPP && (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
11565  {
11566  size_t i( 0UL );
11567 
11568  for( ; (i+2UL) <= M; i+=2UL )
11569  {
11570  const size_t kbegin( ( IsUpper<MT4>::value )
11571  ?( ( IsLower<MT5>::value )
11572  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11573  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11574  :( IsLower<MT5>::value ? j : 0UL ) );
11575  const size_t kend( ( IsLower<MT4>::value )
11576  ?( ( IsUpper<MT5>::value )
11577  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
11578  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
11579  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*4UL, K ) : K ) );
11580 
11581  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11582 
11583  for( size_t k=kbegin; k<kend; ++k ) {
11584  const SIMDType a1( set( A(i ,k) ) );
11585  const SIMDType a2( set( A(i+1UL,k) ) );
11586  const SIMDType b1( B.load(k,j ) );
11587  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11588  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11589  const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
11590  xmm1 += a1 * b1;
11591  xmm2 += a1 * b2;
11592  xmm3 += a1 * b3;
11593  xmm4 += a1 * b4;
11594  xmm5 += a2 * b1;
11595  xmm6 += a2 * b2;
11596  xmm7 += a2 * b3;
11597  xmm8 += a2 * b4;
11598  }
11599 
11600  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11601  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11602  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11603  (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
11604  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
11605  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
11606  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
11607  (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
11608  }
11609 
11610  if( i < M )
11611  {
11612  const size_t kbegin( ( IsUpper<MT4>::value )
11613  ?( ( IsLower<MT5>::value )
11614  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11615  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11616  :( IsLower<MT5>::value ? j : 0UL ) );
11617  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*4UL, K ) ):( K ) );
11618 
11619  SIMDType xmm1, xmm2, xmm3, xmm4;
11620 
11621  for( size_t k=kbegin; k<kend; ++k ) {
11622  const SIMDType a1( set( A(i,k) ) );
11623  xmm1 += a1 * B.load(k,j );
11624  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11625  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11626  xmm4 += a1 * B.load(k,j+SIMDSIZE*3UL);
11627  }
11628 
11629  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11630  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11631  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11632  (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
11633  }
11634  }
11635 
11636  for( ; !LOW && !UPP && (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
11637  {
11638  size_t i( 0UL );
11639 
11640  for( ; (i+2UL) <= M; i+=2UL )
11641  {
11642  const size_t kbegin( ( IsUpper<MT4>::value )
11643  ?( ( IsLower<MT5>::value )
11644  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11645  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11646  :( IsLower<MT5>::value ? j : 0UL ) );
11647  const size_t kend( ( IsLower<MT4>::value )
11648  ?( ( IsUpper<MT5>::value )
11649  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*3UL, K ) )
11650  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
11651  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*3UL, K ) : K ) );
11652 
11653  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11654 
11655  for( size_t k=kbegin; k<kend; ++k ) {
11656  const SIMDType a1( set( A(i ,k) ) );
11657  const SIMDType a2( set( A(i+1UL,k) ) );
11658  const SIMDType b1( B.load(k,j ) );
11659  const SIMDType b2( B.load(k,j+SIMDSIZE ) );
11660  const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
11661  xmm1 += a1 * b1;
11662  xmm2 += a1 * b2;
11663  xmm3 += a1 * b3;
11664  xmm4 += a2 * b1;
11665  xmm5 += a2 * b2;
11666  xmm6 += a2 * b3;
11667  }
11668 
11669  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11670  (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
11671  (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
11672  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm4 * factor );
11673  (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm5 * factor );
11674  (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm6 * factor );
11675  }
11676 
11677  if( i < M )
11678  {
11679  const size_t kbegin( ( IsUpper<MT4>::value )
11680  ?( ( IsLower<MT5>::value )
11681  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11682  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11683  :( IsLower<MT5>::value ? j : 0UL ) );
11684  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*3UL, K ) ):( K ) );
11685 
11686  SIMDType xmm1, xmm2, xmm3;
11687 
11688  for( size_t k=kbegin; k<kend; ++k ) {
11689  const SIMDType a1( set( A(i,k) ) );
11690  xmm1 += a1 * B.load(k,j );
11691  xmm2 += a1 * B.load(k,j+SIMDSIZE );
11692  xmm3 += a1 * B.load(k,j+SIMDSIZE*2UL);
11693  }
11694 
11695  (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
11696  (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
11697  (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
11698  }
11699  }
11700 
11701  for( ; !( LOW && UPP ) && (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
11702  {
11703  const size_t iend( UPP ? min(j+SIMDSIZE*2UL,M) : M );
11704  size_t i( LOW ? j : 0UL );
11705 
11706  for( ; (i+4UL) <= iend; i+=4UL )
11707  {
11708  const size_t kbegin( ( IsUpper<MT4>::value )
11709  ?( ( IsLower<MT5>::value )
11710  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11711  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11712  :( IsLower<MT5>::value ? j : 0UL ) );
11713  const size_t kend( ( IsLower<MT4>::value )
11714  ?( ( IsUpper<MT5>::value )
11715  ?( min( ( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ), j+SIMDSIZE*2UL, K ) )
11716  :( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL ) )
11717  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
11718 
11719  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11720 
11721  for( size_t k=kbegin; k<kend; ++k ) {
11722  const SIMDType a1( set( A(i ,k) ) );
11723  const SIMDType a2( set( A(i+1UL,k) ) );
11724  const SIMDType a3( set( A(i+2UL,k) ) );
11725  const SIMDType a4( set( A(i+3UL,k) ) );
11726  const SIMDType b1( B.load(k,j ) );
11727  const SIMDType b2( B.load(k,j+SIMDSIZE) );
11728  xmm1 += a1 * b1;
11729  xmm2 += a1 * b2;
11730  xmm3 += a2 * b1;
11731  xmm4 += a2 * b2;
11732  xmm5 += a3 * b1;
11733  xmm6 += a3 * b2;
11734  xmm7 += a4 * b1;
11735  xmm8 += a4 * b2;
11736  }
11737 
11738  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11739  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
11740  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
11741  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
11742  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
11743  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
11744  (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
11745  (~C).store( i+3UL, j+SIMDSIZE, (~C).load(i+3UL,j+SIMDSIZE) - xmm8 * factor );
11746  }
11747 
11748  for( ; (i+3UL) <= iend; i+=3UL )
11749  {
11750  const size_t kbegin( ( IsUpper<MT4>::value )
11751  ?( ( IsLower<MT5>::value )
11752  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11753  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11754  :( IsLower<MT5>::value ? j : 0UL ) );
11755  const size_t kend( ( IsLower<MT4>::value )
11756  ?( ( IsUpper<MT5>::value )
11757  ?( min( ( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ), j+SIMDSIZE*2UL, K ) )
11758  :( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL ) )
11759  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
11760 
11761  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11762 
11763  for( size_t k=kbegin; k<kend; ++k ) {
11764  const SIMDType a1( set( A(i ,k) ) );
11765  const SIMDType a2( set( A(i+1UL,k) ) );
11766  const SIMDType a3( set( A(i+2UL,k) ) );
11767  const SIMDType b1( B.load(k,j ) );
11768  const SIMDType b2( B.load(k,j+SIMDSIZE) );
11769  xmm1 += a1 * b1;
11770  xmm2 += a1 * b2;
11771  xmm3 += a2 * b1;
11772  xmm4 += a2 * b2;
11773  xmm5 += a3 * b1;
11774  xmm6 += a3 * b2;
11775  }
11776 
11777  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
11778  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
11779  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
11780  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
11781  (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
11782  (~C).store( i+2UL, j+SIMDSIZE, (~C).load(i+2UL,j+SIMDSIZE) - xmm6 * factor );
11783  }
11784 
11785  for( ; (i+2UL) <= iend; i+=2UL )
11786  {
11787  const size_t kbegin( ( IsUpper<MT4>::value )
11788  ?( ( IsLower<MT5>::value )
11789  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11790  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11791  :( IsLower<MT5>::value ? j : 0UL ) );
11792  const size_t kend( ( IsLower<MT4>::value )
11793  ?( ( IsUpper<MT5>::value )
11794  ?( min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
11795  :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
11796  :( IsUpper<MT5>::value ? min( j+SIMDSIZE*2UL, K ) : K ) );
11797 
11798  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11799  size_t k( kbegin );
11800 
11801  for( ; (k+2UL) <= kend; k+=2UL ) {
11802  const SIMDType a1( set( A(i ,k ) ) );
11803  const SIMDType a2( set( A(i+1UL,k ) ) );
11804  const SIMDType a3( set( A(i ,k+1UL) ) );
11805  const SIMDType a4( set( A(i+1UL,k+1UL) ) );
11806  const SIMDType b1( B.load(k ,j ) );
11807  const SIMDType b2( B.load(k ,j+SIMDSIZE) );
11808  const SIMDType b3( B.load(k+1UL,j ) );
11809  const SIMDType b4( B.load(k+1UL,j+SIMDSIZE) );
11810  xmm1 += a1 * b1;
11811  xmm2 += a1 * b2;
11812  xmm3 += a2 * b1;
11813  xmm4 += a2 * b2;
11814  xmm5 += a3 * b3;
11815  xmm6 += a3 * b4;
11816  xmm7 += a4 * b3;
11817  xmm8 += a4 * b4;
11818  }
11819 
11820  for( ; k<kend; ++k ) {
11821  const SIMDType a1( set( A(i ,k) ) );
11822  const SIMDType a2( set( A(i+1UL,k) ) );
11823  const SIMDType b1( B.load(k,j ) );
11824  const SIMDType b2( B.load(k,j+SIMDSIZE) );
11825  xmm1 += a1 * b1;
11826  xmm2 += a1 * b2;
11827  xmm3 += a2 * b1;
11828  xmm4 += a2 * b2;
11829  }
11830 
11831  (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
11832  (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - (xmm2+xmm6) * factor );
11833  (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - (xmm3+xmm7) * factor );
11834  (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - (xmm4+xmm8) * factor );
11835  }
11836 
11837  if( i < iend )
11838  {
11839  const size_t kbegin( ( IsUpper<MT4>::value )
11840  ?( ( IsLower<MT5>::value )
11841  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11842  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11843  :( IsLower<MT5>::value ? j : 0UL ) );
11844  const size_t kend( ( IsUpper<MT5>::value )?( min( j+SIMDSIZE*2UL, K ) ):( K ) );
11845 
11846  SIMDType xmm1, xmm2, xmm3, xmm4;
11847  size_t k( kbegin );
11848 
11849  for( ; (k+2UL) <= kend; k+=2UL ) {
11850  const SIMDType a1( set( A(i,k ) ) );
11851  const SIMDType a2( set( A(i,k+1UL) ) );
11852  xmm1 += a1 * B.load(k ,j );
11853  xmm2 += a1 * B.load(k ,j+SIMDSIZE);
11854  xmm3 += a2 * B.load(k+1UL,j );
11855  xmm4 += a2 * B.load(k+1UL,j+SIMDSIZE);
11856  }
11857 
11858  for( ; k<kend; ++k ) {
11859  const SIMDType a1( set( A(i,k) ) );
11860  xmm1 += a1 * B.load(k,j );
11861  xmm2 += a1 * B.load(k,j+SIMDSIZE);
11862  }
11863 
11864  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
11865  (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - (xmm2+xmm4) * factor );
11866  }
11867  }
11868 
11869  for( ; j<jpos; j+=SIMDSIZE )
11870  {
11871  const size_t iend( LOW && UPP ? min(j+SIMDSIZE,M) : M );
11872  size_t i( LOW ? j : 0UL );
11873 
11874  for( ; (i+4UL) <= iend; i+=4UL )
11875  {
11876  const size_t kbegin( ( IsUpper<MT4>::value )
11877  ?( ( IsLower<MT5>::value )
11878  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11879  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11880  :( IsLower<MT5>::value ? j : 0UL ) );
11881  const size_t kend( ( IsLower<MT4>::value )
11882  ?( IsStrictlyLower<MT4>::value ? i+3UL : i+4UL )
11883  :( K ) );
11884 
11885  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11886  size_t k( kbegin );
11887 
11888  for( ; (k+2UL) <= kend; k+=2UL ) {
11889  const SIMDType b1( B.load(k ,j) );
11890  const SIMDType b2( B.load(k+1UL,j) );
11891  xmm1 += set( A(i ,k ) ) * b1;
11892  xmm2 += set( A(i+1UL,k ) ) * b1;
11893  xmm3 += set( A(i+2UL,k ) ) * b1;
11894  xmm4 += set( A(i+3UL,k ) ) * b1;
11895  xmm5 += set( A(i ,k+1UL) ) * b2;
11896  xmm6 += set( A(i+1UL,k+1UL) ) * b2;
11897  xmm7 += set( A(i+2UL,k+1UL) ) * b2;
11898  xmm8 += set( A(i+3UL,k+1UL) ) * b2;
11899  }
11900 
11901  for( ; k<kend; ++k ) {
11902  const SIMDType b1( B.load(k,j) );
11903  xmm1 += set( A(i ,k) ) * b1;
11904  xmm2 += set( A(i+1UL,k) ) * b1;
11905  xmm3 += set( A(i+2UL,k) ) * b1;
11906  xmm4 += set( A(i+3UL,k) ) * b1;
11907  }
11908 
11909  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm5) * factor );
11910  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm6) * factor );
11911  (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm7) * factor );
11912  (~C).store( i+3UL, j, (~C).load(i+3UL,j) - (xmm4+xmm8) * factor );
11913  }
11914 
11915  for( ; (i+3UL) <= iend; i+=3UL )
11916  {
11917  const size_t kbegin( ( IsUpper<MT4>::value )
11918  ?( ( IsLower<MT5>::value )
11919  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11920  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11921  :( IsLower<MT5>::value ? j : 0UL ) );
11922  const size_t kend( ( IsLower<MT4>::value )
11923  ?( IsStrictlyLower<MT4>::value ? i+2UL : i+3UL )
11924  :( K ) );
11925 
11926  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11927  size_t k( kbegin );
11928 
11929  for( ; (k+2UL) <= kend; k+=2UL ) {
11930  const SIMDType b1( B.load(k ,j) );
11931  const SIMDType b2( B.load(k+1UL,j) );
11932  xmm1 += set( A(i ,k ) ) * b1;
11933  xmm2 += set( A(i+1UL,k ) ) * b1;
11934  xmm3 += set( A(i+2UL,k ) ) * b1;
11935  xmm4 += set( A(i ,k+1UL) ) * b2;
11936  xmm5 += set( A(i+1UL,k+1UL) ) * b2;
11937  xmm6 += set( A(i+2UL,k+1UL) ) * b2;
11938  }
11939 
11940  for( ; k<kend; ++k ) {
11941  const SIMDType b1( B.load(k,j) );
11942  xmm1 += set( A(i ,k) ) * b1;
11943  xmm2 += set( A(i+1UL,k) ) * b1;
11944  xmm3 += set( A(i+2UL,k) ) * b1;
11945  }
11946 
11947  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm4) * factor );
11948  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm5) * factor );
11949  (~C).store( i+2UL, j, (~C).load(i+2UL,j) - (xmm3+xmm6) * factor );
11950  }
11951 
11952  for( ; (i+2UL) <= iend; i+=2UL )
11953  {
11954  const size_t kbegin( ( IsUpper<MT4>::value )
11955  ?( ( IsLower<MT5>::value )
11956  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11957  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11958  :( IsLower<MT5>::value ? j : 0UL ) );
11959  const size_t kend( ( IsLower<MT4>::value )
11960  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
11961  :( K ) );
11962 
11963  SIMDType xmm1, xmm2, xmm3, xmm4;
11964  size_t k( kbegin );
11965 
11966  for( ; (k+2UL) <= kend; k+=2UL ) {
11967  const SIMDType b1( B.load(k ,j) );
11968  const SIMDType b2( B.load(k+1UL,j) );
11969  xmm1 += set( A(i ,k ) ) * b1;
11970  xmm2 += set( A(i+1UL,k ) ) * b1;
11971  xmm3 += set( A(i ,k+1UL) ) * b2;
11972  xmm4 += set( A(i+1UL,k+1UL) ) * b2;
11973  }
11974 
11975  for( ; k<kend; ++k ) {
11976  const SIMDType b1( B.load(k,j) );
11977  xmm1 += set( A(i ,k) ) * b1;
11978  xmm2 += set( A(i+1UL,k) ) * b1;
11979  }
11980 
11981  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
11982  (~C).store( i+1UL, j, (~C).load(i+1UL,j) - (xmm2+xmm4) * factor );
11983  }
11984 
11985  if( i < iend )
11986  {
11987  const size_t kbegin( ( IsUpper<MT4>::value )
11988  ?( ( IsLower<MT5>::value )
11989  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
11990  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
11991  :( IsLower<MT5>::value ? j : 0UL ) );
11992 
11993  SIMDType xmm1, xmm2;
11994  size_t k( kbegin );
11995 
11996  for( ; (k+2UL) <= K; k+=2UL ) {
11997  xmm1 += set( A(i,k ) ) * B.load(k ,j);
11998  xmm2 += set( A(i,k+1UL) ) * B.load(k+1UL,j);
11999  }
12000 
12001  for( ; k<K; ++k ) {
12002  xmm1 += set( A(i,k) ) * B.load(k,j);
12003  }
12004 
12005  (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
12006  }
12007  }
12008 
12009  for( ; remainder && j<N; ++j )
12010  {
12011  const size_t iend( UPP ? j+1UL : M );
12012  size_t i( LOW ? j : 0UL );
12013 
12014  for( ; (i+2UL) <= iend; i+=2UL )
12015  {
12016  const size_t kbegin( ( IsUpper<MT4>::value )
12017  ?( ( IsLower<MT5>::value )
12018  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
12019  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
12020  :( IsLower<MT5>::value ? j : 0UL ) );
12021  const size_t kend( ( IsLower<MT4>::value )
12022  ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
12023  :( K ) );
12024 
12025  ElementType value1{};
12026  ElementType value2{};
12027 
12028  for( size_t k=kbegin; k<kend; ++k ) {
12029  value1 += A(i ,k) * B(k,j);
12030  value2 += A(i+1UL,k) * B(k,j);
12031  }
12032 
12033  (~C)(i ,j) -= value1 * scalar;
12034  (~C)(i+1UL,j) -= value2 * scalar;
12035  }
12036 
12037  if( i < iend )
12038  {
12039  const size_t kbegin( ( IsUpper<MT4>::value )
12040  ?( ( IsLower<MT5>::value )
12041  ?( max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
12042  :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
12043  :( IsLower<MT5>::value ? j : 0UL ) );
12044 
12045  ElementType value{};
12046 
12047  for( size_t k=kbegin; k<K; ++k ) {
12048  value += A(i,k) * B(k,j);
12049  }
12050 
12051  (~C)(i,j) -= value * scalar;
12052  }
12053  }
12054  }
12055  //**********************************************************************************************
12056 
12057  //**Vectorized default subtraction assignment to column-major dense matrices (small matrices)***
12072  template< typename MT3 // Type of the left-hand side target matrix
12073  , typename MT4 // Type of the left-hand side matrix operand
12074  , typename MT5 // Type of the right-hand side matrix operand
12075  , typename ST2 > // Type of the scalar value
12077  selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C, const MT4& A, const MT5& B, ST2 scalar )
12078  {
12079  constexpr bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
12080 
12081  const size_t M( A.rows() );
12082  const size_t N( B.columns() );
12083  const size_t K( A.columns() );
12084 
12085  BLAZE_INTERNAL_ASSERT( !( LOW || UPP ) || ( M == N ), "Broken invariant detected" );
12086 
12087  const size_t ipos( remainder ? ( M & size_t(-SIMDSIZE) ) : M );
12088  BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos, "Invalid end calculation" );
12089 
12090  const SIMDType factor( set( scalar ) );
12091 
12092  size_t i( 0UL );
12093 
12095  {
12096  for( ; !LOW && !UPP && (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
12097  for( size_t j=0UL; j<N; ++j )
12098  {
12099  const size_t kbegin( ( IsLower<MT5>::value )
12100  ?( ( IsUpper<MT4>::value )
12101  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12102  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12103  :( IsUpper<MT4>::value ? i : 0UL ) );
12104  const size_t kend( ( IsUpper<MT5>::value )
12105  ?( ( IsLower<MT4>::value )
12106  ?( min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
12107  :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
12108  :( IsLower<MT4>::value ? min( i+SIMDSIZE*8UL, K ) : K ) );
12109 
12110  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12111 
12112  for( size_t k=kbegin; k<kend; ++k ) {
12113  const SIMDType b1( set( B(k,j) ) );
12114  xmm1 += A.load(i ,k) * b1;
12115  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12116  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12117  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12118  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12119  xmm6 += A.load(i+SIMDSIZE*5UL,k) * b1;
12120  xmm7 += A.load(i+SIMDSIZE*6UL,k) * b1;
12121  xmm8 += A.load(i+SIMDSIZE*7UL,k) * b1;
12122  }
12123 
12124  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12125  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12126  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12127  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12128  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12129  (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
12130  (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
12131  (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
12132  }
12133  }
12134  }
12135 
12136  for( ; !LOW && !UPP && (i+SIMDSIZE*4UL) < ipos; i+=SIMDSIZE*5UL )
12137  {
12138  size_t j( 0UL );
12139 
12140  for( ; (j+2UL) <= N; j+=2UL )
12141  {
12142  const size_t kbegin( ( IsLower<MT5>::value )
12143  ?( ( IsUpper<MT4>::value )
12144  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12145  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12146  :( IsUpper<MT4>::value ? i : 0UL ) );
12147  const size_t kend( ( IsUpper<MT5>::value )
12148  ?( ( IsLower<MT4>::value )
12149  ?( min( i+SIMDSIZE*5UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
12150  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
12151  :( IsLower<MT4>::value ? min( i+SIMDSIZE*5UL, K ) : K ) );
12152 
12153  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
12154 
12155  for( size_t k=kbegin; k<kend; ++k ) {
12156  const SIMDType a1( A.load(i ,k) );
12157  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12158  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12159  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12160  const SIMDType a5( A.load(i+SIMDSIZE*4UL,k) );
12161  const SIMDType b1( set( B(k,j ) ) );
12162  const SIMDType b2( set( B(k,j+1UL) ) );
12163  xmm1 += a1 * b1;
12164  xmm2 += a2 * b1;
12165  xmm3 += a3 * b1;
12166  xmm4 += a4 * b1;
12167  xmm5 += a5 * b1;
12168  xmm6 += a1 * b2;
12169  xmm7 += a2 * b2;
12170  xmm8 += a3 * b2;
12171  xmm9 += a4 * b2;
12172  xmm10 += a5 * b2;
12173  }
12174 
12175  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12176  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12177  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12178  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12179  (~C).store( i+SIMDSIZE*4UL, j , (~C).load(i+SIMDSIZE*4UL,j ) - xmm5 * factor );
12180  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm6 * factor );
12181  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm7 * factor );
12182  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
12183  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
12184  (~C).store( i+SIMDSIZE*4UL, j+1UL, (~C).load(i+SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
12185  }
12186 
12187  if( j < N )
12188  {
12189  const size_t kbegin( ( IsLower<MT5>::value )
12190  ?( ( IsUpper<MT4>::value )
12191  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12192  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12193  :( IsUpper<MT4>::value ? i : 0UL ) );
12194  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*5UL, K ) ):( K ) );
12195 
12196  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
12197 
12198  for( size_t k=kbegin; k<kend; ++k ) {
12199  const SIMDType b1( set( B(k,j) ) );
12200  xmm1 += A.load(i ,k) * b1;
12201  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12202  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12203  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12204  xmm5 += A.load(i+SIMDSIZE*4UL,k) * b1;
12205  }
12206 
12207  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12208  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12209  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12210  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12211  (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
12212  }
12213  }
12214 
12215  for( ; !LOW && !UPP && (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
12216  {
12217  size_t j( 0UL );
12218 
12219  for( ; (j+2UL) <= N; j+=2UL )
12220  {
12221  const size_t kbegin( ( IsLower<MT5>::value )
12222  ?( ( IsUpper<MT4>::value )
12223  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12224  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12225  :( IsUpper<MT4>::value ? i : 0UL ) );
12226  const size_t kend( ( IsUpper<MT5>::value )
12227  ?( ( IsLower<MT4>::value )
12228  ?( min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
12229  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
12230  :( IsLower<MT4>::value ? min( i+SIMDSIZE*4UL, K ) : K ) );
12231 
12232  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12233 
12234  for( size_t k=kbegin; k<kend; ++k ) {
12235  const SIMDType a1( A.load(i ,k) );
12236  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12237  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12238  const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
12239  const SIMDType b1( set( B(k,j ) ) );
12240  const SIMDType b2( set( B(k,j+1UL) ) );
12241  xmm1 += a1 * b1;
12242  xmm2 += a2 * b1;
12243  xmm3 += a3 * b1;
12244  xmm4 += a4 * b1;
12245  xmm5 += a1 * b2;
12246  xmm6 += a2 * b2;
12247  xmm7 += a3 * b2;
12248  xmm8 += a4 * b2;
12249  }
12250 
12251  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12252  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12253  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12254  (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
12255  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
12256  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
12257  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
12258  (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
12259  }
12260 
12261  if( j < N )
12262  {
12263  const size_t kbegin( ( IsLower<MT5>::value )
12264  ?( ( IsUpper<MT4>::value )
12265  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12266  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12267  :( IsUpper<MT4>::value ? i : 0UL ) );
12268  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*4UL, K ) ):( K ) );
12269 
12270  SIMDType xmm1, xmm2, xmm3, xmm4;
12271 
12272  for( size_t k=kbegin; k<kend; ++k ) {
12273  const SIMDType b1( set( B(k,j) ) );
12274  xmm1 += A.load(i ,k) * b1;
12275  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12276  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12277  xmm4 += A.load(i+SIMDSIZE*3UL,k) * b1;
12278  }
12279 
12280  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12281  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12282  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12283  (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
12284  }
12285  }
12286 
12287  for( ; !LOW && !UPP && (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
12288  {
12289  size_t j( 0UL );
12290 
12291  for( ; (j+2UL) <= N; j+=2UL )
12292  {
12293  const size_t kbegin( ( IsLower<MT5>::value )
12294  ?( ( IsUpper<MT4>::value )
12295  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12296  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12297  :( IsUpper<MT4>::value ? i : 0UL ) );
12298  const size_t kend( ( IsUpper<MT5>::value )
12299  ?( ( IsLower<MT4>::value )
12300  ?( min( i+SIMDSIZE*3UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
12301  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
12302  :( IsLower<MT4>::value ? min( i+SIMDSIZE*3UL, K ) : K ) );
12303 
12304  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12305 
12306  for( size_t k=kbegin; k<kend; ++k ) {
12307  const SIMDType a1( A.load(i ,k) );
12308  const SIMDType a2( A.load(i+SIMDSIZE ,k) );
12309  const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
12310  const SIMDType b1( set( B(k,j ) ) );
12311  const SIMDType b2( set( B(k,j+1UL) ) );
12312  xmm1 += a1 * b1;
12313  xmm2 += a2 * b1;
12314  xmm3 += a3 * b1;
12315  xmm4 += a1 * b2;
12316  xmm5 += a2 * b2;
12317  xmm6 += a3 * b2;
12318  }
12319 
12320  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12321  (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
12322  (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
12323  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm4 * factor );
12324  (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm5 * factor );
12325  (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
12326  }
12327 
12328  if( j < N )
12329  {
12330  const size_t kbegin( ( IsLower<MT5>::value )
12331  ?( ( IsUpper<MT4>::value )
12332  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12333  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12334  :( IsUpper<MT4>::value ? i : 0UL ) );
12335  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*3UL, K ) ):( K ) );
12336 
12337  SIMDType xmm1, xmm2, xmm3;
12338 
12339  for( size_t k=kbegin; k<kend; ++k ) {
12340  const SIMDType b1( set( B(k,j) ) );
12341  xmm1 += A.load(i ,k) * b1;
12342  xmm2 += A.load(i+SIMDSIZE ,k) * b1;
12343  xmm3 += A.load(i+SIMDSIZE*2UL,k) * b1;
12344  }
12345 
12346  (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
12347  (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
12348  (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
12349  }
12350  }
12351 
12352  for( ; !( LOW && UPP ) && (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
12353  {
12354  const size_t jend( LOW ? min(i+SIMDSIZE*2UL,N) : N );
12355  size_t j( UPP ? i : 0UL );
12356 
12357  for( ; (j+4UL) <= jend; j+=4UL )
12358  {
12359  const size_t kbegin( ( IsLower<MT5>::value )
12360  ?( ( IsUpper<MT4>::value )
12361  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12362  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12363  :( IsUpper<MT4>::value ? i : 0UL ) );
12364  const size_t kend( ( IsUpper<MT5>::value )
12365  ?( ( IsLower<MT4>::value )
12366  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) ) )
12367  :( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL ) )
12368  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
12369 
12370  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12371 
12372  for( size_t k=kbegin; k<kend; ++k ) {
12373  const SIMDType a1( A.load(i ,k) );
12374  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12375  const SIMDType b1( set( B(k,j ) ) );
12376  const SIMDType b2( set( B(k,j+1UL) ) );
12377  const SIMDType b3( set( B(k,j+2UL) ) );
12378  const SIMDType b4( set( B(k,j+3UL) ) );
12379  xmm1 += a1 * b1;
12380  xmm2 += a2 * b1;
12381  xmm3 += a1 * b2;
12382  xmm4 += a2 * b2;
12383  xmm5 += a1 * b3;
12384  xmm6 += a2 * b3;
12385  xmm7 += a1 * b4;
12386  xmm8 += a2 * b4;
12387  }
12388 
12389  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12390  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
12391  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
12392  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12393  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
12394  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12395  (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
12396  (~C).store( i+SIMDSIZE, j+3UL, (~C).load(i+SIMDSIZE,j+3UL) - xmm8 * factor );
12397  }
12398 
12399  for( ; (j+3UL) <= jend; j+=3UL )
12400  {
12401  const size_t kbegin( ( IsLower<MT5>::value )
12402  ?( ( IsUpper<MT4>::value )
12403  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12404  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12405  :( IsUpper<MT4>::value ? i : 0UL ) );
12406  const size_t kend( ( IsUpper<MT5>::value )
12407  ?( ( IsLower<MT4>::value )
12408  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) ) )
12409  :( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL ) )
12410  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
12411 
12412  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12413 
12414  for( size_t k=kbegin; k<kend; ++k ) {
12415  const SIMDType a1( A.load(i ,k) );
12416  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12417  const SIMDType b1( set( B(k,j ) ) );
12418  const SIMDType b2( set( B(k,j+1UL) ) );
12419  const SIMDType b3( set( B(k,j+2UL) ) );
12420  xmm1 += a1 * b1;
12421  xmm2 += a2 * b1;
12422  xmm3 += a1 * b2;
12423  xmm4 += a2 * b2;
12424  xmm5 += a1 * b3;
12425  xmm6 += a2 * b3;
12426  }
12427 
12428  (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
12429  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
12430  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
12431  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
12432  (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
12433  (~C).store( i+SIMDSIZE, j+2UL, (~C).load(i+SIMDSIZE,j+2UL) - xmm6 * factor );
12434  }
12435 
12436  for( ; (j+2UL) <= jend; j+=2UL )
12437  {
12438  const size_t kbegin( ( IsLower<MT5>::value )
12439  ?( ( IsUpper<MT4>::value )
12440  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12441  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12442  :( IsUpper<MT4>::value ? i : 0UL ) );
12443  const size_t kend( ( IsUpper<MT5>::value )
12444  ?( ( IsLower<MT4>::value )
12445  ?( min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
12446  :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
12447  :( IsLower<MT4>::value ? min( i+SIMDSIZE*2UL, K ) : K ) );
12448 
12449  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12450  size_t k( kbegin );
12451 
12452  for( ; (k+2UL) <= kend; k+=2UL ) {
12453  const SIMDType a1( A.load(i ,k ) );
12454  const SIMDType a2( A.load(i+SIMDSIZE,k ) );
12455  const SIMDType a3( A.load(i ,k+1UL) );
12456  const SIMDType a4( A.load(i+SIMDSIZE,k+1UL) );
12457  const SIMDType b1( set( B(k ,j ) ) );
12458  const SIMDType b2( set( B(k ,j+1UL) ) );
12459  const SIMDType b3( set( B(k+1UL,j ) ) );
12460  const SIMDType b4( set( B(k+1UL,j+1UL) ) );
12461  xmm1 += a1 * b1;
12462  xmm2 += a2 * b1;
12463  xmm3 += a1 * b2;
12464  xmm4 += a2 * b2;
12465  xmm5 += a3 * b3;
12466  xmm6 += a4 * b3;
12467  xmm7 += a3 * b4;
12468  xmm8 += a4 * b4;
12469  }
12470 
12471  for( ; k<kend; ++k ) {
12472  const SIMDType a1( A.load(i ,k) );
12473  const SIMDType a2( A.load(i+SIMDSIZE,k) );
12474  const SIMDType b1( set( B(k,j ) ) );
12475  const SIMDType b2( set( B(k,j+1UL) ) );
12476  xmm1 += a1 * b1;
12477  xmm2 += a2 * b1;
12478  xmm3 += a1 * b2;
12479  xmm4 += a2 * b2;
12480  }
12481 
12482  (~C).store( i , j , (~C).load(i ,j ) - (xmm1+xmm5) * factor );
12483  (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - (xmm2+xmm6) * factor );
12484  (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - (xmm3+xmm7) * factor );
12485  (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
12486  }
12487 
12488  if( j < jend )
12489  {
12490  const size_t kbegin( ( IsLower<MT5>::value )
12491  ?( ( IsUpper<MT4>::value )
12492  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12493  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12494  :( IsUpper<MT4>::value ? i : 0UL ) );
12495  const size_t kend( ( IsLower<MT4>::value )?( min( i+SIMDSIZE*2UL, K ) ):( K ) );
12496 
12497  SIMDType xmm1, xmm2, xmm3, xmm4;
12498  size_t k( kbegin );
12499 
12500  for( ; (k+2UL) <= kend; k+=2UL ) {
12501  const SIMDType b1( set( B(k ,j) ) );
12502  const SIMDType b2( set( B(k+1UL,j) ) );
12503  xmm1 += A.load(i ,k ) * b1;
12504  xmm2 += A.load(i+SIMDSIZE,k ) * b1;
12505  xmm3 += A.load(i ,k+1UL) * b2;
12506  xmm4 += A.load(i+SIMDSIZE,k+1UL) * b2;
12507  }
12508 
12509  for( ; k<kend; ++k ) {
12510  const SIMDType b1( set( B(k,j) ) );
12511  xmm1 += A.load(i ,k) * b1;
12512  xmm2 += A.load(i+SIMDSIZE,k) * b1;
12513  }
12514 
12515  (~C).store( i , j, (~C).load(i ,j) - (xmm1+xmm3) * factor );
12516  (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - (xmm2+xmm4) * factor );
12517  }
12518  }
12519 
12520  for( ; i<ipos; i+=SIMDSIZE )
12521  {
12522  const size_t jend( LOW && UPP ? min(i+SIMDSIZE,N) : N );
12523  size_t j( UPP ? i : 0UL );
12524 
12525  for( ; (j+4UL) <= jend; j+=4UL )
12526  {
12527  const size_t kbegin( ( IsLower<MT5>::value )
12528  ?( ( IsUpper<MT4>::value )
12529  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12530  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12531  :( IsUpper<MT4>::value ? i : 0UL ) );
12532  const size_t kend( ( IsUpper<MT5>::value )
12533  ?( IsStrictlyUpper<MT5>::value ? j+3UL : j+4UL )
12534  :( K ) );
12535 
12536  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12537  size_t k( kbegin );
12538 
12539  for( ; (k+2UL) <= kend; k+=2UL ) {
12540  const SIMDType a1( A.load(i,k ) );
12541  const SIMDType a2( A.load(i,k+1UL) );
12542  xmm1 += a1 * set( B(k ,j ) );
12543  xmm2 += a1 * set( B(k ,j+1UL) );
12544  xmm3 += a1 * set( B(k ,j+2UL) );
12545  xmm4 += a1 * set( B(k ,j+3UL) );
12546  xmm5 += a2 * set( B(k+1UL,j ) );
12547  xmm6 += a2 * set( B(k+1UL,j+1UL) );
12548  xmm7 += a2 * set( B(k+1UL,j+2UL) );
12549  xmm8 += a2 * set( B(k+1UL,j+3UL) );
12550  }
12551 
12552  for( ; k<kend; ++k ) {
12553  const SIMDType a1( A.load(i,k) );
12554  xmm1 += a1 * set( B(k,j ) );
12555  xmm2 += a1 * set( B(k,j+1UL) );
12556  xmm3 += a1 * set( B(k,j+2UL) );
12557  xmm4 += a1 * set( B(k,j+3UL) );
12558  }
12559 
12560  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm5) * factor );
12561  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm6) * factor );
12562  (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm7) * factor );
12563  (~C).store( i, j+3UL, (~C).load(i,j+3UL) - (xmm4+xmm8) * factor );
12564  }
12565 
12566  for( ; (j+3UL) <= jend; j+=3UL )
12567  {
12568  const size_t kbegin( ( IsLower<MT5>::value )
12569  ?( ( IsUpper<MT4>::value )
12570  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12571  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12572  :( IsUpper<MT4>::value ? i : 0UL ) );
12573  const size_t kend( ( IsUpper<MT5>::value )
12574  ?( IsStrictlyUpper<MT5>::value ? j+2UL : j+3UL )
12575  :( K ) );
12576 
12577  SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12578  size_t k( kbegin );
12579 
12580  for( ; (k+2UL) <= kend; k+=2UL ) {
12581  const SIMDType a1( A.load(i,k ) );
12582  const SIMDType a2( A.load(i,k+1UL) );
12583  xmm1 += a1 * set( B(k ,j ) );
12584  xmm2 += a1 * set( B(k ,j+1UL) );
12585  xmm3 += a1 * set( B(k ,j+2UL) );
12586  xmm4 += a2 * set( B(k+1UL,j ) );
12587  xmm5 += a2 * set( B(k+1UL,j+1UL) );
12588  xmm6 += a2 * set( B(k+1UL,j+2UL) );
12589  }
12590 
12591  for( ; k<kend; ++k ) {
12592  const SIMDType a1( A.load(i,k) );
12593  xmm1 += a1 * set( B(k,j ) );
12594  xmm2 += a1 * set( B(k,j+1UL) );
12595  xmm3 += a1 * set( B(k,j+2UL) );
12596  }
12597 
12598  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm4) * factor );
12599  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm5) * factor );
12600  (~C).store( i, j+2UL, (~C).load(i,j+2UL) - (xmm3+xmm6) * factor );
12601  }
12602 
12603  for( ; (j+2UL) <= jend; j+=2UL )
12604  {
12605  const size_t kbegin( ( IsLower<MT5>::value )
12606  ?( ( IsUpper<MT4>::value )
12607  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12608  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12609  :( IsUpper<MT4>::value ? i : 0UL ) );
12610  const size_t kend( ( IsUpper<MT5>::value )
12611  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
12612  :( K ) );
12613 
12614  SIMDType xmm1, xmm2, xmm3, xmm4;
12615  size_t k( kbegin );
12616 
12617  for( ; (k+2UL) <= kend; k+=2UL ) {
12618  const SIMDType a1( A.load(i,k ) );
12619  const SIMDType a2( A.load(i,k+1UL) );
12620  xmm1 += a1 * set( B(k ,j ) );
12621  xmm2 += a1 * set( B(k ,j+1UL) );
12622  xmm3 += a2 * set( B(k+1UL,j ) );
12623  xmm4 += a2 * set( B(k+1UL,j+1UL) );
12624  }
12625 
12626  for( ; k<kend; ++k ) {
12627  const SIMDType a1( A.load(i,k) );
12628  xmm1 += a1 * set( B(k,j ) );
12629  xmm2 += a1 * set( B(k,j+1UL) );
12630  }
12631 
12632  (~C).store( i, j , (~C).load(i,j ) - (xmm1+xmm3) * factor );
12633  (~C).store( i, j+1UL, (~C).load(i,j+1UL) - (xmm2+xmm4) * factor );
12634  }
12635 
12636  if( j < jend )
12637  {
12638  const size_t kbegin( ( IsLower<MT5>::value )
12639  ?( ( IsUpper<MT4>::value )
12640  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12641  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12642  :( IsUpper<MT4>::value ? i : 0UL ) );
12643 
12644  SIMDType xmm1, xmm2;
12645  size_t k( kbegin );
12646 
12647  for( ; (k+2UL) <= K; k+=2UL ) {
12648  xmm1 += A.load(i,k ) * set( B(k ,j) );
12649  xmm2 += A.load(i,k+1UL) * set( B(k+1UL,j) );
12650  }
12651 
12652  for( ; k<K; ++k ) {
12653  xmm1 += A.load(i,k) * set( B(k,j) );
12654  }
12655 
12656  (~C).store( i, j, (~C).load(i,j) - (xmm1+xmm2) * factor );
12657  }
12658  }
12659 
12660  for( ; remainder && i<M; ++i )
12661  {
12662  const size_t jend( LOW ? i+1UL : N );
12663  size_t j( UPP ? i : 0UL );
12664 
12665  for( ; (j+2UL) <= jend; j+=2UL )
12666  {
12667  const size_t kbegin( ( IsLower<MT5>::value )
12668  ?( ( IsUpper<MT4>::value )
12669  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12670  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12671  :( IsUpper<MT4>::value ? i : 0UL ) );
12672  const size_t kend( ( IsUpper<MT5>::value )
12673  ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
12674  :( K ) );
12675 
12676  ElementType value1{};
12677  ElementType value2{};
12678 
12679  for( size_t k=kbegin; k<kend; ++k ) {
12680  value1 += A(i,k) * B(k,j );
12681  value2 += A(i,k) * B(k,j+1UL);
12682  }
12683 
12684  (~C)(i,j ) -= value1 * scalar;
12685  (~C)(i,j+1UL) -= value2 * scalar;
12686  }
12687 
12688  if( j < jend )
12689  {
12690  const size_t kbegin( ( IsLower<MT5>::value )
12691  ?( ( IsUpper<MT4>::value )
12692  ?( max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
12693  :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
12694  :( IsUpper<MT4>::value ? i : 0UL ) );
12695 
12696  ElementType value{};
12697 
12698  for( size_t k=kbegin; k<K; ++k ) {
12699  value += A(i,k) * B(k,j);
12700  }
12701 
12702  (~C)(i,j) -= value * scalar;
12703  }
12704  }
12705  }
12706  //**********************************************************************************************
12707 
12708  //**Default subtraction assignment to dense matrices (large matrices)***************************
// Large-matrix fallback for the scaled subtraction assignment: this overload performs no
// computation of its own and forwards C, A, B and scalar unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): the line carrying the return type / selection condition (listing line 12726) is
// not part of this listing; presumably this overload is picked when the vectorized kernel is not
// applicable -- confirm against the full header.
12722  template< typename MT3 // Type of the left-hand side target matrix
12723  , typename MT4 // Type of the left-hand side matrix operand
12724  , typename MT5 // Type of the right-hand side matrix operand
12725  , typename ST2 > // Type of the scalar value
12727  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12728  {
12729  selectDefaultSubAssignKernel( C, A, B, scalar );
12730  }
12731  //**********************************************************************************************
12732 
12733  //**Vectorized default subtraction assignment to dense matrices (large matrices)****************
// Vectorized large-matrix kernel for the scaled subtraction assignment. It dispatches on the
// LOW and UPP flags (defined outside this listing): lmmm() when LOW is set, otherwise ummm()
// when UPP is set, otherwise the general mmm(). All three calls receive -scalar and ST2(1) as
// trailing arguments.
// NOTE(review): presumably -scalar scales the product A*B and ST2(1) scales the existing C, which
// yields C -= scalar*A*B -- confirm against the kernels in <blaze/math/dense/MMM.h>. The return
// type / selection condition line (listing line 12752) is missing from this listing.
12748  template< typename MT3 // Type of the left-hand side target matrix
12749  , typename MT4 // Type of the left-hand side matrix operand
12750  , typename MT5 // Type of the right-hand side matrix operand
12751  , typename ST2 > // Type of the scalar value
12753  selectLargeSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12754  {
12755  if( LOW )
12756  lmmm( C, A, B, -scalar, ST2(1) );
12757  else if( UPP )
12758  ummm( C, A, B, -scalar, ST2(1) );
12759  else
12760  mmm( C, A, B, -scalar, ST2(1) );
12761  }
12762  //**********************************************************************************************
12763 
12764  //**BLAS-based subtraction assignment to dense matrices (default)*******************************
// Default overload of the BLAS kernel selection for the scaled subtraction assignment: it does
// not call any BLAS routine and forwards C, A, B and scalar to selectLargeSubAssignKernel().
// NOTE(review): the return type / selection condition line (listing line 12782) is missing from
// this listing; presumably this overload is used when the BLAS kernel cannot be applied.
12778  template< typename MT3 // Type of the left-hand side target matrix
12779  , typename MT4 // Type of the left-hand side matrix operand
12780  , typename MT5 // Type of the right-hand side matrix operand
12781  , typename ST2 > // Type of the scalar value
12783  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12784  {
12785  selectLargeSubAssignKernel( C, A, B, scalar );
12786  }
12787  //**********************************************************************************************
12788 
12789  //**BLAS-based subtraction assignment to dense matrices******************************************
12790 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
12791 
// BLAS-backed kernel for the scaled subtraction assignment. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled (see the
// surrounding #if). Three cases are distinguished at compile time:
//  - MT4 is triangular: B is copied into a temporary, trmm() is called with A on the left
//    (CblasLeft) and the factor ET(scalar), then the temporary is subtracted from C.
//  - MT5 is triangular: same scheme with a copy of A and B applied from the right (CblasRight).
//  - otherwise: one gemm() call with the factors ET(-scalar) and ET(1).
// NOTE(review): the return type / enable-if line (listing line 12808) is missing from this
// listing.
12804  template< typename MT3 // Type of the left-hand side target matrix
12805  , typename MT4 // Type of the left-hand side matrix operand
12806  , typename MT5 // Type of the right-hand side matrix operand
12807  , typename ST2 > // Type of the scalar value
12809  selectBlasSubAssignKernel( MT3& C, const MT4& A, const MT5& B, ST2 scalar )
12810  {
// Element type of the target; the scalar is converted to this type before each BLAS call.
12811  using ET = ElementType_<MT3>;
12812 
12813  if( IsTriangular<MT4>::value ) {
// The temporary is built from a serial evaluation of B; presumably trmm() updates it in place.
12814  ResultType_<MT3> tmp( serial( B ) );
12815  trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
12816  subAssign( C, tmp );
12817  }
12818  else if( IsTriangular<MT5>::value ) {
// Mirror image of the branch above: temporary built from A, triangular B applied from the right.
12819  ResultType_<MT3> tmp( serial( A ) );
12820  trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
12821  subAssign( C, tmp );
12822  }
12823  else {
// No triangular operand: the negated scalar is handed to gemm() so no temporary is needed.
12824  gemm( C, A, B, ET(-scalar), ET(1) );
12825  }
12826  }
12827 #endif
12828  //**********************************************************************************************
12829 
12830  //**Subtraction assignment to sparse matrices***************************************************
12831  // No special implementation for the subtraction assignment to sparse matrices.
12832  //**********************************************************************************************
12833 
12834  //**Schur product assignment to dense matrices**************************************************
// Schur product assignment of the scaled multiplication expression rhs to the dense matrix lhs.
// After checking (internal assertions) that both sides agree in rows and columns, rhs is
// evaluated into a temporary of type ResultType via serial() and that temporary is then
// Schur-assigned to lhs.
// NOTE(review): listing lines 12850-12855 (presumably the function trace and type constraints)
// are not part of this listing.
12846  template< typename MT // Type of the target dense matrix
12847  , bool SO > // Storage order of the target dense matrix
12848  friend inline void schurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
12849  {
12851 
12855 
12856  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12857  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12858 
12859  const ResultType tmp( serial( rhs ) );
12860  schurAssign( ~lhs, tmp );
12861  }
12862  //**********************************************************************************************
12863 
12864  //**Schur product assignment to sparse matrices*************************************************
12865  // No special implementation for the Schur product assignment to sparse matrices.
12866  //**********************************************************************************************
12867 
12868  //**Multiplication assignment to dense matrices*************************************************
12869  // No special implementation for the multiplication assignment to dense matrices.
12870  //**********************************************************************************************
12871 
12872  //**Multiplication assignment to sparse matrices************************************************
12873  // No special implementation for the multiplication assignment to sparse matrices.
12874  //**********************************************************************************************
12875 
12876  //**SMP assignment to dense matrices************************************************************
// SMP assignment of the scaled multiplication expression rhs to the dense matrix lhs.
// Steps, in order:
//  1. assert that lhs and rhs agree in rows and columns;
//  2. fetch the two operands of the wrapped multiplication (rhs.matrix_);
//  3. return immediately if lhs has no rows or no columns; if the inner dimension
//     (left.columns()) is zero, reset lhs and return;
//  4. evaluate both operands into A (type LT) and B (type RT);
//  5. SMP-assign the expression A * B * rhs.scalar_ to lhs.
// NOTE(review): the return type / enable-if line (listing line 12893) is missing from this
// listing; MMM, LT and RT are aliases declared outside the visible range.
12891  template< typename MT // Type of the target dense matrix
12892  , bool SO > // Storage order of the target dense matrix
12894  smpAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
12895  {
12897 
12898  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12899  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12900 
12901  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
12902  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
12903 
// Empty target: nothing to write. Zero inner dimension: lhs is reset instead of multiplied.
12904  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL ) {
12905  return;
12906  }
12907  else if( left.columns() == 0UL ) {
12908  reset( ~lhs );
12909  return;
12910  }
12911 
12912  LT A( left ); // Evaluation of the left-hand side dense matrix operand
12913  RT B( right ); // Evaluation of the right-hand side dense matrix operand
12914 
12915  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
12916  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
12917  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
12918  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
12919  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
12920  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
12921 
12922  smpAssign( ~lhs, A * B * rhs.scalar_ );
12923  }
12924  //**********************************************************************************************
12925 
12926  //**SMP assignment to sparse matrices***********************************************************
// SMP assignment of the scaled multiplication expression to a sparse matrix target.
// The body asserts matching dimensions, evaluates rhs into a temporary of type TmpType and
// SMP-assigns that temporary to lhs after passing it through a ForwardFunctor instance.
// NOTE(review): the signature lines (listing lines 12943-12944) and the lines that declare
// TmpType and the constraints (12946-12956) are missing from this listing; the function name and
// parameter list are inferred from the section banner and the body -- confirm against the full
// header.
12941  template< typename MT // Type of the target sparse matrix
12942  , bool SO > // Storage order of the target sparse matrix
12945  {
12947 
12949 
12956 
12957  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12958  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12959 
12960  const ForwardFunctor fwd;
12961 
12962  const TmpType tmp( rhs );
12963  smpAssign( ~lhs, fwd( tmp ) );
12964  }
12965  //**********************************************************************************************
12966 
12967  //**SMP addition assignment to dense matrices***************************************************
// SMP addition assignment of the scaled multiplication expression to a dense matrix target.
// After the dimension assertions the operands of rhs.matrix_ are fetched; if lhs has no rows or
// columns, or the inner dimension (left.columns()) is zero, the function returns without
// touching lhs. Otherwise both operands are evaluated into A and B and the expression
// A * B * rhs.scalar_ is handed to smpAddAssign().
// NOTE(review): the signature lines (listing lines 12984-12985) are missing from this listing;
// the function name and parameter list are inferred from the section banner and the body.
12982  template< typename MT // Type of the target dense matrix
12983  , bool SO > // Storage order of the target dense matrix
12986  {
12988 
12989  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
12990  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
12991 
12992  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
12993  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
12994 
// Unlike the plain assignment, a zero inner dimension needs no reset: there is nothing to add.
12995  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
12996  return;
12997  }
12998 
12999  LT A( left ); // Evaluation of the left-hand side dense matrix operand
13000  RT B( right ); // Evaluation of the right-hand side dense matrix operand
13001 
13002  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13003  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13004  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13005  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13006  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
13007  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
13008 
13009  smpAddAssign( ~lhs, A * B * rhs.scalar_ );
13010  }
13011  //**********************************************************************************************
13012 
13013  //**SMP addition assignment to sparse matrices**************************************************
13014  // No special implementation for the SMP addition assignment to sparse matrices.
13015  //**********************************************************************************************
13016 
13017  //**SMP subtraction assignment to dense matrices************************************************
// SMP subtraction assignment of the scaled multiplication expression to a dense matrix target.
// After the dimension assertions the operands of rhs.matrix_ are fetched; if lhs has no rows or
// columns, or the inner dimension (left.columns()) is zero, the function returns without
// touching lhs. Otherwise both operands are evaluated into A and B and the expression
// A * B * rhs.scalar_ is handed to smpSubAssign().
// NOTE(review): the signature lines (listing lines 13034-13035) are missing from this listing;
// the function name and parameter list are inferred from the section banner and the body.
13032  template< typename MT // Type of the target dense matrix
13033  , bool SO > // Storage order of the target dense matrix
13036  {
13038 
13039  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13040  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13041 
13042  LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
13043  RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
13044 
// A zero inner dimension leaves lhs unchanged: there is nothing to subtract.
13045  if( (~lhs).rows() == 0UL || (~lhs).columns() == 0UL || left.columns() == 0UL ) {
13046  return;
13047  }
13048 
13049  LT A( left ); // Evaluation of the left-hand side dense matrix operand
13050  RT B( right ); // Evaluation of the right-hand side dense matrix operand
13051 
13052  BLAZE_INTERNAL_ASSERT( A.rows() == left.rows() , "Invalid number of rows" );
13053  BLAZE_INTERNAL_ASSERT( A.columns() == left.columns() , "Invalid number of columns" );
13054  BLAZE_INTERNAL_ASSERT( B.rows() == right.rows() , "Invalid number of rows" );
13055  BLAZE_INTERNAL_ASSERT( B.columns() == right.columns() , "Invalid number of columns" );
13056  BLAZE_INTERNAL_ASSERT( A.rows() == (~lhs).rows() , "Invalid number of rows" );
13057  BLAZE_INTERNAL_ASSERT( B.columns() == (~lhs).columns(), "Invalid number of columns" );
13058 
13059  smpSubAssign( ~lhs, A * B * rhs.scalar_ );
13060  }
13061  //**********************************************************************************************
13062 
13063  //**SMP subtraction assignment to sparse matrices***********************************************
13064  // No special implementation for the SMP subtraction assignment to sparse matrices.
13065  //**********************************************************************************************
13066 
13067  //**SMP Schur product assignment to dense matrices**********************************************
// SMP Schur product assignment of the scaled multiplication expression rhs to the dense matrix
// lhs. After the dimension assertions, rhs is evaluated into a temporary of type ResultType
// (constructed directly from rhs, without the serial() wrapper used by schurAssign()) and the
// temporary is passed to smpSchurAssign().
// NOTE(review): listing lines 13083-13088 (presumably the function trace and type constraints)
// are not part of this listing.
13079  template< typename MT // Type of the target dense matrix
13080  , bool SO > // Storage order of the target dense matrix
13081  friend inline void smpSchurAssign( DenseMatrix<MT,SO>& lhs, const DMatScalarMultExpr& rhs )
13082  {
13084 
13088 
13089  BLAZE_INTERNAL_ASSERT( (~lhs).rows() == rhs.rows() , "Invalid number of rows" );
13090  BLAZE_INTERNAL_ASSERT( (~lhs).columns() == rhs.columns(), "Invalid number of columns" );
13091 
13092  const ResultType tmp( rhs );
13093  smpSchurAssign( ~lhs, tmp );
13094  }
13095  //**********************************************************************************************
13096 
13097  //**SMP Schur product assignment to sparse matrices*********************************************
13098  // No special implementation for the SMP Schur product assignment to sparse matrices.
13099  //**********************************************************************************************
13100 
13101  //**SMP multiplication assignment to dense matrices*********************************************
13102  // No special implementation for the SMP multiplication assignment to dense matrices.
13103  //**********************************************************************************************
13104 
13105  //**SMP multiplication assignment to sparse matrices********************************************
13106  // No special implementation for the SMP multiplication assignment to sparse matrices.
13107  //**********************************************************************************************
13108 
13109  //**Compile time checks*************************************************************************
13118  //**********************************************************************************************
13119 };
13121 //*************************************************************************************************
13122 
13123 
13124 
13125 
13126 //=================================================================================================
13127 //
13128 // GLOBAL BINARY ARITHMETIC OPERATORS
13129 //
13130 //=================================================================================================
13131 
13132 //*************************************************************************************************
// Multiplication operator for a dense matrix with storage-order flag 'true' (lhs) and a dense
// matrix with storage-order flag 'false' (rhs).
// Throws std::invalid_argument (via BLAZE_THROW_INVALID_ARGUMENT) with the message
// "Matrix sizes do not match" if the number of columns of lhs differs from the number of rows of
// rhs. Otherwise returns an object of type ReturnType constructed from both operands.
// NOTE(review): the alias declaring ReturnType (listing line 13173) is missing from this listing;
// presumably it names a TDMatDMatMultExpr instantiation -- confirm against the full header.
13162 template< typename MT1 // Type of the left-hand side dense matrix
13163  , typename MT2 > // Type of the right-hand side dense matrix
13164 inline decltype(auto)
13165  operator*( const DenseMatrix<MT1,true>& lhs, const DenseMatrix<MT2,false>& rhs )
13166 {
13168 
13169  if( (~lhs).columns() != (~rhs).rows() ) {
13170  BLAZE_THROW_INVALID_ARGUMENT( "Matrix sizes do not match" );
13171  }
13172 
13174  return ReturnType( ~lhs, ~rhs );
13175 }
13176 //*************************************************************************************************
13177 
13178 
13179 
13180 
13181 //=================================================================================================
13182 //
13183 // GLOBAL FUNCTIONS
13184 //
13185 //=================================================================================================
13186 
13187 //*************************************************************************************************
// declsym() overload for multiplication expressions of type TDMatDMatMultExpr.
// Throws std::invalid_argument ("Invalid symmetric matrix specification") if dm is not square;
// otherwise rebuilds the expression as a ReturnType object from the unchanged left and right
// operands of dm.
// NOTE(review): the alias declaring ReturnType (listing line 13226) is missing from this listing;
// presumably it is the same expression type with the symmetry flag enabled -- confirm.
13212 template< typename MT1 // Type of the left-hand side dense matrix
13213  , typename MT2 // Type of the right-hand side dense matrix
13214  , bool SF // Symmetry flag
13215  , bool HF // Hermitian flag
13216  , bool LF // Lower flag
13217  , bool UF > // Upper flag
13218 inline decltype(auto) declsym( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13219 {
13221 
13222  if( !isSquare( dm ) ) {
13223  BLAZE_THROW_INVALID_ARGUMENT( "Invalid symmetric matrix specification" );
13224  }
13225 
13227  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13228 }
13230 //*************************************************************************************************
13231 
13232 
13233 //*************************************************************************************************
// declherm() overload for multiplication expressions of type TDMatDMatMultExpr.
// Throws std::invalid_argument ("Invalid Hermitian matrix specification") if dm is not square;
// otherwise rebuilds the expression as a ReturnType object from the unchanged left and right
// operands of dm.
// NOTE(review): the alias declaring ReturnType (listing line 13272) is missing from this listing;
// presumably it is the same expression type with the Hermitian flag enabled -- confirm.
13258 template< typename MT1 // Type of the left-hand side dense matrix
13259  , typename MT2 // Type of the right-hand side dense matrix
13260  , bool SF // Symmetry flag
13261  , bool HF // Hermitian flag
13262  , bool LF // Lower flag
13263  , bool UF > // Upper flag
13264 inline decltype(auto) declherm( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13265 {
13267 
13268  if( !isSquare( dm ) ) {
13269  BLAZE_THROW_INVALID_ARGUMENT( "Invalid Hermitian matrix specification" );
13270  }
13271 
13273  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13274 }
13276 //*************************************************************************************************
13277 
13278 
13279 //*************************************************************************************************
// decllow() overload for multiplication expressions of type TDMatDMatMultExpr.
// Throws std::invalid_argument ("Invalid lower matrix specification") if dm is not square;
// otherwise rebuilds the expression as a ReturnType object from the unchanged left and right
// operands of dm.
// NOTE(review): the alias declaring ReturnType (listing line 13318) is missing from this listing;
// presumably it is the same expression type with the lower flag enabled -- confirm.
13304 template< typename MT1 // Type of the left-hand side dense matrix
13305  , typename MT2 // Type of the right-hand side dense matrix
13306  , bool SF // Symmetry flag
13307  , bool HF // Hermitian flag
13308  , bool LF // Lower flag
13309  , bool UF > // Upper flag
13310 inline decltype(auto) decllow( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13311 {
13313 
13314  if( !isSquare( dm ) ) {
13315  BLAZE_THROW_INVALID_ARGUMENT( "Invalid lower matrix specification" );
13316  }
13317 
13319  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13320 }
13322 //*************************************************************************************************
13323 
13324 
13325 //*************************************************************************************************
// declupp() overload for multiplication expressions of type TDMatDMatMultExpr.
// Throws std::invalid_argument ("Invalid upper matrix specification") if dm is not square;
// otherwise rebuilds the expression as a ReturnType object from the unchanged left and right
// operands of dm.
// NOTE(review): the alias declaring ReturnType (listing line 13364) is missing from this listing;
// presumably it is the same expression type with the upper flag enabled -- confirm.
13350 template< typename MT1 // Type of the left-hand side dense matrix
13351  , typename MT2 // Type of the right-hand side dense matrix
13352  , bool SF // Symmetry flag
13353  , bool HF // Hermitian flag
13354  , bool LF // Lower flag
13355  , bool UF > // Upper flag
13356 inline decltype(auto) declupp( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13357 {
13359 
13360  if( !isSquare( dm ) ) {
13361  BLAZE_THROW_INVALID_ARGUMENT( "Invalid upper matrix specification" );
13362  }
13363 
13365  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13366 }
13368 //*************************************************************************************************
13369 
13370 
13371 //*************************************************************************************************
// decldiag() overload for multiplication expressions of type TDMatDMatMultExpr.
// Throws std::invalid_argument ("Invalid diagonal matrix specification") if dm is not square;
// otherwise rebuilds the expression as a ReturnType object from the unchanged left and right
// operands of dm.
// NOTE(review): the alias declaring ReturnType (listing line 13410) is missing from this listing;
// presumably it is the same expression type with both the lower and upper flags enabled --
// confirm.
13396 template< typename MT1 // Type of the left-hand side dense matrix
13397  , typename MT2 // Type of the right-hand side dense matrix
13398  , bool SF // Symmetry flag
13399  , bool HF // Hermitian flag
13400  , bool LF // Lower flag
13401  , bool UF > // Upper flag
13402 inline decltype(auto) decldiag( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13403 {
13405 
13406  if( !isSquare( dm ) ) {
13407  BLAZE_THROW_INVALID_ARGUMENT( "Invalid diagonal matrix specification" );
13408  }
13409 
13411  return ReturnType( dm.leftOperand(), dm.rightOperand() );
13412 }
13414 //*************************************************************************************************
13415 
13416 
13417 
13418 
13419 //=================================================================================================
13420 //
13421 // SIZE SPECIALIZATIONS
13422 //
13423 //=================================================================================================
13424 
13425 //*************************************************************************************************
// Size trait for index 0 of the multiplication expression: inherited from Size<MT1,0UL>, i.e.
// taken from the left-hand side operand (presumably index 0 denotes the number of rows).
13427 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13428 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
13429  : public Size<MT1,0UL>
13430 {};
13431 
// Size trait for index 1 of the multiplication expression: inherited from Size<MT2,1UL>, i.e.
// taken from the right-hand side operand (presumably index 1 denotes the number of columns).
13432 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13433 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
13434  : public Size<MT2,1UL>
13435 {};
13437 //*************************************************************************************************
13438 
13439 
13440 
13441 
13442 //=================================================================================================
13443 //
13444 // ISALIGNED SPECIALIZATIONS
13445 //
13446 //=================================================================================================
13447 
13448 //*************************************************************************************************
// The multiplication expression is reported as aligned only if both operand types MT1 and MT2
// are aligned.
13450 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13451 struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13452  : public And< IsAligned<MT1>, IsAligned<MT2> >
13453 {};
13455 //*************************************************************************************************
13456 
13457 
13458 
13459 
13460 //=================================================================================================
13461 //
13462 // ISSYMMETRIC SPECIALIZATIONS
13463 //
13464 //=================================================================================================
13465 
13466 //*************************************************************************************************
// The expression is reported as symmetric if any of the following holds:
//  - the symmetry flag SF is set;
//  - the Hermitian flag HF is set and the element type of the expression is a built-in type;
//  - both the lower flag LF and the upper flag UF are set.
13468 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13469 struct IsSymmetric< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13470  : public Or< Bool<SF>
13471  , And< Bool<HF>
13472  , IsBuiltin< ElementType_< TDMatDMatMultExpr<MT1,MT2,false,true,false,false> > > >
13473  , And< Bool<LF>, Bool<UF> > >
13474 {};
13476 //*************************************************************************************************
13477 
13478 
13479 
13480 
13481 //=================================================================================================
13482 //
13483 // ISHERMITIAN SPECIALIZATIONS
13484 //
13485 //=================================================================================================
13486 
13487 //*************************************************************************************************
// Partial specialization for expressions whose Hermitian flag (fourth template argument) is
// 'true': such expressions are always reported as Hermitian, whatever the other flags are.
13489 template< typename MT1, typename MT2, bool SF, bool LF, bool UF >
13490 struct IsHermitian< TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF> >
13491  : public TrueType
13492 {};
13494 //*************************************************************************************************
13495 
13496 
13497 
13498 
13499 //=================================================================================================
13500 //
13501 // ISLOWER SPECIALIZATIONS
13502 //
13503 //=================================================================================================
13504 
13505 //*************************************************************************************************
// The expression is reported as lower if any of the following holds:
//  - the lower flag LF is set;
//  - both operands are lower;
//  - the symmetry or Hermitian flag is set and both operands are upper.
13507 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13508 struct IsLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13509  : public Or< Bool<LF>
13510  , And< IsLower<MT1>, IsLower<MT2> >
13511  , And< Or< Bool<SF>, Bool<HF> >
13512  , IsUpper<MT1>, IsUpper<MT2> > >
13513 {};
13515 //*************************************************************************************************
13516 
13517 
13518 
13519 
13520 //=================================================================================================
13521 //
13522 // ISUNILOWER SPECIALIZATIONS
13523 //
13524 //=================================================================================================
13525 
13526 //*************************************************************************************************
// The expression is reported as unilower if both operands are unilower, or if the symmetry or
// Hermitian flag is set and both operands are uniupper. The LF/UF flags do not take part.
13528 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13529 struct IsUniLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13530  : public Or< And< IsUniLower<MT1>, IsUniLower<MT2> >
13531  , And< Or< Bool<SF>, Bool<HF> >
13532  , IsUniUpper<MT1>, IsUniUpper<MT2> > >
13533 {};
13535 //*************************************************************************************************
13536 
13537 
13538 
13539 
13540 //=================================================================================================
13541 //
13542 // ISSTRICTLYLOWER SPECIALIZATIONS
13543 //
13544 //=================================================================================================
13545 
13546 //*************************************************************************************************
// The expression is reported as strictly lower if one operand is strictly lower and the other
// one is lower, or if the symmetry or Hermitian flag is set and one operand is strictly upper
// while the other one is upper.
13548 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13549 struct IsStrictlyLower< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13550  : public Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
13551  , And< IsStrictlyLower<MT2>, IsLower<MT1> >
13552  , And< Or< Bool<SF>, Bool<HF> >
13553  , Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
13554  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > > > >
13555 {};
13557 //*************************************************************************************************
13558 
13559 
13560 
13561 
13562 //=================================================================================================
13563 //
13564 // ISUPPER SPECIALIZATIONS
13565 //
13566 //=================================================================================================
13567 
13568 //*************************************************************************************************
// The expression is reported as upper if any of the following holds:
//  - the upper flag UF is set;
//  - both operands are upper;
//  - the symmetry or Hermitian flag is set and both operands are lower.
13570 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13571 struct IsUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13572  : public Or< Bool<UF>
13573  , And< IsUpper<MT1>, IsUpper<MT2> >
13574  , And< Or< Bool<SF>, Bool<HF> >
13575  , IsLower<MT1>, IsLower<MT2> > >
13576 {};
13578 //*************************************************************************************************
13579 
13580 
13581 
13582 
13583 //=================================================================================================
13584 //
13585 // ISUNIUPPER SPECIALIZATIONS
13586 //
13587 //=================================================================================================
13588 
13589 //*************************************************************************************************
// The expression is reported as uniupper if both operands are uniupper, or if the symmetry or
// Hermitian flag is set and both operands are unilower. The LF/UF flags do not take part.
13591 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13592 struct IsUniUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13593  : public Or< And< IsUniUpper<MT1>, IsUniUpper<MT2> >
13594  , And< Or< Bool<SF>, Bool<HF> >
13595  , IsUniLower<MT1>, IsUniLower<MT2> > >
13596 {};
13598 //*************************************************************************************************
13599 
13600 
13601 
13602 
13603 //=================================================================================================
13604 //
13605 // ISSTRICTLYUPPER SPECIALIZATIONS
13606 //
13607 //=================================================================================================
13608 
13609 //*************************************************************************************************
// The expression is reported as strictly upper if one operand is strictly upper and the other
// one is upper, or if the symmetry or Hermitian flag is set and one operand is strictly lower
// while the other one is lower.
13611 template< typename MT1, typename MT2, bool SF, bool HF, bool LF, bool UF >
13612 struct IsStrictlyUpper< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13613  : public Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
13614  , And< IsStrictlyUpper<MT2>, IsUpper<MT1> >
13615  , And< Or< Bool<SF>, Bool<HF> >
13616  , Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
13617  , And< IsStrictlyLower<MT2>, IsLower<MT1> > > > >
13618 {};
13620 //*************************************************************************************************
13621 
13622 } // namespace blaze
13623 
13624 #endif
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:174
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:131
Data type constraint.
Headerfile for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:71
Constraint on the data type.
Header file for kernel specific block sizes.
If_< IsExpression< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:270
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:996
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:264
Header file for the IsUniUpper type trait.
EnableIf_< IsDenseMatrix< MT1 > > smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:196
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:468
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:544
Generic wrapper for a compile time constant integral value.The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:469
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:173
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:617
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:534
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
Base class for all matrix/scalar multiplication expression templates.The MatScalarMultExpr class serv...
Definition: MatScalarMultExpr.h:67
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1903
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:87
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:276
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1026
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
Constraints on the storage order of matrix types.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:402
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:412
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1950
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
Base class for sparse matrices.The SparseMatrix class is a base class for all sparse matrix classes...
Definition: Forward.h:129
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:260
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:392
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Expression object for transpose dense matrix-dense matrix multiplications.The TDMatDMatMultExpr class...
Definition: Forward.h:148
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:261
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Compile time check for upper unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniUpper.h:86
Compile time check for the memory layout of data types.This type trait tests whether the given data t...
Definition: IsContiguous.h:86
Header file for the generic max algorithm.
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
Header file for the If class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:424
Compile time check for row-major matrix types.This type trait tests whether or not the given template...
Definition: IsRowMajorMatrix.h:110
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
Header file for the Or class template.
Expression object for dense matrix-scalar multiplications.The DMatScalarMultExpr class represents the...
Definition: DMatScalarMultExpr.h:107
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:263
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Not class template.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:258
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:382
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:273
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1026
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
Compile time check for strictly triangular matrix types.This type trait tests whether or not the give...
Definition: IsStrictlyTriangular.h:86
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsStrictlyTriangular type trait.
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for symmetric matrices.This type trait tests whether or not the given template par...
Definition: IsSymmetric.h:85
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:616
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:430
Header file for the DeclDiag functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:456
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:107
Compile time check for lower unitriangular matrices.This type trait tests whether or not the given te...
Definition: IsUniLower.h:86
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:436
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:366
Header file for the conjugate shim.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:262
Header file for run time assertion macros.
Compile time check for column-major matrix types.This type trait tests whether or not the given templ...
Definition: IsColumnMajorMatrix.h:110
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:259
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:152
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:131
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:175
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:302
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time type negation.The Not alias declaration negates the given compile time condition...
Definition: Not.h:70
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1028
Compile time check for Hermitian matrices.This type trait tests whether or not the given template par...
Definition: IsHermitian.h:85
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:172
Compile time check for integral data types.This type trait tests whether or not the given template pa...
Definition: IsIntegral.h:75
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions.The OppositeType_ alias declaration provid...
Definition: Aliases.h:263
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
If_< IsExpression< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:267
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1028
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:446
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:151
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:317
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time logical 'or' evaluation. The Or alias declaration performs at compile time a logical 'or'...
Definition: Or.h:76
Compile time evaluation of the size of vectors and matrices.The Size type trait evaluates the size of...
Definition: Size.h:80
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation. The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Header file for the DeclHerm functor.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1321
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
BLAZE_ALWAYS_INLINE bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:908
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the Bool class template.
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.